diff --git a/.gitattributes b/.gitattributes index 7d40cf8a1cfcdb0f81904f5984dea885b93c898c..6366cfff9cc0c4b655e3707bb316c31b12cf4683 100644 --- a/.gitattributes +++ b/.gitattributes @@ -36,3 +36,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text noever/model-30000.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text noever/model-30000.meta filter=lfs diff=lfs merge=lfs -text marques/tokenizer.json filter=lfs diff=lfs merge=lfs -text +marques/outputs/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +marques/outputs/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +marques/outputs/checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +marques/outputs/checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +marques/outputs/checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +marques/outputs/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +marques/outputs/checkpoint-60/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/marques/outputs/README.md b/marques/outputs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ac97fb20863f1aa6435f4dc099a49295f6e6d0b9 --- /dev/null +++ b/marques/outputs/README.md @@ -0,0 +1,59 @@ +--- +base_model: unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +library_name: transformers +model_name: outputs +tags: +- generated_from_trainer +- trl +- unsloth +- sft +licence: license +--- + +# Model Card for outputs + +This model is a fine-tuned version of [unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit](https://huggingface.co/unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.24.0 +- Transformers: 4.57.2 +- Pytorch: 2.9.0+cu126 +- Datasets: 4.3.0 +- Tokenizers: 0.22.1 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/marques/outputs/checkpoint-1000/README.md b/marques/outputs/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d90a96dfe2e51221657a6e936d376789e21081f9 --- /dev/null +++ b/marques/outputs/checkpoint-1000/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/marques/outputs/checkpoint-1000/adapter_config.json b/marques/outputs/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9930a191a30254256c9550b1bdffa58b8d7aee8 --- /dev/null +++ b/marques/outputs/checkpoint-1000/adapter_config.json @@ -0,0 +1,50 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "LlamaForCausalLM", + "parent_library": "transformers.models.llama.modeling_llama", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "gate_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/marques/outputs/checkpoint-1000/adapter_model.safetensors b/marques/outputs/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..85ef3e19352a1db4f948bf1028daca4a2f7cd67d --- /dev/null +++ b/marques/outputs/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed999248c6169ba1d0be281222cb549cd7f7337baa082564011278c9ea95f6f +size 167832240 diff --git a/marques/outputs/checkpoint-1000/optimizer.pt b/marques/outputs/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3c6284cac1ca78535b11004f77eaa3ac80b17c4 --- /dev/null +++ b/marques/outputs/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecbedd93c3926596e5a00a099204fb9b643996dc0b40098921e9ca10fec08d87 +size 85724133 diff --git a/marques/outputs/checkpoint-1000/rng_state.pth b/marques/outputs/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0 --- /dev/null +++ b/marques/outputs/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33 +size 14645 diff --git a/marques/outputs/checkpoint-1000/scheduler.pt b/marques/outputs/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f93b0f0140b76fae0c424c538aad7983445f528a --- /dev/null +++ b/marques/outputs/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b9d97b834a5898e2b6b87ad4b261648f9f9835c1ceec86c1fe07fce13558803 +size 1465 diff --git a/marques/outputs/checkpoint-1000/special_tokens_map.json b/marques/outputs/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68b10c7f0a479eae0c358eac6a14959b3f9acdf1 --- /dev/null +++ b/marques/outputs/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/marques/outputs/checkpoint-1000/tokenizer.json b/marques/outputs/checkpoint-1000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/marques/outputs/checkpoint-1000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/marques/outputs/checkpoint-1000/tokenizer_config.json b/marques/outputs/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..92b1d94e894e5474ebea1d171e14751be79ca3e5 --- /dev/null +++ b/marques/outputs/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/marques/outputs/checkpoint-1000/trainer_state.json b/marques/outputs/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f14eb6b300e58aabc536baaa12f8c97ae404729a --- /dev/null +++ b/marques/outputs/checkpoint-1000/trainer_state.json @@ -0,0 +1,7034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.000351009252516144, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.5100925251614403e-07, + "grad_norm": 0.53782719373703, + "learning_rate": 0.0, + "loss": 0.5835, + "step": 1 + }, + { + "epoch": 7.020185050322881e-07, + "grad_norm": 0.6201626062393188, + "learning_rate": 4e-05, + "loss": 0.5242, + "step": 2 + }, + { + "epoch": 1.053027757548432e-06, + "grad_norm": 0.7571901082992554, + "learning_rate": 8e-05, + "loss": 0.5642, + "step": 3 + }, + { + "epoch": 1.4040370100645761e-06, + "grad_norm": 0.5588695406913757, + "learning_rate": 0.00012, + "loss": 0.4859, + "step": 4 + }, + { + "epoch": 1.75504626258072e-06, + "grad_norm": 0.7208331227302551, + "learning_rate": 0.00016, + "loss": 0.4645, + "step": 5 + }, + { + "epoch": 2.106055515096864e-06, + "grad_norm": 0.8169743418693542, + "learning_rate": 0.0002, + "loss": 0.3702, + "step": 6 + }, + { + "epoch": 2.4570647676130083e-06, + "grad_norm": 2.051530599594116, + "learning_rate": 0.00019993322203672788, + "loss": 0.4856, + "step": 7 + }, + { + "epoch": 2.8080740201291522e-06, + "grad_norm": 1.2310550212860107, + "learning_rate": 0.00019986644407345576, + "loss": 0.5192, + "step": 8 + }, + { + "epoch": 3.1590832726452962e-06, + "grad_norm": 1.612046241760254, + "learning_rate": 0.00019979966611018366, + "loss": 0.4719, + "step": 9 + }, + { + "epoch": 3.51009252516144e-06, + "grad_norm": 1.4484680891036987, + "learning_rate": 0.00019973288814691153, + "loss": 0.4416, + "step": 10 + }, + { + "epoch": 3.861101777677584e-06, + "grad_norm": 1.4529719352722168, + "learning_rate": 0.0001996661101836394, + "loss": 0.6275, + "step": 11 + }, + { + "epoch": 4.212111030193728e-06, + "grad_norm": 1.3963671922683716, + "learning_rate": 0.00019959933222036728, + "loss": 0.5874, + "step": 12 + }, + { + "epoch": 4.563120282709872e-06, + "grad_norm": 1.4744153022766113, + "learning_rate": 0.00019953255425709515, + "loss": 0.6422, + "step": 13 + }, + { + "epoch": 4.9141295352260165e-06, + "grad_norm": 0.8640050888061523, + "learning_rate": 0.00019946577629382305, + "loss": 0.5064, + "step": 14 + }, + { + "epoch": 5.26513878774216e-06, + "grad_norm": 0.7137419581413269, + "learning_rate": 0.00019939899833055092, + "loss": 0.5218, + "step": 15 + }, + { + "epoch": 5.6161480402583045e-06, + "grad_norm": 0.7769026756286621, + "learning_rate": 0.00019933222036727882, + "loss": 0.5377, + "step": 16 + }, + { + "epoch": 5.967157292774448e-06, + "grad_norm": 0.7558479905128479, + "learning_rate": 0.0001992654424040067, + "loss": 0.5054, + "step": 17 + }, + { + "epoch": 6.3181665452905924e-06, + "grad_norm": 0.8237054347991943, + "learning_rate": 0.00019919866444073457, + "loss": 0.5094, + "step": 18 + }, + { + "epoch": 6.669175797806736e-06, + "grad_norm": 1.0375059843063354, + "learning_rate": 0.00019913188647746244, + "loss": 0.5751, + "step": 19 + }, + { + "epoch": 7.02018505032288e-06, + "grad_norm": 1.075869083404541, + "learning_rate": 0.00019906510851419034, + "loss": 0.594, + "step": 20 + }, + { + "epoch": 7.371194302839024e-06, + "grad_norm": 0.8041358590126038, + "learning_rate": 0.00019899833055091822, + "loss": 0.553, + "step": 21 + }, + { + "epoch": 7.722203555355168e-06, + "grad_norm": 0.9264736771583557, + "learning_rate": 0.0001989315525876461, + "loss": 0.5555, + "step": 22 + }, + { + "epoch": 8.073212807871313e-06, + "grad_norm": 1.0074031352996826, + "learning_rate": 0.00019886477462437396, + "loss": 0.5353, + "step": 23 + }, + { + "epoch": 8.424222060387455e-06, + "grad_norm": 0.8725020885467529, + "learning_rate": 0.00019879799666110183, + "loss": 0.5557, + "step": 24 + }, + { + "epoch": 8.7752313129036e-06, + "grad_norm": 0.8867582678794861, + "learning_rate": 0.00019873121869782974, + "loss": 0.5992, + "step": 25 + }, + { + "epoch": 9.126240565419744e-06, + "grad_norm": 0.9235608577728271, + "learning_rate": 0.0001986644407345576, + "loss": 0.516, + "step": 26 + }, + { + "epoch": 9.477249817935889e-06, + "grad_norm": 0.8653218150138855, + "learning_rate": 0.00019859766277128548, + "loss": 0.5249, + "step": 27 + }, + { + "epoch": 9.828259070452033e-06, + "grad_norm": 0.7479026913642883, + "learning_rate": 0.00019853088480801335, + "loss": 0.5037, + "step": 28 + }, + { + "epoch": 1.0179268322968176e-05, + "grad_norm": 0.9531452655792236, + "learning_rate": 0.00019846410684474123, + "loss": 0.5896, + "step": 29 + }, + { + "epoch": 1.053027757548432e-05, + "grad_norm": 1.1012492179870605, + "learning_rate": 0.00019839732888146913, + "loss": 0.5139, + "step": 30 + }, + { + "epoch": 1.0881286828000465e-05, + "grad_norm": 1.0198887586593628, + "learning_rate": 0.000198330550918197, + "loss": 0.5587, + "step": 31 + }, + { + "epoch": 1.1232296080516609e-05, + "grad_norm": 0.8081266283988953, + "learning_rate": 0.00019826377295492487, + "loss": 0.4762, + "step": 32 + }, + { + "epoch": 1.1583305333032752e-05, + "grad_norm": 1.1965891122817993, + "learning_rate": 0.00019819699499165277, + "loss": 0.5719, + "step": 33 + }, + { + "epoch": 1.1934314585548896e-05, + "grad_norm": 1.214903473854065, + "learning_rate": 0.00019813021702838065, + "loss": 0.5756, + "step": 34 + }, + { + "epoch": 1.228532383806504e-05, + "grad_norm": 0.8360006213188171, + "learning_rate": 0.00019806343906510852, + "loss": 0.5688, + "step": 35 + }, + { + "epoch": 1.2636333090581185e-05, + "grad_norm": 0.8328489065170288, + "learning_rate": 0.00019799666110183642, + "loss": 0.6418, + "step": 36 + }, + { + "epoch": 1.298734234309733e-05, + "grad_norm": 1.1427714824676514, + "learning_rate": 0.0001979298831385643, + "loss": 0.6531, + "step": 37 + }, + { + "epoch": 1.3338351595613472e-05, + "grad_norm": 1.0145376920700073, + "learning_rate": 0.00019786310517529217, + "loss": 0.6473, + "step": 38 + }, + { + "epoch": 1.3689360848129616e-05, + "grad_norm": 0.8427861928939819, + "learning_rate": 0.00019779632721202004, + "loss": 0.5882, + "step": 39 + }, + { + "epoch": 1.404037010064576e-05, + "grad_norm": 0.8792659044265747, + "learning_rate": 0.00019772954924874791, + "loss": 0.608, + "step": 40 + }, + { + "epoch": 1.4391379353161905e-05, + "grad_norm": 0.9338463544845581, + "learning_rate": 0.00019766277128547581, + "loss": 0.7118, + "step": 41 + }, + { + "epoch": 1.4742388605678048e-05, + "grad_norm": 0.7554420232772827, + "learning_rate": 0.0001975959933222037, + "loss": 0.5898, + "step": 42 + }, + { + "epoch": 1.5093397858194192e-05, + "grad_norm": 0.7700084447860718, + "learning_rate": 0.00019752921535893156, + "loss": 0.6466, + "step": 43 + }, + { + "epoch": 1.5444407110710337e-05, + "grad_norm": 0.8639333248138428, + "learning_rate": 0.00019746243739565943, + "loss": 0.7253, + "step": 44 + }, + { + "epoch": 1.579541636322648e-05, + "grad_norm": 0.7760612964630127, + "learning_rate": 0.0001973956594323873, + "loss": 0.7099, + "step": 45 + }, + { + "epoch": 1.6146425615742626e-05, + "grad_norm": 0.7319066524505615, + "learning_rate": 0.0001973288814691152, + "loss": 0.6664, + "step": 46 + }, + { + "epoch": 1.6497434868258768e-05, + "grad_norm": 0.7557100057601929, + "learning_rate": 0.00019726210350584308, + "loss": 0.6318, + "step": 47 + }, + { + "epoch": 1.684844412077491e-05, + "grad_norm": 0.6420389413833618, + "learning_rate": 0.00019719532554257095, + "loss": 0.6688, + "step": 48 + }, + { + "epoch": 1.7199453373291057e-05, + "grad_norm": 0.660383939743042, + "learning_rate": 0.00019712854757929883, + "loss": 0.6204, + "step": 49 + }, + { + "epoch": 1.75504626258072e-05, + "grad_norm": 0.5614909529685974, + "learning_rate": 0.00019706176961602673, + "loss": 0.664, + "step": 50 + }, + { + "epoch": 1.7901471878323346e-05, + "grad_norm": 0.502738356590271, + "learning_rate": 0.0001969949916527546, + "loss": 0.6918, + "step": 51 + }, + { + "epoch": 1.825248113083949e-05, + "grad_norm": 0.47578102350234985, + "learning_rate": 0.0001969282136894825, + "loss": 0.6747, + "step": 52 + }, + { + "epoch": 1.860349038335563e-05, + "grad_norm": 0.5528931617736816, + "learning_rate": 0.00019686143572621037, + "loss": 0.765, + "step": 53 + }, + { + "epoch": 1.8954499635871777e-05, + "grad_norm": 0.6176997423171997, + "learning_rate": 0.00019679465776293825, + "loss": 0.5959, + "step": 54 + }, + { + "epoch": 1.930550888838792e-05, + "grad_norm": 0.43425047397613525, + "learning_rate": 0.00019672787979966612, + "loss": 0.6437, + "step": 55 + }, + { + "epoch": 1.9656518140904066e-05, + "grad_norm": 0.5135884881019592, + "learning_rate": 0.000196661101836394, + "loss": 0.7019, + "step": 56 + }, + { + "epoch": 2.000752739342021e-05, + "grad_norm": 0.4628916084766388, + "learning_rate": 0.0001965943238731219, + "loss": 0.5722, + "step": 57 + }, + { + "epoch": 2.035853664593635e-05, + "grad_norm": 0.48201897740364075, + "learning_rate": 0.00019652754590984977, + "loss": 0.6288, + "step": 58 + }, + { + "epoch": 2.0709545898452498e-05, + "grad_norm": 0.5772811770439148, + "learning_rate": 0.00019646076794657764, + "loss": 0.6067, + "step": 59 + }, + { + "epoch": 2.106055515096864e-05, + "grad_norm": 0.4976802170276642, + "learning_rate": 0.0001963939899833055, + "loss": 0.4722, + "step": 60 + }, + { + "epoch": 2.1411564403484786e-05, + "grad_norm": 0.4842129051685333, + "learning_rate": 0.00019632721202003339, + "loss": 0.5876, + "step": 61 + }, + { + "epoch": 2.176257365600093e-05, + "grad_norm": 0.46149536967277527, + "learning_rate": 0.00019626043405676129, + "loss": 0.6373, + "step": 62 + }, + { + "epoch": 2.2113582908517072e-05, + "grad_norm": 0.47199445962905884, + "learning_rate": 0.00019619365609348916, + "loss": 0.5546, + "step": 63 + }, + { + "epoch": 2.2464592161033218e-05, + "grad_norm": 0.6109340190887451, + "learning_rate": 0.00019612687813021703, + "loss": 0.6069, + "step": 64 + }, + { + "epoch": 2.281560141354936e-05, + "grad_norm": 0.5529135465621948, + "learning_rate": 0.0001960601001669449, + "loss": 0.553, + "step": 65 + }, + { + "epoch": 2.3166610666065503e-05, + "grad_norm": 0.500245213508606, + "learning_rate": 0.00019599332220367278, + "loss": 0.6149, + "step": 66 + }, + { + "epoch": 2.351761991858165e-05, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00019592654424040068, + "loss": 0.6509, + "step": 67 + }, + { + "epoch": 2.3868629171097792e-05, + "grad_norm": 0.5308504104614258, + "learning_rate": 0.00019585976627712855, + "loss": 0.7017, + "step": 68 + }, + { + "epoch": 2.4219638423613938e-05, + "grad_norm": 0.5157874822616577, + "learning_rate": 0.00019579298831385645, + "loss": 0.7125, + "step": 69 + }, + { + "epoch": 2.457064767613008e-05, + "grad_norm": 0.47787800431251526, + "learning_rate": 0.00019572621035058433, + "loss": 0.5792, + "step": 70 + }, + { + "epoch": 2.4921656928646224e-05, + "grad_norm": 0.46792763471603394, + "learning_rate": 0.0001956594323873122, + "loss": 0.7, + "step": 71 + }, + { + "epoch": 2.527266618116237e-05, + "grad_norm": 0.5394675135612488, + "learning_rate": 0.00019559265442404007, + "loss": 0.5549, + "step": 72 + }, + { + "epoch": 2.5623675433678512e-05, + "grad_norm": 0.45065200328826904, + "learning_rate": 0.00019552587646076797, + "loss": 0.6663, + "step": 73 + }, + { + "epoch": 2.597468468619466e-05, + "grad_norm": 0.4026688039302826, + "learning_rate": 0.00019545909849749584, + "loss": 0.6315, + "step": 74 + }, + { + "epoch": 2.63256939387108e-05, + "grad_norm": 0.42353659868240356, + "learning_rate": 0.00019539232053422372, + "loss": 0.5419, + "step": 75 + }, + { + "epoch": 2.6676703191226944e-05, + "grad_norm": 0.45561954379081726, + "learning_rate": 0.0001953255425709516, + "loss": 0.6624, + "step": 76 + }, + { + "epoch": 2.702771244374309e-05, + "grad_norm": 0.3954075574874878, + "learning_rate": 0.00019525876460767946, + "loss": 0.5479, + "step": 77 + }, + { + "epoch": 2.7378721696259233e-05, + "grad_norm": 0.4994329512119293, + "learning_rate": 0.00019519198664440736, + "loss": 0.7224, + "step": 78 + }, + { + "epoch": 2.7729730948775375e-05, + "grad_norm": 0.41149672865867615, + "learning_rate": 0.00019512520868113524, + "loss": 0.5621, + "step": 79 + }, + { + "epoch": 2.808074020129152e-05, + "grad_norm": 0.4199008345603943, + "learning_rate": 0.0001950584307178631, + "loss": 0.7038, + "step": 80 + }, + { + "epoch": 2.8431749453807664e-05, + "grad_norm": 0.4378969371318817, + "learning_rate": 0.00019499165275459098, + "loss": 0.6654, + "step": 81 + }, + { + "epoch": 2.878275870632381e-05, + "grad_norm": 0.4653928279876709, + "learning_rate": 0.00019492487479131886, + "loss": 0.6241, + "step": 82 + }, + { + "epoch": 2.9133767958839953e-05, + "grad_norm": 0.5166454911231995, + "learning_rate": 0.00019485809682804673, + "loss": 0.5366, + "step": 83 + }, + { + "epoch": 2.9484777211356096e-05, + "grad_norm": 0.43180733919143677, + "learning_rate": 0.00019479131886477463, + "loss": 0.6178, + "step": 84 + }, + { + "epoch": 2.9835786463872242e-05, + "grad_norm": 0.44828200340270996, + "learning_rate": 0.0001947245409015025, + "loss": 0.6706, + "step": 85 + }, + { + "epoch": 3.0186795716388385e-05, + "grad_norm": 0.384175181388855, + "learning_rate": 0.0001946577629382304, + "loss": 0.5551, + "step": 86 + }, + { + "epoch": 3.053780496890453e-05, + "grad_norm": 0.4359772503376007, + "learning_rate": 0.00019459098497495828, + "loss": 0.5626, + "step": 87 + }, + { + "epoch": 3.0888814221420673e-05, + "grad_norm": 0.4177016615867615, + "learning_rate": 0.00019452420701168615, + "loss": 0.6023, + "step": 88 + }, + { + "epoch": 3.1239823473936816e-05, + "grad_norm": 0.43592438101768494, + "learning_rate": 0.00019445742904841405, + "loss": 0.682, + "step": 89 + }, + { + "epoch": 3.159083272645296e-05, + "grad_norm": 0.48027974367141724, + "learning_rate": 0.00019439065108514192, + "loss": 0.7596, + "step": 90 + }, + { + "epoch": 3.194184197896911e-05, + "grad_norm": 0.35989537835121155, + "learning_rate": 0.0001943238731218698, + "loss": 0.6018, + "step": 91 + }, + { + "epoch": 3.229285123148525e-05, + "grad_norm": 0.48477092385292053, + "learning_rate": 0.00019425709515859767, + "loss": 0.512, + "step": 92 + }, + { + "epoch": 3.2643860484001394e-05, + "grad_norm": 0.38858646154403687, + "learning_rate": 0.00019419031719532554, + "loss": 0.6371, + "step": 93 + }, + { + "epoch": 3.2994869736517536e-05, + "grad_norm": 0.5323147177696228, + "learning_rate": 0.00019412353923205344, + "loss": 0.5221, + "step": 94 + }, + { + "epoch": 3.334587898903368e-05, + "grad_norm": 0.3784274160861969, + "learning_rate": 0.00019405676126878132, + "loss": 0.6158, + "step": 95 + }, + { + "epoch": 3.369688824154982e-05, + "grad_norm": 0.4076334834098816, + "learning_rate": 0.0001939899833055092, + "loss": 0.5535, + "step": 96 + }, + { + "epoch": 3.404789749406597e-05, + "grad_norm": 0.43930479884147644, + "learning_rate": 0.00019392320534223706, + "loss": 0.6482, + "step": 97 + }, + { + "epoch": 3.4398906746582114e-05, + "grad_norm": 0.4266909658908844, + "learning_rate": 0.00019385642737896494, + "loss": 0.6, + "step": 98 + }, + { + "epoch": 3.474991599909826e-05, + "grad_norm": 0.45353513956069946, + "learning_rate": 0.0001937896494156928, + "loss": 0.6596, + "step": 99 + }, + { + "epoch": 3.51009252516144e-05, + "grad_norm": 0.3424838185310364, + "learning_rate": 0.0001937228714524207, + "loss": 0.555, + "step": 100 + }, + { + "epoch": 3.545193450413054e-05, + "grad_norm": 0.40126165747642517, + "learning_rate": 0.00019365609348914858, + "loss": 0.6921, + "step": 101 + }, + { + "epoch": 3.580294375664669e-05, + "grad_norm": 0.36572012305259705, + "learning_rate": 0.00019358931552587646, + "loss": 0.5485, + "step": 102 + }, + { + "epoch": 3.6153953009162834e-05, + "grad_norm": 0.3972407281398773, + "learning_rate": 0.00019352253756260436, + "loss": 0.5884, + "step": 103 + }, + { + "epoch": 3.650496226167898e-05, + "grad_norm": 0.3900579512119293, + "learning_rate": 0.00019345575959933223, + "loss": 0.6664, + "step": 104 + }, + { + "epoch": 3.685597151419512e-05, + "grad_norm": 0.31666621565818787, + "learning_rate": 0.00019338898163606013, + "loss": 0.5009, + "step": 105 + }, + { + "epoch": 3.720698076671126e-05, + "grad_norm": 0.5269597172737122, + "learning_rate": 0.000193322203672788, + "loss": 0.6292, + "step": 106 + }, + { + "epoch": 3.755799001922741e-05, + "grad_norm": 0.4645126163959503, + "learning_rate": 0.00019325542570951588, + "loss": 0.636, + "step": 107 + }, + { + "epoch": 3.7908999271743555e-05, + "grad_norm": 0.3900754153728485, + "learning_rate": 0.00019318864774624375, + "loss": 0.5367, + "step": 108 + }, + { + "epoch": 3.82600085242597e-05, + "grad_norm": 0.42533883452415466, + "learning_rate": 0.00019312186978297162, + "loss": 0.6862, + "step": 109 + }, + { + "epoch": 3.861101777677584e-05, + "grad_norm": 0.6809422969818115, + "learning_rate": 0.00019305509181969952, + "loss": 0.6434, + "step": 110 + }, + { + "epoch": 3.896202702929198e-05, + "grad_norm": 0.5127860307693481, + "learning_rate": 0.0001929883138564274, + "loss": 0.6266, + "step": 111 + }, + { + "epoch": 3.931303628180813e-05, + "grad_norm": 0.5254234671592712, + "learning_rate": 0.00019292153589315527, + "loss": 0.6982, + "step": 112 + }, + { + "epoch": 3.9664045534324275e-05, + "grad_norm": 0.3699031472206116, + "learning_rate": 0.00019285475792988314, + "loss": 0.6037, + "step": 113 + }, + { + "epoch": 4.001505478684042e-05, + "grad_norm": 0.3807130455970764, + "learning_rate": 0.00019278797996661101, + "loss": 0.5861, + "step": 114 + }, + { + "epoch": 4.036606403935656e-05, + "grad_norm": 0.4455645978450775, + "learning_rate": 0.0001927212020033389, + "loss": 0.5658, + "step": 115 + }, + { + "epoch": 4.07170732918727e-05, + "grad_norm": 0.3830210864543915, + "learning_rate": 0.0001926544240400668, + "loss": 0.606, + "step": 116 + }, + { + "epoch": 4.106808254438885e-05, + "grad_norm": 0.41419631242752075, + "learning_rate": 0.00019258764607679466, + "loss": 0.6095, + "step": 117 + }, + { + "epoch": 4.1419091796904995e-05, + "grad_norm": 0.3929574489593506, + "learning_rate": 0.00019252086811352253, + "loss": 0.6464, + "step": 118 + }, + { + "epoch": 4.177010104942114e-05, + "grad_norm": 0.35958629846572876, + "learning_rate": 0.0001924540901502504, + "loss": 0.5185, + "step": 119 + }, + { + "epoch": 4.212111030193728e-05, + "grad_norm": 0.3790556490421295, + "learning_rate": 0.0001923873121869783, + "loss": 0.5156, + "step": 120 + }, + { + "epoch": 4.2472119554453423e-05, + "grad_norm": 0.37452438473701477, + "learning_rate": 0.00019232053422370618, + "loss": 0.5711, + "step": 121 + }, + { + "epoch": 4.282312880696957e-05, + "grad_norm": 0.38976770639419556, + "learning_rate": 0.00019225375626043408, + "loss": 0.6075, + "step": 122 + }, + { + "epoch": 4.3174138059485716e-05, + "grad_norm": 0.4098513424396515, + "learning_rate": 0.00019218697829716195, + "loss": 0.5312, + "step": 123 + }, + { + "epoch": 4.352514731200186e-05, + "grad_norm": 0.33890047669410706, + "learning_rate": 0.00019212020033388983, + "loss": 0.4984, + "step": 124 + }, + { + "epoch": 4.3876156564518e-05, + "grad_norm": 0.49077001214027405, + "learning_rate": 0.0001920534223706177, + "loss": 0.7159, + "step": 125 + }, + { + "epoch": 4.4227165817034144e-05, + "grad_norm": 0.41653814911842346, + "learning_rate": 0.0001919866444073456, + "loss": 0.5642, + "step": 126 + }, + { + "epoch": 4.4578175069550286e-05, + "grad_norm": 0.45710283517837524, + "learning_rate": 0.00019191986644407347, + "loss": 0.6936, + "step": 127 + }, + { + "epoch": 4.4929184322066436e-05, + "grad_norm": 0.36976873874664307, + "learning_rate": 0.00019185308848080135, + "loss": 0.5407, + "step": 128 + }, + { + "epoch": 4.528019357458258e-05, + "grad_norm": 0.42852675914764404, + "learning_rate": 0.00019178631051752922, + "loss": 0.6731, + "step": 129 + }, + { + "epoch": 4.563120282709872e-05, + "grad_norm": 0.5426310300827026, + "learning_rate": 0.0001917195325542571, + "loss": 0.5775, + "step": 130 + }, + { + "epoch": 4.5982212079614864e-05, + "grad_norm": 0.38442543148994446, + "learning_rate": 0.00019165275459098497, + "loss": 0.5994, + "step": 131 + }, + { + "epoch": 4.633322133213101e-05, + "grad_norm": 0.4298035502433777, + "learning_rate": 0.00019158597662771287, + "loss": 0.5563, + "step": 132 + }, + { + "epoch": 4.6684230584647156e-05, + "grad_norm": 0.40397605299949646, + "learning_rate": 0.00019151919866444074, + "loss": 0.6924, + "step": 133 + }, + { + "epoch": 4.70352398371633e-05, + "grad_norm": 0.4338497519493103, + "learning_rate": 0.0001914524207011686, + "loss": 0.5739, + "step": 134 + }, + { + "epoch": 4.738624908967944e-05, + "grad_norm": 0.39713653922080994, + "learning_rate": 0.0001913856427378965, + "loss": 0.4529, + "step": 135 + }, + { + "epoch": 4.7737258342195584e-05, + "grad_norm": 0.31409478187561035, + "learning_rate": 0.0001913188647746244, + "loss": 0.562, + "step": 136 + }, + { + "epoch": 4.808826759471173e-05, + "grad_norm": 0.371624618768692, + "learning_rate": 0.00019125208681135226, + "loss": 0.5288, + "step": 137 + }, + { + "epoch": 4.8439276847227877e-05, + "grad_norm": 0.4600190818309784, + "learning_rate": 0.00019118530884808016, + "loss": 0.6215, + "step": 138 + }, + { + "epoch": 4.879028609974402e-05, + "grad_norm": 0.45351359248161316, + "learning_rate": 0.00019111853088480803, + "loss": 0.686, + "step": 139 + }, + { + "epoch": 4.914129535226016e-05, + "grad_norm": 0.42282962799072266, + "learning_rate": 0.0001910517529215359, + "loss": 0.5966, + "step": 140 + }, + { + "epoch": 4.9492304604776305e-05, + "grad_norm": 0.41479986906051636, + "learning_rate": 0.00019098497495826378, + "loss": 0.5948, + "step": 141 + }, + { + "epoch": 4.984331385729245e-05, + "grad_norm": 0.40453553199768066, + "learning_rate": 0.00019091819699499168, + "loss": 0.6411, + "step": 142 + }, + { + "epoch": 5.01943231098086e-05, + "grad_norm": 0.3939369320869446, + "learning_rate": 0.00019085141903171955, + "loss": 0.5513, + "step": 143 + }, + { + "epoch": 5.054533236232474e-05, + "grad_norm": 0.3700481653213501, + "learning_rate": 0.00019078464106844743, + "loss": 0.5459, + "step": 144 + }, + { + "epoch": 5.089634161484088e-05, + "grad_norm": 0.4377487897872925, + "learning_rate": 0.0001907178631051753, + "loss": 0.6076, + "step": 145 + }, + { + "epoch": 5.1247350867357025e-05, + "grad_norm": 0.37919673323631287, + "learning_rate": 0.00019065108514190317, + "loss": 0.5207, + "step": 146 + }, + { + "epoch": 5.159836011987317e-05, + "grad_norm": 0.3841630816459656, + "learning_rate": 0.00019058430717863107, + "loss": 0.614, + "step": 147 + }, + { + "epoch": 5.194936937238932e-05, + "grad_norm": 0.43541714549064636, + "learning_rate": 0.00019051752921535895, + "loss": 0.6283, + "step": 148 + }, + { + "epoch": 5.230037862490546e-05, + "grad_norm": 0.4853285253047943, + "learning_rate": 0.00019045075125208682, + "loss": 0.5807, + "step": 149 + }, + { + "epoch": 5.26513878774216e-05, + "grad_norm": 0.3572970926761627, + "learning_rate": 0.0001903839732888147, + "loss": 0.6866, + "step": 150 + }, + { + "epoch": 5.3002397129937745e-05, + "grad_norm": 0.3674347698688507, + "learning_rate": 0.00019031719532554257, + "loss": 0.5552, + "step": 151 + }, + { + "epoch": 5.335340638245389e-05, + "grad_norm": 0.37748461961746216, + "learning_rate": 0.00019025041736227044, + "loss": 0.6278, + "step": 152 + }, + { + "epoch": 5.370441563497003e-05, + "grad_norm": 0.3788503408432007, + "learning_rate": 0.00019018363939899834, + "loss": 0.622, + "step": 153 + }, + { + "epoch": 5.405542488748618e-05, + "grad_norm": 0.3736303150653839, + "learning_rate": 0.0001901168614357262, + "loss": 0.5822, + "step": 154 + }, + { + "epoch": 5.440643414000232e-05, + "grad_norm": 0.32680070400238037, + "learning_rate": 0.0001900500834724541, + "loss": 0.5715, + "step": 155 + }, + { + "epoch": 5.4757443392518466e-05, + "grad_norm": 0.34495192766189575, + "learning_rate": 0.00018998330550918199, + "loss": 0.6497, + "step": 156 + }, + { + "epoch": 5.510845264503461e-05, + "grad_norm": 0.4244193136692047, + "learning_rate": 0.00018991652754590986, + "loss": 0.5519, + "step": 157 + }, + { + "epoch": 5.545946189755075e-05, + "grad_norm": 0.4024031162261963, + "learning_rate": 0.00018984974958263776, + "loss": 0.5339, + "step": 158 + }, + { + "epoch": 5.58104711500669e-05, + "grad_norm": 0.46051299571990967, + "learning_rate": 0.00018978297161936563, + "loss": 0.5979, + "step": 159 + }, + { + "epoch": 5.616148040258304e-05, + "grad_norm": 0.49051615595817566, + "learning_rate": 0.0001897161936560935, + "loss": 0.5563, + "step": 160 + }, + { + "epoch": 5.6512489655099186e-05, + "grad_norm": 0.43045854568481445, + "learning_rate": 0.00018964941569282138, + "loss": 0.5984, + "step": 161 + }, + { + "epoch": 5.686349890761533e-05, + "grad_norm": 0.37778228521347046, + "learning_rate": 0.00018958263772954925, + "loss": 0.5955, + "step": 162 + }, + { + "epoch": 5.721450816013147e-05, + "grad_norm": 0.3736341893672943, + "learning_rate": 0.00018951585976627715, + "loss": 0.6438, + "step": 163 + }, + { + "epoch": 5.756551741264762e-05, + "grad_norm": 0.3940117061138153, + "learning_rate": 0.00018944908180300502, + "loss": 0.503, + "step": 164 + }, + { + "epoch": 5.7916526665163763e-05, + "grad_norm": 0.4193519055843353, + "learning_rate": 0.0001893823038397329, + "loss": 0.6324, + "step": 165 + }, + { + "epoch": 5.8267535917679906e-05, + "grad_norm": 0.34481996297836304, + "learning_rate": 0.00018931552587646077, + "loss": 0.5745, + "step": 166 + }, + { + "epoch": 5.861854517019605e-05, + "grad_norm": 0.38285771012306213, + "learning_rate": 0.00018924874791318864, + "loss": 0.639, + "step": 167 + }, + { + "epoch": 5.896955442271219e-05, + "grad_norm": 0.36933982372283936, + "learning_rate": 0.00018918196994991652, + "loss": 0.6681, + "step": 168 + }, + { + "epoch": 5.932056367522834e-05, + "grad_norm": 0.36970776319503784, + "learning_rate": 0.00018911519198664442, + "loss": 0.5626, + "step": 169 + }, + { + "epoch": 5.9671572927744484e-05, + "grad_norm": 0.38494783639907837, + "learning_rate": 0.0001890484140233723, + "loss": 0.6066, + "step": 170 + }, + { + "epoch": 6.0022582180260627e-05, + "grad_norm": 0.3446069061756134, + "learning_rate": 0.00018898163606010016, + "loss": 0.6354, + "step": 171 + }, + { + "epoch": 6.037359143277677e-05, + "grad_norm": 0.4466759264469147, + "learning_rate": 0.00018891485809682806, + "loss": 0.4737, + "step": 172 + }, + { + "epoch": 6.072460068529291e-05, + "grad_norm": 0.43630918860435486, + "learning_rate": 0.00018884808013355594, + "loss": 0.6839, + "step": 173 + }, + { + "epoch": 6.107560993780906e-05, + "grad_norm": 0.37083202600479126, + "learning_rate": 0.00018878130217028384, + "loss": 0.5372, + "step": 174 + }, + { + "epoch": 6.14266191903252e-05, + "grad_norm": 0.37066200375556946, + "learning_rate": 0.0001887145242070117, + "loss": 0.6653, + "step": 175 + }, + { + "epoch": 6.177762844284135e-05, + "grad_norm": 0.5191747546195984, + "learning_rate": 0.00018864774624373958, + "loss": 0.6677, + "step": 176 + }, + { + "epoch": 6.21286376953575e-05, + "grad_norm": 0.4235158860683441, + "learning_rate": 0.00018858096828046746, + "loss": 0.5971, + "step": 177 + }, + { + "epoch": 6.247964694787363e-05, + "grad_norm": 0.405074805021286, + "learning_rate": 0.00018851419031719533, + "loss": 0.5717, + "step": 178 + }, + { + "epoch": 6.283065620038978e-05, + "grad_norm": 0.45817336440086365, + "learning_rate": 0.00018844741235392323, + "loss": 0.5878, + "step": 179 + }, + { + "epoch": 6.318166545290592e-05, + "grad_norm": 0.6313037276268005, + "learning_rate": 0.0001883806343906511, + "loss": 0.62, + "step": 180 + }, + { + "epoch": 6.353267470542207e-05, + "grad_norm": 0.41896742582321167, + "learning_rate": 0.00018831385642737898, + "loss": 0.5565, + "step": 181 + }, + { + "epoch": 6.388368395793822e-05, + "grad_norm": 0.4143432676792145, + "learning_rate": 0.00018824707846410685, + "loss": 0.5552, + "step": 182 + }, + { + "epoch": 6.423469321045435e-05, + "grad_norm": 0.38745641708374023, + "learning_rate": 0.00018818030050083472, + "loss": 0.5949, + "step": 183 + }, + { + "epoch": 6.45857024629705e-05, + "grad_norm": 0.7472612261772156, + "learning_rate": 0.0001881135225375626, + "loss": 0.6708, + "step": 184 + }, + { + "epoch": 6.493671171548664e-05, + "grad_norm": 0.4416198432445526, + "learning_rate": 0.0001880467445742905, + "loss": 0.6069, + "step": 185 + }, + { + "epoch": 6.528772096800279e-05, + "grad_norm": 0.4312993884086609, + "learning_rate": 0.00018797996661101837, + "loss": 0.5778, + "step": 186 + }, + { + "epoch": 6.563873022051894e-05, + "grad_norm": 0.4524860978126526, + "learning_rate": 0.00018791318864774624, + "loss": 0.5091, + "step": 187 + }, + { + "epoch": 6.598973947303507e-05, + "grad_norm": 0.4320828914642334, + "learning_rate": 0.00018784641068447412, + "loss": 0.6557, + "step": 188 + }, + { + "epoch": 6.634074872555122e-05, + "grad_norm": 0.6967452168464661, + "learning_rate": 0.00018777963272120202, + "loss": 0.612, + "step": 189 + }, + { + "epoch": 6.669175797806736e-05, + "grad_norm": 0.4389924705028534, + "learning_rate": 0.0001877128547579299, + "loss": 0.6271, + "step": 190 + }, + { + "epoch": 6.704276723058351e-05, + "grad_norm": 0.3693922162055969, + "learning_rate": 0.0001876460767946578, + "loss": 0.6715, + "step": 191 + }, + { + "epoch": 6.739377648309964e-05, + "grad_norm": 0.32230404019355774, + "learning_rate": 0.00018757929883138566, + "loss": 0.6344, + "step": 192 + }, + { + "epoch": 6.774478573561579e-05, + "grad_norm": 0.4440002143383026, + "learning_rate": 0.00018751252086811354, + "loss": 0.6671, + "step": 193 + }, + { + "epoch": 6.809579498813194e-05, + "grad_norm": 0.5676587820053101, + "learning_rate": 0.0001874457429048414, + "loss": 0.6818, + "step": 194 + }, + { + "epoch": 6.844680424064808e-05, + "grad_norm": 0.36207348108291626, + "learning_rate": 0.0001873789649415693, + "loss": 0.5029, + "step": 195 + }, + { + "epoch": 6.879781349316423e-05, + "grad_norm": 0.35714131593704224, + "learning_rate": 0.00018731218697829718, + "loss": 0.6127, + "step": 196 + }, + { + "epoch": 6.914882274568036e-05, + "grad_norm": 0.4285273551940918, + "learning_rate": 0.00018724540901502506, + "loss": 0.6355, + "step": 197 + }, + { + "epoch": 6.949983199819651e-05, + "grad_norm": 0.42585939168930054, + "learning_rate": 0.00018717863105175293, + "loss": 0.6302, + "step": 198 + }, + { + "epoch": 6.985084125071266e-05, + "grad_norm": 0.524303138256073, + "learning_rate": 0.0001871118530884808, + "loss": 0.6683, + "step": 199 + }, + { + "epoch": 7.02018505032288e-05, + "grad_norm": 0.39635923504829407, + "learning_rate": 0.00018704507512520868, + "loss": 0.6694, + "step": 200 + }, + { + "epoch": 7.055285975574495e-05, + "grad_norm": 0.39712437987327576, + "learning_rate": 0.00018697829716193658, + "loss": 0.5794, + "step": 201 + }, + { + "epoch": 7.090386900826108e-05, + "grad_norm": 0.4115397334098816, + "learning_rate": 0.00018691151919866445, + "loss": 0.5579, + "step": 202 + }, + { + "epoch": 7.125487826077723e-05, + "grad_norm": 0.4776385724544525, + "learning_rate": 0.00018684474123539232, + "loss": 0.5589, + "step": 203 + }, + { + "epoch": 7.160588751329338e-05, + "grad_norm": 0.35574638843536377, + "learning_rate": 0.0001867779632721202, + "loss": 0.5311, + "step": 204 + }, + { + "epoch": 7.195689676580952e-05, + "grad_norm": 0.44872432947158813, + "learning_rate": 0.00018671118530884807, + "loss": 0.635, + "step": 205 + }, + { + "epoch": 7.230790601832567e-05, + "grad_norm": 0.3511079251766205, + "learning_rate": 0.00018664440734557597, + "loss": 0.5317, + "step": 206 + }, + { + "epoch": 7.26589152708418e-05, + "grad_norm": 0.39862194657325745, + "learning_rate": 0.00018657762938230384, + "loss": 0.6653, + "step": 207 + }, + { + "epoch": 7.300992452335795e-05, + "grad_norm": 0.4046575725078583, + "learning_rate": 0.00018651085141903174, + "loss": 0.6065, + "step": 208 + }, + { + "epoch": 7.33609337758741e-05, + "grad_norm": 0.4231868088245392, + "learning_rate": 0.00018644407345575962, + "loss": 0.7078, + "step": 209 + }, + { + "epoch": 7.371194302839024e-05, + "grad_norm": 0.364700049161911, + "learning_rate": 0.0001863772954924875, + "loss": 0.6309, + "step": 210 + }, + { + "epoch": 7.406295228090639e-05, + "grad_norm": 0.5385531187057495, + "learning_rate": 0.0001863105175292154, + "loss": 0.4233, + "step": 211 + }, + { + "epoch": 7.441396153342252e-05, + "grad_norm": 0.39415115118026733, + "learning_rate": 0.00018624373956594326, + "loss": 0.5928, + "step": 212 + }, + { + "epoch": 7.476497078593867e-05, + "grad_norm": 0.6021363735198975, + "learning_rate": 0.00018617696160267113, + "loss": 0.6611, + "step": 213 + }, + { + "epoch": 7.511598003845482e-05, + "grad_norm": 0.3709903061389923, + "learning_rate": 0.000186110183639399, + "loss": 0.6136, + "step": 214 + }, + { + "epoch": 7.546698929097096e-05, + "grad_norm": 0.36710435152053833, + "learning_rate": 0.00018604340567612688, + "loss": 0.5267, + "step": 215 + }, + { + "epoch": 7.581799854348711e-05, + "grad_norm": 0.4379352033138275, + "learning_rate": 0.00018597662771285475, + "loss": 0.6429, + "step": 216 + }, + { + "epoch": 7.616900779600325e-05, + "grad_norm": 0.3408482074737549, + "learning_rate": 0.00018590984974958265, + "loss": 0.5379, + "step": 217 + }, + { + "epoch": 7.65200170485194e-05, + "grad_norm": 0.4487043023109436, + "learning_rate": 0.00018584307178631053, + "loss": 0.6582, + "step": 218 + }, + { + "epoch": 7.687102630103554e-05, + "grad_norm": 0.42003679275512695, + "learning_rate": 0.0001857762938230384, + "loss": 0.5712, + "step": 219 + }, + { + "epoch": 7.722203555355168e-05, + "grad_norm": 0.4698665738105774, + "learning_rate": 0.00018570951585976627, + "loss": 0.5715, + "step": 220 + }, + { + "epoch": 7.757304480606783e-05, + "grad_norm": 0.3777780830860138, + "learning_rate": 0.00018564273789649415, + "loss": 0.4667, + "step": 221 + }, + { + "epoch": 7.792405405858397e-05, + "grad_norm": 0.36794212460517883, + "learning_rate": 0.00018557595993322205, + "loss": 0.5382, + "step": 222 + }, + { + "epoch": 7.827506331110012e-05, + "grad_norm": 0.4582989513874054, + "learning_rate": 0.00018550918196994992, + "loss": 0.6437, + "step": 223 + }, + { + "epoch": 7.862607256361626e-05, + "grad_norm": 0.4065852761268616, + "learning_rate": 0.0001854424040066778, + "loss": 0.6928, + "step": 224 + }, + { + "epoch": 7.89770818161324e-05, + "grad_norm": 0.3857649564743042, + "learning_rate": 0.0001853756260434057, + "loss": 0.5405, + "step": 225 + }, + { + "epoch": 7.932809106864855e-05, + "grad_norm": 0.40056589245796204, + "learning_rate": 0.00018530884808013357, + "loss": 0.6425, + "step": 226 + }, + { + "epoch": 7.967910032116469e-05, + "grad_norm": 0.43137016892433167, + "learning_rate": 0.00018524207011686147, + "loss": 0.5001, + "step": 227 + }, + { + "epoch": 8.003010957368084e-05, + "grad_norm": 0.3723987340927124, + "learning_rate": 0.00018517529215358934, + "loss": 0.5118, + "step": 228 + }, + { + "epoch": 8.038111882619698e-05, + "grad_norm": 0.34196361899375916, + "learning_rate": 0.00018510851419031721, + "loss": 0.5468, + "step": 229 + }, + { + "epoch": 8.073212807871312e-05, + "grad_norm": 0.4319117069244385, + "learning_rate": 0.0001850417362270451, + "loss": 0.5703, + "step": 230 + }, + { + "epoch": 8.108313733122927e-05, + "grad_norm": 0.4467247724533081, + "learning_rate": 0.00018497495826377296, + "loss": 0.6536, + "step": 231 + }, + { + "epoch": 8.14341465837454e-05, + "grad_norm": 0.3569909632205963, + "learning_rate": 0.00018490818030050083, + "loss": 0.5335, + "step": 232 + }, + { + "epoch": 8.178515583626156e-05, + "grad_norm": 0.33486437797546387, + "learning_rate": 0.00018484140233722873, + "loss": 0.6803, + "step": 233 + }, + { + "epoch": 8.21361650887777e-05, + "grad_norm": 0.3783140480518341, + "learning_rate": 0.0001847746243739566, + "loss": 0.6361, + "step": 234 + }, + { + "epoch": 8.248717434129384e-05, + "grad_norm": 0.4844662547111511, + "learning_rate": 0.00018470784641068448, + "loss": 0.5322, + "step": 235 + }, + { + "epoch": 8.283818359380999e-05, + "grad_norm": 0.508406400680542, + "learning_rate": 0.00018464106844741235, + "loss": 0.6676, + "step": 236 + }, + { + "epoch": 8.318919284632613e-05, + "grad_norm": 0.3710225820541382, + "learning_rate": 0.00018457429048414023, + "loss": 0.6656, + "step": 237 + }, + { + "epoch": 8.354020209884228e-05, + "grad_norm": 0.3757292628288269, + "learning_rate": 0.00018450751252086813, + "loss": 0.6095, + "step": 238 + }, + { + "epoch": 8.389121135135843e-05, + "grad_norm": 0.40651261806488037, + "learning_rate": 0.000184440734557596, + "loss": 0.6626, + "step": 239 + }, + { + "epoch": 8.424222060387456e-05, + "grad_norm": 0.40700778365135193, + "learning_rate": 0.00018437395659432387, + "loss": 0.5328, + "step": 240 + }, + { + "epoch": 8.459322985639071e-05, + "grad_norm": 0.5067440867424011, + "learning_rate": 0.00018430717863105175, + "loss": 0.4811, + "step": 241 + }, + { + "epoch": 8.494423910890685e-05, + "grad_norm": 0.3934602737426758, + "learning_rate": 0.00018424040066777965, + "loss": 0.5691, + "step": 242 + }, + { + "epoch": 8.5295248361423e-05, + "grad_norm": 0.3360019624233246, + "learning_rate": 0.00018417362270450752, + "loss": 0.5542, + "step": 243 + }, + { + "epoch": 8.564625761393915e-05, + "grad_norm": 0.4023631513118744, + "learning_rate": 0.00018410684474123542, + "loss": 0.5192, + "step": 244 + }, + { + "epoch": 8.599726686645528e-05, + "grad_norm": 0.41704171895980835, + "learning_rate": 0.0001840400667779633, + "loss": 0.5018, + "step": 245 + }, + { + "epoch": 8.634827611897143e-05, + "grad_norm": 0.361977756023407, + "learning_rate": 0.00018397328881469117, + "loss": 0.6193, + "step": 246 + }, + { + "epoch": 8.669928537148757e-05, + "grad_norm": 0.37774717807769775, + "learning_rate": 0.00018390651085141904, + "loss": 0.5552, + "step": 247 + }, + { + "epoch": 8.705029462400372e-05, + "grad_norm": 0.3408471941947937, + "learning_rate": 0.0001838397328881469, + "loss": 0.5876, + "step": 248 + }, + { + "epoch": 8.740130387651985e-05, + "grad_norm": 0.3892226815223694, + "learning_rate": 0.0001837729549248748, + "loss": 0.4227, + "step": 249 + }, + { + "epoch": 8.7752313129036e-05, + "grad_norm": 0.5315036177635193, + "learning_rate": 0.00018370617696160269, + "loss": 0.5826, + "step": 250 + }, + { + "epoch": 8.810332238155215e-05, + "grad_norm": 0.35433024168014526, + "learning_rate": 0.00018363939899833056, + "loss": 0.5992, + "step": 251 + }, + { + "epoch": 8.845433163406829e-05, + "grad_norm": 0.34777382016181946, + "learning_rate": 0.00018357262103505843, + "loss": 0.4973, + "step": 252 + }, + { + "epoch": 8.880534088658444e-05, + "grad_norm": 0.3936387002468109, + "learning_rate": 0.0001835058430717863, + "loss": 0.6254, + "step": 253 + }, + { + "epoch": 8.915635013910057e-05, + "grad_norm": 0.4009217917919159, + "learning_rate": 0.0001834390651085142, + "loss": 0.4843, + "step": 254 + }, + { + "epoch": 8.950735939161672e-05, + "grad_norm": 0.4863683879375458, + "learning_rate": 0.00018337228714524208, + "loss": 0.5204, + "step": 255 + }, + { + "epoch": 8.985836864413287e-05, + "grad_norm": 0.6100988984107971, + "learning_rate": 0.00018330550918196995, + "loss": 0.7296, + "step": 256 + }, + { + "epoch": 9.020937789664901e-05, + "grad_norm": 0.40949374437332153, + "learning_rate": 0.00018323873121869782, + "loss": 0.5707, + "step": 257 + }, + { + "epoch": 9.056038714916516e-05, + "grad_norm": 0.47316402196884155, + "learning_rate": 0.0001831719532554257, + "loss": 0.6655, + "step": 258 + }, + { + "epoch": 9.091139640168129e-05, + "grad_norm": 0.4053696393966675, + "learning_rate": 0.0001831051752921536, + "loss": 0.5822, + "step": 259 + }, + { + "epoch": 9.126240565419744e-05, + "grad_norm": 0.4582972228527069, + "learning_rate": 0.00018303839732888147, + "loss": 0.5475, + "step": 260 + }, + { + "epoch": 9.161341490671359e-05, + "grad_norm": 0.38666802644729614, + "learning_rate": 0.00018297161936560937, + "loss": 0.4744, + "step": 261 + }, + { + "epoch": 9.196442415922973e-05, + "grad_norm": 0.31954991817474365, + "learning_rate": 0.00018290484140233724, + "loss": 0.6337, + "step": 262 + }, + { + "epoch": 9.231543341174588e-05, + "grad_norm": 0.3590424358844757, + "learning_rate": 0.00018283806343906512, + "loss": 0.5683, + "step": 263 + }, + { + "epoch": 9.266644266426201e-05, + "grad_norm": 0.4042195975780487, + "learning_rate": 0.000182771285475793, + "loss": 0.6142, + "step": 264 + }, + { + "epoch": 9.301745191677816e-05, + "grad_norm": 0.3474234342575073, + "learning_rate": 0.0001827045075125209, + "loss": 0.6035, + "step": 265 + }, + { + "epoch": 9.336846116929431e-05, + "grad_norm": 0.337091326713562, + "learning_rate": 0.00018263772954924876, + "loss": 0.6107, + "step": 266 + }, + { + "epoch": 9.371947042181045e-05, + "grad_norm": 0.3313732445240021, + "learning_rate": 0.00018257095158597664, + "loss": 0.6491, + "step": 267 + }, + { + "epoch": 9.40704796743266e-05, + "grad_norm": 0.3931679129600525, + "learning_rate": 0.0001825041736227045, + "loss": 0.5492, + "step": 268 + }, + { + "epoch": 9.442148892684273e-05, + "grad_norm": 0.5848420262336731, + "learning_rate": 0.00018243739565943238, + "loss": 0.7091, + "step": 269 + }, + { + "epoch": 9.477249817935888e-05, + "grad_norm": 0.4851846992969513, + "learning_rate": 0.00018237061769616028, + "loss": 0.5856, + "step": 270 + }, + { + "epoch": 9.512350743187503e-05, + "grad_norm": 0.3434993326663971, + "learning_rate": 0.00018230383973288816, + "loss": 0.5085, + "step": 271 + }, + { + "epoch": 9.547451668439117e-05, + "grad_norm": 0.2978988587856293, + "learning_rate": 0.00018223706176961603, + "loss": 0.481, + "step": 272 + }, + { + "epoch": 9.582552593690732e-05, + "grad_norm": 0.34215858578681946, + "learning_rate": 0.0001821702838063439, + "loss": 0.5723, + "step": 273 + }, + { + "epoch": 9.617653518942345e-05, + "grad_norm": 0.43445509672164917, + "learning_rate": 0.00018210350584307178, + "loss": 0.5691, + "step": 274 + }, + { + "epoch": 9.65275444419396e-05, + "grad_norm": 0.36094945669174194, + "learning_rate": 0.00018203672787979968, + "loss": 0.5543, + "step": 275 + }, + { + "epoch": 9.687855369445575e-05, + "grad_norm": 0.386106014251709, + "learning_rate": 0.00018196994991652755, + "loss": 0.5561, + "step": 276 + }, + { + "epoch": 9.722956294697189e-05, + "grad_norm": 0.36676689982414246, + "learning_rate": 0.00018190317195325542, + "loss": 0.5479, + "step": 277 + }, + { + "epoch": 9.758057219948804e-05, + "grad_norm": 0.37988394498825073, + "learning_rate": 0.00018183639398998332, + "loss": 0.5772, + "step": 278 + }, + { + "epoch": 9.793158145200417e-05, + "grad_norm": 0.4024789035320282, + "learning_rate": 0.0001817696160267112, + "loss": 0.6065, + "step": 279 + }, + { + "epoch": 9.828259070452032e-05, + "grad_norm": 0.3697255551815033, + "learning_rate": 0.0001817028380634391, + "loss": 0.5021, + "step": 280 + }, + { + "epoch": 9.863359995703647e-05, + "grad_norm": 0.43579426407814026, + "learning_rate": 0.00018163606010016697, + "loss": 0.555, + "step": 281 + }, + { + "epoch": 9.898460920955261e-05, + "grad_norm": 0.4760832190513611, + "learning_rate": 0.00018156928213689484, + "loss": 0.6438, + "step": 282 + }, + { + "epoch": 9.933561846206876e-05, + "grad_norm": 0.45258408784866333, + "learning_rate": 0.00018150250417362272, + "loss": 0.4717, + "step": 283 + }, + { + "epoch": 9.96866277145849e-05, + "grad_norm": 0.428108274936676, + "learning_rate": 0.0001814357262103506, + "loss": 0.6029, + "step": 284 + }, + { + "epoch": 0.00010003763696710104, + "grad_norm": 0.3999852240085602, + "learning_rate": 0.00018136894824707846, + "loss": 0.4524, + "step": 285 + }, + { + "epoch": 0.0001003886462196172, + "grad_norm": 0.44319403171539307, + "learning_rate": 0.00018130217028380636, + "loss": 0.6619, + "step": 286 + }, + { + "epoch": 0.00010073965547213333, + "grad_norm": 0.43008357286453247, + "learning_rate": 0.00018123539232053424, + "loss": 0.6105, + "step": 287 + }, + { + "epoch": 0.00010109066472464948, + "grad_norm": 0.38037821650505066, + "learning_rate": 0.0001811686143572621, + "loss": 0.6649, + "step": 288 + }, + { + "epoch": 0.00010144167397716562, + "grad_norm": 0.3713517487049103, + "learning_rate": 0.00018110183639398998, + "loss": 0.6381, + "step": 289 + }, + { + "epoch": 0.00010179268322968176, + "grad_norm": 0.3437170386314392, + "learning_rate": 0.00018103505843071786, + "loss": 0.4563, + "step": 290 + }, + { + "epoch": 0.00010214369248219791, + "grad_norm": 0.3661468029022217, + "learning_rate": 0.00018096828046744576, + "loss": 0.606, + "step": 291 + }, + { + "epoch": 0.00010249470173471405, + "grad_norm": 0.36346200108528137, + "learning_rate": 0.00018090150250417363, + "loss": 0.5895, + "step": 292 + }, + { + "epoch": 0.0001028457109872302, + "grad_norm": 0.31052225828170776, + "learning_rate": 0.0001808347245409015, + "loss": 0.4409, + "step": 293 + }, + { + "epoch": 0.00010319672023974634, + "grad_norm": 0.37012970447540283, + "learning_rate": 0.00018076794657762938, + "loss": 0.505, + "step": 294 + }, + { + "epoch": 0.00010354772949226248, + "grad_norm": 0.3958667814731598, + "learning_rate": 0.00018070116861435728, + "loss": 0.5371, + "step": 295 + }, + { + "epoch": 0.00010389873874477863, + "grad_norm": 0.4892179071903229, + "learning_rate": 0.00018063439065108515, + "loss": 0.6737, + "step": 296 + }, + { + "epoch": 0.00010424974799729477, + "grad_norm": 0.41874751448631287, + "learning_rate": 0.00018056761268781305, + "loss": 0.651, + "step": 297 + }, + { + "epoch": 0.00010460075724981092, + "grad_norm": 0.4167911410331726, + "learning_rate": 0.00018050083472454092, + "loss": 0.5531, + "step": 298 + }, + { + "epoch": 0.00010495176650232706, + "grad_norm": 0.3758225440979004, + "learning_rate": 0.0001804340567612688, + "loss": 0.6285, + "step": 299 + }, + { + "epoch": 0.0001053027757548432, + "grad_norm": 0.3688598573207855, + "learning_rate": 0.00018036727879799667, + "loss": 0.5219, + "step": 300 + }, + { + "epoch": 0.00010565378500735934, + "grad_norm": 0.3501751124858856, + "learning_rate": 0.00018030050083472454, + "loss": 0.6351, + "step": 301 + }, + { + "epoch": 0.00010600479425987549, + "grad_norm": 0.42876511812210083, + "learning_rate": 0.00018023372287145244, + "loss": 0.544, + "step": 302 + }, + { + "epoch": 0.00010635580351239164, + "grad_norm": 0.47046172618865967, + "learning_rate": 0.00018016694490818031, + "loss": 0.6304, + "step": 303 + }, + { + "epoch": 0.00010670681276490778, + "grad_norm": 0.402271032333374, + "learning_rate": 0.0001801001669449082, + "loss": 0.5039, + "step": 304 + }, + { + "epoch": 0.00010705782201742393, + "grad_norm": 0.41232413053512573, + "learning_rate": 0.00018003338898163606, + "loss": 0.5892, + "step": 305 + }, + { + "epoch": 0.00010740883126994006, + "grad_norm": 0.3628154993057251, + "learning_rate": 0.00017996661101836393, + "loss": 0.5737, + "step": 306 + }, + { + "epoch": 0.00010775984052245621, + "grad_norm": 0.4291020631790161, + "learning_rate": 0.00017989983305509183, + "loss": 0.6597, + "step": 307 + }, + { + "epoch": 0.00010811084977497236, + "grad_norm": 0.33218181133270264, + "learning_rate": 0.0001798330550918197, + "loss": 0.5726, + "step": 308 + }, + { + "epoch": 0.0001084618590274885, + "grad_norm": 0.3439387381076813, + "learning_rate": 0.00017976627712854758, + "loss": 0.5615, + "step": 309 + }, + { + "epoch": 0.00010881286828000465, + "grad_norm": 0.3523644208908081, + "learning_rate": 0.00017969949916527545, + "loss": 0.4968, + "step": 310 + }, + { + "epoch": 0.00010916387753252078, + "grad_norm": 0.4045630991458893, + "learning_rate": 0.00017963272120200333, + "loss": 0.6425, + "step": 311 + }, + { + "epoch": 0.00010951488678503693, + "grad_norm": 0.3726767599582672, + "learning_rate": 0.00017956594323873123, + "loss": 0.6575, + "step": 312 + }, + { + "epoch": 0.00010986589603755308, + "grad_norm": 0.32131972908973694, + "learning_rate": 0.0001794991652754591, + "loss": 0.5146, + "step": 313 + }, + { + "epoch": 0.00011021690529006922, + "grad_norm": 0.5013764500617981, + "learning_rate": 0.000179432387312187, + "loss": 0.53, + "step": 314 + }, + { + "epoch": 0.00011056791454258537, + "grad_norm": 0.36830246448516846, + "learning_rate": 0.00017936560934891487, + "loss": 0.6291, + "step": 315 + }, + { + "epoch": 0.0001109189237951015, + "grad_norm": 0.3587378263473511, + "learning_rate": 0.00017929883138564275, + "loss": 0.4954, + "step": 316 + }, + { + "epoch": 0.00011126993304761765, + "grad_norm": 0.3480195105075836, + "learning_rate": 0.00017923205342237062, + "loss": 0.606, + "step": 317 + }, + { + "epoch": 0.0001116209423001338, + "grad_norm": 0.38415858149528503, + "learning_rate": 0.00017916527545909852, + "loss": 0.7281, + "step": 318 + }, + { + "epoch": 0.00011197195155264994, + "grad_norm": 0.35853826999664307, + "learning_rate": 0.0001790984974958264, + "loss": 0.5851, + "step": 319 + }, + { + "epoch": 0.00011232296080516609, + "grad_norm": 0.42092210054397583, + "learning_rate": 0.00017903171953255427, + "loss": 0.5324, + "step": 320 + }, + { + "epoch": 0.00011267397005768222, + "grad_norm": 0.34538987278938293, + "learning_rate": 0.00017896494156928214, + "loss": 0.6387, + "step": 321 + }, + { + "epoch": 0.00011302497931019837, + "grad_norm": 0.38299745321273804, + "learning_rate": 0.00017889816360601, + "loss": 0.6013, + "step": 322 + }, + { + "epoch": 0.00011337598856271452, + "grad_norm": 0.32100436091423035, + "learning_rate": 0.0001788313856427379, + "loss": 0.4627, + "step": 323 + }, + { + "epoch": 0.00011372699781523066, + "grad_norm": 0.3458426594734192, + "learning_rate": 0.0001787646076794658, + "loss": 0.5865, + "step": 324 + }, + { + "epoch": 0.0001140780070677468, + "grad_norm": 0.33228665590286255, + "learning_rate": 0.00017869782971619366, + "loss": 0.4611, + "step": 325 + }, + { + "epoch": 0.00011442901632026294, + "grad_norm": 0.38747021555900574, + "learning_rate": 0.00017863105175292153, + "loss": 0.5777, + "step": 326 + }, + { + "epoch": 0.00011478002557277909, + "grad_norm": 0.3888608515262604, + "learning_rate": 0.0001785642737896494, + "loss": 0.5664, + "step": 327 + }, + { + "epoch": 0.00011513103482529524, + "grad_norm": 0.4084737002849579, + "learning_rate": 0.0001784974958263773, + "loss": 0.5939, + "step": 328 + }, + { + "epoch": 0.00011548204407781138, + "grad_norm": 0.4964492917060852, + "learning_rate": 0.00017843071786310518, + "loss": 0.6256, + "step": 329 + }, + { + "epoch": 0.00011583305333032753, + "grad_norm": 0.37329745292663574, + "learning_rate": 0.00017836393989983305, + "loss": 0.5388, + "step": 330 + }, + { + "epoch": 0.00011618406258284366, + "grad_norm": 0.37680140137672424, + "learning_rate": 0.00017829716193656095, + "loss": 0.6203, + "step": 331 + }, + { + "epoch": 0.00011653507183535981, + "grad_norm": 0.4162957966327667, + "learning_rate": 0.00017823038397328883, + "loss": 0.6478, + "step": 332 + }, + { + "epoch": 0.00011688608108787596, + "grad_norm": 0.3473896086215973, + "learning_rate": 0.0001781636060100167, + "loss": 0.589, + "step": 333 + }, + { + "epoch": 0.0001172370903403921, + "grad_norm": 0.4039511978626251, + "learning_rate": 0.0001780968280467446, + "loss": 0.5681, + "step": 334 + }, + { + "epoch": 0.00011758809959290825, + "grad_norm": 0.3135715425014496, + "learning_rate": 0.00017803005008347247, + "loss": 0.5069, + "step": 335 + }, + { + "epoch": 0.00011793910884542438, + "grad_norm": 0.4296559989452362, + "learning_rate": 0.00017796327212020035, + "loss": 0.5413, + "step": 336 + }, + { + "epoch": 0.00011829011809794053, + "grad_norm": 0.4197536110877991, + "learning_rate": 0.00017789649415692822, + "loss": 0.694, + "step": 337 + }, + { + "epoch": 0.00011864112735045668, + "grad_norm": 0.3633468449115753, + "learning_rate": 0.0001778297161936561, + "loss": 0.5475, + "step": 338 + }, + { + "epoch": 0.00011899213660297282, + "grad_norm": 0.2867147922515869, + "learning_rate": 0.000177762938230384, + "loss": 0.485, + "step": 339 + }, + { + "epoch": 0.00011934314585548897, + "grad_norm": 0.3445490300655365, + "learning_rate": 0.00017769616026711187, + "loss": 0.6304, + "step": 340 + }, + { + "epoch": 0.0001196941551080051, + "grad_norm": 0.31692221760749817, + "learning_rate": 0.00017762938230383974, + "loss": 0.5804, + "step": 341 + }, + { + "epoch": 0.00012004516436052125, + "grad_norm": 0.31391167640686035, + "learning_rate": 0.0001775626043405676, + "loss": 0.5945, + "step": 342 + }, + { + "epoch": 0.0001203961736130374, + "grad_norm": 0.3484472632408142, + "learning_rate": 0.00017749582637729548, + "loss": 0.6577, + "step": 343 + }, + { + "epoch": 0.00012074718286555354, + "grad_norm": 0.37430596351623535, + "learning_rate": 0.00017742904841402339, + "loss": 0.6854, + "step": 344 + }, + { + "epoch": 0.00012109819211806969, + "grad_norm": 0.34305211901664734, + "learning_rate": 0.00017736227045075126, + "loss": 0.5123, + "step": 345 + }, + { + "epoch": 0.00012144920137058582, + "grad_norm": 0.3398534059524536, + "learning_rate": 0.00017729549248747913, + "loss": 0.5602, + "step": 346 + }, + { + "epoch": 0.00012180021062310197, + "grad_norm": 0.4278014600276947, + "learning_rate": 0.000177228714524207, + "loss": 0.5152, + "step": 347 + }, + { + "epoch": 0.00012215121987561812, + "grad_norm": 0.4011085629463196, + "learning_rate": 0.0001771619365609349, + "loss": 0.6217, + "step": 348 + }, + { + "epoch": 0.00012250222912813427, + "grad_norm": 0.3425695598125458, + "learning_rate": 0.00017709515859766278, + "loss": 0.5037, + "step": 349 + }, + { + "epoch": 0.0001228532383806504, + "grad_norm": 0.34036242961883545, + "learning_rate": 0.00017702838063439068, + "loss": 0.649, + "step": 350 + }, + { + "epoch": 0.00012320424763316654, + "grad_norm": 0.5631874203681946, + "learning_rate": 0.00017696160267111855, + "loss": 0.5656, + "step": 351 + }, + { + "epoch": 0.0001235552568856827, + "grad_norm": 0.4195176661014557, + "learning_rate": 0.00017689482470784642, + "loss": 0.6899, + "step": 352 + }, + { + "epoch": 0.00012390626613819884, + "grad_norm": 0.41814154386520386, + "learning_rate": 0.0001768280467445743, + "loss": 0.551, + "step": 353 + }, + { + "epoch": 0.000124257275390715, + "grad_norm": 0.3374340534210205, + "learning_rate": 0.00017676126878130217, + "loss": 0.7022, + "step": 354 + }, + { + "epoch": 0.00012460828464323112, + "grad_norm": 0.41464921832084656, + "learning_rate": 0.00017669449081803007, + "loss": 0.5301, + "step": 355 + }, + { + "epoch": 0.00012495929389574726, + "grad_norm": 0.4443178176879883, + "learning_rate": 0.00017662771285475794, + "loss": 0.5487, + "step": 356 + }, + { + "epoch": 0.00012531030314826341, + "grad_norm": 0.3389272093772888, + "learning_rate": 0.00017656093489148582, + "loss": 0.581, + "step": 357 + }, + { + "epoch": 0.00012566131240077956, + "grad_norm": 0.29650986194610596, + "learning_rate": 0.0001764941569282137, + "loss": 0.5801, + "step": 358 + }, + { + "epoch": 0.0001260123216532957, + "grad_norm": 0.40271905064582825, + "learning_rate": 0.00017642737896494156, + "loss": 0.6738, + "step": 359 + }, + { + "epoch": 0.00012636333090581184, + "grad_norm": 0.352225661277771, + "learning_rate": 0.00017636060100166946, + "loss": 0.5727, + "step": 360 + }, + { + "epoch": 0.00012671434015832798, + "grad_norm": 0.3469563126564026, + "learning_rate": 0.00017629382303839734, + "loss": 0.5188, + "step": 361 + }, + { + "epoch": 0.00012706534941084413, + "grad_norm": 0.30644670128822327, + "learning_rate": 0.0001762270450751252, + "loss": 0.497, + "step": 362 + }, + { + "epoch": 0.00012741635866336028, + "grad_norm": 0.3472917377948761, + "learning_rate": 0.00017616026711185308, + "loss": 0.6363, + "step": 363 + }, + { + "epoch": 0.00012776736791587643, + "grad_norm": 0.37184756994247437, + "learning_rate": 0.00017609348914858096, + "loss": 0.5223, + "step": 364 + }, + { + "epoch": 0.00012811837716839256, + "grad_norm": 0.3247138559818268, + "learning_rate": 0.00017602671118530886, + "loss": 0.5457, + "step": 365 + }, + { + "epoch": 0.0001284693864209087, + "grad_norm": 0.5236158967018127, + "learning_rate": 0.00017595993322203673, + "loss": 0.615, + "step": 366 + }, + { + "epoch": 0.00012882039567342485, + "grad_norm": 0.33708465099334717, + "learning_rate": 0.00017589315525876463, + "loss": 0.6163, + "step": 367 + }, + { + "epoch": 0.000129171404925941, + "grad_norm": 0.33848705887794495, + "learning_rate": 0.0001758263772954925, + "loss": 0.4229, + "step": 368 + }, + { + "epoch": 0.00012952241417845715, + "grad_norm": 0.5827682018280029, + "learning_rate": 0.00017575959933222038, + "loss": 0.5668, + "step": 369 + }, + { + "epoch": 0.00012987342343097328, + "grad_norm": 0.36217448115348816, + "learning_rate": 0.00017569282136894825, + "loss": 0.4983, + "step": 370 + }, + { + "epoch": 0.00013022443268348943, + "grad_norm": 0.329414963722229, + "learning_rate": 0.00017562604340567615, + "loss": 0.4281, + "step": 371 + }, + { + "epoch": 0.00013057544193600557, + "grad_norm": 0.36746612191200256, + "learning_rate": 0.00017555926544240402, + "loss": 0.6629, + "step": 372 + }, + { + "epoch": 0.00013092645118852172, + "grad_norm": 0.3954717516899109, + "learning_rate": 0.0001754924874791319, + "loss": 0.5784, + "step": 373 + }, + { + "epoch": 0.00013127746044103787, + "grad_norm": 0.41279932856559753, + "learning_rate": 0.00017542570951585977, + "loss": 0.5994, + "step": 374 + }, + { + "epoch": 0.000131628469693554, + "grad_norm": 0.3019951581954956, + "learning_rate": 0.00017535893155258764, + "loss": 0.5584, + "step": 375 + }, + { + "epoch": 0.00013197947894607015, + "grad_norm": 0.3079768121242523, + "learning_rate": 0.00017529215358931554, + "loss": 0.5904, + "step": 376 + }, + { + "epoch": 0.0001323304881985863, + "grad_norm": 0.5678027272224426, + "learning_rate": 0.00017522537562604342, + "loss": 0.6441, + "step": 377 + }, + { + "epoch": 0.00013268149745110244, + "grad_norm": 0.38624581694602966, + "learning_rate": 0.0001751585976627713, + "loss": 0.5582, + "step": 378 + }, + { + "epoch": 0.0001330325067036186, + "grad_norm": 0.4368002712726593, + "learning_rate": 0.00017509181969949916, + "loss": 0.686, + "step": 379 + }, + { + "epoch": 0.00013338351595613472, + "grad_norm": 0.3409269154071808, + "learning_rate": 0.00017502504173622704, + "loss": 0.582, + "step": 380 + }, + { + "epoch": 0.00013373452520865087, + "grad_norm": 0.3772698938846588, + "learning_rate": 0.0001749582637729549, + "loss": 0.5314, + "step": 381 + }, + { + "epoch": 0.00013408553446116702, + "grad_norm": 0.3791707158088684, + "learning_rate": 0.0001748914858096828, + "loss": 0.6143, + "step": 382 + }, + { + "epoch": 0.00013443654371368317, + "grad_norm": 0.4441101551055908, + "learning_rate": 0.0001748247078464107, + "loss": 0.5726, + "step": 383 + }, + { + "epoch": 0.0001347875529661993, + "grad_norm": 0.4160211980342865, + "learning_rate": 0.00017475792988313858, + "loss": 0.6003, + "step": 384 + }, + { + "epoch": 0.00013513856221871544, + "grad_norm": 0.41698628664016724, + "learning_rate": 0.00017469115191986646, + "loss": 0.4539, + "step": 385 + }, + { + "epoch": 0.00013548957147123159, + "grad_norm": 0.337007999420166, + "learning_rate": 0.00017462437395659433, + "loss": 0.5176, + "step": 386 + }, + { + "epoch": 0.00013584058072374774, + "grad_norm": 0.30926409363746643, + "learning_rate": 0.00017455759599332223, + "loss": 0.6072, + "step": 387 + }, + { + "epoch": 0.00013619158997626389, + "grad_norm": 0.3663052022457123, + "learning_rate": 0.0001744908180300501, + "loss": 0.538, + "step": 388 + }, + { + "epoch": 0.00013654259922878, + "grad_norm": 0.3410074710845947, + "learning_rate": 0.00017442404006677798, + "loss": 0.5687, + "step": 389 + }, + { + "epoch": 0.00013689360848129616, + "grad_norm": 0.5266095399856567, + "learning_rate": 0.00017435726210350585, + "loss": 0.6685, + "step": 390 + }, + { + "epoch": 0.0001372446177338123, + "grad_norm": 0.4020686149597168, + "learning_rate": 0.00017429048414023372, + "loss": 0.586, + "step": 391 + }, + { + "epoch": 0.00013759562698632846, + "grad_norm": 0.39995548129081726, + "learning_rate": 0.00017422370617696162, + "loss": 0.6958, + "step": 392 + }, + { + "epoch": 0.0001379466362388446, + "grad_norm": 0.4024721682071686, + "learning_rate": 0.0001741569282136895, + "loss": 0.6411, + "step": 393 + }, + { + "epoch": 0.00013829764549136073, + "grad_norm": 0.38193392753601074, + "learning_rate": 0.00017409015025041737, + "loss": 0.5857, + "step": 394 + }, + { + "epoch": 0.00013864865474387688, + "grad_norm": 0.39786526560783386, + "learning_rate": 0.00017402337228714524, + "loss": 0.5215, + "step": 395 + }, + { + "epoch": 0.00013899966399639303, + "grad_norm": 0.49223974347114563, + "learning_rate": 0.00017395659432387311, + "loss": 0.5881, + "step": 396 + }, + { + "epoch": 0.00013935067324890918, + "grad_norm": 0.3398894667625427, + "learning_rate": 0.00017388981636060101, + "loss": 0.5466, + "step": 397 + }, + { + "epoch": 0.00013970168250142533, + "grad_norm": 0.34891223907470703, + "learning_rate": 0.0001738230383973289, + "loss": 0.5901, + "step": 398 + }, + { + "epoch": 0.00014005269175394145, + "grad_norm": 0.47644108533859253, + "learning_rate": 0.00017375626043405676, + "loss": 0.5075, + "step": 399 + }, + { + "epoch": 0.0001404037010064576, + "grad_norm": 0.42530229687690735, + "learning_rate": 0.00017368948247078466, + "loss": 0.663, + "step": 400 + }, + { + "epoch": 0.00014075471025897375, + "grad_norm": 0.30858534574508667, + "learning_rate": 0.00017362270450751253, + "loss": 0.4724, + "step": 401 + }, + { + "epoch": 0.0001411057195114899, + "grad_norm": 0.42453449964523315, + "learning_rate": 0.0001735559265442404, + "loss": 0.6074, + "step": 402 + }, + { + "epoch": 0.00014145672876400605, + "grad_norm": 0.3964505195617676, + "learning_rate": 0.0001734891485809683, + "loss": 0.4913, + "step": 403 + }, + { + "epoch": 0.00014180773801652217, + "grad_norm": 0.3317703902721405, + "learning_rate": 0.00017342237061769618, + "loss": 0.5504, + "step": 404 + }, + { + "epoch": 0.00014215874726903832, + "grad_norm": 0.3912264108657837, + "learning_rate": 0.00017335559265442405, + "loss": 0.6301, + "step": 405 + }, + { + "epoch": 0.00014250975652155447, + "grad_norm": 0.3582877218723297, + "learning_rate": 0.00017328881469115193, + "loss": 0.6205, + "step": 406 + }, + { + "epoch": 0.00014286076577407062, + "grad_norm": 0.3691099286079407, + "learning_rate": 0.0001732220367278798, + "loss": 0.5348, + "step": 407 + }, + { + "epoch": 0.00014321177502658677, + "grad_norm": 0.35860803723335266, + "learning_rate": 0.0001731552587646077, + "loss": 0.6029, + "step": 408 + }, + { + "epoch": 0.0001435627842791029, + "grad_norm": 0.3640693426132202, + "learning_rate": 0.00017308848080133557, + "loss": 0.6673, + "step": 409 + }, + { + "epoch": 0.00014391379353161904, + "grad_norm": 0.3550623953342438, + "learning_rate": 0.00017302170283806345, + "loss": 0.4659, + "step": 410 + }, + { + "epoch": 0.0001442648027841352, + "grad_norm": 0.45885637402534485, + "learning_rate": 0.00017295492487479132, + "loss": 0.4781, + "step": 411 + }, + { + "epoch": 0.00014461581203665134, + "grad_norm": 0.3703556954860687, + "learning_rate": 0.0001728881469115192, + "loss": 0.4829, + "step": 412 + }, + { + "epoch": 0.0001449668212891675, + "grad_norm": 0.5436837077140808, + "learning_rate": 0.0001728213689482471, + "loss": 0.6056, + "step": 413 + }, + { + "epoch": 0.0001453178305416836, + "grad_norm": 0.3953244686126709, + "learning_rate": 0.00017275459098497497, + "loss": 0.4884, + "step": 414 + }, + { + "epoch": 0.00014566883979419976, + "grad_norm": 0.34003904461860657, + "learning_rate": 0.00017268781302170284, + "loss": 0.6014, + "step": 415 + }, + { + "epoch": 0.0001460198490467159, + "grad_norm": 0.3463648557662964, + "learning_rate": 0.0001726210350584307, + "loss": 0.603, + "step": 416 + }, + { + "epoch": 0.00014637085829923206, + "grad_norm": 0.4293590784072876, + "learning_rate": 0.0001725542570951586, + "loss": 0.6686, + "step": 417 + }, + { + "epoch": 0.0001467218675517482, + "grad_norm": 0.4243469834327698, + "learning_rate": 0.0001724874791318865, + "loss": 0.6422, + "step": 418 + }, + { + "epoch": 0.00014707287680426433, + "grad_norm": 0.38327839970588684, + "learning_rate": 0.0001724207011686144, + "loss": 0.5595, + "step": 419 + }, + { + "epoch": 0.00014742388605678048, + "grad_norm": 0.31334301829338074, + "learning_rate": 0.00017235392320534226, + "loss": 0.474, + "step": 420 + }, + { + "epoch": 0.00014777489530929663, + "grad_norm": 0.3335350453853607, + "learning_rate": 0.00017228714524207013, + "loss": 0.6172, + "step": 421 + }, + { + "epoch": 0.00014812590456181278, + "grad_norm": 0.373696506023407, + "learning_rate": 0.000172220367278798, + "loss": 0.6183, + "step": 422 + }, + { + "epoch": 0.00014847691381432893, + "grad_norm": 0.45814886689186096, + "learning_rate": 0.00017215358931552588, + "loss": 0.5059, + "step": 423 + }, + { + "epoch": 0.00014882792306684505, + "grad_norm": 0.3578277826309204, + "learning_rate": 0.00017208681135225378, + "loss": 0.5771, + "step": 424 + }, + { + "epoch": 0.0001491789323193612, + "grad_norm": 0.42081883549690247, + "learning_rate": 0.00017202003338898165, + "loss": 0.5604, + "step": 425 + }, + { + "epoch": 0.00014952994157187735, + "grad_norm": 0.3173503875732422, + "learning_rate": 0.00017195325542570953, + "loss": 0.5738, + "step": 426 + }, + { + "epoch": 0.0001498809508243935, + "grad_norm": 0.38292011618614197, + "learning_rate": 0.0001718864774624374, + "loss": 0.6067, + "step": 427 + }, + { + "epoch": 0.00015023196007690965, + "grad_norm": 0.3518977463245392, + "learning_rate": 0.00017181969949916527, + "loss": 0.5073, + "step": 428 + }, + { + "epoch": 0.00015058296932942577, + "grad_norm": 0.5157706141471863, + "learning_rate": 0.00017175292153589317, + "loss": 0.5496, + "step": 429 + }, + { + "epoch": 0.00015093397858194192, + "grad_norm": 0.32064110040664673, + "learning_rate": 0.00017168614357262105, + "loss": 0.4766, + "step": 430 + }, + { + "epoch": 0.00015128498783445807, + "grad_norm": 0.42229798436164856, + "learning_rate": 0.00017161936560934892, + "loss": 0.5953, + "step": 431 + }, + { + "epoch": 0.00015163599708697422, + "grad_norm": 0.4723895192146301, + "learning_rate": 0.0001715525876460768, + "loss": 0.4783, + "step": 432 + }, + { + "epoch": 0.00015198700633949037, + "grad_norm": 0.3841445744037628, + "learning_rate": 0.00017148580968280467, + "loss": 0.5003, + "step": 433 + }, + { + "epoch": 0.0001523380155920065, + "grad_norm": 0.38026461005210876, + "learning_rate": 0.00017141903171953257, + "loss": 0.5093, + "step": 434 + }, + { + "epoch": 0.00015268902484452264, + "grad_norm": 0.37034904956817627, + "learning_rate": 0.00017135225375626044, + "loss": 0.6158, + "step": 435 + }, + { + "epoch": 0.0001530400340970388, + "grad_norm": 0.3876091241836548, + "learning_rate": 0.00017128547579298834, + "loss": 0.5287, + "step": 436 + }, + { + "epoch": 0.00015339104334955494, + "grad_norm": 0.30055519938468933, + "learning_rate": 0.0001712186978297162, + "loss": 0.5018, + "step": 437 + }, + { + "epoch": 0.0001537420526020711, + "grad_norm": 0.36094966530799866, + "learning_rate": 0.00017115191986644409, + "loss": 0.4961, + "step": 438 + }, + { + "epoch": 0.0001540930618545872, + "grad_norm": 0.3300524055957794, + "learning_rate": 0.00017108514190317196, + "loss": 0.5246, + "step": 439 + }, + { + "epoch": 0.00015444407110710336, + "grad_norm": 0.40980783104896545, + "learning_rate": 0.00017101836393989986, + "loss": 0.5705, + "step": 440 + }, + { + "epoch": 0.0001547950803596195, + "grad_norm": 0.3442326784133911, + "learning_rate": 0.00017095158597662773, + "loss": 0.5595, + "step": 441 + }, + { + "epoch": 0.00015514608961213566, + "grad_norm": 0.48015034198760986, + "learning_rate": 0.0001708848080133556, + "loss": 0.5642, + "step": 442 + }, + { + "epoch": 0.0001554970988646518, + "grad_norm": 0.5570142269134521, + "learning_rate": 0.00017081803005008348, + "loss": 0.6111, + "step": 443 + }, + { + "epoch": 0.00015584810811716793, + "grad_norm": 0.30470094084739685, + "learning_rate": 0.00017075125208681135, + "loss": 0.5151, + "step": 444 + }, + { + "epoch": 0.00015619911736968408, + "grad_norm": 0.31946614384651184, + "learning_rate": 0.00017068447412353925, + "loss": 0.5265, + "step": 445 + }, + { + "epoch": 0.00015655012662220023, + "grad_norm": 0.38980719447135925, + "learning_rate": 0.00017061769616026712, + "loss": 0.575, + "step": 446 + }, + { + "epoch": 0.00015690113587471638, + "grad_norm": 0.4077732264995575, + "learning_rate": 0.000170550918196995, + "loss": 0.5729, + "step": 447 + }, + { + "epoch": 0.00015725214512723253, + "grad_norm": 0.38632732629776, + "learning_rate": 0.00017048414023372287, + "loss": 0.594, + "step": 448 + }, + { + "epoch": 0.00015760315437974865, + "grad_norm": 0.37193921208381653, + "learning_rate": 0.00017041736227045074, + "loss": 0.6062, + "step": 449 + }, + { + "epoch": 0.0001579541636322648, + "grad_norm": 0.399029016494751, + "learning_rate": 0.00017035058430717862, + "loss": 0.4538, + "step": 450 + }, + { + "epoch": 0.00015830517288478095, + "grad_norm": 0.37710487842559814, + "learning_rate": 0.00017028380634390652, + "loss": 0.5615, + "step": 451 + }, + { + "epoch": 0.0001586561821372971, + "grad_norm": 0.38591668009757996, + "learning_rate": 0.0001702170283806344, + "loss": 0.5316, + "step": 452 + }, + { + "epoch": 0.00015900719138981325, + "grad_norm": 0.3453538417816162, + "learning_rate": 0.0001701502504173623, + "loss": 0.4645, + "step": 453 + }, + { + "epoch": 0.00015935820064232937, + "grad_norm": 0.34171512722969055, + "learning_rate": 0.00017008347245409016, + "loss": 0.5856, + "step": 454 + }, + { + "epoch": 0.00015970920989484552, + "grad_norm": 0.39591720700263977, + "learning_rate": 0.00017001669449081804, + "loss": 0.573, + "step": 455 + }, + { + "epoch": 0.00016006021914736167, + "grad_norm": 0.4127822816371918, + "learning_rate": 0.00016994991652754594, + "loss": 0.5183, + "step": 456 + }, + { + "epoch": 0.00016041122839987782, + "grad_norm": 0.37893375754356384, + "learning_rate": 0.0001698831385642738, + "loss": 0.566, + "step": 457 + }, + { + "epoch": 0.00016076223765239397, + "grad_norm": 0.33429333567619324, + "learning_rate": 0.00016981636060100168, + "loss": 0.449, + "step": 458 + }, + { + "epoch": 0.0001611132469049101, + "grad_norm": 0.3333180546760559, + "learning_rate": 0.00016974958263772956, + "loss": 0.4441, + "step": 459 + }, + { + "epoch": 0.00016146425615742624, + "grad_norm": 0.3591359257698059, + "learning_rate": 0.00016968280467445743, + "loss": 0.55, + "step": 460 + }, + { + "epoch": 0.0001618152654099424, + "grad_norm": 0.35390427708625793, + "learning_rate": 0.00016961602671118533, + "loss": 0.6445, + "step": 461 + }, + { + "epoch": 0.00016216627466245854, + "grad_norm": 0.42036697268486023, + "learning_rate": 0.0001695492487479132, + "loss": 0.5411, + "step": 462 + }, + { + "epoch": 0.0001625172839149747, + "grad_norm": 0.42147770524024963, + "learning_rate": 0.00016948247078464108, + "loss": 0.6218, + "step": 463 + }, + { + "epoch": 0.0001628682931674908, + "grad_norm": 0.3960399329662323, + "learning_rate": 0.00016941569282136895, + "loss": 0.6608, + "step": 464 + }, + { + "epoch": 0.00016321930242000696, + "grad_norm": 0.39676985144615173, + "learning_rate": 0.00016934891485809682, + "loss": 0.5838, + "step": 465 + }, + { + "epoch": 0.0001635703116725231, + "grad_norm": 0.2839520573616028, + "learning_rate": 0.0001692821368948247, + "loss": 0.5334, + "step": 466 + }, + { + "epoch": 0.00016392132092503926, + "grad_norm": 0.3654347062110901, + "learning_rate": 0.0001692153589315526, + "loss": 0.6065, + "step": 467 + }, + { + "epoch": 0.0001642723301775554, + "grad_norm": 0.3709166646003723, + "learning_rate": 0.00016914858096828047, + "loss": 0.509, + "step": 468 + }, + { + "epoch": 0.00016462333943007153, + "grad_norm": 0.29224780201911926, + "learning_rate": 0.00016908180300500834, + "loss": 0.5372, + "step": 469 + }, + { + "epoch": 0.00016497434868258768, + "grad_norm": 0.34979283809661865, + "learning_rate": 0.00016901502504173624, + "loss": 0.3968, + "step": 470 + }, + { + "epoch": 0.00016532535793510383, + "grad_norm": 0.34580183029174805, + "learning_rate": 0.00016894824707846412, + "loss": 0.6032, + "step": 471 + }, + { + "epoch": 0.00016567636718761998, + "grad_norm": 0.39046213030815125, + "learning_rate": 0.00016888146911519202, + "loss": 0.5628, + "step": 472 + }, + { + "epoch": 0.00016602737644013613, + "grad_norm": 0.35301411151885986, + "learning_rate": 0.0001688146911519199, + "loss": 0.607, + "step": 473 + }, + { + "epoch": 0.00016637838569265225, + "grad_norm": 0.4572748839855194, + "learning_rate": 0.00016874791318864776, + "loss": 0.5018, + "step": 474 + }, + { + "epoch": 0.0001667293949451684, + "grad_norm": 0.38230374455451965, + "learning_rate": 0.00016868113522537564, + "loss": 0.5026, + "step": 475 + }, + { + "epoch": 0.00016708040419768455, + "grad_norm": 0.37066343426704407, + "learning_rate": 0.0001686143572621035, + "loss": 0.5819, + "step": 476 + }, + { + "epoch": 0.0001674314134502007, + "grad_norm": 0.3658660054206848, + "learning_rate": 0.0001685475792988314, + "loss": 0.6825, + "step": 477 + }, + { + "epoch": 0.00016778242270271685, + "grad_norm": 0.42174890637397766, + "learning_rate": 0.00016848080133555928, + "loss": 0.6065, + "step": 478 + }, + { + "epoch": 0.00016813343195523297, + "grad_norm": 0.3462882936000824, + "learning_rate": 0.00016841402337228716, + "loss": 0.5888, + "step": 479 + }, + { + "epoch": 0.00016848444120774912, + "grad_norm": 0.44681960344314575, + "learning_rate": 0.00016834724540901503, + "loss": 0.4987, + "step": 480 + }, + { + "epoch": 0.00016883545046026527, + "grad_norm": 0.3535650372505188, + "learning_rate": 0.0001682804674457429, + "loss": 0.6478, + "step": 481 + }, + { + "epoch": 0.00016918645971278142, + "grad_norm": 0.3357018232345581, + "learning_rate": 0.00016821368948247077, + "loss": 0.4949, + "step": 482 + }, + { + "epoch": 0.00016953746896529757, + "grad_norm": 0.42756739258766174, + "learning_rate": 0.00016814691151919868, + "loss": 0.6475, + "step": 483 + }, + { + "epoch": 0.0001698884782178137, + "grad_norm": 0.36174866557121277, + "learning_rate": 0.00016808013355592655, + "loss": 0.598, + "step": 484 + }, + { + "epoch": 0.00017023948747032984, + "grad_norm": 0.37115278840065, + "learning_rate": 0.00016801335559265442, + "loss": 0.6215, + "step": 485 + }, + { + "epoch": 0.000170590496722846, + "grad_norm": 0.340249627828598, + "learning_rate": 0.0001679465776293823, + "loss": 0.5702, + "step": 486 + }, + { + "epoch": 0.00017094150597536214, + "grad_norm": 0.31226348876953125, + "learning_rate": 0.0001678797996661102, + "loss": 0.6531, + "step": 487 + }, + { + "epoch": 0.0001712925152278783, + "grad_norm": 0.35571998357772827, + "learning_rate": 0.00016781302170283807, + "loss": 0.6406, + "step": 488 + }, + { + "epoch": 0.00017164352448039441, + "grad_norm": 0.4167378842830658, + "learning_rate": 0.00016774624373956597, + "loss": 0.5111, + "step": 489 + }, + { + "epoch": 0.00017199453373291056, + "grad_norm": 0.292304128408432, + "learning_rate": 0.00016767946577629384, + "loss": 0.6643, + "step": 490 + }, + { + "epoch": 0.0001723455429854267, + "grad_norm": 0.38789069652557373, + "learning_rate": 0.00016761268781302171, + "loss": 0.4542, + "step": 491 + }, + { + "epoch": 0.00017269655223794286, + "grad_norm": 0.33764714002609253, + "learning_rate": 0.0001675459098497496, + "loss": 0.4158, + "step": 492 + }, + { + "epoch": 0.00017304756149045898, + "grad_norm": 0.34849148988723755, + "learning_rate": 0.0001674791318864775, + "loss": 0.4737, + "step": 493 + }, + { + "epoch": 0.00017339857074297513, + "grad_norm": 0.2921352684497833, + "learning_rate": 0.00016741235392320536, + "loss": 0.679, + "step": 494 + }, + { + "epoch": 0.00017374957999549128, + "grad_norm": 0.33746641874313354, + "learning_rate": 0.00016734557595993323, + "loss": 0.4957, + "step": 495 + }, + { + "epoch": 0.00017410058924800743, + "grad_norm": 0.4029395878314972, + "learning_rate": 0.0001672787979966611, + "loss": 0.6708, + "step": 496 + }, + { + "epoch": 0.00017445159850052358, + "grad_norm": 0.440033882856369, + "learning_rate": 0.00016721202003338898, + "loss": 0.5889, + "step": 497 + }, + { + "epoch": 0.0001748026077530397, + "grad_norm": 0.330692857503891, + "learning_rate": 0.00016714524207011685, + "loss": 0.5942, + "step": 498 + }, + { + "epoch": 0.00017515361700555585, + "grad_norm": 0.3111809492111206, + "learning_rate": 0.00016707846410684475, + "loss": 0.5506, + "step": 499 + }, + { + "epoch": 0.000175504626258072, + "grad_norm": 0.38885676860809326, + "learning_rate": 0.00016701168614357263, + "loss": 0.4713, + "step": 500 + }, + { + "epoch": 0.00017585563551058815, + "grad_norm": 0.3697550296783447, + "learning_rate": 0.0001669449081803005, + "loss": 0.5955, + "step": 501 + }, + { + "epoch": 0.0001762066447631043, + "grad_norm": 0.35807061195373535, + "learning_rate": 0.00016687813021702837, + "loss": 0.555, + "step": 502 + }, + { + "epoch": 0.00017655765401562043, + "grad_norm": 0.44033464789390564, + "learning_rate": 0.00016681135225375625, + "loss": 0.5668, + "step": 503 + }, + { + "epoch": 0.00017690866326813657, + "grad_norm": 0.3363400399684906, + "learning_rate": 0.00016674457429048415, + "loss": 0.6176, + "step": 504 + }, + { + "epoch": 0.00017725967252065272, + "grad_norm": 0.31457507610321045, + "learning_rate": 0.00016667779632721202, + "loss": 0.6524, + "step": 505 + }, + { + "epoch": 0.00017761068177316887, + "grad_norm": 0.38115641474723816, + "learning_rate": 0.00016661101836393992, + "loss": 0.5848, + "step": 506 + }, + { + "epoch": 0.00017796169102568502, + "grad_norm": 0.3387603759765625, + "learning_rate": 0.0001665442404006678, + "loss": 0.6992, + "step": 507 + }, + { + "epoch": 0.00017831270027820115, + "grad_norm": 0.31671345233917236, + "learning_rate": 0.00016647746243739567, + "loss": 0.5744, + "step": 508 + }, + { + "epoch": 0.0001786637095307173, + "grad_norm": 0.3776471018791199, + "learning_rate": 0.00016641068447412357, + "loss": 0.622, + "step": 509 + }, + { + "epoch": 0.00017901471878323344, + "grad_norm": 0.37572941184043884, + "learning_rate": 0.00016634390651085144, + "loss": 0.5259, + "step": 510 + }, + { + "epoch": 0.0001793657280357496, + "grad_norm": 0.3335510194301605, + "learning_rate": 0.0001662771285475793, + "loss": 0.547, + "step": 511 + }, + { + "epoch": 0.00017971673728826574, + "grad_norm": 0.33241015672683716, + "learning_rate": 0.00016621035058430719, + "loss": 0.5827, + "step": 512 + }, + { + "epoch": 0.00018006774654078187, + "grad_norm": 0.3761122524738312, + "learning_rate": 0.00016614357262103506, + "loss": 0.6962, + "step": 513 + }, + { + "epoch": 0.00018041875579329802, + "grad_norm": 0.4172234833240509, + "learning_rate": 0.00016607679465776293, + "loss": 0.4922, + "step": 514 + }, + { + "epoch": 0.00018076976504581416, + "grad_norm": 0.45372599363327026, + "learning_rate": 0.00016601001669449083, + "loss": 0.5804, + "step": 515 + }, + { + "epoch": 0.00018112077429833031, + "grad_norm": 0.3854759931564331, + "learning_rate": 0.0001659432387312187, + "loss": 0.6026, + "step": 516 + }, + { + "epoch": 0.00018147178355084646, + "grad_norm": 0.3399171829223633, + "learning_rate": 0.00016587646076794658, + "loss": 0.4773, + "step": 517 + }, + { + "epoch": 0.00018182279280336259, + "grad_norm": 0.36649778485298157, + "learning_rate": 0.00016580968280467445, + "loss": 0.59, + "step": 518 + }, + { + "epoch": 0.00018217380205587874, + "grad_norm": 0.39988765120506287, + "learning_rate": 0.00016574290484140233, + "loss": 0.6094, + "step": 519 + }, + { + "epoch": 0.00018252481130839489, + "grad_norm": 0.34659436345100403, + "learning_rate": 0.00016567612687813023, + "loss": 0.4832, + "step": 520 + }, + { + "epoch": 0.00018287582056091103, + "grad_norm": 0.3742654025554657, + "learning_rate": 0.0001656093489148581, + "loss": 0.413, + "step": 521 + }, + { + "epoch": 0.00018322682981342718, + "grad_norm": 0.43068456649780273, + "learning_rate": 0.00016554257095158597, + "loss": 0.6576, + "step": 522 + }, + { + "epoch": 0.0001835778390659433, + "grad_norm": 0.42455193400382996, + "learning_rate": 0.00016547579298831387, + "loss": 0.5897, + "step": 523 + }, + { + "epoch": 0.00018392884831845946, + "grad_norm": 0.3290526568889618, + "learning_rate": 0.00016540901502504175, + "loss": 0.4022, + "step": 524 + }, + { + "epoch": 0.0001842798575709756, + "grad_norm": 0.3744141161441803, + "learning_rate": 0.00016534223706176965, + "loss": 0.5577, + "step": 525 + }, + { + "epoch": 0.00018463086682349176, + "grad_norm": 0.3516618609428406, + "learning_rate": 0.00016527545909849752, + "loss": 0.5481, + "step": 526 + }, + { + "epoch": 0.0001849818760760079, + "grad_norm": 0.3591526448726654, + "learning_rate": 0.0001652086811352254, + "loss": 0.6339, + "step": 527 + }, + { + "epoch": 0.00018533288532852403, + "grad_norm": 0.4024425745010376, + "learning_rate": 0.00016514190317195327, + "loss": 0.5268, + "step": 528 + }, + { + "epoch": 0.00018568389458104018, + "grad_norm": 0.3502136766910553, + "learning_rate": 0.00016507512520868114, + "loss": 0.5112, + "step": 529 + }, + { + "epoch": 0.00018603490383355633, + "grad_norm": 0.3338727056980133, + "learning_rate": 0.00016500834724540904, + "loss": 0.5623, + "step": 530 + }, + { + "epoch": 0.00018638591308607248, + "grad_norm": 0.43554845452308655, + "learning_rate": 0.0001649415692821369, + "loss": 0.5853, + "step": 531 + }, + { + "epoch": 0.00018673692233858862, + "grad_norm": 0.34424322843551636, + "learning_rate": 0.00016487479131886478, + "loss": 0.4951, + "step": 532 + }, + { + "epoch": 0.00018708793159110475, + "grad_norm": 0.4424237012863159, + "learning_rate": 0.00016480801335559266, + "loss": 0.4576, + "step": 533 + }, + { + "epoch": 0.0001874389408436209, + "grad_norm": 0.4616681933403015, + "learning_rate": 0.00016474123539232053, + "loss": 0.4974, + "step": 534 + }, + { + "epoch": 0.00018778995009613705, + "grad_norm": 0.3599206507205963, + "learning_rate": 0.0001646744574290484, + "loss": 0.5987, + "step": 535 + }, + { + "epoch": 0.0001881409593486532, + "grad_norm": 0.40468478202819824, + "learning_rate": 0.0001646076794657763, + "loss": 0.5914, + "step": 536 + }, + { + "epoch": 0.00018849196860116935, + "grad_norm": 0.5389227271080017, + "learning_rate": 0.00016454090150250418, + "loss": 0.6459, + "step": 537 + }, + { + "epoch": 0.00018884297785368547, + "grad_norm": 0.3493568003177643, + "learning_rate": 0.00016447412353923205, + "loss": 0.5191, + "step": 538 + }, + { + "epoch": 0.00018919398710620162, + "grad_norm": 0.31237804889678955, + "learning_rate": 0.00016440734557595992, + "loss": 0.4819, + "step": 539 + }, + { + "epoch": 0.00018954499635871777, + "grad_norm": 0.31142041087150574, + "learning_rate": 0.00016434056761268782, + "loss": 0.5659, + "step": 540 + }, + { + "epoch": 0.00018989600561123392, + "grad_norm": 0.3323245644569397, + "learning_rate": 0.0001642737896494157, + "loss": 0.5779, + "step": 541 + }, + { + "epoch": 0.00019024701486375007, + "grad_norm": 0.3679036498069763, + "learning_rate": 0.0001642070116861436, + "loss": 0.6919, + "step": 542 + }, + { + "epoch": 0.0001905980241162662, + "grad_norm": 0.3094903528690338, + "learning_rate": 0.00016414023372287147, + "loss": 0.4773, + "step": 543 + }, + { + "epoch": 0.00019094903336878234, + "grad_norm": 0.37995582818984985, + "learning_rate": 0.00016407345575959934, + "loss": 0.539, + "step": 544 + }, + { + "epoch": 0.0001913000426212985, + "grad_norm": 0.46415746212005615, + "learning_rate": 0.00016400667779632722, + "loss": 0.6708, + "step": 545 + }, + { + "epoch": 0.00019165105187381464, + "grad_norm": 0.3479398190975189, + "learning_rate": 0.00016393989983305512, + "loss": 0.5496, + "step": 546 + }, + { + "epoch": 0.00019200206112633079, + "grad_norm": 0.3740891218185425, + "learning_rate": 0.000163873121869783, + "loss": 0.6256, + "step": 547 + }, + { + "epoch": 0.0001923530703788469, + "grad_norm": 0.4934074878692627, + "learning_rate": 0.00016380634390651086, + "loss": 0.6788, + "step": 548 + }, + { + "epoch": 0.00019270407963136306, + "grad_norm": 0.42659157514572144, + "learning_rate": 0.00016373956594323874, + "loss": 0.5981, + "step": 549 + }, + { + "epoch": 0.0001930550888838792, + "grad_norm": 0.35727575421333313, + "learning_rate": 0.0001636727879799666, + "loss": 0.4095, + "step": 550 + }, + { + "epoch": 0.00019340609813639536, + "grad_norm": 0.4294300377368927, + "learning_rate": 0.00016360601001669448, + "loss": 0.5386, + "step": 551 + }, + { + "epoch": 0.0001937571073889115, + "grad_norm": 0.33482253551483154, + "learning_rate": 0.00016353923205342238, + "loss": 0.4901, + "step": 552 + }, + { + "epoch": 0.00019410811664142763, + "grad_norm": 0.3379746079444885, + "learning_rate": 0.00016347245409015026, + "loss": 0.5454, + "step": 553 + }, + { + "epoch": 0.00019445912589394378, + "grad_norm": 0.42393919825553894, + "learning_rate": 0.00016340567612687813, + "loss": 0.5959, + "step": 554 + }, + { + "epoch": 0.00019481013514645993, + "grad_norm": 0.31975501775741577, + "learning_rate": 0.000163338898163606, + "loss": 0.6048, + "step": 555 + }, + { + "epoch": 0.00019516114439897608, + "grad_norm": 0.43404972553253174, + "learning_rate": 0.00016327212020033388, + "loss": 0.6252, + "step": 556 + }, + { + "epoch": 0.00019551215365149223, + "grad_norm": 0.3559292256832123, + "learning_rate": 0.00016320534223706178, + "loss": 0.6036, + "step": 557 + }, + { + "epoch": 0.00019586316290400835, + "grad_norm": 0.3134891092777252, + "learning_rate": 0.00016313856427378965, + "loss": 0.5656, + "step": 558 + }, + { + "epoch": 0.0001962141721565245, + "grad_norm": 0.32056671380996704, + "learning_rate": 0.00016307178631051755, + "loss": 0.6509, + "step": 559 + }, + { + "epoch": 0.00019656518140904065, + "grad_norm": 0.46249130368232727, + "learning_rate": 0.00016300500834724542, + "loss": 0.6379, + "step": 560 + }, + { + "epoch": 0.0001969161906615568, + "grad_norm": 0.36366966366767883, + "learning_rate": 0.0001629382303839733, + "loss": 0.5334, + "step": 561 + }, + { + "epoch": 0.00019726719991407295, + "grad_norm": 0.4234124422073364, + "learning_rate": 0.0001628714524207012, + "loss": 0.4864, + "step": 562 + }, + { + "epoch": 0.00019761820916658907, + "grad_norm": 0.3687801659107208, + "learning_rate": 0.00016280467445742907, + "loss": 0.4855, + "step": 563 + }, + { + "epoch": 0.00019796921841910522, + "grad_norm": 0.37247028946876526, + "learning_rate": 0.00016273789649415694, + "loss": 0.6215, + "step": 564 + }, + { + "epoch": 0.00019832022767162137, + "grad_norm": 0.30445635318756104, + "learning_rate": 0.00016267111853088482, + "loss": 0.5741, + "step": 565 + }, + { + "epoch": 0.00019867123692413752, + "grad_norm": 0.3349187970161438, + "learning_rate": 0.0001626043405676127, + "loss": 0.4524, + "step": 566 + }, + { + "epoch": 0.00019902224617665367, + "grad_norm": 0.36938101053237915, + "learning_rate": 0.00016253756260434056, + "loss": 0.5046, + "step": 567 + }, + { + "epoch": 0.0001993732554291698, + "grad_norm": 0.37673529982566833, + "learning_rate": 0.00016247078464106846, + "loss": 0.5001, + "step": 568 + }, + { + "epoch": 0.00019972426468168594, + "grad_norm": 0.3571556508541107, + "learning_rate": 0.00016240400667779634, + "loss": 0.6419, + "step": 569 + }, + { + "epoch": 0.0002000752739342021, + "grad_norm": 0.35543423891067505, + "learning_rate": 0.0001623372287145242, + "loss": 0.6191, + "step": 570 + }, + { + "epoch": 0.00020042628318671824, + "grad_norm": 0.3096729516983032, + "learning_rate": 0.00016227045075125208, + "loss": 0.5373, + "step": 571 + }, + { + "epoch": 0.0002007772924392344, + "grad_norm": 0.30310383439064026, + "learning_rate": 0.00016220367278797996, + "loss": 0.558, + "step": 572 + }, + { + "epoch": 0.0002011283016917505, + "grad_norm": 0.3616211712360382, + "learning_rate": 0.00016213689482470786, + "loss": 0.6504, + "step": 573 + }, + { + "epoch": 0.00020147931094426666, + "grad_norm": 0.34818220138549805, + "learning_rate": 0.00016207011686143573, + "loss": 0.6136, + "step": 574 + }, + { + "epoch": 0.0002018303201967828, + "grad_norm": 0.36225444078445435, + "learning_rate": 0.0001620033388981636, + "loss": 0.4905, + "step": 575 + }, + { + "epoch": 0.00020218132944929896, + "grad_norm": 0.40039536356925964, + "learning_rate": 0.0001619365609348915, + "loss": 0.5997, + "step": 576 + }, + { + "epoch": 0.0002025323387018151, + "grad_norm": 0.33715930581092834, + "learning_rate": 0.00016186978297161938, + "loss": 0.5284, + "step": 577 + }, + { + "epoch": 0.00020288334795433123, + "grad_norm": 0.4137067198753357, + "learning_rate": 0.00016180300500834728, + "loss": 0.6873, + "step": 578 + }, + { + "epoch": 0.00020323435720684738, + "grad_norm": 0.41598305106163025, + "learning_rate": 0.00016173622704507515, + "loss": 0.491, + "step": 579 + }, + { + "epoch": 0.00020358536645936353, + "grad_norm": 0.5466423034667969, + "learning_rate": 0.00016166944908180302, + "loss": 0.6188, + "step": 580 + }, + { + "epoch": 0.00020393637571187968, + "grad_norm": 0.3718060851097107, + "learning_rate": 0.0001616026711185309, + "loss": 0.5573, + "step": 581 + }, + { + "epoch": 0.00020428738496439583, + "grad_norm": 0.33747225999832153, + "learning_rate": 0.00016153589315525877, + "loss": 0.4887, + "step": 582 + }, + { + "epoch": 0.00020463839421691195, + "grad_norm": 0.36478081345558167, + "learning_rate": 0.00016146911519198664, + "loss": 0.553, + "step": 583 + }, + { + "epoch": 0.0002049894034694281, + "grad_norm": 0.38441962003707886, + "learning_rate": 0.00016140233722871454, + "loss": 0.4833, + "step": 584 + }, + { + "epoch": 0.00020534041272194425, + "grad_norm": 0.45594358444213867, + "learning_rate": 0.00016133555926544241, + "loss": 0.5877, + "step": 585 + }, + { + "epoch": 0.0002056914219744604, + "grad_norm": 0.356517493724823, + "learning_rate": 0.0001612687813021703, + "loss": 0.5614, + "step": 586 + }, + { + "epoch": 0.00020604243122697655, + "grad_norm": 0.4051963686943054, + "learning_rate": 0.00016120200333889816, + "loss": 0.5208, + "step": 587 + }, + { + "epoch": 0.00020639344047949267, + "grad_norm": 0.36947959661483765, + "learning_rate": 0.00016113522537562603, + "loss": 0.4385, + "step": 588 + }, + { + "epoch": 0.00020674444973200882, + "grad_norm": 0.45947200059890747, + "learning_rate": 0.00016106844741235393, + "loss": 0.4972, + "step": 589 + }, + { + "epoch": 0.00020709545898452497, + "grad_norm": 0.40610602498054504, + "learning_rate": 0.0001610016694490818, + "loss": 0.4022, + "step": 590 + }, + { + "epoch": 0.00020744646823704112, + "grad_norm": 0.3529384732246399, + "learning_rate": 0.00016093489148580968, + "loss": 0.5222, + "step": 591 + }, + { + "epoch": 0.00020779747748955727, + "grad_norm": 0.35114821791648865, + "learning_rate": 0.00016086811352253755, + "loss": 0.6224, + "step": 592 + }, + { + "epoch": 0.0002081484867420734, + "grad_norm": 0.3596336841583252, + "learning_rate": 0.00016080133555926545, + "loss": 0.5081, + "step": 593 + }, + { + "epoch": 0.00020849949599458954, + "grad_norm": 0.4214174747467041, + "learning_rate": 0.00016073455759599333, + "loss": 0.5189, + "step": 594 + }, + { + "epoch": 0.0002088505052471057, + "grad_norm": 0.39635175466537476, + "learning_rate": 0.00016066777963272123, + "loss": 0.582, + "step": 595 + }, + { + "epoch": 0.00020920151449962184, + "grad_norm": 0.36160576343536377, + "learning_rate": 0.0001606010016694491, + "loss": 0.568, + "step": 596 + }, + { + "epoch": 0.000209552523752138, + "grad_norm": 0.4242927134037018, + "learning_rate": 0.00016053422370617697, + "loss": 0.6235, + "step": 597 + }, + { + "epoch": 0.0002099035330046541, + "grad_norm": 0.4257853925228119, + "learning_rate": 0.00016046744574290485, + "loss": 0.5294, + "step": 598 + }, + { + "epoch": 0.00021025454225717026, + "grad_norm": 0.3890500068664551, + "learning_rate": 0.00016040066777963272, + "loss": 0.6224, + "step": 599 + }, + { + "epoch": 0.0002106055515096864, + "grad_norm": 0.2971879541873932, + "learning_rate": 0.00016033388981636062, + "loss": 0.5951, + "step": 600 + }, + { + "epoch": 0.00021095656076220256, + "grad_norm": 0.29551970958709717, + "learning_rate": 0.0001602671118530885, + "loss": 0.6713, + "step": 601 + }, + { + "epoch": 0.00021130757001471868, + "grad_norm": 0.31588122248649597, + "learning_rate": 0.00016020033388981637, + "loss": 0.6384, + "step": 602 + }, + { + "epoch": 0.00021165857926723483, + "grad_norm": 0.3138657510280609, + "learning_rate": 0.00016013355592654424, + "loss": 0.5846, + "step": 603 + }, + { + "epoch": 0.00021200958851975098, + "grad_norm": 0.31286585330963135, + "learning_rate": 0.0001600667779632721, + "loss": 0.6236, + "step": 604 + }, + { + "epoch": 0.00021236059777226713, + "grad_norm": 0.32098105549812317, + "learning_rate": 0.00016, + "loss": 0.4926, + "step": 605 + }, + { + "epoch": 0.00021271160702478328, + "grad_norm": 0.371427446603775, + "learning_rate": 0.00015993322203672789, + "loss": 0.6205, + "step": 606 + }, + { + "epoch": 0.0002130626162772994, + "grad_norm": 0.28764042258262634, + "learning_rate": 0.00015986644407345576, + "loss": 0.449, + "step": 607 + }, + { + "epoch": 0.00021341362552981555, + "grad_norm": 0.35086238384246826, + "learning_rate": 0.00015979966611018363, + "loss": 0.549, + "step": 608 + }, + { + "epoch": 0.0002137646347823317, + "grad_norm": 0.3118048906326294, + "learning_rate": 0.0001597328881469115, + "loss": 0.6037, + "step": 609 + }, + { + "epoch": 0.00021411564403484785, + "grad_norm": 0.3894517123699188, + "learning_rate": 0.0001596661101836394, + "loss": 0.5989, + "step": 610 + }, + { + "epoch": 0.000214466653287364, + "grad_norm": 0.39642322063446045, + "learning_rate": 0.00015959933222036728, + "loss": 0.566, + "step": 611 + }, + { + "epoch": 0.00021481766253988012, + "grad_norm": 0.35333508253097534, + "learning_rate": 0.00015953255425709518, + "loss": 0.5055, + "step": 612 + }, + { + "epoch": 0.00021516867179239627, + "grad_norm": 0.39200490713119507, + "learning_rate": 0.00015946577629382305, + "loss": 0.5951, + "step": 613 + }, + { + "epoch": 0.00021551968104491242, + "grad_norm": 0.38436442613601685, + "learning_rate": 0.00015939899833055093, + "loss": 0.4876, + "step": 614 + }, + { + "epoch": 0.00021587069029742857, + "grad_norm": 0.3397504389286041, + "learning_rate": 0.0001593322203672788, + "loss": 0.6287, + "step": 615 + }, + { + "epoch": 0.00021622169954994472, + "grad_norm": 0.35870012640953064, + "learning_rate": 0.0001592654424040067, + "loss": 0.5857, + "step": 616 + }, + { + "epoch": 0.00021657270880246084, + "grad_norm": 0.31163597106933594, + "learning_rate": 0.00015919866444073457, + "loss": 0.4831, + "step": 617 + }, + { + "epoch": 0.000216923718054977, + "grad_norm": 0.35106539726257324, + "learning_rate": 0.00015913188647746245, + "loss": 0.5776, + "step": 618 + }, + { + "epoch": 0.00021727472730749314, + "grad_norm": 0.3639923334121704, + "learning_rate": 0.00015906510851419032, + "loss": 0.5039, + "step": 619 + }, + { + "epoch": 0.0002176257365600093, + "grad_norm": 0.3622918128967285, + "learning_rate": 0.0001589983305509182, + "loss": 0.6293, + "step": 620 + }, + { + "epoch": 0.00021797674581252544, + "grad_norm": 0.3899349868297577, + "learning_rate": 0.0001589315525876461, + "loss": 0.567, + "step": 621 + }, + { + "epoch": 0.00021832775506504156, + "grad_norm": 0.3834361732006073, + "learning_rate": 0.00015886477462437397, + "loss": 0.5106, + "step": 622 + }, + { + "epoch": 0.0002186787643175577, + "grad_norm": 0.34996962547302246, + "learning_rate": 0.00015879799666110184, + "loss": 0.5155, + "step": 623 + }, + { + "epoch": 0.00021902977357007386, + "grad_norm": 0.47908079624176025, + "learning_rate": 0.0001587312186978297, + "loss": 0.4529, + "step": 624 + }, + { + "epoch": 0.00021938078282259, + "grad_norm": 0.3167901635169983, + "learning_rate": 0.00015866444073455758, + "loss": 0.6075, + "step": 625 + }, + { + "epoch": 0.00021973179207510616, + "grad_norm": 0.4254927337169647, + "learning_rate": 0.00015859766277128548, + "loss": 0.6404, + "step": 626 + }, + { + "epoch": 0.00022008280132762228, + "grad_norm": 0.4317469000816345, + "learning_rate": 0.00015853088480801336, + "loss": 0.5881, + "step": 627 + }, + { + "epoch": 0.00022043381058013843, + "grad_norm": 0.4441644251346588, + "learning_rate": 0.00015846410684474123, + "loss": 0.5864, + "step": 628 + }, + { + "epoch": 0.00022078481983265458, + "grad_norm": 0.37883102893829346, + "learning_rate": 0.00015839732888146913, + "loss": 0.5664, + "step": 629 + }, + { + "epoch": 0.00022113582908517073, + "grad_norm": 0.35548868775367737, + "learning_rate": 0.000158330550918197, + "loss": 0.5712, + "step": 630 + }, + { + "epoch": 0.00022148683833768688, + "grad_norm": 0.31588616967201233, + "learning_rate": 0.00015826377295492488, + "loss": 0.4856, + "step": 631 + }, + { + "epoch": 0.000221837847590203, + "grad_norm": 0.3186424672603607, + "learning_rate": 0.00015819699499165278, + "loss": 0.542, + "step": 632 + }, + { + "epoch": 0.00022218885684271915, + "grad_norm": 0.41098466515541077, + "learning_rate": 0.00015813021702838065, + "loss": 0.6311, + "step": 633 + }, + { + "epoch": 0.0002225398660952353, + "grad_norm": 0.413401335477829, + "learning_rate": 0.00015806343906510852, + "loss": 0.5036, + "step": 634 + }, + { + "epoch": 0.00022289087534775145, + "grad_norm": 0.34203773736953735, + "learning_rate": 0.0001579966611018364, + "loss": 0.5508, + "step": 635 + }, + { + "epoch": 0.0002232418846002676, + "grad_norm": 0.34416648745536804, + "learning_rate": 0.00015792988313856427, + "loss": 0.5442, + "step": 636 + }, + { + "epoch": 0.00022359289385278372, + "grad_norm": 0.3439941704273224, + "learning_rate": 0.00015786310517529217, + "loss": 0.4969, + "step": 637 + }, + { + "epoch": 0.00022394390310529987, + "grad_norm": 0.3547762930393219, + "learning_rate": 0.00015779632721202004, + "loss": 0.5564, + "step": 638 + }, + { + "epoch": 0.00022429491235781602, + "grad_norm": 0.35666894912719727, + "learning_rate": 0.00015772954924874792, + "loss": 0.4759, + "step": 639 + }, + { + "epoch": 0.00022464592161033217, + "grad_norm": 0.3175058364868164, + "learning_rate": 0.0001576627712854758, + "loss": 0.5708, + "step": 640 + }, + { + "epoch": 0.00022499693086284832, + "grad_norm": 0.4329943358898163, + "learning_rate": 0.00015759599332220366, + "loss": 0.5293, + "step": 641 + }, + { + "epoch": 0.00022534794011536444, + "grad_norm": 0.5703821778297424, + "learning_rate": 0.00015752921535893156, + "loss": 0.6187, + "step": 642 + }, + { + "epoch": 0.0002256989493678806, + "grad_norm": 0.32244032621383667, + "learning_rate": 0.00015746243739565944, + "loss": 0.4847, + "step": 643 + }, + { + "epoch": 0.00022604995862039674, + "grad_norm": 0.36224085092544556, + "learning_rate": 0.0001573956594323873, + "loss": 0.6804, + "step": 644 + }, + { + "epoch": 0.0002264009678729129, + "grad_norm": 0.3316931426525116, + "learning_rate": 0.0001573288814691152, + "loss": 0.6413, + "step": 645 + }, + { + "epoch": 0.00022675197712542904, + "grad_norm": 0.38156425952911377, + "learning_rate": 0.00015726210350584308, + "loss": 0.5659, + "step": 646 + }, + { + "epoch": 0.00022710298637794516, + "grad_norm": 0.48353493213653564, + "learning_rate": 0.00015719532554257096, + "loss": 0.5788, + "step": 647 + }, + { + "epoch": 0.00022745399563046131, + "grad_norm": 0.3913673758506775, + "learning_rate": 0.00015712854757929886, + "loss": 0.6899, + "step": 648 + }, + { + "epoch": 0.00022780500488297746, + "grad_norm": 0.46836981177330017, + "learning_rate": 0.00015706176961602673, + "loss": 0.5712, + "step": 649 + }, + { + "epoch": 0.0002281560141354936, + "grad_norm": 0.34713172912597656, + "learning_rate": 0.0001569949916527546, + "loss": 0.381, + "step": 650 + }, + { + "epoch": 0.00022850702338800976, + "grad_norm": 0.3837398886680603, + "learning_rate": 0.00015692821368948248, + "loss": 0.5236, + "step": 651 + }, + { + "epoch": 0.00022885803264052589, + "grad_norm": 0.5181556940078735, + "learning_rate": 0.00015686143572621035, + "loss": 0.5889, + "step": 652 + }, + { + "epoch": 0.00022920904189304203, + "grad_norm": 0.42713961005210876, + "learning_rate": 0.00015679465776293825, + "loss": 0.5346, + "step": 653 + }, + { + "epoch": 0.00022956005114555818, + "grad_norm": 0.2868479788303375, + "learning_rate": 0.00015672787979966612, + "loss": 0.5546, + "step": 654 + }, + { + "epoch": 0.00022991106039807433, + "grad_norm": 0.31901800632476807, + "learning_rate": 0.000156661101836394, + "loss": 0.5014, + "step": 655 + }, + { + "epoch": 0.00023026206965059048, + "grad_norm": 0.41681963205337524, + "learning_rate": 0.00015659432387312187, + "loss": 0.5709, + "step": 656 + }, + { + "epoch": 0.0002306130789031066, + "grad_norm": 0.5942090749740601, + "learning_rate": 0.00015652754590984974, + "loss": 0.6022, + "step": 657 + }, + { + "epoch": 0.00023096408815562276, + "grad_norm": 0.405391126871109, + "learning_rate": 0.00015646076794657764, + "loss": 0.5363, + "step": 658 + }, + { + "epoch": 0.0002313150974081389, + "grad_norm": 0.3201390206813812, + "learning_rate": 0.00015639398998330552, + "loss": 0.6045, + "step": 659 + }, + { + "epoch": 0.00023166610666065505, + "grad_norm": 0.2989407479763031, + "learning_rate": 0.0001563272120200334, + "loss": 0.5604, + "step": 660 + }, + { + "epoch": 0.0002320171159131712, + "grad_norm": 0.3919268548488617, + "learning_rate": 0.00015626043405676126, + "loss": 0.5413, + "step": 661 + }, + { + "epoch": 0.00023236812516568733, + "grad_norm": 0.4080122709274292, + "learning_rate": 0.00015619365609348916, + "loss": 0.498, + "step": 662 + }, + { + "epoch": 0.00023271913441820348, + "grad_norm": 0.38974156975746155, + "learning_rate": 0.00015612687813021704, + "loss": 0.6149, + "step": 663 + }, + { + "epoch": 0.00023307014367071962, + "grad_norm": 0.3145015835762024, + "learning_rate": 0.00015606010016694494, + "loss": 0.4886, + "step": 664 + }, + { + "epoch": 0.00023342115292323577, + "grad_norm": 0.3009328246116638, + "learning_rate": 0.0001559933222036728, + "loss": 0.5534, + "step": 665 + }, + { + "epoch": 0.00023377216217575192, + "grad_norm": 0.4774717092514038, + "learning_rate": 0.00015592654424040068, + "loss": 0.6006, + "step": 666 + }, + { + "epoch": 0.00023412317142826805, + "grad_norm": 0.32965418696403503, + "learning_rate": 0.00015585976627712856, + "loss": 0.5463, + "step": 667 + }, + { + "epoch": 0.0002344741806807842, + "grad_norm": 0.3066554665565491, + "learning_rate": 0.00015579298831385643, + "loss": 0.5675, + "step": 668 + }, + { + "epoch": 0.00023482518993330035, + "grad_norm": 0.3879207372665405, + "learning_rate": 0.00015572621035058433, + "loss": 0.5825, + "step": 669 + }, + { + "epoch": 0.0002351761991858165, + "grad_norm": 0.3171943128108978, + "learning_rate": 0.0001556594323873122, + "loss": 0.5677, + "step": 670 + }, + { + "epoch": 0.00023552720843833264, + "grad_norm": 0.36982622742652893, + "learning_rate": 0.00015559265442404007, + "loss": 0.5885, + "step": 671 + }, + { + "epoch": 0.00023587821769084877, + "grad_norm": 0.30437183380126953, + "learning_rate": 0.00015552587646076795, + "loss": 0.6288, + "step": 672 + }, + { + "epoch": 0.00023622922694336492, + "grad_norm": 0.30654504895210266, + "learning_rate": 0.00015545909849749582, + "loss": 0.5924, + "step": 673 + }, + { + "epoch": 0.00023658023619588107, + "grad_norm": 0.3771214783191681, + "learning_rate": 0.00015539232053422372, + "loss": 0.4901, + "step": 674 + }, + { + "epoch": 0.00023693124544839721, + "grad_norm": 0.3018699884414673, + "learning_rate": 0.0001553255425709516, + "loss": 0.6159, + "step": 675 + }, + { + "epoch": 0.00023728225470091336, + "grad_norm": 0.32899734377861023, + "learning_rate": 0.00015525876460767947, + "loss": 0.6197, + "step": 676 + }, + { + "epoch": 0.0002376332639534295, + "grad_norm": 0.31837883591651917, + "learning_rate": 0.00015519198664440734, + "loss": 0.5449, + "step": 677 + }, + { + "epoch": 0.00023798427320594564, + "grad_norm": 0.35326528549194336, + "learning_rate": 0.00015512520868113521, + "loss": 0.6315, + "step": 678 + }, + { + "epoch": 0.00023833528245846179, + "grad_norm": 0.3714829385280609, + "learning_rate": 0.00015505843071786311, + "loss": 0.6352, + "step": 679 + }, + { + "epoch": 0.00023868629171097794, + "grad_norm": 0.4002094864845276, + "learning_rate": 0.000154991652754591, + "loss": 0.4235, + "step": 680 + }, + { + "epoch": 0.00023903730096349408, + "grad_norm": 0.3382783532142639, + "learning_rate": 0.0001549248747913189, + "loss": 0.5476, + "step": 681 + }, + { + "epoch": 0.0002393883102160102, + "grad_norm": 0.2985747158527374, + "learning_rate": 0.00015485809682804676, + "loss": 0.5684, + "step": 682 + }, + { + "epoch": 0.00023973931946852636, + "grad_norm": 0.3288929760456085, + "learning_rate": 0.00015479131886477463, + "loss": 0.5657, + "step": 683 + }, + { + "epoch": 0.0002400903287210425, + "grad_norm": 0.39641210436820984, + "learning_rate": 0.0001547245409015025, + "loss": 0.6283, + "step": 684 + }, + { + "epoch": 0.00024044133797355866, + "grad_norm": 0.37413230538368225, + "learning_rate": 0.0001546577629382304, + "loss": 0.5778, + "step": 685 + }, + { + "epoch": 0.0002407923472260748, + "grad_norm": 0.28837504982948303, + "learning_rate": 0.00015459098497495828, + "loss": 0.5079, + "step": 686 + }, + { + "epoch": 0.00024114335647859093, + "grad_norm": 0.32851526141166687, + "learning_rate": 0.00015452420701168615, + "loss": 0.649, + "step": 687 + }, + { + "epoch": 0.00024149436573110708, + "grad_norm": 0.3848758637905121, + "learning_rate": 0.00015445742904841403, + "loss": 0.6099, + "step": 688 + }, + { + "epoch": 0.00024184537498362323, + "grad_norm": 0.35494935512542725, + "learning_rate": 0.0001543906510851419, + "loss": 0.6498, + "step": 689 + }, + { + "epoch": 0.00024219638423613938, + "grad_norm": 0.3431280553340912, + "learning_rate": 0.0001543238731218698, + "loss": 0.4934, + "step": 690 + }, + { + "epoch": 0.00024254739348865553, + "grad_norm": 0.33980974555015564, + "learning_rate": 0.00015425709515859767, + "loss": 0.5556, + "step": 691 + }, + { + "epoch": 0.00024289840274117165, + "grad_norm": 0.3086068034172058, + "learning_rate": 0.00015419031719532555, + "loss": 0.5955, + "step": 692 + }, + { + "epoch": 0.0002432494119936878, + "grad_norm": 0.33093178272247314, + "learning_rate": 0.00015412353923205342, + "loss": 0.5926, + "step": 693 + }, + { + "epoch": 0.00024360042124620395, + "grad_norm": 0.3660534620285034, + "learning_rate": 0.0001540567612687813, + "loss": 0.5494, + "step": 694 + }, + { + "epoch": 0.0002439514304987201, + "grad_norm": 0.29803964495658875, + "learning_rate": 0.0001539899833055092, + "loss": 0.6074, + "step": 695 + }, + { + "epoch": 0.00024430243975123625, + "grad_norm": 0.36542224884033203, + "learning_rate": 0.00015392320534223707, + "loss": 0.59, + "step": 696 + }, + { + "epoch": 0.00024465344900375237, + "grad_norm": 0.34015166759490967, + "learning_rate": 0.00015385642737896494, + "loss": 0.6029, + "step": 697 + }, + { + "epoch": 0.00024500445825626854, + "grad_norm": 0.3211725950241089, + "learning_rate": 0.00015378964941569284, + "loss": 0.535, + "step": 698 + }, + { + "epoch": 0.00024535546750878467, + "grad_norm": 0.37027183175086975, + "learning_rate": 0.0001537228714524207, + "loss": 0.6265, + "step": 699 + }, + { + "epoch": 0.0002457064767613008, + "grad_norm": 0.3447396159172058, + "learning_rate": 0.00015365609348914859, + "loss": 0.6061, + "step": 700 + }, + { + "epoch": 0.00024605748601381697, + "grad_norm": 0.3344075679779053, + "learning_rate": 0.00015358931552587649, + "loss": 0.5412, + "step": 701 + }, + { + "epoch": 0.0002464084952663331, + "grad_norm": 0.29049620032310486, + "learning_rate": 0.00015352253756260436, + "loss": 0.5137, + "step": 702 + }, + { + "epoch": 0.00024675950451884926, + "grad_norm": 0.37048932909965515, + "learning_rate": 0.00015345575959933223, + "loss": 0.6118, + "step": 703 + }, + { + "epoch": 0.0002471105137713654, + "grad_norm": 0.38212522864341736, + "learning_rate": 0.0001533889816360601, + "loss": 0.466, + "step": 704 + }, + { + "epoch": 0.0002474615230238815, + "grad_norm": 0.3576483428478241, + "learning_rate": 0.00015332220367278798, + "loss": 0.561, + "step": 705 + }, + { + "epoch": 0.0002478125322763977, + "grad_norm": 0.3550293743610382, + "learning_rate": 0.00015325542570951588, + "loss": 0.5634, + "step": 706 + }, + { + "epoch": 0.0002481635415289138, + "grad_norm": 0.362474650144577, + "learning_rate": 0.00015318864774624375, + "loss": 0.5608, + "step": 707 + }, + { + "epoch": 0.00024851455078143, + "grad_norm": 0.39463603496551514, + "learning_rate": 0.00015312186978297163, + "loss": 0.64, + "step": 708 + }, + { + "epoch": 0.0002488655600339461, + "grad_norm": 0.3456307649612427, + "learning_rate": 0.0001530550918196995, + "loss": 0.4631, + "step": 709 + }, + { + "epoch": 0.00024921656928646223, + "grad_norm": 0.3300929367542267, + "learning_rate": 0.00015298831385642737, + "loss": 0.3984, + "step": 710 + }, + { + "epoch": 0.0002495675785389784, + "grad_norm": 0.35923343896865845, + "learning_rate": 0.00015292153589315527, + "loss": 0.6003, + "step": 711 + }, + { + "epoch": 0.00024991858779149453, + "grad_norm": 0.4047611653804779, + "learning_rate": 0.00015285475792988315, + "loss": 0.5715, + "step": 712 + }, + { + "epoch": 0.0002502695970440107, + "grad_norm": 0.43539851903915405, + "learning_rate": 0.00015278797996661102, + "loss": 0.571, + "step": 713 + }, + { + "epoch": 0.00025062060629652683, + "grad_norm": 0.34745046496391296, + "learning_rate": 0.0001527212020033389, + "loss": 0.622, + "step": 714 + }, + { + "epoch": 0.00025097161554904295, + "grad_norm": 0.3130028247833252, + "learning_rate": 0.0001526544240400668, + "loss": 0.507, + "step": 715 + }, + { + "epoch": 0.0002513226248015591, + "grad_norm": 0.3093617558479309, + "learning_rate": 0.00015258764607679466, + "loss": 0.4951, + "step": 716 + }, + { + "epoch": 0.00025167363405407525, + "grad_norm": 0.34299540519714355, + "learning_rate": 0.00015252086811352257, + "loss": 0.539, + "step": 717 + }, + { + "epoch": 0.0002520246433065914, + "grad_norm": 0.32698413729667664, + "learning_rate": 0.00015245409015025044, + "loss": 0.4588, + "step": 718 + }, + { + "epoch": 0.00025237565255910755, + "grad_norm": 0.37853989005088806, + "learning_rate": 0.0001523873121869783, + "loss": 0.6227, + "step": 719 + }, + { + "epoch": 0.00025272666181162367, + "grad_norm": 0.32887300848960876, + "learning_rate": 0.00015232053422370618, + "loss": 0.5893, + "step": 720 + }, + { + "epoch": 0.00025307767106413985, + "grad_norm": 0.43352028727531433, + "learning_rate": 0.00015225375626043406, + "loss": 0.5811, + "step": 721 + }, + { + "epoch": 0.00025342868031665597, + "grad_norm": 0.42844903469085693, + "learning_rate": 0.00015218697829716196, + "loss": 0.6196, + "step": 722 + }, + { + "epoch": 0.00025377968956917215, + "grad_norm": 0.39929670095443726, + "learning_rate": 0.00015212020033388983, + "loss": 0.6722, + "step": 723 + }, + { + "epoch": 0.00025413069882168827, + "grad_norm": 0.5063486695289612, + "learning_rate": 0.0001520534223706177, + "loss": 0.6086, + "step": 724 + }, + { + "epoch": 0.0002544817080742044, + "grad_norm": 0.3625267446041107, + "learning_rate": 0.00015198664440734558, + "loss": 0.6331, + "step": 725 + }, + { + "epoch": 0.00025483271732672057, + "grad_norm": 0.3452700078487396, + "learning_rate": 0.00015191986644407345, + "loss": 0.5812, + "step": 726 + }, + { + "epoch": 0.0002551837265792367, + "grad_norm": 0.31915003061294556, + "learning_rate": 0.00015185308848080135, + "loss": 0.5653, + "step": 727 + }, + { + "epoch": 0.00025553473583175287, + "grad_norm": 0.3085877299308777, + "learning_rate": 0.00015178631051752922, + "loss": 0.4702, + "step": 728 + }, + { + "epoch": 0.000255885745084269, + "grad_norm": 0.31519320607185364, + "learning_rate": 0.0001517195325542571, + "loss": 0.5096, + "step": 729 + }, + { + "epoch": 0.0002562367543367851, + "grad_norm": 0.3637699782848358, + "learning_rate": 0.00015165275459098497, + "loss": 0.6001, + "step": 730 + }, + { + "epoch": 0.0002565877635893013, + "grad_norm": 0.34056970477104187, + "learning_rate": 0.00015158597662771284, + "loss": 0.5546, + "step": 731 + }, + { + "epoch": 0.0002569387728418174, + "grad_norm": 0.37110257148742676, + "learning_rate": 0.00015151919866444074, + "loss": 0.5612, + "step": 732 + }, + { + "epoch": 0.0002572897820943336, + "grad_norm": 0.35854101181030273, + "learning_rate": 0.00015145242070116862, + "loss": 0.6364, + "step": 733 + }, + { + "epoch": 0.0002576407913468497, + "grad_norm": 0.4340030252933502, + "learning_rate": 0.00015138564273789652, + "loss": 0.5772, + "step": 734 + }, + { + "epoch": 0.00025799180059936583, + "grad_norm": 0.3807721436023712, + "learning_rate": 0.0001513188647746244, + "loss": 0.4986, + "step": 735 + }, + { + "epoch": 0.000258342809851882, + "grad_norm": 0.3522527813911438, + "learning_rate": 0.00015125208681135226, + "loss": 0.5982, + "step": 736 + }, + { + "epoch": 0.00025869381910439813, + "grad_norm": 0.31251296401023865, + "learning_rate": 0.00015118530884808014, + "loss": 0.5239, + "step": 737 + }, + { + "epoch": 0.0002590448283569143, + "grad_norm": 0.3460885286331177, + "learning_rate": 0.00015111853088480804, + "loss": 0.5881, + "step": 738 + }, + { + "epoch": 0.00025939583760943043, + "grad_norm": 0.33298879861831665, + "learning_rate": 0.0001510517529215359, + "loss": 0.5272, + "step": 739 + }, + { + "epoch": 0.00025974684686194655, + "grad_norm": 0.351468950510025, + "learning_rate": 0.00015098497495826378, + "loss": 0.6049, + "step": 740 + }, + { + "epoch": 0.00026009785611446273, + "grad_norm": 0.3449242413043976, + "learning_rate": 0.00015091819699499166, + "loss": 0.5983, + "step": 741 + }, + { + "epoch": 0.00026044886536697885, + "grad_norm": 0.34724265336990356, + "learning_rate": 0.00015085141903171953, + "loss": 0.5292, + "step": 742 + }, + { + "epoch": 0.00026079987461949503, + "grad_norm": 0.3525671660900116, + "learning_rate": 0.00015078464106844743, + "loss": 0.5391, + "step": 743 + }, + { + "epoch": 0.00026115088387201115, + "grad_norm": 0.33959653973579407, + "learning_rate": 0.0001507178631051753, + "loss": 0.5898, + "step": 744 + }, + { + "epoch": 0.00026150189312452727, + "grad_norm": 0.5051225423812866, + "learning_rate": 0.00015065108514190318, + "loss": 0.5408, + "step": 745 + }, + { + "epoch": 0.00026185290237704345, + "grad_norm": 0.3298085629940033, + "learning_rate": 0.00015058430717863105, + "loss": 0.557, + "step": 746 + }, + { + "epoch": 0.00026220391162955957, + "grad_norm": 0.3375703990459442, + "learning_rate": 0.00015051752921535892, + "loss": 0.5541, + "step": 747 + }, + { + "epoch": 0.00026255492088207575, + "grad_norm": 0.27896445989608765, + "learning_rate": 0.0001504507512520868, + "loss": 0.5273, + "step": 748 + }, + { + "epoch": 0.00026290593013459187, + "grad_norm": 0.30591917037963867, + "learning_rate": 0.0001503839732888147, + "loss": 0.5988, + "step": 749 + }, + { + "epoch": 0.000263256939387108, + "grad_norm": 0.41014084219932556, + "learning_rate": 0.00015031719532554257, + "loss": 0.555, + "step": 750 + }, + { + "epoch": 0.00026360794863962417, + "grad_norm": 0.2935464084148407, + "learning_rate": 0.00015025041736227047, + "loss": 0.625, + "step": 751 + }, + { + "epoch": 0.0002639589578921403, + "grad_norm": 0.46361032128334045, + "learning_rate": 0.00015018363939899834, + "loss": 0.4753, + "step": 752 + }, + { + "epoch": 0.00026430996714465647, + "grad_norm": 0.35808300971984863, + "learning_rate": 0.00015011686143572622, + "loss": 0.5531, + "step": 753 + }, + { + "epoch": 0.0002646609763971726, + "grad_norm": 0.3411274254322052, + "learning_rate": 0.00015005008347245412, + "loss": 0.5577, + "step": 754 + }, + { + "epoch": 0.0002650119856496887, + "grad_norm": 0.34169328212738037, + "learning_rate": 0.000149983305509182, + "loss": 0.4856, + "step": 755 + }, + { + "epoch": 0.0002653629949022049, + "grad_norm": 0.38024139404296875, + "learning_rate": 0.00014991652754590986, + "loss": 0.5203, + "step": 756 + }, + { + "epoch": 0.000265714004154721, + "grad_norm": 0.35004425048828125, + "learning_rate": 0.00014984974958263774, + "loss": 0.4999, + "step": 757 + }, + { + "epoch": 0.0002660650134072372, + "grad_norm": 0.47526153922080994, + "learning_rate": 0.0001497829716193656, + "loss": 0.5503, + "step": 758 + }, + { + "epoch": 0.0002664160226597533, + "grad_norm": 0.35096925497055054, + "learning_rate": 0.0001497161936560935, + "loss": 0.5812, + "step": 759 + }, + { + "epoch": 0.00026676703191226943, + "grad_norm": 0.4505446255207062, + "learning_rate": 0.00014964941569282138, + "loss": 0.6069, + "step": 760 + }, + { + "epoch": 0.0002671180411647856, + "grad_norm": 0.3261663019657135, + "learning_rate": 0.00014958263772954926, + "loss": 0.5601, + "step": 761 + }, + { + "epoch": 0.00026746905041730173, + "grad_norm": 0.3397548794746399, + "learning_rate": 0.00014951585976627713, + "loss": 0.5572, + "step": 762 + }, + { + "epoch": 0.00026782005966981785, + "grad_norm": 0.35547688603401184, + "learning_rate": 0.000149449081803005, + "loss": 0.5983, + "step": 763 + }, + { + "epoch": 0.00026817106892233403, + "grad_norm": 0.41515079140663147, + "learning_rate": 0.00014938230383973287, + "loss": 0.6106, + "step": 764 + }, + { + "epoch": 0.00026852207817485015, + "grad_norm": 0.3840051591396332, + "learning_rate": 0.00014931552587646077, + "loss": 0.5328, + "step": 765 + }, + { + "epoch": 0.00026887308742736633, + "grad_norm": 0.3401285707950592, + "learning_rate": 0.00014924874791318865, + "loss": 0.4666, + "step": 766 + }, + { + "epoch": 0.00026922409667988245, + "grad_norm": 0.32983794808387756, + "learning_rate": 0.00014918196994991652, + "loss": 0.5214, + "step": 767 + }, + { + "epoch": 0.0002695751059323986, + "grad_norm": 0.30202198028564453, + "learning_rate": 0.00014911519198664442, + "loss": 0.4969, + "step": 768 + }, + { + "epoch": 0.00026992611518491475, + "grad_norm": 0.3222092092037201, + "learning_rate": 0.0001490484140233723, + "loss": 0.5093, + "step": 769 + }, + { + "epoch": 0.0002702771244374309, + "grad_norm": 0.4211997091770172, + "learning_rate": 0.0001489816360601002, + "loss": 0.6295, + "step": 770 + }, + { + "epoch": 0.00027062813368994705, + "grad_norm": 0.32112184166908264, + "learning_rate": 0.00014891485809682807, + "loss": 0.5611, + "step": 771 + }, + { + "epoch": 0.00027097914294246317, + "grad_norm": 0.3272956609725952, + "learning_rate": 0.00014884808013355594, + "loss": 0.6438, + "step": 772 + }, + { + "epoch": 0.0002713301521949793, + "grad_norm": 0.39423295855522156, + "learning_rate": 0.00014878130217028381, + "loss": 0.6029, + "step": 773 + }, + { + "epoch": 0.00027168116144749547, + "grad_norm": 0.3053528070449829, + "learning_rate": 0.0001487145242070117, + "loss": 0.4978, + "step": 774 + }, + { + "epoch": 0.0002720321707000116, + "grad_norm": 0.312774658203125, + "learning_rate": 0.0001486477462437396, + "loss": 0.5753, + "step": 775 + }, + { + "epoch": 0.00027238317995252777, + "grad_norm": 0.343964546918869, + "learning_rate": 0.00014858096828046746, + "loss": 0.5173, + "step": 776 + }, + { + "epoch": 0.0002727341892050439, + "grad_norm": 0.39104631543159485, + "learning_rate": 0.00014851419031719533, + "loss": 0.6381, + "step": 777 + }, + { + "epoch": 0.00027308519845756, + "grad_norm": 0.3958207070827484, + "learning_rate": 0.0001484474123539232, + "loss": 0.6046, + "step": 778 + }, + { + "epoch": 0.0002734362077100762, + "grad_norm": 0.36198097467422485, + "learning_rate": 0.00014838063439065108, + "loss": 0.6066, + "step": 779 + }, + { + "epoch": 0.0002737872169625923, + "grad_norm": 0.29619571566581726, + "learning_rate": 0.00014831385642737895, + "loss": 0.5131, + "step": 780 + }, + { + "epoch": 0.0002741382262151085, + "grad_norm": 0.344784677028656, + "learning_rate": 0.00014824707846410685, + "loss": 0.5626, + "step": 781 + }, + { + "epoch": 0.0002744892354676246, + "grad_norm": 0.35641250014305115, + "learning_rate": 0.00014818030050083473, + "loss": 0.5451, + "step": 782 + }, + { + "epoch": 0.00027484024472014074, + "grad_norm": 0.3496847152709961, + "learning_rate": 0.0001481135225375626, + "loss": 0.4814, + "step": 783 + }, + { + "epoch": 0.0002751912539726569, + "grad_norm": 0.3726658821105957, + "learning_rate": 0.00014804674457429047, + "loss": 0.6244, + "step": 784 + }, + { + "epoch": 0.00027554226322517303, + "grad_norm": 0.3317565619945526, + "learning_rate": 0.00014797996661101837, + "loss": 0.562, + "step": 785 + }, + { + "epoch": 0.0002758932724776892, + "grad_norm": 0.3478979468345642, + "learning_rate": 0.00014791318864774625, + "loss": 0.613, + "step": 786 + }, + { + "epoch": 0.00027624428173020533, + "grad_norm": 0.3572550415992737, + "learning_rate": 0.00014784641068447415, + "loss": 0.4841, + "step": 787 + }, + { + "epoch": 0.00027659529098272146, + "grad_norm": 0.34030210971832275, + "learning_rate": 0.00014777963272120202, + "loss": 0.4879, + "step": 788 + }, + { + "epoch": 0.00027694630023523763, + "grad_norm": 0.378203421831131, + "learning_rate": 0.0001477128547579299, + "loss": 0.6086, + "step": 789 + }, + { + "epoch": 0.00027729730948775375, + "grad_norm": 0.3390562832355499, + "learning_rate": 0.00014764607679465777, + "loss": 0.586, + "step": 790 + }, + { + "epoch": 0.00027764831874026993, + "grad_norm": 0.4986645579338074, + "learning_rate": 0.00014757929883138567, + "loss": 0.5592, + "step": 791 + }, + { + "epoch": 0.00027799932799278605, + "grad_norm": 0.3361869156360626, + "learning_rate": 0.00014751252086811354, + "loss": 0.4632, + "step": 792 + }, + { + "epoch": 0.0002783503372453022, + "grad_norm": 0.3726123571395874, + "learning_rate": 0.0001474457429048414, + "loss": 0.4915, + "step": 793 + }, + { + "epoch": 0.00027870134649781835, + "grad_norm": 0.3358845114707947, + "learning_rate": 0.00014737896494156929, + "loss": 0.5593, + "step": 794 + }, + { + "epoch": 0.0002790523557503345, + "grad_norm": 0.30473607778549194, + "learning_rate": 0.00014731218697829716, + "loss": 0.3672, + "step": 795 + }, + { + "epoch": 0.00027940336500285065, + "grad_norm": 0.33929023146629333, + "learning_rate": 0.00014724540901502506, + "loss": 0.5404, + "step": 796 + }, + { + "epoch": 0.0002797543742553668, + "grad_norm": 0.30778205394744873, + "learning_rate": 0.00014717863105175293, + "loss": 0.4379, + "step": 797 + }, + { + "epoch": 0.0002801053835078829, + "grad_norm": 0.286443829536438, + "learning_rate": 0.0001471118530884808, + "loss": 0.5579, + "step": 798 + }, + { + "epoch": 0.0002804563927603991, + "grad_norm": 0.4246799051761627, + "learning_rate": 0.00014704507512520868, + "loss": 0.536, + "step": 799 + }, + { + "epoch": 0.0002808074020129152, + "grad_norm": 0.4085538983345032, + "learning_rate": 0.00014697829716193655, + "loss": 0.5309, + "step": 800 + }, + { + "epoch": 0.00028115841126543137, + "grad_norm": 0.35396453738212585, + "learning_rate": 0.00014691151919866443, + "loss": 0.5307, + "step": 801 + }, + { + "epoch": 0.0002815094205179475, + "grad_norm": 0.45588648319244385, + "learning_rate": 0.00014684474123539233, + "loss": 0.5905, + "step": 802 + }, + { + "epoch": 0.0002818604297704636, + "grad_norm": 0.3353815972805023, + "learning_rate": 0.0001467779632721202, + "loss": 0.612, + "step": 803 + }, + { + "epoch": 0.0002822114390229798, + "grad_norm": 0.4152653217315674, + "learning_rate": 0.0001467111853088481, + "loss": 0.592, + "step": 804 + }, + { + "epoch": 0.0002825624482754959, + "grad_norm": 0.3651511073112488, + "learning_rate": 0.00014664440734557597, + "loss": 0.5909, + "step": 805 + }, + { + "epoch": 0.0002829134575280121, + "grad_norm": 0.3518235385417938, + "learning_rate": 0.00014657762938230385, + "loss": 0.5684, + "step": 806 + }, + { + "epoch": 0.0002832644667805282, + "grad_norm": 0.33562156558036804, + "learning_rate": 0.00014651085141903175, + "loss": 0.5165, + "step": 807 + }, + { + "epoch": 0.00028361547603304434, + "grad_norm": 0.3648052513599396, + "learning_rate": 0.00014644407345575962, + "loss": 0.5451, + "step": 808 + }, + { + "epoch": 0.0002839664852855605, + "grad_norm": 0.44342300295829773, + "learning_rate": 0.0001463772954924875, + "loss": 0.5907, + "step": 809 + }, + { + "epoch": 0.00028431749453807664, + "grad_norm": 0.33331966400146484, + "learning_rate": 0.00014631051752921536, + "loss": 0.4254, + "step": 810 + }, + { + "epoch": 0.0002846685037905928, + "grad_norm": 0.3444873094558716, + "learning_rate": 0.00014624373956594324, + "loss": 0.5201, + "step": 811 + }, + { + "epoch": 0.00028501951304310894, + "grad_norm": 0.4239615201950073, + "learning_rate": 0.00014617696160267114, + "loss": 0.5098, + "step": 812 + }, + { + "epoch": 0.00028537052229562506, + "grad_norm": 0.47895997762680054, + "learning_rate": 0.000146110183639399, + "loss": 0.6243, + "step": 813 + }, + { + "epoch": 0.00028572153154814123, + "grad_norm": 0.47322046756744385, + "learning_rate": 0.00014604340567612688, + "loss": 0.6841, + "step": 814 + }, + { + "epoch": 0.00028607254080065736, + "grad_norm": 0.35017871856689453, + "learning_rate": 0.00014597662771285476, + "loss": 0.5313, + "step": 815 + }, + { + "epoch": 0.00028642355005317353, + "grad_norm": 0.4342300295829773, + "learning_rate": 0.00014590984974958263, + "loss": 0.4363, + "step": 816 + }, + { + "epoch": 0.00028677455930568966, + "grad_norm": 0.2966228723526001, + "learning_rate": 0.0001458430717863105, + "loss": 0.6428, + "step": 817 + }, + { + "epoch": 0.0002871255685582058, + "grad_norm": 0.3320361375808716, + "learning_rate": 0.0001457762938230384, + "loss": 0.5266, + "step": 818 + }, + { + "epoch": 0.00028747657781072195, + "grad_norm": 0.3318590223789215, + "learning_rate": 0.00014570951585976628, + "loss": 0.5676, + "step": 819 + }, + { + "epoch": 0.0002878275870632381, + "grad_norm": 0.38573157787323, + "learning_rate": 0.00014564273789649415, + "loss": 0.7083, + "step": 820 + }, + { + "epoch": 0.00028817859631575425, + "grad_norm": 0.3731164038181305, + "learning_rate": 0.00014557595993322205, + "loss": 0.578, + "step": 821 + }, + { + "epoch": 0.0002885296055682704, + "grad_norm": 0.33610039949417114, + "learning_rate": 0.00014550918196994992, + "loss": 0.5923, + "step": 822 + }, + { + "epoch": 0.0002888806148207865, + "grad_norm": 0.3393179476261139, + "learning_rate": 0.00014544240400667782, + "loss": 0.5162, + "step": 823 + }, + { + "epoch": 0.0002892316240733027, + "grad_norm": 0.35552918910980225, + "learning_rate": 0.0001453756260434057, + "loss": 0.556, + "step": 824 + }, + { + "epoch": 0.0002895826333258188, + "grad_norm": 0.32425832748413086, + "learning_rate": 0.00014530884808013357, + "loss": 0.5157, + "step": 825 + }, + { + "epoch": 0.000289933642578335, + "grad_norm": 0.3353455662727356, + "learning_rate": 0.00014524207011686144, + "loss": 0.483, + "step": 826 + }, + { + "epoch": 0.0002902846518308511, + "grad_norm": 0.46254628896713257, + "learning_rate": 0.00014517529215358932, + "loss": 0.633, + "step": 827 + }, + { + "epoch": 0.0002906356610833672, + "grad_norm": 0.3275732100009918, + "learning_rate": 0.00014510851419031722, + "loss": 0.5502, + "step": 828 + }, + { + "epoch": 0.0002909866703358834, + "grad_norm": 0.3495190441608429, + "learning_rate": 0.0001450417362270451, + "loss": 0.368, + "step": 829 + }, + { + "epoch": 0.0002913376795883995, + "grad_norm": 0.35350501537323, + "learning_rate": 0.00014497495826377296, + "loss": 0.5819, + "step": 830 + }, + { + "epoch": 0.0002916886888409157, + "grad_norm": 0.37886378169059753, + "learning_rate": 0.00014490818030050084, + "loss": 0.5418, + "step": 831 + }, + { + "epoch": 0.0002920396980934318, + "grad_norm": 0.4279928505420685, + "learning_rate": 0.0001448414023372287, + "loss": 0.5199, + "step": 832 + }, + { + "epoch": 0.00029239070734594794, + "grad_norm": 0.33105382323265076, + "learning_rate": 0.00014477462437395658, + "loss": 0.5952, + "step": 833 + }, + { + "epoch": 0.0002927417165984641, + "grad_norm": 0.40114086866378784, + "learning_rate": 0.00014470784641068448, + "loss": 0.4611, + "step": 834 + }, + { + "epoch": 0.00029309272585098024, + "grad_norm": 0.3294037878513336, + "learning_rate": 0.00014464106844741236, + "loss": 0.5562, + "step": 835 + }, + { + "epoch": 0.0002934437351034964, + "grad_norm": 0.3391546607017517, + "learning_rate": 0.00014457429048414023, + "loss": 0.5748, + "step": 836 + }, + { + "epoch": 0.00029379474435601254, + "grad_norm": 0.4093922972679138, + "learning_rate": 0.0001445075125208681, + "loss": 0.4607, + "step": 837 + }, + { + "epoch": 0.00029414575360852866, + "grad_norm": 0.3331819176673889, + "learning_rate": 0.000144440734557596, + "loss": 0.5874, + "step": 838 + }, + { + "epoch": 0.00029449676286104484, + "grad_norm": 0.43205946683883667, + "learning_rate": 0.00014437395659432388, + "loss": 0.6152, + "step": 839 + }, + { + "epoch": 0.00029484777211356096, + "grad_norm": 0.36046868562698364, + "learning_rate": 0.00014430717863105178, + "loss": 0.4781, + "step": 840 + }, + { + "epoch": 0.00029519878136607713, + "grad_norm": 0.35514524579048157, + "learning_rate": 0.00014424040066777965, + "loss": 0.568, + "step": 841 + }, + { + "epoch": 0.00029554979061859326, + "grad_norm": 0.40260326862335205, + "learning_rate": 0.00014417362270450752, + "loss": 0.6075, + "step": 842 + }, + { + "epoch": 0.0002959007998711094, + "grad_norm": 0.3102671205997467, + "learning_rate": 0.0001441068447412354, + "loss": 0.4927, + "step": 843 + }, + { + "epoch": 0.00029625180912362556, + "grad_norm": 0.30940982699394226, + "learning_rate": 0.0001440400667779633, + "loss": 0.5549, + "step": 844 + }, + { + "epoch": 0.0002966028183761417, + "grad_norm": 0.3652762174606323, + "learning_rate": 0.00014397328881469117, + "loss": 0.6085, + "step": 845 + }, + { + "epoch": 0.00029695382762865786, + "grad_norm": 0.43056777119636536, + "learning_rate": 0.00014390651085141904, + "loss": 0.494, + "step": 846 + }, + { + "epoch": 0.000297304836881174, + "grad_norm": 0.3112967014312744, + "learning_rate": 0.00014383973288814692, + "loss": 0.5141, + "step": 847 + }, + { + "epoch": 0.0002976558461336901, + "grad_norm": 0.36729326844215393, + "learning_rate": 0.0001437729549248748, + "loss": 0.5435, + "step": 848 + }, + { + "epoch": 0.0002980068553862063, + "grad_norm": 0.3128114938735962, + "learning_rate": 0.00014370617696160266, + "loss": 0.5419, + "step": 849 + }, + { + "epoch": 0.0002983578646387224, + "grad_norm": 0.4030589163303375, + "learning_rate": 0.00014363939899833056, + "loss": 0.5959, + "step": 850 + }, + { + "epoch": 0.0002987088738912386, + "grad_norm": 0.39571288228034973, + "learning_rate": 0.00014357262103505844, + "loss": 0.6798, + "step": 851 + }, + { + "epoch": 0.0002990598831437547, + "grad_norm": 0.3388408422470093, + "learning_rate": 0.0001435058430717863, + "loss": 0.4887, + "step": 852 + }, + { + "epoch": 0.0002994108923962708, + "grad_norm": 0.39615562558174133, + "learning_rate": 0.00014343906510851418, + "loss": 0.5654, + "step": 853 + }, + { + "epoch": 0.000299761901648787, + "grad_norm": 0.3967401683330536, + "learning_rate": 0.00014337228714524205, + "loss": 0.6192, + "step": 854 + }, + { + "epoch": 0.0003001129109013031, + "grad_norm": 0.5597772002220154, + "learning_rate": 0.00014330550918196995, + "loss": 0.5808, + "step": 855 + }, + { + "epoch": 0.0003004639201538193, + "grad_norm": 0.36231061816215515, + "learning_rate": 0.00014323873121869783, + "loss": 0.4936, + "step": 856 + }, + { + "epoch": 0.0003008149294063354, + "grad_norm": 0.3775942027568817, + "learning_rate": 0.00014317195325542573, + "loss": 0.5706, + "step": 857 + }, + { + "epoch": 0.00030116593865885154, + "grad_norm": 0.4139408767223358, + "learning_rate": 0.0001431051752921536, + "loss": 0.5784, + "step": 858 + }, + { + "epoch": 0.0003015169479113677, + "grad_norm": 0.4101429879665375, + "learning_rate": 0.00014303839732888147, + "loss": 0.5937, + "step": 859 + }, + { + "epoch": 0.00030186795716388384, + "grad_norm": 0.5272162556648254, + "learning_rate": 0.00014297161936560937, + "loss": 0.5244, + "step": 860 + }, + { + "epoch": 0.0003022189664164, + "grad_norm": 0.3587292730808258, + "learning_rate": 0.00014290484140233725, + "loss": 0.6333, + "step": 861 + }, + { + "epoch": 0.00030256997566891614, + "grad_norm": 0.3284890353679657, + "learning_rate": 0.00014283806343906512, + "loss": 0.5414, + "step": 862 + }, + { + "epoch": 0.00030292098492143226, + "grad_norm": 0.414974182844162, + "learning_rate": 0.000142771285475793, + "loss": 0.6116, + "step": 863 + }, + { + "epoch": 0.00030327199417394844, + "grad_norm": 0.33619245886802673, + "learning_rate": 0.00014270450751252087, + "loss": 0.5506, + "step": 864 + }, + { + "epoch": 0.00030362300342646456, + "grad_norm": 0.45475640892982483, + "learning_rate": 0.00014263772954924874, + "loss": 0.6347, + "step": 865 + }, + { + "epoch": 0.00030397401267898074, + "grad_norm": 0.2695920765399933, + "learning_rate": 0.00014257095158597664, + "loss": 0.4529, + "step": 866 + }, + { + "epoch": 0.00030432502193149686, + "grad_norm": 0.3314480781555176, + "learning_rate": 0.00014250417362270451, + "loss": 0.5812, + "step": 867 + }, + { + "epoch": 0.000304676031184013, + "grad_norm": 0.31949582695961, + "learning_rate": 0.0001424373956594324, + "loss": 0.5213, + "step": 868 + }, + { + "epoch": 0.00030502704043652916, + "grad_norm": 0.34049752354621887, + "learning_rate": 0.00014237061769616026, + "loss": 0.4645, + "step": 869 + }, + { + "epoch": 0.0003053780496890453, + "grad_norm": 0.4304719567298889, + "learning_rate": 0.00014230383973288813, + "loss": 0.5065, + "step": 870 + }, + { + "epoch": 0.00030572905894156146, + "grad_norm": 0.32379043102264404, + "learning_rate": 0.00014223706176961603, + "loss": 0.553, + "step": 871 + }, + { + "epoch": 0.0003060800681940776, + "grad_norm": 0.33285439014434814, + "learning_rate": 0.0001421702838063439, + "loss": 0.5092, + "step": 872 + }, + { + "epoch": 0.0003064310774465937, + "grad_norm": 0.336795449256897, + "learning_rate": 0.00014210350584307178, + "loss": 0.4967, + "step": 873 + }, + { + "epoch": 0.0003067820866991099, + "grad_norm": 0.34653040766716003, + "learning_rate": 0.00014203672787979968, + "loss": 0.5353, + "step": 874 + }, + { + "epoch": 0.000307133095951626, + "grad_norm": 0.3352467715740204, + "learning_rate": 0.00014196994991652755, + "loss": 0.5594, + "step": 875 + }, + { + "epoch": 0.0003074841052041422, + "grad_norm": 0.38723453879356384, + "learning_rate": 0.00014190317195325545, + "loss": 0.5897, + "step": 876 + }, + { + "epoch": 0.0003078351144566583, + "grad_norm": 0.3987238109111786, + "learning_rate": 0.00014183639398998333, + "loss": 0.4647, + "step": 877 + }, + { + "epoch": 0.0003081861237091744, + "grad_norm": 0.3452693223953247, + "learning_rate": 0.0001417696160267112, + "loss": 0.5687, + "step": 878 + }, + { + "epoch": 0.0003085371329616906, + "grad_norm": 0.3561328649520874, + "learning_rate": 0.00014170283806343907, + "loss": 0.5845, + "step": 879 + }, + { + "epoch": 0.0003088881422142067, + "grad_norm": 0.29658418893814087, + "learning_rate": 0.00014163606010016695, + "loss": 0.5202, + "step": 880 + }, + { + "epoch": 0.0003092391514667229, + "grad_norm": 0.3908213973045349, + "learning_rate": 0.00014156928213689482, + "loss": 0.4439, + "step": 881 + }, + { + "epoch": 0.000309590160719239, + "grad_norm": 0.35816919803619385, + "learning_rate": 0.00014150250417362272, + "loss": 0.5384, + "step": 882 + }, + { + "epoch": 0.00030994116997175514, + "grad_norm": 0.3681255877017975, + "learning_rate": 0.0001414357262103506, + "loss": 0.5999, + "step": 883 + }, + { + "epoch": 0.0003102921792242713, + "grad_norm": 0.31137388944625854, + "learning_rate": 0.00014136894824707847, + "loss": 0.4495, + "step": 884 + }, + { + "epoch": 0.00031064318847678744, + "grad_norm": 0.2831423878669739, + "learning_rate": 0.00014130217028380634, + "loss": 0.4576, + "step": 885 + }, + { + "epoch": 0.0003109941977293036, + "grad_norm": 0.25953516364097595, + "learning_rate": 0.0001412353923205342, + "loss": 0.5606, + "step": 886 + }, + { + "epoch": 0.00031134520698181974, + "grad_norm": 0.31105297803878784, + "learning_rate": 0.0001411686143572621, + "loss": 0.5986, + "step": 887 + }, + { + "epoch": 0.00031169621623433586, + "grad_norm": 0.35177484154701233, + "learning_rate": 0.00014110183639398999, + "loss": 0.3394, + "step": 888 + }, + { + "epoch": 0.00031204722548685204, + "grad_norm": 0.373470276594162, + "learning_rate": 0.00014103505843071786, + "loss": 0.5862, + "step": 889 + }, + { + "epoch": 0.00031239823473936816, + "grad_norm": 0.37227189540863037, + "learning_rate": 0.00014096828046744576, + "loss": 0.4677, + "step": 890 + }, + { + "epoch": 0.00031274924399188434, + "grad_norm": 0.3799666464328766, + "learning_rate": 0.00014090150250417363, + "loss": 0.5255, + "step": 891 + }, + { + "epoch": 0.00031310025324440046, + "grad_norm": 0.3630129098892212, + "learning_rate": 0.00014083472454090153, + "loss": 0.5111, + "step": 892 + }, + { + "epoch": 0.0003134512624969166, + "grad_norm": 0.5131457448005676, + "learning_rate": 0.0001407679465776294, + "loss": 0.5207, + "step": 893 + }, + { + "epoch": 0.00031380227174943276, + "grad_norm": 0.3759867548942566, + "learning_rate": 0.00014070116861435728, + "loss": 0.6678, + "step": 894 + }, + { + "epoch": 0.0003141532810019489, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00014063439065108515, + "loss": 0.62, + "step": 895 + }, + { + "epoch": 0.00031450429025446506, + "grad_norm": 0.2789120376110077, + "learning_rate": 0.00014056761268781303, + "loss": 0.4204, + "step": 896 + }, + { + "epoch": 0.0003148552995069812, + "grad_norm": 0.2897239327430725, + "learning_rate": 0.0001405008347245409, + "loss": 0.432, + "step": 897 + }, + { + "epoch": 0.0003152063087594973, + "grad_norm": 0.3552323579788208, + "learning_rate": 0.0001404340567612688, + "loss": 0.5512, + "step": 898 + }, + { + "epoch": 0.0003155573180120135, + "grad_norm": 0.49963894486427307, + "learning_rate": 0.00014036727879799667, + "loss": 0.5868, + "step": 899 + }, + { + "epoch": 0.0003159083272645296, + "grad_norm": 0.37479934096336365, + "learning_rate": 0.00014030050083472454, + "loss": 0.6682, + "step": 900 + }, + { + "epoch": 0.0003162593365170458, + "grad_norm": 0.3415648639202118, + "learning_rate": 0.00014023372287145242, + "loss": 0.5301, + "step": 901 + }, + { + "epoch": 0.0003166103457695619, + "grad_norm": 0.37530943751335144, + "learning_rate": 0.0001401669449081803, + "loss": 0.5409, + "step": 902 + }, + { + "epoch": 0.000316961355022078, + "grad_norm": 0.37487658858299255, + "learning_rate": 0.0001401001669449082, + "loss": 0.5976, + "step": 903 + }, + { + "epoch": 0.0003173123642745942, + "grad_norm": 0.37174728512763977, + "learning_rate": 0.00014003338898163606, + "loss": 0.5933, + "step": 904 + }, + { + "epoch": 0.0003176633735271103, + "grad_norm": 0.491584450006485, + "learning_rate": 0.00013996661101836394, + "loss": 0.5112, + "step": 905 + }, + { + "epoch": 0.0003180143827796265, + "grad_norm": 0.38381487131118774, + "learning_rate": 0.0001398998330550918, + "loss": 0.6486, + "step": 906 + }, + { + "epoch": 0.0003183653920321426, + "grad_norm": 0.2867659330368042, + "learning_rate": 0.0001398330550918197, + "loss": 0.5033, + "step": 907 + }, + { + "epoch": 0.00031871640128465874, + "grad_norm": 0.3146355450153351, + "learning_rate": 0.00013976627712854758, + "loss": 0.5878, + "step": 908 + }, + { + "epoch": 0.0003190674105371749, + "grad_norm": 0.3454856276512146, + "learning_rate": 0.00013969949916527548, + "loss": 0.4751, + "step": 909 + }, + { + "epoch": 0.00031941841978969104, + "grad_norm": 0.32241204380989075, + "learning_rate": 0.00013963272120200336, + "loss": 0.6378, + "step": 910 + }, + { + "epoch": 0.0003197694290422072, + "grad_norm": 0.33703315258026123, + "learning_rate": 0.00013956594323873123, + "loss": 0.4634, + "step": 911 + }, + { + "epoch": 0.00032012043829472334, + "grad_norm": 0.3781648576259613, + "learning_rate": 0.0001394991652754591, + "loss": 0.5218, + "step": 912 + }, + { + "epoch": 0.00032047144754723946, + "grad_norm": 0.4124391973018646, + "learning_rate": 0.00013943238731218698, + "loss": 0.4958, + "step": 913 + }, + { + "epoch": 0.00032082245679975564, + "grad_norm": 0.3970220685005188, + "learning_rate": 0.00013936560934891488, + "loss": 0.5624, + "step": 914 + }, + { + "epoch": 0.00032117346605227176, + "grad_norm": 0.43682703375816345, + "learning_rate": 0.00013929883138564275, + "loss": 0.544, + "step": 915 + }, + { + "epoch": 0.00032152447530478794, + "grad_norm": 0.3476586639881134, + "learning_rate": 0.00013923205342237062, + "loss": 0.4418, + "step": 916 + }, + { + "epoch": 0.00032187548455730406, + "grad_norm": 0.36963552236557007, + "learning_rate": 0.0001391652754590985, + "loss": 0.5946, + "step": 917 + }, + { + "epoch": 0.0003222264938098202, + "grad_norm": 0.3445582985877991, + "learning_rate": 0.00013909849749582637, + "loss": 0.5879, + "step": 918 + }, + { + "epoch": 0.00032257750306233636, + "grad_norm": 0.39813530445098877, + "learning_rate": 0.00013903171953255427, + "loss": 0.5759, + "step": 919 + }, + { + "epoch": 0.0003229285123148525, + "grad_norm": 0.3314265012741089, + "learning_rate": 0.00013896494156928214, + "loss": 0.6165, + "step": 920 + }, + { + "epoch": 0.00032327952156736866, + "grad_norm": 0.4094330072402954, + "learning_rate": 0.00013889816360601002, + "loss": 0.5787, + "step": 921 + }, + { + "epoch": 0.0003236305308198848, + "grad_norm": 0.36821484565734863, + "learning_rate": 0.0001388313856427379, + "loss": 0.5303, + "step": 922 + }, + { + "epoch": 0.0003239815400724009, + "grad_norm": 0.3517453968524933, + "learning_rate": 0.00013876460767946576, + "loss": 0.4586, + "step": 923 + }, + { + "epoch": 0.0003243325493249171, + "grad_norm": 0.2959018647670746, + "learning_rate": 0.00013869782971619366, + "loss": 0.5225, + "step": 924 + }, + { + "epoch": 0.0003246835585774332, + "grad_norm": 0.3286895751953125, + "learning_rate": 0.00013863105175292154, + "loss": 0.5353, + "step": 925 + }, + { + "epoch": 0.0003250345678299494, + "grad_norm": 0.3328275680541992, + "learning_rate": 0.00013856427378964944, + "loss": 0.5915, + "step": 926 + }, + { + "epoch": 0.0003253855770824655, + "grad_norm": 0.3400813937187195, + "learning_rate": 0.0001384974958263773, + "loss": 0.4598, + "step": 927 + }, + { + "epoch": 0.0003257365863349816, + "grad_norm": 0.2876541018486023, + "learning_rate": 0.00013843071786310518, + "loss": 0.4835, + "step": 928 + }, + { + "epoch": 0.0003260875955874978, + "grad_norm": 0.3401765525341034, + "learning_rate": 0.00013836393989983308, + "loss": 0.56, + "step": 929 + }, + { + "epoch": 0.0003264386048400139, + "grad_norm": 0.34506598114967346, + "learning_rate": 0.00013829716193656096, + "loss": 0.6234, + "step": 930 + }, + { + "epoch": 0.0003267896140925301, + "grad_norm": 0.33732855319976807, + "learning_rate": 0.00013823038397328883, + "loss": 0.5686, + "step": 931 + }, + { + "epoch": 0.0003271406233450462, + "grad_norm": 0.34300100803375244, + "learning_rate": 0.0001381636060100167, + "loss": 0.6091, + "step": 932 + }, + { + "epoch": 0.00032749163259756235, + "grad_norm": 0.30349200963974, + "learning_rate": 0.00013809682804674458, + "loss": 0.4836, + "step": 933 + }, + { + "epoch": 0.0003278426418500785, + "grad_norm": 0.35742175579071045, + "learning_rate": 0.00013803005008347245, + "loss": 0.6443, + "step": 934 + }, + { + "epoch": 0.00032819365110259464, + "grad_norm": 0.33582496643066406, + "learning_rate": 0.00013796327212020035, + "loss": 0.6361, + "step": 935 + }, + { + "epoch": 0.0003285446603551108, + "grad_norm": 0.33403804898262024, + "learning_rate": 0.00013789649415692822, + "loss": 0.5911, + "step": 936 + }, + { + "epoch": 0.00032889566960762694, + "grad_norm": 0.4263191521167755, + "learning_rate": 0.0001378297161936561, + "loss": 0.5243, + "step": 937 + }, + { + "epoch": 0.00032924667886014307, + "grad_norm": 0.31543296575546265, + "learning_rate": 0.00013776293823038397, + "loss": 0.554, + "step": 938 + }, + { + "epoch": 0.00032959768811265924, + "grad_norm": 0.38975203037261963, + "learning_rate": 0.00013769616026711184, + "loss": 0.5358, + "step": 939 + }, + { + "epoch": 0.00032994869736517536, + "grad_norm": 0.3175157904624939, + "learning_rate": 0.00013762938230383974, + "loss": 0.5385, + "step": 940 + }, + { + "epoch": 0.00033029970661769154, + "grad_norm": 0.32753151655197144, + "learning_rate": 0.00013756260434056762, + "loss": 0.5191, + "step": 941 + }, + { + "epoch": 0.00033065071587020766, + "grad_norm": 0.2516227066516876, + "learning_rate": 0.0001374958263772955, + "loss": 0.3496, + "step": 942 + }, + { + "epoch": 0.0003310017251227238, + "grad_norm": 0.275806188583374, + "learning_rate": 0.0001374290484140234, + "loss": 0.4197, + "step": 943 + }, + { + "epoch": 0.00033135273437523996, + "grad_norm": 0.30234864354133606, + "learning_rate": 0.00013736227045075126, + "loss": 0.4909, + "step": 944 + }, + { + "epoch": 0.0003317037436277561, + "grad_norm": 0.32561683654785156, + "learning_rate": 0.00013729549248747916, + "loss": 0.5865, + "step": 945 + }, + { + "epoch": 0.00033205475288027226, + "grad_norm": 0.32075145840644836, + "learning_rate": 0.00013722871452420704, + "loss": 0.5957, + "step": 946 + }, + { + "epoch": 0.0003324057621327884, + "grad_norm": 0.3077705204486847, + "learning_rate": 0.0001371619365609349, + "loss": 0.6026, + "step": 947 + }, + { + "epoch": 0.0003327567713853045, + "grad_norm": 0.3092177212238312, + "learning_rate": 0.00013709515859766278, + "loss": 0.553, + "step": 948 + }, + { + "epoch": 0.0003331077806378207, + "grad_norm": 0.3611501157283783, + "learning_rate": 0.00013702838063439065, + "loss": 0.5707, + "step": 949 + }, + { + "epoch": 0.0003334587898903368, + "grad_norm": 0.3343827724456787, + "learning_rate": 0.00013696160267111853, + "loss": 0.5626, + "step": 950 + }, + { + "epoch": 0.000333809799142853, + "grad_norm": 0.3330281376838684, + "learning_rate": 0.00013689482470784643, + "loss": 0.6353, + "step": 951 + }, + { + "epoch": 0.0003341608083953691, + "grad_norm": 0.4045816957950592, + "learning_rate": 0.0001368280467445743, + "loss": 0.5781, + "step": 952 + }, + { + "epoch": 0.0003345118176478852, + "grad_norm": 0.3618166446685791, + "learning_rate": 0.00013676126878130217, + "loss": 0.6702, + "step": 953 + }, + { + "epoch": 0.0003348628269004014, + "grad_norm": 0.2836553752422333, + "learning_rate": 0.00013669449081803005, + "loss": 0.4371, + "step": 954 + }, + { + "epoch": 0.0003352138361529175, + "grad_norm": 0.3100498914718628, + "learning_rate": 0.00013662771285475792, + "loss": 0.5184, + "step": 955 + }, + { + "epoch": 0.0003355648454054337, + "grad_norm": 0.34877723455429077, + "learning_rate": 0.00013656093489148582, + "loss": 0.4778, + "step": 956 + }, + { + "epoch": 0.0003359158546579498, + "grad_norm": 0.27756938338279724, + "learning_rate": 0.0001364941569282137, + "loss": 0.4314, + "step": 957 + }, + { + "epoch": 0.00033626686391046595, + "grad_norm": 0.36129051446914673, + "learning_rate": 0.00013642737896494157, + "loss": 0.5837, + "step": 958 + }, + { + "epoch": 0.0003366178731629821, + "grad_norm": 0.35625776648521423, + "learning_rate": 0.00013636060100166944, + "loss": 0.5579, + "step": 959 + }, + { + "epoch": 0.00033696888241549825, + "grad_norm": 0.3735104501247406, + "learning_rate": 0.00013629382303839734, + "loss": 0.5283, + "step": 960 + }, + { + "epoch": 0.0003373198916680144, + "grad_norm": 0.34185606241226196, + "learning_rate": 0.00013622704507512521, + "loss": 0.5669, + "step": 961 + }, + { + "epoch": 0.00033767090092053054, + "grad_norm": 0.29324260354042053, + "learning_rate": 0.00013616026711185311, + "loss": 0.4468, + "step": 962 + }, + { + "epoch": 0.00033802191017304667, + "grad_norm": 0.3439052700996399, + "learning_rate": 0.000136093489148581, + "loss": 0.5196, + "step": 963 + }, + { + "epoch": 0.00033837291942556284, + "grad_norm": 0.3536570370197296, + "learning_rate": 0.00013602671118530886, + "loss": 0.5251, + "step": 964 + }, + { + "epoch": 0.00033872392867807897, + "grad_norm": 0.4759911298751831, + "learning_rate": 0.00013595993322203673, + "loss": 0.7017, + "step": 965 + }, + { + "epoch": 0.00033907493793059514, + "grad_norm": 0.2958674728870392, + "learning_rate": 0.0001358931552587646, + "loss": 0.4936, + "step": 966 + }, + { + "epoch": 0.00033942594718311126, + "grad_norm": 0.32770562171936035, + "learning_rate": 0.0001358263772954925, + "loss": 0.5741, + "step": 967 + }, + { + "epoch": 0.0003397769564356274, + "grad_norm": 0.35697153210639954, + "learning_rate": 0.00013575959933222038, + "loss": 0.428, + "step": 968 + }, + { + "epoch": 0.00034012796568814356, + "grad_norm": 0.3409043252468109, + "learning_rate": 0.00013569282136894825, + "loss": 0.6142, + "step": 969 + }, + { + "epoch": 0.0003404789749406597, + "grad_norm": 0.47055551409721375, + "learning_rate": 0.00013562604340567613, + "loss": 0.463, + "step": 970 + }, + { + "epoch": 0.00034082998419317586, + "grad_norm": 0.38270413875579834, + "learning_rate": 0.000135559265442404, + "loss": 0.462, + "step": 971 + }, + { + "epoch": 0.000341180993445692, + "grad_norm": 0.26209867000579834, + "learning_rate": 0.0001354924874791319, + "loss": 0.5341, + "step": 972 + }, + { + "epoch": 0.0003415320026982081, + "grad_norm": 0.37498748302459717, + "learning_rate": 0.00013542570951585977, + "loss": 0.5196, + "step": 973 + }, + { + "epoch": 0.0003418830119507243, + "grad_norm": 0.36789608001708984, + "learning_rate": 0.00013535893155258765, + "loss": 0.4723, + "step": 974 + }, + { + "epoch": 0.0003422340212032404, + "grad_norm": 0.33915975689888, + "learning_rate": 0.00013529215358931552, + "loss": 0.5511, + "step": 975 + }, + { + "epoch": 0.0003425850304557566, + "grad_norm": 0.43045058846473694, + "learning_rate": 0.0001352253756260434, + "loss": 0.5667, + "step": 976 + }, + { + "epoch": 0.0003429360397082727, + "grad_norm": 0.2948949933052063, + "learning_rate": 0.0001351585976627713, + "loss": 0.4804, + "step": 977 + }, + { + "epoch": 0.00034328704896078883, + "grad_norm": 0.3249470889568329, + "learning_rate": 0.00013509181969949917, + "loss": 0.6041, + "step": 978 + }, + { + "epoch": 0.000343638058213305, + "grad_norm": 0.2865908741950989, + "learning_rate": 0.00013502504173622707, + "loss": 0.5617, + "step": 979 + }, + { + "epoch": 0.0003439890674658211, + "grad_norm": 0.3190818428993225, + "learning_rate": 0.00013495826377295494, + "loss": 0.4902, + "step": 980 + }, + { + "epoch": 0.00034434007671833725, + "grad_norm": 0.3111664950847626, + "learning_rate": 0.0001348914858096828, + "loss": 0.5504, + "step": 981 + }, + { + "epoch": 0.0003446910859708534, + "grad_norm": 0.3255857229232788, + "learning_rate": 0.00013482470784641069, + "loss": 0.5592, + "step": 982 + }, + { + "epoch": 0.00034504209522336955, + "grad_norm": 0.30806589126586914, + "learning_rate": 0.00013475792988313859, + "loss": 0.5567, + "step": 983 + }, + { + "epoch": 0.0003453931044758857, + "grad_norm": 0.33785945177078247, + "learning_rate": 0.00013469115191986646, + "loss": 0.5881, + "step": 984 + }, + { + "epoch": 0.00034574411372840185, + "grad_norm": 0.34626781940460205, + "learning_rate": 0.00013462437395659433, + "loss": 0.578, + "step": 985 + }, + { + "epoch": 0.00034609512298091797, + "grad_norm": 0.367034912109375, + "learning_rate": 0.0001345575959933222, + "loss": 0.5893, + "step": 986 + }, + { + "epoch": 0.00034644613223343415, + "grad_norm": 0.37824952602386475, + "learning_rate": 0.00013449081803005008, + "loss": 0.5681, + "step": 987 + }, + { + "epoch": 0.00034679714148595027, + "grad_norm": 0.4054035544395447, + "learning_rate": 0.00013442404006677798, + "loss": 0.6108, + "step": 988 + }, + { + "epoch": 0.00034714815073846645, + "grad_norm": 0.4374067485332489, + "learning_rate": 0.00013435726210350585, + "loss": 0.6002, + "step": 989 + }, + { + "epoch": 0.00034749915999098257, + "grad_norm": 0.3554278016090393, + "learning_rate": 0.00013429048414023373, + "loss": 0.6444, + "step": 990 + }, + { + "epoch": 0.0003478501692434987, + "grad_norm": 0.3428646922111511, + "learning_rate": 0.0001342237061769616, + "loss": 0.6527, + "step": 991 + }, + { + "epoch": 0.00034820117849601487, + "grad_norm": 0.25603657960891724, + "learning_rate": 0.00013415692821368947, + "loss": 0.5244, + "step": 992 + }, + { + "epoch": 0.000348552187748531, + "grad_norm": 0.35237595438957214, + "learning_rate": 0.00013409015025041737, + "loss": 0.557, + "step": 993 + }, + { + "epoch": 0.00034890319700104717, + "grad_norm": 0.33666110038757324, + "learning_rate": 0.00013402337228714524, + "loss": 0.5674, + "step": 994 + }, + { + "epoch": 0.0003492542062535633, + "grad_norm": 0.30283182859420776, + "learning_rate": 0.00013395659432387312, + "loss": 0.6081, + "step": 995 + }, + { + "epoch": 0.0003496052155060794, + "grad_norm": 0.30893146991729736, + "learning_rate": 0.00013388981636060102, + "loss": 0.6089, + "step": 996 + }, + { + "epoch": 0.0003499562247585956, + "grad_norm": 0.2617473304271698, + "learning_rate": 0.0001338230383973289, + "loss": 0.6104, + "step": 997 + }, + { + "epoch": 0.0003503072340111117, + "grad_norm": 0.29493093490600586, + "learning_rate": 0.00013375626043405676, + "loss": 0.5047, + "step": 998 + }, + { + "epoch": 0.0003506582432636279, + "grad_norm": 0.3991663157939911, + "learning_rate": 0.00013368948247078466, + "loss": 0.5137, + "step": 999 + }, + { + "epoch": 0.000351009252516144, + "grad_norm": 0.31760329008102417, + "learning_rate": 0.00013362270450751254, + "loss": 0.4371, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 3000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.846828653872742e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/marques/outputs/checkpoint-1000/training_args.bin b/marques/outputs/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd0ba520c124bb1ece608079704fa15e0236be45 --- /dev/null +++ b/marques/outputs/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09362706a3d58d219e41be1682b770b8f5069fcd630f7dbcadb71e4d4ce8859b +size 6289 diff --git a/marques/outputs/checkpoint-1500/README.md b/marques/outputs/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d90a96dfe2e51221657a6e936d376789e21081f9 --- /dev/null +++ b/marques/outputs/checkpoint-1500/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/marques/outputs/checkpoint-1500/adapter_config.json b/marques/outputs/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9930a191a30254256c9550b1bdffa58b8d7aee8 --- /dev/null +++ b/marques/outputs/checkpoint-1500/adapter_config.json @@ -0,0 +1,50 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "LlamaForCausalLM", + "parent_library": "transformers.models.llama.modeling_llama", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "gate_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/marques/outputs/checkpoint-1500/adapter_model.safetensors b/marques/outputs/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5321962dcd075a66b4317a20c0cf91e9b456f22d --- /dev/null +++ b/marques/outputs/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9e74c98f93237846393a86d59258dcd983e11759bd05132985010ffcd7210e8 +size 167832240 diff --git a/marques/outputs/checkpoint-1500/optimizer.pt b/marques/outputs/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f13aa76a69b4a672b777087558c5e7c9715f4ed3 --- /dev/null +++ b/marques/outputs/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b160483fdd4cf6e658e144d1f3d0fed2f8ae6fcd199c7561afa8e2482a41e59a +size 85724133 diff --git a/marques/outputs/checkpoint-1500/rng_state.pth b/marques/outputs/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0 --- /dev/null +++ b/marques/outputs/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33 +size 14645 diff --git a/marques/outputs/checkpoint-1500/scheduler.pt b/marques/outputs/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..176fcdbb9d9a8f1ceea12d3eee370cee58210ca9 --- /dev/null +++ b/marques/outputs/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8df509c45cdfe65ace9475affb106a13e2ffaf4925922c20c73fd78a1c86166 +size 1465 diff --git a/marques/outputs/checkpoint-1500/special_tokens_map.json b/marques/outputs/checkpoint-1500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68b10c7f0a479eae0c358eac6a14959b3f9acdf1 --- /dev/null +++ b/marques/outputs/checkpoint-1500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/marques/outputs/checkpoint-1500/tokenizer.json b/marques/outputs/checkpoint-1500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/marques/outputs/checkpoint-1500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/marques/outputs/checkpoint-1500/tokenizer_config.json b/marques/outputs/checkpoint-1500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..92b1d94e894e5474ebea1d171e14751be79ca3e5 --- /dev/null +++ b/marques/outputs/checkpoint-1500/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/marques/outputs/checkpoint-1500/trainer_state.json b/marques/outputs/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..010eee32aeabae760c81b12d2ac28998c6b1efd8 --- /dev/null +++ b/marques/outputs/checkpoint-1500/trainer_state.json @@ -0,0 +1,10534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.000526513878774216, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.5100925251614403e-07, + "grad_norm": 0.53782719373703, + "learning_rate": 0.0, + "loss": 0.5835, + "step": 1 + }, + { + "epoch": 7.020185050322881e-07, + "grad_norm": 0.6201626062393188, + "learning_rate": 4e-05, + "loss": 0.5242, + "step": 2 + }, + { + "epoch": 1.053027757548432e-06, + "grad_norm": 0.7571901082992554, + "learning_rate": 8e-05, + "loss": 0.5642, + "step": 3 + }, + { + "epoch": 1.4040370100645761e-06, + "grad_norm": 0.5588695406913757, + "learning_rate": 0.00012, + "loss": 0.4859, + "step": 4 + }, + { + "epoch": 1.75504626258072e-06, + "grad_norm": 0.7208331227302551, + "learning_rate": 0.00016, + "loss": 0.4645, + "step": 5 + }, + { + "epoch": 2.106055515096864e-06, + "grad_norm": 0.8169743418693542, + "learning_rate": 0.0002, + "loss": 0.3702, + "step": 6 + }, + { + "epoch": 2.4570647676130083e-06, + "grad_norm": 2.051530599594116, + "learning_rate": 0.00019993322203672788, + "loss": 0.4856, + "step": 7 + }, + { + "epoch": 2.8080740201291522e-06, + "grad_norm": 1.2310550212860107, + "learning_rate": 0.00019986644407345576, + "loss": 0.5192, + "step": 8 + }, + { + "epoch": 3.1590832726452962e-06, + "grad_norm": 1.612046241760254, + "learning_rate": 0.00019979966611018366, + "loss": 0.4719, + "step": 9 + }, + { + "epoch": 3.51009252516144e-06, + "grad_norm": 1.4484680891036987, + "learning_rate": 0.00019973288814691153, + "loss": 0.4416, + "step": 10 + }, + { + "epoch": 3.861101777677584e-06, + "grad_norm": 1.4529719352722168, + "learning_rate": 0.0001996661101836394, + "loss": 0.6275, + "step": 11 + }, + { + "epoch": 4.212111030193728e-06, + "grad_norm": 1.3963671922683716, + "learning_rate": 0.00019959933222036728, + "loss": 0.5874, + "step": 12 + }, + { + "epoch": 4.563120282709872e-06, + "grad_norm": 1.4744153022766113, + "learning_rate": 0.00019953255425709515, + "loss": 0.6422, + "step": 13 + }, + { + "epoch": 4.9141295352260165e-06, + "grad_norm": 0.8640050888061523, + "learning_rate": 0.00019946577629382305, + "loss": 0.5064, + "step": 14 + }, + { + "epoch": 5.26513878774216e-06, + "grad_norm": 0.7137419581413269, + "learning_rate": 0.00019939899833055092, + "loss": 0.5218, + "step": 15 + }, + { + "epoch": 5.6161480402583045e-06, + "grad_norm": 0.7769026756286621, + "learning_rate": 0.00019933222036727882, + "loss": 0.5377, + "step": 16 + }, + { + "epoch": 5.967157292774448e-06, + "grad_norm": 0.7558479905128479, + "learning_rate": 0.0001992654424040067, + "loss": 0.5054, + "step": 17 + }, + { + "epoch": 6.3181665452905924e-06, + "grad_norm": 0.8237054347991943, + "learning_rate": 0.00019919866444073457, + "loss": 0.5094, + "step": 18 + }, + { + "epoch": 6.669175797806736e-06, + "grad_norm": 1.0375059843063354, + "learning_rate": 0.00019913188647746244, + "loss": 0.5751, + "step": 19 + }, + { + "epoch": 7.02018505032288e-06, + "grad_norm": 1.075869083404541, + "learning_rate": 0.00019906510851419034, + "loss": 0.594, + "step": 20 + }, + { + "epoch": 7.371194302839024e-06, + "grad_norm": 0.8041358590126038, + "learning_rate": 0.00019899833055091822, + "loss": 0.553, + "step": 21 + }, + { + "epoch": 7.722203555355168e-06, + "grad_norm": 0.9264736771583557, + "learning_rate": 0.0001989315525876461, + "loss": 0.5555, + "step": 22 + }, + { + "epoch": 8.073212807871313e-06, + "grad_norm": 1.0074031352996826, + "learning_rate": 0.00019886477462437396, + "loss": 0.5353, + "step": 23 + }, + { + "epoch": 8.424222060387455e-06, + "grad_norm": 0.8725020885467529, + "learning_rate": 0.00019879799666110183, + "loss": 0.5557, + "step": 24 + }, + { + "epoch": 8.7752313129036e-06, + "grad_norm": 0.8867582678794861, + "learning_rate": 0.00019873121869782974, + "loss": 0.5992, + "step": 25 + }, + { + "epoch": 9.126240565419744e-06, + "grad_norm": 0.9235608577728271, + "learning_rate": 0.0001986644407345576, + "loss": 0.516, + "step": 26 + }, + { + "epoch": 9.477249817935889e-06, + "grad_norm": 0.8653218150138855, + "learning_rate": 0.00019859766277128548, + "loss": 0.5249, + "step": 27 + }, + { + "epoch": 9.828259070452033e-06, + "grad_norm": 0.7479026913642883, + "learning_rate": 0.00019853088480801335, + "loss": 0.5037, + "step": 28 + }, + { + "epoch": 1.0179268322968176e-05, + "grad_norm": 0.9531452655792236, + "learning_rate": 0.00019846410684474123, + "loss": 0.5896, + "step": 29 + }, + { + "epoch": 1.053027757548432e-05, + "grad_norm": 1.1012492179870605, + "learning_rate": 0.00019839732888146913, + "loss": 0.5139, + "step": 30 + }, + { + "epoch": 1.0881286828000465e-05, + "grad_norm": 1.0198887586593628, + "learning_rate": 0.000198330550918197, + "loss": 0.5587, + "step": 31 + }, + { + "epoch": 1.1232296080516609e-05, + "grad_norm": 0.8081266283988953, + "learning_rate": 0.00019826377295492487, + "loss": 0.4762, + "step": 32 + }, + { + "epoch": 1.1583305333032752e-05, + "grad_norm": 1.1965891122817993, + "learning_rate": 0.00019819699499165277, + "loss": 0.5719, + "step": 33 + }, + { + "epoch": 1.1934314585548896e-05, + "grad_norm": 1.214903473854065, + "learning_rate": 0.00019813021702838065, + "loss": 0.5756, + "step": 34 + }, + { + "epoch": 1.228532383806504e-05, + "grad_norm": 0.8360006213188171, + "learning_rate": 0.00019806343906510852, + "loss": 0.5688, + "step": 35 + }, + { + "epoch": 1.2636333090581185e-05, + "grad_norm": 0.8328489065170288, + "learning_rate": 0.00019799666110183642, + "loss": 0.6418, + "step": 36 + }, + { + "epoch": 1.298734234309733e-05, + "grad_norm": 1.1427714824676514, + "learning_rate": 0.0001979298831385643, + "loss": 0.6531, + "step": 37 + }, + { + "epoch": 1.3338351595613472e-05, + "grad_norm": 1.0145376920700073, + "learning_rate": 0.00019786310517529217, + "loss": 0.6473, + "step": 38 + }, + { + "epoch": 1.3689360848129616e-05, + "grad_norm": 0.8427861928939819, + "learning_rate": 0.00019779632721202004, + "loss": 0.5882, + "step": 39 + }, + { + "epoch": 1.404037010064576e-05, + "grad_norm": 0.8792659044265747, + "learning_rate": 0.00019772954924874791, + "loss": 0.608, + "step": 40 + }, + { + "epoch": 1.4391379353161905e-05, + "grad_norm": 0.9338463544845581, + "learning_rate": 0.00019766277128547581, + "loss": 0.7118, + "step": 41 + }, + { + "epoch": 1.4742388605678048e-05, + "grad_norm": 0.7554420232772827, + "learning_rate": 0.0001975959933222037, + "loss": 0.5898, + "step": 42 + }, + { + "epoch": 1.5093397858194192e-05, + "grad_norm": 0.7700084447860718, + "learning_rate": 0.00019752921535893156, + "loss": 0.6466, + "step": 43 + }, + { + "epoch": 1.5444407110710337e-05, + "grad_norm": 0.8639333248138428, + "learning_rate": 0.00019746243739565943, + "loss": 0.7253, + "step": 44 + }, + { + "epoch": 1.579541636322648e-05, + "grad_norm": 0.7760612964630127, + "learning_rate": 0.0001973956594323873, + "loss": 0.7099, + "step": 45 + }, + { + "epoch": 1.6146425615742626e-05, + "grad_norm": 0.7319066524505615, + "learning_rate": 0.0001973288814691152, + "loss": 0.6664, + "step": 46 + }, + { + "epoch": 1.6497434868258768e-05, + "grad_norm": 0.7557100057601929, + "learning_rate": 0.00019726210350584308, + "loss": 0.6318, + "step": 47 + }, + { + "epoch": 1.684844412077491e-05, + "grad_norm": 0.6420389413833618, + "learning_rate": 0.00019719532554257095, + "loss": 0.6688, + "step": 48 + }, + { + "epoch": 1.7199453373291057e-05, + "grad_norm": 0.660383939743042, + "learning_rate": 0.00019712854757929883, + "loss": 0.6204, + "step": 49 + }, + { + "epoch": 1.75504626258072e-05, + "grad_norm": 0.5614909529685974, + "learning_rate": 0.00019706176961602673, + "loss": 0.664, + "step": 50 + }, + { + "epoch": 1.7901471878323346e-05, + "grad_norm": 0.502738356590271, + "learning_rate": 0.0001969949916527546, + "loss": 0.6918, + "step": 51 + }, + { + "epoch": 1.825248113083949e-05, + "grad_norm": 0.47578102350234985, + "learning_rate": 0.0001969282136894825, + "loss": 0.6747, + "step": 52 + }, + { + "epoch": 1.860349038335563e-05, + "grad_norm": 0.5528931617736816, + "learning_rate": 0.00019686143572621037, + "loss": 0.765, + "step": 53 + }, + { + "epoch": 1.8954499635871777e-05, + "grad_norm": 0.6176997423171997, + "learning_rate": 0.00019679465776293825, + "loss": 0.5959, + "step": 54 + }, + { + "epoch": 1.930550888838792e-05, + "grad_norm": 0.43425047397613525, + "learning_rate": 0.00019672787979966612, + "loss": 0.6437, + "step": 55 + }, + { + "epoch": 1.9656518140904066e-05, + "grad_norm": 0.5135884881019592, + "learning_rate": 0.000196661101836394, + "loss": 0.7019, + "step": 56 + }, + { + "epoch": 2.000752739342021e-05, + "grad_norm": 0.4628916084766388, + "learning_rate": 0.0001965943238731219, + "loss": 0.5722, + "step": 57 + }, + { + "epoch": 2.035853664593635e-05, + "grad_norm": 0.48201897740364075, + "learning_rate": 0.00019652754590984977, + "loss": 0.6288, + "step": 58 + }, + { + "epoch": 2.0709545898452498e-05, + "grad_norm": 0.5772811770439148, + "learning_rate": 0.00019646076794657764, + "loss": 0.6067, + "step": 59 + }, + { + "epoch": 2.106055515096864e-05, + "grad_norm": 0.4976802170276642, + "learning_rate": 0.0001963939899833055, + "loss": 0.4722, + "step": 60 + }, + { + "epoch": 2.1411564403484786e-05, + "grad_norm": 0.4842129051685333, + "learning_rate": 0.00019632721202003339, + "loss": 0.5876, + "step": 61 + }, + { + "epoch": 2.176257365600093e-05, + "grad_norm": 0.46149536967277527, + "learning_rate": 0.00019626043405676129, + "loss": 0.6373, + "step": 62 + }, + { + "epoch": 2.2113582908517072e-05, + "grad_norm": 0.47199445962905884, + "learning_rate": 0.00019619365609348916, + "loss": 0.5546, + "step": 63 + }, + { + "epoch": 2.2464592161033218e-05, + "grad_norm": 0.6109340190887451, + "learning_rate": 0.00019612687813021703, + "loss": 0.6069, + "step": 64 + }, + { + "epoch": 2.281560141354936e-05, + "grad_norm": 0.5529135465621948, + "learning_rate": 0.0001960601001669449, + "loss": 0.553, + "step": 65 + }, + { + "epoch": 2.3166610666065503e-05, + "grad_norm": 0.500245213508606, + "learning_rate": 0.00019599332220367278, + "loss": 0.6149, + "step": 66 + }, + { + "epoch": 2.351761991858165e-05, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00019592654424040068, + "loss": 0.6509, + "step": 67 + }, + { + "epoch": 2.3868629171097792e-05, + "grad_norm": 0.5308504104614258, + "learning_rate": 0.00019585976627712855, + "loss": 0.7017, + "step": 68 + }, + { + "epoch": 2.4219638423613938e-05, + "grad_norm": 0.5157874822616577, + "learning_rate": 0.00019579298831385645, + "loss": 0.7125, + "step": 69 + }, + { + "epoch": 2.457064767613008e-05, + "grad_norm": 0.47787800431251526, + "learning_rate": 0.00019572621035058433, + "loss": 0.5792, + "step": 70 + }, + { + "epoch": 2.4921656928646224e-05, + "grad_norm": 0.46792763471603394, + "learning_rate": 0.0001956594323873122, + "loss": 0.7, + "step": 71 + }, + { + "epoch": 2.527266618116237e-05, + "grad_norm": 0.5394675135612488, + "learning_rate": 0.00019559265442404007, + "loss": 0.5549, + "step": 72 + }, + { + "epoch": 2.5623675433678512e-05, + "grad_norm": 0.45065200328826904, + "learning_rate": 0.00019552587646076797, + "loss": 0.6663, + "step": 73 + }, + { + "epoch": 2.597468468619466e-05, + "grad_norm": 0.4026688039302826, + "learning_rate": 0.00019545909849749584, + "loss": 0.6315, + "step": 74 + }, + { + "epoch": 2.63256939387108e-05, + "grad_norm": 0.42353659868240356, + "learning_rate": 0.00019539232053422372, + "loss": 0.5419, + "step": 75 + }, + { + "epoch": 2.6676703191226944e-05, + "grad_norm": 0.45561954379081726, + "learning_rate": 0.0001953255425709516, + "loss": 0.6624, + "step": 76 + }, + { + "epoch": 2.702771244374309e-05, + "grad_norm": 0.3954075574874878, + "learning_rate": 0.00019525876460767946, + "loss": 0.5479, + "step": 77 + }, + { + "epoch": 2.7378721696259233e-05, + "grad_norm": 0.4994329512119293, + "learning_rate": 0.00019519198664440736, + "loss": 0.7224, + "step": 78 + }, + { + "epoch": 2.7729730948775375e-05, + "grad_norm": 0.41149672865867615, + "learning_rate": 0.00019512520868113524, + "loss": 0.5621, + "step": 79 + }, + { + "epoch": 2.808074020129152e-05, + "grad_norm": 0.4199008345603943, + "learning_rate": 0.0001950584307178631, + "loss": 0.7038, + "step": 80 + }, + { + "epoch": 2.8431749453807664e-05, + "grad_norm": 0.4378969371318817, + "learning_rate": 0.00019499165275459098, + "loss": 0.6654, + "step": 81 + }, + { + "epoch": 2.878275870632381e-05, + "grad_norm": 0.4653928279876709, + "learning_rate": 0.00019492487479131886, + "loss": 0.6241, + "step": 82 + }, + { + "epoch": 2.9133767958839953e-05, + "grad_norm": 0.5166454911231995, + "learning_rate": 0.00019485809682804673, + "loss": 0.5366, + "step": 83 + }, + { + "epoch": 2.9484777211356096e-05, + "grad_norm": 0.43180733919143677, + "learning_rate": 0.00019479131886477463, + "loss": 0.6178, + "step": 84 + }, + { + "epoch": 2.9835786463872242e-05, + "grad_norm": 0.44828200340270996, + "learning_rate": 0.0001947245409015025, + "loss": 0.6706, + "step": 85 + }, + { + "epoch": 3.0186795716388385e-05, + "grad_norm": 0.384175181388855, + "learning_rate": 0.0001946577629382304, + "loss": 0.5551, + "step": 86 + }, + { + "epoch": 3.053780496890453e-05, + "grad_norm": 0.4359772503376007, + "learning_rate": 0.00019459098497495828, + "loss": 0.5626, + "step": 87 + }, + { + "epoch": 3.0888814221420673e-05, + "grad_norm": 0.4177016615867615, + "learning_rate": 0.00019452420701168615, + "loss": 0.6023, + "step": 88 + }, + { + "epoch": 3.1239823473936816e-05, + "grad_norm": 0.43592438101768494, + "learning_rate": 0.00019445742904841405, + "loss": 0.682, + "step": 89 + }, + { + "epoch": 3.159083272645296e-05, + "grad_norm": 0.48027974367141724, + "learning_rate": 0.00019439065108514192, + "loss": 0.7596, + "step": 90 + }, + { + "epoch": 3.194184197896911e-05, + "grad_norm": 0.35989537835121155, + "learning_rate": 0.0001943238731218698, + "loss": 0.6018, + "step": 91 + }, + { + "epoch": 3.229285123148525e-05, + "grad_norm": 0.48477092385292053, + "learning_rate": 0.00019425709515859767, + "loss": 0.512, + "step": 92 + }, + { + "epoch": 3.2643860484001394e-05, + "grad_norm": 0.38858646154403687, + "learning_rate": 0.00019419031719532554, + "loss": 0.6371, + "step": 93 + }, + { + "epoch": 3.2994869736517536e-05, + "grad_norm": 0.5323147177696228, + "learning_rate": 0.00019412353923205344, + "loss": 0.5221, + "step": 94 + }, + { + "epoch": 3.334587898903368e-05, + "grad_norm": 0.3784274160861969, + "learning_rate": 0.00019405676126878132, + "loss": 0.6158, + "step": 95 + }, + { + "epoch": 3.369688824154982e-05, + "grad_norm": 0.4076334834098816, + "learning_rate": 0.0001939899833055092, + "loss": 0.5535, + "step": 96 + }, + { + "epoch": 3.404789749406597e-05, + "grad_norm": 0.43930479884147644, + "learning_rate": 0.00019392320534223706, + "loss": 0.6482, + "step": 97 + }, + { + "epoch": 3.4398906746582114e-05, + "grad_norm": 0.4266909658908844, + "learning_rate": 0.00019385642737896494, + "loss": 0.6, + "step": 98 + }, + { + "epoch": 3.474991599909826e-05, + "grad_norm": 0.45353513956069946, + "learning_rate": 0.0001937896494156928, + "loss": 0.6596, + "step": 99 + }, + { + "epoch": 3.51009252516144e-05, + "grad_norm": 0.3424838185310364, + "learning_rate": 0.0001937228714524207, + "loss": 0.555, + "step": 100 + }, + { + "epoch": 3.545193450413054e-05, + "grad_norm": 0.40126165747642517, + "learning_rate": 0.00019365609348914858, + "loss": 0.6921, + "step": 101 + }, + { + "epoch": 3.580294375664669e-05, + "grad_norm": 0.36572012305259705, + "learning_rate": 0.00019358931552587646, + "loss": 0.5485, + "step": 102 + }, + { + "epoch": 3.6153953009162834e-05, + "grad_norm": 0.3972407281398773, + "learning_rate": 0.00019352253756260436, + "loss": 0.5884, + "step": 103 + }, + { + "epoch": 3.650496226167898e-05, + "grad_norm": 0.3900579512119293, + "learning_rate": 0.00019345575959933223, + "loss": 0.6664, + "step": 104 + }, + { + "epoch": 3.685597151419512e-05, + "grad_norm": 0.31666621565818787, + "learning_rate": 0.00019338898163606013, + "loss": 0.5009, + "step": 105 + }, + { + "epoch": 3.720698076671126e-05, + "grad_norm": 0.5269597172737122, + "learning_rate": 0.000193322203672788, + "loss": 0.6292, + "step": 106 + }, + { + "epoch": 3.755799001922741e-05, + "grad_norm": 0.4645126163959503, + "learning_rate": 0.00019325542570951588, + "loss": 0.636, + "step": 107 + }, + { + "epoch": 3.7908999271743555e-05, + "grad_norm": 0.3900754153728485, + "learning_rate": 0.00019318864774624375, + "loss": 0.5367, + "step": 108 + }, + { + "epoch": 3.82600085242597e-05, + "grad_norm": 0.42533883452415466, + "learning_rate": 0.00019312186978297162, + "loss": 0.6862, + "step": 109 + }, + { + "epoch": 3.861101777677584e-05, + "grad_norm": 0.6809422969818115, + "learning_rate": 0.00019305509181969952, + "loss": 0.6434, + "step": 110 + }, + { + "epoch": 3.896202702929198e-05, + "grad_norm": 0.5127860307693481, + "learning_rate": 0.0001929883138564274, + "loss": 0.6266, + "step": 111 + }, + { + "epoch": 3.931303628180813e-05, + "grad_norm": 0.5254234671592712, + "learning_rate": 0.00019292153589315527, + "loss": 0.6982, + "step": 112 + }, + { + "epoch": 3.9664045534324275e-05, + "grad_norm": 0.3699031472206116, + "learning_rate": 0.00019285475792988314, + "loss": 0.6037, + "step": 113 + }, + { + "epoch": 4.001505478684042e-05, + "grad_norm": 0.3807130455970764, + "learning_rate": 0.00019278797996661101, + "loss": 0.5861, + "step": 114 + }, + { + "epoch": 4.036606403935656e-05, + "grad_norm": 0.4455645978450775, + "learning_rate": 0.0001927212020033389, + "loss": 0.5658, + "step": 115 + }, + { + "epoch": 4.07170732918727e-05, + "grad_norm": 0.3830210864543915, + "learning_rate": 0.0001926544240400668, + "loss": 0.606, + "step": 116 + }, + { + "epoch": 4.106808254438885e-05, + "grad_norm": 0.41419631242752075, + "learning_rate": 0.00019258764607679466, + "loss": 0.6095, + "step": 117 + }, + { + "epoch": 4.1419091796904995e-05, + "grad_norm": 0.3929574489593506, + "learning_rate": 0.00019252086811352253, + "loss": 0.6464, + "step": 118 + }, + { + "epoch": 4.177010104942114e-05, + "grad_norm": 0.35958629846572876, + "learning_rate": 0.0001924540901502504, + "loss": 0.5185, + "step": 119 + }, + { + "epoch": 4.212111030193728e-05, + "grad_norm": 0.3790556490421295, + "learning_rate": 0.0001923873121869783, + "loss": 0.5156, + "step": 120 + }, + { + "epoch": 4.2472119554453423e-05, + "grad_norm": 0.37452438473701477, + "learning_rate": 0.00019232053422370618, + "loss": 0.5711, + "step": 121 + }, + { + "epoch": 4.282312880696957e-05, + "grad_norm": 0.38976770639419556, + "learning_rate": 0.00019225375626043408, + "loss": 0.6075, + "step": 122 + }, + { + "epoch": 4.3174138059485716e-05, + "grad_norm": 0.4098513424396515, + "learning_rate": 0.00019218697829716195, + "loss": 0.5312, + "step": 123 + }, + { + "epoch": 4.352514731200186e-05, + "grad_norm": 0.33890047669410706, + "learning_rate": 0.00019212020033388983, + "loss": 0.4984, + "step": 124 + }, + { + "epoch": 4.3876156564518e-05, + "grad_norm": 0.49077001214027405, + "learning_rate": 0.0001920534223706177, + "loss": 0.7159, + "step": 125 + }, + { + "epoch": 4.4227165817034144e-05, + "grad_norm": 0.41653814911842346, + "learning_rate": 0.0001919866444073456, + "loss": 0.5642, + "step": 126 + }, + { + "epoch": 4.4578175069550286e-05, + "grad_norm": 0.45710283517837524, + "learning_rate": 0.00019191986644407347, + "loss": 0.6936, + "step": 127 + }, + { + "epoch": 4.4929184322066436e-05, + "grad_norm": 0.36976873874664307, + "learning_rate": 0.00019185308848080135, + "loss": 0.5407, + "step": 128 + }, + { + "epoch": 4.528019357458258e-05, + "grad_norm": 0.42852675914764404, + "learning_rate": 0.00019178631051752922, + "loss": 0.6731, + "step": 129 + }, + { + "epoch": 4.563120282709872e-05, + "grad_norm": 0.5426310300827026, + "learning_rate": 0.0001917195325542571, + "loss": 0.5775, + "step": 130 + }, + { + "epoch": 4.5982212079614864e-05, + "grad_norm": 0.38442543148994446, + "learning_rate": 0.00019165275459098497, + "loss": 0.5994, + "step": 131 + }, + { + "epoch": 4.633322133213101e-05, + "grad_norm": 0.4298035502433777, + "learning_rate": 0.00019158597662771287, + "loss": 0.5563, + "step": 132 + }, + { + "epoch": 4.6684230584647156e-05, + "grad_norm": 0.40397605299949646, + "learning_rate": 0.00019151919866444074, + "loss": 0.6924, + "step": 133 + }, + { + "epoch": 4.70352398371633e-05, + "grad_norm": 0.4338497519493103, + "learning_rate": 0.0001914524207011686, + "loss": 0.5739, + "step": 134 + }, + { + "epoch": 4.738624908967944e-05, + "grad_norm": 0.39713653922080994, + "learning_rate": 0.0001913856427378965, + "loss": 0.4529, + "step": 135 + }, + { + "epoch": 4.7737258342195584e-05, + "grad_norm": 0.31409478187561035, + "learning_rate": 0.0001913188647746244, + "loss": 0.562, + "step": 136 + }, + { + "epoch": 4.808826759471173e-05, + "grad_norm": 0.371624618768692, + "learning_rate": 0.00019125208681135226, + "loss": 0.5288, + "step": 137 + }, + { + "epoch": 4.8439276847227877e-05, + "grad_norm": 0.4600190818309784, + "learning_rate": 0.00019118530884808016, + "loss": 0.6215, + "step": 138 + }, + { + "epoch": 4.879028609974402e-05, + "grad_norm": 0.45351359248161316, + "learning_rate": 0.00019111853088480803, + "loss": 0.686, + "step": 139 + }, + { + "epoch": 4.914129535226016e-05, + "grad_norm": 0.42282962799072266, + "learning_rate": 0.0001910517529215359, + "loss": 0.5966, + "step": 140 + }, + { + "epoch": 4.9492304604776305e-05, + "grad_norm": 0.41479986906051636, + "learning_rate": 0.00019098497495826378, + "loss": 0.5948, + "step": 141 + }, + { + "epoch": 4.984331385729245e-05, + "grad_norm": 0.40453553199768066, + "learning_rate": 0.00019091819699499168, + "loss": 0.6411, + "step": 142 + }, + { + "epoch": 5.01943231098086e-05, + "grad_norm": 0.3939369320869446, + "learning_rate": 0.00019085141903171955, + "loss": 0.5513, + "step": 143 + }, + { + "epoch": 5.054533236232474e-05, + "grad_norm": 0.3700481653213501, + "learning_rate": 0.00019078464106844743, + "loss": 0.5459, + "step": 144 + }, + { + "epoch": 5.089634161484088e-05, + "grad_norm": 0.4377487897872925, + "learning_rate": 0.0001907178631051753, + "loss": 0.6076, + "step": 145 + }, + { + "epoch": 5.1247350867357025e-05, + "grad_norm": 0.37919673323631287, + "learning_rate": 0.00019065108514190317, + "loss": 0.5207, + "step": 146 + }, + { + "epoch": 5.159836011987317e-05, + "grad_norm": 0.3841630816459656, + "learning_rate": 0.00019058430717863107, + "loss": 0.614, + "step": 147 + }, + { + "epoch": 5.194936937238932e-05, + "grad_norm": 0.43541714549064636, + "learning_rate": 0.00019051752921535895, + "loss": 0.6283, + "step": 148 + }, + { + "epoch": 5.230037862490546e-05, + "grad_norm": 0.4853285253047943, + "learning_rate": 0.00019045075125208682, + "loss": 0.5807, + "step": 149 + }, + { + "epoch": 5.26513878774216e-05, + "grad_norm": 0.3572970926761627, + "learning_rate": 0.0001903839732888147, + "loss": 0.6866, + "step": 150 + }, + { + "epoch": 5.3002397129937745e-05, + "grad_norm": 0.3674347698688507, + "learning_rate": 0.00019031719532554257, + "loss": 0.5552, + "step": 151 + }, + { + "epoch": 5.335340638245389e-05, + "grad_norm": 0.37748461961746216, + "learning_rate": 0.00019025041736227044, + "loss": 0.6278, + "step": 152 + }, + { + "epoch": 5.370441563497003e-05, + "grad_norm": 0.3788503408432007, + "learning_rate": 0.00019018363939899834, + "loss": 0.622, + "step": 153 + }, + { + "epoch": 5.405542488748618e-05, + "grad_norm": 0.3736303150653839, + "learning_rate": 0.0001901168614357262, + "loss": 0.5822, + "step": 154 + }, + { + "epoch": 5.440643414000232e-05, + "grad_norm": 0.32680070400238037, + "learning_rate": 0.0001900500834724541, + "loss": 0.5715, + "step": 155 + }, + { + "epoch": 5.4757443392518466e-05, + "grad_norm": 0.34495192766189575, + "learning_rate": 0.00018998330550918199, + "loss": 0.6497, + "step": 156 + }, + { + "epoch": 5.510845264503461e-05, + "grad_norm": 0.4244193136692047, + "learning_rate": 0.00018991652754590986, + "loss": 0.5519, + "step": 157 + }, + { + "epoch": 5.545946189755075e-05, + "grad_norm": 0.4024031162261963, + "learning_rate": 0.00018984974958263776, + "loss": 0.5339, + "step": 158 + }, + { + "epoch": 5.58104711500669e-05, + "grad_norm": 0.46051299571990967, + "learning_rate": 0.00018978297161936563, + "loss": 0.5979, + "step": 159 + }, + { + "epoch": 5.616148040258304e-05, + "grad_norm": 0.49051615595817566, + "learning_rate": 0.0001897161936560935, + "loss": 0.5563, + "step": 160 + }, + { + "epoch": 5.6512489655099186e-05, + "grad_norm": 0.43045854568481445, + "learning_rate": 0.00018964941569282138, + "loss": 0.5984, + "step": 161 + }, + { + "epoch": 5.686349890761533e-05, + "grad_norm": 0.37778228521347046, + "learning_rate": 0.00018958263772954925, + "loss": 0.5955, + "step": 162 + }, + { + "epoch": 5.721450816013147e-05, + "grad_norm": 0.3736341893672943, + "learning_rate": 0.00018951585976627715, + "loss": 0.6438, + "step": 163 + }, + { + "epoch": 5.756551741264762e-05, + "grad_norm": 0.3940117061138153, + "learning_rate": 0.00018944908180300502, + "loss": 0.503, + "step": 164 + }, + { + "epoch": 5.7916526665163763e-05, + "grad_norm": 0.4193519055843353, + "learning_rate": 0.0001893823038397329, + "loss": 0.6324, + "step": 165 + }, + { + "epoch": 5.8267535917679906e-05, + "grad_norm": 0.34481996297836304, + "learning_rate": 0.00018931552587646077, + "loss": 0.5745, + "step": 166 + }, + { + "epoch": 5.861854517019605e-05, + "grad_norm": 0.38285771012306213, + "learning_rate": 0.00018924874791318864, + "loss": 0.639, + "step": 167 + }, + { + "epoch": 5.896955442271219e-05, + "grad_norm": 0.36933982372283936, + "learning_rate": 0.00018918196994991652, + "loss": 0.6681, + "step": 168 + }, + { + "epoch": 5.932056367522834e-05, + "grad_norm": 0.36970776319503784, + "learning_rate": 0.00018911519198664442, + "loss": 0.5626, + "step": 169 + }, + { + "epoch": 5.9671572927744484e-05, + "grad_norm": 0.38494783639907837, + "learning_rate": 0.0001890484140233723, + "loss": 0.6066, + "step": 170 + }, + { + "epoch": 6.0022582180260627e-05, + "grad_norm": 0.3446069061756134, + "learning_rate": 0.00018898163606010016, + "loss": 0.6354, + "step": 171 + }, + { + "epoch": 6.037359143277677e-05, + "grad_norm": 0.4466759264469147, + "learning_rate": 0.00018891485809682806, + "loss": 0.4737, + "step": 172 + }, + { + "epoch": 6.072460068529291e-05, + "grad_norm": 0.43630918860435486, + "learning_rate": 0.00018884808013355594, + "loss": 0.6839, + "step": 173 + }, + { + "epoch": 6.107560993780906e-05, + "grad_norm": 0.37083202600479126, + "learning_rate": 0.00018878130217028384, + "loss": 0.5372, + "step": 174 + }, + { + "epoch": 6.14266191903252e-05, + "grad_norm": 0.37066200375556946, + "learning_rate": 0.0001887145242070117, + "loss": 0.6653, + "step": 175 + }, + { + "epoch": 6.177762844284135e-05, + "grad_norm": 0.5191747546195984, + "learning_rate": 0.00018864774624373958, + "loss": 0.6677, + "step": 176 + }, + { + "epoch": 6.21286376953575e-05, + "grad_norm": 0.4235158860683441, + "learning_rate": 0.00018858096828046746, + "loss": 0.5971, + "step": 177 + }, + { + "epoch": 6.247964694787363e-05, + "grad_norm": 0.405074805021286, + "learning_rate": 0.00018851419031719533, + "loss": 0.5717, + "step": 178 + }, + { + "epoch": 6.283065620038978e-05, + "grad_norm": 0.45817336440086365, + "learning_rate": 0.00018844741235392323, + "loss": 0.5878, + "step": 179 + }, + { + "epoch": 6.318166545290592e-05, + "grad_norm": 0.6313037276268005, + "learning_rate": 0.0001883806343906511, + "loss": 0.62, + "step": 180 + }, + { + "epoch": 6.353267470542207e-05, + "grad_norm": 0.41896742582321167, + "learning_rate": 0.00018831385642737898, + "loss": 0.5565, + "step": 181 + }, + { + "epoch": 6.388368395793822e-05, + "grad_norm": 0.4143432676792145, + "learning_rate": 0.00018824707846410685, + "loss": 0.5552, + "step": 182 + }, + { + "epoch": 6.423469321045435e-05, + "grad_norm": 0.38745641708374023, + "learning_rate": 0.00018818030050083472, + "loss": 0.5949, + "step": 183 + }, + { + "epoch": 6.45857024629705e-05, + "grad_norm": 0.7472612261772156, + "learning_rate": 0.0001881135225375626, + "loss": 0.6708, + "step": 184 + }, + { + "epoch": 6.493671171548664e-05, + "grad_norm": 0.4416198432445526, + "learning_rate": 0.0001880467445742905, + "loss": 0.6069, + "step": 185 + }, + { + "epoch": 6.528772096800279e-05, + "grad_norm": 0.4312993884086609, + "learning_rate": 0.00018797996661101837, + "loss": 0.5778, + "step": 186 + }, + { + "epoch": 6.563873022051894e-05, + "grad_norm": 0.4524860978126526, + "learning_rate": 0.00018791318864774624, + "loss": 0.5091, + "step": 187 + }, + { + "epoch": 6.598973947303507e-05, + "grad_norm": 0.4320828914642334, + "learning_rate": 0.00018784641068447412, + "loss": 0.6557, + "step": 188 + }, + { + "epoch": 6.634074872555122e-05, + "grad_norm": 0.6967452168464661, + "learning_rate": 0.00018777963272120202, + "loss": 0.612, + "step": 189 + }, + { + "epoch": 6.669175797806736e-05, + "grad_norm": 0.4389924705028534, + "learning_rate": 0.0001877128547579299, + "loss": 0.6271, + "step": 190 + }, + { + "epoch": 6.704276723058351e-05, + "grad_norm": 0.3693922162055969, + "learning_rate": 0.0001876460767946578, + "loss": 0.6715, + "step": 191 + }, + { + "epoch": 6.739377648309964e-05, + "grad_norm": 0.32230404019355774, + "learning_rate": 0.00018757929883138566, + "loss": 0.6344, + "step": 192 + }, + { + "epoch": 6.774478573561579e-05, + "grad_norm": 0.4440002143383026, + "learning_rate": 0.00018751252086811354, + "loss": 0.6671, + "step": 193 + }, + { + "epoch": 6.809579498813194e-05, + "grad_norm": 0.5676587820053101, + "learning_rate": 0.0001874457429048414, + "loss": 0.6818, + "step": 194 + }, + { + "epoch": 6.844680424064808e-05, + "grad_norm": 0.36207348108291626, + "learning_rate": 0.0001873789649415693, + "loss": 0.5029, + "step": 195 + }, + { + "epoch": 6.879781349316423e-05, + "grad_norm": 0.35714131593704224, + "learning_rate": 0.00018731218697829718, + "loss": 0.6127, + "step": 196 + }, + { + "epoch": 6.914882274568036e-05, + "grad_norm": 0.4285273551940918, + "learning_rate": 0.00018724540901502506, + "loss": 0.6355, + "step": 197 + }, + { + "epoch": 6.949983199819651e-05, + "grad_norm": 0.42585939168930054, + "learning_rate": 0.00018717863105175293, + "loss": 0.6302, + "step": 198 + }, + { + "epoch": 6.985084125071266e-05, + "grad_norm": 0.524303138256073, + "learning_rate": 0.0001871118530884808, + "loss": 0.6683, + "step": 199 + }, + { + "epoch": 7.02018505032288e-05, + "grad_norm": 0.39635923504829407, + "learning_rate": 0.00018704507512520868, + "loss": 0.6694, + "step": 200 + }, + { + "epoch": 7.055285975574495e-05, + "grad_norm": 0.39712437987327576, + "learning_rate": 0.00018697829716193658, + "loss": 0.5794, + "step": 201 + }, + { + "epoch": 7.090386900826108e-05, + "grad_norm": 0.4115397334098816, + "learning_rate": 0.00018691151919866445, + "loss": 0.5579, + "step": 202 + }, + { + "epoch": 7.125487826077723e-05, + "grad_norm": 0.4776385724544525, + "learning_rate": 0.00018684474123539232, + "loss": 0.5589, + "step": 203 + }, + { + "epoch": 7.160588751329338e-05, + "grad_norm": 0.35574638843536377, + "learning_rate": 0.0001867779632721202, + "loss": 0.5311, + "step": 204 + }, + { + "epoch": 7.195689676580952e-05, + "grad_norm": 0.44872432947158813, + "learning_rate": 0.00018671118530884807, + "loss": 0.635, + "step": 205 + }, + { + "epoch": 7.230790601832567e-05, + "grad_norm": 0.3511079251766205, + "learning_rate": 0.00018664440734557597, + "loss": 0.5317, + "step": 206 + }, + { + "epoch": 7.26589152708418e-05, + "grad_norm": 0.39862194657325745, + "learning_rate": 0.00018657762938230384, + "loss": 0.6653, + "step": 207 + }, + { + "epoch": 7.300992452335795e-05, + "grad_norm": 0.4046575725078583, + "learning_rate": 0.00018651085141903174, + "loss": 0.6065, + "step": 208 + }, + { + "epoch": 7.33609337758741e-05, + "grad_norm": 0.4231868088245392, + "learning_rate": 0.00018644407345575962, + "loss": 0.7078, + "step": 209 + }, + { + "epoch": 7.371194302839024e-05, + "grad_norm": 0.364700049161911, + "learning_rate": 0.0001863772954924875, + "loss": 0.6309, + "step": 210 + }, + { + "epoch": 7.406295228090639e-05, + "grad_norm": 0.5385531187057495, + "learning_rate": 0.0001863105175292154, + "loss": 0.4233, + "step": 211 + }, + { + "epoch": 7.441396153342252e-05, + "grad_norm": 0.39415115118026733, + "learning_rate": 0.00018624373956594326, + "loss": 0.5928, + "step": 212 + }, + { + "epoch": 7.476497078593867e-05, + "grad_norm": 0.6021363735198975, + "learning_rate": 0.00018617696160267113, + "loss": 0.6611, + "step": 213 + }, + { + "epoch": 7.511598003845482e-05, + "grad_norm": 0.3709903061389923, + "learning_rate": 0.000186110183639399, + "loss": 0.6136, + "step": 214 + }, + { + "epoch": 7.546698929097096e-05, + "grad_norm": 0.36710435152053833, + "learning_rate": 0.00018604340567612688, + "loss": 0.5267, + "step": 215 + }, + { + "epoch": 7.581799854348711e-05, + "grad_norm": 0.4379352033138275, + "learning_rate": 0.00018597662771285475, + "loss": 0.6429, + "step": 216 + }, + { + "epoch": 7.616900779600325e-05, + "grad_norm": 0.3408482074737549, + "learning_rate": 0.00018590984974958265, + "loss": 0.5379, + "step": 217 + }, + { + "epoch": 7.65200170485194e-05, + "grad_norm": 0.4487043023109436, + "learning_rate": 0.00018584307178631053, + "loss": 0.6582, + "step": 218 + }, + { + "epoch": 7.687102630103554e-05, + "grad_norm": 0.42003679275512695, + "learning_rate": 0.0001857762938230384, + "loss": 0.5712, + "step": 219 + }, + { + "epoch": 7.722203555355168e-05, + "grad_norm": 0.4698665738105774, + "learning_rate": 0.00018570951585976627, + "loss": 0.5715, + "step": 220 + }, + { + "epoch": 7.757304480606783e-05, + "grad_norm": 0.3777780830860138, + "learning_rate": 0.00018564273789649415, + "loss": 0.4667, + "step": 221 + }, + { + "epoch": 7.792405405858397e-05, + "grad_norm": 0.36794212460517883, + "learning_rate": 0.00018557595993322205, + "loss": 0.5382, + "step": 222 + }, + { + "epoch": 7.827506331110012e-05, + "grad_norm": 0.4582989513874054, + "learning_rate": 0.00018550918196994992, + "loss": 0.6437, + "step": 223 + }, + { + "epoch": 7.862607256361626e-05, + "grad_norm": 0.4065852761268616, + "learning_rate": 0.0001854424040066778, + "loss": 0.6928, + "step": 224 + }, + { + "epoch": 7.89770818161324e-05, + "grad_norm": 0.3857649564743042, + "learning_rate": 0.0001853756260434057, + "loss": 0.5405, + "step": 225 + }, + { + "epoch": 7.932809106864855e-05, + "grad_norm": 0.40056589245796204, + "learning_rate": 0.00018530884808013357, + "loss": 0.6425, + "step": 226 + }, + { + "epoch": 7.967910032116469e-05, + "grad_norm": 0.43137016892433167, + "learning_rate": 0.00018524207011686147, + "loss": 0.5001, + "step": 227 + }, + { + "epoch": 8.003010957368084e-05, + "grad_norm": 0.3723987340927124, + "learning_rate": 0.00018517529215358934, + "loss": 0.5118, + "step": 228 + }, + { + "epoch": 8.038111882619698e-05, + "grad_norm": 0.34196361899375916, + "learning_rate": 0.00018510851419031721, + "loss": 0.5468, + "step": 229 + }, + { + "epoch": 8.073212807871312e-05, + "grad_norm": 0.4319117069244385, + "learning_rate": 0.0001850417362270451, + "loss": 0.5703, + "step": 230 + }, + { + "epoch": 8.108313733122927e-05, + "grad_norm": 0.4467247724533081, + "learning_rate": 0.00018497495826377296, + "loss": 0.6536, + "step": 231 + }, + { + "epoch": 8.14341465837454e-05, + "grad_norm": 0.3569909632205963, + "learning_rate": 0.00018490818030050083, + "loss": 0.5335, + "step": 232 + }, + { + "epoch": 8.178515583626156e-05, + "grad_norm": 0.33486437797546387, + "learning_rate": 0.00018484140233722873, + "loss": 0.6803, + "step": 233 + }, + { + "epoch": 8.21361650887777e-05, + "grad_norm": 0.3783140480518341, + "learning_rate": 0.0001847746243739566, + "loss": 0.6361, + "step": 234 + }, + { + "epoch": 8.248717434129384e-05, + "grad_norm": 0.4844662547111511, + "learning_rate": 0.00018470784641068448, + "loss": 0.5322, + "step": 235 + }, + { + "epoch": 8.283818359380999e-05, + "grad_norm": 0.508406400680542, + "learning_rate": 0.00018464106844741235, + "loss": 0.6676, + "step": 236 + }, + { + "epoch": 8.318919284632613e-05, + "grad_norm": 0.3710225820541382, + "learning_rate": 0.00018457429048414023, + "loss": 0.6656, + "step": 237 + }, + { + "epoch": 8.354020209884228e-05, + "grad_norm": 0.3757292628288269, + "learning_rate": 0.00018450751252086813, + "loss": 0.6095, + "step": 238 + }, + { + "epoch": 8.389121135135843e-05, + "grad_norm": 0.40651261806488037, + "learning_rate": 0.000184440734557596, + "loss": 0.6626, + "step": 239 + }, + { + "epoch": 8.424222060387456e-05, + "grad_norm": 0.40700778365135193, + "learning_rate": 0.00018437395659432387, + "loss": 0.5328, + "step": 240 + }, + { + "epoch": 8.459322985639071e-05, + "grad_norm": 0.5067440867424011, + "learning_rate": 0.00018430717863105175, + "loss": 0.4811, + "step": 241 + }, + { + "epoch": 8.494423910890685e-05, + "grad_norm": 0.3934602737426758, + "learning_rate": 0.00018424040066777965, + "loss": 0.5691, + "step": 242 + }, + { + "epoch": 8.5295248361423e-05, + "grad_norm": 0.3360019624233246, + "learning_rate": 0.00018417362270450752, + "loss": 0.5542, + "step": 243 + }, + { + "epoch": 8.564625761393915e-05, + "grad_norm": 0.4023631513118744, + "learning_rate": 0.00018410684474123542, + "loss": 0.5192, + "step": 244 + }, + { + "epoch": 8.599726686645528e-05, + "grad_norm": 0.41704171895980835, + "learning_rate": 0.0001840400667779633, + "loss": 0.5018, + "step": 245 + }, + { + "epoch": 8.634827611897143e-05, + "grad_norm": 0.361977756023407, + "learning_rate": 0.00018397328881469117, + "loss": 0.6193, + "step": 246 + }, + { + "epoch": 8.669928537148757e-05, + "grad_norm": 0.37774717807769775, + "learning_rate": 0.00018390651085141904, + "loss": 0.5552, + "step": 247 + }, + { + "epoch": 8.705029462400372e-05, + "grad_norm": 0.3408471941947937, + "learning_rate": 0.0001838397328881469, + "loss": 0.5876, + "step": 248 + }, + { + "epoch": 8.740130387651985e-05, + "grad_norm": 0.3892226815223694, + "learning_rate": 0.0001837729549248748, + "loss": 0.4227, + "step": 249 + }, + { + "epoch": 8.7752313129036e-05, + "grad_norm": 0.5315036177635193, + "learning_rate": 0.00018370617696160269, + "loss": 0.5826, + "step": 250 + }, + { + "epoch": 8.810332238155215e-05, + "grad_norm": 0.35433024168014526, + "learning_rate": 0.00018363939899833056, + "loss": 0.5992, + "step": 251 + }, + { + "epoch": 8.845433163406829e-05, + "grad_norm": 0.34777382016181946, + "learning_rate": 0.00018357262103505843, + "loss": 0.4973, + "step": 252 + }, + { + "epoch": 8.880534088658444e-05, + "grad_norm": 0.3936387002468109, + "learning_rate": 0.0001835058430717863, + "loss": 0.6254, + "step": 253 + }, + { + "epoch": 8.915635013910057e-05, + "grad_norm": 0.4009217917919159, + "learning_rate": 0.0001834390651085142, + "loss": 0.4843, + "step": 254 + }, + { + "epoch": 8.950735939161672e-05, + "grad_norm": 0.4863683879375458, + "learning_rate": 0.00018337228714524208, + "loss": 0.5204, + "step": 255 + }, + { + "epoch": 8.985836864413287e-05, + "grad_norm": 0.6100988984107971, + "learning_rate": 0.00018330550918196995, + "loss": 0.7296, + "step": 256 + }, + { + "epoch": 9.020937789664901e-05, + "grad_norm": 0.40949374437332153, + "learning_rate": 0.00018323873121869782, + "loss": 0.5707, + "step": 257 + }, + { + "epoch": 9.056038714916516e-05, + "grad_norm": 0.47316402196884155, + "learning_rate": 0.0001831719532554257, + "loss": 0.6655, + "step": 258 + }, + { + "epoch": 9.091139640168129e-05, + "grad_norm": 0.4053696393966675, + "learning_rate": 0.0001831051752921536, + "loss": 0.5822, + "step": 259 + }, + { + "epoch": 9.126240565419744e-05, + "grad_norm": 0.4582972228527069, + "learning_rate": 0.00018303839732888147, + "loss": 0.5475, + "step": 260 + }, + { + "epoch": 9.161341490671359e-05, + "grad_norm": 0.38666802644729614, + "learning_rate": 0.00018297161936560937, + "loss": 0.4744, + "step": 261 + }, + { + "epoch": 9.196442415922973e-05, + "grad_norm": 0.31954991817474365, + "learning_rate": 0.00018290484140233724, + "loss": 0.6337, + "step": 262 + }, + { + "epoch": 9.231543341174588e-05, + "grad_norm": 0.3590424358844757, + "learning_rate": 0.00018283806343906512, + "loss": 0.5683, + "step": 263 + }, + { + "epoch": 9.266644266426201e-05, + "grad_norm": 0.4042195975780487, + "learning_rate": 0.000182771285475793, + "loss": 0.6142, + "step": 264 + }, + { + "epoch": 9.301745191677816e-05, + "grad_norm": 0.3474234342575073, + "learning_rate": 0.0001827045075125209, + "loss": 0.6035, + "step": 265 + }, + { + "epoch": 9.336846116929431e-05, + "grad_norm": 0.337091326713562, + "learning_rate": 0.00018263772954924876, + "loss": 0.6107, + "step": 266 + }, + { + "epoch": 9.371947042181045e-05, + "grad_norm": 0.3313732445240021, + "learning_rate": 0.00018257095158597664, + "loss": 0.6491, + "step": 267 + }, + { + "epoch": 9.40704796743266e-05, + "grad_norm": 0.3931679129600525, + "learning_rate": 0.0001825041736227045, + "loss": 0.5492, + "step": 268 + }, + { + "epoch": 9.442148892684273e-05, + "grad_norm": 0.5848420262336731, + "learning_rate": 0.00018243739565943238, + "loss": 0.7091, + "step": 269 + }, + { + "epoch": 9.477249817935888e-05, + "grad_norm": 0.4851846992969513, + "learning_rate": 0.00018237061769616028, + "loss": 0.5856, + "step": 270 + }, + { + "epoch": 9.512350743187503e-05, + "grad_norm": 0.3434993326663971, + "learning_rate": 0.00018230383973288816, + "loss": 0.5085, + "step": 271 + }, + { + "epoch": 9.547451668439117e-05, + "grad_norm": 0.2978988587856293, + "learning_rate": 0.00018223706176961603, + "loss": 0.481, + "step": 272 + }, + { + "epoch": 9.582552593690732e-05, + "grad_norm": 0.34215858578681946, + "learning_rate": 0.0001821702838063439, + "loss": 0.5723, + "step": 273 + }, + { + "epoch": 9.617653518942345e-05, + "grad_norm": 0.43445509672164917, + "learning_rate": 0.00018210350584307178, + "loss": 0.5691, + "step": 274 + }, + { + "epoch": 9.65275444419396e-05, + "grad_norm": 0.36094945669174194, + "learning_rate": 0.00018203672787979968, + "loss": 0.5543, + "step": 275 + }, + { + "epoch": 9.687855369445575e-05, + "grad_norm": 0.386106014251709, + "learning_rate": 0.00018196994991652755, + "loss": 0.5561, + "step": 276 + }, + { + "epoch": 9.722956294697189e-05, + "grad_norm": 0.36676689982414246, + "learning_rate": 0.00018190317195325542, + "loss": 0.5479, + "step": 277 + }, + { + "epoch": 9.758057219948804e-05, + "grad_norm": 0.37988394498825073, + "learning_rate": 0.00018183639398998332, + "loss": 0.5772, + "step": 278 + }, + { + "epoch": 9.793158145200417e-05, + "grad_norm": 0.4024789035320282, + "learning_rate": 0.0001817696160267112, + "loss": 0.6065, + "step": 279 + }, + { + "epoch": 9.828259070452032e-05, + "grad_norm": 0.3697255551815033, + "learning_rate": 0.0001817028380634391, + "loss": 0.5021, + "step": 280 + }, + { + "epoch": 9.863359995703647e-05, + "grad_norm": 0.43579426407814026, + "learning_rate": 0.00018163606010016697, + "loss": 0.555, + "step": 281 + }, + { + "epoch": 9.898460920955261e-05, + "grad_norm": 0.4760832190513611, + "learning_rate": 0.00018156928213689484, + "loss": 0.6438, + "step": 282 + }, + { + "epoch": 9.933561846206876e-05, + "grad_norm": 0.45258408784866333, + "learning_rate": 0.00018150250417362272, + "loss": 0.4717, + "step": 283 + }, + { + "epoch": 9.96866277145849e-05, + "grad_norm": 0.428108274936676, + "learning_rate": 0.0001814357262103506, + "loss": 0.6029, + "step": 284 + }, + { + "epoch": 0.00010003763696710104, + "grad_norm": 0.3999852240085602, + "learning_rate": 0.00018136894824707846, + "loss": 0.4524, + "step": 285 + }, + { + "epoch": 0.0001003886462196172, + "grad_norm": 0.44319403171539307, + "learning_rate": 0.00018130217028380636, + "loss": 0.6619, + "step": 286 + }, + { + "epoch": 0.00010073965547213333, + "grad_norm": 0.43008357286453247, + "learning_rate": 0.00018123539232053424, + "loss": 0.6105, + "step": 287 + }, + { + "epoch": 0.00010109066472464948, + "grad_norm": 0.38037821650505066, + "learning_rate": 0.0001811686143572621, + "loss": 0.6649, + "step": 288 + }, + { + "epoch": 0.00010144167397716562, + "grad_norm": 0.3713517487049103, + "learning_rate": 0.00018110183639398998, + "loss": 0.6381, + "step": 289 + }, + { + "epoch": 0.00010179268322968176, + "grad_norm": 0.3437170386314392, + "learning_rate": 0.00018103505843071786, + "loss": 0.4563, + "step": 290 + }, + { + "epoch": 0.00010214369248219791, + "grad_norm": 0.3661468029022217, + "learning_rate": 0.00018096828046744576, + "loss": 0.606, + "step": 291 + }, + { + "epoch": 0.00010249470173471405, + "grad_norm": 0.36346200108528137, + "learning_rate": 0.00018090150250417363, + "loss": 0.5895, + "step": 292 + }, + { + "epoch": 0.0001028457109872302, + "grad_norm": 0.31052225828170776, + "learning_rate": 0.0001808347245409015, + "loss": 0.4409, + "step": 293 + }, + { + "epoch": 0.00010319672023974634, + "grad_norm": 0.37012970447540283, + "learning_rate": 0.00018076794657762938, + "loss": 0.505, + "step": 294 + }, + { + "epoch": 0.00010354772949226248, + "grad_norm": 0.3958667814731598, + "learning_rate": 0.00018070116861435728, + "loss": 0.5371, + "step": 295 + }, + { + "epoch": 0.00010389873874477863, + "grad_norm": 0.4892179071903229, + "learning_rate": 0.00018063439065108515, + "loss": 0.6737, + "step": 296 + }, + { + "epoch": 0.00010424974799729477, + "grad_norm": 0.41874751448631287, + "learning_rate": 0.00018056761268781305, + "loss": 0.651, + "step": 297 + }, + { + "epoch": 0.00010460075724981092, + "grad_norm": 0.4167911410331726, + "learning_rate": 0.00018050083472454092, + "loss": 0.5531, + "step": 298 + }, + { + "epoch": 0.00010495176650232706, + "grad_norm": 0.3758225440979004, + "learning_rate": 0.0001804340567612688, + "loss": 0.6285, + "step": 299 + }, + { + "epoch": 0.0001053027757548432, + "grad_norm": 0.3688598573207855, + "learning_rate": 0.00018036727879799667, + "loss": 0.5219, + "step": 300 + }, + { + "epoch": 0.00010565378500735934, + "grad_norm": 0.3501751124858856, + "learning_rate": 0.00018030050083472454, + "loss": 0.6351, + "step": 301 + }, + { + "epoch": 0.00010600479425987549, + "grad_norm": 0.42876511812210083, + "learning_rate": 0.00018023372287145244, + "loss": 0.544, + "step": 302 + }, + { + "epoch": 0.00010635580351239164, + "grad_norm": 0.47046172618865967, + "learning_rate": 0.00018016694490818031, + "loss": 0.6304, + "step": 303 + }, + { + "epoch": 0.00010670681276490778, + "grad_norm": 0.402271032333374, + "learning_rate": 0.0001801001669449082, + "loss": 0.5039, + "step": 304 + }, + { + "epoch": 0.00010705782201742393, + "grad_norm": 0.41232413053512573, + "learning_rate": 0.00018003338898163606, + "loss": 0.5892, + "step": 305 + }, + { + "epoch": 0.00010740883126994006, + "grad_norm": 0.3628154993057251, + "learning_rate": 0.00017996661101836393, + "loss": 0.5737, + "step": 306 + }, + { + "epoch": 0.00010775984052245621, + "grad_norm": 0.4291020631790161, + "learning_rate": 0.00017989983305509183, + "loss": 0.6597, + "step": 307 + }, + { + "epoch": 0.00010811084977497236, + "grad_norm": 0.33218181133270264, + "learning_rate": 0.0001798330550918197, + "loss": 0.5726, + "step": 308 + }, + { + "epoch": 0.0001084618590274885, + "grad_norm": 0.3439387381076813, + "learning_rate": 0.00017976627712854758, + "loss": 0.5615, + "step": 309 + }, + { + "epoch": 0.00010881286828000465, + "grad_norm": 0.3523644208908081, + "learning_rate": 0.00017969949916527545, + "loss": 0.4968, + "step": 310 + }, + { + "epoch": 0.00010916387753252078, + "grad_norm": 0.4045630991458893, + "learning_rate": 0.00017963272120200333, + "loss": 0.6425, + "step": 311 + }, + { + "epoch": 0.00010951488678503693, + "grad_norm": 0.3726767599582672, + "learning_rate": 0.00017956594323873123, + "loss": 0.6575, + "step": 312 + }, + { + "epoch": 0.00010986589603755308, + "grad_norm": 0.32131972908973694, + "learning_rate": 0.0001794991652754591, + "loss": 0.5146, + "step": 313 + }, + { + "epoch": 0.00011021690529006922, + "grad_norm": 0.5013764500617981, + "learning_rate": 0.000179432387312187, + "loss": 0.53, + "step": 314 + }, + { + "epoch": 0.00011056791454258537, + "grad_norm": 0.36830246448516846, + "learning_rate": 0.00017936560934891487, + "loss": 0.6291, + "step": 315 + }, + { + "epoch": 0.0001109189237951015, + "grad_norm": 0.3587378263473511, + "learning_rate": 0.00017929883138564275, + "loss": 0.4954, + "step": 316 + }, + { + "epoch": 0.00011126993304761765, + "grad_norm": 0.3480195105075836, + "learning_rate": 0.00017923205342237062, + "loss": 0.606, + "step": 317 + }, + { + "epoch": 0.0001116209423001338, + "grad_norm": 0.38415858149528503, + "learning_rate": 0.00017916527545909852, + "loss": 0.7281, + "step": 318 + }, + { + "epoch": 0.00011197195155264994, + "grad_norm": 0.35853826999664307, + "learning_rate": 0.0001790984974958264, + "loss": 0.5851, + "step": 319 + }, + { + "epoch": 0.00011232296080516609, + "grad_norm": 0.42092210054397583, + "learning_rate": 0.00017903171953255427, + "loss": 0.5324, + "step": 320 + }, + { + "epoch": 0.00011267397005768222, + "grad_norm": 0.34538987278938293, + "learning_rate": 0.00017896494156928214, + "loss": 0.6387, + "step": 321 + }, + { + "epoch": 0.00011302497931019837, + "grad_norm": 0.38299745321273804, + "learning_rate": 0.00017889816360601, + "loss": 0.6013, + "step": 322 + }, + { + "epoch": 0.00011337598856271452, + "grad_norm": 0.32100436091423035, + "learning_rate": 0.0001788313856427379, + "loss": 0.4627, + "step": 323 + }, + { + "epoch": 0.00011372699781523066, + "grad_norm": 0.3458426594734192, + "learning_rate": 0.0001787646076794658, + "loss": 0.5865, + "step": 324 + }, + { + "epoch": 0.0001140780070677468, + "grad_norm": 0.33228665590286255, + "learning_rate": 0.00017869782971619366, + "loss": 0.4611, + "step": 325 + }, + { + "epoch": 0.00011442901632026294, + "grad_norm": 0.38747021555900574, + "learning_rate": 0.00017863105175292153, + "loss": 0.5777, + "step": 326 + }, + { + "epoch": 0.00011478002557277909, + "grad_norm": 0.3888608515262604, + "learning_rate": 0.0001785642737896494, + "loss": 0.5664, + "step": 327 + }, + { + "epoch": 0.00011513103482529524, + "grad_norm": 0.4084737002849579, + "learning_rate": 0.0001784974958263773, + "loss": 0.5939, + "step": 328 + }, + { + "epoch": 0.00011548204407781138, + "grad_norm": 0.4964492917060852, + "learning_rate": 0.00017843071786310518, + "loss": 0.6256, + "step": 329 + }, + { + "epoch": 0.00011583305333032753, + "grad_norm": 0.37329745292663574, + "learning_rate": 0.00017836393989983305, + "loss": 0.5388, + "step": 330 + }, + { + "epoch": 0.00011618406258284366, + "grad_norm": 0.37680140137672424, + "learning_rate": 0.00017829716193656095, + "loss": 0.6203, + "step": 331 + }, + { + "epoch": 0.00011653507183535981, + "grad_norm": 0.4162957966327667, + "learning_rate": 0.00017823038397328883, + "loss": 0.6478, + "step": 332 + }, + { + "epoch": 0.00011688608108787596, + "grad_norm": 0.3473896086215973, + "learning_rate": 0.0001781636060100167, + "loss": 0.589, + "step": 333 + }, + { + "epoch": 0.0001172370903403921, + "grad_norm": 0.4039511978626251, + "learning_rate": 0.0001780968280467446, + "loss": 0.5681, + "step": 334 + }, + { + "epoch": 0.00011758809959290825, + "grad_norm": 0.3135715425014496, + "learning_rate": 0.00017803005008347247, + "loss": 0.5069, + "step": 335 + }, + { + "epoch": 0.00011793910884542438, + "grad_norm": 0.4296559989452362, + "learning_rate": 0.00017796327212020035, + "loss": 0.5413, + "step": 336 + }, + { + "epoch": 0.00011829011809794053, + "grad_norm": 0.4197536110877991, + "learning_rate": 0.00017789649415692822, + "loss": 0.694, + "step": 337 + }, + { + "epoch": 0.00011864112735045668, + "grad_norm": 0.3633468449115753, + "learning_rate": 0.0001778297161936561, + "loss": 0.5475, + "step": 338 + }, + { + "epoch": 0.00011899213660297282, + "grad_norm": 0.2867147922515869, + "learning_rate": 0.000177762938230384, + "loss": 0.485, + "step": 339 + }, + { + "epoch": 0.00011934314585548897, + "grad_norm": 0.3445490300655365, + "learning_rate": 0.00017769616026711187, + "loss": 0.6304, + "step": 340 + }, + { + "epoch": 0.0001196941551080051, + "grad_norm": 0.31692221760749817, + "learning_rate": 0.00017762938230383974, + "loss": 0.5804, + "step": 341 + }, + { + "epoch": 0.00012004516436052125, + "grad_norm": 0.31391167640686035, + "learning_rate": 0.0001775626043405676, + "loss": 0.5945, + "step": 342 + }, + { + "epoch": 0.0001203961736130374, + "grad_norm": 0.3484472632408142, + "learning_rate": 0.00017749582637729548, + "loss": 0.6577, + "step": 343 + }, + { + "epoch": 0.00012074718286555354, + "grad_norm": 0.37430596351623535, + "learning_rate": 0.00017742904841402339, + "loss": 0.6854, + "step": 344 + }, + { + "epoch": 0.00012109819211806969, + "grad_norm": 0.34305211901664734, + "learning_rate": 0.00017736227045075126, + "loss": 0.5123, + "step": 345 + }, + { + "epoch": 0.00012144920137058582, + "grad_norm": 0.3398534059524536, + "learning_rate": 0.00017729549248747913, + "loss": 0.5602, + "step": 346 + }, + { + "epoch": 0.00012180021062310197, + "grad_norm": 0.4278014600276947, + "learning_rate": 0.000177228714524207, + "loss": 0.5152, + "step": 347 + }, + { + "epoch": 0.00012215121987561812, + "grad_norm": 0.4011085629463196, + "learning_rate": 0.0001771619365609349, + "loss": 0.6217, + "step": 348 + }, + { + "epoch": 0.00012250222912813427, + "grad_norm": 0.3425695598125458, + "learning_rate": 0.00017709515859766278, + "loss": 0.5037, + "step": 349 + }, + { + "epoch": 0.0001228532383806504, + "grad_norm": 0.34036242961883545, + "learning_rate": 0.00017702838063439068, + "loss": 0.649, + "step": 350 + }, + { + "epoch": 0.00012320424763316654, + "grad_norm": 0.5631874203681946, + "learning_rate": 0.00017696160267111855, + "loss": 0.5656, + "step": 351 + }, + { + "epoch": 0.0001235552568856827, + "grad_norm": 0.4195176661014557, + "learning_rate": 0.00017689482470784642, + "loss": 0.6899, + "step": 352 + }, + { + "epoch": 0.00012390626613819884, + "grad_norm": 0.41814154386520386, + "learning_rate": 0.0001768280467445743, + "loss": 0.551, + "step": 353 + }, + { + "epoch": 0.000124257275390715, + "grad_norm": 0.3374340534210205, + "learning_rate": 0.00017676126878130217, + "loss": 0.7022, + "step": 354 + }, + { + "epoch": 0.00012460828464323112, + "grad_norm": 0.41464921832084656, + "learning_rate": 0.00017669449081803007, + "loss": 0.5301, + "step": 355 + }, + { + "epoch": 0.00012495929389574726, + "grad_norm": 0.4443178176879883, + "learning_rate": 0.00017662771285475794, + "loss": 0.5487, + "step": 356 + }, + { + "epoch": 0.00012531030314826341, + "grad_norm": 0.3389272093772888, + "learning_rate": 0.00017656093489148582, + "loss": 0.581, + "step": 357 + }, + { + "epoch": 0.00012566131240077956, + "grad_norm": 0.29650986194610596, + "learning_rate": 0.0001764941569282137, + "loss": 0.5801, + "step": 358 + }, + { + "epoch": 0.0001260123216532957, + "grad_norm": 0.40271905064582825, + "learning_rate": 0.00017642737896494156, + "loss": 0.6738, + "step": 359 + }, + { + "epoch": 0.00012636333090581184, + "grad_norm": 0.352225661277771, + "learning_rate": 0.00017636060100166946, + "loss": 0.5727, + "step": 360 + }, + { + "epoch": 0.00012671434015832798, + "grad_norm": 0.3469563126564026, + "learning_rate": 0.00017629382303839734, + "loss": 0.5188, + "step": 361 + }, + { + "epoch": 0.00012706534941084413, + "grad_norm": 0.30644670128822327, + "learning_rate": 0.0001762270450751252, + "loss": 0.497, + "step": 362 + }, + { + "epoch": 0.00012741635866336028, + "grad_norm": 0.3472917377948761, + "learning_rate": 0.00017616026711185308, + "loss": 0.6363, + "step": 363 + }, + { + "epoch": 0.00012776736791587643, + "grad_norm": 0.37184756994247437, + "learning_rate": 0.00017609348914858096, + "loss": 0.5223, + "step": 364 + }, + { + "epoch": 0.00012811837716839256, + "grad_norm": 0.3247138559818268, + "learning_rate": 0.00017602671118530886, + "loss": 0.5457, + "step": 365 + }, + { + "epoch": 0.0001284693864209087, + "grad_norm": 0.5236158967018127, + "learning_rate": 0.00017595993322203673, + "loss": 0.615, + "step": 366 + }, + { + "epoch": 0.00012882039567342485, + "grad_norm": 0.33708465099334717, + "learning_rate": 0.00017589315525876463, + "loss": 0.6163, + "step": 367 + }, + { + "epoch": 0.000129171404925941, + "grad_norm": 0.33848705887794495, + "learning_rate": 0.0001758263772954925, + "loss": 0.4229, + "step": 368 + }, + { + "epoch": 0.00012952241417845715, + "grad_norm": 0.5827682018280029, + "learning_rate": 0.00017575959933222038, + "loss": 0.5668, + "step": 369 + }, + { + "epoch": 0.00012987342343097328, + "grad_norm": 0.36217448115348816, + "learning_rate": 0.00017569282136894825, + "loss": 0.4983, + "step": 370 + }, + { + "epoch": 0.00013022443268348943, + "grad_norm": 0.329414963722229, + "learning_rate": 0.00017562604340567615, + "loss": 0.4281, + "step": 371 + }, + { + "epoch": 0.00013057544193600557, + "grad_norm": 0.36746612191200256, + "learning_rate": 0.00017555926544240402, + "loss": 0.6629, + "step": 372 + }, + { + "epoch": 0.00013092645118852172, + "grad_norm": 0.3954717516899109, + "learning_rate": 0.0001754924874791319, + "loss": 0.5784, + "step": 373 + }, + { + "epoch": 0.00013127746044103787, + "grad_norm": 0.41279932856559753, + "learning_rate": 0.00017542570951585977, + "loss": 0.5994, + "step": 374 + }, + { + "epoch": 0.000131628469693554, + "grad_norm": 0.3019951581954956, + "learning_rate": 0.00017535893155258764, + "loss": 0.5584, + "step": 375 + }, + { + "epoch": 0.00013197947894607015, + "grad_norm": 0.3079768121242523, + "learning_rate": 0.00017529215358931554, + "loss": 0.5904, + "step": 376 + }, + { + "epoch": 0.0001323304881985863, + "grad_norm": 0.5678027272224426, + "learning_rate": 0.00017522537562604342, + "loss": 0.6441, + "step": 377 + }, + { + "epoch": 0.00013268149745110244, + "grad_norm": 0.38624581694602966, + "learning_rate": 0.0001751585976627713, + "loss": 0.5582, + "step": 378 + }, + { + "epoch": 0.0001330325067036186, + "grad_norm": 0.4368002712726593, + "learning_rate": 0.00017509181969949916, + "loss": 0.686, + "step": 379 + }, + { + "epoch": 0.00013338351595613472, + "grad_norm": 0.3409269154071808, + "learning_rate": 0.00017502504173622704, + "loss": 0.582, + "step": 380 + }, + { + "epoch": 0.00013373452520865087, + "grad_norm": 0.3772698938846588, + "learning_rate": 0.0001749582637729549, + "loss": 0.5314, + "step": 381 + }, + { + "epoch": 0.00013408553446116702, + "grad_norm": 0.3791707158088684, + "learning_rate": 0.0001748914858096828, + "loss": 0.6143, + "step": 382 + }, + { + "epoch": 0.00013443654371368317, + "grad_norm": 0.4441101551055908, + "learning_rate": 0.0001748247078464107, + "loss": 0.5726, + "step": 383 + }, + { + "epoch": 0.0001347875529661993, + "grad_norm": 0.4160211980342865, + "learning_rate": 0.00017475792988313858, + "loss": 0.6003, + "step": 384 + }, + { + "epoch": 0.00013513856221871544, + "grad_norm": 0.41698628664016724, + "learning_rate": 0.00017469115191986646, + "loss": 0.4539, + "step": 385 + }, + { + "epoch": 0.00013548957147123159, + "grad_norm": 0.337007999420166, + "learning_rate": 0.00017462437395659433, + "loss": 0.5176, + "step": 386 + }, + { + "epoch": 0.00013584058072374774, + "grad_norm": 0.30926409363746643, + "learning_rate": 0.00017455759599332223, + "loss": 0.6072, + "step": 387 + }, + { + "epoch": 0.00013619158997626389, + "grad_norm": 0.3663052022457123, + "learning_rate": 0.0001744908180300501, + "loss": 0.538, + "step": 388 + }, + { + "epoch": 0.00013654259922878, + "grad_norm": 0.3410074710845947, + "learning_rate": 0.00017442404006677798, + "loss": 0.5687, + "step": 389 + }, + { + "epoch": 0.00013689360848129616, + "grad_norm": 0.5266095399856567, + "learning_rate": 0.00017435726210350585, + "loss": 0.6685, + "step": 390 + }, + { + "epoch": 0.0001372446177338123, + "grad_norm": 0.4020686149597168, + "learning_rate": 0.00017429048414023372, + "loss": 0.586, + "step": 391 + }, + { + "epoch": 0.00013759562698632846, + "grad_norm": 0.39995548129081726, + "learning_rate": 0.00017422370617696162, + "loss": 0.6958, + "step": 392 + }, + { + "epoch": 0.0001379466362388446, + "grad_norm": 0.4024721682071686, + "learning_rate": 0.0001741569282136895, + "loss": 0.6411, + "step": 393 + }, + { + "epoch": 0.00013829764549136073, + "grad_norm": 0.38193392753601074, + "learning_rate": 0.00017409015025041737, + "loss": 0.5857, + "step": 394 + }, + { + "epoch": 0.00013864865474387688, + "grad_norm": 0.39786526560783386, + "learning_rate": 0.00017402337228714524, + "loss": 0.5215, + "step": 395 + }, + { + "epoch": 0.00013899966399639303, + "grad_norm": 0.49223974347114563, + "learning_rate": 0.00017395659432387311, + "loss": 0.5881, + "step": 396 + }, + { + "epoch": 0.00013935067324890918, + "grad_norm": 0.3398894667625427, + "learning_rate": 0.00017388981636060101, + "loss": 0.5466, + "step": 397 + }, + { + "epoch": 0.00013970168250142533, + "grad_norm": 0.34891223907470703, + "learning_rate": 0.0001738230383973289, + "loss": 0.5901, + "step": 398 + }, + { + "epoch": 0.00014005269175394145, + "grad_norm": 0.47644108533859253, + "learning_rate": 0.00017375626043405676, + "loss": 0.5075, + "step": 399 + }, + { + "epoch": 0.0001404037010064576, + "grad_norm": 0.42530229687690735, + "learning_rate": 0.00017368948247078466, + "loss": 0.663, + "step": 400 + }, + { + "epoch": 0.00014075471025897375, + "grad_norm": 0.30858534574508667, + "learning_rate": 0.00017362270450751253, + "loss": 0.4724, + "step": 401 + }, + { + "epoch": 0.0001411057195114899, + "grad_norm": 0.42453449964523315, + "learning_rate": 0.0001735559265442404, + "loss": 0.6074, + "step": 402 + }, + { + "epoch": 0.00014145672876400605, + "grad_norm": 0.3964505195617676, + "learning_rate": 0.0001734891485809683, + "loss": 0.4913, + "step": 403 + }, + { + "epoch": 0.00014180773801652217, + "grad_norm": 0.3317703902721405, + "learning_rate": 0.00017342237061769618, + "loss": 0.5504, + "step": 404 + }, + { + "epoch": 0.00014215874726903832, + "grad_norm": 0.3912264108657837, + "learning_rate": 0.00017335559265442405, + "loss": 0.6301, + "step": 405 + }, + { + "epoch": 0.00014250975652155447, + "grad_norm": 0.3582877218723297, + "learning_rate": 0.00017328881469115193, + "loss": 0.6205, + "step": 406 + }, + { + "epoch": 0.00014286076577407062, + "grad_norm": 0.3691099286079407, + "learning_rate": 0.0001732220367278798, + "loss": 0.5348, + "step": 407 + }, + { + "epoch": 0.00014321177502658677, + "grad_norm": 0.35860803723335266, + "learning_rate": 0.0001731552587646077, + "loss": 0.6029, + "step": 408 + }, + { + "epoch": 0.0001435627842791029, + "grad_norm": 0.3640693426132202, + "learning_rate": 0.00017308848080133557, + "loss": 0.6673, + "step": 409 + }, + { + "epoch": 0.00014391379353161904, + "grad_norm": 0.3550623953342438, + "learning_rate": 0.00017302170283806345, + "loss": 0.4659, + "step": 410 + }, + { + "epoch": 0.0001442648027841352, + "grad_norm": 0.45885637402534485, + "learning_rate": 0.00017295492487479132, + "loss": 0.4781, + "step": 411 + }, + { + "epoch": 0.00014461581203665134, + "grad_norm": 0.3703556954860687, + "learning_rate": 0.0001728881469115192, + "loss": 0.4829, + "step": 412 + }, + { + "epoch": 0.0001449668212891675, + "grad_norm": 0.5436837077140808, + "learning_rate": 0.0001728213689482471, + "loss": 0.6056, + "step": 413 + }, + { + "epoch": 0.0001453178305416836, + "grad_norm": 0.3953244686126709, + "learning_rate": 0.00017275459098497497, + "loss": 0.4884, + "step": 414 + }, + { + "epoch": 0.00014566883979419976, + "grad_norm": 0.34003904461860657, + "learning_rate": 0.00017268781302170284, + "loss": 0.6014, + "step": 415 + }, + { + "epoch": 0.0001460198490467159, + "grad_norm": 0.3463648557662964, + "learning_rate": 0.0001726210350584307, + "loss": 0.603, + "step": 416 + }, + { + "epoch": 0.00014637085829923206, + "grad_norm": 0.4293590784072876, + "learning_rate": 0.0001725542570951586, + "loss": 0.6686, + "step": 417 + }, + { + "epoch": 0.0001467218675517482, + "grad_norm": 0.4243469834327698, + "learning_rate": 0.0001724874791318865, + "loss": 0.6422, + "step": 418 + }, + { + "epoch": 0.00014707287680426433, + "grad_norm": 0.38327839970588684, + "learning_rate": 0.0001724207011686144, + "loss": 0.5595, + "step": 419 + }, + { + "epoch": 0.00014742388605678048, + "grad_norm": 0.31334301829338074, + "learning_rate": 0.00017235392320534226, + "loss": 0.474, + "step": 420 + }, + { + "epoch": 0.00014777489530929663, + "grad_norm": 0.3335350453853607, + "learning_rate": 0.00017228714524207013, + "loss": 0.6172, + "step": 421 + }, + { + "epoch": 0.00014812590456181278, + "grad_norm": 0.373696506023407, + "learning_rate": 0.000172220367278798, + "loss": 0.6183, + "step": 422 + }, + { + "epoch": 0.00014847691381432893, + "grad_norm": 0.45814886689186096, + "learning_rate": 0.00017215358931552588, + "loss": 0.5059, + "step": 423 + }, + { + "epoch": 0.00014882792306684505, + "grad_norm": 0.3578277826309204, + "learning_rate": 0.00017208681135225378, + "loss": 0.5771, + "step": 424 + }, + { + "epoch": 0.0001491789323193612, + "grad_norm": 0.42081883549690247, + "learning_rate": 0.00017202003338898165, + "loss": 0.5604, + "step": 425 + }, + { + "epoch": 0.00014952994157187735, + "grad_norm": 0.3173503875732422, + "learning_rate": 0.00017195325542570953, + "loss": 0.5738, + "step": 426 + }, + { + "epoch": 0.0001498809508243935, + "grad_norm": 0.38292011618614197, + "learning_rate": 0.0001718864774624374, + "loss": 0.6067, + "step": 427 + }, + { + "epoch": 0.00015023196007690965, + "grad_norm": 0.3518977463245392, + "learning_rate": 0.00017181969949916527, + "loss": 0.5073, + "step": 428 + }, + { + "epoch": 0.00015058296932942577, + "grad_norm": 0.5157706141471863, + "learning_rate": 0.00017175292153589317, + "loss": 0.5496, + "step": 429 + }, + { + "epoch": 0.00015093397858194192, + "grad_norm": 0.32064110040664673, + "learning_rate": 0.00017168614357262105, + "loss": 0.4766, + "step": 430 + }, + { + "epoch": 0.00015128498783445807, + "grad_norm": 0.42229798436164856, + "learning_rate": 0.00017161936560934892, + "loss": 0.5953, + "step": 431 + }, + { + "epoch": 0.00015163599708697422, + "grad_norm": 0.4723895192146301, + "learning_rate": 0.0001715525876460768, + "loss": 0.4783, + "step": 432 + }, + { + "epoch": 0.00015198700633949037, + "grad_norm": 0.3841445744037628, + "learning_rate": 0.00017148580968280467, + "loss": 0.5003, + "step": 433 + }, + { + "epoch": 0.0001523380155920065, + "grad_norm": 0.38026461005210876, + "learning_rate": 0.00017141903171953257, + "loss": 0.5093, + "step": 434 + }, + { + "epoch": 0.00015268902484452264, + "grad_norm": 0.37034904956817627, + "learning_rate": 0.00017135225375626044, + "loss": 0.6158, + "step": 435 + }, + { + "epoch": 0.0001530400340970388, + "grad_norm": 0.3876091241836548, + "learning_rate": 0.00017128547579298834, + "loss": 0.5287, + "step": 436 + }, + { + "epoch": 0.00015339104334955494, + "grad_norm": 0.30055519938468933, + "learning_rate": 0.0001712186978297162, + "loss": 0.5018, + "step": 437 + }, + { + "epoch": 0.0001537420526020711, + "grad_norm": 0.36094966530799866, + "learning_rate": 0.00017115191986644409, + "loss": 0.4961, + "step": 438 + }, + { + "epoch": 0.0001540930618545872, + "grad_norm": 0.3300524055957794, + "learning_rate": 0.00017108514190317196, + "loss": 0.5246, + "step": 439 + }, + { + "epoch": 0.00015444407110710336, + "grad_norm": 0.40980783104896545, + "learning_rate": 0.00017101836393989986, + "loss": 0.5705, + "step": 440 + }, + { + "epoch": 0.0001547950803596195, + "grad_norm": 0.3442326784133911, + "learning_rate": 0.00017095158597662773, + "loss": 0.5595, + "step": 441 + }, + { + "epoch": 0.00015514608961213566, + "grad_norm": 0.48015034198760986, + "learning_rate": 0.0001708848080133556, + "loss": 0.5642, + "step": 442 + }, + { + "epoch": 0.0001554970988646518, + "grad_norm": 0.5570142269134521, + "learning_rate": 0.00017081803005008348, + "loss": 0.6111, + "step": 443 + }, + { + "epoch": 0.00015584810811716793, + "grad_norm": 0.30470094084739685, + "learning_rate": 0.00017075125208681135, + "loss": 0.5151, + "step": 444 + }, + { + "epoch": 0.00015619911736968408, + "grad_norm": 0.31946614384651184, + "learning_rate": 0.00017068447412353925, + "loss": 0.5265, + "step": 445 + }, + { + "epoch": 0.00015655012662220023, + "grad_norm": 0.38980719447135925, + "learning_rate": 0.00017061769616026712, + "loss": 0.575, + "step": 446 + }, + { + "epoch": 0.00015690113587471638, + "grad_norm": 0.4077732264995575, + "learning_rate": 0.000170550918196995, + "loss": 0.5729, + "step": 447 + }, + { + "epoch": 0.00015725214512723253, + "grad_norm": 0.38632732629776, + "learning_rate": 0.00017048414023372287, + "loss": 0.594, + "step": 448 + }, + { + "epoch": 0.00015760315437974865, + "grad_norm": 0.37193921208381653, + "learning_rate": 0.00017041736227045074, + "loss": 0.6062, + "step": 449 + }, + { + "epoch": 0.0001579541636322648, + "grad_norm": 0.399029016494751, + "learning_rate": 0.00017035058430717862, + "loss": 0.4538, + "step": 450 + }, + { + "epoch": 0.00015830517288478095, + "grad_norm": 0.37710487842559814, + "learning_rate": 0.00017028380634390652, + "loss": 0.5615, + "step": 451 + }, + { + "epoch": 0.0001586561821372971, + "grad_norm": 0.38591668009757996, + "learning_rate": 0.0001702170283806344, + "loss": 0.5316, + "step": 452 + }, + { + "epoch": 0.00015900719138981325, + "grad_norm": 0.3453538417816162, + "learning_rate": 0.0001701502504173623, + "loss": 0.4645, + "step": 453 + }, + { + "epoch": 0.00015935820064232937, + "grad_norm": 0.34171512722969055, + "learning_rate": 0.00017008347245409016, + "loss": 0.5856, + "step": 454 + }, + { + "epoch": 0.00015970920989484552, + "grad_norm": 0.39591720700263977, + "learning_rate": 0.00017001669449081804, + "loss": 0.573, + "step": 455 + }, + { + "epoch": 0.00016006021914736167, + "grad_norm": 0.4127822816371918, + "learning_rate": 0.00016994991652754594, + "loss": 0.5183, + "step": 456 + }, + { + "epoch": 0.00016041122839987782, + "grad_norm": 0.37893375754356384, + "learning_rate": 0.0001698831385642738, + "loss": 0.566, + "step": 457 + }, + { + "epoch": 0.00016076223765239397, + "grad_norm": 0.33429333567619324, + "learning_rate": 0.00016981636060100168, + "loss": 0.449, + "step": 458 + }, + { + "epoch": 0.0001611132469049101, + "grad_norm": 0.3333180546760559, + "learning_rate": 0.00016974958263772956, + "loss": 0.4441, + "step": 459 + }, + { + "epoch": 0.00016146425615742624, + "grad_norm": 0.3591359257698059, + "learning_rate": 0.00016968280467445743, + "loss": 0.55, + "step": 460 + }, + { + "epoch": 0.0001618152654099424, + "grad_norm": 0.35390427708625793, + "learning_rate": 0.00016961602671118533, + "loss": 0.6445, + "step": 461 + }, + { + "epoch": 0.00016216627466245854, + "grad_norm": 0.42036697268486023, + "learning_rate": 0.0001695492487479132, + "loss": 0.5411, + "step": 462 + }, + { + "epoch": 0.0001625172839149747, + "grad_norm": 0.42147770524024963, + "learning_rate": 0.00016948247078464108, + "loss": 0.6218, + "step": 463 + }, + { + "epoch": 0.0001628682931674908, + "grad_norm": 0.3960399329662323, + "learning_rate": 0.00016941569282136895, + "loss": 0.6608, + "step": 464 + }, + { + "epoch": 0.00016321930242000696, + "grad_norm": 0.39676985144615173, + "learning_rate": 0.00016934891485809682, + "loss": 0.5838, + "step": 465 + }, + { + "epoch": 0.0001635703116725231, + "grad_norm": 0.2839520573616028, + "learning_rate": 0.0001692821368948247, + "loss": 0.5334, + "step": 466 + }, + { + "epoch": 0.00016392132092503926, + "grad_norm": 0.3654347062110901, + "learning_rate": 0.0001692153589315526, + "loss": 0.6065, + "step": 467 + }, + { + "epoch": 0.0001642723301775554, + "grad_norm": 0.3709166646003723, + "learning_rate": 0.00016914858096828047, + "loss": 0.509, + "step": 468 + }, + { + "epoch": 0.00016462333943007153, + "grad_norm": 0.29224780201911926, + "learning_rate": 0.00016908180300500834, + "loss": 0.5372, + "step": 469 + }, + { + "epoch": 0.00016497434868258768, + "grad_norm": 0.34979283809661865, + "learning_rate": 0.00016901502504173624, + "loss": 0.3968, + "step": 470 + }, + { + "epoch": 0.00016532535793510383, + "grad_norm": 0.34580183029174805, + "learning_rate": 0.00016894824707846412, + "loss": 0.6032, + "step": 471 + }, + { + "epoch": 0.00016567636718761998, + "grad_norm": 0.39046213030815125, + "learning_rate": 0.00016888146911519202, + "loss": 0.5628, + "step": 472 + }, + { + "epoch": 0.00016602737644013613, + "grad_norm": 0.35301411151885986, + "learning_rate": 0.0001688146911519199, + "loss": 0.607, + "step": 473 + }, + { + "epoch": 0.00016637838569265225, + "grad_norm": 0.4572748839855194, + "learning_rate": 0.00016874791318864776, + "loss": 0.5018, + "step": 474 + }, + { + "epoch": 0.0001667293949451684, + "grad_norm": 0.38230374455451965, + "learning_rate": 0.00016868113522537564, + "loss": 0.5026, + "step": 475 + }, + { + "epoch": 0.00016708040419768455, + "grad_norm": 0.37066343426704407, + "learning_rate": 0.0001686143572621035, + "loss": 0.5819, + "step": 476 + }, + { + "epoch": 0.0001674314134502007, + "grad_norm": 0.3658660054206848, + "learning_rate": 0.0001685475792988314, + "loss": 0.6825, + "step": 477 + }, + { + "epoch": 0.00016778242270271685, + "grad_norm": 0.42174890637397766, + "learning_rate": 0.00016848080133555928, + "loss": 0.6065, + "step": 478 + }, + { + "epoch": 0.00016813343195523297, + "grad_norm": 0.3462882936000824, + "learning_rate": 0.00016841402337228716, + "loss": 0.5888, + "step": 479 + }, + { + "epoch": 0.00016848444120774912, + "grad_norm": 0.44681960344314575, + "learning_rate": 0.00016834724540901503, + "loss": 0.4987, + "step": 480 + }, + { + "epoch": 0.00016883545046026527, + "grad_norm": 0.3535650372505188, + "learning_rate": 0.0001682804674457429, + "loss": 0.6478, + "step": 481 + }, + { + "epoch": 0.00016918645971278142, + "grad_norm": 0.3357018232345581, + "learning_rate": 0.00016821368948247077, + "loss": 0.4949, + "step": 482 + }, + { + "epoch": 0.00016953746896529757, + "grad_norm": 0.42756739258766174, + "learning_rate": 0.00016814691151919868, + "loss": 0.6475, + "step": 483 + }, + { + "epoch": 0.0001698884782178137, + "grad_norm": 0.36174866557121277, + "learning_rate": 0.00016808013355592655, + "loss": 0.598, + "step": 484 + }, + { + "epoch": 0.00017023948747032984, + "grad_norm": 0.37115278840065, + "learning_rate": 0.00016801335559265442, + "loss": 0.6215, + "step": 485 + }, + { + "epoch": 0.000170590496722846, + "grad_norm": 0.340249627828598, + "learning_rate": 0.0001679465776293823, + "loss": 0.5702, + "step": 486 + }, + { + "epoch": 0.00017094150597536214, + "grad_norm": 0.31226348876953125, + "learning_rate": 0.0001678797996661102, + "loss": 0.6531, + "step": 487 + }, + { + "epoch": 0.0001712925152278783, + "grad_norm": 0.35571998357772827, + "learning_rate": 0.00016781302170283807, + "loss": 0.6406, + "step": 488 + }, + { + "epoch": 0.00017164352448039441, + "grad_norm": 0.4167378842830658, + "learning_rate": 0.00016774624373956597, + "loss": 0.5111, + "step": 489 + }, + { + "epoch": 0.00017199453373291056, + "grad_norm": 0.292304128408432, + "learning_rate": 0.00016767946577629384, + "loss": 0.6643, + "step": 490 + }, + { + "epoch": 0.0001723455429854267, + "grad_norm": 0.38789069652557373, + "learning_rate": 0.00016761268781302171, + "loss": 0.4542, + "step": 491 + }, + { + "epoch": 0.00017269655223794286, + "grad_norm": 0.33764714002609253, + "learning_rate": 0.0001675459098497496, + "loss": 0.4158, + "step": 492 + }, + { + "epoch": 0.00017304756149045898, + "grad_norm": 0.34849148988723755, + "learning_rate": 0.0001674791318864775, + "loss": 0.4737, + "step": 493 + }, + { + "epoch": 0.00017339857074297513, + "grad_norm": 0.2921352684497833, + "learning_rate": 0.00016741235392320536, + "loss": 0.679, + "step": 494 + }, + { + "epoch": 0.00017374957999549128, + "grad_norm": 0.33746641874313354, + "learning_rate": 0.00016734557595993323, + "loss": 0.4957, + "step": 495 + }, + { + "epoch": 0.00017410058924800743, + "grad_norm": 0.4029395878314972, + "learning_rate": 0.0001672787979966611, + "loss": 0.6708, + "step": 496 + }, + { + "epoch": 0.00017445159850052358, + "grad_norm": 0.440033882856369, + "learning_rate": 0.00016721202003338898, + "loss": 0.5889, + "step": 497 + }, + { + "epoch": 0.0001748026077530397, + "grad_norm": 0.330692857503891, + "learning_rate": 0.00016714524207011685, + "loss": 0.5942, + "step": 498 + }, + { + "epoch": 0.00017515361700555585, + "grad_norm": 0.3111809492111206, + "learning_rate": 0.00016707846410684475, + "loss": 0.5506, + "step": 499 + }, + { + "epoch": 0.000175504626258072, + "grad_norm": 0.38885676860809326, + "learning_rate": 0.00016701168614357263, + "loss": 0.4713, + "step": 500 + }, + { + "epoch": 0.00017585563551058815, + "grad_norm": 0.3697550296783447, + "learning_rate": 0.0001669449081803005, + "loss": 0.5955, + "step": 501 + }, + { + "epoch": 0.0001762066447631043, + "grad_norm": 0.35807061195373535, + "learning_rate": 0.00016687813021702837, + "loss": 0.555, + "step": 502 + }, + { + "epoch": 0.00017655765401562043, + "grad_norm": 0.44033464789390564, + "learning_rate": 0.00016681135225375625, + "loss": 0.5668, + "step": 503 + }, + { + "epoch": 0.00017690866326813657, + "grad_norm": 0.3363400399684906, + "learning_rate": 0.00016674457429048415, + "loss": 0.6176, + "step": 504 + }, + { + "epoch": 0.00017725967252065272, + "grad_norm": 0.31457507610321045, + "learning_rate": 0.00016667779632721202, + "loss": 0.6524, + "step": 505 + }, + { + "epoch": 0.00017761068177316887, + "grad_norm": 0.38115641474723816, + "learning_rate": 0.00016661101836393992, + "loss": 0.5848, + "step": 506 + }, + { + "epoch": 0.00017796169102568502, + "grad_norm": 0.3387603759765625, + "learning_rate": 0.0001665442404006678, + "loss": 0.6992, + "step": 507 + }, + { + "epoch": 0.00017831270027820115, + "grad_norm": 0.31671345233917236, + "learning_rate": 0.00016647746243739567, + "loss": 0.5744, + "step": 508 + }, + { + "epoch": 0.0001786637095307173, + "grad_norm": 0.3776471018791199, + "learning_rate": 0.00016641068447412357, + "loss": 0.622, + "step": 509 + }, + { + "epoch": 0.00017901471878323344, + "grad_norm": 0.37572941184043884, + "learning_rate": 0.00016634390651085144, + "loss": 0.5259, + "step": 510 + }, + { + "epoch": 0.0001793657280357496, + "grad_norm": 0.3335510194301605, + "learning_rate": 0.0001662771285475793, + "loss": 0.547, + "step": 511 + }, + { + "epoch": 0.00017971673728826574, + "grad_norm": 0.33241015672683716, + "learning_rate": 0.00016621035058430719, + "loss": 0.5827, + "step": 512 + }, + { + "epoch": 0.00018006774654078187, + "grad_norm": 0.3761122524738312, + "learning_rate": 0.00016614357262103506, + "loss": 0.6962, + "step": 513 + }, + { + "epoch": 0.00018041875579329802, + "grad_norm": 0.4172234833240509, + "learning_rate": 0.00016607679465776293, + "loss": 0.4922, + "step": 514 + }, + { + "epoch": 0.00018076976504581416, + "grad_norm": 0.45372599363327026, + "learning_rate": 0.00016601001669449083, + "loss": 0.5804, + "step": 515 + }, + { + "epoch": 0.00018112077429833031, + "grad_norm": 0.3854759931564331, + "learning_rate": 0.0001659432387312187, + "loss": 0.6026, + "step": 516 + }, + { + "epoch": 0.00018147178355084646, + "grad_norm": 0.3399171829223633, + "learning_rate": 0.00016587646076794658, + "loss": 0.4773, + "step": 517 + }, + { + "epoch": 0.00018182279280336259, + "grad_norm": 0.36649778485298157, + "learning_rate": 0.00016580968280467445, + "loss": 0.59, + "step": 518 + }, + { + "epoch": 0.00018217380205587874, + "grad_norm": 0.39988765120506287, + "learning_rate": 0.00016574290484140233, + "loss": 0.6094, + "step": 519 + }, + { + "epoch": 0.00018252481130839489, + "grad_norm": 0.34659436345100403, + "learning_rate": 0.00016567612687813023, + "loss": 0.4832, + "step": 520 + }, + { + "epoch": 0.00018287582056091103, + "grad_norm": 0.3742654025554657, + "learning_rate": 0.0001656093489148581, + "loss": 0.413, + "step": 521 + }, + { + "epoch": 0.00018322682981342718, + "grad_norm": 0.43068456649780273, + "learning_rate": 0.00016554257095158597, + "loss": 0.6576, + "step": 522 + }, + { + "epoch": 0.0001835778390659433, + "grad_norm": 0.42455193400382996, + "learning_rate": 0.00016547579298831387, + "loss": 0.5897, + "step": 523 + }, + { + "epoch": 0.00018392884831845946, + "grad_norm": 0.3290526568889618, + "learning_rate": 0.00016540901502504175, + "loss": 0.4022, + "step": 524 + }, + { + "epoch": 0.0001842798575709756, + "grad_norm": 0.3744141161441803, + "learning_rate": 0.00016534223706176965, + "loss": 0.5577, + "step": 525 + }, + { + "epoch": 0.00018463086682349176, + "grad_norm": 0.3516618609428406, + "learning_rate": 0.00016527545909849752, + "loss": 0.5481, + "step": 526 + }, + { + "epoch": 0.0001849818760760079, + "grad_norm": 0.3591526448726654, + "learning_rate": 0.0001652086811352254, + "loss": 0.6339, + "step": 527 + }, + { + "epoch": 0.00018533288532852403, + "grad_norm": 0.4024425745010376, + "learning_rate": 0.00016514190317195327, + "loss": 0.5268, + "step": 528 + }, + { + "epoch": 0.00018568389458104018, + "grad_norm": 0.3502136766910553, + "learning_rate": 0.00016507512520868114, + "loss": 0.5112, + "step": 529 + }, + { + "epoch": 0.00018603490383355633, + "grad_norm": 0.3338727056980133, + "learning_rate": 0.00016500834724540904, + "loss": 0.5623, + "step": 530 + }, + { + "epoch": 0.00018638591308607248, + "grad_norm": 0.43554845452308655, + "learning_rate": 0.0001649415692821369, + "loss": 0.5853, + "step": 531 + }, + { + "epoch": 0.00018673692233858862, + "grad_norm": 0.34424322843551636, + "learning_rate": 0.00016487479131886478, + "loss": 0.4951, + "step": 532 + }, + { + "epoch": 0.00018708793159110475, + "grad_norm": 0.4424237012863159, + "learning_rate": 0.00016480801335559266, + "loss": 0.4576, + "step": 533 + }, + { + "epoch": 0.0001874389408436209, + "grad_norm": 0.4616681933403015, + "learning_rate": 0.00016474123539232053, + "loss": 0.4974, + "step": 534 + }, + { + "epoch": 0.00018778995009613705, + "grad_norm": 0.3599206507205963, + "learning_rate": 0.0001646744574290484, + "loss": 0.5987, + "step": 535 + }, + { + "epoch": 0.0001881409593486532, + "grad_norm": 0.40468478202819824, + "learning_rate": 0.0001646076794657763, + "loss": 0.5914, + "step": 536 + }, + { + "epoch": 0.00018849196860116935, + "grad_norm": 0.5389227271080017, + "learning_rate": 0.00016454090150250418, + "loss": 0.6459, + "step": 537 + }, + { + "epoch": 0.00018884297785368547, + "grad_norm": 0.3493568003177643, + "learning_rate": 0.00016447412353923205, + "loss": 0.5191, + "step": 538 + }, + { + "epoch": 0.00018919398710620162, + "grad_norm": 0.31237804889678955, + "learning_rate": 0.00016440734557595992, + "loss": 0.4819, + "step": 539 + }, + { + "epoch": 0.00018954499635871777, + "grad_norm": 0.31142041087150574, + "learning_rate": 0.00016434056761268782, + "loss": 0.5659, + "step": 540 + }, + { + "epoch": 0.00018989600561123392, + "grad_norm": 0.3323245644569397, + "learning_rate": 0.0001642737896494157, + "loss": 0.5779, + "step": 541 + }, + { + "epoch": 0.00019024701486375007, + "grad_norm": 0.3679036498069763, + "learning_rate": 0.0001642070116861436, + "loss": 0.6919, + "step": 542 + }, + { + "epoch": 0.0001905980241162662, + "grad_norm": 0.3094903528690338, + "learning_rate": 0.00016414023372287147, + "loss": 0.4773, + "step": 543 + }, + { + "epoch": 0.00019094903336878234, + "grad_norm": 0.37995582818984985, + "learning_rate": 0.00016407345575959934, + "loss": 0.539, + "step": 544 + }, + { + "epoch": 0.0001913000426212985, + "grad_norm": 0.46415746212005615, + "learning_rate": 0.00016400667779632722, + "loss": 0.6708, + "step": 545 + }, + { + "epoch": 0.00019165105187381464, + "grad_norm": 0.3479398190975189, + "learning_rate": 0.00016393989983305512, + "loss": 0.5496, + "step": 546 + }, + { + "epoch": 0.00019200206112633079, + "grad_norm": 0.3740891218185425, + "learning_rate": 0.000163873121869783, + "loss": 0.6256, + "step": 547 + }, + { + "epoch": 0.0001923530703788469, + "grad_norm": 0.4934074878692627, + "learning_rate": 0.00016380634390651086, + "loss": 0.6788, + "step": 548 + }, + { + "epoch": 0.00019270407963136306, + "grad_norm": 0.42659157514572144, + "learning_rate": 0.00016373956594323874, + "loss": 0.5981, + "step": 549 + }, + { + "epoch": 0.0001930550888838792, + "grad_norm": 0.35727575421333313, + "learning_rate": 0.0001636727879799666, + "loss": 0.4095, + "step": 550 + }, + { + "epoch": 0.00019340609813639536, + "grad_norm": 0.4294300377368927, + "learning_rate": 0.00016360601001669448, + "loss": 0.5386, + "step": 551 + }, + { + "epoch": 0.0001937571073889115, + "grad_norm": 0.33482253551483154, + "learning_rate": 0.00016353923205342238, + "loss": 0.4901, + "step": 552 + }, + { + "epoch": 0.00019410811664142763, + "grad_norm": 0.3379746079444885, + "learning_rate": 0.00016347245409015026, + "loss": 0.5454, + "step": 553 + }, + { + "epoch": 0.00019445912589394378, + "grad_norm": 0.42393919825553894, + "learning_rate": 0.00016340567612687813, + "loss": 0.5959, + "step": 554 + }, + { + "epoch": 0.00019481013514645993, + "grad_norm": 0.31975501775741577, + "learning_rate": 0.000163338898163606, + "loss": 0.6048, + "step": 555 + }, + { + "epoch": 0.00019516114439897608, + "grad_norm": 0.43404972553253174, + "learning_rate": 0.00016327212020033388, + "loss": 0.6252, + "step": 556 + }, + { + "epoch": 0.00019551215365149223, + "grad_norm": 0.3559292256832123, + "learning_rate": 0.00016320534223706178, + "loss": 0.6036, + "step": 557 + }, + { + "epoch": 0.00019586316290400835, + "grad_norm": 0.3134891092777252, + "learning_rate": 0.00016313856427378965, + "loss": 0.5656, + "step": 558 + }, + { + "epoch": 0.0001962141721565245, + "grad_norm": 0.32056671380996704, + "learning_rate": 0.00016307178631051755, + "loss": 0.6509, + "step": 559 + }, + { + "epoch": 0.00019656518140904065, + "grad_norm": 0.46249130368232727, + "learning_rate": 0.00016300500834724542, + "loss": 0.6379, + "step": 560 + }, + { + "epoch": 0.0001969161906615568, + "grad_norm": 0.36366966366767883, + "learning_rate": 0.0001629382303839733, + "loss": 0.5334, + "step": 561 + }, + { + "epoch": 0.00019726719991407295, + "grad_norm": 0.4234124422073364, + "learning_rate": 0.0001628714524207012, + "loss": 0.4864, + "step": 562 + }, + { + "epoch": 0.00019761820916658907, + "grad_norm": 0.3687801659107208, + "learning_rate": 0.00016280467445742907, + "loss": 0.4855, + "step": 563 + }, + { + "epoch": 0.00019796921841910522, + "grad_norm": 0.37247028946876526, + "learning_rate": 0.00016273789649415694, + "loss": 0.6215, + "step": 564 + }, + { + "epoch": 0.00019832022767162137, + "grad_norm": 0.30445635318756104, + "learning_rate": 0.00016267111853088482, + "loss": 0.5741, + "step": 565 + }, + { + "epoch": 0.00019867123692413752, + "grad_norm": 0.3349187970161438, + "learning_rate": 0.0001626043405676127, + "loss": 0.4524, + "step": 566 + }, + { + "epoch": 0.00019902224617665367, + "grad_norm": 0.36938101053237915, + "learning_rate": 0.00016253756260434056, + "loss": 0.5046, + "step": 567 + }, + { + "epoch": 0.0001993732554291698, + "grad_norm": 0.37673529982566833, + "learning_rate": 0.00016247078464106846, + "loss": 0.5001, + "step": 568 + }, + { + "epoch": 0.00019972426468168594, + "grad_norm": 0.3571556508541107, + "learning_rate": 0.00016240400667779634, + "loss": 0.6419, + "step": 569 + }, + { + "epoch": 0.0002000752739342021, + "grad_norm": 0.35543423891067505, + "learning_rate": 0.0001623372287145242, + "loss": 0.6191, + "step": 570 + }, + { + "epoch": 0.00020042628318671824, + "grad_norm": 0.3096729516983032, + "learning_rate": 0.00016227045075125208, + "loss": 0.5373, + "step": 571 + }, + { + "epoch": 0.0002007772924392344, + "grad_norm": 0.30310383439064026, + "learning_rate": 0.00016220367278797996, + "loss": 0.558, + "step": 572 + }, + { + "epoch": 0.0002011283016917505, + "grad_norm": 0.3616211712360382, + "learning_rate": 0.00016213689482470786, + "loss": 0.6504, + "step": 573 + }, + { + "epoch": 0.00020147931094426666, + "grad_norm": 0.34818220138549805, + "learning_rate": 0.00016207011686143573, + "loss": 0.6136, + "step": 574 + }, + { + "epoch": 0.0002018303201967828, + "grad_norm": 0.36225444078445435, + "learning_rate": 0.0001620033388981636, + "loss": 0.4905, + "step": 575 + }, + { + "epoch": 0.00020218132944929896, + "grad_norm": 0.40039536356925964, + "learning_rate": 0.0001619365609348915, + "loss": 0.5997, + "step": 576 + }, + { + "epoch": 0.0002025323387018151, + "grad_norm": 0.33715930581092834, + "learning_rate": 0.00016186978297161938, + "loss": 0.5284, + "step": 577 + }, + { + "epoch": 0.00020288334795433123, + "grad_norm": 0.4137067198753357, + "learning_rate": 0.00016180300500834728, + "loss": 0.6873, + "step": 578 + }, + { + "epoch": 0.00020323435720684738, + "grad_norm": 0.41598305106163025, + "learning_rate": 0.00016173622704507515, + "loss": 0.491, + "step": 579 + }, + { + "epoch": 0.00020358536645936353, + "grad_norm": 0.5466423034667969, + "learning_rate": 0.00016166944908180302, + "loss": 0.6188, + "step": 580 + }, + { + "epoch": 0.00020393637571187968, + "grad_norm": 0.3718060851097107, + "learning_rate": 0.0001616026711185309, + "loss": 0.5573, + "step": 581 + }, + { + "epoch": 0.00020428738496439583, + "grad_norm": 0.33747225999832153, + "learning_rate": 0.00016153589315525877, + "loss": 0.4887, + "step": 582 + }, + { + "epoch": 0.00020463839421691195, + "grad_norm": 0.36478081345558167, + "learning_rate": 0.00016146911519198664, + "loss": 0.553, + "step": 583 + }, + { + "epoch": 0.0002049894034694281, + "grad_norm": 0.38441962003707886, + "learning_rate": 0.00016140233722871454, + "loss": 0.4833, + "step": 584 + }, + { + "epoch": 0.00020534041272194425, + "grad_norm": 0.45594358444213867, + "learning_rate": 0.00016133555926544241, + "loss": 0.5877, + "step": 585 + }, + { + "epoch": 0.0002056914219744604, + "grad_norm": 0.356517493724823, + "learning_rate": 0.0001612687813021703, + "loss": 0.5614, + "step": 586 + }, + { + "epoch": 0.00020604243122697655, + "grad_norm": 0.4051963686943054, + "learning_rate": 0.00016120200333889816, + "loss": 0.5208, + "step": 587 + }, + { + "epoch": 0.00020639344047949267, + "grad_norm": 0.36947959661483765, + "learning_rate": 0.00016113522537562603, + "loss": 0.4385, + "step": 588 + }, + { + "epoch": 0.00020674444973200882, + "grad_norm": 0.45947200059890747, + "learning_rate": 0.00016106844741235393, + "loss": 0.4972, + "step": 589 + }, + { + "epoch": 0.00020709545898452497, + "grad_norm": 0.40610602498054504, + "learning_rate": 0.0001610016694490818, + "loss": 0.4022, + "step": 590 + }, + { + "epoch": 0.00020744646823704112, + "grad_norm": 0.3529384732246399, + "learning_rate": 0.00016093489148580968, + "loss": 0.5222, + "step": 591 + }, + { + "epoch": 0.00020779747748955727, + "grad_norm": 0.35114821791648865, + "learning_rate": 0.00016086811352253755, + "loss": 0.6224, + "step": 592 + }, + { + "epoch": 0.0002081484867420734, + "grad_norm": 0.3596336841583252, + "learning_rate": 0.00016080133555926545, + "loss": 0.5081, + "step": 593 + }, + { + "epoch": 0.00020849949599458954, + "grad_norm": 0.4214174747467041, + "learning_rate": 0.00016073455759599333, + "loss": 0.5189, + "step": 594 + }, + { + "epoch": 0.0002088505052471057, + "grad_norm": 0.39635175466537476, + "learning_rate": 0.00016066777963272123, + "loss": 0.582, + "step": 595 + }, + { + "epoch": 0.00020920151449962184, + "grad_norm": 0.36160576343536377, + "learning_rate": 0.0001606010016694491, + "loss": 0.568, + "step": 596 + }, + { + "epoch": 0.000209552523752138, + "grad_norm": 0.4242927134037018, + "learning_rate": 0.00016053422370617697, + "loss": 0.6235, + "step": 597 + }, + { + "epoch": 0.0002099035330046541, + "grad_norm": 0.4257853925228119, + "learning_rate": 0.00016046744574290485, + "loss": 0.5294, + "step": 598 + }, + { + "epoch": 0.00021025454225717026, + "grad_norm": 0.3890500068664551, + "learning_rate": 0.00016040066777963272, + "loss": 0.6224, + "step": 599 + }, + { + "epoch": 0.0002106055515096864, + "grad_norm": 0.2971879541873932, + "learning_rate": 0.00016033388981636062, + "loss": 0.5951, + "step": 600 + }, + { + "epoch": 0.00021095656076220256, + "grad_norm": 0.29551970958709717, + "learning_rate": 0.0001602671118530885, + "loss": 0.6713, + "step": 601 + }, + { + "epoch": 0.00021130757001471868, + "grad_norm": 0.31588122248649597, + "learning_rate": 0.00016020033388981637, + "loss": 0.6384, + "step": 602 + }, + { + "epoch": 0.00021165857926723483, + "grad_norm": 0.3138657510280609, + "learning_rate": 0.00016013355592654424, + "loss": 0.5846, + "step": 603 + }, + { + "epoch": 0.00021200958851975098, + "grad_norm": 0.31286585330963135, + "learning_rate": 0.0001600667779632721, + "loss": 0.6236, + "step": 604 + }, + { + "epoch": 0.00021236059777226713, + "grad_norm": 0.32098105549812317, + "learning_rate": 0.00016, + "loss": 0.4926, + "step": 605 + }, + { + "epoch": 0.00021271160702478328, + "grad_norm": 0.371427446603775, + "learning_rate": 0.00015993322203672789, + "loss": 0.6205, + "step": 606 + }, + { + "epoch": 0.0002130626162772994, + "grad_norm": 0.28764042258262634, + "learning_rate": 0.00015986644407345576, + "loss": 0.449, + "step": 607 + }, + { + "epoch": 0.00021341362552981555, + "grad_norm": 0.35086238384246826, + "learning_rate": 0.00015979966611018363, + "loss": 0.549, + "step": 608 + }, + { + "epoch": 0.0002137646347823317, + "grad_norm": 0.3118048906326294, + "learning_rate": 0.0001597328881469115, + "loss": 0.6037, + "step": 609 + }, + { + "epoch": 0.00021411564403484785, + "grad_norm": 0.3894517123699188, + "learning_rate": 0.0001596661101836394, + "loss": 0.5989, + "step": 610 + }, + { + "epoch": 0.000214466653287364, + "grad_norm": 0.39642322063446045, + "learning_rate": 0.00015959933222036728, + "loss": 0.566, + "step": 611 + }, + { + "epoch": 0.00021481766253988012, + "grad_norm": 0.35333508253097534, + "learning_rate": 0.00015953255425709518, + "loss": 0.5055, + "step": 612 + }, + { + "epoch": 0.00021516867179239627, + "grad_norm": 0.39200490713119507, + "learning_rate": 0.00015946577629382305, + "loss": 0.5951, + "step": 613 + }, + { + "epoch": 0.00021551968104491242, + "grad_norm": 0.38436442613601685, + "learning_rate": 0.00015939899833055093, + "loss": 0.4876, + "step": 614 + }, + { + "epoch": 0.00021587069029742857, + "grad_norm": 0.3397504389286041, + "learning_rate": 0.0001593322203672788, + "loss": 0.6287, + "step": 615 + }, + { + "epoch": 0.00021622169954994472, + "grad_norm": 0.35870012640953064, + "learning_rate": 0.0001592654424040067, + "loss": 0.5857, + "step": 616 + }, + { + "epoch": 0.00021657270880246084, + "grad_norm": 0.31163597106933594, + "learning_rate": 0.00015919866444073457, + "loss": 0.4831, + "step": 617 + }, + { + "epoch": 0.000216923718054977, + "grad_norm": 0.35106539726257324, + "learning_rate": 0.00015913188647746245, + "loss": 0.5776, + "step": 618 + }, + { + "epoch": 0.00021727472730749314, + "grad_norm": 0.3639923334121704, + "learning_rate": 0.00015906510851419032, + "loss": 0.5039, + "step": 619 + }, + { + "epoch": 0.0002176257365600093, + "grad_norm": 0.3622918128967285, + "learning_rate": 0.0001589983305509182, + "loss": 0.6293, + "step": 620 + }, + { + "epoch": 0.00021797674581252544, + "grad_norm": 0.3899349868297577, + "learning_rate": 0.0001589315525876461, + "loss": 0.567, + "step": 621 + }, + { + "epoch": 0.00021832775506504156, + "grad_norm": 0.3834361732006073, + "learning_rate": 0.00015886477462437397, + "loss": 0.5106, + "step": 622 + }, + { + "epoch": 0.0002186787643175577, + "grad_norm": 0.34996962547302246, + "learning_rate": 0.00015879799666110184, + "loss": 0.5155, + "step": 623 + }, + { + "epoch": 0.00021902977357007386, + "grad_norm": 0.47908079624176025, + "learning_rate": 0.0001587312186978297, + "loss": 0.4529, + "step": 624 + }, + { + "epoch": 0.00021938078282259, + "grad_norm": 0.3167901635169983, + "learning_rate": 0.00015866444073455758, + "loss": 0.6075, + "step": 625 + }, + { + "epoch": 0.00021973179207510616, + "grad_norm": 0.4254927337169647, + "learning_rate": 0.00015859766277128548, + "loss": 0.6404, + "step": 626 + }, + { + "epoch": 0.00022008280132762228, + "grad_norm": 0.4317469000816345, + "learning_rate": 0.00015853088480801336, + "loss": 0.5881, + "step": 627 + }, + { + "epoch": 0.00022043381058013843, + "grad_norm": 0.4441644251346588, + "learning_rate": 0.00015846410684474123, + "loss": 0.5864, + "step": 628 + }, + { + "epoch": 0.00022078481983265458, + "grad_norm": 0.37883102893829346, + "learning_rate": 0.00015839732888146913, + "loss": 0.5664, + "step": 629 + }, + { + "epoch": 0.00022113582908517073, + "grad_norm": 0.35548868775367737, + "learning_rate": 0.000158330550918197, + "loss": 0.5712, + "step": 630 + }, + { + "epoch": 0.00022148683833768688, + "grad_norm": 0.31588616967201233, + "learning_rate": 0.00015826377295492488, + "loss": 0.4856, + "step": 631 + }, + { + "epoch": 0.000221837847590203, + "grad_norm": 0.3186424672603607, + "learning_rate": 0.00015819699499165278, + "loss": 0.542, + "step": 632 + }, + { + "epoch": 0.00022218885684271915, + "grad_norm": 0.41098466515541077, + "learning_rate": 0.00015813021702838065, + "loss": 0.6311, + "step": 633 + }, + { + "epoch": 0.0002225398660952353, + "grad_norm": 0.413401335477829, + "learning_rate": 0.00015806343906510852, + "loss": 0.5036, + "step": 634 + }, + { + "epoch": 0.00022289087534775145, + "grad_norm": 0.34203773736953735, + "learning_rate": 0.0001579966611018364, + "loss": 0.5508, + "step": 635 + }, + { + "epoch": 0.0002232418846002676, + "grad_norm": 0.34416648745536804, + "learning_rate": 0.00015792988313856427, + "loss": 0.5442, + "step": 636 + }, + { + "epoch": 0.00022359289385278372, + "grad_norm": 0.3439941704273224, + "learning_rate": 0.00015786310517529217, + "loss": 0.4969, + "step": 637 + }, + { + "epoch": 0.00022394390310529987, + "grad_norm": 0.3547762930393219, + "learning_rate": 0.00015779632721202004, + "loss": 0.5564, + "step": 638 + }, + { + "epoch": 0.00022429491235781602, + "grad_norm": 0.35666894912719727, + "learning_rate": 0.00015772954924874792, + "loss": 0.4759, + "step": 639 + }, + { + "epoch": 0.00022464592161033217, + "grad_norm": 0.3175058364868164, + "learning_rate": 0.0001576627712854758, + "loss": 0.5708, + "step": 640 + }, + { + "epoch": 0.00022499693086284832, + "grad_norm": 0.4329943358898163, + "learning_rate": 0.00015759599332220366, + "loss": 0.5293, + "step": 641 + }, + { + "epoch": 0.00022534794011536444, + "grad_norm": 0.5703821778297424, + "learning_rate": 0.00015752921535893156, + "loss": 0.6187, + "step": 642 + }, + { + "epoch": 0.0002256989493678806, + "grad_norm": 0.32244032621383667, + "learning_rate": 0.00015746243739565944, + "loss": 0.4847, + "step": 643 + }, + { + "epoch": 0.00022604995862039674, + "grad_norm": 0.36224085092544556, + "learning_rate": 0.0001573956594323873, + "loss": 0.6804, + "step": 644 + }, + { + "epoch": 0.0002264009678729129, + "grad_norm": 0.3316931426525116, + "learning_rate": 0.0001573288814691152, + "loss": 0.6413, + "step": 645 + }, + { + "epoch": 0.00022675197712542904, + "grad_norm": 0.38156425952911377, + "learning_rate": 0.00015726210350584308, + "loss": 0.5659, + "step": 646 + }, + { + "epoch": 0.00022710298637794516, + "grad_norm": 0.48353493213653564, + "learning_rate": 0.00015719532554257096, + "loss": 0.5788, + "step": 647 + }, + { + "epoch": 0.00022745399563046131, + "grad_norm": 0.3913673758506775, + "learning_rate": 0.00015712854757929886, + "loss": 0.6899, + "step": 648 + }, + { + "epoch": 0.00022780500488297746, + "grad_norm": 0.46836981177330017, + "learning_rate": 0.00015706176961602673, + "loss": 0.5712, + "step": 649 + }, + { + "epoch": 0.0002281560141354936, + "grad_norm": 0.34713172912597656, + "learning_rate": 0.0001569949916527546, + "loss": 0.381, + "step": 650 + }, + { + "epoch": 0.00022850702338800976, + "grad_norm": 0.3837398886680603, + "learning_rate": 0.00015692821368948248, + "loss": 0.5236, + "step": 651 + }, + { + "epoch": 0.00022885803264052589, + "grad_norm": 0.5181556940078735, + "learning_rate": 0.00015686143572621035, + "loss": 0.5889, + "step": 652 + }, + { + "epoch": 0.00022920904189304203, + "grad_norm": 0.42713961005210876, + "learning_rate": 0.00015679465776293825, + "loss": 0.5346, + "step": 653 + }, + { + "epoch": 0.00022956005114555818, + "grad_norm": 0.2868479788303375, + "learning_rate": 0.00015672787979966612, + "loss": 0.5546, + "step": 654 + }, + { + "epoch": 0.00022991106039807433, + "grad_norm": 0.31901800632476807, + "learning_rate": 0.000156661101836394, + "loss": 0.5014, + "step": 655 + }, + { + "epoch": 0.00023026206965059048, + "grad_norm": 0.41681963205337524, + "learning_rate": 0.00015659432387312187, + "loss": 0.5709, + "step": 656 + }, + { + "epoch": 0.0002306130789031066, + "grad_norm": 0.5942090749740601, + "learning_rate": 0.00015652754590984974, + "loss": 0.6022, + "step": 657 + }, + { + "epoch": 0.00023096408815562276, + "grad_norm": 0.405391126871109, + "learning_rate": 0.00015646076794657764, + "loss": 0.5363, + "step": 658 + }, + { + "epoch": 0.0002313150974081389, + "grad_norm": 0.3201390206813812, + "learning_rate": 0.00015639398998330552, + "loss": 0.6045, + "step": 659 + }, + { + "epoch": 0.00023166610666065505, + "grad_norm": 0.2989407479763031, + "learning_rate": 0.0001563272120200334, + "loss": 0.5604, + "step": 660 + }, + { + "epoch": 0.0002320171159131712, + "grad_norm": 0.3919268548488617, + "learning_rate": 0.00015626043405676126, + "loss": 0.5413, + "step": 661 + }, + { + "epoch": 0.00023236812516568733, + "grad_norm": 0.4080122709274292, + "learning_rate": 0.00015619365609348916, + "loss": 0.498, + "step": 662 + }, + { + "epoch": 0.00023271913441820348, + "grad_norm": 0.38974156975746155, + "learning_rate": 0.00015612687813021704, + "loss": 0.6149, + "step": 663 + }, + { + "epoch": 0.00023307014367071962, + "grad_norm": 0.3145015835762024, + "learning_rate": 0.00015606010016694494, + "loss": 0.4886, + "step": 664 + }, + { + "epoch": 0.00023342115292323577, + "grad_norm": 0.3009328246116638, + "learning_rate": 0.0001559933222036728, + "loss": 0.5534, + "step": 665 + }, + { + "epoch": 0.00023377216217575192, + "grad_norm": 0.4774717092514038, + "learning_rate": 0.00015592654424040068, + "loss": 0.6006, + "step": 666 + }, + { + "epoch": 0.00023412317142826805, + "grad_norm": 0.32965418696403503, + "learning_rate": 0.00015585976627712856, + "loss": 0.5463, + "step": 667 + }, + { + "epoch": 0.0002344741806807842, + "grad_norm": 0.3066554665565491, + "learning_rate": 0.00015579298831385643, + "loss": 0.5675, + "step": 668 + }, + { + "epoch": 0.00023482518993330035, + "grad_norm": 0.3879207372665405, + "learning_rate": 0.00015572621035058433, + "loss": 0.5825, + "step": 669 + }, + { + "epoch": 0.0002351761991858165, + "grad_norm": 0.3171943128108978, + "learning_rate": 0.0001556594323873122, + "loss": 0.5677, + "step": 670 + }, + { + "epoch": 0.00023552720843833264, + "grad_norm": 0.36982622742652893, + "learning_rate": 0.00015559265442404007, + "loss": 0.5885, + "step": 671 + }, + { + "epoch": 0.00023587821769084877, + "grad_norm": 0.30437183380126953, + "learning_rate": 0.00015552587646076795, + "loss": 0.6288, + "step": 672 + }, + { + "epoch": 0.00023622922694336492, + "grad_norm": 0.30654504895210266, + "learning_rate": 0.00015545909849749582, + "loss": 0.5924, + "step": 673 + }, + { + "epoch": 0.00023658023619588107, + "grad_norm": 0.3771214783191681, + "learning_rate": 0.00015539232053422372, + "loss": 0.4901, + "step": 674 + }, + { + "epoch": 0.00023693124544839721, + "grad_norm": 0.3018699884414673, + "learning_rate": 0.0001553255425709516, + "loss": 0.6159, + "step": 675 + }, + { + "epoch": 0.00023728225470091336, + "grad_norm": 0.32899734377861023, + "learning_rate": 0.00015525876460767947, + "loss": 0.6197, + "step": 676 + }, + { + "epoch": 0.0002376332639534295, + "grad_norm": 0.31837883591651917, + "learning_rate": 0.00015519198664440734, + "loss": 0.5449, + "step": 677 + }, + { + "epoch": 0.00023798427320594564, + "grad_norm": 0.35326528549194336, + "learning_rate": 0.00015512520868113521, + "loss": 0.6315, + "step": 678 + }, + { + "epoch": 0.00023833528245846179, + "grad_norm": 0.3714829385280609, + "learning_rate": 0.00015505843071786311, + "loss": 0.6352, + "step": 679 + }, + { + "epoch": 0.00023868629171097794, + "grad_norm": 0.4002094864845276, + "learning_rate": 0.000154991652754591, + "loss": 0.4235, + "step": 680 + }, + { + "epoch": 0.00023903730096349408, + "grad_norm": 0.3382783532142639, + "learning_rate": 0.0001549248747913189, + "loss": 0.5476, + "step": 681 + }, + { + "epoch": 0.0002393883102160102, + "grad_norm": 0.2985747158527374, + "learning_rate": 0.00015485809682804676, + "loss": 0.5684, + "step": 682 + }, + { + "epoch": 0.00023973931946852636, + "grad_norm": 0.3288929760456085, + "learning_rate": 0.00015479131886477463, + "loss": 0.5657, + "step": 683 + }, + { + "epoch": 0.0002400903287210425, + "grad_norm": 0.39641210436820984, + "learning_rate": 0.0001547245409015025, + "loss": 0.6283, + "step": 684 + }, + { + "epoch": 0.00024044133797355866, + "grad_norm": 0.37413230538368225, + "learning_rate": 0.0001546577629382304, + "loss": 0.5778, + "step": 685 + }, + { + "epoch": 0.0002407923472260748, + "grad_norm": 0.28837504982948303, + "learning_rate": 0.00015459098497495828, + "loss": 0.5079, + "step": 686 + }, + { + "epoch": 0.00024114335647859093, + "grad_norm": 0.32851526141166687, + "learning_rate": 0.00015452420701168615, + "loss": 0.649, + "step": 687 + }, + { + "epoch": 0.00024149436573110708, + "grad_norm": 0.3848758637905121, + "learning_rate": 0.00015445742904841403, + "loss": 0.6099, + "step": 688 + }, + { + "epoch": 0.00024184537498362323, + "grad_norm": 0.35494935512542725, + "learning_rate": 0.0001543906510851419, + "loss": 0.6498, + "step": 689 + }, + { + "epoch": 0.00024219638423613938, + "grad_norm": 0.3431280553340912, + "learning_rate": 0.0001543238731218698, + "loss": 0.4934, + "step": 690 + }, + { + "epoch": 0.00024254739348865553, + "grad_norm": 0.33980974555015564, + "learning_rate": 0.00015425709515859767, + "loss": 0.5556, + "step": 691 + }, + { + "epoch": 0.00024289840274117165, + "grad_norm": 0.3086068034172058, + "learning_rate": 0.00015419031719532555, + "loss": 0.5955, + "step": 692 + }, + { + "epoch": 0.0002432494119936878, + "grad_norm": 0.33093178272247314, + "learning_rate": 0.00015412353923205342, + "loss": 0.5926, + "step": 693 + }, + { + "epoch": 0.00024360042124620395, + "grad_norm": 0.3660534620285034, + "learning_rate": 0.0001540567612687813, + "loss": 0.5494, + "step": 694 + }, + { + "epoch": 0.0002439514304987201, + "grad_norm": 0.29803964495658875, + "learning_rate": 0.0001539899833055092, + "loss": 0.6074, + "step": 695 + }, + { + "epoch": 0.00024430243975123625, + "grad_norm": 0.36542224884033203, + "learning_rate": 0.00015392320534223707, + "loss": 0.59, + "step": 696 + }, + { + "epoch": 0.00024465344900375237, + "grad_norm": 0.34015166759490967, + "learning_rate": 0.00015385642737896494, + "loss": 0.6029, + "step": 697 + }, + { + "epoch": 0.00024500445825626854, + "grad_norm": 0.3211725950241089, + "learning_rate": 0.00015378964941569284, + "loss": 0.535, + "step": 698 + }, + { + "epoch": 0.00024535546750878467, + "grad_norm": 0.37027183175086975, + "learning_rate": 0.0001537228714524207, + "loss": 0.6265, + "step": 699 + }, + { + "epoch": 0.0002457064767613008, + "grad_norm": 0.3447396159172058, + "learning_rate": 0.00015365609348914859, + "loss": 0.6061, + "step": 700 + }, + { + "epoch": 0.00024605748601381697, + "grad_norm": 0.3344075679779053, + "learning_rate": 0.00015358931552587649, + "loss": 0.5412, + "step": 701 + }, + { + "epoch": 0.0002464084952663331, + "grad_norm": 0.29049620032310486, + "learning_rate": 0.00015352253756260436, + "loss": 0.5137, + "step": 702 + }, + { + "epoch": 0.00024675950451884926, + "grad_norm": 0.37048932909965515, + "learning_rate": 0.00015345575959933223, + "loss": 0.6118, + "step": 703 + }, + { + "epoch": 0.0002471105137713654, + "grad_norm": 0.38212522864341736, + "learning_rate": 0.0001533889816360601, + "loss": 0.466, + "step": 704 + }, + { + "epoch": 0.0002474615230238815, + "grad_norm": 0.3576483428478241, + "learning_rate": 0.00015332220367278798, + "loss": 0.561, + "step": 705 + }, + { + "epoch": 0.0002478125322763977, + "grad_norm": 0.3550293743610382, + "learning_rate": 0.00015325542570951588, + "loss": 0.5634, + "step": 706 + }, + { + "epoch": 0.0002481635415289138, + "grad_norm": 0.362474650144577, + "learning_rate": 0.00015318864774624375, + "loss": 0.5608, + "step": 707 + }, + { + "epoch": 0.00024851455078143, + "grad_norm": 0.39463603496551514, + "learning_rate": 0.00015312186978297163, + "loss": 0.64, + "step": 708 + }, + { + "epoch": 0.0002488655600339461, + "grad_norm": 0.3456307649612427, + "learning_rate": 0.0001530550918196995, + "loss": 0.4631, + "step": 709 + }, + { + "epoch": 0.00024921656928646223, + "grad_norm": 0.3300929367542267, + "learning_rate": 0.00015298831385642737, + "loss": 0.3984, + "step": 710 + }, + { + "epoch": 0.0002495675785389784, + "grad_norm": 0.35923343896865845, + "learning_rate": 0.00015292153589315527, + "loss": 0.6003, + "step": 711 + }, + { + "epoch": 0.00024991858779149453, + "grad_norm": 0.4047611653804779, + "learning_rate": 0.00015285475792988315, + "loss": 0.5715, + "step": 712 + }, + { + "epoch": 0.0002502695970440107, + "grad_norm": 0.43539851903915405, + "learning_rate": 0.00015278797996661102, + "loss": 0.571, + "step": 713 + }, + { + "epoch": 0.00025062060629652683, + "grad_norm": 0.34745046496391296, + "learning_rate": 0.0001527212020033389, + "loss": 0.622, + "step": 714 + }, + { + "epoch": 0.00025097161554904295, + "grad_norm": 0.3130028247833252, + "learning_rate": 0.0001526544240400668, + "loss": 0.507, + "step": 715 + }, + { + "epoch": 0.0002513226248015591, + "grad_norm": 0.3093617558479309, + "learning_rate": 0.00015258764607679466, + "loss": 0.4951, + "step": 716 + }, + { + "epoch": 0.00025167363405407525, + "grad_norm": 0.34299540519714355, + "learning_rate": 0.00015252086811352257, + "loss": 0.539, + "step": 717 + }, + { + "epoch": 0.0002520246433065914, + "grad_norm": 0.32698413729667664, + "learning_rate": 0.00015245409015025044, + "loss": 0.4588, + "step": 718 + }, + { + "epoch": 0.00025237565255910755, + "grad_norm": 0.37853989005088806, + "learning_rate": 0.0001523873121869783, + "loss": 0.6227, + "step": 719 + }, + { + "epoch": 0.00025272666181162367, + "grad_norm": 0.32887300848960876, + "learning_rate": 0.00015232053422370618, + "loss": 0.5893, + "step": 720 + }, + { + "epoch": 0.00025307767106413985, + "grad_norm": 0.43352028727531433, + "learning_rate": 0.00015225375626043406, + "loss": 0.5811, + "step": 721 + }, + { + "epoch": 0.00025342868031665597, + "grad_norm": 0.42844903469085693, + "learning_rate": 0.00015218697829716196, + "loss": 0.6196, + "step": 722 + }, + { + "epoch": 0.00025377968956917215, + "grad_norm": 0.39929670095443726, + "learning_rate": 0.00015212020033388983, + "loss": 0.6722, + "step": 723 + }, + { + "epoch": 0.00025413069882168827, + "grad_norm": 0.5063486695289612, + "learning_rate": 0.0001520534223706177, + "loss": 0.6086, + "step": 724 + }, + { + "epoch": 0.0002544817080742044, + "grad_norm": 0.3625267446041107, + "learning_rate": 0.00015198664440734558, + "loss": 0.6331, + "step": 725 + }, + { + "epoch": 0.00025483271732672057, + "grad_norm": 0.3452700078487396, + "learning_rate": 0.00015191986644407345, + "loss": 0.5812, + "step": 726 + }, + { + "epoch": 0.0002551837265792367, + "grad_norm": 0.31915003061294556, + "learning_rate": 0.00015185308848080135, + "loss": 0.5653, + "step": 727 + }, + { + "epoch": 0.00025553473583175287, + "grad_norm": 0.3085877299308777, + "learning_rate": 0.00015178631051752922, + "loss": 0.4702, + "step": 728 + }, + { + "epoch": 0.000255885745084269, + "grad_norm": 0.31519320607185364, + "learning_rate": 0.0001517195325542571, + "loss": 0.5096, + "step": 729 + }, + { + "epoch": 0.0002562367543367851, + "grad_norm": 0.3637699782848358, + "learning_rate": 0.00015165275459098497, + "loss": 0.6001, + "step": 730 + }, + { + "epoch": 0.0002565877635893013, + "grad_norm": 0.34056970477104187, + "learning_rate": 0.00015158597662771284, + "loss": 0.5546, + "step": 731 + }, + { + "epoch": 0.0002569387728418174, + "grad_norm": 0.37110257148742676, + "learning_rate": 0.00015151919866444074, + "loss": 0.5612, + "step": 732 + }, + { + "epoch": 0.0002572897820943336, + "grad_norm": 0.35854101181030273, + "learning_rate": 0.00015145242070116862, + "loss": 0.6364, + "step": 733 + }, + { + "epoch": 0.0002576407913468497, + "grad_norm": 0.4340030252933502, + "learning_rate": 0.00015138564273789652, + "loss": 0.5772, + "step": 734 + }, + { + "epoch": 0.00025799180059936583, + "grad_norm": 0.3807721436023712, + "learning_rate": 0.0001513188647746244, + "loss": 0.4986, + "step": 735 + }, + { + "epoch": 0.000258342809851882, + "grad_norm": 0.3522527813911438, + "learning_rate": 0.00015125208681135226, + "loss": 0.5982, + "step": 736 + }, + { + "epoch": 0.00025869381910439813, + "grad_norm": 0.31251296401023865, + "learning_rate": 0.00015118530884808014, + "loss": 0.5239, + "step": 737 + }, + { + "epoch": 0.0002590448283569143, + "grad_norm": 0.3460885286331177, + "learning_rate": 0.00015111853088480804, + "loss": 0.5881, + "step": 738 + }, + { + "epoch": 0.00025939583760943043, + "grad_norm": 0.33298879861831665, + "learning_rate": 0.0001510517529215359, + "loss": 0.5272, + "step": 739 + }, + { + "epoch": 0.00025974684686194655, + "grad_norm": 0.351468950510025, + "learning_rate": 0.00015098497495826378, + "loss": 0.6049, + "step": 740 + }, + { + "epoch": 0.00026009785611446273, + "grad_norm": 0.3449242413043976, + "learning_rate": 0.00015091819699499166, + "loss": 0.5983, + "step": 741 + }, + { + "epoch": 0.00026044886536697885, + "grad_norm": 0.34724265336990356, + "learning_rate": 0.00015085141903171953, + "loss": 0.5292, + "step": 742 + }, + { + "epoch": 0.00026079987461949503, + "grad_norm": 0.3525671660900116, + "learning_rate": 0.00015078464106844743, + "loss": 0.5391, + "step": 743 + }, + { + "epoch": 0.00026115088387201115, + "grad_norm": 0.33959653973579407, + "learning_rate": 0.0001507178631051753, + "loss": 0.5898, + "step": 744 + }, + { + "epoch": 0.00026150189312452727, + "grad_norm": 0.5051225423812866, + "learning_rate": 0.00015065108514190318, + "loss": 0.5408, + "step": 745 + }, + { + "epoch": 0.00026185290237704345, + "grad_norm": 0.3298085629940033, + "learning_rate": 0.00015058430717863105, + "loss": 0.557, + "step": 746 + }, + { + "epoch": 0.00026220391162955957, + "grad_norm": 0.3375703990459442, + "learning_rate": 0.00015051752921535892, + "loss": 0.5541, + "step": 747 + }, + { + "epoch": 0.00026255492088207575, + "grad_norm": 0.27896445989608765, + "learning_rate": 0.0001504507512520868, + "loss": 0.5273, + "step": 748 + }, + { + "epoch": 0.00026290593013459187, + "grad_norm": 0.30591917037963867, + "learning_rate": 0.0001503839732888147, + "loss": 0.5988, + "step": 749 + }, + { + "epoch": 0.000263256939387108, + "grad_norm": 0.41014084219932556, + "learning_rate": 0.00015031719532554257, + "loss": 0.555, + "step": 750 + }, + { + "epoch": 0.00026360794863962417, + "grad_norm": 0.2935464084148407, + "learning_rate": 0.00015025041736227047, + "loss": 0.625, + "step": 751 + }, + { + "epoch": 0.0002639589578921403, + "grad_norm": 0.46361032128334045, + "learning_rate": 0.00015018363939899834, + "loss": 0.4753, + "step": 752 + }, + { + "epoch": 0.00026430996714465647, + "grad_norm": 0.35808300971984863, + "learning_rate": 0.00015011686143572622, + "loss": 0.5531, + "step": 753 + }, + { + "epoch": 0.0002646609763971726, + "grad_norm": 0.3411274254322052, + "learning_rate": 0.00015005008347245412, + "loss": 0.5577, + "step": 754 + }, + { + "epoch": 0.0002650119856496887, + "grad_norm": 0.34169328212738037, + "learning_rate": 0.000149983305509182, + "loss": 0.4856, + "step": 755 + }, + { + "epoch": 0.0002653629949022049, + "grad_norm": 0.38024139404296875, + "learning_rate": 0.00014991652754590986, + "loss": 0.5203, + "step": 756 + }, + { + "epoch": 0.000265714004154721, + "grad_norm": 0.35004425048828125, + "learning_rate": 0.00014984974958263774, + "loss": 0.4999, + "step": 757 + }, + { + "epoch": 0.0002660650134072372, + "grad_norm": 0.47526153922080994, + "learning_rate": 0.0001497829716193656, + "loss": 0.5503, + "step": 758 + }, + { + "epoch": 0.0002664160226597533, + "grad_norm": 0.35096925497055054, + "learning_rate": 0.0001497161936560935, + "loss": 0.5812, + "step": 759 + }, + { + "epoch": 0.00026676703191226943, + "grad_norm": 0.4505446255207062, + "learning_rate": 0.00014964941569282138, + "loss": 0.6069, + "step": 760 + }, + { + "epoch": 0.0002671180411647856, + "grad_norm": 0.3261663019657135, + "learning_rate": 0.00014958263772954926, + "loss": 0.5601, + "step": 761 + }, + { + "epoch": 0.00026746905041730173, + "grad_norm": 0.3397548794746399, + "learning_rate": 0.00014951585976627713, + "loss": 0.5572, + "step": 762 + }, + { + "epoch": 0.00026782005966981785, + "grad_norm": 0.35547688603401184, + "learning_rate": 0.000149449081803005, + "loss": 0.5983, + "step": 763 + }, + { + "epoch": 0.00026817106892233403, + "grad_norm": 0.41515079140663147, + "learning_rate": 0.00014938230383973287, + "loss": 0.6106, + "step": 764 + }, + { + "epoch": 0.00026852207817485015, + "grad_norm": 0.3840051591396332, + "learning_rate": 0.00014931552587646077, + "loss": 0.5328, + "step": 765 + }, + { + "epoch": 0.00026887308742736633, + "grad_norm": 0.3401285707950592, + "learning_rate": 0.00014924874791318865, + "loss": 0.4666, + "step": 766 + }, + { + "epoch": 0.00026922409667988245, + "grad_norm": 0.32983794808387756, + "learning_rate": 0.00014918196994991652, + "loss": 0.5214, + "step": 767 + }, + { + "epoch": 0.0002695751059323986, + "grad_norm": 0.30202198028564453, + "learning_rate": 0.00014911519198664442, + "loss": 0.4969, + "step": 768 + }, + { + "epoch": 0.00026992611518491475, + "grad_norm": 0.3222092092037201, + "learning_rate": 0.0001490484140233723, + "loss": 0.5093, + "step": 769 + }, + { + "epoch": 0.0002702771244374309, + "grad_norm": 0.4211997091770172, + "learning_rate": 0.0001489816360601002, + "loss": 0.6295, + "step": 770 + }, + { + "epoch": 0.00027062813368994705, + "grad_norm": 0.32112184166908264, + "learning_rate": 0.00014891485809682807, + "loss": 0.5611, + "step": 771 + }, + { + "epoch": 0.00027097914294246317, + "grad_norm": 0.3272956609725952, + "learning_rate": 0.00014884808013355594, + "loss": 0.6438, + "step": 772 + }, + { + "epoch": 0.0002713301521949793, + "grad_norm": 0.39423295855522156, + "learning_rate": 0.00014878130217028381, + "loss": 0.6029, + "step": 773 + }, + { + "epoch": 0.00027168116144749547, + "grad_norm": 0.3053528070449829, + "learning_rate": 0.0001487145242070117, + "loss": 0.4978, + "step": 774 + }, + { + "epoch": 0.0002720321707000116, + "grad_norm": 0.312774658203125, + "learning_rate": 0.0001486477462437396, + "loss": 0.5753, + "step": 775 + }, + { + "epoch": 0.00027238317995252777, + "grad_norm": 0.343964546918869, + "learning_rate": 0.00014858096828046746, + "loss": 0.5173, + "step": 776 + }, + { + "epoch": 0.0002727341892050439, + "grad_norm": 0.39104631543159485, + "learning_rate": 0.00014851419031719533, + "loss": 0.6381, + "step": 777 + }, + { + "epoch": 0.00027308519845756, + "grad_norm": 0.3958207070827484, + "learning_rate": 0.0001484474123539232, + "loss": 0.6046, + "step": 778 + }, + { + "epoch": 0.0002734362077100762, + "grad_norm": 0.36198097467422485, + "learning_rate": 0.00014838063439065108, + "loss": 0.6066, + "step": 779 + }, + { + "epoch": 0.0002737872169625923, + "grad_norm": 0.29619571566581726, + "learning_rate": 0.00014831385642737895, + "loss": 0.5131, + "step": 780 + }, + { + "epoch": 0.0002741382262151085, + "grad_norm": 0.344784677028656, + "learning_rate": 0.00014824707846410685, + "loss": 0.5626, + "step": 781 + }, + { + "epoch": 0.0002744892354676246, + "grad_norm": 0.35641250014305115, + "learning_rate": 0.00014818030050083473, + "loss": 0.5451, + "step": 782 + }, + { + "epoch": 0.00027484024472014074, + "grad_norm": 0.3496847152709961, + "learning_rate": 0.0001481135225375626, + "loss": 0.4814, + "step": 783 + }, + { + "epoch": 0.0002751912539726569, + "grad_norm": 0.3726658821105957, + "learning_rate": 0.00014804674457429047, + "loss": 0.6244, + "step": 784 + }, + { + "epoch": 0.00027554226322517303, + "grad_norm": 0.3317565619945526, + "learning_rate": 0.00014797996661101837, + "loss": 0.562, + "step": 785 + }, + { + "epoch": 0.0002758932724776892, + "grad_norm": 0.3478979468345642, + "learning_rate": 0.00014791318864774625, + "loss": 0.613, + "step": 786 + }, + { + "epoch": 0.00027624428173020533, + "grad_norm": 0.3572550415992737, + "learning_rate": 0.00014784641068447415, + "loss": 0.4841, + "step": 787 + }, + { + "epoch": 0.00027659529098272146, + "grad_norm": 0.34030210971832275, + "learning_rate": 0.00014777963272120202, + "loss": 0.4879, + "step": 788 + }, + { + "epoch": 0.00027694630023523763, + "grad_norm": 0.378203421831131, + "learning_rate": 0.0001477128547579299, + "loss": 0.6086, + "step": 789 + }, + { + "epoch": 0.00027729730948775375, + "grad_norm": 0.3390562832355499, + "learning_rate": 0.00014764607679465777, + "loss": 0.586, + "step": 790 + }, + { + "epoch": 0.00027764831874026993, + "grad_norm": 0.4986645579338074, + "learning_rate": 0.00014757929883138567, + "loss": 0.5592, + "step": 791 + }, + { + "epoch": 0.00027799932799278605, + "grad_norm": 0.3361869156360626, + "learning_rate": 0.00014751252086811354, + "loss": 0.4632, + "step": 792 + }, + { + "epoch": 0.0002783503372453022, + "grad_norm": 0.3726123571395874, + "learning_rate": 0.0001474457429048414, + "loss": 0.4915, + "step": 793 + }, + { + "epoch": 0.00027870134649781835, + "grad_norm": 0.3358845114707947, + "learning_rate": 0.00014737896494156929, + "loss": 0.5593, + "step": 794 + }, + { + "epoch": 0.0002790523557503345, + "grad_norm": 0.30473607778549194, + "learning_rate": 0.00014731218697829716, + "loss": 0.3672, + "step": 795 + }, + { + "epoch": 0.00027940336500285065, + "grad_norm": 0.33929023146629333, + "learning_rate": 0.00014724540901502506, + "loss": 0.5404, + "step": 796 + }, + { + "epoch": 0.0002797543742553668, + "grad_norm": 0.30778205394744873, + "learning_rate": 0.00014717863105175293, + "loss": 0.4379, + "step": 797 + }, + { + "epoch": 0.0002801053835078829, + "grad_norm": 0.286443829536438, + "learning_rate": 0.0001471118530884808, + "loss": 0.5579, + "step": 798 + }, + { + "epoch": 0.0002804563927603991, + "grad_norm": 0.4246799051761627, + "learning_rate": 0.00014704507512520868, + "loss": 0.536, + "step": 799 + }, + { + "epoch": 0.0002808074020129152, + "grad_norm": 0.4085538983345032, + "learning_rate": 0.00014697829716193655, + "loss": 0.5309, + "step": 800 + }, + { + "epoch": 0.00028115841126543137, + "grad_norm": 0.35396453738212585, + "learning_rate": 0.00014691151919866443, + "loss": 0.5307, + "step": 801 + }, + { + "epoch": 0.0002815094205179475, + "grad_norm": 0.45588648319244385, + "learning_rate": 0.00014684474123539233, + "loss": 0.5905, + "step": 802 + }, + { + "epoch": 0.0002818604297704636, + "grad_norm": 0.3353815972805023, + "learning_rate": 0.0001467779632721202, + "loss": 0.612, + "step": 803 + }, + { + "epoch": 0.0002822114390229798, + "grad_norm": 0.4152653217315674, + "learning_rate": 0.0001467111853088481, + "loss": 0.592, + "step": 804 + }, + { + "epoch": 0.0002825624482754959, + "grad_norm": 0.3651511073112488, + "learning_rate": 0.00014664440734557597, + "loss": 0.5909, + "step": 805 + }, + { + "epoch": 0.0002829134575280121, + "grad_norm": 0.3518235385417938, + "learning_rate": 0.00014657762938230385, + "loss": 0.5684, + "step": 806 + }, + { + "epoch": 0.0002832644667805282, + "grad_norm": 0.33562156558036804, + "learning_rate": 0.00014651085141903175, + "loss": 0.5165, + "step": 807 + }, + { + "epoch": 0.00028361547603304434, + "grad_norm": 0.3648052513599396, + "learning_rate": 0.00014644407345575962, + "loss": 0.5451, + "step": 808 + }, + { + "epoch": 0.0002839664852855605, + "grad_norm": 0.44342300295829773, + "learning_rate": 0.0001463772954924875, + "loss": 0.5907, + "step": 809 + }, + { + "epoch": 0.00028431749453807664, + "grad_norm": 0.33331966400146484, + "learning_rate": 0.00014631051752921536, + "loss": 0.4254, + "step": 810 + }, + { + "epoch": 0.0002846685037905928, + "grad_norm": 0.3444873094558716, + "learning_rate": 0.00014624373956594324, + "loss": 0.5201, + "step": 811 + }, + { + "epoch": 0.00028501951304310894, + "grad_norm": 0.4239615201950073, + "learning_rate": 0.00014617696160267114, + "loss": 0.5098, + "step": 812 + }, + { + "epoch": 0.00028537052229562506, + "grad_norm": 0.47895997762680054, + "learning_rate": 0.000146110183639399, + "loss": 0.6243, + "step": 813 + }, + { + "epoch": 0.00028572153154814123, + "grad_norm": 0.47322046756744385, + "learning_rate": 0.00014604340567612688, + "loss": 0.6841, + "step": 814 + }, + { + "epoch": 0.00028607254080065736, + "grad_norm": 0.35017871856689453, + "learning_rate": 0.00014597662771285476, + "loss": 0.5313, + "step": 815 + }, + { + "epoch": 0.00028642355005317353, + "grad_norm": 0.4342300295829773, + "learning_rate": 0.00014590984974958263, + "loss": 0.4363, + "step": 816 + }, + { + "epoch": 0.00028677455930568966, + "grad_norm": 0.2966228723526001, + "learning_rate": 0.0001458430717863105, + "loss": 0.6428, + "step": 817 + }, + { + "epoch": 0.0002871255685582058, + "grad_norm": 0.3320361375808716, + "learning_rate": 0.0001457762938230384, + "loss": 0.5266, + "step": 818 + }, + { + "epoch": 0.00028747657781072195, + "grad_norm": 0.3318590223789215, + "learning_rate": 0.00014570951585976628, + "loss": 0.5676, + "step": 819 + }, + { + "epoch": 0.0002878275870632381, + "grad_norm": 0.38573157787323, + "learning_rate": 0.00014564273789649415, + "loss": 0.7083, + "step": 820 + }, + { + "epoch": 0.00028817859631575425, + "grad_norm": 0.3731164038181305, + "learning_rate": 0.00014557595993322205, + "loss": 0.578, + "step": 821 + }, + { + "epoch": 0.0002885296055682704, + "grad_norm": 0.33610039949417114, + "learning_rate": 0.00014550918196994992, + "loss": 0.5923, + "step": 822 + }, + { + "epoch": 0.0002888806148207865, + "grad_norm": 0.3393179476261139, + "learning_rate": 0.00014544240400667782, + "loss": 0.5162, + "step": 823 + }, + { + "epoch": 0.0002892316240733027, + "grad_norm": 0.35552918910980225, + "learning_rate": 0.0001453756260434057, + "loss": 0.556, + "step": 824 + }, + { + "epoch": 0.0002895826333258188, + "grad_norm": 0.32425832748413086, + "learning_rate": 0.00014530884808013357, + "loss": 0.5157, + "step": 825 + }, + { + "epoch": 0.000289933642578335, + "grad_norm": 0.3353455662727356, + "learning_rate": 0.00014524207011686144, + "loss": 0.483, + "step": 826 + }, + { + "epoch": 0.0002902846518308511, + "grad_norm": 0.46254628896713257, + "learning_rate": 0.00014517529215358932, + "loss": 0.633, + "step": 827 + }, + { + "epoch": 0.0002906356610833672, + "grad_norm": 0.3275732100009918, + "learning_rate": 0.00014510851419031722, + "loss": 0.5502, + "step": 828 + }, + { + "epoch": 0.0002909866703358834, + "grad_norm": 0.3495190441608429, + "learning_rate": 0.0001450417362270451, + "loss": 0.368, + "step": 829 + }, + { + "epoch": 0.0002913376795883995, + "grad_norm": 0.35350501537323, + "learning_rate": 0.00014497495826377296, + "loss": 0.5819, + "step": 830 + }, + { + "epoch": 0.0002916886888409157, + "grad_norm": 0.37886378169059753, + "learning_rate": 0.00014490818030050084, + "loss": 0.5418, + "step": 831 + }, + { + "epoch": 0.0002920396980934318, + "grad_norm": 0.4279928505420685, + "learning_rate": 0.0001448414023372287, + "loss": 0.5199, + "step": 832 + }, + { + "epoch": 0.00029239070734594794, + "grad_norm": 0.33105382323265076, + "learning_rate": 0.00014477462437395658, + "loss": 0.5952, + "step": 833 + }, + { + "epoch": 0.0002927417165984641, + "grad_norm": 0.40114086866378784, + "learning_rate": 0.00014470784641068448, + "loss": 0.4611, + "step": 834 + }, + { + "epoch": 0.00029309272585098024, + "grad_norm": 0.3294037878513336, + "learning_rate": 0.00014464106844741236, + "loss": 0.5562, + "step": 835 + }, + { + "epoch": 0.0002934437351034964, + "grad_norm": 0.3391546607017517, + "learning_rate": 0.00014457429048414023, + "loss": 0.5748, + "step": 836 + }, + { + "epoch": 0.00029379474435601254, + "grad_norm": 0.4093922972679138, + "learning_rate": 0.0001445075125208681, + "loss": 0.4607, + "step": 837 + }, + { + "epoch": 0.00029414575360852866, + "grad_norm": 0.3331819176673889, + "learning_rate": 0.000144440734557596, + "loss": 0.5874, + "step": 838 + }, + { + "epoch": 0.00029449676286104484, + "grad_norm": 0.43205946683883667, + "learning_rate": 0.00014437395659432388, + "loss": 0.6152, + "step": 839 + }, + { + "epoch": 0.00029484777211356096, + "grad_norm": 0.36046868562698364, + "learning_rate": 0.00014430717863105178, + "loss": 0.4781, + "step": 840 + }, + { + "epoch": 0.00029519878136607713, + "grad_norm": 0.35514524579048157, + "learning_rate": 0.00014424040066777965, + "loss": 0.568, + "step": 841 + }, + { + "epoch": 0.00029554979061859326, + "grad_norm": 0.40260326862335205, + "learning_rate": 0.00014417362270450752, + "loss": 0.6075, + "step": 842 + }, + { + "epoch": 0.0002959007998711094, + "grad_norm": 0.3102671205997467, + "learning_rate": 0.0001441068447412354, + "loss": 0.4927, + "step": 843 + }, + { + "epoch": 0.00029625180912362556, + "grad_norm": 0.30940982699394226, + "learning_rate": 0.0001440400667779633, + "loss": 0.5549, + "step": 844 + }, + { + "epoch": 0.0002966028183761417, + "grad_norm": 0.3652762174606323, + "learning_rate": 0.00014397328881469117, + "loss": 0.6085, + "step": 845 + }, + { + "epoch": 0.00029695382762865786, + "grad_norm": 0.43056777119636536, + "learning_rate": 0.00014390651085141904, + "loss": 0.494, + "step": 846 + }, + { + "epoch": 0.000297304836881174, + "grad_norm": 0.3112967014312744, + "learning_rate": 0.00014383973288814692, + "loss": 0.5141, + "step": 847 + }, + { + "epoch": 0.0002976558461336901, + "grad_norm": 0.36729326844215393, + "learning_rate": 0.0001437729549248748, + "loss": 0.5435, + "step": 848 + }, + { + "epoch": 0.0002980068553862063, + "grad_norm": 0.3128114938735962, + "learning_rate": 0.00014370617696160266, + "loss": 0.5419, + "step": 849 + }, + { + "epoch": 0.0002983578646387224, + "grad_norm": 0.4030589163303375, + "learning_rate": 0.00014363939899833056, + "loss": 0.5959, + "step": 850 + }, + { + "epoch": 0.0002987088738912386, + "grad_norm": 0.39571288228034973, + "learning_rate": 0.00014357262103505844, + "loss": 0.6798, + "step": 851 + }, + { + "epoch": 0.0002990598831437547, + "grad_norm": 0.3388408422470093, + "learning_rate": 0.0001435058430717863, + "loss": 0.4887, + "step": 852 + }, + { + "epoch": 0.0002994108923962708, + "grad_norm": 0.39615562558174133, + "learning_rate": 0.00014343906510851418, + "loss": 0.5654, + "step": 853 + }, + { + "epoch": 0.000299761901648787, + "grad_norm": 0.3967401683330536, + "learning_rate": 0.00014337228714524205, + "loss": 0.6192, + "step": 854 + }, + { + "epoch": 0.0003001129109013031, + "grad_norm": 0.5597772002220154, + "learning_rate": 0.00014330550918196995, + "loss": 0.5808, + "step": 855 + }, + { + "epoch": 0.0003004639201538193, + "grad_norm": 0.36231061816215515, + "learning_rate": 0.00014323873121869783, + "loss": 0.4936, + "step": 856 + }, + { + "epoch": 0.0003008149294063354, + "grad_norm": 0.3775942027568817, + "learning_rate": 0.00014317195325542573, + "loss": 0.5706, + "step": 857 + }, + { + "epoch": 0.00030116593865885154, + "grad_norm": 0.4139408767223358, + "learning_rate": 0.0001431051752921536, + "loss": 0.5784, + "step": 858 + }, + { + "epoch": 0.0003015169479113677, + "grad_norm": 0.4101429879665375, + "learning_rate": 0.00014303839732888147, + "loss": 0.5937, + "step": 859 + }, + { + "epoch": 0.00030186795716388384, + "grad_norm": 0.5272162556648254, + "learning_rate": 0.00014297161936560937, + "loss": 0.5244, + "step": 860 + }, + { + "epoch": 0.0003022189664164, + "grad_norm": 0.3587292730808258, + "learning_rate": 0.00014290484140233725, + "loss": 0.6333, + "step": 861 + }, + { + "epoch": 0.00030256997566891614, + "grad_norm": 0.3284890353679657, + "learning_rate": 0.00014283806343906512, + "loss": 0.5414, + "step": 862 + }, + { + "epoch": 0.00030292098492143226, + "grad_norm": 0.414974182844162, + "learning_rate": 0.000142771285475793, + "loss": 0.6116, + "step": 863 + }, + { + "epoch": 0.00030327199417394844, + "grad_norm": 0.33619245886802673, + "learning_rate": 0.00014270450751252087, + "loss": 0.5506, + "step": 864 + }, + { + "epoch": 0.00030362300342646456, + "grad_norm": 0.45475640892982483, + "learning_rate": 0.00014263772954924874, + "loss": 0.6347, + "step": 865 + }, + { + "epoch": 0.00030397401267898074, + "grad_norm": 0.2695920765399933, + "learning_rate": 0.00014257095158597664, + "loss": 0.4529, + "step": 866 + }, + { + "epoch": 0.00030432502193149686, + "grad_norm": 0.3314480781555176, + "learning_rate": 0.00014250417362270451, + "loss": 0.5812, + "step": 867 + }, + { + "epoch": 0.000304676031184013, + "grad_norm": 0.31949582695961, + "learning_rate": 0.0001424373956594324, + "loss": 0.5213, + "step": 868 + }, + { + "epoch": 0.00030502704043652916, + "grad_norm": 0.34049752354621887, + "learning_rate": 0.00014237061769616026, + "loss": 0.4645, + "step": 869 + }, + { + "epoch": 0.0003053780496890453, + "grad_norm": 0.4304719567298889, + "learning_rate": 0.00014230383973288813, + "loss": 0.5065, + "step": 870 + }, + { + "epoch": 0.00030572905894156146, + "grad_norm": 0.32379043102264404, + "learning_rate": 0.00014223706176961603, + "loss": 0.553, + "step": 871 + }, + { + "epoch": 0.0003060800681940776, + "grad_norm": 0.33285439014434814, + "learning_rate": 0.0001421702838063439, + "loss": 0.5092, + "step": 872 + }, + { + "epoch": 0.0003064310774465937, + "grad_norm": 0.336795449256897, + "learning_rate": 0.00014210350584307178, + "loss": 0.4967, + "step": 873 + }, + { + "epoch": 0.0003067820866991099, + "grad_norm": 0.34653040766716003, + "learning_rate": 0.00014203672787979968, + "loss": 0.5353, + "step": 874 + }, + { + "epoch": 0.000307133095951626, + "grad_norm": 0.3352467715740204, + "learning_rate": 0.00014196994991652755, + "loss": 0.5594, + "step": 875 + }, + { + "epoch": 0.0003074841052041422, + "grad_norm": 0.38723453879356384, + "learning_rate": 0.00014190317195325545, + "loss": 0.5897, + "step": 876 + }, + { + "epoch": 0.0003078351144566583, + "grad_norm": 0.3987238109111786, + "learning_rate": 0.00014183639398998333, + "loss": 0.4647, + "step": 877 + }, + { + "epoch": 0.0003081861237091744, + "grad_norm": 0.3452693223953247, + "learning_rate": 0.0001417696160267112, + "loss": 0.5687, + "step": 878 + }, + { + "epoch": 0.0003085371329616906, + "grad_norm": 0.3561328649520874, + "learning_rate": 0.00014170283806343907, + "loss": 0.5845, + "step": 879 + }, + { + "epoch": 0.0003088881422142067, + "grad_norm": 0.29658418893814087, + "learning_rate": 0.00014163606010016695, + "loss": 0.5202, + "step": 880 + }, + { + "epoch": 0.0003092391514667229, + "grad_norm": 0.3908213973045349, + "learning_rate": 0.00014156928213689482, + "loss": 0.4439, + "step": 881 + }, + { + "epoch": 0.000309590160719239, + "grad_norm": 0.35816919803619385, + "learning_rate": 0.00014150250417362272, + "loss": 0.5384, + "step": 882 + }, + { + "epoch": 0.00030994116997175514, + "grad_norm": 0.3681255877017975, + "learning_rate": 0.0001414357262103506, + "loss": 0.5999, + "step": 883 + }, + { + "epoch": 0.0003102921792242713, + "grad_norm": 0.31137388944625854, + "learning_rate": 0.00014136894824707847, + "loss": 0.4495, + "step": 884 + }, + { + "epoch": 0.00031064318847678744, + "grad_norm": 0.2831423878669739, + "learning_rate": 0.00014130217028380634, + "loss": 0.4576, + "step": 885 + }, + { + "epoch": 0.0003109941977293036, + "grad_norm": 0.25953516364097595, + "learning_rate": 0.0001412353923205342, + "loss": 0.5606, + "step": 886 + }, + { + "epoch": 0.00031134520698181974, + "grad_norm": 0.31105297803878784, + "learning_rate": 0.0001411686143572621, + "loss": 0.5986, + "step": 887 + }, + { + "epoch": 0.00031169621623433586, + "grad_norm": 0.35177484154701233, + "learning_rate": 0.00014110183639398999, + "loss": 0.3394, + "step": 888 + }, + { + "epoch": 0.00031204722548685204, + "grad_norm": 0.373470276594162, + "learning_rate": 0.00014103505843071786, + "loss": 0.5862, + "step": 889 + }, + { + "epoch": 0.00031239823473936816, + "grad_norm": 0.37227189540863037, + "learning_rate": 0.00014096828046744576, + "loss": 0.4677, + "step": 890 + }, + { + "epoch": 0.00031274924399188434, + "grad_norm": 0.3799666464328766, + "learning_rate": 0.00014090150250417363, + "loss": 0.5255, + "step": 891 + }, + { + "epoch": 0.00031310025324440046, + "grad_norm": 0.3630129098892212, + "learning_rate": 0.00014083472454090153, + "loss": 0.5111, + "step": 892 + }, + { + "epoch": 0.0003134512624969166, + "grad_norm": 0.5131457448005676, + "learning_rate": 0.0001407679465776294, + "loss": 0.5207, + "step": 893 + }, + { + "epoch": 0.00031380227174943276, + "grad_norm": 0.3759867548942566, + "learning_rate": 0.00014070116861435728, + "loss": 0.6678, + "step": 894 + }, + { + "epoch": 0.0003141532810019489, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00014063439065108515, + "loss": 0.62, + "step": 895 + }, + { + "epoch": 0.00031450429025446506, + "grad_norm": 0.2789120376110077, + "learning_rate": 0.00014056761268781303, + "loss": 0.4204, + "step": 896 + }, + { + "epoch": 0.0003148552995069812, + "grad_norm": 0.2897239327430725, + "learning_rate": 0.0001405008347245409, + "loss": 0.432, + "step": 897 + }, + { + "epoch": 0.0003152063087594973, + "grad_norm": 0.3552323579788208, + "learning_rate": 0.0001404340567612688, + "loss": 0.5512, + "step": 898 + }, + { + "epoch": 0.0003155573180120135, + "grad_norm": 0.49963894486427307, + "learning_rate": 0.00014036727879799667, + "loss": 0.5868, + "step": 899 + }, + { + "epoch": 0.0003159083272645296, + "grad_norm": 0.37479934096336365, + "learning_rate": 0.00014030050083472454, + "loss": 0.6682, + "step": 900 + }, + { + "epoch": 0.0003162593365170458, + "grad_norm": 0.3415648639202118, + "learning_rate": 0.00014023372287145242, + "loss": 0.5301, + "step": 901 + }, + { + "epoch": 0.0003166103457695619, + "grad_norm": 0.37530943751335144, + "learning_rate": 0.0001401669449081803, + "loss": 0.5409, + "step": 902 + }, + { + "epoch": 0.000316961355022078, + "grad_norm": 0.37487658858299255, + "learning_rate": 0.0001401001669449082, + "loss": 0.5976, + "step": 903 + }, + { + "epoch": 0.0003173123642745942, + "grad_norm": 0.37174728512763977, + "learning_rate": 0.00014003338898163606, + "loss": 0.5933, + "step": 904 + }, + { + "epoch": 0.0003176633735271103, + "grad_norm": 0.491584450006485, + "learning_rate": 0.00013996661101836394, + "loss": 0.5112, + "step": 905 + }, + { + "epoch": 0.0003180143827796265, + "grad_norm": 0.38381487131118774, + "learning_rate": 0.0001398998330550918, + "loss": 0.6486, + "step": 906 + }, + { + "epoch": 0.0003183653920321426, + "grad_norm": 0.2867659330368042, + "learning_rate": 0.0001398330550918197, + "loss": 0.5033, + "step": 907 + }, + { + "epoch": 0.00031871640128465874, + "grad_norm": 0.3146355450153351, + "learning_rate": 0.00013976627712854758, + "loss": 0.5878, + "step": 908 + }, + { + "epoch": 0.0003190674105371749, + "grad_norm": 0.3454856276512146, + "learning_rate": 0.00013969949916527548, + "loss": 0.4751, + "step": 909 + }, + { + "epoch": 0.00031941841978969104, + "grad_norm": 0.32241204380989075, + "learning_rate": 0.00013963272120200336, + "loss": 0.6378, + "step": 910 + }, + { + "epoch": 0.0003197694290422072, + "grad_norm": 0.33703315258026123, + "learning_rate": 0.00013956594323873123, + "loss": 0.4634, + "step": 911 + }, + { + "epoch": 0.00032012043829472334, + "grad_norm": 0.3781648576259613, + "learning_rate": 0.0001394991652754591, + "loss": 0.5218, + "step": 912 + }, + { + "epoch": 0.00032047144754723946, + "grad_norm": 0.4124391973018646, + "learning_rate": 0.00013943238731218698, + "loss": 0.4958, + "step": 913 + }, + { + "epoch": 0.00032082245679975564, + "grad_norm": 0.3970220685005188, + "learning_rate": 0.00013936560934891488, + "loss": 0.5624, + "step": 914 + }, + { + "epoch": 0.00032117346605227176, + "grad_norm": 0.43682703375816345, + "learning_rate": 0.00013929883138564275, + "loss": 0.544, + "step": 915 + }, + { + "epoch": 0.00032152447530478794, + "grad_norm": 0.3476586639881134, + "learning_rate": 0.00013923205342237062, + "loss": 0.4418, + "step": 916 + }, + { + "epoch": 0.00032187548455730406, + "grad_norm": 0.36963552236557007, + "learning_rate": 0.0001391652754590985, + "loss": 0.5946, + "step": 917 + }, + { + "epoch": 0.0003222264938098202, + "grad_norm": 0.3445582985877991, + "learning_rate": 0.00013909849749582637, + "loss": 0.5879, + "step": 918 + }, + { + "epoch": 0.00032257750306233636, + "grad_norm": 0.39813530445098877, + "learning_rate": 0.00013903171953255427, + "loss": 0.5759, + "step": 919 + }, + { + "epoch": 0.0003229285123148525, + "grad_norm": 0.3314265012741089, + "learning_rate": 0.00013896494156928214, + "loss": 0.6165, + "step": 920 + }, + { + "epoch": 0.00032327952156736866, + "grad_norm": 0.4094330072402954, + "learning_rate": 0.00013889816360601002, + "loss": 0.5787, + "step": 921 + }, + { + "epoch": 0.0003236305308198848, + "grad_norm": 0.36821484565734863, + "learning_rate": 0.0001388313856427379, + "loss": 0.5303, + "step": 922 + }, + { + "epoch": 0.0003239815400724009, + "grad_norm": 0.3517453968524933, + "learning_rate": 0.00013876460767946576, + "loss": 0.4586, + "step": 923 + }, + { + "epoch": 0.0003243325493249171, + "grad_norm": 0.2959018647670746, + "learning_rate": 0.00013869782971619366, + "loss": 0.5225, + "step": 924 + }, + { + "epoch": 0.0003246835585774332, + "grad_norm": 0.3286895751953125, + "learning_rate": 0.00013863105175292154, + "loss": 0.5353, + "step": 925 + }, + { + "epoch": 0.0003250345678299494, + "grad_norm": 0.3328275680541992, + "learning_rate": 0.00013856427378964944, + "loss": 0.5915, + "step": 926 + }, + { + "epoch": 0.0003253855770824655, + "grad_norm": 0.3400813937187195, + "learning_rate": 0.0001384974958263773, + "loss": 0.4598, + "step": 927 + }, + { + "epoch": 0.0003257365863349816, + "grad_norm": 0.2876541018486023, + "learning_rate": 0.00013843071786310518, + "loss": 0.4835, + "step": 928 + }, + { + "epoch": 0.0003260875955874978, + "grad_norm": 0.3401765525341034, + "learning_rate": 0.00013836393989983308, + "loss": 0.56, + "step": 929 + }, + { + "epoch": 0.0003264386048400139, + "grad_norm": 0.34506598114967346, + "learning_rate": 0.00013829716193656096, + "loss": 0.6234, + "step": 930 + }, + { + "epoch": 0.0003267896140925301, + "grad_norm": 0.33732855319976807, + "learning_rate": 0.00013823038397328883, + "loss": 0.5686, + "step": 931 + }, + { + "epoch": 0.0003271406233450462, + "grad_norm": 0.34300100803375244, + "learning_rate": 0.0001381636060100167, + "loss": 0.6091, + "step": 932 + }, + { + "epoch": 0.00032749163259756235, + "grad_norm": 0.30349200963974, + "learning_rate": 0.00013809682804674458, + "loss": 0.4836, + "step": 933 + }, + { + "epoch": 0.0003278426418500785, + "grad_norm": 0.35742175579071045, + "learning_rate": 0.00013803005008347245, + "loss": 0.6443, + "step": 934 + }, + { + "epoch": 0.00032819365110259464, + "grad_norm": 0.33582496643066406, + "learning_rate": 0.00013796327212020035, + "loss": 0.6361, + "step": 935 + }, + { + "epoch": 0.0003285446603551108, + "grad_norm": 0.33403804898262024, + "learning_rate": 0.00013789649415692822, + "loss": 0.5911, + "step": 936 + }, + { + "epoch": 0.00032889566960762694, + "grad_norm": 0.4263191521167755, + "learning_rate": 0.0001378297161936561, + "loss": 0.5243, + "step": 937 + }, + { + "epoch": 0.00032924667886014307, + "grad_norm": 0.31543296575546265, + "learning_rate": 0.00013776293823038397, + "loss": 0.554, + "step": 938 + }, + { + "epoch": 0.00032959768811265924, + "grad_norm": 0.38975203037261963, + "learning_rate": 0.00013769616026711184, + "loss": 0.5358, + "step": 939 + }, + { + "epoch": 0.00032994869736517536, + "grad_norm": 0.3175157904624939, + "learning_rate": 0.00013762938230383974, + "loss": 0.5385, + "step": 940 + }, + { + "epoch": 0.00033029970661769154, + "grad_norm": 0.32753151655197144, + "learning_rate": 0.00013756260434056762, + "loss": 0.5191, + "step": 941 + }, + { + "epoch": 0.00033065071587020766, + "grad_norm": 0.2516227066516876, + "learning_rate": 0.0001374958263772955, + "loss": 0.3496, + "step": 942 + }, + { + "epoch": 0.0003310017251227238, + "grad_norm": 0.275806188583374, + "learning_rate": 0.0001374290484140234, + "loss": 0.4197, + "step": 943 + }, + { + "epoch": 0.00033135273437523996, + "grad_norm": 0.30234864354133606, + "learning_rate": 0.00013736227045075126, + "loss": 0.4909, + "step": 944 + }, + { + "epoch": 0.0003317037436277561, + "grad_norm": 0.32561683654785156, + "learning_rate": 0.00013729549248747916, + "loss": 0.5865, + "step": 945 + }, + { + "epoch": 0.00033205475288027226, + "grad_norm": 0.32075145840644836, + "learning_rate": 0.00013722871452420704, + "loss": 0.5957, + "step": 946 + }, + { + "epoch": 0.0003324057621327884, + "grad_norm": 0.3077705204486847, + "learning_rate": 0.0001371619365609349, + "loss": 0.6026, + "step": 947 + }, + { + "epoch": 0.0003327567713853045, + "grad_norm": 0.3092177212238312, + "learning_rate": 0.00013709515859766278, + "loss": 0.553, + "step": 948 + }, + { + "epoch": 0.0003331077806378207, + "grad_norm": 0.3611501157283783, + "learning_rate": 0.00013702838063439065, + "loss": 0.5707, + "step": 949 + }, + { + "epoch": 0.0003334587898903368, + "grad_norm": 0.3343827724456787, + "learning_rate": 0.00013696160267111853, + "loss": 0.5626, + "step": 950 + }, + { + "epoch": 0.000333809799142853, + "grad_norm": 0.3330281376838684, + "learning_rate": 0.00013689482470784643, + "loss": 0.6353, + "step": 951 + }, + { + "epoch": 0.0003341608083953691, + "grad_norm": 0.4045816957950592, + "learning_rate": 0.0001368280467445743, + "loss": 0.5781, + "step": 952 + }, + { + "epoch": 0.0003345118176478852, + "grad_norm": 0.3618166446685791, + "learning_rate": 0.00013676126878130217, + "loss": 0.6702, + "step": 953 + }, + { + "epoch": 0.0003348628269004014, + "grad_norm": 0.2836553752422333, + "learning_rate": 0.00013669449081803005, + "loss": 0.4371, + "step": 954 + }, + { + "epoch": 0.0003352138361529175, + "grad_norm": 0.3100498914718628, + "learning_rate": 0.00013662771285475792, + "loss": 0.5184, + "step": 955 + }, + { + "epoch": 0.0003355648454054337, + "grad_norm": 0.34877723455429077, + "learning_rate": 0.00013656093489148582, + "loss": 0.4778, + "step": 956 + }, + { + "epoch": 0.0003359158546579498, + "grad_norm": 0.27756938338279724, + "learning_rate": 0.0001364941569282137, + "loss": 0.4314, + "step": 957 + }, + { + "epoch": 0.00033626686391046595, + "grad_norm": 0.36129051446914673, + "learning_rate": 0.00013642737896494157, + "loss": 0.5837, + "step": 958 + }, + { + "epoch": 0.0003366178731629821, + "grad_norm": 0.35625776648521423, + "learning_rate": 0.00013636060100166944, + "loss": 0.5579, + "step": 959 + }, + { + "epoch": 0.00033696888241549825, + "grad_norm": 0.3735104501247406, + "learning_rate": 0.00013629382303839734, + "loss": 0.5283, + "step": 960 + }, + { + "epoch": 0.0003373198916680144, + "grad_norm": 0.34185606241226196, + "learning_rate": 0.00013622704507512521, + "loss": 0.5669, + "step": 961 + }, + { + "epoch": 0.00033767090092053054, + "grad_norm": 0.29324260354042053, + "learning_rate": 0.00013616026711185311, + "loss": 0.4468, + "step": 962 + }, + { + "epoch": 0.00033802191017304667, + "grad_norm": 0.3439052700996399, + "learning_rate": 0.000136093489148581, + "loss": 0.5196, + "step": 963 + }, + { + "epoch": 0.00033837291942556284, + "grad_norm": 0.3536570370197296, + "learning_rate": 0.00013602671118530886, + "loss": 0.5251, + "step": 964 + }, + { + "epoch": 0.00033872392867807897, + "grad_norm": 0.4759911298751831, + "learning_rate": 0.00013595993322203673, + "loss": 0.7017, + "step": 965 + }, + { + "epoch": 0.00033907493793059514, + "grad_norm": 0.2958674728870392, + "learning_rate": 0.0001358931552587646, + "loss": 0.4936, + "step": 966 + }, + { + "epoch": 0.00033942594718311126, + "grad_norm": 0.32770562171936035, + "learning_rate": 0.0001358263772954925, + "loss": 0.5741, + "step": 967 + }, + { + "epoch": 0.0003397769564356274, + "grad_norm": 0.35697153210639954, + "learning_rate": 0.00013575959933222038, + "loss": 0.428, + "step": 968 + }, + { + "epoch": 0.00034012796568814356, + "grad_norm": 0.3409043252468109, + "learning_rate": 0.00013569282136894825, + "loss": 0.6142, + "step": 969 + }, + { + "epoch": 0.0003404789749406597, + "grad_norm": 0.47055551409721375, + "learning_rate": 0.00013562604340567613, + "loss": 0.463, + "step": 970 + }, + { + "epoch": 0.00034082998419317586, + "grad_norm": 0.38270413875579834, + "learning_rate": 0.000135559265442404, + "loss": 0.462, + "step": 971 + }, + { + "epoch": 0.000341180993445692, + "grad_norm": 0.26209867000579834, + "learning_rate": 0.0001354924874791319, + "loss": 0.5341, + "step": 972 + }, + { + "epoch": 0.0003415320026982081, + "grad_norm": 0.37498748302459717, + "learning_rate": 0.00013542570951585977, + "loss": 0.5196, + "step": 973 + }, + { + "epoch": 0.0003418830119507243, + "grad_norm": 0.36789608001708984, + "learning_rate": 0.00013535893155258765, + "loss": 0.4723, + "step": 974 + }, + { + "epoch": 0.0003422340212032404, + "grad_norm": 0.33915975689888, + "learning_rate": 0.00013529215358931552, + "loss": 0.5511, + "step": 975 + }, + { + "epoch": 0.0003425850304557566, + "grad_norm": 0.43045058846473694, + "learning_rate": 0.0001352253756260434, + "loss": 0.5667, + "step": 976 + }, + { + "epoch": 0.0003429360397082727, + "grad_norm": 0.2948949933052063, + "learning_rate": 0.0001351585976627713, + "loss": 0.4804, + "step": 977 + }, + { + "epoch": 0.00034328704896078883, + "grad_norm": 0.3249470889568329, + "learning_rate": 0.00013509181969949917, + "loss": 0.6041, + "step": 978 + }, + { + "epoch": 0.000343638058213305, + "grad_norm": 0.2865908741950989, + "learning_rate": 0.00013502504173622707, + "loss": 0.5617, + "step": 979 + }, + { + "epoch": 0.0003439890674658211, + "grad_norm": 0.3190818428993225, + "learning_rate": 0.00013495826377295494, + "loss": 0.4902, + "step": 980 + }, + { + "epoch": 0.00034434007671833725, + "grad_norm": 0.3111664950847626, + "learning_rate": 0.0001348914858096828, + "loss": 0.5504, + "step": 981 + }, + { + "epoch": 0.0003446910859708534, + "grad_norm": 0.3255857229232788, + "learning_rate": 0.00013482470784641069, + "loss": 0.5592, + "step": 982 + }, + { + "epoch": 0.00034504209522336955, + "grad_norm": 0.30806589126586914, + "learning_rate": 0.00013475792988313859, + "loss": 0.5567, + "step": 983 + }, + { + "epoch": 0.0003453931044758857, + "grad_norm": 0.33785945177078247, + "learning_rate": 0.00013469115191986646, + "loss": 0.5881, + "step": 984 + }, + { + "epoch": 0.00034574411372840185, + "grad_norm": 0.34626781940460205, + "learning_rate": 0.00013462437395659433, + "loss": 0.578, + "step": 985 + }, + { + "epoch": 0.00034609512298091797, + "grad_norm": 0.367034912109375, + "learning_rate": 0.0001345575959933222, + "loss": 0.5893, + "step": 986 + }, + { + "epoch": 0.00034644613223343415, + "grad_norm": 0.37824952602386475, + "learning_rate": 0.00013449081803005008, + "loss": 0.5681, + "step": 987 + }, + { + "epoch": 0.00034679714148595027, + "grad_norm": 0.4054035544395447, + "learning_rate": 0.00013442404006677798, + "loss": 0.6108, + "step": 988 + }, + { + "epoch": 0.00034714815073846645, + "grad_norm": 0.4374067485332489, + "learning_rate": 0.00013435726210350585, + "loss": 0.6002, + "step": 989 + }, + { + "epoch": 0.00034749915999098257, + "grad_norm": 0.3554278016090393, + "learning_rate": 0.00013429048414023373, + "loss": 0.6444, + "step": 990 + }, + { + "epoch": 0.0003478501692434987, + "grad_norm": 0.3428646922111511, + "learning_rate": 0.0001342237061769616, + "loss": 0.6527, + "step": 991 + }, + { + "epoch": 0.00034820117849601487, + "grad_norm": 0.25603657960891724, + "learning_rate": 0.00013415692821368947, + "loss": 0.5244, + "step": 992 + }, + { + "epoch": 0.000348552187748531, + "grad_norm": 0.35237595438957214, + "learning_rate": 0.00013409015025041737, + "loss": 0.557, + "step": 993 + }, + { + "epoch": 0.00034890319700104717, + "grad_norm": 0.33666110038757324, + "learning_rate": 0.00013402337228714524, + "loss": 0.5674, + "step": 994 + }, + { + "epoch": 0.0003492542062535633, + "grad_norm": 0.30283182859420776, + "learning_rate": 0.00013395659432387312, + "loss": 0.6081, + "step": 995 + }, + { + "epoch": 0.0003496052155060794, + "grad_norm": 0.30893146991729736, + "learning_rate": 0.00013388981636060102, + "loss": 0.6089, + "step": 996 + }, + { + "epoch": 0.0003499562247585956, + "grad_norm": 0.2617473304271698, + "learning_rate": 0.0001338230383973289, + "loss": 0.6104, + "step": 997 + }, + { + "epoch": 0.0003503072340111117, + "grad_norm": 0.29493093490600586, + "learning_rate": 0.00013375626043405676, + "loss": 0.5047, + "step": 998 + }, + { + "epoch": 0.0003506582432636279, + "grad_norm": 0.3991663157939911, + "learning_rate": 0.00013368948247078466, + "loss": 0.5137, + "step": 999 + }, + { + "epoch": 0.000351009252516144, + "grad_norm": 0.31760329008102417, + "learning_rate": 0.00013362270450751254, + "loss": 0.4371, + "step": 1000 + }, + { + "epoch": 0.00035136026176866013, + "grad_norm": 0.35144907236099243, + "learning_rate": 0.0001335559265442404, + "loss": 0.5085, + "step": 1001 + }, + { + "epoch": 0.0003517112710211763, + "grad_norm": 0.3597724735736847, + "learning_rate": 0.00013348914858096828, + "loss": 0.593, + "step": 1002 + }, + { + "epoch": 0.00035206228027369243, + "grad_norm": 0.33647072315216064, + "learning_rate": 0.00013342237061769616, + "loss": 0.6011, + "step": 1003 + }, + { + "epoch": 0.0003524132895262086, + "grad_norm": 0.3377489745616913, + "learning_rate": 0.00013335559265442406, + "loss": 0.6285, + "step": 1004 + }, + { + "epoch": 0.00035276429877872473, + "grad_norm": 0.3210775852203369, + "learning_rate": 0.00013328881469115193, + "loss": 0.5214, + "step": 1005 + }, + { + "epoch": 0.00035311530803124085, + "grad_norm": 0.33832573890686035, + "learning_rate": 0.0001332220367278798, + "loss": 0.5788, + "step": 1006 + }, + { + "epoch": 0.00035346631728375703, + "grad_norm": 0.3025464117527008, + "learning_rate": 0.00013315525876460768, + "loss": 0.3762, + "step": 1007 + }, + { + "epoch": 0.00035381732653627315, + "grad_norm": 0.33917921781539917, + "learning_rate": 0.00013308848080133555, + "loss": 0.5816, + "step": 1008 + }, + { + "epoch": 0.0003541683357887893, + "grad_norm": 0.3070494830608368, + "learning_rate": 0.00013302170283806345, + "loss": 0.522, + "step": 1009 + }, + { + "epoch": 0.00035451934504130545, + "grad_norm": 0.31389573216438293, + "learning_rate": 0.00013295492487479132, + "loss": 0.5966, + "step": 1010 + }, + { + "epoch": 0.00035487035429382157, + "grad_norm": 0.33663564920425415, + "learning_rate": 0.0001328881469115192, + "loss": 0.5857, + "step": 1011 + }, + { + "epoch": 0.00035522136354633775, + "grad_norm": 0.3280203640460968, + "learning_rate": 0.00013282136894824707, + "loss": 0.562, + "step": 1012 + }, + { + "epoch": 0.00035557237279885387, + "grad_norm": 0.3307760953903198, + "learning_rate": 0.00013275459098497497, + "loss": 0.6258, + "step": 1013 + }, + { + "epoch": 0.00035592338205137005, + "grad_norm": 0.34378358721733093, + "learning_rate": 0.00013268781302170284, + "loss": 0.5026, + "step": 1014 + }, + { + "epoch": 0.00035627439130388617, + "grad_norm": 0.32818603515625, + "learning_rate": 0.00013262103505843074, + "loss": 0.513, + "step": 1015 + }, + { + "epoch": 0.0003566254005564023, + "grad_norm": 0.3015523850917816, + "learning_rate": 0.00013255425709515862, + "loss": 0.5448, + "step": 1016 + }, + { + "epoch": 0.00035697640980891847, + "grad_norm": 0.2927173674106598, + "learning_rate": 0.0001324874791318865, + "loss": 0.6565, + "step": 1017 + }, + { + "epoch": 0.0003573274190614346, + "grad_norm": 0.3502102196216583, + "learning_rate": 0.00013242070116861436, + "loss": 0.6235, + "step": 1018 + }, + { + "epoch": 0.00035767842831395077, + "grad_norm": 0.32151371240615845, + "learning_rate": 0.00013235392320534224, + "loss": 0.5613, + "step": 1019 + }, + { + "epoch": 0.0003580294375664669, + "grad_norm": 0.31253233551979065, + "learning_rate": 0.00013228714524207014, + "loss": 0.4744, + "step": 1020 + }, + { + "epoch": 0.000358380446818983, + "grad_norm": 0.2831304669380188, + "learning_rate": 0.000132220367278798, + "loss": 0.5385, + "step": 1021 + }, + { + "epoch": 0.0003587314560714992, + "grad_norm": 0.32526761293411255, + "learning_rate": 0.00013215358931552588, + "loss": 0.6316, + "step": 1022 + }, + { + "epoch": 0.0003590824653240153, + "grad_norm": 0.3305005729198456, + "learning_rate": 0.00013208681135225376, + "loss": 0.5287, + "step": 1023 + }, + { + "epoch": 0.0003594334745765315, + "grad_norm": 0.29515331983566284, + "learning_rate": 0.00013202003338898163, + "loss": 0.5478, + "step": 1024 + }, + { + "epoch": 0.0003597844838290476, + "grad_norm": 0.32527396082878113, + "learning_rate": 0.00013195325542570953, + "loss": 0.6309, + "step": 1025 + }, + { + "epoch": 0.00036013549308156373, + "grad_norm": 0.3407800793647766, + "learning_rate": 0.0001318864774624374, + "loss": 0.5958, + "step": 1026 + }, + { + "epoch": 0.0003604865023340799, + "grad_norm": 0.40766170620918274, + "learning_rate": 0.00013181969949916528, + "loss": 0.5281, + "step": 1027 + }, + { + "epoch": 0.00036083751158659603, + "grad_norm": 0.3853365480899811, + "learning_rate": 0.00013175292153589315, + "loss": 0.6349, + "step": 1028 + }, + { + "epoch": 0.0003611885208391122, + "grad_norm": 0.2854768633842468, + "learning_rate": 0.00013168614357262102, + "loss": 0.4515, + "step": 1029 + }, + { + "epoch": 0.00036153953009162833, + "grad_norm": 0.3713400065898895, + "learning_rate": 0.00013161936560934892, + "loss": 0.5256, + "step": 1030 + }, + { + "epoch": 0.00036189053934414445, + "grad_norm": 0.3738803565502167, + "learning_rate": 0.0001315525876460768, + "loss": 0.647, + "step": 1031 + }, + { + "epoch": 0.00036224154859666063, + "grad_norm": 0.3904534578323364, + "learning_rate": 0.0001314858096828047, + "loss": 0.6047, + "step": 1032 + }, + { + "epoch": 0.00036259255784917675, + "grad_norm": 0.3647315204143524, + "learning_rate": 0.00013141903171953257, + "loss": 0.5027, + "step": 1033 + }, + { + "epoch": 0.00036294356710169293, + "grad_norm": 0.3410654366016388, + "learning_rate": 0.00013135225375626044, + "loss": 0.6187, + "step": 1034 + }, + { + "epoch": 0.00036329457635420905, + "grad_norm": 0.3227837383747101, + "learning_rate": 0.00013128547579298832, + "loss": 0.4749, + "step": 1035 + }, + { + "epoch": 0.00036364558560672517, + "grad_norm": 0.2792038917541504, + "learning_rate": 0.00013121869782971622, + "loss": 0.4981, + "step": 1036 + }, + { + "epoch": 0.00036399659485924135, + "grad_norm": 0.339101642370224, + "learning_rate": 0.0001311519198664441, + "loss": 0.5875, + "step": 1037 + }, + { + "epoch": 0.00036434760411175747, + "grad_norm": 0.369004487991333, + "learning_rate": 0.00013108514190317196, + "loss": 0.4854, + "step": 1038 + }, + { + "epoch": 0.00036469861336427365, + "grad_norm": 0.39061155915260315, + "learning_rate": 0.00013101836393989983, + "loss": 0.5887, + "step": 1039 + }, + { + "epoch": 0.00036504962261678977, + "grad_norm": 0.3913773000240326, + "learning_rate": 0.0001309515859766277, + "loss": 0.5388, + "step": 1040 + }, + { + "epoch": 0.0003654006318693059, + "grad_norm": 0.27972474694252014, + "learning_rate": 0.0001308848080133556, + "loss": 0.3841, + "step": 1041 + }, + { + "epoch": 0.00036575164112182207, + "grad_norm": 0.3185168504714966, + "learning_rate": 0.00013081803005008348, + "loss": 0.4955, + "step": 1042 + }, + { + "epoch": 0.0003661026503743382, + "grad_norm": 0.6088166236877441, + "learning_rate": 0.00013075125208681135, + "loss": 0.5242, + "step": 1043 + }, + { + "epoch": 0.00036645365962685437, + "grad_norm": 0.4608970582485199, + "learning_rate": 0.00013068447412353923, + "loss": 0.5375, + "step": 1044 + }, + { + "epoch": 0.0003668046688793705, + "grad_norm": 0.38970229029655457, + "learning_rate": 0.0001306176961602671, + "loss": 0.5227, + "step": 1045 + }, + { + "epoch": 0.0003671556781318866, + "grad_norm": 0.3537042438983917, + "learning_rate": 0.00013055091819699497, + "loss": 0.5022, + "step": 1046 + }, + { + "epoch": 0.0003675066873844028, + "grad_norm": 0.3243977725505829, + "learning_rate": 0.00013048414023372287, + "loss": 0.4638, + "step": 1047 + }, + { + "epoch": 0.0003678576966369189, + "grad_norm": 0.5033393502235413, + "learning_rate": 0.00013041736227045075, + "loss": 0.6124, + "step": 1048 + }, + { + "epoch": 0.0003682087058894351, + "grad_norm": 0.3304978907108307, + "learning_rate": 0.00013035058430717865, + "loss": 0.5645, + "step": 1049 + }, + { + "epoch": 0.0003685597151419512, + "grad_norm": 0.36042529344558716, + "learning_rate": 0.00013028380634390652, + "loss": 0.4484, + "step": 1050 + }, + { + "epoch": 0.00036891072439446733, + "grad_norm": 0.4284050166606903, + "learning_rate": 0.0001302170283806344, + "loss": 0.6074, + "step": 1051 + }, + { + "epoch": 0.0003692617336469835, + "grad_norm": 0.28319039940834045, + "learning_rate": 0.0001301502504173623, + "loss": 0.563, + "step": 1052 + }, + { + "epoch": 0.00036961274289949963, + "grad_norm": 0.35593390464782715, + "learning_rate": 0.00013008347245409017, + "loss": 0.5548, + "step": 1053 + }, + { + "epoch": 0.0003699637521520158, + "grad_norm": 0.3092995285987854, + "learning_rate": 0.00013001669449081804, + "loss": 0.5512, + "step": 1054 + }, + { + "epoch": 0.00037031476140453193, + "grad_norm": 0.39928558468818665, + "learning_rate": 0.00012994991652754591, + "loss": 0.5828, + "step": 1055 + }, + { + "epoch": 0.00037066577065704805, + "grad_norm": 0.3541167974472046, + "learning_rate": 0.0001298831385642738, + "loss": 0.5943, + "step": 1056 + }, + { + "epoch": 0.00037101677990956423, + "grad_norm": 0.3520177900791168, + "learning_rate": 0.0001298163606010017, + "loss": 0.5629, + "step": 1057 + }, + { + "epoch": 0.00037136778916208035, + "grad_norm": 0.26769620180130005, + "learning_rate": 0.00012974958263772956, + "loss": 0.4686, + "step": 1058 + }, + { + "epoch": 0.00037171879841459653, + "grad_norm": 0.4143349528312683, + "learning_rate": 0.00012968280467445743, + "loss": 0.5898, + "step": 1059 + }, + { + "epoch": 0.00037206980766711265, + "grad_norm": 0.29856693744659424, + "learning_rate": 0.0001296160267111853, + "loss": 0.5795, + "step": 1060 + }, + { + "epoch": 0.0003724208169196288, + "grad_norm": 0.3835422396659851, + "learning_rate": 0.00012954924874791318, + "loss": 0.657, + "step": 1061 + }, + { + "epoch": 0.00037277182617214495, + "grad_norm": 0.3311139941215515, + "learning_rate": 0.00012948247078464108, + "loss": 0.5206, + "step": 1062 + }, + { + "epoch": 0.0003731228354246611, + "grad_norm": 0.38118553161621094, + "learning_rate": 0.00012941569282136895, + "loss": 0.6101, + "step": 1063 + }, + { + "epoch": 0.00037347384467717725, + "grad_norm": 0.3357555568218231, + "learning_rate": 0.00012934891485809683, + "loss": 0.4583, + "step": 1064 + }, + { + "epoch": 0.00037382485392969337, + "grad_norm": 0.3239798843860626, + "learning_rate": 0.0001292821368948247, + "loss": 0.5717, + "step": 1065 + }, + { + "epoch": 0.0003741758631822095, + "grad_norm": 0.31502071022987366, + "learning_rate": 0.0001292153589315526, + "loss": 0.5528, + "step": 1066 + }, + { + "epoch": 0.00037452687243472567, + "grad_norm": 0.35177144408226013, + "learning_rate": 0.00012914858096828047, + "loss": 0.5404, + "step": 1067 + }, + { + "epoch": 0.0003748778816872418, + "grad_norm": 0.3457860052585602, + "learning_rate": 0.00012908180300500837, + "loss": 0.5311, + "step": 1068 + }, + { + "epoch": 0.00037522889093975797, + "grad_norm": 0.31016480922698975, + "learning_rate": 0.00012901502504173625, + "loss": 0.521, + "step": 1069 + }, + { + "epoch": 0.0003755799001922741, + "grad_norm": 0.2800024151802063, + "learning_rate": 0.00012894824707846412, + "loss": 0.4831, + "step": 1070 + }, + { + "epoch": 0.0003759309094447902, + "grad_norm": 0.3560345470905304, + "learning_rate": 0.000128881469115192, + "loss": 0.4771, + "step": 1071 + }, + { + "epoch": 0.0003762819186973064, + "grad_norm": 0.28846535086631775, + "learning_rate": 0.00012881469115191987, + "loss": 0.4444, + "step": 1072 + }, + { + "epoch": 0.0003766329279498225, + "grad_norm": 0.29720595479011536, + "learning_rate": 0.00012874791318864777, + "loss": 0.5048, + "step": 1073 + }, + { + "epoch": 0.0003769839372023387, + "grad_norm": 0.40147536993026733, + "learning_rate": 0.00012868113522537564, + "loss": 0.5521, + "step": 1074 + }, + { + "epoch": 0.0003773349464548548, + "grad_norm": 0.36368894577026367, + "learning_rate": 0.0001286143572621035, + "loss": 0.5211, + "step": 1075 + }, + { + "epoch": 0.00037768595570737094, + "grad_norm": 0.34239786863327026, + "learning_rate": 0.00012854757929883139, + "loss": 0.4327, + "step": 1076 + }, + { + "epoch": 0.0003780369649598871, + "grad_norm": 0.3420031666755676, + "learning_rate": 0.00012848080133555926, + "loss": 0.5377, + "step": 1077 + }, + { + "epoch": 0.00037838797421240323, + "grad_norm": 0.32050299644470215, + "learning_rate": 0.00012841402337228716, + "loss": 0.6428, + "step": 1078 + }, + { + "epoch": 0.0003787389834649194, + "grad_norm": 0.31478747725486755, + "learning_rate": 0.00012834724540901503, + "loss": 0.4042, + "step": 1079 + }, + { + "epoch": 0.00037908999271743553, + "grad_norm": 0.4019688367843628, + "learning_rate": 0.0001282804674457429, + "loss": 0.5806, + "step": 1080 + }, + { + "epoch": 0.00037944100196995166, + "grad_norm": 0.3169090151786804, + "learning_rate": 0.00012821368948247078, + "loss": 0.6143, + "step": 1081 + }, + { + "epoch": 0.00037979201122246783, + "grad_norm": 0.3160766363143921, + "learning_rate": 0.00012814691151919865, + "loss": 0.4358, + "step": 1082 + }, + { + "epoch": 0.00038014302047498395, + "grad_norm": 0.30607977509498596, + "learning_rate": 0.00012808013355592655, + "loss": 0.611, + "step": 1083 + }, + { + "epoch": 0.00038049402972750013, + "grad_norm": 0.3392901122570038, + "learning_rate": 0.00012801335559265442, + "loss": 0.4677, + "step": 1084 + }, + { + "epoch": 0.00038084503898001625, + "grad_norm": 0.3608296513557434, + "learning_rate": 0.00012794657762938233, + "loss": 0.4681, + "step": 1085 + }, + { + "epoch": 0.0003811960482325324, + "grad_norm": 0.35469377040863037, + "learning_rate": 0.0001278797996661102, + "loss": 0.5122, + "step": 1086 + }, + { + "epoch": 0.00038154705748504855, + "grad_norm": 0.42851918935775757, + "learning_rate": 0.00012781302170283807, + "loss": 0.511, + "step": 1087 + }, + { + "epoch": 0.0003818980667375647, + "grad_norm": 0.31718799471855164, + "learning_rate": 0.00012774624373956594, + "loss": 0.5504, + "step": 1088 + }, + { + "epoch": 0.00038224907599008085, + "grad_norm": 0.31201183795928955, + "learning_rate": 0.00012767946577629384, + "loss": 0.5846, + "step": 1089 + }, + { + "epoch": 0.000382600085242597, + "grad_norm": 0.44880107045173645, + "learning_rate": 0.00012761268781302172, + "loss": 0.6351, + "step": 1090 + }, + { + "epoch": 0.0003829510944951131, + "grad_norm": 0.3685932755470276, + "learning_rate": 0.0001275459098497496, + "loss": 0.4946, + "step": 1091 + }, + { + "epoch": 0.00038330210374762927, + "grad_norm": 0.38342320919036865, + "learning_rate": 0.00012747913188647746, + "loss": 0.4357, + "step": 1092 + }, + { + "epoch": 0.0003836531130001454, + "grad_norm": 0.2710161805152893, + "learning_rate": 0.00012741235392320534, + "loss": 0.4635, + "step": 1093 + }, + { + "epoch": 0.00038400412225266157, + "grad_norm": 0.3405950963497162, + "learning_rate": 0.00012734557595993324, + "loss": 0.4272, + "step": 1094 + }, + { + "epoch": 0.0003843551315051777, + "grad_norm": 0.3414493203163147, + "learning_rate": 0.0001272787979966611, + "loss": 0.5387, + "step": 1095 + }, + { + "epoch": 0.0003847061407576938, + "grad_norm": 0.30659371614456177, + "learning_rate": 0.00012721202003338898, + "loss": 0.451, + "step": 1096 + }, + { + "epoch": 0.00038505715001021, + "grad_norm": 0.33229631185531616, + "learning_rate": 0.00012714524207011686, + "loss": 0.6062, + "step": 1097 + }, + { + "epoch": 0.0003854081592627261, + "grad_norm": 0.29991772770881653, + "learning_rate": 0.00012707846410684473, + "loss": 0.5812, + "step": 1098 + }, + { + "epoch": 0.0003857591685152423, + "grad_norm": 0.2937552332878113, + "learning_rate": 0.0001270116861435726, + "loss": 0.4762, + "step": 1099 + }, + { + "epoch": 0.0003861101777677584, + "grad_norm": 0.3993151783943176, + "learning_rate": 0.0001269449081803005, + "loss": 0.5288, + "step": 1100 + }, + { + "epoch": 0.00038646118702027454, + "grad_norm": 0.34012341499328613, + "learning_rate": 0.00012687813021702838, + "loss": 0.5858, + "step": 1101 + }, + { + "epoch": 0.0003868121962727907, + "grad_norm": 0.31721460819244385, + "learning_rate": 0.00012681135225375628, + "loss": 0.4543, + "step": 1102 + }, + { + "epoch": 0.00038716320552530684, + "grad_norm": 0.404480904340744, + "learning_rate": 0.00012674457429048415, + "loss": 0.6425, + "step": 1103 + }, + { + "epoch": 0.000387514214777823, + "grad_norm": 0.2888083755970001, + "learning_rate": 0.00012667779632721202, + "loss": 0.5737, + "step": 1104 + }, + { + "epoch": 0.00038786522403033913, + "grad_norm": 0.316724568605423, + "learning_rate": 0.00012661101836393992, + "loss": 0.4774, + "step": 1105 + }, + { + "epoch": 0.00038821623328285526, + "grad_norm": 0.34277236461639404, + "learning_rate": 0.0001265442404006678, + "loss": 0.5722, + "step": 1106 + }, + { + "epoch": 0.00038856724253537143, + "grad_norm": 0.3688976764678955, + "learning_rate": 0.00012647746243739567, + "loss": 0.478, + "step": 1107 + }, + { + "epoch": 0.00038891825178788756, + "grad_norm": 0.30905240774154663, + "learning_rate": 0.00012641068447412354, + "loss": 0.5578, + "step": 1108 + }, + { + "epoch": 0.00038926926104040373, + "grad_norm": 0.31679004430770874, + "learning_rate": 0.00012634390651085142, + "loss": 0.5564, + "step": 1109 + }, + { + "epoch": 0.00038962027029291985, + "grad_norm": 0.31234732270240784, + "learning_rate": 0.00012627712854757932, + "loss": 0.5403, + "step": 1110 + }, + { + "epoch": 0.000389971279545436, + "grad_norm": 0.2693454921245575, + "learning_rate": 0.0001262103505843072, + "loss": 0.577, + "step": 1111 + }, + { + "epoch": 0.00039032228879795215, + "grad_norm": 0.36127611994743347, + "learning_rate": 0.00012614357262103506, + "loss": 0.5558, + "step": 1112 + }, + { + "epoch": 0.0003906732980504683, + "grad_norm": 0.3124391436576843, + "learning_rate": 0.00012607679465776294, + "loss": 0.5198, + "step": 1113 + }, + { + "epoch": 0.00039102430730298445, + "grad_norm": 0.339495986700058, + "learning_rate": 0.0001260100166944908, + "loss": 0.4415, + "step": 1114 + }, + { + "epoch": 0.0003913753165555006, + "grad_norm": 0.3561634421348572, + "learning_rate": 0.00012594323873121868, + "loss": 0.5413, + "step": 1115 + }, + { + "epoch": 0.0003917263258080167, + "grad_norm": 0.30160975456237793, + "learning_rate": 0.00012587646076794658, + "loss": 0.5754, + "step": 1116 + }, + { + "epoch": 0.0003920773350605329, + "grad_norm": 0.583508312702179, + "learning_rate": 0.00012580968280467446, + "loss": 0.5645, + "step": 1117 + }, + { + "epoch": 0.000392428344313049, + "grad_norm": 0.3197818100452423, + "learning_rate": 0.00012574290484140233, + "loss": 0.5326, + "step": 1118 + }, + { + "epoch": 0.0003927793535655652, + "grad_norm": 0.3258291482925415, + "learning_rate": 0.00012567612687813023, + "loss": 0.5504, + "step": 1119 + }, + { + "epoch": 0.0003931303628180813, + "grad_norm": 0.2790183424949646, + "learning_rate": 0.0001256093489148581, + "loss": 0.4691, + "step": 1120 + }, + { + "epoch": 0.0003934813720705974, + "grad_norm": 0.4802376627922058, + "learning_rate": 0.000125542570951586, + "loss": 0.5689, + "step": 1121 + }, + { + "epoch": 0.0003938323813231136, + "grad_norm": 0.42296934127807617, + "learning_rate": 0.00012547579298831388, + "loss": 0.5082, + "step": 1122 + }, + { + "epoch": 0.0003941833905756297, + "grad_norm": 0.4018993377685547, + "learning_rate": 0.00012540901502504175, + "loss": 0.5967, + "step": 1123 + }, + { + "epoch": 0.0003945343998281459, + "grad_norm": 0.2756693661212921, + "learning_rate": 0.00012534223706176962, + "loss": 0.5071, + "step": 1124 + }, + { + "epoch": 0.000394885409080662, + "grad_norm": 0.28827816247940063, + "learning_rate": 0.0001252754590984975, + "loss": 0.446, + "step": 1125 + }, + { + "epoch": 0.00039523641833317814, + "grad_norm": 0.33188387751579285, + "learning_rate": 0.0001252086811352254, + "loss": 0.59, + "step": 1126 + }, + { + "epoch": 0.0003955874275856943, + "grad_norm": 0.3057992458343506, + "learning_rate": 0.00012514190317195327, + "loss": 0.4665, + "step": 1127 + }, + { + "epoch": 0.00039593843683821044, + "grad_norm": 0.423970103263855, + "learning_rate": 0.00012507512520868114, + "loss": 0.5603, + "step": 1128 + }, + { + "epoch": 0.0003962894460907266, + "grad_norm": 0.4346948266029358, + "learning_rate": 0.00012500834724540902, + "loss": 0.7188, + "step": 1129 + }, + { + "epoch": 0.00039664045534324274, + "grad_norm": 0.3196350932121277, + "learning_rate": 0.0001249415692821369, + "loss": 0.499, + "step": 1130 + }, + { + "epoch": 0.00039699146459575886, + "grad_norm": 0.32787612080574036, + "learning_rate": 0.00012487479131886476, + "loss": 0.562, + "step": 1131 + }, + { + "epoch": 0.00039734247384827504, + "grad_norm": 0.3701760768890381, + "learning_rate": 0.00012480801335559266, + "loss": 0.5906, + "step": 1132 + }, + { + "epoch": 0.00039769348310079116, + "grad_norm": 0.2836174964904785, + "learning_rate": 0.00012474123539232053, + "loss": 0.5241, + "step": 1133 + }, + { + "epoch": 0.00039804449235330733, + "grad_norm": 0.3123319745063782, + "learning_rate": 0.0001246744574290484, + "loss": 0.5591, + "step": 1134 + }, + { + "epoch": 0.00039839550160582346, + "grad_norm": 0.2965394854545593, + "learning_rate": 0.0001246076794657763, + "loss": 0.5522, + "step": 1135 + }, + { + "epoch": 0.0003987465108583396, + "grad_norm": 0.3452530801296234, + "learning_rate": 0.00012454090150250418, + "loss": 0.5572, + "step": 1136 + }, + { + "epoch": 0.00039909752011085576, + "grad_norm": 0.3368155062198639, + "learning_rate": 0.00012447412353923208, + "loss": 0.4947, + "step": 1137 + }, + { + "epoch": 0.0003994485293633719, + "grad_norm": 0.31308281421661377, + "learning_rate": 0.00012440734557595995, + "loss": 0.5395, + "step": 1138 + }, + { + "epoch": 0.00039979953861588805, + "grad_norm": 0.36880385875701904, + "learning_rate": 0.00012434056761268783, + "loss": 0.5449, + "step": 1139 + }, + { + "epoch": 0.0004001505478684042, + "grad_norm": 0.3276751935482025, + "learning_rate": 0.0001242737896494157, + "loss": 0.5714, + "step": 1140 + }, + { + "epoch": 0.0004005015571209203, + "grad_norm": 0.34474796056747437, + "learning_rate": 0.00012420701168614357, + "loss": 0.5579, + "step": 1141 + }, + { + "epoch": 0.0004008525663734365, + "grad_norm": 0.3203624188899994, + "learning_rate": 0.00012414023372287147, + "loss": 0.5848, + "step": 1142 + }, + { + "epoch": 0.0004012035756259526, + "grad_norm": 0.33093470335006714, + "learning_rate": 0.00012407345575959935, + "loss": 0.5515, + "step": 1143 + }, + { + "epoch": 0.0004015545848784688, + "grad_norm": 0.2994841933250427, + "learning_rate": 0.00012400667779632722, + "loss": 0.4696, + "step": 1144 + }, + { + "epoch": 0.0004019055941309849, + "grad_norm": 0.43979793787002563, + "learning_rate": 0.0001239398998330551, + "loss": 0.5531, + "step": 1145 + }, + { + "epoch": 0.000402256603383501, + "grad_norm": 0.33747658133506775, + "learning_rate": 0.00012387312186978297, + "loss": 0.5442, + "step": 1146 + }, + { + "epoch": 0.0004026076126360172, + "grad_norm": 0.3129333257675171, + "learning_rate": 0.00012380634390651084, + "loss": 0.5812, + "step": 1147 + }, + { + "epoch": 0.0004029586218885333, + "grad_norm": 0.27842286229133606, + "learning_rate": 0.00012373956594323874, + "loss": 0.5571, + "step": 1148 + }, + { + "epoch": 0.0004033096311410495, + "grad_norm": 0.30332496762275696, + "learning_rate": 0.00012367278797996661, + "loss": 0.5264, + "step": 1149 + }, + { + "epoch": 0.0004036606403935656, + "grad_norm": 0.41959401965141296, + "learning_rate": 0.0001236060100166945, + "loss": 0.6208, + "step": 1150 + }, + { + "epoch": 0.00040401164964608174, + "grad_norm": 0.2994483411312103, + "learning_rate": 0.00012353923205342236, + "loss": 0.5311, + "step": 1151 + }, + { + "epoch": 0.0004043626588985979, + "grad_norm": 0.28562021255493164, + "learning_rate": 0.00012347245409015026, + "loss": 0.4664, + "step": 1152 + }, + { + "epoch": 0.00040471366815111404, + "grad_norm": 0.3773499131202698, + "learning_rate": 0.00012340567612687813, + "loss": 0.6372, + "step": 1153 + }, + { + "epoch": 0.0004050646774036302, + "grad_norm": 0.3149654269218445, + "learning_rate": 0.00012333889816360603, + "loss": 0.5295, + "step": 1154 + }, + { + "epoch": 0.00040541568665614634, + "grad_norm": 0.345595121383667, + "learning_rate": 0.0001232721202003339, + "loss": 0.5568, + "step": 1155 + }, + { + "epoch": 0.00040576669590866246, + "grad_norm": 0.2795856297016144, + "learning_rate": 0.00012320534223706178, + "loss": 0.4909, + "step": 1156 + }, + { + "epoch": 0.00040611770516117864, + "grad_norm": 0.37467122077941895, + "learning_rate": 0.00012313856427378965, + "loss": 0.5733, + "step": 1157 + }, + { + "epoch": 0.00040646871441369476, + "grad_norm": 0.33086350560188293, + "learning_rate": 0.00012307178631051755, + "loss": 0.5371, + "step": 1158 + }, + { + "epoch": 0.00040681972366621094, + "grad_norm": 0.3587074279785156, + "learning_rate": 0.00012300500834724543, + "loss": 0.5555, + "step": 1159 + }, + { + "epoch": 0.00040717073291872706, + "grad_norm": 0.35360291600227356, + "learning_rate": 0.0001229382303839733, + "loss": 0.5686, + "step": 1160 + }, + { + "epoch": 0.0004075217421712432, + "grad_norm": 0.32877933979034424, + "learning_rate": 0.00012287145242070117, + "loss": 0.6232, + "step": 1161 + }, + { + "epoch": 0.00040787275142375936, + "grad_norm": 0.3402215540409088, + "learning_rate": 0.00012280467445742905, + "loss": 0.5923, + "step": 1162 + }, + { + "epoch": 0.0004082237606762755, + "grad_norm": 0.3712671399116516, + "learning_rate": 0.00012273789649415692, + "loss": 0.4405, + "step": 1163 + }, + { + "epoch": 0.00040857476992879166, + "grad_norm": 0.34966424107551575, + "learning_rate": 0.00012267111853088482, + "loss": 0.5987, + "step": 1164 + }, + { + "epoch": 0.0004089257791813078, + "grad_norm": 0.8779903650283813, + "learning_rate": 0.0001226043405676127, + "loss": 0.5677, + "step": 1165 + }, + { + "epoch": 0.0004092767884338239, + "grad_norm": 0.30721041560173035, + "learning_rate": 0.00012253756260434057, + "loss": 0.4803, + "step": 1166 + }, + { + "epoch": 0.0004096277976863401, + "grad_norm": 0.3509838879108429, + "learning_rate": 0.00012247078464106844, + "loss": 0.4216, + "step": 1167 + }, + { + "epoch": 0.0004099788069388562, + "grad_norm": 0.2961578071117401, + "learning_rate": 0.0001224040066777963, + "loss": 0.5599, + "step": 1168 + }, + { + "epoch": 0.0004103298161913724, + "grad_norm": 0.28842684626579285, + "learning_rate": 0.0001223372287145242, + "loss": 0.5023, + "step": 1169 + }, + { + "epoch": 0.0004106808254438885, + "grad_norm": 0.3395219147205353, + "learning_rate": 0.00012227045075125209, + "loss": 0.6371, + "step": 1170 + }, + { + "epoch": 0.0004110318346964046, + "grad_norm": 0.2860247492790222, + "learning_rate": 0.00012220367278797999, + "loss": 0.3881, + "step": 1171 + }, + { + "epoch": 0.0004113828439489208, + "grad_norm": 0.5463435053825378, + "learning_rate": 0.00012213689482470786, + "loss": 0.5751, + "step": 1172 + }, + { + "epoch": 0.0004117338532014369, + "grad_norm": 0.30383020639419556, + "learning_rate": 0.00012207011686143572, + "loss": 0.4892, + "step": 1173 + }, + { + "epoch": 0.0004120848624539531, + "grad_norm": 0.6111129522323608, + "learning_rate": 0.00012200333889816362, + "loss": 0.6786, + "step": 1174 + }, + { + "epoch": 0.0004124358717064692, + "grad_norm": 0.32131698727607727, + "learning_rate": 0.00012193656093489149, + "loss": 0.6301, + "step": 1175 + }, + { + "epoch": 0.00041278688095898534, + "grad_norm": 0.3574715256690979, + "learning_rate": 0.00012186978297161938, + "loss": 0.5705, + "step": 1176 + }, + { + "epoch": 0.0004131378902115015, + "grad_norm": 0.46258190274238586, + "learning_rate": 0.00012180300500834725, + "loss": 0.54, + "step": 1177 + }, + { + "epoch": 0.00041348889946401764, + "grad_norm": 0.385326623916626, + "learning_rate": 0.00012173622704507512, + "loss": 0.5792, + "step": 1178 + }, + { + "epoch": 0.0004138399087165338, + "grad_norm": 0.3880153000354767, + "learning_rate": 0.00012166944908180303, + "loss": 0.5396, + "step": 1179 + }, + { + "epoch": 0.00041419091796904994, + "grad_norm": 0.32916024327278137, + "learning_rate": 0.0001216026711185309, + "loss": 0.5632, + "step": 1180 + }, + { + "epoch": 0.00041454192722156606, + "grad_norm": 0.30234548449516296, + "learning_rate": 0.00012153589315525877, + "loss": 0.5162, + "step": 1181 + }, + { + "epoch": 0.00041489293647408224, + "grad_norm": 0.3654727339744568, + "learning_rate": 0.00012146911519198664, + "loss": 0.6333, + "step": 1182 + }, + { + "epoch": 0.00041524394572659836, + "grad_norm": 0.3166685700416565, + "learning_rate": 0.00012140233722871452, + "loss": 0.5276, + "step": 1183 + }, + { + "epoch": 0.00041559495497911454, + "grad_norm": 0.3722357153892517, + "learning_rate": 0.0001213355592654424, + "loss": 0.5771, + "step": 1184 + }, + { + "epoch": 0.00041594596423163066, + "grad_norm": 0.3407818377017975, + "learning_rate": 0.00012126878130217029, + "loss": 0.5998, + "step": 1185 + }, + { + "epoch": 0.0004162969734841468, + "grad_norm": 0.28665193915367126, + "learning_rate": 0.00012120200333889818, + "loss": 0.5457, + "step": 1186 + }, + { + "epoch": 0.00041664798273666296, + "grad_norm": 0.3052026629447937, + "learning_rate": 0.00012113522537562605, + "loss": 0.5204, + "step": 1187 + }, + { + "epoch": 0.0004169989919891791, + "grad_norm": 0.286080002784729, + "learning_rate": 0.00012106844741235392, + "loss": 0.4346, + "step": 1188 + }, + { + "epoch": 0.00041735000124169526, + "grad_norm": 0.306473970413208, + "learning_rate": 0.0001210016694490818, + "loss": 0.5544, + "step": 1189 + }, + { + "epoch": 0.0004177010104942114, + "grad_norm": 0.3347833454608917, + "learning_rate": 0.0001209348914858097, + "loss": 0.4619, + "step": 1190 + }, + { + "epoch": 0.0004180520197467275, + "grad_norm": 0.28040143847465515, + "learning_rate": 0.00012086811352253757, + "loss": 0.5492, + "step": 1191 + }, + { + "epoch": 0.0004184030289992437, + "grad_norm": 0.2940806448459625, + "learning_rate": 0.00012080133555926544, + "loss": 0.5653, + "step": 1192 + }, + { + "epoch": 0.0004187540382517598, + "grad_norm": 0.37384578585624695, + "learning_rate": 0.00012073455759599333, + "loss": 0.4931, + "step": 1193 + }, + { + "epoch": 0.000419105047504276, + "grad_norm": 0.28816068172454834, + "learning_rate": 0.0001206677796327212, + "loss": 0.5292, + "step": 1194 + }, + { + "epoch": 0.0004194560567567921, + "grad_norm": 0.31325826048851013, + "learning_rate": 0.0001206010016694491, + "loss": 0.5288, + "step": 1195 + }, + { + "epoch": 0.0004198070660093082, + "grad_norm": 0.30658552050590515, + "learning_rate": 0.00012053422370617698, + "loss": 0.5854, + "step": 1196 + }, + { + "epoch": 0.0004201580752618244, + "grad_norm": 0.341240257024765, + "learning_rate": 0.00012046744574290485, + "loss": 0.5358, + "step": 1197 + }, + { + "epoch": 0.0004205090845143405, + "grad_norm": 0.3595687747001648, + "learning_rate": 0.00012040066777963272, + "loss": 0.5944, + "step": 1198 + }, + { + "epoch": 0.00042086009376685664, + "grad_norm": 0.3249213397502899, + "learning_rate": 0.0001203338898163606, + "loss": 0.4873, + "step": 1199 + }, + { + "epoch": 0.0004212111030193728, + "grad_norm": 0.37282127141952515, + "learning_rate": 0.00012026711185308848, + "loss": 0.5173, + "step": 1200 + }, + { + "epoch": 0.00042156211227188894, + "grad_norm": 0.325110524892807, + "learning_rate": 0.00012020033388981637, + "loss": 0.4819, + "step": 1201 + }, + { + "epoch": 0.0004219131215244051, + "grad_norm": 0.313388466835022, + "learning_rate": 0.00012013355592654426, + "loss": 0.5613, + "step": 1202 + }, + { + "epoch": 0.00042226413077692124, + "grad_norm": 0.38384371995925903, + "learning_rate": 0.00012006677796327213, + "loss": 0.5711, + "step": 1203 + }, + { + "epoch": 0.00042261514002943736, + "grad_norm": 0.3431423008441925, + "learning_rate": 0.00012, + "loss": 0.5593, + "step": 1204 + }, + { + "epoch": 0.00042296614928195354, + "grad_norm": 0.3032066822052002, + "learning_rate": 0.00011993322203672788, + "loss": 0.559, + "step": 1205 + }, + { + "epoch": 0.00042331715853446966, + "grad_norm": 0.30639907717704773, + "learning_rate": 0.00011986644407345578, + "loss": 0.5727, + "step": 1206 + }, + { + "epoch": 0.00042366816778698584, + "grad_norm": 0.2970695197582245, + "learning_rate": 0.00011979966611018365, + "loss": 0.5933, + "step": 1207 + }, + { + "epoch": 0.00042401917703950196, + "grad_norm": 0.3868466317653656, + "learning_rate": 0.00011973288814691152, + "loss": 0.5779, + "step": 1208 + }, + { + "epoch": 0.0004243701862920181, + "grad_norm": 0.29085230827331543, + "learning_rate": 0.0001196661101836394, + "loss": 0.6558, + "step": 1209 + }, + { + "epoch": 0.00042472119554453426, + "grad_norm": 0.33766743540763855, + "learning_rate": 0.00011959933222036728, + "loss": 0.5809, + "step": 1210 + }, + { + "epoch": 0.0004250722047970504, + "grad_norm": 0.6739090085029602, + "learning_rate": 0.00011953255425709517, + "loss": 0.6085, + "step": 1211 + }, + { + "epoch": 0.00042542321404956656, + "grad_norm": 0.35693222284317017, + "learning_rate": 0.00011946577629382306, + "loss": 0.5855, + "step": 1212 + }, + { + "epoch": 0.0004257742233020827, + "grad_norm": 0.3087833523750305, + "learning_rate": 0.00011939899833055093, + "loss": 0.6379, + "step": 1213 + }, + { + "epoch": 0.0004261252325545988, + "grad_norm": 0.3548837900161743, + "learning_rate": 0.0001193322203672788, + "loss": 0.5303, + "step": 1214 + }, + { + "epoch": 0.000426476241807115, + "grad_norm": 0.46040648221969604, + "learning_rate": 0.00011926544240400668, + "loss": 0.5171, + "step": 1215 + }, + { + "epoch": 0.0004268272510596311, + "grad_norm": 0.5730584859848022, + "learning_rate": 0.00011919866444073455, + "loss": 0.615, + "step": 1216 + }, + { + "epoch": 0.0004271782603121473, + "grad_norm": 0.34618711471557617, + "learning_rate": 0.00011913188647746245, + "loss": 0.5605, + "step": 1217 + }, + { + "epoch": 0.0004275292695646634, + "grad_norm": 0.3499528169631958, + "learning_rate": 0.00011906510851419032, + "loss": 0.5184, + "step": 1218 + }, + { + "epoch": 0.0004278802788171795, + "grad_norm": 0.33638936281204224, + "learning_rate": 0.00011899833055091821, + "loss": 0.6276, + "step": 1219 + }, + { + "epoch": 0.0004282312880696957, + "grad_norm": 0.34646880626678467, + "learning_rate": 0.00011893155258764608, + "loss": 0.5737, + "step": 1220 + }, + { + "epoch": 0.0004285822973222118, + "grad_norm": 0.2783110439777374, + "learning_rate": 0.00011886477462437396, + "loss": 0.4424, + "step": 1221 + }, + { + "epoch": 0.000428933306574728, + "grad_norm": 0.33892807364463806, + "learning_rate": 0.00011879799666110186, + "loss": 0.5656, + "step": 1222 + }, + { + "epoch": 0.0004292843158272441, + "grad_norm": 0.2782565653324127, + "learning_rate": 0.00011873121869782973, + "loss": 0.5504, + "step": 1223 + }, + { + "epoch": 0.00042963532507976025, + "grad_norm": 0.3684981167316437, + "learning_rate": 0.0001186644407345576, + "loss": 0.5532, + "step": 1224 + }, + { + "epoch": 0.0004299863343322764, + "grad_norm": 0.4034316837787628, + "learning_rate": 0.00011859766277128547, + "loss": 0.5417, + "step": 1225 + }, + { + "epoch": 0.00043033734358479254, + "grad_norm": 0.5182071924209595, + "learning_rate": 0.00011853088480801335, + "loss": 0.6118, + "step": 1226 + }, + { + "epoch": 0.0004306883528373087, + "grad_norm": 0.3137674033641815, + "learning_rate": 0.00011846410684474125, + "loss": 0.6485, + "step": 1227 + }, + { + "epoch": 0.00043103936208982484, + "grad_norm": 0.4069771468639374, + "learning_rate": 0.00011839732888146912, + "loss": 0.5452, + "step": 1228 + }, + { + "epoch": 0.00043139037134234097, + "grad_norm": 0.5212397575378418, + "learning_rate": 0.00011833055091819701, + "loss": 0.5212, + "step": 1229 + }, + { + "epoch": 0.00043174138059485714, + "grad_norm": 0.3622184693813324, + "learning_rate": 0.00011826377295492488, + "loss": 0.4333, + "step": 1230 + }, + { + "epoch": 0.00043209238984737326, + "grad_norm": 0.335044801235199, + "learning_rate": 0.00011819699499165275, + "loss": 0.5606, + "step": 1231 + }, + { + "epoch": 0.00043244339909988944, + "grad_norm": 0.31680893898010254, + "learning_rate": 0.00011813021702838063, + "loss": 0.4988, + "step": 1232 + }, + { + "epoch": 0.00043279440835240556, + "grad_norm": 0.5272301435470581, + "learning_rate": 0.00011806343906510853, + "loss": 0.6024, + "step": 1233 + }, + { + "epoch": 0.0004331454176049217, + "grad_norm": 0.3663223385810852, + "learning_rate": 0.0001179966611018364, + "loss": 0.5964, + "step": 1234 + }, + { + "epoch": 0.00043349642685743786, + "grad_norm": 0.35138314962387085, + "learning_rate": 0.00011792988313856427, + "loss": 0.5908, + "step": 1235 + }, + { + "epoch": 0.000433847436109954, + "grad_norm": 0.3744595944881439, + "learning_rate": 0.00011786310517529216, + "loss": 0.551, + "step": 1236 + }, + { + "epoch": 0.00043419844536247016, + "grad_norm": 0.31489259004592896, + "learning_rate": 0.00011779632721202003, + "loss": 0.6431, + "step": 1237 + }, + { + "epoch": 0.0004345494546149863, + "grad_norm": 0.3356812298297882, + "learning_rate": 0.00011772954924874793, + "loss": 0.4507, + "step": 1238 + }, + { + "epoch": 0.0004349004638675024, + "grad_norm": 0.3018808364868164, + "learning_rate": 0.00011766277128547581, + "loss": 0.4796, + "step": 1239 + }, + { + "epoch": 0.0004352514731200186, + "grad_norm": 0.3201460540294647, + "learning_rate": 0.00011759599332220368, + "loss": 0.4768, + "step": 1240 + }, + { + "epoch": 0.0004356024823725347, + "grad_norm": 0.3269093334674835, + "learning_rate": 0.00011752921535893155, + "loss": 0.5419, + "step": 1241 + }, + { + "epoch": 0.0004359534916250509, + "grad_norm": 0.28690990805625916, + "learning_rate": 0.00011746243739565943, + "loss": 0.5088, + "step": 1242 + }, + { + "epoch": 0.000436304500877567, + "grad_norm": 0.32765012979507446, + "learning_rate": 0.00011739565943238733, + "loss": 0.4953, + "step": 1243 + }, + { + "epoch": 0.0004366555101300831, + "grad_norm": 0.28830674290657043, + "learning_rate": 0.0001173288814691152, + "loss": 0.5179, + "step": 1244 + }, + { + "epoch": 0.0004370065193825993, + "grad_norm": 0.37793827056884766, + "learning_rate": 0.00011726210350584307, + "loss": 0.5951, + "step": 1245 + }, + { + "epoch": 0.0004373575286351154, + "grad_norm": 0.37173348665237427, + "learning_rate": 0.00011719532554257096, + "loss": 0.6059, + "step": 1246 + }, + { + "epoch": 0.0004377085378876316, + "grad_norm": 0.5363826155662537, + "learning_rate": 0.00011712854757929883, + "loss": 0.5183, + "step": 1247 + }, + { + "epoch": 0.0004380595471401477, + "grad_norm": 0.31671205163002014, + "learning_rate": 0.0001170617696160267, + "loss": 0.5711, + "step": 1248 + }, + { + "epoch": 0.00043841055639266385, + "grad_norm": 0.3112623989582062, + "learning_rate": 0.0001169949916527546, + "loss": 0.5647, + "step": 1249 + }, + { + "epoch": 0.00043876156564518, + "grad_norm": 0.3153972923755646, + "learning_rate": 0.00011692821368948248, + "loss": 0.4939, + "step": 1250 + }, + { + "epoch": 0.00043911257489769615, + "grad_norm": 0.29940372705459595, + "learning_rate": 0.00011686143572621035, + "loss": 0.5509, + "step": 1251 + }, + { + "epoch": 0.0004394635841502123, + "grad_norm": 0.42540279030799866, + "learning_rate": 0.00011679465776293823, + "loss": 0.4104, + "step": 1252 + }, + { + "epoch": 0.00043981459340272844, + "grad_norm": 0.3222522437572479, + "learning_rate": 0.00011672787979966611, + "loss": 0.6237, + "step": 1253 + }, + { + "epoch": 0.00044016560265524457, + "grad_norm": 0.34896525740623474, + "learning_rate": 0.000116661101836394, + "loss": 0.5162, + "step": 1254 + }, + { + "epoch": 0.00044051661190776074, + "grad_norm": 0.29780149459838867, + "learning_rate": 0.00011659432387312189, + "loss": 0.5805, + "step": 1255 + }, + { + "epoch": 0.00044086762116027687, + "grad_norm": 0.3533996343612671, + "learning_rate": 0.00011652754590984976, + "loss": 0.5749, + "step": 1256 + }, + { + "epoch": 0.00044121863041279304, + "grad_norm": 0.30867093801498413, + "learning_rate": 0.00011646076794657763, + "loss": 0.479, + "step": 1257 + }, + { + "epoch": 0.00044156963966530917, + "grad_norm": 0.31176280975341797, + "learning_rate": 0.0001163939899833055, + "loss": 0.5007, + "step": 1258 + }, + { + "epoch": 0.0004419206489178253, + "grad_norm": 0.3480489253997803, + "learning_rate": 0.0001163272120200334, + "loss": 0.5595, + "step": 1259 + }, + { + "epoch": 0.00044227165817034146, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.00011626043405676128, + "loss": 0.5042, + "step": 1260 + }, + { + "epoch": 0.0004426226674228576, + "grad_norm": 0.3167501986026764, + "learning_rate": 0.00011619365609348915, + "loss": 0.5335, + "step": 1261 + }, + { + "epoch": 0.00044297367667537376, + "grad_norm": 0.31276339292526245, + "learning_rate": 0.00011612687813021703, + "loss": 0.5594, + "step": 1262 + }, + { + "epoch": 0.0004433246859278899, + "grad_norm": 0.42910438776016235, + "learning_rate": 0.00011606010016694491, + "loss": 0.4659, + "step": 1263 + }, + { + "epoch": 0.000443675695180406, + "grad_norm": 0.3169635534286499, + "learning_rate": 0.00011599332220367279, + "loss": 0.5463, + "step": 1264 + }, + { + "epoch": 0.0004440267044329222, + "grad_norm": 0.3419555425643921, + "learning_rate": 0.00011592654424040069, + "loss": 0.5091, + "step": 1265 + }, + { + "epoch": 0.0004443777136854383, + "grad_norm": 0.31462714076042175, + "learning_rate": 0.00011585976627712856, + "loss": 0.6233, + "step": 1266 + }, + { + "epoch": 0.0004447287229379545, + "grad_norm": 0.36186134815216064, + "learning_rate": 0.00011579298831385643, + "loss": 0.5634, + "step": 1267 + }, + { + "epoch": 0.0004450797321904706, + "grad_norm": 0.385903000831604, + "learning_rate": 0.0001157262103505843, + "loss": 0.5892, + "step": 1268 + }, + { + "epoch": 0.00044543074144298673, + "grad_norm": 0.28669610619544983, + "learning_rate": 0.00011565943238731218, + "loss": 0.4746, + "step": 1269 + }, + { + "epoch": 0.0004457817506955029, + "grad_norm": 0.37557515501976013, + "learning_rate": 0.00011559265442404008, + "loss": 0.5946, + "step": 1270 + }, + { + "epoch": 0.00044613275994801903, + "grad_norm": 0.30455920100212097, + "learning_rate": 0.00011552587646076795, + "loss": 0.4064, + "step": 1271 + }, + { + "epoch": 0.0004464837692005352, + "grad_norm": 0.36547228693962097, + "learning_rate": 0.00011545909849749584, + "loss": 0.4354, + "step": 1272 + }, + { + "epoch": 0.0004468347784530513, + "grad_norm": 0.3912973999977112, + "learning_rate": 0.00011539232053422371, + "loss": 0.544, + "step": 1273 + }, + { + "epoch": 0.00044718578770556745, + "grad_norm": 0.2993258237838745, + "learning_rate": 0.00011532554257095158, + "loss": 0.4623, + "step": 1274 + }, + { + "epoch": 0.0004475367969580836, + "grad_norm": 0.39676982164382935, + "learning_rate": 0.00011525876460767948, + "loss": 0.4735, + "step": 1275 + }, + { + "epoch": 0.00044788780621059975, + "grad_norm": 0.43738967180252075, + "learning_rate": 0.00011519198664440736, + "loss": 0.5639, + "step": 1276 + }, + { + "epoch": 0.0004482388154631159, + "grad_norm": 0.4572802186012268, + "learning_rate": 0.00011512520868113523, + "loss": 0.5043, + "step": 1277 + }, + { + "epoch": 0.00044858982471563205, + "grad_norm": 0.301929771900177, + "learning_rate": 0.0001150584307178631, + "loss": 0.3962, + "step": 1278 + }, + { + "epoch": 0.00044894083396814817, + "grad_norm": 0.42450666427612305, + "learning_rate": 0.00011499165275459098, + "loss": 0.5885, + "step": 1279 + }, + { + "epoch": 0.00044929184322066435, + "grad_norm": 0.3520278036594391, + "learning_rate": 0.00011492487479131886, + "loss": 0.5557, + "step": 1280 + }, + { + "epoch": 0.00044964285247318047, + "grad_norm": 0.32748425006866455, + "learning_rate": 0.00011485809682804675, + "loss": 0.5788, + "step": 1281 + }, + { + "epoch": 0.00044999386172569664, + "grad_norm": 0.3404058516025543, + "learning_rate": 0.00011479131886477464, + "loss": 0.431, + "step": 1282 + }, + { + "epoch": 0.00045034487097821277, + "grad_norm": 0.30703750252723694, + "learning_rate": 0.00011472454090150251, + "loss": 0.5603, + "step": 1283 + }, + { + "epoch": 0.0004506958802307289, + "grad_norm": 0.3476982116699219, + "learning_rate": 0.00011465776293823038, + "loss": 0.4984, + "step": 1284 + }, + { + "epoch": 0.00045104688948324507, + "grad_norm": 0.361433207988739, + "learning_rate": 0.00011459098497495826, + "loss": 0.4012, + "step": 1285 + }, + { + "epoch": 0.0004513978987357612, + "grad_norm": 0.31583985686302185, + "learning_rate": 0.00011452420701168616, + "loss": 0.5115, + "step": 1286 + }, + { + "epoch": 0.00045174890798827736, + "grad_norm": 0.3581843376159668, + "learning_rate": 0.00011445742904841403, + "loss": 0.5795, + "step": 1287 + }, + { + "epoch": 0.0004520999172407935, + "grad_norm": 0.30088526010513306, + "learning_rate": 0.0001143906510851419, + "loss": 0.4995, + "step": 1288 + }, + { + "epoch": 0.0004524509264933096, + "grad_norm": 0.34739211201667786, + "learning_rate": 0.00011432387312186979, + "loss": 0.5513, + "step": 1289 + }, + { + "epoch": 0.0004528019357458258, + "grad_norm": 0.3440413177013397, + "learning_rate": 0.00011425709515859766, + "loss": 0.626, + "step": 1290 + }, + { + "epoch": 0.0004531529449983419, + "grad_norm": 0.34715211391448975, + "learning_rate": 0.00011419031719532556, + "loss": 0.5567, + "step": 1291 + }, + { + "epoch": 0.0004535039542508581, + "grad_norm": 0.3141072690486908, + "learning_rate": 0.00011412353923205344, + "loss": 0.515, + "step": 1292 + }, + { + "epoch": 0.0004538549635033742, + "grad_norm": 0.3693056106567383, + "learning_rate": 0.00011405676126878131, + "loss": 0.6039, + "step": 1293 + }, + { + "epoch": 0.00045420597275589033, + "grad_norm": 0.2877582609653473, + "learning_rate": 0.00011398998330550918, + "loss": 0.627, + "step": 1294 + }, + { + "epoch": 0.0004545569820084065, + "grad_norm": 0.30727502703666687, + "learning_rate": 0.00011392320534223706, + "loss": 0.4439, + "step": 1295 + }, + { + "epoch": 0.00045490799126092263, + "grad_norm": 0.340834379196167, + "learning_rate": 0.00011385642737896493, + "loss": 0.6043, + "step": 1296 + }, + { + "epoch": 0.0004552590005134388, + "grad_norm": 0.37094762921333313, + "learning_rate": 0.00011378964941569283, + "loss": 0.5279, + "step": 1297 + }, + { + "epoch": 0.00045561000976595493, + "grad_norm": 0.352252721786499, + "learning_rate": 0.0001137228714524207, + "loss": 0.4534, + "step": 1298 + }, + { + "epoch": 0.00045596101901847105, + "grad_norm": 0.3592413663864136, + "learning_rate": 0.00011365609348914859, + "loss": 0.6009, + "step": 1299 + }, + { + "epoch": 0.0004563120282709872, + "grad_norm": 0.3028002679347992, + "learning_rate": 0.00011358931552587646, + "loss": 0.5451, + "step": 1300 + }, + { + "epoch": 0.00045666303752350335, + "grad_norm": 0.3545093238353729, + "learning_rate": 0.00011352253756260434, + "loss": 0.6022, + "step": 1301 + }, + { + "epoch": 0.0004570140467760195, + "grad_norm": 0.31239053606987, + "learning_rate": 0.00011345575959933224, + "loss": 0.5893, + "step": 1302 + }, + { + "epoch": 0.00045736505602853565, + "grad_norm": 0.2930079996585846, + "learning_rate": 0.00011338898163606011, + "loss": 0.6469, + "step": 1303 + }, + { + "epoch": 0.00045771606528105177, + "grad_norm": 0.3328670263290405, + "learning_rate": 0.00011332220367278798, + "loss": 0.551, + "step": 1304 + }, + { + "epoch": 0.00045806707453356795, + "grad_norm": 0.2958623766899109, + "learning_rate": 0.00011325542570951586, + "loss": 0.4699, + "step": 1305 + }, + { + "epoch": 0.00045841808378608407, + "grad_norm": 0.26540592312812805, + "learning_rate": 0.00011318864774624374, + "loss": 0.5651, + "step": 1306 + }, + { + "epoch": 0.00045876909303860025, + "grad_norm": 0.30372926592826843, + "learning_rate": 0.00011312186978297163, + "loss": 0.4466, + "step": 1307 + }, + { + "epoch": 0.00045912010229111637, + "grad_norm": 0.32394206523895264, + "learning_rate": 0.00011305509181969952, + "loss": 0.4651, + "step": 1308 + }, + { + "epoch": 0.0004594711115436325, + "grad_norm": 0.2792419493198395, + "learning_rate": 0.00011298831385642739, + "loss": 0.4761, + "step": 1309 + }, + { + "epoch": 0.00045982212079614867, + "grad_norm": 0.26445260643959045, + "learning_rate": 0.00011292153589315526, + "loss": 0.4564, + "step": 1310 + }, + { + "epoch": 0.0004601731300486648, + "grad_norm": 0.3601842224597931, + "learning_rate": 0.00011285475792988314, + "loss": 0.5397, + "step": 1311 + }, + { + "epoch": 0.00046052413930118097, + "grad_norm": 0.3574691712856293, + "learning_rate": 0.00011278797996661104, + "loss": 0.5961, + "step": 1312 + }, + { + "epoch": 0.0004608751485536971, + "grad_norm": 0.3000461161136627, + "learning_rate": 0.00011272120200333891, + "loss": 0.4527, + "step": 1313 + }, + { + "epoch": 0.0004612261578062132, + "grad_norm": 0.34302622079849243, + "learning_rate": 0.00011265442404006678, + "loss": 0.6379, + "step": 1314 + }, + { + "epoch": 0.0004615771670587294, + "grad_norm": 0.3945535123348236, + "learning_rate": 0.00011258764607679465, + "loss": 0.5631, + "step": 1315 + }, + { + "epoch": 0.0004619281763112455, + "grad_norm": 0.4170839786529541, + "learning_rate": 0.00011252086811352254, + "loss": 0.6339, + "step": 1316 + }, + { + "epoch": 0.0004622791855637617, + "grad_norm": 0.36513859033584595, + "learning_rate": 0.00011245409015025041, + "loss": 0.5528, + "step": 1317 + }, + { + "epoch": 0.0004626301948162778, + "grad_norm": 0.45692166686058044, + "learning_rate": 0.00011238731218697832, + "loss": 0.6315, + "step": 1318 + }, + { + "epoch": 0.00046298120406879393, + "grad_norm": 0.3772307336330414, + "learning_rate": 0.00011232053422370619, + "loss": 0.5349, + "step": 1319 + }, + { + "epoch": 0.0004633322133213101, + "grad_norm": 0.3114742636680603, + "learning_rate": 0.00011225375626043406, + "loss": 0.4121, + "step": 1320 + }, + { + "epoch": 0.00046368322257382623, + "grad_norm": 0.3508698344230652, + "learning_rate": 0.00011218697829716193, + "loss": 0.638, + "step": 1321 + }, + { + "epoch": 0.0004640342318263424, + "grad_norm": 0.34588712453842163, + "learning_rate": 0.00011212020033388981, + "loss": 0.4898, + "step": 1322 + }, + { + "epoch": 0.00046438524107885853, + "grad_norm": 0.2846747934818268, + "learning_rate": 0.00011205342237061771, + "loss": 0.5521, + "step": 1323 + }, + { + "epoch": 0.00046473625033137465, + "grad_norm": 0.31673532724380493, + "learning_rate": 0.00011198664440734558, + "loss": 0.4676, + "step": 1324 + }, + { + "epoch": 0.00046508725958389083, + "grad_norm": 0.3159814774990082, + "learning_rate": 0.00011191986644407347, + "loss": 0.508, + "step": 1325 + }, + { + "epoch": 0.00046543826883640695, + "grad_norm": 0.3438906967639923, + "learning_rate": 0.00011185308848080134, + "loss": 0.6521, + "step": 1326 + }, + { + "epoch": 0.00046578927808892313, + "grad_norm": 0.28350135684013367, + "learning_rate": 0.00011178631051752921, + "loss": 0.517, + "step": 1327 + }, + { + "epoch": 0.00046614028734143925, + "grad_norm": 0.3244381844997406, + "learning_rate": 0.00011171953255425711, + "loss": 0.4975, + "step": 1328 + }, + { + "epoch": 0.00046649129659395537, + "grad_norm": 0.32338446378707886, + "learning_rate": 0.00011165275459098499, + "loss": 0.5581, + "step": 1329 + }, + { + "epoch": 0.00046684230584647155, + "grad_norm": 0.3385190963745117, + "learning_rate": 0.00011158597662771286, + "loss": 0.5287, + "step": 1330 + }, + { + "epoch": 0.00046719331509898767, + "grad_norm": 0.30869290232658386, + "learning_rate": 0.00011151919866444073, + "loss": 0.5694, + "step": 1331 + }, + { + "epoch": 0.00046754432435150385, + "grad_norm": 0.39800670742988586, + "learning_rate": 0.00011145242070116862, + "loss": 0.6783, + "step": 1332 + }, + { + "epoch": 0.00046789533360401997, + "grad_norm": 0.3691728413105011, + "learning_rate": 0.0001113856427378965, + "loss": 0.5814, + "step": 1333 + }, + { + "epoch": 0.0004682463428565361, + "grad_norm": 0.34991732239723206, + "learning_rate": 0.0001113188647746244, + "loss": 0.414, + "step": 1334 + }, + { + "epoch": 0.00046859735210905227, + "grad_norm": 0.3095676302909851, + "learning_rate": 0.00011125208681135227, + "loss": 0.5982, + "step": 1335 + }, + { + "epoch": 0.0004689483613615684, + "grad_norm": 0.3367360830307007, + "learning_rate": 0.00011118530884808014, + "loss": 0.5794, + "step": 1336 + }, + { + "epoch": 0.00046929937061408457, + "grad_norm": 0.3058132529258728, + "learning_rate": 0.00011111853088480801, + "loss": 0.5001, + "step": 1337 + }, + { + "epoch": 0.0004696503798666007, + "grad_norm": 0.32190924882888794, + "learning_rate": 0.00011105175292153589, + "loss": 0.6184, + "step": 1338 + }, + { + "epoch": 0.0004700013891191168, + "grad_norm": 0.2544103264808655, + "learning_rate": 0.00011098497495826379, + "loss": 0.5338, + "step": 1339 + }, + { + "epoch": 0.000470352398371633, + "grad_norm": 0.3533720374107361, + "learning_rate": 0.00011091819699499166, + "loss": 0.5817, + "step": 1340 + }, + { + "epoch": 0.0004707034076241491, + "grad_norm": 0.29889243841171265, + "learning_rate": 0.00011085141903171953, + "loss": 0.4836, + "step": 1341 + }, + { + "epoch": 0.0004710544168766653, + "grad_norm": 0.3215756118297577, + "learning_rate": 0.00011078464106844742, + "loss": 0.5438, + "step": 1342 + }, + { + "epoch": 0.0004714054261291814, + "grad_norm": 0.3005795478820801, + "learning_rate": 0.00011071786310517529, + "loss": 0.5341, + "step": 1343 + }, + { + "epoch": 0.00047175643538169753, + "grad_norm": 0.31172803044319153, + "learning_rate": 0.0001106510851419032, + "loss": 0.5517, + "step": 1344 + }, + { + "epoch": 0.0004721074446342137, + "grad_norm": 0.3667462468147278, + "learning_rate": 0.00011058430717863107, + "loss": 0.5487, + "step": 1345 + }, + { + "epoch": 0.00047245845388672983, + "grad_norm": 0.3609708249568939, + "learning_rate": 0.00011051752921535894, + "loss": 0.5514, + "step": 1346 + }, + { + "epoch": 0.000472809463139246, + "grad_norm": 0.36390745639801025, + "learning_rate": 0.00011045075125208681, + "loss": 0.609, + "step": 1347 + }, + { + "epoch": 0.00047316047239176213, + "grad_norm": 0.3918192982673645, + "learning_rate": 0.00011038397328881469, + "loss": 0.5841, + "step": 1348 + }, + { + "epoch": 0.00047351148164427825, + "grad_norm": 0.3789425194263458, + "learning_rate": 0.00011031719532554257, + "loss": 0.5551, + "step": 1349 + }, + { + "epoch": 0.00047386249089679443, + "grad_norm": 0.31591498851776123, + "learning_rate": 0.00011025041736227046, + "loss": 0.5445, + "step": 1350 + }, + { + "epoch": 0.00047421350014931055, + "grad_norm": 0.3711070120334625, + "learning_rate": 0.00011018363939899835, + "loss": 0.6124, + "step": 1351 + }, + { + "epoch": 0.00047456450940182673, + "grad_norm": 0.3442644476890564, + "learning_rate": 0.00011011686143572622, + "loss": 0.5793, + "step": 1352 + }, + { + "epoch": 0.00047491551865434285, + "grad_norm": 0.2866378426551819, + "learning_rate": 0.00011005008347245409, + "loss": 0.5144, + "step": 1353 + }, + { + "epoch": 0.000475266527906859, + "grad_norm": 0.3127586841583252, + "learning_rate": 0.00010998330550918197, + "loss": 0.6036, + "step": 1354 + }, + { + "epoch": 0.00047561753715937515, + "grad_norm": 0.32305601239204407, + "learning_rate": 0.00010991652754590987, + "loss": 0.5215, + "step": 1355 + }, + { + "epoch": 0.00047596854641189127, + "grad_norm": 0.30483660101890564, + "learning_rate": 0.00010984974958263774, + "loss": 0.6094, + "step": 1356 + }, + { + "epoch": 0.00047631955566440745, + "grad_norm": 0.33019503951072693, + "learning_rate": 0.00010978297161936561, + "loss": 0.5646, + "step": 1357 + }, + { + "epoch": 0.00047667056491692357, + "grad_norm": 0.3414929509162903, + "learning_rate": 0.00010971619365609349, + "loss": 0.5262, + "step": 1358 + }, + { + "epoch": 0.0004770215741694397, + "grad_norm": 0.3471517860889435, + "learning_rate": 0.00010964941569282137, + "loss": 0.492, + "step": 1359 + }, + { + "epoch": 0.00047737258342195587, + "grad_norm": 0.3226645588874817, + "learning_rate": 0.00010958263772954926, + "loss": 0.6318, + "step": 1360 + }, + { + "epoch": 0.000477723592674472, + "grad_norm": 0.3425777852535248, + "learning_rate": 0.00010951585976627715, + "loss": 0.5878, + "step": 1361 + }, + { + "epoch": 0.00047807460192698817, + "grad_norm": 0.307462215423584, + "learning_rate": 0.00010944908180300502, + "loss": 0.4948, + "step": 1362 + }, + { + "epoch": 0.0004784256111795043, + "grad_norm": 0.34796106815338135, + "learning_rate": 0.00010938230383973289, + "loss": 0.5525, + "step": 1363 + }, + { + "epoch": 0.0004787766204320204, + "grad_norm": 0.2861281633377075, + "learning_rate": 0.00010931552587646076, + "loss": 0.4578, + "step": 1364 + }, + { + "epoch": 0.0004791276296845366, + "grad_norm": 0.2861836552619934, + "learning_rate": 0.00010924874791318864, + "loss": 0.5761, + "step": 1365 + }, + { + "epoch": 0.0004794786389370527, + "grad_norm": 0.3063654601573944, + "learning_rate": 0.00010918196994991654, + "loss": 0.5338, + "step": 1366 + }, + { + "epoch": 0.0004798296481895689, + "grad_norm": 0.3108372390270233, + "learning_rate": 0.00010911519198664441, + "loss": 0.4896, + "step": 1367 + }, + { + "epoch": 0.000480180657442085, + "grad_norm": 0.3263947069644928, + "learning_rate": 0.0001090484140233723, + "loss": 0.6142, + "step": 1368 + }, + { + "epoch": 0.00048053166669460113, + "grad_norm": 0.27663156390190125, + "learning_rate": 0.00010898163606010017, + "loss": 0.3852, + "step": 1369 + }, + { + "epoch": 0.0004808826759471173, + "grad_norm": 0.2791202962398529, + "learning_rate": 0.00010891485809682804, + "loss": 0.6032, + "step": 1370 + }, + { + "epoch": 0.00048123368519963343, + "grad_norm": 0.2715228199958801, + "learning_rate": 0.00010884808013355594, + "loss": 0.4717, + "step": 1371 + }, + { + "epoch": 0.0004815846944521496, + "grad_norm": 0.3232786953449249, + "learning_rate": 0.00010878130217028382, + "loss": 0.5511, + "step": 1372 + }, + { + "epoch": 0.00048193570370466573, + "grad_norm": 0.42948031425476074, + "learning_rate": 0.00010871452420701169, + "loss": 0.5223, + "step": 1373 + }, + { + "epoch": 0.00048228671295718185, + "grad_norm": 0.31973496079444885, + "learning_rate": 0.00010864774624373956, + "loss": 0.4532, + "step": 1374 + }, + { + "epoch": 0.00048263772220969803, + "grad_norm": 0.3149821162223816, + "learning_rate": 0.00010858096828046744, + "loss": 0.4894, + "step": 1375 + }, + { + "epoch": 0.00048298873146221415, + "grad_norm": 0.30229589343070984, + "learning_rate": 0.00010851419031719534, + "loss": 0.5039, + "step": 1376 + }, + { + "epoch": 0.00048333974071473033, + "grad_norm": 0.36127185821533203, + "learning_rate": 0.00010844741235392321, + "loss": 0.4379, + "step": 1377 + }, + { + "epoch": 0.00048369074996724645, + "grad_norm": 0.3135043978691101, + "learning_rate": 0.0001083806343906511, + "loss": 0.5172, + "step": 1378 + }, + { + "epoch": 0.0004840417592197626, + "grad_norm": 0.33123600482940674, + "learning_rate": 0.00010831385642737897, + "loss": 0.4959, + "step": 1379 + }, + { + "epoch": 0.00048439276847227875, + "grad_norm": 0.32165780663490295, + "learning_rate": 0.00010824707846410684, + "loss": 0.5152, + "step": 1380 + }, + { + "epoch": 0.0004847437777247949, + "grad_norm": 0.28580865263938904, + "learning_rate": 0.00010818030050083472, + "loss": 0.4879, + "step": 1381 + }, + { + "epoch": 0.00048509478697731105, + "grad_norm": 0.4019862711429596, + "learning_rate": 0.00010811352253756262, + "loss": 0.5475, + "step": 1382 + }, + { + "epoch": 0.0004854457962298272, + "grad_norm": 0.34479352831840515, + "learning_rate": 0.00010804674457429049, + "loss": 0.4279, + "step": 1383 + }, + { + "epoch": 0.0004857968054823433, + "grad_norm": 0.3664172887802124, + "learning_rate": 0.00010797996661101836, + "loss": 0.5815, + "step": 1384 + }, + { + "epoch": 0.00048614781473485947, + "grad_norm": 0.34667205810546875, + "learning_rate": 0.00010791318864774625, + "loss": 0.5453, + "step": 1385 + }, + { + "epoch": 0.0004864988239873756, + "grad_norm": 0.36878061294555664, + "learning_rate": 0.00010784641068447412, + "loss": 0.5464, + "step": 1386 + }, + { + "epoch": 0.00048684983323989177, + "grad_norm": 0.3552783727645874, + "learning_rate": 0.00010777963272120202, + "loss": 0.5668, + "step": 1387 + }, + { + "epoch": 0.0004872008424924079, + "grad_norm": 0.35390666127204895, + "learning_rate": 0.0001077128547579299, + "loss": 0.4799, + "step": 1388 + }, + { + "epoch": 0.000487551851744924, + "grad_norm": 0.3539852797985077, + "learning_rate": 0.00010764607679465777, + "loss": 0.6264, + "step": 1389 + }, + { + "epoch": 0.0004879028609974402, + "grad_norm": 0.3104274868965149, + "learning_rate": 0.00010757929883138564, + "loss": 0.4881, + "step": 1390 + }, + { + "epoch": 0.0004882538702499563, + "grad_norm": 0.29643991589546204, + "learning_rate": 0.00010751252086811352, + "loss": 0.5277, + "step": 1391 + }, + { + "epoch": 0.0004886048795024725, + "grad_norm": 0.3498566448688507, + "learning_rate": 0.00010744574290484142, + "loss": 0.4394, + "step": 1392 + }, + { + "epoch": 0.0004889558887549886, + "grad_norm": 0.31261810660362244, + "learning_rate": 0.00010737896494156929, + "loss": 0.4557, + "step": 1393 + }, + { + "epoch": 0.0004893068980075047, + "grad_norm": 0.301792711019516, + "learning_rate": 0.00010731218697829716, + "loss": 0.471, + "step": 1394 + }, + { + "epoch": 0.0004896579072600209, + "grad_norm": 0.34246626496315, + "learning_rate": 0.00010724540901502505, + "loss": 0.5917, + "step": 1395 + }, + { + "epoch": 0.0004900089165125371, + "grad_norm": 0.2901524305343628, + "learning_rate": 0.00010717863105175292, + "loss": 0.441, + "step": 1396 + }, + { + "epoch": 0.0004903599257650532, + "grad_norm": 0.3026966452598572, + "learning_rate": 0.0001071118530884808, + "loss": 0.5373, + "step": 1397 + }, + { + "epoch": 0.0004907109350175693, + "grad_norm": 0.29963356256484985, + "learning_rate": 0.0001070450751252087, + "loss": 0.4464, + "step": 1398 + }, + { + "epoch": 0.0004910619442700855, + "grad_norm": 0.26481980085372925, + "learning_rate": 0.00010697829716193657, + "loss": 0.5372, + "step": 1399 + }, + { + "epoch": 0.0004914129535226016, + "grad_norm": 0.26084020733833313, + "learning_rate": 0.00010691151919866444, + "loss": 0.5523, + "step": 1400 + }, + { + "epoch": 0.0004917639627751178, + "grad_norm": 0.34062638878822327, + "learning_rate": 0.00010684474123539232, + "loss": 0.5466, + "step": 1401 + }, + { + "epoch": 0.0004921149720276339, + "grad_norm": 0.3231668472290039, + "learning_rate": 0.0001067779632721202, + "loss": 0.5019, + "step": 1402 + }, + { + "epoch": 0.00049246598128015, + "grad_norm": 0.3362787961959839, + "learning_rate": 0.00010671118530884809, + "loss": 0.5251, + "step": 1403 + }, + { + "epoch": 0.0004928169905326662, + "grad_norm": 0.28928473591804504, + "learning_rate": 0.00010664440734557598, + "loss": 0.5346, + "step": 1404 + }, + { + "epoch": 0.0004931679997851824, + "grad_norm": 0.32969072461128235, + "learning_rate": 0.00010657762938230385, + "loss": 0.6131, + "step": 1405 + }, + { + "epoch": 0.0004935190090376985, + "grad_norm": 0.29733914136886597, + "learning_rate": 0.00010651085141903172, + "loss": 0.4406, + "step": 1406 + }, + { + "epoch": 0.0004938700182902146, + "grad_norm": 0.36437737941741943, + "learning_rate": 0.0001064440734557596, + "loss": 0.551, + "step": 1407 + }, + { + "epoch": 0.0004942210275427308, + "grad_norm": 0.33889076113700867, + "learning_rate": 0.0001063772954924875, + "loss": 0.5904, + "step": 1408 + }, + { + "epoch": 0.000494572036795247, + "grad_norm": 0.3446680009365082, + "learning_rate": 0.00010631051752921537, + "loss": 0.394, + "step": 1409 + }, + { + "epoch": 0.000494923046047763, + "grad_norm": 0.33298397064208984, + "learning_rate": 0.00010624373956594324, + "loss": 0.5048, + "step": 1410 + }, + { + "epoch": 0.0004952740553002792, + "grad_norm": 0.3153474032878876, + "learning_rate": 0.00010617696160267111, + "loss": 0.5314, + "step": 1411 + }, + { + "epoch": 0.0004956250645527954, + "grad_norm": 0.27105385065078735, + "learning_rate": 0.000106110183639399, + "loss": 0.5098, + "step": 1412 + }, + { + "epoch": 0.0004959760738053114, + "grad_norm": 0.3450585901737213, + "learning_rate": 0.00010604340567612687, + "loss": 0.5249, + "step": 1413 + }, + { + "epoch": 0.0004963270830578276, + "grad_norm": 0.35962969064712524, + "learning_rate": 0.00010597662771285477, + "loss": 0.4714, + "step": 1414 + }, + { + "epoch": 0.0004966780923103438, + "grad_norm": 0.33413732051849365, + "learning_rate": 0.00010590984974958265, + "loss": 0.5618, + "step": 1415 + }, + { + "epoch": 0.00049702910156286, + "grad_norm": 0.37907567620277405, + "learning_rate": 0.00010584307178631052, + "loss": 0.5751, + "step": 1416 + }, + { + "epoch": 0.000497380110815376, + "grad_norm": 0.3324087858200073, + "learning_rate": 0.0001057762938230384, + "loss": 0.5032, + "step": 1417 + }, + { + "epoch": 0.0004977311200678922, + "grad_norm": 0.2794540822505951, + "learning_rate": 0.00010570951585976627, + "loss": 0.4823, + "step": 1418 + }, + { + "epoch": 0.0004980821293204084, + "grad_norm": 0.31896448135375977, + "learning_rate": 0.00010564273789649417, + "loss": 0.5293, + "step": 1419 + }, + { + "epoch": 0.0004984331385729245, + "grad_norm": 0.39455580711364746, + "learning_rate": 0.00010557595993322204, + "loss": 0.6312, + "step": 1420 + }, + { + "epoch": 0.0004987841478254406, + "grad_norm": 0.3108445107936859, + "learning_rate": 0.00010550918196994993, + "loss": 0.4614, + "step": 1421 + }, + { + "epoch": 0.0004991351570779568, + "grad_norm": 0.2984072268009186, + "learning_rate": 0.0001054424040066778, + "loss": 0.5516, + "step": 1422 + }, + { + "epoch": 0.0004994861663304729, + "grad_norm": 0.3056257665157318, + "learning_rate": 0.00010537562604340567, + "loss": 0.5906, + "step": 1423 + }, + { + "epoch": 0.0004998371755829891, + "grad_norm": 0.29374566674232483, + "learning_rate": 0.00010530884808013357, + "loss": 0.599, + "step": 1424 + }, + { + "epoch": 0.0005001881848355052, + "grad_norm": 0.3665946424007416, + "learning_rate": 0.00010524207011686145, + "loss": 0.5599, + "step": 1425 + }, + { + "epoch": 0.0005005391940880214, + "grad_norm": 0.31262800097465515, + "learning_rate": 0.00010517529215358932, + "loss": 0.5566, + "step": 1426 + }, + { + "epoch": 0.0005008902033405375, + "grad_norm": 0.3117959797382355, + "learning_rate": 0.0001051085141903172, + "loss": 0.4372, + "step": 1427 + }, + { + "epoch": 0.0005012412125930537, + "grad_norm": 0.3499256670475006, + "learning_rate": 0.00010504173622704507, + "loss": 0.543, + "step": 1428 + }, + { + "epoch": 0.0005015922218455698, + "grad_norm": 0.3630000948905945, + "learning_rate": 0.00010497495826377295, + "loss": 0.5099, + "step": 1429 + }, + { + "epoch": 0.0005019432310980859, + "grad_norm": 0.3609743118286133, + "learning_rate": 0.00010490818030050084, + "loss": 0.5304, + "step": 1430 + }, + { + "epoch": 0.0005022942403506021, + "grad_norm": 0.3600139617919922, + "learning_rate": 0.00010484140233722873, + "loss": 0.4811, + "step": 1431 + }, + { + "epoch": 0.0005026452496031183, + "grad_norm": 0.30108320713043213, + "learning_rate": 0.0001047746243739566, + "loss": 0.6055, + "step": 1432 + }, + { + "epoch": 0.0005029962588556343, + "grad_norm": 0.34729886054992676, + "learning_rate": 0.00010470784641068447, + "loss": 0.5011, + "step": 1433 + }, + { + "epoch": 0.0005033472681081505, + "grad_norm": 0.33984988927841187, + "learning_rate": 0.00010464106844741235, + "loss": 0.5905, + "step": 1434 + }, + { + "epoch": 0.0005036982773606667, + "grad_norm": 0.3109802007675171, + "learning_rate": 0.00010457429048414025, + "loss": 0.5228, + "step": 1435 + }, + { + "epoch": 0.0005040492866131829, + "grad_norm": 0.37691593170166016, + "learning_rate": 0.00010450751252086812, + "loss": 0.5839, + "step": 1436 + }, + { + "epoch": 0.0005044002958656989, + "grad_norm": 0.3665965497493744, + "learning_rate": 0.00010444073455759599, + "loss": 0.5381, + "step": 1437 + }, + { + "epoch": 0.0005047513051182151, + "grad_norm": 0.29414570331573486, + "learning_rate": 0.00010437395659432388, + "loss": 0.6072, + "step": 1438 + }, + { + "epoch": 0.0005051023143707313, + "grad_norm": 0.3206839859485626, + "learning_rate": 0.00010430717863105175, + "loss": 0.5285, + "step": 1439 + }, + { + "epoch": 0.0005054533236232473, + "grad_norm": 0.3003496527671814, + "learning_rate": 0.00010424040066777965, + "loss": 0.4037, + "step": 1440 + }, + { + "epoch": 0.0005058043328757635, + "grad_norm": 0.2955014109611511, + "learning_rate": 0.00010417362270450753, + "loss": 0.4646, + "step": 1441 + }, + { + "epoch": 0.0005061553421282797, + "grad_norm": 0.3399007022380829, + "learning_rate": 0.0001041068447412354, + "loss": 0.5649, + "step": 1442 + }, + { + "epoch": 0.0005065063513807958, + "grad_norm": 0.3394736349582672, + "learning_rate": 0.00010404006677796327, + "loss": 0.5512, + "step": 1443 + }, + { + "epoch": 0.0005068573606333119, + "grad_norm": 0.31650441884994507, + "learning_rate": 0.00010397328881469115, + "loss": 0.4669, + "step": 1444 + }, + { + "epoch": 0.0005072083698858281, + "grad_norm": 0.3380611538887024, + "learning_rate": 0.00010390651085141905, + "loss": 0.6714, + "step": 1445 + }, + { + "epoch": 0.0005075593791383443, + "grad_norm": 0.29049673676490784, + "learning_rate": 0.00010383973288814692, + "loss": 0.5652, + "step": 1446 + }, + { + "epoch": 0.0005079103883908604, + "grad_norm": 0.37694746255874634, + "learning_rate": 0.0001037729549248748, + "loss": 0.4355, + "step": 1447 + }, + { + "epoch": 0.0005082613976433765, + "grad_norm": 0.36622750759124756, + "learning_rate": 0.00010370617696160268, + "loss": 0.4758, + "step": 1448 + }, + { + "epoch": 0.0005086124068958927, + "grad_norm": 0.3366115093231201, + "learning_rate": 0.00010363939899833055, + "loss": 0.5498, + "step": 1449 + }, + { + "epoch": 0.0005089634161484088, + "grad_norm": 0.2836514711380005, + "learning_rate": 0.00010357262103505843, + "loss": 0.5405, + "step": 1450 + }, + { + "epoch": 0.000509314425400925, + "grad_norm": 0.357666015625, + "learning_rate": 0.00010350584307178633, + "loss": 0.4738, + "step": 1451 + }, + { + "epoch": 0.0005096654346534411, + "grad_norm": 0.37991905212402344, + "learning_rate": 0.0001034390651085142, + "loss": 0.4932, + "step": 1452 + }, + { + "epoch": 0.0005100164439059572, + "grad_norm": 0.2862101197242737, + "learning_rate": 0.00010337228714524207, + "loss": 0.5387, + "step": 1453 + }, + { + "epoch": 0.0005103674531584734, + "grad_norm": 0.3000154197216034, + "learning_rate": 0.00010330550918196994, + "loss": 0.509, + "step": 1454 + }, + { + "epoch": 0.0005107184624109896, + "grad_norm": 0.29454153776168823, + "learning_rate": 0.00010323873121869783, + "loss": 0.3872, + "step": 1455 + }, + { + "epoch": 0.0005110694716635057, + "grad_norm": 0.305803507566452, + "learning_rate": 0.00010317195325542572, + "loss": 0.5, + "step": 1456 + }, + { + "epoch": 0.0005114204809160218, + "grad_norm": 0.3164152204990387, + "learning_rate": 0.0001031051752921536, + "loss": 0.5426, + "step": 1457 + }, + { + "epoch": 0.000511771490168538, + "grad_norm": 0.3026213049888611, + "learning_rate": 0.00010303839732888148, + "loss": 0.5783, + "step": 1458 + }, + { + "epoch": 0.0005121224994210542, + "grad_norm": 0.3170768618583679, + "learning_rate": 0.00010297161936560935, + "loss": 0.5701, + "step": 1459 + }, + { + "epoch": 0.0005124735086735702, + "grad_norm": 0.3275301456451416, + "learning_rate": 0.00010290484140233722, + "loss": 0.4884, + "step": 1460 + }, + { + "epoch": 0.0005128245179260864, + "grad_norm": 0.3446187973022461, + "learning_rate": 0.00010283806343906512, + "loss": 0.4516, + "step": 1461 + }, + { + "epoch": 0.0005131755271786026, + "grad_norm": 0.3188260495662689, + "learning_rate": 0.000102771285475793, + "loss": 0.561, + "step": 1462 + }, + { + "epoch": 0.0005135265364311186, + "grad_norm": 0.3547864258289337, + "learning_rate": 0.00010270450751252087, + "loss": 0.5768, + "step": 1463 + }, + { + "epoch": 0.0005138775456836348, + "grad_norm": 0.3740866482257843, + "learning_rate": 0.00010263772954924876, + "loss": 0.4197, + "step": 1464 + }, + { + "epoch": 0.000514228554936151, + "grad_norm": 0.38915491104125977, + "learning_rate": 0.00010257095158597663, + "loss": 0.553, + "step": 1465 + }, + { + "epoch": 0.0005145795641886672, + "grad_norm": 0.38494518399238586, + "learning_rate": 0.0001025041736227045, + "loss": 0.6247, + "step": 1466 + }, + { + "epoch": 0.0005149305734411832, + "grad_norm": 0.2716946303844452, + "learning_rate": 0.0001024373956594324, + "loss": 0.4426, + "step": 1467 + }, + { + "epoch": 0.0005152815826936994, + "grad_norm": 0.33764415979385376, + "learning_rate": 0.00010237061769616028, + "loss": 0.5939, + "step": 1468 + }, + { + "epoch": 0.0005156325919462156, + "grad_norm": 0.34384095668792725, + "learning_rate": 0.00010230383973288815, + "loss": 0.604, + "step": 1469 + }, + { + "epoch": 0.0005159836011987317, + "grad_norm": 0.3203445076942444, + "learning_rate": 0.00010223706176961602, + "loss": 0.5255, + "step": 1470 + }, + { + "epoch": 0.0005163346104512478, + "grad_norm": 0.2592601180076599, + "learning_rate": 0.0001021702838063439, + "loss": 0.4509, + "step": 1471 + }, + { + "epoch": 0.000516685619703764, + "grad_norm": 0.3425324261188507, + "learning_rate": 0.0001021035058430718, + "loss": 0.5498, + "step": 1472 + }, + { + "epoch": 0.0005170366289562801, + "grad_norm": 0.3077262341976166, + "learning_rate": 0.00010203672787979967, + "loss": 0.5364, + "step": 1473 + }, + { + "epoch": 0.0005173876382087963, + "grad_norm": 0.2831708788871765, + "learning_rate": 0.00010196994991652756, + "loss": 0.434, + "step": 1474 + }, + { + "epoch": 0.0005177386474613124, + "grad_norm": 0.29104581475257874, + "learning_rate": 0.00010190317195325543, + "loss": 0.5875, + "step": 1475 + }, + { + "epoch": 0.0005180896567138286, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0001018363939899833, + "loss": 0.4574, + "step": 1476 + }, + { + "epoch": 0.0005184406659663447, + "grad_norm": 0.41971537470817566, + "learning_rate": 0.0001017696160267112, + "loss": 0.5616, + "step": 1477 + }, + { + "epoch": 0.0005187916752188609, + "grad_norm": 0.3439647853374481, + "learning_rate": 0.00010170283806343908, + "loss": 0.4288, + "step": 1478 + }, + { + "epoch": 0.000519142684471377, + "grad_norm": 0.35867923498153687, + "learning_rate": 0.00010163606010016695, + "loss": 0.4415, + "step": 1479 + }, + { + "epoch": 0.0005194936937238931, + "grad_norm": 0.368987500667572, + "learning_rate": 0.00010156928213689482, + "loss": 0.5474, + "step": 1480 + }, + { + "epoch": 0.0005198447029764093, + "grad_norm": 0.30241629481315613, + "learning_rate": 0.00010150250417362271, + "loss": 0.4113, + "step": 1481 + }, + { + "epoch": 0.0005201957122289255, + "grad_norm": 0.31089895963668823, + "learning_rate": 0.00010143572621035058, + "loss": 0.4726, + "step": 1482 + }, + { + "epoch": 0.0005205467214814415, + "grad_norm": 0.2900741994380951, + "learning_rate": 0.00010136894824707848, + "loss": 0.4591, + "step": 1483 + }, + { + "epoch": 0.0005208977307339577, + "grad_norm": 0.2920607030391693, + "learning_rate": 0.00010130217028380636, + "loss": 0.508, + "step": 1484 + }, + { + "epoch": 0.0005212487399864739, + "grad_norm": 0.5145193338394165, + "learning_rate": 0.00010123539232053423, + "loss": 0.6125, + "step": 1485 + }, + { + "epoch": 0.0005215997492389901, + "grad_norm": 0.3466121554374695, + "learning_rate": 0.0001011686143572621, + "loss": 0.5236, + "step": 1486 + }, + { + "epoch": 0.0005219507584915061, + "grad_norm": 0.2820659577846527, + "learning_rate": 0.00010110183639398998, + "loss": 0.4886, + "step": 1487 + }, + { + "epoch": 0.0005223017677440223, + "grad_norm": 0.31797733902931213, + "learning_rate": 0.00010103505843071788, + "loss": 0.4605, + "step": 1488 + }, + { + "epoch": 0.0005226527769965385, + "grad_norm": 0.3547564148902893, + "learning_rate": 0.00010096828046744575, + "loss": 0.5559, + "step": 1489 + }, + { + "epoch": 0.0005230037862490545, + "grad_norm": 0.3584667146205902, + "learning_rate": 0.00010090150250417362, + "loss": 0.4402, + "step": 1490 + }, + { + "epoch": 0.0005233547955015707, + "grad_norm": 0.3230780065059662, + "learning_rate": 0.00010083472454090151, + "loss": 0.5187, + "step": 1491 + }, + { + "epoch": 0.0005237058047540869, + "grad_norm": 0.3932897448539734, + "learning_rate": 0.00010076794657762938, + "loss": 0.5758, + "step": 1492 + }, + { + "epoch": 0.000524056814006603, + "grad_norm": 0.39378783106803894, + "learning_rate": 0.00010070116861435728, + "loss": 0.5199, + "step": 1493 + }, + { + "epoch": 0.0005244078232591191, + "grad_norm": 0.33147481083869934, + "learning_rate": 0.00010063439065108516, + "loss": 0.4489, + "step": 1494 + }, + { + "epoch": 0.0005247588325116353, + "grad_norm": 0.3706863522529602, + "learning_rate": 0.00010056761268781303, + "loss": 0.4601, + "step": 1495 + }, + { + "epoch": 0.0005251098417641515, + "grad_norm": 0.45806849002838135, + "learning_rate": 0.0001005008347245409, + "loss": 0.4522, + "step": 1496 + }, + { + "epoch": 0.0005254608510166676, + "grad_norm": 0.2931700050830841, + "learning_rate": 0.00010043405676126878, + "loss": 0.3673, + "step": 1497 + }, + { + "epoch": 0.0005258118602691837, + "grad_norm": 0.31791719794273376, + "learning_rate": 0.00010036727879799666, + "loss": 0.497, + "step": 1498 + }, + { + "epoch": 0.0005261628695216999, + "grad_norm": 0.51285719871521, + "learning_rate": 0.00010030050083472455, + "loss": 0.4736, + "step": 1499 + }, + { + "epoch": 0.000526513878774216, + "grad_norm": 0.37526455521583557, + "learning_rate": 0.00010023372287145244, + "loss": 0.5242, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 3000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.772683637827174e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/marques/outputs/checkpoint-1500/training_args.bin b/marques/outputs/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd0ba520c124bb1ece608079704fa15e0236be45 --- /dev/null +++ b/marques/outputs/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09362706a3d58d219e41be1682b770b8f5069fcd630f7dbcadb71e4d4ce8859b +size 6289 diff --git a/marques/outputs/checkpoint-2000/README.md b/marques/outputs/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d90a96dfe2e51221657a6e936d376789e21081f9 --- /dev/null +++ b/marques/outputs/checkpoint-2000/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/marques/outputs/checkpoint-2000/adapter_config.json b/marques/outputs/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9930a191a30254256c9550b1bdffa58b8d7aee8 --- /dev/null +++ b/marques/outputs/checkpoint-2000/adapter_config.json @@ -0,0 +1,50 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "LlamaForCausalLM", + "parent_library": "transformers.models.llama.modeling_llama", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "gate_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/marques/outputs/checkpoint-2000/adapter_model.safetensors b/marques/outputs/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..26f70b697b3acd905a7070ebae516b45075d0a71 --- /dev/null +++ b/marques/outputs/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55515bf3b25b8e3d4b82b10f225a12db27f8c9b3313c368788ea21f67ec9564a +size 167832240 diff --git a/marques/outputs/checkpoint-2000/optimizer.pt b/marques/outputs/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a8631bb808fa14eb345a811b804f47b22fbd86b --- /dev/null +++ b/marques/outputs/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:602a52ab4b0b1444c75d390dad91f71c6faed9d09f830aab43d0a5988c5dadc1 +size 85724133 diff --git a/marques/outputs/checkpoint-2000/rng_state.pth b/marques/outputs/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0 --- /dev/null +++ b/marques/outputs/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33 +size 14645 diff --git a/marques/outputs/checkpoint-2000/scheduler.pt b/marques/outputs/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbc0037871dfcf2a38f23ba7d83e7445044ca47b --- /dev/null +++ b/marques/outputs/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4765206172912504dac36bf4d211fd9351d03267862d1fba158b8b5c77e94c09 +size 1465 diff --git a/marques/outputs/checkpoint-2000/special_tokens_map.json b/marques/outputs/checkpoint-2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68b10c7f0a479eae0c358eac6a14959b3f9acdf1 --- /dev/null +++ b/marques/outputs/checkpoint-2000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/marques/outputs/checkpoint-2000/tokenizer.json b/marques/outputs/checkpoint-2000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/marques/outputs/checkpoint-2000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/marques/outputs/checkpoint-2000/tokenizer_config.json b/marques/outputs/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..92b1d94e894e5474ebea1d171e14751be79ca3e5 --- /dev/null +++ b/marques/outputs/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/marques/outputs/checkpoint-2000/trainer_state.json b/marques/outputs/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4b478f2a9bccedd78122d4f23d2623d3945b3a88 --- /dev/null +++ b/marques/outputs/checkpoint-2000/trainer_state.json @@ -0,0 +1,14034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.000702018505032288, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.5100925251614403e-07, + "grad_norm": 0.53782719373703, + "learning_rate": 0.0, + "loss": 0.5835, + "step": 1 + }, + { + "epoch": 7.020185050322881e-07, + "grad_norm": 0.6201626062393188, + "learning_rate": 4e-05, + "loss": 0.5242, + "step": 2 + }, + { + "epoch": 1.053027757548432e-06, + "grad_norm": 0.7571901082992554, + "learning_rate": 8e-05, + "loss": 0.5642, + "step": 3 + }, + { + "epoch": 1.4040370100645761e-06, + "grad_norm": 0.5588695406913757, + "learning_rate": 0.00012, + "loss": 0.4859, + "step": 4 + }, + { + "epoch": 1.75504626258072e-06, + "grad_norm": 0.7208331227302551, + "learning_rate": 0.00016, + "loss": 0.4645, + "step": 5 + }, + { + "epoch": 2.106055515096864e-06, + "grad_norm": 0.8169743418693542, + "learning_rate": 0.0002, + "loss": 0.3702, + "step": 6 + }, + { + "epoch": 2.4570647676130083e-06, + "grad_norm": 2.051530599594116, + "learning_rate": 0.00019993322203672788, + "loss": 0.4856, + "step": 7 + }, + { + "epoch": 2.8080740201291522e-06, + "grad_norm": 1.2310550212860107, + "learning_rate": 0.00019986644407345576, + "loss": 0.5192, + "step": 8 + }, + { + "epoch": 3.1590832726452962e-06, + "grad_norm": 1.612046241760254, + "learning_rate": 0.00019979966611018366, + "loss": 0.4719, + "step": 9 + }, + { + "epoch": 3.51009252516144e-06, + "grad_norm": 1.4484680891036987, + "learning_rate": 0.00019973288814691153, + "loss": 0.4416, + "step": 10 + }, + { + "epoch": 3.861101777677584e-06, + "grad_norm": 1.4529719352722168, + "learning_rate": 0.0001996661101836394, + "loss": 0.6275, + "step": 11 + }, + { + "epoch": 4.212111030193728e-06, + "grad_norm": 1.3963671922683716, + "learning_rate": 0.00019959933222036728, + "loss": 0.5874, + "step": 12 + }, + { + "epoch": 4.563120282709872e-06, + "grad_norm": 1.4744153022766113, + "learning_rate": 0.00019953255425709515, + "loss": 0.6422, + "step": 13 + }, + { + "epoch": 4.9141295352260165e-06, + "grad_norm": 0.8640050888061523, + "learning_rate": 0.00019946577629382305, + "loss": 0.5064, + "step": 14 + }, + { + "epoch": 5.26513878774216e-06, + "grad_norm": 0.7137419581413269, + "learning_rate": 0.00019939899833055092, + "loss": 0.5218, + "step": 15 + }, + { + "epoch": 5.6161480402583045e-06, + "grad_norm": 0.7769026756286621, + "learning_rate": 0.00019933222036727882, + "loss": 0.5377, + "step": 16 + }, + { + "epoch": 5.967157292774448e-06, + "grad_norm": 0.7558479905128479, + "learning_rate": 0.0001992654424040067, + "loss": 0.5054, + "step": 17 + }, + { + "epoch": 6.3181665452905924e-06, + "grad_norm": 0.8237054347991943, + "learning_rate": 0.00019919866444073457, + "loss": 0.5094, + "step": 18 + }, + { + "epoch": 6.669175797806736e-06, + "grad_norm": 1.0375059843063354, + "learning_rate": 0.00019913188647746244, + "loss": 0.5751, + "step": 19 + }, + { + "epoch": 7.02018505032288e-06, + "grad_norm": 1.075869083404541, + "learning_rate": 0.00019906510851419034, + "loss": 0.594, + "step": 20 + }, + { + "epoch": 7.371194302839024e-06, + "grad_norm": 0.8041358590126038, + "learning_rate": 0.00019899833055091822, + "loss": 0.553, + "step": 21 + }, + { + "epoch": 7.722203555355168e-06, + "grad_norm": 0.9264736771583557, + "learning_rate": 0.0001989315525876461, + "loss": 0.5555, + "step": 22 + }, + { + "epoch": 8.073212807871313e-06, + "grad_norm": 1.0074031352996826, + "learning_rate": 0.00019886477462437396, + "loss": 0.5353, + "step": 23 + }, + { + "epoch": 8.424222060387455e-06, + "grad_norm": 0.8725020885467529, + "learning_rate": 0.00019879799666110183, + "loss": 0.5557, + "step": 24 + }, + { + "epoch": 8.7752313129036e-06, + "grad_norm": 0.8867582678794861, + "learning_rate": 0.00019873121869782974, + "loss": 0.5992, + "step": 25 + }, + { + "epoch": 9.126240565419744e-06, + "grad_norm": 0.9235608577728271, + "learning_rate": 0.0001986644407345576, + "loss": 0.516, + "step": 26 + }, + { + "epoch": 9.477249817935889e-06, + "grad_norm": 0.8653218150138855, + "learning_rate": 0.00019859766277128548, + "loss": 0.5249, + "step": 27 + }, + { + "epoch": 9.828259070452033e-06, + "grad_norm": 0.7479026913642883, + "learning_rate": 0.00019853088480801335, + "loss": 0.5037, + "step": 28 + }, + { + "epoch": 1.0179268322968176e-05, + "grad_norm": 0.9531452655792236, + "learning_rate": 0.00019846410684474123, + "loss": 0.5896, + "step": 29 + }, + { + "epoch": 1.053027757548432e-05, + "grad_norm": 1.1012492179870605, + "learning_rate": 0.00019839732888146913, + "loss": 0.5139, + "step": 30 + }, + { + "epoch": 1.0881286828000465e-05, + "grad_norm": 1.0198887586593628, + "learning_rate": 0.000198330550918197, + "loss": 0.5587, + "step": 31 + }, + { + "epoch": 1.1232296080516609e-05, + "grad_norm": 0.8081266283988953, + "learning_rate": 0.00019826377295492487, + "loss": 0.4762, + "step": 32 + }, + { + "epoch": 1.1583305333032752e-05, + "grad_norm": 1.1965891122817993, + "learning_rate": 0.00019819699499165277, + "loss": 0.5719, + "step": 33 + }, + { + "epoch": 1.1934314585548896e-05, + "grad_norm": 1.214903473854065, + "learning_rate": 0.00019813021702838065, + "loss": 0.5756, + "step": 34 + }, + { + "epoch": 1.228532383806504e-05, + "grad_norm": 0.8360006213188171, + "learning_rate": 0.00019806343906510852, + "loss": 0.5688, + "step": 35 + }, + { + "epoch": 1.2636333090581185e-05, + "grad_norm": 0.8328489065170288, + "learning_rate": 0.00019799666110183642, + "loss": 0.6418, + "step": 36 + }, + { + "epoch": 1.298734234309733e-05, + "grad_norm": 1.1427714824676514, + "learning_rate": 0.0001979298831385643, + "loss": 0.6531, + "step": 37 + }, + { + "epoch": 1.3338351595613472e-05, + "grad_norm": 1.0145376920700073, + "learning_rate": 0.00019786310517529217, + "loss": 0.6473, + "step": 38 + }, + { + "epoch": 1.3689360848129616e-05, + "grad_norm": 0.8427861928939819, + "learning_rate": 0.00019779632721202004, + "loss": 0.5882, + "step": 39 + }, + { + "epoch": 1.404037010064576e-05, + "grad_norm": 0.8792659044265747, + "learning_rate": 0.00019772954924874791, + "loss": 0.608, + "step": 40 + }, + { + "epoch": 1.4391379353161905e-05, + "grad_norm": 0.9338463544845581, + "learning_rate": 0.00019766277128547581, + "loss": 0.7118, + "step": 41 + }, + { + "epoch": 1.4742388605678048e-05, + "grad_norm": 0.7554420232772827, + "learning_rate": 0.0001975959933222037, + "loss": 0.5898, + "step": 42 + }, + { + "epoch": 1.5093397858194192e-05, + "grad_norm": 0.7700084447860718, + "learning_rate": 0.00019752921535893156, + "loss": 0.6466, + "step": 43 + }, + { + "epoch": 1.5444407110710337e-05, + "grad_norm": 0.8639333248138428, + "learning_rate": 0.00019746243739565943, + "loss": 0.7253, + "step": 44 + }, + { + "epoch": 1.579541636322648e-05, + "grad_norm": 0.7760612964630127, + "learning_rate": 0.0001973956594323873, + "loss": 0.7099, + "step": 45 + }, + { + "epoch": 1.6146425615742626e-05, + "grad_norm": 0.7319066524505615, + "learning_rate": 0.0001973288814691152, + "loss": 0.6664, + "step": 46 + }, + { + "epoch": 1.6497434868258768e-05, + "grad_norm": 0.7557100057601929, + "learning_rate": 0.00019726210350584308, + "loss": 0.6318, + "step": 47 + }, + { + "epoch": 1.684844412077491e-05, + "grad_norm": 0.6420389413833618, + "learning_rate": 0.00019719532554257095, + "loss": 0.6688, + "step": 48 + }, + { + "epoch": 1.7199453373291057e-05, + "grad_norm": 0.660383939743042, + "learning_rate": 0.00019712854757929883, + "loss": 0.6204, + "step": 49 + }, + { + "epoch": 1.75504626258072e-05, + "grad_norm": 0.5614909529685974, + "learning_rate": 0.00019706176961602673, + "loss": 0.664, + "step": 50 + }, + { + "epoch": 1.7901471878323346e-05, + "grad_norm": 0.502738356590271, + "learning_rate": 0.0001969949916527546, + "loss": 0.6918, + "step": 51 + }, + { + "epoch": 1.825248113083949e-05, + "grad_norm": 0.47578102350234985, + "learning_rate": 0.0001969282136894825, + "loss": 0.6747, + "step": 52 + }, + { + "epoch": 1.860349038335563e-05, + "grad_norm": 0.5528931617736816, + "learning_rate": 0.00019686143572621037, + "loss": 0.765, + "step": 53 + }, + { + "epoch": 1.8954499635871777e-05, + "grad_norm": 0.6176997423171997, + "learning_rate": 0.00019679465776293825, + "loss": 0.5959, + "step": 54 + }, + { + "epoch": 1.930550888838792e-05, + "grad_norm": 0.43425047397613525, + "learning_rate": 0.00019672787979966612, + "loss": 0.6437, + "step": 55 + }, + { + "epoch": 1.9656518140904066e-05, + "grad_norm": 0.5135884881019592, + "learning_rate": 0.000196661101836394, + "loss": 0.7019, + "step": 56 + }, + { + "epoch": 2.000752739342021e-05, + "grad_norm": 0.4628916084766388, + "learning_rate": 0.0001965943238731219, + "loss": 0.5722, + "step": 57 + }, + { + "epoch": 2.035853664593635e-05, + "grad_norm": 0.48201897740364075, + "learning_rate": 0.00019652754590984977, + "loss": 0.6288, + "step": 58 + }, + { + "epoch": 2.0709545898452498e-05, + "grad_norm": 0.5772811770439148, + "learning_rate": 0.00019646076794657764, + "loss": 0.6067, + "step": 59 + }, + { + "epoch": 2.106055515096864e-05, + "grad_norm": 0.4976802170276642, + "learning_rate": 0.0001963939899833055, + "loss": 0.4722, + "step": 60 + }, + { + "epoch": 2.1411564403484786e-05, + "grad_norm": 0.4842129051685333, + "learning_rate": 0.00019632721202003339, + "loss": 0.5876, + "step": 61 + }, + { + "epoch": 2.176257365600093e-05, + "grad_norm": 0.46149536967277527, + "learning_rate": 0.00019626043405676129, + "loss": 0.6373, + "step": 62 + }, + { + "epoch": 2.2113582908517072e-05, + "grad_norm": 0.47199445962905884, + "learning_rate": 0.00019619365609348916, + "loss": 0.5546, + "step": 63 + }, + { + "epoch": 2.2464592161033218e-05, + "grad_norm": 0.6109340190887451, + "learning_rate": 0.00019612687813021703, + "loss": 0.6069, + "step": 64 + }, + { + "epoch": 2.281560141354936e-05, + "grad_norm": 0.5529135465621948, + "learning_rate": 0.0001960601001669449, + "loss": 0.553, + "step": 65 + }, + { + "epoch": 2.3166610666065503e-05, + "grad_norm": 0.500245213508606, + "learning_rate": 0.00019599332220367278, + "loss": 0.6149, + "step": 66 + }, + { + "epoch": 2.351761991858165e-05, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00019592654424040068, + "loss": 0.6509, + "step": 67 + }, + { + "epoch": 2.3868629171097792e-05, + "grad_norm": 0.5308504104614258, + "learning_rate": 0.00019585976627712855, + "loss": 0.7017, + "step": 68 + }, + { + "epoch": 2.4219638423613938e-05, + "grad_norm": 0.5157874822616577, + "learning_rate": 0.00019579298831385645, + "loss": 0.7125, + "step": 69 + }, + { + "epoch": 2.457064767613008e-05, + "grad_norm": 0.47787800431251526, + "learning_rate": 0.00019572621035058433, + "loss": 0.5792, + "step": 70 + }, + { + "epoch": 2.4921656928646224e-05, + "grad_norm": 0.46792763471603394, + "learning_rate": 0.0001956594323873122, + "loss": 0.7, + "step": 71 + }, + { + "epoch": 2.527266618116237e-05, + "grad_norm": 0.5394675135612488, + "learning_rate": 0.00019559265442404007, + "loss": 0.5549, + "step": 72 + }, + { + "epoch": 2.5623675433678512e-05, + "grad_norm": 0.45065200328826904, + "learning_rate": 0.00019552587646076797, + "loss": 0.6663, + "step": 73 + }, + { + "epoch": 2.597468468619466e-05, + "grad_norm": 0.4026688039302826, + "learning_rate": 0.00019545909849749584, + "loss": 0.6315, + "step": 74 + }, + { + "epoch": 2.63256939387108e-05, + "grad_norm": 0.42353659868240356, + "learning_rate": 0.00019539232053422372, + "loss": 0.5419, + "step": 75 + }, + { + "epoch": 2.6676703191226944e-05, + "grad_norm": 0.45561954379081726, + "learning_rate": 0.0001953255425709516, + "loss": 0.6624, + "step": 76 + }, + { + "epoch": 2.702771244374309e-05, + "grad_norm": 0.3954075574874878, + "learning_rate": 0.00019525876460767946, + "loss": 0.5479, + "step": 77 + }, + { + "epoch": 2.7378721696259233e-05, + "grad_norm": 0.4994329512119293, + "learning_rate": 0.00019519198664440736, + "loss": 0.7224, + "step": 78 + }, + { + "epoch": 2.7729730948775375e-05, + "grad_norm": 0.41149672865867615, + "learning_rate": 0.00019512520868113524, + "loss": 0.5621, + "step": 79 + }, + { + "epoch": 2.808074020129152e-05, + "grad_norm": 0.4199008345603943, + "learning_rate": 0.0001950584307178631, + "loss": 0.7038, + "step": 80 + }, + { + "epoch": 2.8431749453807664e-05, + "grad_norm": 0.4378969371318817, + "learning_rate": 0.00019499165275459098, + "loss": 0.6654, + "step": 81 + }, + { + "epoch": 2.878275870632381e-05, + "grad_norm": 0.4653928279876709, + "learning_rate": 0.00019492487479131886, + "loss": 0.6241, + "step": 82 + }, + { + "epoch": 2.9133767958839953e-05, + "grad_norm": 0.5166454911231995, + "learning_rate": 0.00019485809682804673, + "loss": 0.5366, + "step": 83 + }, + { + "epoch": 2.9484777211356096e-05, + "grad_norm": 0.43180733919143677, + "learning_rate": 0.00019479131886477463, + "loss": 0.6178, + "step": 84 + }, + { + "epoch": 2.9835786463872242e-05, + "grad_norm": 0.44828200340270996, + "learning_rate": 0.0001947245409015025, + "loss": 0.6706, + "step": 85 + }, + { + "epoch": 3.0186795716388385e-05, + "grad_norm": 0.384175181388855, + "learning_rate": 0.0001946577629382304, + "loss": 0.5551, + "step": 86 + }, + { + "epoch": 3.053780496890453e-05, + "grad_norm": 0.4359772503376007, + "learning_rate": 0.00019459098497495828, + "loss": 0.5626, + "step": 87 + }, + { + "epoch": 3.0888814221420673e-05, + "grad_norm": 0.4177016615867615, + "learning_rate": 0.00019452420701168615, + "loss": 0.6023, + "step": 88 + }, + { + "epoch": 3.1239823473936816e-05, + "grad_norm": 0.43592438101768494, + "learning_rate": 0.00019445742904841405, + "loss": 0.682, + "step": 89 + }, + { + "epoch": 3.159083272645296e-05, + "grad_norm": 0.48027974367141724, + "learning_rate": 0.00019439065108514192, + "loss": 0.7596, + "step": 90 + }, + { + "epoch": 3.194184197896911e-05, + "grad_norm": 0.35989537835121155, + "learning_rate": 0.0001943238731218698, + "loss": 0.6018, + "step": 91 + }, + { + "epoch": 3.229285123148525e-05, + "grad_norm": 0.48477092385292053, + "learning_rate": 0.00019425709515859767, + "loss": 0.512, + "step": 92 + }, + { + "epoch": 3.2643860484001394e-05, + "grad_norm": 0.38858646154403687, + "learning_rate": 0.00019419031719532554, + "loss": 0.6371, + "step": 93 + }, + { + "epoch": 3.2994869736517536e-05, + "grad_norm": 0.5323147177696228, + "learning_rate": 0.00019412353923205344, + "loss": 0.5221, + "step": 94 + }, + { + "epoch": 3.334587898903368e-05, + "grad_norm": 0.3784274160861969, + "learning_rate": 0.00019405676126878132, + "loss": 0.6158, + "step": 95 + }, + { + "epoch": 3.369688824154982e-05, + "grad_norm": 0.4076334834098816, + "learning_rate": 0.0001939899833055092, + "loss": 0.5535, + "step": 96 + }, + { + "epoch": 3.404789749406597e-05, + "grad_norm": 0.43930479884147644, + "learning_rate": 0.00019392320534223706, + "loss": 0.6482, + "step": 97 + }, + { + "epoch": 3.4398906746582114e-05, + "grad_norm": 0.4266909658908844, + "learning_rate": 0.00019385642737896494, + "loss": 0.6, + "step": 98 + }, + { + "epoch": 3.474991599909826e-05, + "grad_norm": 0.45353513956069946, + "learning_rate": 0.0001937896494156928, + "loss": 0.6596, + "step": 99 + }, + { + "epoch": 3.51009252516144e-05, + "grad_norm": 0.3424838185310364, + "learning_rate": 0.0001937228714524207, + "loss": 0.555, + "step": 100 + }, + { + "epoch": 3.545193450413054e-05, + "grad_norm": 0.40126165747642517, + "learning_rate": 0.00019365609348914858, + "loss": 0.6921, + "step": 101 + }, + { + "epoch": 3.580294375664669e-05, + "grad_norm": 0.36572012305259705, + "learning_rate": 0.00019358931552587646, + "loss": 0.5485, + "step": 102 + }, + { + "epoch": 3.6153953009162834e-05, + "grad_norm": 0.3972407281398773, + "learning_rate": 0.00019352253756260436, + "loss": 0.5884, + "step": 103 + }, + { + "epoch": 3.650496226167898e-05, + "grad_norm": 0.3900579512119293, + "learning_rate": 0.00019345575959933223, + "loss": 0.6664, + "step": 104 + }, + { + "epoch": 3.685597151419512e-05, + "grad_norm": 0.31666621565818787, + "learning_rate": 0.00019338898163606013, + "loss": 0.5009, + "step": 105 + }, + { + "epoch": 3.720698076671126e-05, + "grad_norm": 0.5269597172737122, + "learning_rate": 0.000193322203672788, + "loss": 0.6292, + "step": 106 + }, + { + "epoch": 3.755799001922741e-05, + "grad_norm": 0.4645126163959503, + "learning_rate": 0.00019325542570951588, + "loss": 0.636, + "step": 107 + }, + { + "epoch": 3.7908999271743555e-05, + "grad_norm": 0.3900754153728485, + "learning_rate": 0.00019318864774624375, + "loss": 0.5367, + "step": 108 + }, + { + "epoch": 3.82600085242597e-05, + "grad_norm": 0.42533883452415466, + "learning_rate": 0.00019312186978297162, + "loss": 0.6862, + "step": 109 + }, + { + "epoch": 3.861101777677584e-05, + "grad_norm": 0.6809422969818115, + "learning_rate": 0.00019305509181969952, + "loss": 0.6434, + "step": 110 + }, + { + "epoch": 3.896202702929198e-05, + "grad_norm": 0.5127860307693481, + "learning_rate": 0.0001929883138564274, + "loss": 0.6266, + "step": 111 + }, + { + "epoch": 3.931303628180813e-05, + "grad_norm": 0.5254234671592712, + "learning_rate": 0.00019292153589315527, + "loss": 0.6982, + "step": 112 + }, + { + "epoch": 3.9664045534324275e-05, + "grad_norm": 0.3699031472206116, + "learning_rate": 0.00019285475792988314, + "loss": 0.6037, + "step": 113 + }, + { + "epoch": 4.001505478684042e-05, + "grad_norm": 0.3807130455970764, + "learning_rate": 0.00019278797996661101, + "loss": 0.5861, + "step": 114 + }, + { + "epoch": 4.036606403935656e-05, + "grad_norm": 0.4455645978450775, + "learning_rate": 0.0001927212020033389, + "loss": 0.5658, + "step": 115 + }, + { + "epoch": 4.07170732918727e-05, + "grad_norm": 0.3830210864543915, + "learning_rate": 0.0001926544240400668, + "loss": 0.606, + "step": 116 + }, + { + "epoch": 4.106808254438885e-05, + "grad_norm": 0.41419631242752075, + "learning_rate": 0.00019258764607679466, + "loss": 0.6095, + "step": 117 + }, + { + "epoch": 4.1419091796904995e-05, + "grad_norm": 0.3929574489593506, + "learning_rate": 0.00019252086811352253, + "loss": 0.6464, + "step": 118 + }, + { + "epoch": 4.177010104942114e-05, + "grad_norm": 0.35958629846572876, + "learning_rate": 0.0001924540901502504, + "loss": 0.5185, + "step": 119 + }, + { + "epoch": 4.212111030193728e-05, + "grad_norm": 0.3790556490421295, + "learning_rate": 0.0001923873121869783, + "loss": 0.5156, + "step": 120 + }, + { + "epoch": 4.2472119554453423e-05, + "grad_norm": 0.37452438473701477, + "learning_rate": 0.00019232053422370618, + "loss": 0.5711, + "step": 121 + }, + { + "epoch": 4.282312880696957e-05, + "grad_norm": 0.38976770639419556, + "learning_rate": 0.00019225375626043408, + "loss": 0.6075, + "step": 122 + }, + { + "epoch": 4.3174138059485716e-05, + "grad_norm": 0.4098513424396515, + "learning_rate": 0.00019218697829716195, + "loss": 0.5312, + "step": 123 + }, + { + "epoch": 4.352514731200186e-05, + "grad_norm": 0.33890047669410706, + "learning_rate": 0.00019212020033388983, + "loss": 0.4984, + "step": 124 + }, + { + "epoch": 4.3876156564518e-05, + "grad_norm": 0.49077001214027405, + "learning_rate": 0.0001920534223706177, + "loss": 0.7159, + "step": 125 + }, + { + "epoch": 4.4227165817034144e-05, + "grad_norm": 0.41653814911842346, + "learning_rate": 0.0001919866444073456, + "loss": 0.5642, + "step": 126 + }, + { + "epoch": 4.4578175069550286e-05, + "grad_norm": 0.45710283517837524, + "learning_rate": 0.00019191986644407347, + "loss": 0.6936, + "step": 127 + }, + { + "epoch": 4.4929184322066436e-05, + "grad_norm": 0.36976873874664307, + "learning_rate": 0.00019185308848080135, + "loss": 0.5407, + "step": 128 + }, + { + "epoch": 4.528019357458258e-05, + "grad_norm": 0.42852675914764404, + "learning_rate": 0.00019178631051752922, + "loss": 0.6731, + "step": 129 + }, + { + "epoch": 4.563120282709872e-05, + "grad_norm": 0.5426310300827026, + "learning_rate": 0.0001917195325542571, + "loss": 0.5775, + "step": 130 + }, + { + "epoch": 4.5982212079614864e-05, + "grad_norm": 0.38442543148994446, + "learning_rate": 0.00019165275459098497, + "loss": 0.5994, + "step": 131 + }, + { + "epoch": 4.633322133213101e-05, + "grad_norm": 0.4298035502433777, + "learning_rate": 0.00019158597662771287, + "loss": 0.5563, + "step": 132 + }, + { + "epoch": 4.6684230584647156e-05, + "grad_norm": 0.40397605299949646, + "learning_rate": 0.00019151919866444074, + "loss": 0.6924, + "step": 133 + }, + { + "epoch": 4.70352398371633e-05, + "grad_norm": 0.4338497519493103, + "learning_rate": 0.0001914524207011686, + "loss": 0.5739, + "step": 134 + }, + { + "epoch": 4.738624908967944e-05, + "grad_norm": 0.39713653922080994, + "learning_rate": 0.0001913856427378965, + "loss": 0.4529, + "step": 135 + }, + { + "epoch": 4.7737258342195584e-05, + "grad_norm": 0.31409478187561035, + "learning_rate": 0.0001913188647746244, + "loss": 0.562, + "step": 136 + }, + { + "epoch": 4.808826759471173e-05, + "grad_norm": 0.371624618768692, + "learning_rate": 0.00019125208681135226, + "loss": 0.5288, + "step": 137 + }, + { + "epoch": 4.8439276847227877e-05, + "grad_norm": 0.4600190818309784, + "learning_rate": 0.00019118530884808016, + "loss": 0.6215, + "step": 138 + }, + { + "epoch": 4.879028609974402e-05, + "grad_norm": 0.45351359248161316, + "learning_rate": 0.00019111853088480803, + "loss": 0.686, + "step": 139 + }, + { + "epoch": 4.914129535226016e-05, + "grad_norm": 0.42282962799072266, + "learning_rate": 0.0001910517529215359, + "loss": 0.5966, + "step": 140 + }, + { + "epoch": 4.9492304604776305e-05, + "grad_norm": 0.41479986906051636, + "learning_rate": 0.00019098497495826378, + "loss": 0.5948, + "step": 141 + }, + { + "epoch": 4.984331385729245e-05, + "grad_norm": 0.40453553199768066, + "learning_rate": 0.00019091819699499168, + "loss": 0.6411, + "step": 142 + }, + { + "epoch": 5.01943231098086e-05, + "grad_norm": 0.3939369320869446, + "learning_rate": 0.00019085141903171955, + "loss": 0.5513, + "step": 143 + }, + { + "epoch": 5.054533236232474e-05, + "grad_norm": 0.3700481653213501, + "learning_rate": 0.00019078464106844743, + "loss": 0.5459, + "step": 144 + }, + { + "epoch": 5.089634161484088e-05, + "grad_norm": 0.4377487897872925, + "learning_rate": 0.0001907178631051753, + "loss": 0.6076, + "step": 145 + }, + { + "epoch": 5.1247350867357025e-05, + "grad_norm": 0.37919673323631287, + "learning_rate": 0.00019065108514190317, + "loss": 0.5207, + "step": 146 + }, + { + "epoch": 5.159836011987317e-05, + "grad_norm": 0.3841630816459656, + "learning_rate": 0.00019058430717863107, + "loss": 0.614, + "step": 147 + }, + { + "epoch": 5.194936937238932e-05, + "grad_norm": 0.43541714549064636, + "learning_rate": 0.00019051752921535895, + "loss": 0.6283, + "step": 148 + }, + { + "epoch": 5.230037862490546e-05, + "grad_norm": 0.4853285253047943, + "learning_rate": 0.00019045075125208682, + "loss": 0.5807, + "step": 149 + }, + { + "epoch": 5.26513878774216e-05, + "grad_norm": 0.3572970926761627, + "learning_rate": 0.0001903839732888147, + "loss": 0.6866, + "step": 150 + }, + { + "epoch": 5.3002397129937745e-05, + "grad_norm": 0.3674347698688507, + "learning_rate": 0.00019031719532554257, + "loss": 0.5552, + "step": 151 + }, + { + "epoch": 5.335340638245389e-05, + "grad_norm": 0.37748461961746216, + "learning_rate": 0.00019025041736227044, + "loss": 0.6278, + "step": 152 + }, + { + "epoch": 5.370441563497003e-05, + "grad_norm": 0.3788503408432007, + "learning_rate": 0.00019018363939899834, + "loss": 0.622, + "step": 153 + }, + { + "epoch": 5.405542488748618e-05, + "grad_norm": 0.3736303150653839, + "learning_rate": 0.0001901168614357262, + "loss": 0.5822, + "step": 154 + }, + { + "epoch": 5.440643414000232e-05, + "grad_norm": 0.32680070400238037, + "learning_rate": 0.0001900500834724541, + "loss": 0.5715, + "step": 155 + }, + { + "epoch": 5.4757443392518466e-05, + "grad_norm": 0.34495192766189575, + "learning_rate": 0.00018998330550918199, + "loss": 0.6497, + "step": 156 + }, + { + "epoch": 5.510845264503461e-05, + "grad_norm": 0.4244193136692047, + "learning_rate": 0.00018991652754590986, + "loss": 0.5519, + "step": 157 + }, + { + "epoch": 5.545946189755075e-05, + "grad_norm": 0.4024031162261963, + "learning_rate": 0.00018984974958263776, + "loss": 0.5339, + "step": 158 + }, + { + "epoch": 5.58104711500669e-05, + "grad_norm": 0.46051299571990967, + "learning_rate": 0.00018978297161936563, + "loss": 0.5979, + "step": 159 + }, + { + "epoch": 5.616148040258304e-05, + "grad_norm": 0.49051615595817566, + "learning_rate": 0.0001897161936560935, + "loss": 0.5563, + "step": 160 + }, + { + "epoch": 5.6512489655099186e-05, + "grad_norm": 0.43045854568481445, + "learning_rate": 0.00018964941569282138, + "loss": 0.5984, + "step": 161 + }, + { + "epoch": 5.686349890761533e-05, + "grad_norm": 0.37778228521347046, + "learning_rate": 0.00018958263772954925, + "loss": 0.5955, + "step": 162 + }, + { + "epoch": 5.721450816013147e-05, + "grad_norm": 0.3736341893672943, + "learning_rate": 0.00018951585976627715, + "loss": 0.6438, + "step": 163 + }, + { + "epoch": 5.756551741264762e-05, + "grad_norm": 0.3940117061138153, + "learning_rate": 0.00018944908180300502, + "loss": 0.503, + "step": 164 + }, + { + "epoch": 5.7916526665163763e-05, + "grad_norm": 0.4193519055843353, + "learning_rate": 0.0001893823038397329, + "loss": 0.6324, + "step": 165 + }, + { + "epoch": 5.8267535917679906e-05, + "grad_norm": 0.34481996297836304, + "learning_rate": 0.00018931552587646077, + "loss": 0.5745, + "step": 166 + }, + { + "epoch": 5.861854517019605e-05, + "grad_norm": 0.38285771012306213, + "learning_rate": 0.00018924874791318864, + "loss": 0.639, + "step": 167 + }, + { + "epoch": 5.896955442271219e-05, + "grad_norm": 0.36933982372283936, + "learning_rate": 0.00018918196994991652, + "loss": 0.6681, + "step": 168 + }, + { + "epoch": 5.932056367522834e-05, + "grad_norm": 0.36970776319503784, + "learning_rate": 0.00018911519198664442, + "loss": 0.5626, + "step": 169 + }, + { + "epoch": 5.9671572927744484e-05, + "grad_norm": 0.38494783639907837, + "learning_rate": 0.0001890484140233723, + "loss": 0.6066, + "step": 170 + }, + { + "epoch": 6.0022582180260627e-05, + "grad_norm": 0.3446069061756134, + "learning_rate": 0.00018898163606010016, + "loss": 0.6354, + "step": 171 + }, + { + "epoch": 6.037359143277677e-05, + "grad_norm": 0.4466759264469147, + "learning_rate": 0.00018891485809682806, + "loss": 0.4737, + "step": 172 + }, + { + "epoch": 6.072460068529291e-05, + "grad_norm": 0.43630918860435486, + "learning_rate": 0.00018884808013355594, + "loss": 0.6839, + "step": 173 + }, + { + "epoch": 6.107560993780906e-05, + "grad_norm": 0.37083202600479126, + "learning_rate": 0.00018878130217028384, + "loss": 0.5372, + "step": 174 + }, + { + "epoch": 6.14266191903252e-05, + "grad_norm": 0.37066200375556946, + "learning_rate": 0.0001887145242070117, + "loss": 0.6653, + "step": 175 + }, + { + "epoch": 6.177762844284135e-05, + "grad_norm": 0.5191747546195984, + "learning_rate": 0.00018864774624373958, + "loss": 0.6677, + "step": 176 + }, + { + "epoch": 6.21286376953575e-05, + "grad_norm": 0.4235158860683441, + "learning_rate": 0.00018858096828046746, + "loss": 0.5971, + "step": 177 + }, + { + "epoch": 6.247964694787363e-05, + "grad_norm": 0.405074805021286, + "learning_rate": 0.00018851419031719533, + "loss": 0.5717, + "step": 178 + }, + { + "epoch": 6.283065620038978e-05, + "grad_norm": 0.45817336440086365, + "learning_rate": 0.00018844741235392323, + "loss": 0.5878, + "step": 179 + }, + { + "epoch": 6.318166545290592e-05, + "grad_norm": 0.6313037276268005, + "learning_rate": 0.0001883806343906511, + "loss": 0.62, + "step": 180 + }, + { + "epoch": 6.353267470542207e-05, + "grad_norm": 0.41896742582321167, + "learning_rate": 0.00018831385642737898, + "loss": 0.5565, + "step": 181 + }, + { + "epoch": 6.388368395793822e-05, + "grad_norm": 0.4143432676792145, + "learning_rate": 0.00018824707846410685, + "loss": 0.5552, + "step": 182 + }, + { + "epoch": 6.423469321045435e-05, + "grad_norm": 0.38745641708374023, + "learning_rate": 0.00018818030050083472, + "loss": 0.5949, + "step": 183 + }, + { + "epoch": 6.45857024629705e-05, + "grad_norm": 0.7472612261772156, + "learning_rate": 0.0001881135225375626, + "loss": 0.6708, + "step": 184 + }, + { + "epoch": 6.493671171548664e-05, + "grad_norm": 0.4416198432445526, + "learning_rate": 0.0001880467445742905, + "loss": 0.6069, + "step": 185 + }, + { + "epoch": 6.528772096800279e-05, + "grad_norm": 0.4312993884086609, + "learning_rate": 0.00018797996661101837, + "loss": 0.5778, + "step": 186 + }, + { + "epoch": 6.563873022051894e-05, + "grad_norm": 0.4524860978126526, + "learning_rate": 0.00018791318864774624, + "loss": 0.5091, + "step": 187 + }, + { + "epoch": 6.598973947303507e-05, + "grad_norm": 0.4320828914642334, + "learning_rate": 0.00018784641068447412, + "loss": 0.6557, + "step": 188 + }, + { + "epoch": 6.634074872555122e-05, + "grad_norm": 0.6967452168464661, + "learning_rate": 0.00018777963272120202, + "loss": 0.612, + "step": 189 + }, + { + "epoch": 6.669175797806736e-05, + "grad_norm": 0.4389924705028534, + "learning_rate": 0.0001877128547579299, + "loss": 0.6271, + "step": 190 + }, + { + "epoch": 6.704276723058351e-05, + "grad_norm": 0.3693922162055969, + "learning_rate": 0.0001876460767946578, + "loss": 0.6715, + "step": 191 + }, + { + "epoch": 6.739377648309964e-05, + "grad_norm": 0.32230404019355774, + "learning_rate": 0.00018757929883138566, + "loss": 0.6344, + "step": 192 + }, + { + "epoch": 6.774478573561579e-05, + "grad_norm": 0.4440002143383026, + "learning_rate": 0.00018751252086811354, + "loss": 0.6671, + "step": 193 + }, + { + "epoch": 6.809579498813194e-05, + "grad_norm": 0.5676587820053101, + "learning_rate": 0.0001874457429048414, + "loss": 0.6818, + "step": 194 + }, + { + "epoch": 6.844680424064808e-05, + "grad_norm": 0.36207348108291626, + "learning_rate": 0.0001873789649415693, + "loss": 0.5029, + "step": 195 + }, + { + "epoch": 6.879781349316423e-05, + "grad_norm": 0.35714131593704224, + "learning_rate": 0.00018731218697829718, + "loss": 0.6127, + "step": 196 + }, + { + "epoch": 6.914882274568036e-05, + "grad_norm": 0.4285273551940918, + "learning_rate": 0.00018724540901502506, + "loss": 0.6355, + "step": 197 + }, + { + "epoch": 6.949983199819651e-05, + "grad_norm": 0.42585939168930054, + "learning_rate": 0.00018717863105175293, + "loss": 0.6302, + "step": 198 + }, + { + "epoch": 6.985084125071266e-05, + "grad_norm": 0.524303138256073, + "learning_rate": 0.0001871118530884808, + "loss": 0.6683, + "step": 199 + }, + { + "epoch": 7.02018505032288e-05, + "grad_norm": 0.39635923504829407, + "learning_rate": 0.00018704507512520868, + "loss": 0.6694, + "step": 200 + }, + { + "epoch": 7.055285975574495e-05, + "grad_norm": 0.39712437987327576, + "learning_rate": 0.00018697829716193658, + "loss": 0.5794, + "step": 201 + }, + { + "epoch": 7.090386900826108e-05, + "grad_norm": 0.4115397334098816, + "learning_rate": 0.00018691151919866445, + "loss": 0.5579, + "step": 202 + }, + { + "epoch": 7.125487826077723e-05, + "grad_norm": 0.4776385724544525, + "learning_rate": 0.00018684474123539232, + "loss": 0.5589, + "step": 203 + }, + { + "epoch": 7.160588751329338e-05, + "grad_norm": 0.35574638843536377, + "learning_rate": 0.0001867779632721202, + "loss": 0.5311, + "step": 204 + }, + { + "epoch": 7.195689676580952e-05, + "grad_norm": 0.44872432947158813, + "learning_rate": 0.00018671118530884807, + "loss": 0.635, + "step": 205 + }, + { + "epoch": 7.230790601832567e-05, + "grad_norm": 0.3511079251766205, + "learning_rate": 0.00018664440734557597, + "loss": 0.5317, + "step": 206 + }, + { + "epoch": 7.26589152708418e-05, + "grad_norm": 0.39862194657325745, + "learning_rate": 0.00018657762938230384, + "loss": 0.6653, + "step": 207 + }, + { + "epoch": 7.300992452335795e-05, + "grad_norm": 0.4046575725078583, + "learning_rate": 0.00018651085141903174, + "loss": 0.6065, + "step": 208 + }, + { + "epoch": 7.33609337758741e-05, + "grad_norm": 0.4231868088245392, + "learning_rate": 0.00018644407345575962, + "loss": 0.7078, + "step": 209 + }, + { + "epoch": 7.371194302839024e-05, + "grad_norm": 0.364700049161911, + "learning_rate": 0.0001863772954924875, + "loss": 0.6309, + "step": 210 + }, + { + "epoch": 7.406295228090639e-05, + "grad_norm": 0.5385531187057495, + "learning_rate": 0.0001863105175292154, + "loss": 0.4233, + "step": 211 + }, + { + "epoch": 7.441396153342252e-05, + "grad_norm": 0.39415115118026733, + "learning_rate": 0.00018624373956594326, + "loss": 0.5928, + "step": 212 + }, + { + "epoch": 7.476497078593867e-05, + "grad_norm": 0.6021363735198975, + "learning_rate": 0.00018617696160267113, + "loss": 0.6611, + "step": 213 + }, + { + "epoch": 7.511598003845482e-05, + "grad_norm": 0.3709903061389923, + "learning_rate": 0.000186110183639399, + "loss": 0.6136, + "step": 214 + }, + { + "epoch": 7.546698929097096e-05, + "grad_norm": 0.36710435152053833, + "learning_rate": 0.00018604340567612688, + "loss": 0.5267, + "step": 215 + }, + { + "epoch": 7.581799854348711e-05, + "grad_norm": 0.4379352033138275, + "learning_rate": 0.00018597662771285475, + "loss": 0.6429, + "step": 216 + }, + { + "epoch": 7.616900779600325e-05, + "grad_norm": 0.3408482074737549, + "learning_rate": 0.00018590984974958265, + "loss": 0.5379, + "step": 217 + }, + { + "epoch": 7.65200170485194e-05, + "grad_norm": 0.4487043023109436, + "learning_rate": 0.00018584307178631053, + "loss": 0.6582, + "step": 218 + }, + { + "epoch": 7.687102630103554e-05, + "grad_norm": 0.42003679275512695, + "learning_rate": 0.0001857762938230384, + "loss": 0.5712, + "step": 219 + }, + { + "epoch": 7.722203555355168e-05, + "grad_norm": 0.4698665738105774, + "learning_rate": 0.00018570951585976627, + "loss": 0.5715, + "step": 220 + }, + { + "epoch": 7.757304480606783e-05, + "grad_norm": 0.3777780830860138, + "learning_rate": 0.00018564273789649415, + "loss": 0.4667, + "step": 221 + }, + { + "epoch": 7.792405405858397e-05, + "grad_norm": 0.36794212460517883, + "learning_rate": 0.00018557595993322205, + "loss": 0.5382, + "step": 222 + }, + { + "epoch": 7.827506331110012e-05, + "grad_norm": 0.4582989513874054, + "learning_rate": 0.00018550918196994992, + "loss": 0.6437, + "step": 223 + }, + { + "epoch": 7.862607256361626e-05, + "grad_norm": 0.4065852761268616, + "learning_rate": 0.0001854424040066778, + "loss": 0.6928, + "step": 224 + }, + { + "epoch": 7.89770818161324e-05, + "grad_norm": 0.3857649564743042, + "learning_rate": 0.0001853756260434057, + "loss": 0.5405, + "step": 225 + }, + { + "epoch": 7.932809106864855e-05, + "grad_norm": 0.40056589245796204, + "learning_rate": 0.00018530884808013357, + "loss": 0.6425, + "step": 226 + }, + { + "epoch": 7.967910032116469e-05, + "grad_norm": 0.43137016892433167, + "learning_rate": 0.00018524207011686147, + "loss": 0.5001, + "step": 227 + }, + { + "epoch": 8.003010957368084e-05, + "grad_norm": 0.3723987340927124, + "learning_rate": 0.00018517529215358934, + "loss": 0.5118, + "step": 228 + }, + { + "epoch": 8.038111882619698e-05, + "grad_norm": 0.34196361899375916, + "learning_rate": 0.00018510851419031721, + "loss": 0.5468, + "step": 229 + }, + { + "epoch": 8.073212807871312e-05, + "grad_norm": 0.4319117069244385, + "learning_rate": 0.0001850417362270451, + "loss": 0.5703, + "step": 230 + }, + { + "epoch": 8.108313733122927e-05, + "grad_norm": 0.4467247724533081, + "learning_rate": 0.00018497495826377296, + "loss": 0.6536, + "step": 231 + }, + { + "epoch": 8.14341465837454e-05, + "grad_norm": 0.3569909632205963, + "learning_rate": 0.00018490818030050083, + "loss": 0.5335, + "step": 232 + }, + { + "epoch": 8.178515583626156e-05, + "grad_norm": 0.33486437797546387, + "learning_rate": 0.00018484140233722873, + "loss": 0.6803, + "step": 233 + }, + { + "epoch": 8.21361650887777e-05, + "grad_norm": 0.3783140480518341, + "learning_rate": 0.0001847746243739566, + "loss": 0.6361, + "step": 234 + }, + { + "epoch": 8.248717434129384e-05, + "grad_norm": 0.4844662547111511, + "learning_rate": 0.00018470784641068448, + "loss": 0.5322, + "step": 235 + }, + { + "epoch": 8.283818359380999e-05, + "grad_norm": 0.508406400680542, + "learning_rate": 0.00018464106844741235, + "loss": 0.6676, + "step": 236 + }, + { + "epoch": 8.318919284632613e-05, + "grad_norm": 0.3710225820541382, + "learning_rate": 0.00018457429048414023, + "loss": 0.6656, + "step": 237 + }, + { + "epoch": 8.354020209884228e-05, + "grad_norm": 0.3757292628288269, + "learning_rate": 0.00018450751252086813, + "loss": 0.6095, + "step": 238 + }, + { + "epoch": 8.389121135135843e-05, + "grad_norm": 0.40651261806488037, + "learning_rate": 0.000184440734557596, + "loss": 0.6626, + "step": 239 + }, + { + "epoch": 8.424222060387456e-05, + "grad_norm": 0.40700778365135193, + "learning_rate": 0.00018437395659432387, + "loss": 0.5328, + "step": 240 + }, + { + "epoch": 8.459322985639071e-05, + "grad_norm": 0.5067440867424011, + "learning_rate": 0.00018430717863105175, + "loss": 0.4811, + "step": 241 + }, + { + "epoch": 8.494423910890685e-05, + "grad_norm": 0.3934602737426758, + "learning_rate": 0.00018424040066777965, + "loss": 0.5691, + "step": 242 + }, + { + "epoch": 8.5295248361423e-05, + "grad_norm": 0.3360019624233246, + "learning_rate": 0.00018417362270450752, + "loss": 0.5542, + "step": 243 + }, + { + "epoch": 8.564625761393915e-05, + "grad_norm": 0.4023631513118744, + "learning_rate": 0.00018410684474123542, + "loss": 0.5192, + "step": 244 + }, + { + "epoch": 8.599726686645528e-05, + "grad_norm": 0.41704171895980835, + "learning_rate": 0.0001840400667779633, + "loss": 0.5018, + "step": 245 + }, + { + "epoch": 8.634827611897143e-05, + "grad_norm": 0.361977756023407, + "learning_rate": 0.00018397328881469117, + "loss": 0.6193, + "step": 246 + }, + { + "epoch": 8.669928537148757e-05, + "grad_norm": 0.37774717807769775, + "learning_rate": 0.00018390651085141904, + "loss": 0.5552, + "step": 247 + }, + { + "epoch": 8.705029462400372e-05, + "grad_norm": 0.3408471941947937, + "learning_rate": 0.0001838397328881469, + "loss": 0.5876, + "step": 248 + }, + { + "epoch": 8.740130387651985e-05, + "grad_norm": 0.3892226815223694, + "learning_rate": 0.0001837729549248748, + "loss": 0.4227, + "step": 249 + }, + { + "epoch": 8.7752313129036e-05, + "grad_norm": 0.5315036177635193, + "learning_rate": 0.00018370617696160269, + "loss": 0.5826, + "step": 250 + }, + { + "epoch": 8.810332238155215e-05, + "grad_norm": 0.35433024168014526, + "learning_rate": 0.00018363939899833056, + "loss": 0.5992, + "step": 251 + }, + { + "epoch": 8.845433163406829e-05, + "grad_norm": 0.34777382016181946, + "learning_rate": 0.00018357262103505843, + "loss": 0.4973, + "step": 252 + }, + { + "epoch": 8.880534088658444e-05, + "grad_norm": 0.3936387002468109, + "learning_rate": 0.0001835058430717863, + "loss": 0.6254, + "step": 253 + }, + { + "epoch": 8.915635013910057e-05, + "grad_norm": 0.4009217917919159, + "learning_rate": 0.0001834390651085142, + "loss": 0.4843, + "step": 254 + }, + { + "epoch": 8.950735939161672e-05, + "grad_norm": 0.4863683879375458, + "learning_rate": 0.00018337228714524208, + "loss": 0.5204, + "step": 255 + }, + { + "epoch": 8.985836864413287e-05, + "grad_norm": 0.6100988984107971, + "learning_rate": 0.00018330550918196995, + "loss": 0.7296, + "step": 256 + }, + { + "epoch": 9.020937789664901e-05, + "grad_norm": 0.40949374437332153, + "learning_rate": 0.00018323873121869782, + "loss": 0.5707, + "step": 257 + }, + { + "epoch": 9.056038714916516e-05, + "grad_norm": 0.47316402196884155, + "learning_rate": 0.0001831719532554257, + "loss": 0.6655, + "step": 258 + }, + { + "epoch": 9.091139640168129e-05, + "grad_norm": 0.4053696393966675, + "learning_rate": 0.0001831051752921536, + "loss": 0.5822, + "step": 259 + }, + { + "epoch": 9.126240565419744e-05, + "grad_norm": 0.4582972228527069, + "learning_rate": 0.00018303839732888147, + "loss": 0.5475, + "step": 260 + }, + { + "epoch": 9.161341490671359e-05, + "grad_norm": 0.38666802644729614, + "learning_rate": 0.00018297161936560937, + "loss": 0.4744, + "step": 261 + }, + { + "epoch": 9.196442415922973e-05, + "grad_norm": 0.31954991817474365, + "learning_rate": 0.00018290484140233724, + "loss": 0.6337, + "step": 262 + }, + { + "epoch": 9.231543341174588e-05, + "grad_norm": 0.3590424358844757, + "learning_rate": 0.00018283806343906512, + "loss": 0.5683, + "step": 263 + }, + { + "epoch": 9.266644266426201e-05, + "grad_norm": 0.4042195975780487, + "learning_rate": 0.000182771285475793, + "loss": 0.6142, + "step": 264 + }, + { + "epoch": 9.301745191677816e-05, + "grad_norm": 0.3474234342575073, + "learning_rate": 0.0001827045075125209, + "loss": 0.6035, + "step": 265 + }, + { + "epoch": 9.336846116929431e-05, + "grad_norm": 0.337091326713562, + "learning_rate": 0.00018263772954924876, + "loss": 0.6107, + "step": 266 + }, + { + "epoch": 9.371947042181045e-05, + "grad_norm": 0.3313732445240021, + "learning_rate": 0.00018257095158597664, + "loss": 0.6491, + "step": 267 + }, + { + "epoch": 9.40704796743266e-05, + "grad_norm": 0.3931679129600525, + "learning_rate": 0.0001825041736227045, + "loss": 0.5492, + "step": 268 + }, + { + "epoch": 9.442148892684273e-05, + "grad_norm": 0.5848420262336731, + "learning_rate": 0.00018243739565943238, + "loss": 0.7091, + "step": 269 + }, + { + "epoch": 9.477249817935888e-05, + "grad_norm": 0.4851846992969513, + "learning_rate": 0.00018237061769616028, + "loss": 0.5856, + "step": 270 + }, + { + "epoch": 9.512350743187503e-05, + "grad_norm": 0.3434993326663971, + "learning_rate": 0.00018230383973288816, + "loss": 0.5085, + "step": 271 + }, + { + "epoch": 9.547451668439117e-05, + "grad_norm": 0.2978988587856293, + "learning_rate": 0.00018223706176961603, + "loss": 0.481, + "step": 272 + }, + { + "epoch": 9.582552593690732e-05, + "grad_norm": 0.34215858578681946, + "learning_rate": 0.0001821702838063439, + "loss": 0.5723, + "step": 273 + }, + { + "epoch": 9.617653518942345e-05, + "grad_norm": 0.43445509672164917, + "learning_rate": 0.00018210350584307178, + "loss": 0.5691, + "step": 274 + }, + { + "epoch": 9.65275444419396e-05, + "grad_norm": 0.36094945669174194, + "learning_rate": 0.00018203672787979968, + "loss": 0.5543, + "step": 275 + }, + { + "epoch": 9.687855369445575e-05, + "grad_norm": 0.386106014251709, + "learning_rate": 0.00018196994991652755, + "loss": 0.5561, + "step": 276 + }, + { + "epoch": 9.722956294697189e-05, + "grad_norm": 0.36676689982414246, + "learning_rate": 0.00018190317195325542, + "loss": 0.5479, + "step": 277 + }, + { + "epoch": 9.758057219948804e-05, + "grad_norm": 0.37988394498825073, + "learning_rate": 0.00018183639398998332, + "loss": 0.5772, + "step": 278 + }, + { + "epoch": 9.793158145200417e-05, + "grad_norm": 0.4024789035320282, + "learning_rate": 0.0001817696160267112, + "loss": 0.6065, + "step": 279 + }, + { + "epoch": 9.828259070452032e-05, + "grad_norm": 0.3697255551815033, + "learning_rate": 0.0001817028380634391, + "loss": 0.5021, + "step": 280 + }, + { + "epoch": 9.863359995703647e-05, + "grad_norm": 0.43579426407814026, + "learning_rate": 0.00018163606010016697, + "loss": 0.555, + "step": 281 + }, + { + "epoch": 9.898460920955261e-05, + "grad_norm": 0.4760832190513611, + "learning_rate": 0.00018156928213689484, + "loss": 0.6438, + "step": 282 + }, + { + "epoch": 9.933561846206876e-05, + "grad_norm": 0.45258408784866333, + "learning_rate": 0.00018150250417362272, + "loss": 0.4717, + "step": 283 + }, + { + "epoch": 9.96866277145849e-05, + "grad_norm": 0.428108274936676, + "learning_rate": 0.0001814357262103506, + "loss": 0.6029, + "step": 284 + }, + { + "epoch": 0.00010003763696710104, + "grad_norm": 0.3999852240085602, + "learning_rate": 0.00018136894824707846, + "loss": 0.4524, + "step": 285 + }, + { + "epoch": 0.0001003886462196172, + "grad_norm": 0.44319403171539307, + "learning_rate": 0.00018130217028380636, + "loss": 0.6619, + "step": 286 + }, + { + "epoch": 0.00010073965547213333, + "grad_norm": 0.43008357286453247, + "learning_rate": 0.00018123539232053424, + "loss": 0.6105, + "step": 287 + }, + { + "epoch": 0.00010109066472464948, + "grad_norm": 0.38037821650505066, + "learning_rate": 0.0001811686143572621, + "loss": 0.6649, + "step": 288 + }, + { + "epoch": 0.00010144167397716562, + "grad_norm": 0.3713517487049103, + "learning_rate": 0.00018110183639398998, + "loss": 0.6381, + "step": 289 + }, + { + "epoch": 0.00010179268322968176, + "grad_norm": 0.3437170386314392, + "learning_rate": 0.00018103505843071786, + "loss": 0.4563, + "step": 290 + }, + { + "epoch": 0.00010214369248219791, + "grad_norm": 0.3661468029022217, + "learning_rate": 0.00018096828046744576, + "loss": 0.606, + "step": 291 + }, + { + "epoch": 0.00010249470173471405, + "grad_norm": 0.36346200108528137, + "learning_rate": 0.00018090150250417363, + "loss": 0.5895, + "step": 292 + }, + { + "epoch": 0.0001028457109872302, + "grad_norm": 0.31052225828170776, + "learning_rate": 0.0001808347245409015, + "loss": 0.4409, + "step": 293 + }, + { + "epoch": 0.00010319672023974634, + "grad_norm": 0.37012970447540283, + "learning_rate": 0.00018076794657762938, + "loss": 0.505, + "step": 294 + }, + { + "epoch": 0.00010354772949226248, + "grad_norm": 0.3958667814731598, + "learning_rate": 0.00018070116861435728, + "loss": 0.5371, + "step": 295 + }, + { + "epoch": 0.00010389873874477863, + "grad_norm": 0.4892179071903229, + "learning_rate": 0.00018063439065108515, + "loss": 0.6737, + "step": 296 + }, + { + "epoch": 0.00010424974799729477, + "grad_norm": 0.41874751448631287, + "learning_rate": 0.00018056761268781305, + "loss": 0.651, + "step": 297 + }, + { + "epoch": 0.00010460075724981092, + "grad_norm": 0.4167911410331726, + "learning_rate": 0.00018050083472454092, + "loss": 0.5531, + "step": 298 + }, + { + "epoch": 0.00010495176650232706, + "grad_norm": 0.3758225440979004, + "learning_rate": 0.0001804340567612688, + "loss": 0.6285, + "step": 299 + }, + { + "epoch": 0.0001053027757548432, + "grad_norm": 0.3688598573207855, + "learning_rate": 0.00018036727879799667, + "loss": 0.5219, + "step": 300 + }, + { + "epoch": 0.00010565378500735934, + "grad_norm": 0.3501751124858856, + "learning_rate": 0.00018030050083472454, + "loss": 0.6351, + "step": 301 + }, + { + "epoch": 0.00010600479425987549, + "grad_norm": 0.42876511812210083, + "learning_rate": 0.00018023372287145244, + "loss": 0.544, + "step": 302 + }, + { + "epoch": 0.00010635580351239164, + "grad_norm": 0.47046172618865967, + "learning_rate": 0.00018016694490818031, + "loss": 0.6304, + "step": 303 + }, + { + "epoch": 0.00010670681276490778, + "grad_norm": 0.402271032333374, + "learning_rate": 0.0001801001669449082, + "loss": 0.5039, + "step": 304 + }, + { + "epoch": 0.00010705782201742393, + "grad_norm": 0.41232413053512573, + "learning_rate": 0.00018003338898163606, + "loss": 0.5892, + "step": 305 + }, + { + "epoch": 0.00010740883126994006, + "grad_norm": 0.3628154993057251, + "learning_rate": 0.00017996661101836393, + "loss": 0.5737, + "step": 306 + }, + { + "epoch": 0.00010775984052245621, + "grad_norm": 0.4291020631790161, + "learning_rate": 0.00017989983305509183, + "loss": 0.6597, + "step": 307 + }, + { + "epoch": 0.00010811084977497236, + "grad_norm": 0.33218181133270264, + "learning_rate": 0.0001798330550918197, + "loss": 0.5726, + "step": 308 + }, + { + "epoch": 0.0001084618590274885, + "grad_norm": 0.3439387381076813, + "learning_rate": 0.00017976627712854758, + "loss": 0.5615, + "step": 309 + }, + { + "epoch": 0.00010881286828000465, + "grad_norm": 0.3523644208908081, + "learning_rate": 0.00017969949916527545, + "loss": 0.4968, + "step": 310 + }, + { + "epoch": 0.00010916387753252078, + "grad_norm": 0.4045630991458893, + "learning_rate": 0.00017963272120200333, + "loss": 0.6425, + "step": 311 + }, + { + "epoch": 0.00010951488678503693, + "grad_norm": 0.3726767599582672, + "learning_rate": 0.00017956594323873123, + "loss": 0.6575, + "step": 312 + }, + { + "epoch": 0.00010986589603755308, + "grad_norm": 0.32131972908973694, + "learning_rate": 0.0001794991652754591, + "loss": 0.5146, + "step": 313 + }, + { + "epoch": 0.00011021690529006922, + "grad_norm": 0.5013764500617981, + "learning_rate": 0.000179432387312187, + "loss": 0.53, + "step": 314 + }, + { + "epoch": 0.00011056791454258537, + "grad_norm": 0.36830246448516846, + "learning_rate": 0.00017936560934891487, + "loss": 0.6291, + "step": 315 + }, + { + "epoch": 0.0001109189237951015, + "grad_norm": 0.3587378263473511, + "learning_rate": 0.00017929883138564275, + "loss": 0.4954, + "step": 316 + }, + { + "epoch": 0.00011126993304761765, + "grad_norm": 0.3480195105075836, + "learning_rate": 0.00017923205342237062, + "loss": 0.606, + "step": 317 + }, + { + "epoch": 0.0001116209423001338, + "grad_norm": 0.38415858149528503, + "learning_rate": 0.00017916527545909852, + "loss": 0.7281, + "step": 318 + }, + { + "epoch": 0.00011197195155264994, + "grad_norm": 0.35853826999664307, + "learning_rate": 0.0001790984974958264, + "loss": 0.5851, + "step": 319 + }, + { + "epoch": 0.00011232296080516609, + "grad_norm": 0.42092210054397583, + "learning_rate": 0.00017903171953255427, + "loss": 0.5324, + "step": 320 + }, + { + "epoch": 0.00011267397005768222, + "grad_norm": 0.34538987278938293, + "learning_rate": 0.00017896494156928214, + "loss": 0.6387, + "step": 321 + }, + { + "epoch": 0.00011302497931019837, + "grad_norm": 0.38299745321273804, + "learning_rate": 0.00017889816360601, + "loss": 0.6013, + "step": 322 + }, + { + "epoch": 0.00011337598856271452, + "grad_norm": 0.32100436091423035, + "learning_rate": 0.0001788313856427379, + "loss": 0.4627, + "step": 323 + }, + { + "epoch": 0.00011372699781523066, + "grad_norm": 0.3458426594734192, + "learning_rate": 0.0001787646076794658, + "loss": 0.5865, + "step": 324 + }, + { + "epoch": 0.0001140780070677468, + "grad_norm": 0.33228665590286255, + "learning_rate": 0.00017869782971619366, + "loss": 0.4611, + "step": 325 + }, + { + "epoch": 0.00011442901632026294, + "grad_norm": 0.38747021555900574, + "learning_rate": 0.00017863105175292153, + "loss": 0.5777, + "step": 326 + }, + { + "epoch": 0.00011478002557277909, + "grad_norm": 0.3888608515262604, + "learning_rate": 0.0001785642737896494, + "loss": 0.5664, + "step": 327 + }, + { + "epoch": 0.00011513103482529524, + "grad_norm": 0.4084737002849579, + "learning_rate": 0.0001784974958263773, + "loss": 0.5939, + "step": 328 + }, + { + "epoch": 0.00011548204407781138, + "grad_norm": 0.4964492917060852, + "learning_rate": 0.00017843071786310518, + "loss": 0.6256, + "step": 329 + }, + { + "epoch": 0.00011583305333032753, + "grad_norm": 0.37329745292663574, + "learning_rate": 0.00017836393989983305, + "loss": 0.5388, + "step": 330 + }, + { + "epoch": 0.00011618406258284366, + "grad_norm": 0.37680140137672424, + "learning_rate": 0.00017829716193656095, + "loss": 0.6203, + "step": 331 + }, + { + "epoch": 0.00011653507183535981, + "grad_norm": 0.4162957966327667, + "learning_rate": 0.00017823038397328883, + "loss": 0.6478, + "step": 332 + }, + { + "epoch": 0.00011688608108787596, + "grad_norm": 0.3473896086215973, + "learning_rate": 0.0001781636060100167, + "loss": 0.589, + "step": 333 + }, + { + "epoch": 0.0001172370903403921, + "grad_norm": 0.4039511978626251, + "learning_rate": 0.0001780968280467446, + "loss": 0.5681, + "step": 334 + }, + { + "epoch": 0.00011758809959290825, + "grad_norm": 0.3135715425014496, + "learning_rate": 0.00017803005008347247, + "loss": 0.5069, + "step": 335 + }, + { + "epoch": 0.00011793910884542438, + "grad_norm": 0.4296559989452362, + "learning_rate": 0.00017796327212020035, + "loss": 0.5413, + "step": 336 + }, + { + "epoch": 0.00011829011809794053, + "grad_norm": 0.4197536110877991, + "learning_rate": 0.00017789649415692822, + "loss": 0.694, + "step": 337 + }, + { + "epoch": 0.00011864112735045668, + "grad_norm": 0.3633468449115753, + "learning_rate": 0.0001778297161936561, + "loss": 0.5475, + "step": 338 + }, + { + "epoch": 0.00011899213660297282, + "grad_norm": 0.2867147922515869, + "learning_rate": 0.000177762938230384, + "loss": 0.485, + "step": 339 + }, + { + "epoch": 0.00011934314585548897, + "grad_norm": 0.3445490300655365, + "learning_rate": 0.00017769616026711187, + "loss": 0.6304, + "step": 340 + }, + { + "epoch": 0.0001196941551080051, + "grad_norm": 0.31692221760749817, + "learning_rate": 0.00017762938230383974, + "loss": 0.5804, + "step": 341 + }, + { + "epoch": 0.00012004516436052125, + "grad_norm": 0.31391167640686035, + "learning_rate": 0.0001775626043405676, + "loss": 0.5945, + "step": 342 + }, + { + "epoch": 0.0001203961736130374, + "grad_norm": 0.3484472632408142, + "learning_rate": 0.00017749582637729548, + "loss": 0.6577, + "step": 343 + }, + { + "epoch": 0.00012074718286555354, + "grad_norm": 0.37430596351623535, + "learning_rate": 0.00017742904841402339, + "loss": 0.6854, + "step": 344 + }, + { + "epoch": 0.00012109819211806969, + "grad_norm": 0.34305211901664734, + "learning_rate": 0.00017736227045075126, + "loss": 0.5123, + "step": 345 + }, + { + "epoch": 0.00012144920137058582, + "grad_norm": 0.3398534059524536, + "learning_rate": 0.00017729549248747913, + "loss": 0.5602, + "step": 346 + }, + { + "epoch": 0.00012180021062310197, + "grad_norm": 0.4278014600276947, + "learning_rate": 0.000177228714524207, + "loss": 0.5152, + "step": 347 + }, + { + "epoch": 0.00012215121987561812, + "grad_norm": 0.4011085629463196, + "learning_rate": 0.0001771619365609349, + "loss": 0.6217, + "step": 348 + }, + { + "epoch": 0.00012250222912813427, + "grad_norm": 0.3425695598125458, + "learning_rate": 0.00017709515859766278, + "loss": 0.5037, + "step": 349 + }, + { + "epoch": 0.0001228532383806504, + "grad_norm": 0.34036242961883545, + "learning_rate": 0.00017702838063439068, + "loss": 0.649, + "step": 350 + }, + { + "epoch": 0.00012320424763316654, + "grad_norm": 0.5631874203681946, + "learning_rate": 0.00017696160267111855, + "loss": 0.5656, + "step": 351 + }, + { + "epoch": 0.0001235552568856827, + "grad_norm": 0.4195176661014557, + "learning_rate": 0.00017689482470784642, + "loss": 0.6899, + "step": 352 + }, + { + "epoch": 0.00012390626613819884, + "grad_norm": 0.41814154386520386, + "learning_rate": 0.0001768280467445743, + "loss": 0.551, + "step": 353 + }, + { + "epoch": 0.000124257275390715, + "grad_norm": 0.3374340534210205, + "learning_rate": 0.00017676126878130217, + "loss": 0.7022, + "step": 354 + }, + { + "epoch": 0.00012460828464323112, + "grad_norm": 0.41464921832084656, + "learning_rate": 0.00017669449081803007, + "loss": 0.5301, + "step": 355 + }, + { + "epoch": 0.00012495929389574726, + "grad_norm": 0.4443178176879883, + "learning_rate": 0.00017662771285475794, + "loss": 0.5487, + "step": 356 + }, + { + "epoch": 0.00012531030314826341, + "grad_norm": 0.3389272093772888, + "learning_rate": 0.00017656093489148582, + "loss": 0.581, + "step": 357 + }, + { + "epoch": 0.00012566131240077956, + "grad_norm": 0.29650986194610596, + "learning_rate": 0.0001764941569282137, + "loss": 0.5801, + "step": 358 + }, + { + "epoch": 0.0001260123216532957, + "grad_norm": 0.40271905064582825, + "learning_rate": 0.00017642737896494156, + "loss": 0.6738, + "step": 359 + }, + { + "epoch": 0.00012636333090581184, + "grad_norm": 0.352225661277771, + "learning_rate": 0.00017636060100166946, + "loss": 0.5727, + "step": 360 + }, + { + "epoch": 0.00012671434015832798, + "grad_norm": 0.3469563126564026, + "learning_rate": 0.00017629382303839734, + "loss": 0.5188, + "step": 361 + }, + { + "epoch": 0.00012706534941084413, + "grad_norm": 0.30644670128822327, + "learning_rate": 0.0001762270450751252, + "loss": 0.497, + "step": 362 + }, + { + "epoch": 0.00012741635866336028, + "grad_norm": 0.3472917377948761, + "learning_rate": 0.00017616026711185308, + "loss": 0.6363, + "step": 363 + }, + { + "epoch": 0.00012776736791587643, + "grad_norm": 0.37184756994247437, + "learning_rate": 0.00017609348914858096, + "loss": 0.5223, + "step": 364 + }, + { + "epoch": 0.00012811837716839256, + "grad_norm": 0.3247138559818268, + "learning_rate": 0.00017602671118530886, + "loss": 0.5457, + "step": 365 + }, + { + "epoch": 0.0001284693864209087, + "grad_norm": 0.5236158967018127, + "learning_rate": 0.00017595993322203673, + "loss": 0.615, + "step": 366 + }, + { + "epoch": 0.00012882039567342485, + "grad_norm": 0.33708465099334717, + "learning_rate": 0.00017589315525876463, + "loss": 0.6163, + "step": 367 + }, + { + "epoch": 0.000129171404925941, + "grad_norm": 0.33848705887794495, + "learning_rate": 0.0001758263772954925, + "loss": 0.4229, + "step": 368 + }, + { + "epoch": 0.00012952241417845715, + "grad_norm": 0.5827682018280029, + "learning_rate": 0.00017575959933222038, + "loss": 0.5668, + "step": 369 + }, + { + "epoch": 0.00012987342343097328, + "grad_norm": 0.36217448115348816, + "learning_rate": 0.00017569282136894825, + "loss": 0.4983, + "step": 370 + }, + { + "epoch": 0.00013022443268348943, + "grad_norm": 0.329414963722229, + "learning_rate": 0.00017562604340567615, + "loss": 0.4281, + "step": 371 + }, + { + "epoch": 0.00013057544193600557, + "grad_norm": 0.36746612191200256, + "learning_rate": 0.00017555926544240402, + "loss": 0.6629, + "step": 372 + }, + { + "epoch": 0.00013092645118852172, + "grad_norm": 0.3954717516899109, + "learning_rate": 0.0001754924874791319, + "loss": 0.5784, + "step": 373 + }, + { + "epoch": 0.00013127746044103787, + "grad_norm": 0.41279932856559753, + "learning_rate": 0.00017542570951585977, + "loss": 0.5994, + "step": 374 + }, + { + "epoch": 0.000131628469693554, + "grad_norm": 0.3019951581954956, + "learning_rate": 0.00017535893155258764, + "loss": 0.5584, + "step": 375 + }, + { + "epoch": 0.00013197947894607015, + "grad_norm": 0.3079768121242523, + "learning_rate": 0.00017529215358931554, + "loss": 0.5904, + "step": 376 + }, + { + "epoch": 0.0001323304881985863, + "grad_norm": 0.5678027272224426, + "learning_rate": 0.00017522537562604342, + "loss": 0.6441, + "step": 377 + }, + { + "epoch": 0.00013268149745110244, + "grad_norm": 0.38624581694602966, + "learning_rate": 0.0001751585976627713, + "loss": 0.5582, + "step": 378 + }, + { + "epoch": 0.0001330325067036186, + "grad_norm": 0.4368002712726593, + "learning_rate": 0.00017509181969949916, + "loss": 0.686, + "step": 379 + }, + { + "epoch": 0.00013338351595613472, + "grad_norm": 0.3409269154071808, + "learning_rate": 0.00017502504173622704, + "loss": 0.582, + "step": 380 + }, + { + "epoch": 0.00013373452520865087, + "grad_norm": 0.3772698938846588, + "learning_rate": 0.0001749582637729549, + "loss": 0.5314, + "step": 381 + }, + { + "epoch": 0.00013408553446116702, + "grad_norm": 0.3791707158088684, + "learning_rate": 0.0001748914858096828, + "loss": 0.6143, + "step": 382 + }, + { + "epoch": 0.00013443654371368317, + "grad_norm": 0.4441101551055908, + "learning_rate": 0.0001748247078464107, + "loss": 0.5726, + "step": 383 + }, + { + "epoch": 0.0001347875529661993, + "grad_norm": 0.4160211980342865, + "learning_rate": 0.00017475792988313858, + "loss": 0.6003, + "step": 384 + }, + { + "epoch": 0.00013513856221871544, + "grad_norm": 0.41698628664016724, + "learning_rate": 0.00017469115191986646, + "loss": 0.4539, + "step": 385 + }, + { + "epoch": 0.00013548957147123159, + "grad_norm": 0.337007999420166, + "learning_rate": 0.00017462437395659433, + "loss": 0.5176, + "step": 386 + }, + { + "epoch": 0.00013584058072374774, + "grad_norm": 0.30926409363746643, + "learning_rate": 0.00017455759599332223, + "loss": 0.6072, + "step": 387 + }, + { + "epoch": 0.00013619158997626389, + "grad_norm": 0.3663052022457123, + "learning_rate": 0.0001744908180300501, + "loss": 0.538, + "step": 388 + }, + { + "epoch": 0.00013654259922878, + "grad_norm": 0.3410074710845947, + "learning_rate": 0.00017442404006677798, + "loss": 0.5687, + "step": 389 + }, + { + "epoch": 0.00013689360848129616, + "grad_norm": 0.5266095399856567, + "learning_rate": 0.00017435726210350585, + "loss": 0.6685, + "step": 390 + }, + { + "epoch": 0.0001372446177338123, + "grad_norm": 0.4020686149597168, + "learning_rate": 0.00017429048414023372, + "loss": 0.586, + "step": 391 + }, + { + "epoch": 0.00013759562698632846, + "grad_norm": 0.39995548129081726, + "learning_rate": 0.00017422370617696162, + "loss": 0.6958, + "step": 392 + }, + { + "epoch": 0.0001379466362388446, + "grad_norm": 0.4024721682071686, + "learning_rate": 0.0001741569282136895, + "loss": 0.6411, + "step": 393 + }, + { + "epoch": 0.00013829764549136073, + "grad_norm": 0.38193392753601074, + "learning_rate": 0.00017409015025041737, + "loss": 0.5857, + "step": 394 + }, + { + "epoch": 0.00013864865474387688, + "grad_norm": 0.39786526560783386, + "learning_rate": 0.00017402337228714524, + "loss": 0.5215, + "step": 395 + }, + { + "epoch": 0.00013899966399639303, + "grad_norm": 0.49223974347114563, + "learning_rate": 0.00017395659432387311, + "loss": 0.5881, + "step": 396 + }, + { + "epoch": 0.00013935067324890918, + "grad_norm": 0.3398894667625427, + "learning_rate": 0.00017388981636060101, + "loss": 0.5466, + "step": 397 + }, + { + "epoch": 0.00013970168250142533, + "grad_norm": 0.34891223907470703, + "learning_rate": 0.0001738230383973289, + "loss": 0.5901, + "step": 398 + }, + { + "epoch": 0.00014005269175394145, + "grad_norm": 0.47644108533859253, + "learning_rate": 0.00017375626043405676, + "loss": 0.5075, + "step": 399 + }, + { + "epoch": 0.0001404037010064576, + "grad_norm": 0.42530229687690735, + "learning_rate": 0.00017368948247078466, + "loss": 0.663, + "step": 400 + }, + { + "epoch": 0.00014075471025897375, + "grad_norm": 0.30858534574508667, + "learning_rate": 0.00017362270450751253, + "loss": 0.4724, + "step": 401 + }, + { + "epoch": 0.0001411057195114899, + "grad_norm": 0.42453449964523315, + "learning_rate": 0.0001735559265442404, + "loss": 0.6074, + "step": 402 + }, + { + "epoch": 0.00014145672876400605, + "grad_norm": 0.3964505195617676, + "learning_rate": 0.0001734891485809683, + "loss": 0.4913, + "step": 403 + }, + { + "epoch": 0.00014180773801652217, + "grad_norm": 0.3317703902721405, + "learning_rate": 0.00017342237061769618, + "loss": 0.5504, + "step": 404 + }, + { + "epoch": 0.00014215874726903832, + "grad_norm": 0.3912264108657837, + "learning_rate": 0.00017335559265442405, + "loss": 0.6301, + "step": 405 + }, + { + "epoch": 0.00014250975652155447, + "grad_norm": 0.3582877218723297, + "learning_rate": 0.00017328881469115193, + "loss": 0.6205, + "step": 406 + }, + { + "epoch": 0.00014286076577407062, + "grad_norm": 0.3691099286079407, + "learning_rate": 0.0001732220367278798, + "loss": 0.5348, + "step": 407 + }, + { + "epoch": 0.00014321177502658677, + "grad_norm": 0.35860803723335266, + "learning_rate": 0.0001731552587646077, + "loss": 0.6029, + "step": 408 + }, + { + "epoch": 0.0001435627842791029, + "grad_norm": 0.3640693426132202, + "learning_rate": 0.00017308848080133557, + "loss": 0.6673, + "step": 409 + }, + { + "epoch": 0.00014391379353161904, + "grad_norm": 0.3550623953342438, + "learning_rate": 0.00017302170283806345, + "loss": 0.4659, + "step": 410 + }, + { + "epoch": 0.0001442648027841352, + "grad_norm": 0.45885637402534485, + "learning_rate": 0.00017295492487479132, + "loss": 0.4781, + "step": 411 + }, + { + "epoch": 0.00014461581203665134, + "grad_norm": 0.3703556954860687, + "learning_rate": 0.0001728881469115192, + "loss": 0.4829, + "step": 412 + }, + { + "epoch": 0.0001449668212891675, + "grad_norm": 0.5436837077140808, + "learning_rate": 0.0001728213689482471, + "loss": 0.6056, + "step": 413 + }, + { + "epoch": 0.0001453178305416836, + "grad_norm": 0.3953244686126709, + "learning_rate": 0.00017275459098497497, + "loss": 0.4884, + "step": 414 + }, + { + "epoch": 0.00014566883979419976, + "grad_norm": 0.34003904461860657, + "learning_rate": 0.00017268781302170284, + "loss": 0.6014, + "step": 415 + }, + { + "epoch": 0.0001460198490467159, + "grad_norm": 0.3463648557662964, + "learning_rate": 0.0001726210350584307, + "loss": 0.603, + "step": 416 + }, + { + "epoch": 0.00014637085829923206, + "grad_norm": 0.4293590784072876, + "learning_rate": 0.0001725542570951586, + "loss": 0.6686, + "step": 417 + }, + { + "epoch": 0.0001467218675517482, + "grad_norm": 0.4243469834327698, + "learning_rate": 0.0001724874791318865, + "loss": 0.6422, + "step": 418 + }, + { + "epoch": 0.00014707287680426433, + "grad_norm": 0.38327839970588684, + "learning_rate": 0.0001724207011686144, + "loss": 0.5595, + "step": 419 + }, + { + "epoch": 0.00014742388605678048, + "grad_norm": 0.31334301829338074, + "learning_rate": 0.00017235392320534226, + "loss": 0.474, + "step": 420 + }, + { + "epoch": 0.00014777489530929663, + "grad_norm": 0.3335350453853607, + "learning_rate": 0.00017228714524207013, + "loss": 0.6172, + "step": 421 + }, + { + "epoch": 0.00014812590456181278, + "grad_norm": 0.373696506023407, + "learning_rate": 0.000172220367278798, + "loss": 0.6183, + "step": 422 + }, + { + "epoch": 0.00014847691381432893, + "grad_norm": 0.45814886689186096, + "learning_rate": 0.00017215358931552588, + "loss": 0.5059, + "step": 423 + }, + { + "epoch": 0.00014882792306684505, + "grad_norm": 0.3578277826309204, + "learning_rate": 0.00017208681135225378, + "loss": 0.5771, + "step": 424 + }, + { + "epoch": 0.0001491789323193612, + "grad_norm": 0.42081883549690247, + "learning_rate": 0.00017202003338898165, + "loss": 0.5604, + "step": 425 + }, + { + "epoch": 0.00014952994157187735, + "grad_norm": 0.3173503875732422, + "learning_rate": 0.00017195325542570953, + "loss": 0.5738, + "step": 426 + }, + { + "epoch": 0.0001498809508243935, + "grad_norm": 0.38292011618614197, + "learning_rate": 0.0001718864774624374, + "loss": 0.6067, + "step": 427 + }, + { + "epoch": 0.00015023196007690965, + "grad_norm": 0.3518977463245392, + "learning_rate": 0.00017181969949916527, + "loss": 0.5073, + "step": 428 + }, + { + "epoch": 0.00015058296932942577, + "grad_norm": 0.5157706141471863, + "learning_rate": 0.00017175292153589317, + "loss": 0.5496, + "step": 429 + }, + { + "epoch": 0.00015093397858194192, + "grad_norm": 0.32064110040664673, + "learning_rate": 0.00017168614357262105, + "loss": 0.4766, + "step": 430 + }, + { + "epoch": 0.00015128498783445807, + "grad_norm": 0.42229798436164856, + "learning_rate": 0.00017161936560934892, + "loss": 0.5953, + "step": 431 + }, + { + "epoch": 0.00015163599708697422, + "grad_norm": 0.4723895192146301, + "learning_rate": 0.0001715525876460768, + "loss": 0.4783, + "step": 432 + }, + { + "epoch": 0.00015198700633949037, + "grad_norm": 0.3841445744037628, + "learning_rate": 0.00017148580968280467, + "loss": 0.5003, + "step": 433 + }, + { + "epoch": 0.0001523380155920065, + "grad_norm": 0.38026461005210876, + "learning_rate": 0.00017141903171953257, + "loss": 0.5093, + "step": 434 + }, + { + "epoch": 0.00015268902484452264, + "grad_norm": 0.37034904956817627, + "learning_rate": 0.00017135225375626044, + "loss": 0.6158, + "step": 435 + }, + { + "epoch": 0.0001530400340970388, + "grad_norm": 0.3876091241836548, + "learning_rate": 0.00017128547579298834, + "loss": 0.5287, + "step": 436 + }, + { + "epoch": 0.00015339104334955494, + "grad_norm": 0.30055519938468933, + "learning_rate": 0.0001712186978297162, + "loss": 0.5018, + "step": 437 + }, + { + "epoch": 0.0001537420526020711, + "grad_norm": 0.36094966530799866, + "learning_rate": 0.00017115191986644409, + "loss": 0.4961, + "step": 438 + }, + { + "epoch": 0.0001540930618545872, + "grad_norm": 0.3300524055957794, + "learning_rate": 0.00017108514190317196, + "loss": 0.5246, + "step": 439 + }, + { + "epoch": 0.00015444407110710336, + "grad_norm": 0.40980783104896545, + "learning_rate": 0.00017101836393989986, + "loss": 0.5705, + "step": 440 + }, + { + "epoch": 0.0001547950803596195, + "grad_norm": 0.3442326784133911, + "learning_rate": 0.00017095158597662773, + "loss": 0.5595, + "step": 441 + }, + { + "epoch": 0.00015514608961213566, + "grad_norm": 0.48015034198760986, + "learning_rate": 0.0001708848080133556, + "loss": 0.5642, + "step": 442 + }, + { + "epoch": 0.0001554970988646518, + "grad_norm": 0.5570142269134521, + "learning_rate": 0.00017081803005008348, + "loss": 0.6111, + "step": 443 + }, + { + "epoch": 0.00015584810811716793, + "grad_norm": 0.30470094084739685, + "learning_rate": 0.00017075125208681135, + "loss": 0.5151, + "step": 444 + }, + { + "epoch": 0.00015619911736968408, + "grad_norm": 0.31946614384651184, + "learning_rate": 0.00017068447412353925, + "loss": 0.5265, + "step": 445 + }, + { + "epoch": 0.00015655012662220023, + "grad_norm": 0.38980719447135925, + "learning_rate": 0.00017061769616026712, + "loss": 0.575, + "step": 446 + }, + { + "epoch": 0.00015690113587471638, + "grad_norm": 0.4077732264995575, + "learning_rate": 0.000170550918196995, + "loss": 0.5729, + "step": 447 + }, + { + "epoch": 0.00015725214512723253, + "grad_norm": 0.38632732629776, + "learning_rate": 0.00017048414023372287, + "loss": 0.594, + "step": 448 + }, + { + "epoch": 0.00015760315437974865, + "grad_norm": 0.37193921208381653, + "learning_rate": 0.00017041736227045074, + "loss": 0.6062, + "step": 449 + }, + { + "epoch": 0.0001579541636322648, + "grad_norm": 0.399029016494751, + "learning_rate": 0.00017035058430717862, + "loss": 0.4538, + "step": 450 + }, + { + "epoch": 0.00015830517288478095, + "grad_norm": 0.37710487842559814, + "learning_rate": 0.00017028380634390652, + "loss": 0.5615, + "step": 451 + }, + { + "epoch": 0.0001586561821372971, + "grad_norm": 0.38591668009757996, + "learning_rate": 0.0001702170283806344, + "loss": 0.5316, + "step": 452 + }, + { + "epoch": 0.00015900719138981325, + "grad_norm": 0.3453538417816162, + "learning_rate": 0.0001701502504173623, + "loss": 0.4645, + "step": 453 + }, + { + "epoch": 0.00015935820064232937, + "grad_norm": 0.34171512722969055, + "learning_rate": 0.00017008347245409016, + "loss": 0.5856, + "step": 454 + }, + { + "epoch": 0.00015970920989484552, + "grad_norm": 0.39591720700263977, + "learning_rate": 0.00017001669449081804, + "loss": 0.573, + "step": 455 + }, + { + "epoch": 0.00016006021914736167, + "grad_norm": 0.4127822816371918, + "learning_rate": 0.00016994991652754594, + "loss": 0.5183, + "step": 456 + }, + { + "epoch": 0.00016041122839987782, + "grad_norm": 0.37893375754356384, + "learning_rate": 0.0001698831385642738, + "loss": 0.566, + "step": 457 + }, + { + "epoch": 0.00016076223765239397, + "grad_norm": 0.33429333567619324, + "learning_rate": 0.00016981636060100168, + "loss": 0.449, + "step": 458 + }, + { + "epoch": 0.0001611132469049101, + "grad_norm": 0.3333180546760559, + "learning_rate": 0.00016974958263772956, + "loss": 0.4441, + "step": 459 + }, + { + "epoch": 0.00016146425615742624, + "grad_norm": 0.3591359257698059, + "learning_rate": 0.00016968280467445743, + "loss": 0.55, + "step": 460 + }, + { + "epoch": 0.0001618152654099424, + "grad_norm": 0.35390427708625793, + "learning_rate": 0.00016961602671118533, + "loss": 0.6445, + "step": 461 + }, + { + "epoch": 0.00016216627466245854, + "grad_norm": 0.42036697268486023, + "learning_rate": 0.0001695492487479132, + "loss": 0.5411, + "step": 462 + }, + { + "epoch": 0.0001625172839149747, + "grad_norm": 0.42147770524024963, + "learning_rate": 0.00016948247078464108, + "loss": 0.6218, + "step": 463 + }, + { + "epoch": 0.0001628682931674908, + "grad_norm": 0.3960399329662323, + "learning_rate": 0.00016941569282136895, + "loss": 0.6608, + "step": 464 + }, + { + "epoch": 0.00016321930242000696, + "grad_norm": 0.39676985144615173, + "learning_rate": 0.00016934891485809682, + "loss": 0.5838, + "step": 465 + }, + { + "epoch": 0.0001635703116725231, + "grad_norm": 0.2839520573616028, + "learning_rate": 0.0001692821368948247, + "loss": 0.5334, + "step": 466 + }, + { + "epoch": 0.00016392132092503926, + "grad_norm": 0.3654347062110901, + "learning_rate": 0.0001692153589315526, + "loss": 0.6065, + "step": 467 + }, + { + "epoch": 0.0001642723301775554, + "grad_norm": 0.3709166646003723, + "learning_rate": 0.00016914858096828047, + "loss": 0.509, + "step": 468 + }, + { + "epoch": 0.00016462333943007153, + "grad_norm": 0.29224780201911926, + "learning_rate": 0.00016908180300500834, + "loss": 0.5372, + "step": 469 + }, + { + "epoch": 0.00016497434868258768, + "grad_norm": 0.34979283809661865, + "learning_rate": 0.00016901502504173624, + "loss": 0.3968, + "step": 470 + }, + { + "epoch": 0.00016532535793510383, + "grad_norm": 0.34580183029174805, + "learning_rate": 0.00016894824707846412, + "loss": 0.6032, + "step": 471 + }, + { + "epoch": 0.00016567636718761998, + "grad_norm": 0.39046213030815125, + "learning_rate": 0.00016888146911519202, + "loss": 0.5628, + "step": 472 + }, + { + "epoch": 0.00016602737644013613, + "grad_norm": 0.35301411151885986, + "learning_rate": 0.0001688146911519199, + "loss": 0.607, + "step": 473 + }, + { + "epoch": 0.00016637838569265225, + "grad_norm": 0.4572748839855194, + "learning_rate": 0.00016874791318864776, + "loss": 0.5018, + "step": 474 + }, + { + "epoch": 0.0001667293949451684, + "grad_norm": 0.38230374455451965, + "learning_rate": 0.00016868113522537564, + "loss": 0.5026, + "step": 475 + }, + { + "epoch": 0.00016708040419768455, + "grad_norm": 0.37066343426704407, + "learning_rate": 0.0001686143572621035, + "loss": 0.5819, + "step": 476 + }, + { + "epoch": 0.0001674314134502007, + "grad_norm": 0.3658660054206848, + "learning_rate": 0.0001685475792988314, + "loss": 0.6825, + "step": 477 + }, + { + "epoch": 0.00016778242270271685, + "grad_norm": 0.42174890637397766, + "learning_rate": 0.00016848080133555928, + "loss": 0.6065, + "step": 478 + }, + { + "epoch": 0.00016813343195523297, + "grad_norm": 0.3462882936000824, + "learning_rate": 0.00016841402337228716, + "loss": 0.5888, + "step": 479 + }, + { + "epoch": 0.00016848444120774912, + "grad_norm": 0.44681960344314575, + "learning_rate": 0.00016834724540901503, + "loss": 0.4987, + "step": 480 + }, + { + "epoch": 0.00016883545046026527, + "grad_norm": 0.3535650372505188, + "learning_rate": 0.0001682804674457429, + "loss": 0.6478, + "step": 481 + }, + { + "epoch": 0.00016918645971278142, + "grad_norm": 0.3357018232345581, + "learning_rate": 0.00016821368948247077, + "loss": 0.4949, + "step": 482 + }, + { + "epoch": 0.00016953746896529757, + "grad_norm": 0.42756739258766174, + "learning_rate": 0.00016814691151919868, + "loss": 0.6475, + "step": 483 + }, + { + "epoch": 0.0001698884782178137, + "grad_norm": 0.36174866557121277, + "learning_rate": 0.00016808013355592655, + "loss": 0.598, + "step": 484 + }, + { + "epoch": 0.00017023948747032984, + "grad_norm": 0.37115278840065, + "learning_rate": 0.00016801335559265442, + "loss": 0.6215, + "step": 485 + }, + { + "epoch": 0.000170590496722846, + "grad_norm": 0.340249627828598, + "learning_rate": 0.0001679465776293823, + "loss": 0.5702, + "step": 486 + }, + { + "epoch": 0.00017094150597536214, + "grad_norm": 0.31226348876953125, + "learning_rate": 0.0001678797996661102, + "loss": 0.6531, + "step": 487 + }, + { + "epoch": 0.0001712925152278783, + "grad_norm": 0.35571998357772827, + "learning_rate": 0.00016781302170283807, + "loss": 0.6406, + "step": 488 + }, + { + "epoch": 0.00017164352448039441, + "grad_norm": 0.4167378842830658, + "learning_rate": 0.00016774624373956597, + "loss": 0.5111, + "step": 489 + }, + { + "epoch": 0.00017199453373291056, + "grad_norm": 0.292304128408432, + "learning_rate": 0.00016767946577629384, + "loss": 0.6643, + "step": 490 + }, + { + "epoch": 0.0001723455429854267, + "grad_norm": 0.38789069652557373, + "learning_rate": 0.00016761268781302171, + "loss": 0.4542, + "step": 491 + }, + { + "epoch": 0.00017269655223794286, + "grad_norm": 0.33764714002609253, + "learning_rate": 0.0001675459098497496, + "loss": 0.4158, + "step": 492 + }, + { + "epoch": 0.00017304756149045898, + "grad_norm": 0.34849148988723755, + "learning_rate": 0.0001674791318864775, + "loss": 0.4737, + "step": 493 + }, + { + "epoch": 0.00017339857074297513, + "grad_norm": 0.2921352684497833, + "learning_rate": 0.00016741235392320536, + "loss": 0.679, + "step": 494 + }, + { + "epoch": 0.00017374957999549128, + "grad_norm": 0.33746641874313354, + "learning_rate": 0.00016734557595993323, + "loss": 0.4957, + "step": 495 + }, + { + "epoch": 0.00017410058924800743, + "grad_norm": 0.4029395878314972, + "learning_rate": 0.0001672787979966611, + "loss": 0.6708, + "step": 496 + }, + { + "epoch": 0.00017445159850052358, + "grad_norm": 0.440033882856369, + "learning_rate": 0.00016721202003338898, + "loss": 0.5889, + "step": 497 + }, + { + "epoch": 0.0001748026077530397, + "grad_norm": 0.330692857503891, + "learning_rate": 0.00016714524207011685, + "loss": 0.5942, + "step": 498 + }, + { + "epoch": 0.00017515361700555585, + "grad_norm": 0.3111809492111206, + "learning_rate": 0.00016707846410684475, + "loss": 0.5506, + "step": 499 + }, + { + "epoch": 0.000175504626258072, + "grad_norm": 0.38885676860809326, + "learning_rate": 0.00016701168614357263, + "loss": 0.4713, + "step": 500 + }, + { + "epoch": 0.00017585563551058815, + "grad_norm": 0.3697550296783447, + "learning_rate": 0.0001669449081803005, + "loss": 0.5955, + "step": 501 + }, + { + "epoch": 0.0001762066447631043, + "grad_norm": 0.35807061195373535, + "learning_rate": 0.00016687813021702837, + "loss": 0.555, + "step": 502 + }, + { + "epoch": 0.00017655765401562043, + "grad_norm": 0.44033464789390564, + "learning_rate": 0.00016681135225375625, + "loss": 0.5668, + "step": 503 + }, + { + "epoch": 0.00017690866326813657, + "grad_norm": 0.3363400399684906, + "learning_rate": 0.00016674457429048415, + "loss": 0.6176, + "step": 504 + }, + { + "epoch": 0.00017725967252065272, + "grad_norm": 0.31457507610321045, + "learning_rate": 0.00016667779632721202, + "loss": 0.6524, + "step": 505 + }, + { + "epoch": 0.00017761068177316887, + "grad_norm": 0.38115641474723816, + "learning_rate": 0.00016661101836393992, + "loss": 0.5848, + "step": 506 + }, + { + "epoch": 0.00017796169102568502, + "grad_norm": 0.3387603759765625, + "learning_rate": 0.0001665442404006678, + "loss": 0.6992, + "step": 507 + }, + { + "epoch": 0.00017831270027820115, + "grad_norm": 0.31671345233917236, + "learning_rate": 0.00016647746243739567, + "loss": 0.5744, + "step": 508 + }, + { + "epoch": 0.0001786637095307173, + "grad_norm": 0.3776471018791199, + "learning_rate": 0.00016641068447412357, + "loss": 0.622, + "step": 509 + }, + { + "epoch": 0.00017901471878323344, + "grad_norm": 0.37572941184043884, + "learning_rate": 0.00016634390651085144, + "loss": 0.5259, + "step": 510 + }, + { + "epoch": 0.0001793657280357496, + "grad_norm": 0.3335510194301605, + "learning_rate": 0.0001662771285475793, + "loss": 0.547, + "step": 511 + }, + { + "epoch": 0.00017971673728826574, + "grad_norm": 0.33241015672683716, + "learning_rate": 0.00016621035058430719, + "loss": 0.5827, + "step": 512 + }, + { + "epoch": 0.00018006774654078187, + "grad_norm": 0.3761122524738312, + "learning_rate": 0.00016614357262103506, + "loss": 0.6962, + "step": 513 + }, + { + "epoch": 0.00018041875579329802, + "grad_norm": 0.4172234833240509, + "learning_rate": 0.00016607679465776293, + "loss": 0.4922, + "step": 514 + }, + { + "epoch": 0.00018076976504581416, + "grad_norm": 0.45372599363327026, + "learning_rate": 0.00016601001669449083, + "loss": 0.5804, + "step": 515 + }, + { + "epoch": 0.00018112077429833031, + "grad_norm": 0.3854759931564331, + "learning_rate": 0.0001659432387312187, + "loss": 0.6026, + "step": 516 + }, + { + "epoch": 0.00018147178355084646, + "grad_norm": 0.3399171829223633, + "learning_rate": 0.00016587646076794658, + "loss": 0.4773, + "step": 517 + }, + { + "epoch": 0.00018182279280336259, + "grad_norm": 0.36649778485298157, + "learning_rate": 0.00016580968280467445, + "loss": 0.59, + "step": 518 + }, + { + "epoch": 0.00018217380205587874, + "grad_norm": 0.39988765120506287, + "learning_rate": 0.00016574290484140233, + "loss": 0.6094, + "step": 519 + }, + { + "epoch": 0.00018252481130839489, + "grad_norm": 0.34659436345100403, + "learning_rate": 0.00016567612687813023, + "loss": 0.4832, + "step": 520 + }, + { + "epoch": 0.00018287582056091103, + "grad_norm": 0.3742654025554657, + "learning_rate": 0.0001656093489148581, + "loss": 0.413, + "step": 521 + }, + { + "epoch": 0.00018322682981342718, + "grad_norm": 0.43068456649780273, + "learning_rate": 0.00016554257095158597, + "loss": 0.6576, + "step": 522 + }, + { + "epoch": 0.0001835778390659433, + "grad_norm": 0.42455193400382996, + "learning_rate": 0.00016547579298831387, + "loss": 0.5897, + "step": 523 + }, + { + "epoch": 0.00018392884831845946, + "grad_norm": 0.3290526568889618, + "learning_rate": 0.00016540901502504175, + "loss": 0.4022, + "step": 524 + }, + { + "epoch": 0.0001842798575709756, + "grad_norm": 0.3744141161441803, + "learning_rate": 0.00016534223706176965, + "loss": 0.5577, + "step": 525 + }, + { + "epoch": 0.00018463086682349176, + "grad_norm": 0.3516618609428406, + "learning_rate": 0.00016527545909849752, + "loss": 0.5481, + "step": 526 + }, + { + "epoch": 0.0001849818760760079, + "grad_norm": 0.3591526448726654, + "learning_rate": 0.0001652086811352254, + "loss": 0.6339, + "step": 527 + }, + { + "epoch": 0.00018533288532852403, + "grad_norm": 0.4024425745010376, + "learning_rate": 0.00016514190317195327, + "loss": 0.5268, + "step": 528 + }, + { + "epoch": 0.00018568389458104018, + "grad_norm": 0.3502136766910553, + "learning_rate": 0.00016507512520868114, + "loss": 0.5112, + "step": 529 + }, + { + "epoch": 0.00018603490383355633, + "grad_norm": 0.3338727056980133, + "learning_rate": 0.00016500834724540904, + "loss": 0.5623, + "step": 530 + }, + { + "epoch": 0.00018638591308607248, + "grad_norm": 0.43554845452308655, + "learning_rate": 0.0001649415692821369, + "loss": 0.5853, + "step": 531 + }, + { + "epoch": 0.00018673692233858862, + "grad_norm": 0.34424322843551636, + "learning_rate": 0.00016487479131886478, + "loss": 0.4951, + "step": 532 + }, + { + "epoch": 0.00018708793159110475, + "grad_norm": 0.4424237012863159, + "learning_rate": 0.00016480801335559266, + "loss": 0.4576, + "step": 533 + }, + { + "epoch": 0.0001874389408436209, + "grad_norm": 0.4616681933403015, + "learning_rate": 0.00016474123539232053, + "loss": 0.4974, + "step": 534 + }, + { + "epoch": 0.00018778995009613705, + "grad_norm": 0.3599206507205963, + "learning_rate": 0.0001646744574290484, + "loss": 0.5987, + "step": 535 + }, + { + "epoch": 0.0001881409593486532, + "grad_norm": 0.40468478202819824, + "learning_rate": 0.0001646076794657763, + "loss": 0.5914, + "step": 536 + }, + { + "epoch": 0.00018849196860116935, + "grad_norm": 0.5389227271080017, + "learning_rate": 0.00016454090150250418, + "loss": 0.6459, + "step": 537 + }, + { + "epoch": 0.00018884297785368547, + "grad_norm": 0.3493568003177643, + "learning_rate": 0.00016447412353923205, + "loss": 0.5191, + "step": 538 + }, + { + "epoch": 0.00018919398710620162, + "grad_norm": 0.31237804889678955, + "learning_rate": 0.00016440734557595992, + "loss": 0.4819, + "step": 539 + }, + { + "epoch": 0.00018954499635871777, + "grad_norm": 0.31142041087150574, + "learning_rate": 0.00016434056761268782, + "loss": 0.5659, + "step": 540 + }, + { + "epoch": 0.00018989600561123392, + "grad_norm": 0.3323245644569397, + "learning_rate": 0.0001642737896494157, + "loss": 0.5779, + "step": 541 + }, + { + "epoch": 0.00019024701486375007, + "grad_norm": 0.3679036498069763, + "learning_rate": 0.0001642070116861436, + "loss": 0.6919, + "step": 542 + }, + { + "epoch": 0.0001905980241162662, + "grad_norm": 0.3094903528690338, + "learning_rate": 0.00016414023372287147, + "loss": 0.4773, + "step": 543 + }, + { + "epoch": 0.00019094903336878234, + "grad_norm": 0.37995582818984985, + "learning_rate": 0.00016407345575959934, + "loss": 0.539, + "step": 544 + }, + { + "epoch": 0.0001913000426212985, + "grad_norm": 0.46415746212005615, + "learning_rate": 0.00016400667779632722, + "loss": 0.6708, + "step": 545 + }, + { + "epoch": 0.00019165105187381464, + "grad_norm": 0.3479398190975189, + "learning_rate": 0.00016393989983305512, + "loss": 0.5496, + "step": 546 + }, + { + "epoch": 0.00019200206112633079, + "grad_norm": 0.3740891218185425, + "learning_rate": 0.000163873121869783, + "loss": 0.6256, + "step": 547 + }, + { + "epoch": 0.0001923530703788469, + "grad_norm": 0.4934074878692627, + "learning_rate": 0.00016380634390651086, + "loss": 0.6788, + "step": 548 + }, + { + "epoch": 0.00019270407963136306, + "grad_norm": 0.42659157514572144, + "learning_rate": 0.00016373956594323874, + "loss": 0.5981, + "step": 549 + }, + { + "epoch": 0.0001930550888838792, + "grad_norm": 0.35727575421333313, + "learning_rate": 0.0001636727879799666, + "loss": 0.4095, + "step": 550 + }, + { + "epoch": 0.00019340609813639536, + "grad_norm": 0.4294300377368927, + "learning_rate": 0.00016360601001669448, + "loss": 0.5386, + "step": 551 + }, + { + "epoch": 0.0001937571073889115, + "grad_norm": 0.33482253551483154, + "learning_rate": 0.00016353923205342238, + "loss": 0.4901, + "step": 552 + }, + { + "epoch": 0.00019410811664142763, + "grad_norm": 0.3379746079444885, + "learning_rate": 0.00016347245409015026, + "loss": 0.5454, + "step": 553 + }, + { + "epoch": 0.00019445912589394378, + "grad_norm": 0.42393919825553894, + "learning_rate": 0.00016340567612687813, + "loss": 0.5959, + "step": 554 + }, + { + "epoch": 0.00019481013514645993, + "grad_norm": 0.31975501775741577, + "learning_rate": 0.000163338898163606, + "loss": 0.6048, + "step": 555 + }, + { + "epoch": 0.00019516114439897608, + "grad_norm": 0.43404972553253174, + "learning_rate": 0.00016327212020033388, + "loss": 0.6252, + "step": 556 + }, + { + "epoch": 0.00019551215365149223, + "grad_norm": 0.3559292256832123, + "learning_rate": 0.00016320534223706178, + "loss": 0.6036, + "step": 557 + }, + { + "epoch": 0.00019586316290400835, + "grad_norm": 0.3134891092777252, + "learning_rate": 0.00016313856427378965, + "loss": 0.5656, + "step": 558 + }, + { + "epoch": 0.0001962141721565245, + "grad_norm": 0.32056671380996704, + "learning_rate": 0.00016307178631051755, + "loss": 0.6509, + "step": 559 + }, + { + "epoch": 0.00019656518140904065, + "grad_norm": 0.46249130368232727, + "learning_rate": 0.00016300500834724542, + "loss": 0.6379, + "step": 560 + }, + { + "epoch": 0.0001969161906615568, + "grad_norm": 0.36366966366767883, + "learning_rate": 0.0001629382303839733, + "loss": 0.5334, + "step": 561 + }, + { + "epoch": 0.00019726719991407295, + "grad_norm": 0.4234124422073364, + "learning_rate": 0.0001628714524207012, + "loss": 0.4864, + "step": 562 + }, + { + "epoch": 0.00019761820916658907, + "grad_norm": 0.3687801659107208, + "learning_rate": 0.00016280467445742907, + "loss": 0.4855, + "step": 563 + }, + { + "epoch": 0.00019796921841910522, + "grad_norm": 0.37247028946876526, + "learning_rate": 0.00016273789649415694, + "loss": 0.6215, + "step": 564 + }, + { + "epoch": 0.00019832022767162137, + "grad_norm": 0.30445635318756104, + "learning_rate": 0.00016267111853088482, + "loss": 0.5741, + "step": 565 + }, + { + "epoch": 0.00019867123692413752, + "grad_norm": 0.3349187970161438, + "learning_rate": 0.0001626043405676127, + "loss": 0.4524, + "step": 566 + }, + { + "epoch": 0.00019902224617665367, + "grad_norm": 0.36938101053237915, + "learning_rate": 0.00016253756260434056, + "loss": 0.5046, + "step": 567 + }, + { + "epoch": 0.0001993732554291698, + "grad_norm": 0.37673529982566833, + "learning_rate": 0.00016247078464106846, + "loss": 0.5001, + "step": 568 + }, + { + "epoch": 0.00019972426468168594, + "grad_norm": 0.3571556508541107, + "learning_rate": 0.00016240400667779634, + "loss": 0.6419, + "step": 569 + }, + { + "epoch": 0.0002000752739342021, + "grad_norm": 0.35543423891067505, + "learning_rate": 0.0001623372287145242, + "loss": 0.6191, + "step": 570 + }, + { + "epoch": 0.00020042628318671824, + "grad_norm": 0.3096729516983032, + "learning_rate": 0.00016227045075125208, + "loss": 0.5373, + "step": 571 + }, + { + "epoch": 0.0002007772924392344, + "grad_norm": 0.30310383439064026, + "learning_rate": 0.00016220367278797996, + "loss": 0.558, + "step": 572 + }, + { + "epoch": 0.0002011283016917505, + "grad_norm": 0.3616211712360382, + "learning_rate": 0.00016213689482470786, + "loss": 0.6504, + "step": 573 + }, + { + "epoch": 0.00020147931094426666, + "grad_norm": 0.34818220138549805, + "learning_rate": 0.00016207011686143573, + "loss": 0.6136, + "step": 574 + }, + { + "epoch": 0.0002018303201967828, + "grad_norm": 0.36225444078445435, + "learning_rate": 0.0001620033388981636, + "loss": 0.4905, + "step": 575 + }, + { + "epoch": 0.00020218132944929896, + "grad_norm": 0.40039536356925964, + "learning_rate": 0.0001619365609348915, + "loss": 0.5997, + "step": 576 + }, + { + "epoch": 0.0002025323387018151, + "grad_norm": 0.33715930581092834, + "learning_rate": 0.00016186978297161938, + "loss": 0.5284, + "step": 577 + }, + { + "epoch": 0.00020288334795433123, + "grad_norm": 0.4137067198753357, + "learning_rate": 0.00016180300500834728, + "loss": 0.6873, + "step": 578 + }, + { + "epoch": 0.00020323435720684738, + "grad_norm": 0.41598305106163025, + "learning_rate": 0.00016173622704507515, + "loss": 0.491, + "step": 579 + }, + { + "epoch": 0.00020358536645936353, + "grad_norm": 0.5466423034667969, + "learning_rate": 0.00016166944908180302, + "loss": 0.6188, + "step": 580 + }, + { + "epoch": 0.00020393637571187968, + "grad_norm": 0.3718060851097107, + "learning_rate": 0.0001616026711185309, + "loss": 0.5573, + "step": 581 + }, + { + "epoch": 0.00020428738496439583, + "grad_norm": 0.33747225999832153, + "learning_rate": 0.00016153589315525877, + "loss": 0.4887, + "step": 582 + }, + { + "epoch": 0.00020463839421691195, + "grad_norm": 0.36478081345558167, + "learning_rate": 0.00016146911519198664, + "loss": 0.553, + "step": 583 + }, + { + "epoch": 0.0002049894034694281, + "grad_norm": 0.38441962003707886, + "learning_rate": 0.00016140233722871454, + "loss": 0.4833, + "step": 584 + }, + { + "epoch": 0.00020534041272194425, + "grad_norm": 0.45594358444213867, + "learning_rate": 0.00016133555926544241, + "loss": 0.5877, + "step": 585 + }, + { + "epoch": 0.0002056914219744604, + "grad_norm": 0.356517493724823, + "learning_rate": 0.0001612687813021703, + "loss": 0.5614, + "step": 586 + }, + { + "epoch": 0.00020604243122697655, + "grad_norm": 0.4051963686943054, + "learning_rate": 0.00016120200333889816, + "loss": 0.5208, + "step": 587 + }, + { + "epoch": 0.00020639344047949267, + "grad_norm": 0.36947959661483765, + "learning_rate": 0.00016113522537562603, + "loss": 0.4385, + "step": 588 + }, + { + "epoch": 0.00020674444973200882, + "grad_norm": 0.45947200059890747, + "learning_rate": 0.00016106844741235393, + "loss": 0.4972, + "step": 589 + }, + { + "epoch": 0.00020709545898452497, + "grad_norm": 0.40610602498054504, + "learning_rate": 0.0001610016694490818, + "loss": 0.4022, + "step": 590 + }, + { + "epoch": 0.00020744646823704112, + "grad_norm": 0.3529384732246399, + "learning_rate": 0.00016093489148580968, + "loss": 0.5222, + "step": 591 + }, + { + "epoch": 0.00020779747748955727, + "grad_norm": 0.35114821791648865, + "learning_rate": 0.00016086811352253755, + "loss": 0.6224, + "step": 592 + }, + { + "epoch": 0.0002081484867420734, + "grad_norm": 0.3596336841583252, + "learning_rate": 0.00016080133555926545, + "loss": 0.5081, + "step": 593 + }, + { + "epoch": 0.00020849949599458954, + "grad_norm": 0.4214174747467041, + "learning_rate": 0.00016073455759599333, + "loss": 0.5189, + "step": 594 + }, + { + "epoch": 0.0002088505052471057, + "grad_norm": 0.39635175466537476, + "learning_rate": 0.00016066777963272123, + "loss": 0.582, + "step": 595 + }, + { + "epoch": 0.00020920151449962184, + "grad_norm": 0.36160576343536377, + "learning_rate": 0.0001606010016694491, + "loss": 0.568, + "step": 596 + }, + { + "epoch": 0.000209552523752138, + "grad_norm": 0.4242927134037018, + "learning_rate": 0.00016053422370617697, + "loss": 0.6235, + "step": 597 + }, + { + "epoch": 0.0002099035330046541, + "grad_norm": 0.4257853925228119, + "learning_rate": 0.00016046744574290485, + "loss": 0.5294, + "step": 598 + }, + { + "epoch": 0.00021025454225717026, + "grad_norm": 0.3890500068664551, + "learning_rate": 0.00016040066777963272, + "loss": 0.6224, + "step": 599 + }, + { + "epoch": 0.0002106055515096864, + "grad_norm": 0.2971879541873932, + "learning_rate": 0.00016033388981636062, + "loss": 0.5951, + "step": 600 + }, + { + "epoch": 0.00021095656076220256, + "grad_norm": 0.29551970958709717, + "learning_rate": 0.0001602671118530885, + "loss": 0.6713, + "step": 601 + }, + { + "epoch": 0.00021130757001471868, + "grad_norm": 0.31588122248649597, + "learning_rate": 0.00016020033388981637, + "loss": 0.6384, + "step": 602 + }, + { + "epoch": 0.00021165857926723483, + "grad_norm": 0.3138657510280609, + "learning_rate": 0.00016013355592654424, + "loss": 0.5846, + "step": 603 + }, + { + "epoch": 0.00021200958851975098, + "grad_norm": 0.31286585330963135, + "learning_rate": 0.0001600667779632721, + "loss": 0.6236, + "step": 604 + }, + { + "epoch": 0.00021236059777226713, + "grad_norm": 0.32098105549812317, + "learning_rate": 0.00016, + "loss": 0.4926, + "step": 605 + }, + { + "epoch": 0.00021271160702478328, + "grad_norm": 0.371427446603775, + "learning_rate": 0.00015993322203672789, + "loss": 0.6205, + "step": 606 + }, + { + "epoch": 0.0002130626162772994, + "grad_norm": 0.28764042258262634, + "learning_rate": 0.00015986644407345576, + "loss": 0.449, + "step": 607 + }, + { + "epoch": 0.00021341362552981555, + "grad_norm": 0.35086238384246826, + "learning_rate": 0.00015979966611018363, + "loss": 0.549, + "step": 608 + }, + { + "epoch": 0.0002137646347823317, + "grad_norm": 0.3118048906326294, + "learning_rate": 0.0001597328881469115, + "loss": 0.6037, + "step": 609 + }, + { + "epoch": 0.00021411564403484785, + "grad_norm": 0.3894517123699188, + "learning_rate": 0.0001596661101836394, + "loss": 0.5989, + "step": 610 + }, + { + "epoch": 0.000214466653287364, + "grad_norm": 0.39642322063446045, + "learning_rate": 0.00015959933222036728, + "loss": 0.566, + "step": 611 + }, + { + "epoch": 0.00021481766253988012, + "grad_norm": 0.35333508253097534, + "learning_rate": 0.00015953255425709518, + "loss": 0.5055, + "step": 612 + }, + { + "epoch": 0.00021516867179239627, + "grad_norm": 0.39200490713119507, + "learning_rate": 0.00015946577629382305, + "loss": 0.5951, + "step": 613 + }, + { + "epoch": 0.00021551968104491242, + "grad_norm": 0.38436442613601685, + "learning_rate": 0.00015939899833055093, + "loss": 0.4876, + "step": 614 + }, + { + "epoch": 0.00021587069029742857, + "grad_norm": 0.3397504389286041, + "learning_rate": 0.0001593322203672788, + "loss": 0.6287, + "step": 615 + }, + { + "epoch": 0.00021622169954994472, + "grad_norm": 0.35870012640953064, + "learning_rate": 0.0001592654424040067, + "loss": 0.5857, + "step": 616 + }, + { + "epoch": 0.00021657270880246084, + "grad_norm": 0.31163597106933594, + "learning_rate": 0.00015919866444073457, + "loss": 0.4831, + "step": 617 + }, + { + "epoch": 0.000216923718054977, + "grad_norm": 0.35106539726257324, + "learning_rate": 0.00015913188647746245, + "loss": 0.5776, + "step": 618 + }, + { + "epoch": 0.00021727472730749314, + "grad_norm": 0.3639923334121704, + "learning_rate": 0.00015906510851419032, + "loss": 0.5039, + "step": 619 + }, + { + "epoch": 0.0002176257365600093, + "grad_norm": 0.3622918128967285, + "learning_rate": 0.0001589983305509182, + "loss": 0.6293, + "step": 620 + }, + { + "epoch": 0.00021797674581252544, + "grad_norm": 0.3899349868297577, + "learning_rate": 0.0001589315525876461, + "loss": 0.567, + "step": 621 + }, + { + "epoch": 0.00021832775506504156, + "grad_norm": 0.3834361732006073, + "learning_rate": 0.00015886477462437397, + "loss": 0.5106, + "step": 622 + }, + { + "epoch": 0.0002186787643175577, + "grad_norm": 0.34996962547302246, + "learning_rate": 0.00015879799666110184, + "loss": 0.5155, + "step": 623 + }, + { + "epoch": 0.00021902977357007386, + "grad_norm": 0.47908079624176025, + "learning_rate": 0.0001587312186978297, + "loss": 0.4529, + "step": 624 + }, + { + "epoch": 0.00021938078282259, + "grad_norm": 0.3167901635169983, + "learning_rate": 0.00015866444073455758, + "loss": 0.6075, + "step": 625 + }, + { + "epoch": 0.00021973179207510616, + "grad_norm": 0.4254927337169647, + "learning_rate": 0.00015859766277128548, + "loss": 0.6404, + "step": 626 + }, + { + "epoch": 0.00022008280132762228, + "grad_norm": 0.4317469000816345, + "learning_rate": 0.00015853088480801336, + "loss": 0.5881, + "step": 627 + }, + { + "epoch": 0.00022043381058013843, + "grad_norm": 0.4441644251346588, + "learning_rate": 0.00015846410684474123, + "loss": 0.5864, + "step": 628 + }, + { + "epoch": 0.00022078481983265458, + "grad_norm": 0.37883102893829346, + "learning_rate": 0.00015839732888146913, + "loss": 0.5664, + "step": 629 + }, + { + "epoch": 0.00022113582908517073, + "grad_norm": 0.35548868775367737, + "learning_rate": 0.000158330550918197, + "loss": 0.5712, + "step": 630 + }, + { + "epoch": 0.00022148683833768688, + "grad_norm": 0.31588616967201233, + "learning_rate": 0.00015826377295492488, + "loss": 0.4856, + "step": 631 + }, + { + "epoch": 0.000221837847590203, + "grad_norm": 0.3186424672603607, + "learning_rate": 0.00015819699499165278, + "loss": 0.542, + "step": 632 + }, + { + "epoch": 0.00022218885684271915, + "grad_norm": 0.41098466515541077, + "learning_rate": 0.00015813021702838065, + "loss": 0.6311, + "step": 633 + }, + { + "epoch": 0.0002225398660952353, + "grad_norm": 0.413401335477829, + "learning_rate": 0.00015806343906510852, + "loss": 0.5036, + "step": 634 + }, + { + "epoch": 0.00022289087534775145, + "grad_norm": 0.34203773736953735, + "learning_rate": 0.0001579966611018364, + "loss": 0.5508, + "step": 635 + }, + { + "epoch": 0.0002232418846002676, + "grad_norm": 0.34416648745536804, + "learning_rate": 0.00015792988313856427, + "loss": 0.5442, + "step": 636 + }, + { + "epoch": 0.00022359289385278372, + "grad_norm": 0.3439941704273224, + "learning_rate": 0.00015786310517529217, + "loss": 0.4969, + "step": 637 + }, + { + "epoch": 0.00022394390310529987, + "grad_norm": 0.3547762930393219, + "learning_rate": 0.00015779632721202004, + "loss": 0.5564, + "step": 638 + }, + { + "epoch": 0.00022429491235781602, + "grad_norm": 0.35666894912719727, + "learning_rate": 0.00015772954924874792, + "loss": 0.4759, + "step": 639 + }, + { + "epoch": 0.00022464592161033217, + "grad_norm": 0.3175058364868164, + "learning_rate": 0.0001576627712854758, + "loss": 0.5708, + "step": 640 + }, + { + "epoch": 0.00022499693086284832, + "grad_norm": 0.4329943358898163, + "learning_rate": 0.00015759599332220366, + "loss": 0.5293, + "step": 641 + }, + { + "epoch": 0.00022534794011536444, + "grad_norm": 0.5703821778297424, + "learning_rate": 0.00015752921535893156, + "loss": 0.6187, + "step": 642 + }, + { + "epoch": 0.0002256989493678806, + "grad_norm": 0.32244032621383667, + "learning_rate": 0.00015746243739565944, + "loss": 0.4847, + "step": 643 + }, + { + "epoch": 0.00022604995862039674, + "grad_norm": 0.36224085092544556, + "learning_rate": 0.0001573956594323873, + "loss": 0.6804, + "step": 644 + }, + { + "epoch": 0.0002264009678729129, + "grad_norm": 0.3316931426525116, + "learning_rate": 0.0001573288814691152, + "loss": 0.6413, + "step": 645 + }, + { + "epoch": 0.00022675197712542904, + "grad_norm": 0.38156425952911377, + "learning_rate": 0.00015726210350584308, + "loss": 0.5659, + "step": 646 + }, + { + "epoch": 0.00022710298637794516, + "grad_norm": 0.48353493213653564, + "learning_rate": 0.00015719532554257096, + "loss": 0.5788, + "step": 647 + }, + { + "epoch": 0.00022745399563046131, + "grad_norm": 0.3913673758506775, + "learning_rate": 0.00015712854757929886, + "loss": 0.6899, + "step": 648 + }, + { + "epoch": 0.00022780500488297746, + "grad_norm": 0.46836981177330017, + "learning_rate": 0.00015706176961602673, + "loss": 0.5712, + "step": 649 + }, + { + "epoch": 0.0002281560141354936, + "grad_norm": 0.34713172912597656, + "learning_rate": 0.0001569949916527546, + "loss": 0.381, + "step": 650 + }, + { + "epoch": 0.00022850702338800976, + "grad_norm": 0.3837398886680603, + "learning_rate": 0.00015692821368948248, + "loss": 0.5236, + "step": 651 + }, + { + "epoch": 0.00022885803264052589, + "grad_norm": 0.5181556940078735, + "learning_rate": 0.00015686143572621035, + "loss": 0.5889, + "step": 652 + }, + { + "epoch": 0.00022920904189304203, + "grad_norm": 0.42713961005210876, + "learning_rate": 0.00015679465776293825, + "loss": 0.5346, + "step": 653 + }, + { + "epoch": 0.00022956005114555818, + "grad_norm": 0.2868479788303375, + "learning_rate": 0.00015672787979966612, + "loss": 0.5546, + "step": 654 + }, + { + "epoch": 0.00022991106039807433, + "grad_norm": 0.31901800632476807, + "learning_rate": 0.000156661101836394, + "loss": 0.5014, + "step": 655 + }, + { + "epoch": 0.00023026206965059048, + "grad_norm": 0.41681963205337524, + "learning_rate": 0.00015659432387312187, + "loss": 0.5709, + "step": 656 + }, + { + "epoch": 0.0002306130789031066, + "grad_norm": 0.5942090749740601, + "learning_rate": 0.00015652754590984974, + "loss": 0.6022, + "step": 657 + }, + { + "epoch": 0.00023096408815562276, + "grad_norm": 0.405391126871109, + "learning_rate": 0.00015646076794657764, + "loss": 0.5363, + "step": 658 + }, + { + "epoch": 0.0002313150974081389, + "grad_norm": 0.3201390206813812, + "learning_rate": 0.00015639398998330552, + "loss": 0.6045, + "step": 659 + }, + { + "epoch": 0.00023166610666065505, + "grad_norm": 0.2989407479763031, + "learning_rate": 0.0001563272120200334, + "loss": 0.5604, + "step": 660 + }, + { + "epoch": 0.0002320171159131712, + "grad_norm": 0.3919268548488617, + "learning_rate": 0.00015626043405676126, + "loss": 0.5413, + "step": 661 + }, + { + "epoch": 0.00023236812516568733, + "grad_norm": 0.4080122709274292, + "learning_rate": 0.00015619365609348916, + "loss": 0.498, + "step": 662 + }, + { + "epoch": 0.00023271913441820348, + "grad_norm": 0.38974156975746155, + "learning_rate": 0.00015612687813021704, + "loss": 0.6149, + "step": 663 + }, + { + "epoch": 0.00023307014367071962, + "grad_norm": 0.3145015835762024, + "learning_rate": 0.00015606010016694494, + "loss": 0.4886, + "step": 664 + }, + { + "epoch": 0.00023342115292323577, + "grad_norm": 0.3009328246116638, + "learning_rate": 0.0001559933222036728, + "loss": 0.5534, + "step": 665 + }, + { + "epoch": 0.00023377216217575192, + "grad_norm": 0.4774717092514038, + "learning_rate": 0.00015592654424040068, + "loss": 0.6006, + "step": 666 + }, + { + "epoch": 0.00023412317142826805, + "grad_norm": 0.32965418696403503, + "learning_rate": 0.00015585976627712856, + "loss": 0.5463, + "step": 667 + }, + { + "epoch": 0.0002344741806807842, + "grad_norm": 0.3066554665565491, + "learning_rate": 0.00015579298831385643, + "loss": 0.5675, + "step": 668 + }, + { + "epoch": 0.00023482518993330035, + "grad_norm": 0.3879207372665405, + "learning_rate": 0.00015572621035058433, + "loss": 0.5825, + "step": 669 + }, + { + "epoch": 0.0002351761991858165, + "grad_norm": 0.3171943128108978, + "learning_rate": 0.0001556594323873122, + "loss": 0.5677, + "step": 670 + }, + { + "epoch": 0.00023552720843833264, + "grad_norm": 0.36982622742652893, + "learning_rate": 0.00015559265442404007, + "loss": 0.5885, + "step": 671 + }, + { + "epoch": 0.00023587821769084877, + "grad_norm": 0.30437183380126953, + "learning_rate": 0.00015552587646076795, + "loss": 0.6288, + "step": 672 + }, + { + "epoch": 0.00023622922694336492, + "grad_norm": 0.30654504895210266, + "learning_rate": 0.00015545909849749582, + "loss": 0.5924, + "step": 673 + }, + { + "epoch": 0.00023658023619588107, + "grad_norm": 0.3771214783191681, + "learning_rate": 0.00015539232053422372, + "loss": 0.4901, + "step": 674 + }, + { + "epoch": 0.00023693124544839721, + "grad_norm": 0.3018699884414673, + "learning_rate": 0.0001553255425709516, + "loss": 0.6159, + "step": 675 + }, + { + "epoch": 0.00023728225470091336, + "grad_norm": 0.32899734377861023, + "learning_rate": 0.00015525876460767947, + "loss": 0.6197, + "step": 676 + }, + { + "epoch": 0.0002376332639534295, + "grad_norm": 0.31837883591651917, + "learning_rate": 0.00015519198664440734, + "loss": 0.5449, + "step": 677 + }, + { + "epoch": 0.00023798427320594564, + "grad_norm": 0.35326528549194336, + "learning_rate": 0.00015512520868113521, + "loss": 0.6315, + "step": 678 + }, + { + "epoch": 0.00023833528245846179, + "grad_norm": 0.3714829385280609, + "learning_rate": 0.00015505843071786311, + "loss": 0.6352, + "step": 679 + }, + { + "epoch": 0.00023868629171097794, + "grad_norm": 0.4002094864845276, + "learning_rate": 0.000154991652754591, + "loss": 0.4235, + "step": 680 + }, + { + "epoch": 0.00023903730096349408, + "grad_norm": 0.3382783532142639, + "learning_rate": 0.0001549248747913189, + "loss": 0.5476, + "step": 681 + }, + { + "epoch": 0.0002393883102160102, + "grad_norm": 0.2985747158527374, + "learning_rate": 0.00015485809682804676, + "loss": 0.5684, + "step": 682 + }, + { + "epoch": 0.00023973931946852636, + "grad_norm": 0.3288929760456085, + "learning_rate": 0.00015479131886477463, + "loss": 0.5657, + "step": 683 + }, + { + "epoch": 0.0002400903287210425, + "grad_norm": 0.39641210436820984, + "learning_rate": 0.0001547245409015025, + "loss": 0.6283, + "step": 684 + }, + { + "epoch": 0.00024044133797355866, + "grad_norm": 0.37413230538368225, + "learning_rate": 0.0001546577629382304, + "loss": 0.5778, + "step": 685 + }, + { + "epoch": 0.0002407923472260748, + "grad_norm": 0.28837504982948303, + "learning_rate": 0.00015459098497495828, + "loss": 0.5079, + "step": 686 + }, + { + "epoch": 0.00024114335647859093, + "grad_norm": 0.32851526141166687, + "learning_rate": 0.00015452420701168615, + "loss": 0.649, + "step": 687 + }, + { + "epoch": 0.00024149436573110708, + "grad_norm": 0.3848758637905121, + "learning_rate": 0.00015445742904841403, + "loss": 0.6099, + "step": 688 + }, + { + "epoch": 0.00024184537498362323, + "grad_norm": 0.35494935512542725, + "learning_rate": 0.0001543906510851419, + "loss": 0.6498, + "step": 689 + }, + { + "epoch": 0.00024219638423613938, + "grad_norm": 0.3431280553340912, + "learning_rate": 0.0001543238731218698, + "loss": 0.4934, + "step": 690 + }, + { + "epoch": 0.00024254739348865553, + "grad_norm": 0.33980974555015564, + "learning_rate": 0.00015425709515859767, + "loss": 0.5556, + "step": 691 + }, + { + "epoch": 0.00024289840274117165, + "grad_norm": 0.3086068034172058, + "learning_rate": 0.00015419031719532555, + "loss": 0.5955, + "step": 692 + }, + { + "epoch": 0.0002432494119936878, + "grad_norm": 0.33093178272247314, + "learning_rate": 0.00015412353923205342, + "loss": 0.5926, + "step": 693 + }, + { + "epoch": 0.00024360042124620395, + "grad_norm": 0.3660534620285034, + "learning_rate": 0.0001540567612687813, + "loss": 0.5494, + "step": 694 + }, + { + "epoch": 0.0002439514304987201, + "grad_norm": 0.29803964495658875, + "learning_rate": 0.0001539899833055092, + "loss": 0.6074, + "step": 695 + }, + { + "epoch": 0.00024430243975123625, + "grad_norm": 0.36542224884033203, + "learning_rate": 0.00015392320534223707, + "loss": 0.59, + "step": 696 + }, + { + "epoch": 0.00024465344900375237, + "grad_norm": 0.34015166759490967, + "learning_rate": 0.00015385642737896494, + "loss": 0.6029, + "step": 697 + }, + { + "epoch": 0.00024500445825626854, + "grad_norm": 0.3211725950241089, + "learning_rate": 0.00015378964941569284, + "loss": 0.535, + "step": 698 + }, + { + "epoch": 0.00024535546750878467, + "grad_norm": 0.37027183175086975, + "learning_rate": 0.0001537228714524207, + "loss": 0.6265, + "step": 699 + }, + { + "epoch": 0.0002457064767613008, + "grad_norm": 0.3447396159172058, + "learning_rate": 0.00015365609348914859, + "loss": 0.6061, + "step": 700 + }, + { + "epoch": 0.00024605748601381697, + "grad_norm": 0.3344075679779053, + "learning_rate": 0.00015358931552587649, + "loss": 0.5412, + "step": 701 + }, + { + "epoch": 0.0002464084952663331, + "grad_norm": 0.29049620032310486, + "learning_rate": 0.00015352253756260436, + "loss": 0.5137, + "step": 702 + }, + { + "epoch": 0.00024675950451884926, + "grad_norm": 0.37048932909965515, + "learning_rate": 0.00015345575959933223, + "loss": 0.6118, + "step": 703 + }, + { + "epoch": 0.0002471105137713654, + "grad_norm": 0.38212522864341736, + "learning_rate": 0.0001533889816360601, + "loss": 0.466, + "step": 704 + }, + { + "epoch": 0.0002474615230238815, + "grad_norm": 0.3576483428478241, + "learning_rate": 0.00015332220367278798, + "loss": 0.561, + "step": 705 + }, + { + "epoch": 0.0002478125322763977, + "grad_norm": 0.3550293743610382, + "learning_rate": 0.00015325542570951588, + "loss": 0.5634, + "step": 706 + }, + { + "epoch": 0.0002481635415289138, + "grad_norm": 0.362474650144577, + "learning_rate": 0.00015318864774624375, + "loss": 0.5608, + "step": 707 + }, + { + "epoch": 0.00024851455078143, + "grad_norm": 0.39463603496551514, + "learning_rate": 0.00015312186978297163, + "loss": 0.64, + "step": 708 + }, + { + "epoch": 0.0002488655600339461, + "grad_norm": 0.3456307649612427, + "learning_rate": 0.0001530550918196995, + "loss": 0.4631, + "step": 709 + }, + { + "epoch": 0.00024921656928646223, + "grad_norm": 0.3300929367542267, + "learning_rate": 0.00015298831385642737, + "loss": 0.3984, + "step": 710 + }, + { + "epoch": 0.0002495675785389784, + "grad_norm": 0.35923343896865845, + "learning_rate": 0.00015292153589315527, + "loss": 0.6003, + "step": 711 + }, + { + "epoch": 0.00024991858779149453, + "grad_norm": 0.4047611653804779, + "learning_rate": 0.00015285475792988315, + "loss": 0.5715, + "step": 712 + }, + { + "epoch": 0.0002502695970440107, + "grad_norm": 0.43539851903915405, + "learning_rate": 0.00015278797996661102, + "loss": 0.571, + "step": 713 + }, + { + "epoch": 0.00025062060629652683, + "grad_norm": 0.34745046496391296, + "learning_rate": 0.0001527212020033389, + "loss": 0.622, + "step": 714 + }, + { + "epoch": 0.00025097161554904295, + "grad_norm": 0.3130028247833252, + "learning_rate": 0.0001526544240400668, + "loss": 0.507, + "step": 715 + }, + { + "epoch": 0.0002513226248015591, + "grad_norm": 0.3093617558479309, + "learning_rate": 0.00015258764607679466, + "loss": 0.4951, + "step": 716 + }, + { + "epoch": 0.00025167363405407525, + "grad_norm": 0.34299540519714355, + "learning_rate": 0.00015252086811352257, + "loss": 0.539, + "step": 717 + }, + { + "epoch": 0.0002520246433065914, + "grad_norm": 0.32698413729667664, + "learning_rate": 0.00015245409015025044, + "loss": 0.4588, + "step": 718 + }, + { + "epoch": 0.00025237565255910755, + "grad_norm": 0.37853989005088806, + "learning_rate": 0.0001523873121869783, + "loss": 0.6227, + "step": 719 + }, + { + "epoch": 0.00025272666181162367, + "grad_norm": 0.32887300848960876, + "learning_rate": 0.00015232053422370618, + "loss": 0.5893, + "step": 720 + }, + { + "epoch": 0.00025307767106413985, + "grad_norm": 0.43352028727531433, + "learning_rate": 0.00015225375626043406, + "loss": 0.5811, + "step": 721 + }, + { + "epoch": 0.00025342868031665597, + "grad_norm": 0.42844903469085693, + "learning_rate": 0.00015218697829716196, + "loss": 0.6196, + "step": 722 + }, + { + "epoch": 0.00025377968956917215, + "grad_norm": 0.39929670095443726, + "learning_rate": 0.00015212020033388983, + "loss": 0.6722, + "step": 723 + }, + { + "epoch": 0.00025413069882168827, + "grad_norm": 0.5063486695289612, + "learning_rate": 0.0001520534223706177, + "loss": 0.6086, + "step": 724 + }, + { + "epoch": 0.0002544817080742044, + "grad_norm": 0.3625267446041107, + "learning_rate": 0.00015198664440734558, + "loss": 0.6331, + "step": 725 + }, + { + "epoch": 0.00025483271732672057, + "grad_norm": 0.3452700078487396, + "learning_rate": 0.00015191986644407345, + "loss": 0.5812, + "step": 726 + }, + { + "epoch": 0.0002551837265792367, + "grad_norm": 0.31915003061294556, + "learning_rate": 0.00015185308848080135, + "loss": 0.5653, + "step": 727 + }, + { + "epoch": 0.00025553473583175287, + "grad_norm": 0.3085877299308777, + "learning_rate": 0.00015178631051752922, + "loss": 0.4702, + "step": 728 + }, + { + "epoch": 0.000255885745084269, + "grad_norm": 0.31519320607185364, + "learning_rate": 0.0001517195325542571, + "loss": 0.5096, + "step": 729 + }, + { + "epoch": 0.0002562367543367851, + "grad_norm": 0.3637699782848358, + "learning_rate": 0.00015165275459098497, + "loss": 0.6001, + "step": 730 + }, + { + "epoch": 0.0002565877635893013, + "grad_norm": 0.34056970477104187, + "learning_rate": 0.00015158597662771284, + "loss": 0.5546, + "step": 731 + }, + { + "epoch": 0.0002569387728418174, + "grad_norm": 0.37110257148742676, + "learning_rate": 0.00015151919866444074, + "loss": 0.5612, + "step": 732 + }, + { + "epoch": 0.0002572897820943336, + "grad_norm": 0.35854101181030273, + "learning_rate": 0.00015145242070116862, + "loss": 0.6364, + "step": 733 + }, + { + "epoch": 0.0002576407913468497, + "grad_norm": 0.4340030252933502, + "learning_rate": 0.00015138564273789652, + "loss": 0.5772, + "step": 734 + }, + { + "epoch": 0.00025799180059936583, + "grad_norm": 0.3807721436023712, + "learning_rate": 0.0001513188647746244, + "loss": 0.4986, + "step": 735 + }, + { + "epoch": 0.000258342809851882, + "grad_norm": 0.3522527813911438, + "learning_rate": 0.00015125208681135226, + "loss": 0.5982, + "step": 736 + }, + { + "epoch": 0.00025869381910439813, + "grad_norm": 0.31251296401023865, + "learning_rate": 0.00015118530884808014, + "loss": 0.5239, + "step": 737 + }, + { + "epoch": 0.0002590448283569143, + "grad_norm": 0.3460885286331177, + "learning_rate": 0.00015111853088480804, + "loss": 0.5881, + "step": 738 + }, + { + "epoch": 0.00025939583760943043, + "grad_norm": 0.33298879861831665, + "learning_rate": 0.0001510517529215359, + "loss": 0.5272, + "step": 739 + }, + { + "epoch": 0.00025974684686194655, + "grad_norm": 0.351468950510025, + "learning_rate": 0.00015098497495826378, + "loss": 0.6049, + "step": 740 + }, + { + "epoch": 0.00026009785611446273, + "grad_norm": 0.3449242413043976, + "learning_rate": 0.00015091819699499166, + "loss": 0.5983, + "step": 741 + }, + { + "epoch": 0.00026044886536697885, + "grad_norm": 0.34724265336990356, + "learning_rate": 0.00015085141903171953, + "loss": 0.5292, + "step": 742 + }, + { + "epoch": 0.00026079987461949503, + "grad_norm": 0.3525671660900116, + "learning_rate": 0.00015078464106844743, + "loss": 0.5391, + "step": 743 + }, + { + "epoch": 0.00026115088387201115, + "grad_norm": 0.33959653973579407, + "learning_rate": 0.0001507178631051753, + "loss": 0.5898, + "step": 744 + }, + { + "epoch": 0.00026150189312452727, + "grad_norm": 0.5051225423812866, + "learning_rate": 0.00015065108514190318, + "loss": 0.5408, + "step": 745 + }, + { + "epoch": 0.00026185290237704345, + "grad_norm": 0.3298085629940033, + "learning_rate": 0.00015058430717863105, + "loss": 0.557, + "step": 746 + }, + { + "epoch": 0.00026220391162955957, + "grad_norm": 0.3375703990459442, + "learning_rate": 0.00015051752921535892, + "loss": 0.5541, + "step": 747 + }, + { + "epoch": 0.00026255492088207575, + "grad_norm": 0.27896445989608765, + "learning_rate": 0.0001504507512520868, + "loss": 0.5273, + "step": 748 + }, + { + "epoch": 0.00026290593013459187, + "grad_norm": 0.30591917037963867, + "learning_rate": 0.0001503839732888147, + "loss": 0.5988, + "step": 749 + }, + { + "epoch": 0.000263256939387108, + "grad_norm": 0.41014084219932556, + "learning_rate": 0.00015031719532554257, + "loss": 0.555, + "step": 750 + }, + { + "epoch": 0.00026360794863962417, + "grad_norm": 0.2935464084148407, + "learning_rate": 0.00015025041736227047, + "loss": 0.625, + "step": 751 + }, + { + "epoch": 0.0002639589578921403, + "grad_norm": 0.46361032128334045, + "learning_rate": 0.00015018363939899834, + "loss": 0.4753, + "step": 752 + }, + { + "epoch": 0.00026430996714465647, + "grad_norm": 0.35808300971984863, + "learning_rate": 0.00015011686143572622, + "loss": 0.5531, + "step": 753 + }, + { + "epoch": 0.0002646609763971726, + "grad_norm": 0.3411274254322052, + "learning_rate": 0.00015005008347245412, + "loss": 0.5577, + "step": 754 + }, + { + "epoch": 0.0002650119856496887, + "grad_norm": 0.34169328212738037, + "learning_rate": 0.000149983305509182, + "loss": 0.4856, + "step": 755 + }, + { + "epoch": 0.0002653629949022049, + "grad_norm": 0.38024139404296875, + "learning_rate": 0.00014991652754590986, + "loss": 0.5203, + "step": 756 + }, + { + "epoch": 0.000265714004154721, + "grad_norm": 0.35004425048828125, + "learning_rate": 0.00014984974958263774, + "loss": 0.4999, + "step": 757 + }, + { + "epoch": 0.0002660650134072372, + "grad_norm": 0.47526153922080994, + "learning_rate": 0.0001497829716193656, + "loss": 0.5503, + "step": 758 + }, + { + "epoch": 0.0002664160226597533, + "grad_norm": 0.35096925497055054, + "learning_rate": 0.0001497161936560935, + "loss": 0.5812, + "step": 759 + }, + { + "epoch": 0.00026676703191226943, + "grad_norm": 0.4505446255207062, + "learning_rate": 0.00014964941569282138, + "loss": 0.6069, + "step": 760 + }, + { + "epoch": 0.0002671180411647856, + "grad_norm": 0.3261663019657135, + "learning_rate": 0.00014958263772954926, + "loss": 0.5601, + "step": 761 + }, + { + "epoch": 0.00026746905041730173, + "grad_norm": 0.3397548794746399, + "learning_rate": 0.00014951585976627713, + "loss": 0.5572, + "step": 762 + }, + { + "epoch": 0.00026782005966981785, + "grad_norm": 0.35547688603401184, + "learning_rate": 0.000149449081803005, + "loss": 0.5983, + "step": 763 + }, + { + "epoch": 0.00026817106892233403, + "grad_norm": 0.41515079140663147, + "learning_rate": 0.00014938230383973287, + "loss": 0.6106, + "step": 764 + }, + { + "epoch": 0.00026852207817485015, + "grad_norm": 0.3840051591396332, + "learning_rate": 0.00014931552587646077, + "loss": 0.5328, + "step": 765 + }, + { + "epoch": 0.00026887308742736633, + "grad_norm": 0.3401285707950592, + "learning_rate": 0.00014924874791318865, + "loss": 0.4666, + "step": 766 + }, + { + "epoch": 0.00026922409667988245, + "grad_norm": 0.32983794808387756, + "learning_rate": 0.00014918196994991652, + "loss": 0.5214, + "step": 767 + }, + { + "epoch": 0.0002695751059323986, + "grad_norm": 0.30202198028564453, + "learning_rate": 0.00014911519198664442, + "loss": 0.4969, + "step": 768 + }, + { + "epoch": 0.00026992611518491475, + "grad_norm": 0.3222092092037201, + "learning_rate": 0.0001490484140233723, + "loss": 0.5093, + "step": 769 + }, + { + "epoch": 0.0002702771244374309, + "grad_norm": 0.4211997091770172, + "learning_rate": 0.0001489816360601002, + "loss": 0.6295, + "step": 770 + }, + { + "epoch": 0.00027062813368994705, + "grad_norm": 0.32112184166908264, + "learning_rate": 0.00014891485809682807, + "loss": 0.5611, + "step": 771 + }, + { + "epoch": 0.00027097914294246317, + "grad_norm": 0.3272956609725952, + "learning_rate": 0.00014884808013355594, + "loss": 0.6438, + "step": 772 + }, + { + "epoch": 0.0002713301521949793, + "grad_norm": 0.39423295855522156, + "learning_rate": 0.00014878130217028381, + "loss": 0.6029, + "step": 773 + }, + { + "epoch": 0.00027168116144749547, + "grad_norm": 0.3053528070449829, + "learning_rate": 0.0001487145242070117, + "loss": 0.4978, + "step": 774 + }, + { + "epoch": 0.0002720321707000116, + "grad_norm": 0.312774658203125, + "learning_rate": 0.0001486477462437396, + "loss": 0.5753, + "step": 775 + }, + { + "epoch": 0.00027238317995252777, + "grad_norm": 0.343964546918869, + "learning_rate": 0.00014858096828046746, + "loss": 0.5173, + "step": 776 + }, + { + "epoch": 0.0002727341892050439, + "grad_norm": 0.39104631543159485, + "learning_rate": 0.00014851419031719533, + "loss": 0.6381, + "step": 777 + }, + { + "epoch": 0.00027308519845756, + "grad_norm": 0.3958207070827484, + "learning_rate": 0.0001484474123539232, + "loss": 0.6046, + "step": 778 + }, + { + "epoch": 0.0002734362077100762, + "grad_norm": 0.36198097467422485, + "learning_rate": 0.00014838063439065108, + "loss": 0.6066, + "step": 779 + }, + { + "epoch": 0.0002737872169625923, + "grad_norm": 0.29619571566581726, + "learning_rate": 0.00014831385642737895, + "loss": 0.5131, + "step": 780 + }, + { + "epoch": 0.0002741382262151085, + "grad_norm": 0.344784677028656, + "learning_rate": 0.00014824707846410685, + "loss": 0.5626, + "step": 781 + }, + { + "epoch": 0.0002744892354676246, + "grad_norm": 0.35641250014305115, + "learning_rate": 0.00014818030050083473, + "loss": 0.5451, + "step": 782 + }, + { + "epoch": 0.00027484024472014074, + "grad_norm": 0.3496847152709961, + "learning_rate": 0.0001481135225375626, + "loss": 0.4814, + "step": 783 + }, + { + "epoch": 0.0002751912539726569, + "grad_norm": 0.3726658821105957, + "learning_rate": 0.00014804674457429047, + "loss": 0.6244, + "step": 784 + }, + { + "epoch": 0.00027554226322517303, + "grad_norm": 0.3317565619945526, + "learning_rate": 0.00014797996661101837, + "loss": 0.562, + "step": 785 + }, + { + "epoch": 0.0002758932724776892, + "grad_norm": 0.3478979468345642, + "learning_rate": 0.00014791318864774625, + "loss": 0.613, + "step": 786 + }, + { + "epoch": 0.00027624428173020533, + "grad_norm": 0.3572550415992737, + "learning_rate": 0.00014784641068447415, + "loss": 0.4841, + "step": 787 + }, + { + "epoch": 0.00027659529098272146, + "grad_norm": 0.34030210971832275, + "learning_rate": 0.00014777963272120202, + "loss": 0.4879, + "step": 788 + }, + { + "epoch": 0.00027694630023523763, + "grad_norm": 0.378203421831131, + "learning_rate": 0.0001477128547579299, + "loss": 0.6086, + "step": 789 + }, + { + "epoch": 0.00027729730948775375, + "grad_norm": 0.3390562832355499, + "learning_rate": 0.00014764607679465777, + "loss": 0.586, + "step": 790 + }, + { + "epoch": 0.00027764831874026993, + "grad_norm": 0.4986645579338074, + "learning_rate": 0.00014757929883138567, + "loss": 0.5592, + "step": 791 + }, + { + "epoch": 0.00027799932799278605, + "grad_norm": 0.3361869156360626, + "learning_rate": 0.00014751252086811354, + "loss": 0.4632, + "step": 792 + }, + { + "epoch": 0.0002783503372453022, + "grad_norm": 0.3726123571395874, + "learning_rate": 0.0001474457429048414, + "loss": 0.4915, + "step": 793 + }, + { + "epoch": 0.00027870134649781835, + "grad_norm": 0.3358845114707947, + "learning_rate": 0.00014737896494156929, + "loss": 0.5593, + "step": 794 + }, + { + "epoch": 0.0002790523557503345, + "grad_norm": 0.30473607778549194, + "learning_rate": 0.00014731218697829716, + "loss": 0.3672, + "step": 795 + }, + { + "epoch": 0.00027940336500285065, + "grad_norm": 0.33929023146629333, + "learning_rate": 0.00014724540901502506, + "loss": 0.5404, + "step": 796 + }, + { + "epoch": 0.0002797543742553668, + "grad_norm": 0.30778205394744873, + "learning_rate": 0.00014717863105175293, + "loss": 0.4379, + "step": 797 + }, + { + "epoch": 0.0002801053835078829, + "grad_norm": 0.286443829536438, + "learning_rate": 0.0001471118530884808, + "loss": 0.5579, + "step": 798 + }, + { + "epoch": 0.0002804563927603991, + "grad_norm": 0.4246799051761627, + "learning_rate": 0.00014704507512520868, + "loss": 0.536, + "step": 799 + }, + { + "epoch": 0.0002808074020129152, + "grad_norm": 0.4085538983345032, + "learning_rate": 0.00014697829716193655, + "loss": 0.5309, + "step": 800 + }, + { + "epoch": 0.00028115841126543137, + "grad_norm": 0.35396453738212585, + "learning_rate": 0.00014691151919866443, + "loss": 0.5307, + "step": 801 + }, + { + "epoch": 0.0002815094205179475, + "grad_norm": 0.45588648319244385, + "learning_rate": 0.00014684474123539233, + "loss": 0.5905, + "step": 802 + }, + { + "epoch": 0.0002818604297704636, + "grad_norm": 0.3353815972805023, + "learning_rate": 0.0001467779632721202, + "loss": 0.612, + "step": 803 + }, + { + "epoch": 0.0002822114390229798, + "grad_norm": 0.4152653217315674, + "learning_rate": 0.0001467111853088481, + "loss": 0.592, + "step": 804 + }, + { + "epoch": 0.0002825624482754959, + "grad_norm": 0.3651511073112488, + "learning_rate": 0.00014664440734557597, + "loss": 0.5909, + "step": 805 + }, + { + "epoch": 0.0002829134575280121, + "grad_norm": 0.3518235385417938, + "learning_rate": 0.00014657762938230385, + "loss": 0.5684, + "step": 806 + }, + { + "epoch": 0.0002832644667805282, + "grad_norm": 0.33562156558036804, + "learning_rate": 0.00014651085141903175, + "loss": 0.5165, + "step": 807 + }, + { + "epoch": 0.00028361547603304434, + "grad_norm": 0.3648052513599396, + "learning_rate": 0.00014644407345575962, + "loss": 0.5451, + "step": 808 + }, + { + "epoch": 0.0002839664852855605, + "grad_norm": 0.44342300295829773, + "learning_rate": 0.0001463772954924875, + "loss": 0.5907, + "step": 809 + }, + { + "epoch": 0.00028431749453807664, + "grad_norm": 0.33331966400146484, + "learning_rate": 0.00014631051752921536, + "loss": 0.4254, + "step": 810 + }, + { + "epoch": 0.0002846685037905928, + "grad_norm": 0.3444873094558716, + "learning_rate": 0.00014624373956594324, + "loss": 0.5201, + "step": 811 + }, + { + "epoch": 0.00028501951304310894, + "grad_norm": 0.4239615201950073, + "learning_rate": 0.00014617696160267114, + "loss": 0.5098, + "step": 812 + }, + { + "epoch": 0.00028537052229562506, + "grad_norm": 0.47895997762680054, + "learning_rate": 0.000146110183639399, + "loss": 0.6243, + "step": 813 + }, + { + "epoch": 0.00028572153154814123, + "grad_norm": 0.47322046756744385, + "learning_rate": 0.00014604340567612688, + "loss": 0.6841, + "step": 814 + }, + { + "epoch": 0.00028607254080065736, + "grad_norm": 0.35017871856689453, + "learning_rate": 0.00014597662771285476, + "loss": 0.5313, + "step": 815 + }, + { + "epoch": 0.00028642355005317353, + "grad_norm": 0.4342300295829773, + "learning_rate": 0.00014590984974958263, + "loss": 0.4363, + "step": 816 + }, + { + "epoch": 0.00028677455930568966, + "grad_norm": 0.2966228723526001, + "learning_rate": 0.0001458430717863105, + "loss": 0.6428, + "step": 817 + }, + { + "epoch": 0.0002871255685582058, + "grad_norm": 0.3320361375808716, + "learning_rate": 0.0001457762938230384, + "loss": 0.5266, + "step": 818 + }, + { + "epoch": 0.00028747657781072195, + "grad_norm": 0.3318590223789215, + "learning_rate": 0.00014570951585976628, + "loss": 0.5676, + "step": 819 + }, + { + "epoch": 0.0002878275870632381, + "grad_norm": 0.38573157787323, + "learning_rate": 0.00014564273789649415, + "loss": 0.7083, + "step": 820 + }, + { + "epoch": 0.00028817859631575425, + "grad_norm": 0.3731164038181305, + "learning_rate": 0.00014557595993322205, + "loss": 0.578, + "step": 821 + }, + { + "epoch": 0.0002885296055682704, + "grad_norm": 0.33610039949417114, + "learning_rate": 0.00014550918196994992, + "loss": 0.5923, + "step": 822 + }, + { + "epoch": 0.0002888806148207865, + "grad_norm": 0.3393179476261139, + "learning_rate": 0.00014544240400667782, + "loss": 0.5162, + "step": 823 + }, + { + "epoch": 0.0002892316240733027, + "grad_norm": 0.35552918910980225, + "learning_rate": 0.0001453756260434057, + "loss": 0.556, + "step": 824 + }, + { + "epoch": 0.0002895826333258188, + "grad_norm": 0.32425832748413086, + "learning_rate": 0.00014530884808013357, + "loss": 0.5157, + "step": 825 + }, + { + "epoch": 0.000289933642578335, + "grad_norm": 0.3353455662727356, + "learning_rate": 0.00014524207011686144, + "loss": 0.483, + "step": 826 + }, + { + "epoch": 0.0002902846518308511, + "grad_norm": 0.46254628896713257, + "learning_rate": 0.00014517529215358932, + "loss": 0.633, + "step": 827 + }, + { + "epoch": 0.0002906356610833672, + "grad_norm": 0.3275732100009918, + "learning_rate": 0.00014510851419031722, + "loss": 0.5502, + "step": 828 + }, + { + "epoch": 0.0002909866703358834, + "grad_norm": 0.3495190441608429, + "learning_rate": 0.0001450417362270451, + "loss": 0.368, + "step": 829 + }, + { + "epoch": 0.0002913376795883995, + "grad_norm": 0.35350501537323, + "learning_rate": 0.00014497495826377296, + "loss": 0.5819, + "step": 830 + }, + { + "epoch": 0.0002916886888409157, + "grad_norm": 0.37886378169059753, + "learning_rate": 0.00014490818030050084, + "loss": 0.5418, + "step": 831 + }, + { + "epoch": 0.0002920396980934318, + "grad_norm": 0.4279928505420685, + "learning_rate": 0.0001448414023372287, + "loss": 0.5199, + "step": 832 + }, + { + "epoch": 0.00029239070734594794, + "grad_norm": 0.33105382323265076, + "learning_rate": 0.00014477462437395658, + "loss": 0.5952, + "step": 833 + }, + { + "epoch": 0.0002927417165984641, + "grad_norm": 0.40114086866378784, + "learning_rate": 0.00014470784641068448, + "loss": 0.4611, + "step": 834 + }, + { + "epoch": 0.00029309272585098024, + "grad_norm": 0.3294037878513336, + "learning_rate": 0.00014464106844741236, + "loss": 0.5562, + "step": 835 + }, + { + "epoch": 0.0002934437351034964, + "grad_norm": 0.3391546607017517, + "learning_rate": 0.00014457429048414023, + "loss": 0.5748, + "step": 836 + }, + { + "epoch": 0.00029379474435601254, + "grad_norm": 0.4093922972679138, + "learning_rate": 0.0001445075125208681, + "loss": 0.4607, + "step": 837 + }, + { + "epoch": 0.00029414575360852866, + "grad_norm": 0.3331819176673889, + "learning_rate": 0.000144440734557596, + "loss": 0.5874, + "step": 838 + }, + { + "epoch": 0.00029449676286104484, + "grad_norm": 0.43205946683883667, + "learning_rate": 0.00014437395659432388, + "loss": 0.6152, + "step": 839 + }, + { + "epoch": 0.00029484777211356096, + "grad_norm": 0.36046868562698364, + "learning_rate": 0.00014430717863105178, + "loss": 0.4781, + "step": 840 + }, + { + "epoch": 0.00029519878136607713, + "grad_norm": 0.35514524579048157, + "learning_rate": 0.00014424040066777965, + "loss": 0.568, + "step": 841 + }, + { + "epoch": 0.00029554979061859326, + "grad_norm": 0.40260326862335205, + "learning_rate": 0.00014417362270450752, + "loss": 0.6075, + "step": 842 + }, + { + "epoch": 0.0002959007998711094, + "grad_norm": 0.3102671205997467, + "learning_rate": 0.0001441068447412354, + "loss": 0.4927, + "step": 843 + }, + { + "epoch": 0.00029625180912362556, + "grad_norm": 0.30940982699394226, + "learning_rate": 0.0001440400667779633, + "loss": 0.5549, + "step": 844 + }, + { + "epoch": 0.0002966028183761417, + "grad_norm": 0.3652762174606323, + "learning_rate": 0.00014397328881469117, + "loss": 0.6085, + "step": 845 + }, + { + "epoch": 0.00029695382762865786, + "grad_norm": 0.43056777119636536, + "learning_rate": 0.00014390651085141904, + "loss": 0.494, + "step": 846 + }, + { + "epoch": 0.000297304836881174, + "grad_norm": 0.3112967014312744, + "learning_rate": 0.00014383973288814692, + "loss": 0.5141, + "step": 847 + }, + { + "epoch": 0.0002976558461336901, + "grad_norm": 0.36729326844215393, + "learning_rate": 0.0001437729549248748, + "loss": 0.5435, + "step": 848 + }, + { + "epoch": 0.0002980068553862063, + "grad_norm": 0.3128114938735962, + "learning_rate": 0.00014370617696160266, + "loss": 0.5419, + "step": 849 + }, + { + "epoch": 0.0002983578646387224, + "grad_norm": 0.4030589163303375, + "learning_rate": 0.00014363939899833056, + "loss": 0.5959, + "step": 850 + }, + { + "epoch": 0.0002987088738912386, + "grad_norm": 0.39571288228034973, + "learning_rate": 0.00014357262103505844, + "loss": 0.6798, + "step": 851 + }, + { + "epoch": 0.0002990598831437547, + "grad_norm": 0.3388408422470093, + "learning_rate": 0.0001435058430717863, + "loss": 0.4887, + "step": 852 + }, + { + "epoch": 0.0002994108923962708, + "grad_norm": 0.39615562558174133, + "learning_rate": 0.00014343906510851418, + "loss": 0.5654, + "step": 853 + }, + { + "epoch": 0.000299761901648787, + "grad_norm": 0.3967401683330536, + "learning_rate": 0.00014337228714524205, + "loss": 0.6192, + "step": 854 + }, + { + "epoch": 0.0003001129109013031, + "grad_norm": 0.5597772002220154, + "learning_rate": 0.00014330550918196995, + "loss": 0.5808, + "step": 855 + }, + { + "epoch": 0.0003004639201538193, + "grad_norm": 0.36231061816215515, + "learning_rate": 0.00014323873121869783, + "loss": 0.4936, + "step": 856 + }, + { + "epoch": 0.0003008149294063354, + "grad_norm": 0.3775942027568817, + "learning_rate": 0.00014317195325542573, + "loss": 0.5706, + "step": 857 + }, + { + "epoch": 0.00030116593865885154, + "grad_norm": 0.4139408767223358, + "learning_rate": 0.0001431051752921536, + "loss": 0.5784, + "step": 858 + }, + { + "epoch": 0.0003015169479113677, + "grad_norm": 0.4101429879665375, + "learning_rate": 0.00014303839732888147, + "loss": 0.5937, + "step": 859 + }, + { + "epoch": 0.00030186795716388384, + "grad_norm": 0.5272162556648254, + "learning_rate": 0.00014297161936560937, + "loss": 0.5244, + "step": 860 + }, + { + "epoch": 0.0003022189664164, + "grad_norm": 0.3587292730808258, + "learning_rate": 0.00014290484140233725, + "loss": 0.6333, + "step": 861 + }, + { + "epoch": 0.00030256997566891614, + "grad_norm": 0.3284890353679657, + "learning_rate": 0.00014283806343906512, + "loss": 0.5414, + "step": 862 + }, + { + "epoch": 0.00030292098492143226, + "grad_norm": 0.414974182844162, + "learning_rate": 0.000142771285475793, + "loss": 0.6116, + "step": 863 + }, + { + "epoch": 0.00030327199417394844, + "grad_norm": 0.33619245886802673, + "learning_rate": 0.00014270450751252087, + "loss": 0.5506, + "step": 864 + }, + { + "epoch": 0.00030362300342646456, + "grad_norm": 0.45475640892982483, + "learning_rate": 0.00014263772954924874, + "loss": 0.6347, + "step": 865 + }, + { + "epoch": 0.00030397401267898074, + "grad_norm": 0.2695920765399933, + "learning_rate": 0.00014257095158597664, + "loss": 0.4529, + "step": 866 + }, + { + "epoch": 0.00030432502193149686, + "grad_norm": 0.3314480781555176, + "learning_rate": 0.00014250417362270451, + "loss": 0.5812, + "step": 867 + }, + { + "epoch": 0.000304676031184013, + "grad_norm": 0.31949582695961, + "learning_rate": 0.0001424373956594324, + "loss": 0.5213, + "step": 868 + }, + { + "epoch": 0.00030502704043652916, + "grad_norm": 0.34049752354621887, + "learning_rate": 0.00014237061769616026, + "loss": 0.4645, + "step": 869 + }, + { + "epoch": 0.0003053780496890453, + "grad_norm": 0.4304719567298889, + "learning_rate": 0.00014230383973288813, + "loss": 0.5065, + "step": 870 + }, + { + "epoch": 0.00030572905894156146, + "grad_norm": 0.32379043102264404, + "learning_rate": 0.00014223706176961603, + "loss": 0.553, + "step": 871 + }, + { + "epoch": 0.0003060800681940776, + "grad_norm": 0.33285439014434814, + "learning_rate": 0.0001421702838063439, + "loss": 0.5092, + "step": 872 + }, + { + "epoch": 0.0003064310774465937, + "grad_norm": 0.336795449256897, + "learning_rate": 0.00014210350584307178, + "loss": 0.4967, + "step": 873 + }, + { + "epoch": 0.0003067820866991099, + "grad_norm": 0.34653040766716003, + "learning_rate": 0.00014203672787979968, + "loss": 0.5353, + "step": 874 + }, + { + "epoch": 0.000307133095951626, + "grad_norm": 0.3352467715740204, + "learning_rate": 0.00014196994991652755, + "loss": 0.5594, + "step": 875 + }, + { + "epoch": 0.0003074841052041422, + "grad_norm": 0.38723453879356384, + "learning_rate": 0.00014190317195325545, + "loss": 0.5897, + "step": 876 + }, + { + "epoch": 0.0003078351144566583, + "grad_norm": 0.3987238109111786, + "learning_rate": 0.00014183639398998333, + "loss": 0.4647, + "step": 877 + }, + { + "epoch": 0.0003081861237091744, + "grad_norm": 0.3452693223953247, + "learning_rate": 0.0001417696160267112, + "loss": 0.5687, + "step": 878 + }, + { + "epoch": 0.0003085371329616906, + "grad_norm": 0.3561328649520874, + "learning_rate": 0.00014170283806343907, + "loss": 0.5845, + "step": 879 + }, + { + "epoch": 0.0003088881422142067, + "grad_norm": 0.29658418893814087, + "learning_rate": 0.00014163606010016695, + "loss": 0.5202, + "step": 880 + }, + { + "epoch": 0.0003092391514667229, + "grad_norm": 0.3908213973045349, + "learning_rate": 0.00014156928213689482, + "loss": 0.4439, + "step": 881 + }, + { + "epoch": 0.000309590160719239, + "grad_norm": 0.35816919803619385, + "learning_rate": 0.00014150250417362272, + "loss": 0.5384, + "step": 882 + }, + { + "epoch": 0.00030994116997175514, + "grad_norm": 0.3681255877017975, + "learning_rate": 0.0001414357262103506, + "loss": 0.5999, + "step": 883 + }, + { + "epoch": 0.0003102921792242713, + "grad_norm": 0.31137388944625854, + "learning_rate": 0.00014136894824707847, + "loss": 0.4495, + "step": 884 + }, + { + "epoch": 0.00031064318847678744, + "grad_norm": 0.2831423878669739, + "learning_rate": 0.00014130217028380634, + "loss": 0.4576, + "step": 885 + }, + { + "epoch": 0.0003109941977293036, + "grad_norm": 0.25953516364097595, + "learning_rate": 0.0001412353923205342, + "loss": 0.5606, + "step": 886 + }, + { + "epoch": 0.00031134520698181974, + "grad_norm": 0.31105297803878784, + "learning_rate": 0.0001411686143572621, + "loss": 0.5986, + "step": 887 + }, + { + "epoch": 0.00031169621623433586, + "grad_norm": 0.35177484154701233, + "learning_rate": 0.00014110183639398999, + "loss": 0.3394, + "step": 888 + }, + { + "epoch": 0.00031204722548685204, + "grad_norm": 0.373470276594162, + "learning_rate": 0.00014103505843071786, + "loss": 0.5862, + "step": 889 + }, + { + "epoch": 0.00031239823473936816, + "grad_norm": 0.37227189540863037, + "learning_rate": 0.00014096828046744576, + "loss": 0.4677, + "step": 890 + }, + { + "epoch": 0.00031274924399188434, + "grad_norm": 0.3799666464328766, + "learning_rate": 0.00014090150250417363, + "loss": 0.5255, + "step": 891 + }, + { + "epoch": 0.00031310025324440046, + "grad_norm": 0.3630129098892212, + "learning_rate": 0.00014083472454090153, + "loss": 0.5111, + "step": 892 + }, + { + "epoch": 0.0003134512624969166, + "grad_norm": 0.5131457448005676, + "learning_rate": 0.0001407679465776294, + "loss": 0.5207, + "step": 893 + }, + { + "epoch": 0.00031380227174943276, + "grad_norm": 0.3759867548942566, + "learning_rate": 0.00014070116861435728, + "loss": 0.6678, + "step": 894 + }, + { + "epoch": 0.0003141532810019489, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00014063439065108515, + "loss": 0.62, + "step": 895 + }, + { + "epoch": 0.00031450429025446506, + "grad_norm": 0.2789120376110077, + "learning_rate": 0.00014056761268781303, + "loss": 0.4204, + "step": 896 + }, + { + "epoch": 0.0003148552995069812, + "grad_norm": 0.2897239327430725, + "learning_rate": 0.0001405008347245409, + "loss": 0.432, + "step": 897 + }, + { + "epoch": 0.0003152063087594973, + "grad_norm": 0.3552323579788208, + "learning_rate": 0.0001404340567612688, + "loss": 0.5512, + "step": 898 + }, + { + "epoch": 0.0003155573180120135, + "grad_norm": 0.49963894486427307, + "learning_rate": 0.00014036727879799667, + "loss": 0.5868, + "step": 899 + }, + { + "epoch": 0.0003159083272645296, + "grad_norm": 0.37479934096336365, + "learning_rate": 0.00014030050083472454, + "loss": 0.6682, + "step": 900 + }, + { + "epoch": 0.0003162593365170458, + "grad_norm": 0.3415648639202118, + "learning_rate": 0.00014023372287145242, + "loss": 0.5301, + "step": 901 + }, + { + "epoch": 0.0003166103457695619, + "grad_norm": 0.37530943751335144, + "learning_rate": 0.0001401669449081803, + "loss": 0.5409, + "step": 902 + }, + { + "epoch": 0.000316961355022078, + "grad_norm": 0.37487658858299255, + "learning_rate": 0.0001401001669449082, + "loss": 0.5976, + "step": 903 + }, + { + "epoch": 0.0003173123642745942, + "grad_norm": 0.37174728512763977, + "learning_rate": 0.00014003338898163606, + "loss": 0.5933, + "step": 904 + }, + { + "epoch": 0.0003176633735271103, + "grad_norm": 0.491584450006485, + "learning_rate": 0.00013996661101836394, + "loss": 0.5112, + "step": 905 + }, + { + "epoch": 0.0003180143827796265, + "grad_norm": 0.38381487131118774, + "learning_rate": 0.0001398998330550918, + "loss": 0.6486, + "step": 906 + }, + { + "epoch": 0.0003183653920321426, + "grad_norm": 0.2867659330368042, + "learning_rate": 0.0001398330550918197, + "loss": 0.5033, + "step": 907 + }, + { + "epoch": 0.00031871640128465874, + "grad_norm": 0.3146355450153351, + "learning_rate": 0.00013976627712854758, + "loss": 0.5878, + "step": 908 + }, + { + "epoch": 0.0003190674105371749, + "grad_norm": 0.3454856276512146, + "learning_rate": 0.00013969949916527548, + "loss": 0.4751, + "step": 909 + }, + { + "epoch": 0.00031941841978969104, + "grad_norm": 0.32241204380989075, + "learning_rate": 0.00013963272120200336, + "loss": 0.6378, + "step": 910 + }, + { + "epoch": 0.0003197694290422072, + "grad_norm": 0.33703315258026123, + "learning_rate": 0.00013956594323873123, + "loss": 0.4634, + "step": 911 + }, + { + "epoch": 0.00032012043829472334, + "grad_norm": 0.3781648576259613, + "learning_rate": 0.0001394991652754591, + "loss": 0.5218, + "step": 912 + }, + { + "epoch": 0.00032047144754723946, + "grad_norm": 0.4124391973018646, + "learning_rate": 0.00013943238731218698, + "loss": 0.4958, + "step": 913 + }, + { + "epoch": 0.00032082245679975564, + "grad_norm": 0.3970220685005188, + "learning_rate": 0.00013936560934891488, + "loss": 0.5624, + "step": 914 + }, + { + "epoch": 0.00032117346605227176, + "grad_norm": 0.43682703375816345, + "learning_rate": 0.00013929883138564275, + "loss": 0.544, + "step": 915 + }, + { + "epoch": 0.00032152447530478794, + "grad_norm": 0.3476586639881134, + "learning_rate": 0.00013923205342237062, + "loss": 0.4418, + "step": 916 + }, + { + "epoch": 0.00032187548455730406, + "grad_norm": 0.36963552236557007, + "learning_rate": 0.0001391652754590985, + "loss": 0.5946, + "step": 917 + }, + { + "epoch": 0.0003222264938098202, + "grad_norm": 0.3445582985877991, + "learning_rate": 0.00013909849749582637, + "loss": 0.5879, + "step": 918 + }, + { + "epoch": 0.00032257750306233636, + "grad_norm": 0.39813530445098877, + "learning_rate": 0.00013903171953255427, + "loss": 0.5759, + "step": 919 + }, + { + "epoch": 0.0003229285123148525, + "grad_norm": 0.3314265012741089, + "learning_rate": 0.00013896494156928214, + "loss": 0.6165, + "step": 920 + }, + { + "epoch": 0.00032327952156736866, + "grad_norm": 0.4094330072402954, + "learning_rate": 0.00013889816360601002, + "loss": 0.5787, + "step": 921 + }, + { + "epoch": 0.0003236305308198848, + "grad_norm": 0.36821484565734863, + "learning_rate": 0.0001388313856427379, + "loss": 0.5303, + "step": 922 + }, + { + "epoch": 0.0003239815400724009, + "grad_norm": 0.3517453968524933, + "learning_rate": 0.00013876460767946576, + "loss": 0.4586, + "step": 923 + }, + { + "epoch": 0.0003243325493249171, + "grad_norm": 0.2959018647670746, + "learning_rate": 0.00013869782971619366, + "loss": 0.5225, + "step": 924 + }, + { + "epoch": 0.0003246835585774332, + "grad_norm": 0.3286895751953125, + "learning_rate": 0.00013863105175292154, + "loss": 0.5353, + "step": 925 + }, + { + "epoch": 0.0003250345678299494, + "grad_norm": 0.3328275680541992, + "learning_rate": 0.00013856427378964944, + "loss": 0.5915, + "step": 926 + }, + { + "epoch": 0.0003253855770824655, + "grad_norm": 0.3400813937187195, + "learning_rate": 0.0001384974958263773, + "loss": 0.4598, + "step": 927 + }, + { + "epoch": 0.0003257365863349816, + "grad_norm": 0.2876541018486023, + "learning_rate": 0.00013843071786310518, + "loss": 0.4835, + "step": 928 + }, + { + "epoch": 0.0003260875955874978, + "grad_norm": 0.3401765525341034, + "learning_rate": 0.00013836393989983308, + "loss": 0.56, + "step": 929 + }, + { + "epoch": 0.0003264386048400139, + "grad_norm": 0.34506598114967346, + "learning_rate": 0.00013829716193656096, + "loss": 0.6234, + "step": 930 + }, + { + "epoch": 0.0003267896140925301, + "grad_norm": 0.33732855319976807, + "learning_rate": 0.00013823038397328883, + "loss": 0.5686, + "step": 931 + }, + { + "epoch": 0.0003271406233450462, + "grad_norm": 0.34300100803375244, + "learning_rate": 0.0001381636060100167, + "loss": 0.6091, + "step": 932 + }, + { + "epoch": 0.00032749163259756235, + "grad_norm": 0.30349200963974, + "learning_rate": 0.00013809682804674458, + "loss": 0.4836, + "step": 933 + }, + { + "epoch": 0.0003278426418500785, + "grad_norm": 0.35742175579071045, + "learning_rate": 0.00013803005008347245, + "loss": 0.6443, + "step": 934 + }, + { + "epoch": 0.00032819365110259464, + "grad_norm": 0.33582496643066406, + "learning_rate": 0.00013796327212020035, + "loss": 0.6361, + "step": 935 + }, + { + "epoch": 0.0003285446603551108, + "grad_norm": 0.33403804898262024, + "learning_rate": 0.00013789649415692822, + "loss": 0.5911, + "step": 936 + }, + { + "epoch": 0.00032889566960762694, + "grad_norm": 0.4263191521167755, + "learning_rate": 0.0001378297161936561, + "loss": 0.5243, + "step": 937 + }, + { + "epoch": 0.00032924667886014307, + "grad_norm": 0.31543296575546265, + "learning_rate": 0.00013776293823038397, + "loss": 0.554, + "step": 938 + }, + { + "epoch": 0.00032959768811265924, + "grad_norm": 0.38975203037261963, + "learning_rate": 0.00013769616026711184, + "loss": 0.5358, + "step": 939 + }, + { + "epoch": 0.00032994869736517536, + "grad_norm": 0.3175157904624939, + "learning_rate": 0.00013762938230383974, + "loss": 0.5385, + "step": 940 + }, + { + "epoch": 0.00033029970661769154, + "grad_norm": 0.32753151655197144, + "learning_rate": 0.00013756260434056762, + "loss": 0.5191, + "step": 941 + }, + { + "epoch": 0.00033065071587020766, + "grad_norm": 0.2516227066516876, + "learning_rate": 0.0001374958263772955, + "loss": 0.3496, + "step": 942 + }, + { + "epoch": 0.0003310017251227238, + "grad_norm": 0.275806188583374, + "learning_rate": 0.0001374290484140234, + "loss": 0.4197, + "step": 943 + }, + { + "epoch": 0.00033135273437523996, + "grad_norm": 0.30234864354133606, + "learning_rate": 0.00013736227045075126, + "loss": 0.4909, + "step": 944 + }, + { + "epoch": 0.0003317037436277561, + "grad_norm": 0.32561683654785156, + "learning_rate": 0.00013729549248747916, + "loss": 0.5865, + "step": 945 + }, + { + "epoch": 0.00033205475288027226, + "grad_norm": 0.32075145840644836, + "learning_rate": 0.00013722871452420704, + "loss": 0.5957, + "step": 946 + }, + { + "epoch": 0.0003324057621327884, + "grad_norm": 0.3077705204486847, + "learning_rate": 0.0001371619365609349, + "loss": 0.6026, + "step": 947 + }, + { + "epoch": 0.0003327567713853045, + "grad_norm": 0.3092177212238312, + "learning_rate": 0.00013709515859766278, + "loss": 0.553, + "step": 948 + }, + { + "epoch": 0.0003331077806378207, + "grad_norm": 0.3611501157283783, + "learning_rate": 0.00013702838063439065, + "loss": 0.5707, + "step": 949 + }, + { + "epoch": 0.0003334587898903368, + "grad_norm": 0.3343827724456787, + "learning_rate": 0.00013696160267111853, + "loss": 0.5626, + "step": 950 + }, + { + "epoch": 0.000333809799142853, + "grad_norm": 0.3330281376838684, + "learning_rate": 0.00013689482470784643, + "loss": 0.6353, + "step": 951 + }, + { + "epoch": 0.0003341608083953691, + "grad_norm": 0.4045816957950592, + "learning_rate": 0.0001368280467445743, + "loss": 0.5781, + "step": 952 + }, + { + "epoch": 0.0003345118176478852, + "grad_norm": 0.3618166446685791, + "learning_rate": 0.00013676126878130217, + "loss": 0.6702, + "step": 953 + }, + { + "epoch": 0.0003348628269004014, + "grad_norm": 0.2836553752422333, + "learning_rate": 0.00013669449081803005, + "loss": 0.4371, + "step": 954 + }, + { + "epoch": 0.0003352138361529175, + "grad_norm": 0.3100498914718628, + "learning_rate": 0.00013662771285475792, + "loss": 0.5184, + "step": 955 + }, + { + "epoch": 0.0003355648454054337, + "grad_norm": 0.34877723455429077, + "learning_rate": 0.00013656093489148582, + "loss": 0.4778, + "step": 956 + }, + { + "epoch": 0.0003359158546579498, + "grad_norm": 0.27756938338279724, + "learning_rate": 0.0001364941569282137, + "loss": 0.4314, + "step": 957 + }, + { + "epoch": 0.00033626686391046595, + "grad_norm": 0.36129051446914673, + "learning_rate": 0.00013642737896494157, + "loss": 0.5837, + "step": 958 + }, + { + "epoch": 0.0003366178731629821, + "grad_norm": 0.35625776648521423, + "learning_rate": 0.00013636060100166944, + "loss": 0.5579, + "step": 959 + }, + { + "epoch": 0.00033696888241549825, + "grad_norm": 0.3735104501247406, + "learning_rate": 0.00013629382303839734, + "loss": 0.5283, + "step": 960 + }, + { + "epoch": 0.0003373198916680144, + "grad_norm": 0.34185606241226196, + "learning_rate": 0.00013622704507512521, + "loss": 0.5669, + "step": 961 + }, + { + "epoch": 0.00033767090092053054, + "grad_norm": 0.29324260354042053, + "learning_rate": 0.00013616026711185311, + "loss": 0.4468, + "step": 962 + }, + { + "epoch": 0.00033802191017304667, + "grad_norm": 0.3439052700996399, + "learning_rate": 0.000136093489148581, + "loss": 0.5196, + "step": 963 + }, + { + "epoch": 0.00033837291942556284, + "grad_norm": 0.3536570370197296, + "learning_rate": 0.00013602671118530886, + "loss": 0.5251, + "step": 964 + }, + { + "epoch": 0.00033872392867807897, + "grad_norm": 0.4759911298751831, + "learning_rate": 0.00013595993322203673, + "loss": 0.7017, + "step": 965 + }, + { + "epoch": 0.00033907493793059514, + "grad_norm": 0.2958674728870392, + "learning_rate": 0.0001358931552587646, + "loss": 0.4936, + "step": 966 + }, + { + "epoch": 0.00033942594718311126, + "grad_norm": 0.32770562171936035, + "learning_rate": 0.0001358263772954925, + "loss": 0.5741, + "step": 967 + }, + { + "epoch": 0.0003397769564356274, + "grad_norm": 0.35697153210639954, + "learning_rate": 0.00013575959933222038, + "loss": 0.428, + "step": 968 + }, + { + "epoch": 0.00034012796568814356, + "grad_norm": 0.3409043252468109, + "learning_rate": 0.00013569282136894825, + "loss": 0.6142, + "step": 969 + }, + { + "epoch": 0.0003404789749406597, + "grad_norm": 0.47055551409721375, + "learning_rate": 0.00013562604340567613, + "loss": 0.463, + "step": 970 + }, + { + "epoch": 0.00034082998419317586, + "grad_norm": 0.38270413875579834, + "learning_rate": 0.000135559265442404, + "loss": 0.462, + "step": 971 + }, + { + "epoch": 0.000341180993445692, + "grad_norm": 0.26209867000579834, + "learning_rate": 0.0001354924874791319, + "loss": 0.5341, + "step": 972 + }, + { + "epoch": 0.0003415320026982081, + "grad_norm": 0.37498748302459717, + "learning_rate": 0.00013542570951585977, + "loss": 0.5196, + "step": 973 + }, + { + "epoch": 0.0003418830119507243, + "grad_norm": 0.36789608001708984, + "learning_rate": 0.00013535893155258765, + "loss": 0.4723, + "step": 974 + }, + { + "epoch": 0.0003422340212032404, + "grad_norm": 0.33915975689888, + "learning_rate": 0.00013529215358931552, + "loss": 0.5511, + "step": 975 + }, + { + "epoch": 0.0003425850304557566, + "grad_norm": 0.43045058846473694, + "learning_rate": 0.0001352253756260434, + "loss": 0.5667, + "step": 976 + }, + { + "epoch": 0.0003429360397082727, + "grad_norm": 0.2948949933052063, + "learning_rate": 0.0001351585976627713, + "loss": 0.4804, + "step": 977 + }, + { + "epoch": 0.00034328704896078883, + "grad_norm": 0.3249470889568329, + "learning_rate": 0.00013509181969949917, + "loss": 0.6041, + "step": 978 + }, + { + "epoch": 0.000343638058213305, + "grad_norm": 0.2865908741950989, + "learning_rate": 0.00013502504173622707, + "loss": 0.5617, + "step": 979 + }, + { + "epoch": 0.0003439890674658211, + "grad_norm": 0.3190818428993225, + "learning_rate": 0.00013495826377295494, + "loss": 0.4902, + "step": 980 + }, + { + "epoch": 0.00034434007671833725, + "grad_norm": 0.3111664950847626, + "learning_rate": 0.0001348914858096828, + "loss": 0.5504, + "step": 981 + }, + { + "epoch": 0.0003446910859708534, + "grad_norm": 0.3255857229232788, + "learning_rate": 0.00013482470784641069, + "loss": 0.5592, + "step": 982 + }, + { + "epoch": 0.00034504209522336955, + "grad_norm": 0.30806589126586914, + "learning_rate": 0.00013475792988313859, + "loss": 0.5567, + "step": 983 + }, + { + "epoch": 0.0003453931044758857, + "grad_norm": 0.33785945177078247, + "learning_rate": 0.00013469115191986646, + "loss": 0.5881, + "step": 984 + }, + { + "epoch": 0.00034574411372840185, + "grad_norm": 0.34626781940460205, + "learning_rate": 0.00013462437395659433, + "loss": 0.578, + "step": 985 + }, + { + "epoch": 0.00034609512298091797, + "grad_norm": 0.367034912109375, + "learning_rate": 0.0001345575959933222, + "loss": 0.5893, + "step": 986 + }, + { + "epoch": 0.00034644613223343415, + "grad_norm": 0.37824952602386475, + "learning_rate": 0.00013449081803005008, + "loss": 0.5681, + "step": 987 + }, + { + "epoch": 0.00034679714148595027, + "grad_norm": 0.4054035544395447, + "learning_rate": 0.00013442404006677798, + "loss": 0.6108, + "step": 988 + }, + { + "epoch": 0.00034714815073846645, + "grad_norm": 0.4374067485332489, + "learning_rate": 0.00013435726210350585, + "loss": 0.6002, + "step": 989 + }, + { + "epoch": 0.00034749915999098257, + "grad_norm": 0.3554278016090393, + "learning_rate": 0.00013429048414023373, + "loss": 0.6444, + "step": 990 + }, + { + "epoch": 0.0003478501692434987, + "grad_norm": 0.3428646922111511, + "learning_rate": 0.0001342237061769616, + "loss": 0.6527, + "step": 991 + }, + { + "epoch": 0.00034820117849601487, + "grad_norm": 0.25603657960891724, + "learning_rate": 0.00013415692821368947, + "loss": 0.5244, + "step": 992 + }, + { + "epoch": 0.000348552187748531, + "grad_norm": 0.35237595438957214, + "learning_rate": 0.00013409015025041737, + "loss": 0.557, + "step": 993 + }, + { + "epoch": 0.00034890319700104717, + "grad_norm": 0.33666110038757324, + "learning_rate": 0.00013402337228714524, + "loss": 0.5674, + "step": 994 + }, + { + "epoch": 0.0003492542062535633, + "grad_norm": 0.30283182859420776, + "learning_rate": 0.00013395659432387312, + "loss": 0.6081, + "step": 995 + }, + { + "epoch": 0.0003496052155060794, + "grad_norm": 0.30893146991729736, + "learning_rate": 0.00013388981636060102, + "loss": 0.6089, + "step": 996 + }, + { + "epoch": 0.0003499562247585956, + "grad_norm": 0.2617473304271698, + "learning_rate": 0.0001338230383973289, + "loss": 0.6104, + "step": 997 + }, + { + "epoch": 0.0003503072340111117, + "grad_norm": 0.29493093490600586, + "learning_rate": 0.00013375626043405676, + "loss": 0.5047, + "step": 998 + }, + { + "epoch": 0.0003506582432636279, + "grad_norm": 0.3991663157939911, + "learning_rate": 0.00013368948247078466, + "loss": 0.5137, + "step": 999 + }, + { + "epoch": 0.000351009252516144, + "grad_norm": 0.31760329008102417, + "learning_rate": 0.00013362270450751254, + "loss": 0.4371, + "step": 1000 + }, + { + "epoch": 0.00035136026176866013, + "grad_norm": 0.35144907236099243, + "learning_rate": 0.0001335559265442404, + "loss": 0.5085, + "step": 1001 + }, + { + "epoch": 0.0003517112710211763, + "grad_norm": 0.3597724735736847, + "learning_rate": 0.00013348914858096828, + "loss": 0.593, + "step": 1002 + }, + { + "epoch": 0.00035206228027369243, + "grad_norm": 0.33647072315216064, + "learning_rate": 0.00013342237061769616, + "loss": 0.6011, + "step": 1003 + }, + { + "epoch": 0.0003524132895262086, + "grad_norm": 0.3377489745616913, + "learning_rate": 0.00013335559265442406, + "loss": 0.6285, + "step": 1004 + }, + { + "epoch": 0.00035276429877872473, + "grad_norm": 0.3210775852203369, + "learning_rate": 0.00013328881469115193, + "loss": 0.5214, + "step": 1005 + }, + { + "epoch": 0.00035311530803124085, + "grad_norm": 0.33832573890686035, + "learning_rate": 0.0001332220367278798, + "loss": 0.5788, + "step": 1006 + }, + { + "epoch": 0.00035346631728375703, + "grad_norm": 0.3025464117527008, + "learning_rate": 0.00013315525876460768, + "loss": 0.3762, + "step": 1007 + }, + { + "epoch": 0.00035381732653627315, + "grad_norm": 0.33917921781539917, + "learning_rate": 0.00013308848080133555, + "loss": 0.5816, + "step": 1008 + }, + { + "epoch": 0.0003541683357887893, + "grad_norm": 0.3070494830608368, + "learning_rate": 0.00013302170283806345, + "loss": 0.522, + "step": 1009 + }, + { + "epoch": 0.00035451934504130545, + "grad_norm": 0.31389573216438293, + "learning_rate": 0.00013295492487479132, + "loss": 0.5966, + "step": 1010 + }, + { + "epoch": 0.00035487035429382157, + "grad_norm": 0.33663564920425415, + "learning_rate": 0.0001328881469115192, + "loss": 0.5857, + "step": 1011 + }, + { + "epoch": 0.00035522136354633775, + "grad_norm": 0.3280203640460968, + "learning_rate": 0.00013282136894824707, + "loss": 0.562, + "step": 1012 + }, + { + "epoch": 0.00035557237279885387, + "grad_norm": 0.3307760953903198, + "learning_rate": 0.00013275459098497497, + "loss": 0.6258, + "step": 1013 + }, + { + "epoch": 0.00035592338205137005, + "grad_norm": 0.34378358721733093, + "learning_rate": 0.00013268781302170284, + "loss": 0.5026, + "step": 1014 + }, + { + "epoch": 0.00035627439130388617, + "grad_norm": 0.32818603515625, + "learning_rate": 0.00013262103505843074, + "loss": 0.513, + "step": 1015 + }, + { + "epoch": 0.0003566254005564023, + "grad_norm": 0.3015523850917816, + "learning_rate": 0.00013255425709515862, + "loss": 0.5448, + "step": 1016 + }, + { + "epoch": 0.00035697640980891847, + "grad_norm": 0.2927173674106598, + "learning_rate": 0.0001324874791318865, + "loss": 0.6565, + "step": 1017 + }, + { + "epoch": 0.0003573274190614346, + "grad_norm": 0.3502102196216583, + "learning_rate": 0.00013242070116861436, + "loss": 0.6235, + "step": 1018 + }, + { + "epoch": 0.00035767842831395077, + "grad_norm": 0.32151371240615845, + "learning_rate": 0.00013235392320534224, + "loss": 0.5613, + "step": 1019 + }, + { + "epoch": 0.0003580294375664669, + "grad_norm": 0.31253233551979065, + "learning_rate": 0.00013228714524207014, + "loss": 0.4744, + "step": 1020 + }, + { + "epoch": 0.000358380446818983, + "grad_norm": 0.2831304669380188, + "learning_rate": 0.000132220367278798, + "loss": 0.5385, + "step": 1021 + }, + { + "epoch": 0.0003587314560714992, + "grad_norm": 0.32526761293411255, + "learning_rate": 0.00013215358931552588, + "loss": 0.6316, + "step": 1022 + }, + { + "epoch": 0.0003590824653240153, + "grad_norm": 0.3305005729198456, + "learning_rate": 0.00013208681135225376, + "loss": 0.5287, + "step": 1023 + }, + { + "epoch": 0.0003594334745765315, + "grad_norm": 0.29515331983566284, + "learning_rate": 0.00013202003338898163, + "loss": 0.5478, + "step": 1024 + }, + { + "epoch": 0.0003597844838290476, + "grad_norm": 0.32527396082878113, + "learning_rate": 0.00013195325542570953, + "loss": 0.6309, + "step": 1025 + }, + { + "epoch": 0.00036013549308156373, + "grad_norm": 0.3407800793647766, + "learning_rate": 0.0001318864774624374, + "loss": 0.5958, + "step": 1026 + }, + { + "epoch": 0.0003604865023340799, + "grad_norm": 0.40766170620918274, + "learning_rate": 0.00013181969949916528, + "loss": 0.5281, + "step": 1027 + }, + { + "epoch": 0.00036083751158659603, + "grad_norm": 0.3853365480899811, + "learning_rate": 0.00013175292153589315, + "loss": 0.6349, + "step": 1028 + }, + { + "epoch": 0.0003611885208391122, + "grad_norm": 0.2854768633842468, + "learning_rate": 0.00013168614357262102, + "loss": 0.4515, + "step": 1029 + }, + { + "epoch": 0.00036153953009162833, + "grad_norm": 0.3713400065898895, + "learning_rate": 0.00013161936560934892, + "loss": 0.5256, + "step": 1030 + }, + { + "epoch": 0.00036189053934414445, + "grad_norm": 0.3738803565502167, + "learning_rate": 0.0001315525876460768, + "loss": 0.647, + "step": 1031 + }, + { + "epoch": 0.00036224154859666063, + "grad_norm": 0.3904534578323364, + "learning_rate": 0.0001314858096828047, + "loss": 0.6047, + "step": 1032 + }, + { + "epoch": 0.00036259255784917675, + "grad_norm": 0.3647315204143524, + "learning_rate": 0.00013141903171953257, + "loss": 0.5027, + "step": 1033 + }, + { + "epoch": 0.00036294356710169293, + "grad_norm": 0.3410654366016388, + "learning_rate": 0.00013135225375626044, + "loss": 0.6187, + "step": 1034 + }, + { + "epoch": 0.00036329457635420905, + "grad_norm": 0.3227837383747101, + "learning_rate": 0.00013128547579298832, + "loss": 0.4749, + "step": 1035 + }, + { + "epoch": 0.00036364558560672517, + "grad_norm": 0.2792038917541504, + "learning_rate": 0.00013121869782971622, + "loss": 0.4981, + "step": 1036 + }, + { + "epoch": 0.00036399659485924135, + "grad_norm": 0.339101642370224, + "learning_rate": 0.0001311519198664441, + "loss": 0.5875, + "step": 1037 + }, + { + "epoch": 0.00036434760411175747, + "grad_norm": 0.369004487991333, + "learning_rate": 0.00013108514190317196, + "loss": 0.4854, + "step": 1038 + }, + { + "epoch": 0.00036469861336427365, + "grad_norm": 0.39061155915260315, + "learning_rate": 0.00013101836393989983, + "loss": 0.5887, + "step": 1039 + }, + { + "epoch": 0.00036504962261678977, + "grad_norm": 0.3913773000240326, + "learning_rate": 0.0001309515859766277, + "loss": 0.5388, + "step": 1040 + }, + { + "epoch": 0.0003654006318693059, + "grad_norm": 0.27972474694252014, + "learning_rate": 0.0001308848080133556, + "loss": 0.3841, + "step": 1041 + }, + { + "epoch": 0.00036575164112182207, + "grad_norm": 0.3185168504714966, + "learning_rate": 0.00013081803005008348, + "loss": 0.4955, + "step": 1042 + }, + { + "epoch": 0.0003661026503743382, + "grad_norm": 0.6088166236877441, + "learning_rate": 0.00013075125208681135, + "loss": 0.5242, + "step": 1043 + }, + { + "epoch": 0.00036645365962685437, + "grad_norm": 0.4608970582485199, + "learning_rate": 0.00013068447412353923, + "loss": 0.5375, + "step": 1044 + }, + { + "epoch": 0.0003668046688793705, + "grad_norm": 0.38970229029655457, + "learning_rate": 0.0001306176961602671, + "loss": 0.5227, + "step": 1045 + }, + { + "epoch": 0.0003671556781318866, + "grad_norm": 0.3537042438983917, + "learning_rate": 0.00013055091819699497, + "loss": 0.5022, + "step": 1046 + }, + { + "epoch": 0.0003675066873844028, + "grad_norm": 0.3243977725505829, + "learning_rate": 0.00013048414023372287, + "loss": 0.4638, + "step": 1047 + }, + { + "epoch": 0.0003678576966369189, + "grad_norm": 0.5033393502235413, + "learning_rate": 0.00013041736227045075, + "loss": 0.6124, + "step": 1048 + }, + { + "epoch": 0.0003682087058894351, + "grad_norm": 0.3304978907108307, + "learning_rate": 0.00013035058430717865, + "loss": 0.5645, + "step": 1049 + }, + { + "epoch": 0.0003685597151419512, + "grad_norm": 0.36042529344558716, + "learning_rate": 0.00013028380634390652, + "loss": 0.4484, + "step": 1050 + }, + { + "epoch": 0.00036891072439446733, + "grad_norm": 0.4284050166606903, + "learning_rate": 0.0001302170283806344, + "loss": 0.6074, + "step": 1051 + }, + { + "epoch": 0.0003692617336469835, + "grad_norm": 0.28319039940834045, + "learning_rate": 0.0001301502504173623, + "loss": 0.563, + "step": 1052 + }, + { + "epoch": 0.00036961274289949963, + "grad_norm": 0.35593390464782715, + "learning_rate": 0.00013008347245409017, + "loss": 0.5548, + "step": 1053 + }, + { + "epoch": 0.0003699637521520158, + "grad_norm": 0.3092995285987854, + "learning_rate": 0.00013001669449081804, + "loss": 0.5512, + "step": 1054 + }, + { + "epoch": 0.00037031476140453193, + "grad_norm": 0.39928558468818665, + "learning_rate": 0.00012994991652754591, + "loss": 0.5828, + "step": 1055 + }, + { + "epoch": 0.00037066577065704805, + "grad_norm": 0.3541167974472046, + "learning_rate": 0.0001298831385642738, + "loss": 0.5943, + "step": 1056 + }, + { + "epoch": 0.00037101677990956423, + "grad_norm": 0.3520177900791168, + "learning_rate": 0.0001298163606010017, + "loss": 0.5629, + "step": 1057 + }, + { + "epoch": 0.00037136778916208035, + "grad_norm": 0.26769620180130005, + "learning_rate": 0.00012974958263772956, + "loss": 0.4686, + "step": 1058 + }, + { + "epoch": 0.00037171879841459653, + "grad_norm": 0.4143349528312683, + "learning_rate": 0.00012968280467445743, + "loss": 0.5898, + "step": 1059 + }, + { + "epoch": 0.00037206980766711265, + "grad_norm": 0.29856693744659424, + "learning_rate": 0.0001296160267111853, + "loss": 0.5795, + "step": 1060 + }, + { + "epoch": 0.0003724208169196288, + "grad_norm": 0.3835422396659851, + "learning_rate": 0.00012954924874791318, + "loss": 0.657, + "step": 1061 + }, + { + "epoch": 0.00037277182617214495, + "grad_norm": 0.3311139941215515, + "learning_rate": 0.00012948247078464108, + "loss": 0.5206, + "step": 1062 + }, + { + "epoch": 0.0003731228354246611, + "grad_norm": 0.38118553161621094, + "learning_rate": 0.00012941569282136895, + "loss": 0.6101, + "step": 1063 + }, + { + "epoch": 0.00037347384467717725, + "grad_norm": 0.3357555568218231, + "learning_rate": 0.00012934891485809683, + "loss": 0.4583, + "step": 1064 + }, + { + "epoch": 0.00037382485392969337, + "grad_norm": 0.3239798843860626, + "learning_rate": 0.0001292821368948247, + "loss": 0.5717, + "step": 1065 + }, + { + "epoch": 0.0003741758631822095, + "grad_norm": 0.31502071022987366, + "learning_rate": 0.0001292153589315526, + "loss": 0.5528, + "step": 1066 + }, + { + "epoch": 0.00037452687243472567, + "grad_norm": 0.35177144408226013, + "learning_rate": 0.00012914858096828047, + "loss": 0.5404, + "step": 1067 + }, + { + "epoch": 0.0003748778816872418, + "grad_norm": 0.3457860052585602, + "learning_rate": 0.00012908180300500837, + "loss": 0.5311, + "step": 1068 + }, + { + "epoch": 0.00037522889093975797, + "grad_norm": 0.31016480922698975, + "learning_rate": 0.00012901502504173625, + "loss": 0.521, + "step": 1069 + }, + { + "epoch": 0.0003755799001922741, + "grad_norm": 0.2800024151802063, + "learning_rate": 0.00012894824707846412, + "loss": 0.4831, + "step": 1070 + }, + { + "epoch": 0.0003759309094447902, + "grad_norm": 0.3560345470905304, + "learning_rate": 0.000128881469115192, + "loss": 0.4771, + "step": 1071 + }, + { + "epoch": 0.0003762819186973064, + "grad_norm": 0.28846535086631775, + "learning_rate": 0.00012881469115191987, + "loss": 0.4444, + "step": 1072 + }, + { + "epoch": 0.0003766329279498225, + "grad_norm": 0.29720595479011536, + "learning_rate": 0.00012874791318864777, + "loss": 0.5048, + "step": 1073 + }, + { + "epoch": 0.0003769839372023387, + "grad_norm": 0.40147536993026733, + "learning_rate": 0.00012868113522537564, + "loss": 0.5521, + "step": 1074 + }, + { + "epoch": 0.0003773349464548548, + "grad_norm": 0.36368894577026367, + "learning_rate": 0.0001286143572621035, + "loss": 0.5211, + "step": 1075 + }, + { + "epoch": 0.00037768595570737094, + "grad_norm": 0.34239786863327026, + "learning_rate": 0.00012854757929883139, + "loss": 0.4327, + "step": 1076 + }, + { + "epoch": 0.0003780369649598871, + "grad_norm": 0.3420031666755676, + "learning_rate": 0.00012848080133555926, + "loss": 0.5377, + "step": 1077 + }, + { + "epoch": 0.00037838797421240323, + "grad_norm": 0.32050299644470215, + "learning_rate": 0.00012841402337228716, + "loss": 0.6428, + "step": 1078 + }, + { + "epoch": 0.0003787389834649194, + "grad_norm": 0.31478747725486755, + "learning_rate": 0.00012834724540901503, + "loss": 0.4042, + "step": 1079 + }, + { + "epoch": 0.00037908999271743553, + "grad_norm": 0.4019688367843628, + "learning_rate": 0.0001282804674457429, + "loss": 0.5806, + "step": 1080 + }, + { + "epoch": 0.00037944100196995166, + "grad_norm": 0.3169090151786804, + "learning_rate": 0.00012821368948247078, + "loss": 0.6143, + "step": 1081 + }, + { + "epoch": 0.00037979201122246783, + "grad_norm": 0.3160766363143921, + "learning_rate": 0.00012814691151919865, + "loss": 0.4358, + "step": 1082 + }, + { + "epoch": 0.00038014302047498395, + "grad_norm": 0.30607977509498596, + "learning_rate": 0.00012808013355592655, + "loss": 0.611, + "step": 1083 + }, + { + "epoch": 0.00038049402972750013, + "grad_norm": 0.3392901122570038, + "learning_rate": 0.00012801335559265442, + "loss": 0.4677, + "step": 1084 + }, + { + "epoch": 0.00038084503898001625, + "grad_norm": 0.3608296513557434, + "learning_rate": 0.00012794657762938233, + "loss": 0.4681, + "step": 1085 + }, + { + "epoch": 0.0003811960482325324, + "grad_norm": 0.35469377040863037, + "learning_rate": 0.0001278797996661102, + "loss": 0.5122, + "step": 1086 + }, + { + "epoch": 0.00038154705748504855, + "grad_norm": 0.42851918935775757, + "learning_rate": 0.00012781302170283807, + "loss": 0.511, + "step": 1087 + }, + { + "epoch": 0.0003818980667375647, + "grad_norm": 0.31718799471855164, + "learning_rate": 0.00012774624373956594, + "loss": 0.5504, + "step": 1088 + }, + { + "epoch": 0.00038224907599008085, + "grad_norm": 0.31201183795928955, + "learning_rate": 0.00012767946577629384, + "loss": 0.5846, + "step": 1089 + }, + { + "epoch": 0.000382600085242597, + "grad_norm": 0.44880107045173645, + "learning_rate": 0.00012761268781302172, + "loss": 0.6351, + "step": 1090 + }, + { + "epoch": 0.0003829510944951131, + "grad_norm": 0.3685932755470276, + "learning_rate": 0.0001275459098497496, + "loss": 0.4946, + "step": 1091 + }, + { + "epoch": 0.00038330210374762927, + "grad_norm": 0.38342320919036865, + "learning_rate": 0.00012747913188647746, + "loss": 0.4357, + "step": 1092 + }, + { + "epoch": 0.0003836531130001454, + "grad_norm": 0.2710161805152893, + "learning_rate": 0.00012741235392320534, + "loss": 0.4635, + "step": 1093 + }, + { + "epoch": 0.00038400412225266157, + "grad_norm": 0.3405950963497162, + "learning_rate": 0.00012734557595993324, + "loss": 0.4272, + "step": 1094 + }, + { + "epoch": 0.0003843551315051777, + "grad_norm": 0.3414493203163147, + "learning_rate": 0.0001272787979966611, + "loss": 0.5387, + "step": 1095 + }, + { + "epoch": 0.0003847061407576938, + "grad_norm": 0.30659371614456177, + "learning_rate": 0.00012721202003338898, + "loss": 0.451, + "step": 1096 + }, + { + "epoch": 0.00038505715001021, + "grad_norm": 0.33229631185531616, + "learning_rate": 0.00012714524207011686, + "loss": 0.6062, + "step": 1097 + }, + { + "epoch": 0.0003854081592627261, + "grad_norm": 0.29991772770881653, + "learning_rate": 0.00012707846410684473, + "loss": 0.5812, + "step": 1098 + }, + { + "epoch": 0.0003857591685152423, + "grad_norm": 0.2937552332878113, + "learning_rate": 0.0001270116861435726, + "loss": 0.4762, + "step": 1099 + }, + { + "epoch": 0.0003861101777677584, + "grad_norm": 0.3993151783943176, + "learning_rate": 0.0001269449081803005, + "loss": 0.5288, + "step": 1100 + }, + { + "epoch": 0.00038646118702027454, + "grad_norm": 0.34012341499328613, + "learning_rate": 0.00012687813021702838, + "loss": 0.5858, + "step": 1101 + }, + { + "epoch": 0.0003868121962727907, + "grad_norm": 0.31721460819244385, + "learning_rate": 0.00012681135225375628, + "loss": 0.4543, + "step": 1102 + }, + { + "epoch": 0.00038716320552530684, + "grad_norm": 0.404480904340744, + "learning_rate": 0.00012674457429048415, + "loss": 0.6425, + "step": 1103 + }, + { + "epoch": 0.000387514214777823, + "grad_norm": 0.2888083755970001, + "learning_rate": 0.00012667779632721202, + "loss": 0.5737, + "step": 1104 + }, + { + "epoch": 0.00038786522403033913, + "grad_norm": 0.316724568605423, + "learning_rate": 0.00012661101836393992, + "loss": 0.4774, + "step": 1105 + }, + { + "epoch": 0.00038821623328285526, + "grad_norm": 0.34277236461639404, + "learning_rate": 0.0001265442404006678, + "loss": 0.5722, + "step": 1106 + }, + { + "epoch": 0.00038856724253537143, + "grad_norm": 0.3688976764678955, + "learning_rate": 0.00012647746243739567, + "loss": 0.478, + "step": 1107 + }, + { + "epoch": 0.00038891825178788756, + "grad_norm": 0.30905240774154663, + "learning_rate": 0.00012641068447412354, + "loss": 0.5578, + "step": 1108 + }, + { + "epoch": 0.00038926926104040373, + "grad_norm": 0.31679004430770874, + "learning_rate": 0.00012634390651085142, + "loss": 0.5564, + "step": 1109 + }, + { + "epoch": 0.00038962027029291985, + "grad_norm": 0.31234732270240784, + "learning_rate": 0.00012627712854757932, + "loss": 0.5403, + "step": 1110 + }, + { + "epoch": 0.000389971279545436, + "grad_norm": 0.2693454921245575, + "learning_rate": 0.0001262103505843072, + "loss": 0.577, + "step": 1111 + }, + { + "epoch": 0.00039032228879795215, + "grad_norm": 0.36127611994743347, + "learning_rate": 0.00012614357262103506, + "loss": 0.5558, + "step": 1112 + }, + { + "epoch": 0.0003906732980504683, + "grad_norm": 0.3124391436576843, + "learning_rate": 0.00012607679465776294, + "loss": 0.5198, + "step": 1113 + }, + { + "epoch": 0.00039102430730298445, + "grad_norm": 0.339495986700058, + "learning_rate": 0.0001260100166944908, + "loss": 0.4415, + "step": 1114 + }, + { + "epoch": 0.0003913753165555006, + "grad_norm": 0.3561634421348572, + "learning_rate": 0.00012594323873121868, + "loss": 0.5413, + "step": 1115 + }, + { + "epoch": 0.0003917263258080167, + "grad_norm": 0.30160975456237793, + "learning_rate": 0.00012587646076794658, + "loss": 0.5754, + "step": 1116 + }, + { + "epoch": 0.0003920773350605329, + "grad_norm": 0.583508312702179, + "learning_rate": 0.00012580968280467446, + "loss": 0.5645, + "step": 1117 + }, + { + "epoch": 0.000392428344313049, + "grad_norm": 0.3197818100452423, + "learning_rate": 0.00012574290484140233, + "loss": 0.5326, + "step": 1118 + }, + { + "epoch": 0.0003927793535655652, + "grad_norm": 0.3258291482925415, + "learning_rate": 0.00012567612687813023, + "loss": 0.5504, + "step": 1119 + }, + { + "epoch": 0.0003931303628180813, + "grad_norm": 0.2790183424949646, + "learning_rate": 0.0001256093489148581, + "loss": 0.4691, + "step": 1120 + }, + { + "epoch": 0.0003934813720705974, + "grad_norm": 0.4802376627922058, + "learning_rate": 0.000125542570951586, + "loss": 0.5689, + "step": 1121 + }, + { + "epoch": 0.0003938323813231136, + "grad_norm": 0.42296934127807617, + "learning_rate": 0.00012547579298831388, + "loss": 0.5082, + "step": 1122 + }, + { + "epoch": 0.0003941833905756297, + "grad_norm": 0.4018993377685547, + "learning_rate": 0.00012540901502504175, + "loss": 0.5967, + "step": 1123 + }, + { + "epoch": 0.0003945343998281459, + "grad_norm": 0.2756693661212921, + "learning_rate": 0.00012534223706176962, + "loss": 0.5071, + "step": 1124 + }, + { + "epoch": 0.000394885409080662, + "grad_norm": 0.28827816247940063, + "learning_rate": 0.0001252754590984975, + "loss": 0.446, + "step": 1125 + }, + { + "epoch": 0.00039523641833317814, + "grad_norm": 0.33188387751579285, + "learning_rate": 0.0001252086811352254, + "loss": 0.59, + "step": 1126 + }, + { + "epoch": 0.0003955874275856943, + "grad_norm": 0.3057992458343506, + "learning_rate": 0.00012514190317195327, + "loss": 0.4665, + "step": 1127 + }, + { + "epoch": 0.00039593843683821044, + "grad_norm": 0.423970103263855, + "learning_rate": 0.00012507512520868114, + "loss": 0.5603, + "step": 1128 + }, + { + "epoch": 0.0003962894460907266, + "grad_norm": 0.4346948266029358, + "learning_rate": 0.00012500834724540902, + "loss": 0.7188, + "step": 1129 + }, + { + "epoch": 0.00039664045534324274, + "grad_norm": 0.3196350932121277, + "learning_rate": 0.0001249415692821369, + "loss": 0.499, + "step": 1130 + }, + { + "epoch": 0.00039699146459575886, + "grad_norm": 0.32787612080574036, + "learning_rate": 0.00012487479131886476, + "loss": 0.562, + "step": 1131 + }, + { + "epoch": 0.00039734247384827504, + "grad_norm": 0.3701760768890381, + "learning_rate": 0.00012480801335559266, + "loss": 0.5906, + "step": 1132 + }, + { + "epoch": 0.00039769348310079116, + "grad_norm": 0.2836174964904785, + "learning_rate": 0.00012474123539232053, + "loss": 0.5241, + "step": 1133 + }, + { + "epoch": 0.00039804449235330733, + "grad_norm": 0.3123319745063782, + "learning_rate": 0.0001246744574290484, + "loss": 0.5591, + "step": 1134 + }, + { + "epoch": 0.00039839550160582346, + "grad_norm": 0.2965394854545593, + "learning_rate": 0.0001246076794657763, + "loss": 0.5522, + "step": 1135 + }, + { + "epoch": 0.0003987465108583396, + "grad_norm": 0.3452530801296234, + "learning_rate": 0.00012454090150250418, + "loss": 0.5572, + "step": 1136 + }, + { + "epoch": 0.00039909752011085576, + "grad_norm": 0.3368155062198639, + "learning_rate": 0.00012447412353923208, + "loss": 0.4947, + "step": 1137 + }, + { + "epoch": 0.0003994485293633719, + "grad_norm": 0.31308281421661377, + "learning_rate": 0.00012440734557595995, + "loss": 0.5395, + "step": 1138 + }, + { + "epoch": 0.00039979953861588805, + "grad_norm": 0.36880385875701904, + "learning_rate": 0.00012434056761268783, + "loss": 0.5449, + "step": 1139 + }, + { + "epoch": 0.0004001505478684042, + "grad_norm": 0.3276751935482025, + "learning_rate": 0.0001242737896494157, + "loss": 0.5714, + "step": 1140 + }, + { + "epoch": 0.0004005015571209203, + "grad_norm": 0.34474796056747437, + "learning_rate": 0.00012420701168614357, + "loss": 0.5579, + "step": 1141 + }, + { + "epoch": 0.0004008525663734365, + "grad_norm": 0.3203624188899994, + "learning_rate": 0.00012414023372287147, + "loss": 0.5848, + "step": 1142 + }, + { + "epoch": 0.0004012035756259526, + "grad_norm": 0.33093470335006714, + "learning_rate": 0.00012407345575959935, + "loss": 0.5515, + "step": 1143 + }, + { + "epoch": 0.0004015545848784688, + "grad_norm": 0.2994841933250427, + "learning_rate": 0.00012400667779632722, + "loss": 0.4696, + "step": 1144 + }, + { + "epoch": 0.0004019055941309849, + "grad_norm": 0.43979793787002563, + "learning_rate": 0.0001239398998330551, + "loss": 0.5531, + "step": 1145 + }, + { + "epoch": 0.000402256603383501, + "grad_norm": 0.33747658133506775, + "learning_rate": 0.00012387312186978297, + "loss": 0.5442, + "step": 1146 + }, + { + "epoch": 0.0004026076126360172, + "grad_norm": 0.3129333257675171, + "learning_rate": 0.00012380634390651084, + "loss": 0.5812, + "step": 1147 + }, + { + "epoch": 0.0004029586218885333, + "grad_norm": 0.27842286229133606, + "learning_rate": 0.00012373956594323874, + "loss": 0.5571, + "step": 1148 + }, + { + "epoch": 0.0004033096311410495, + "grad_norm": 0.30332496762275696, + "learning_rate": 0.00012367278797996661, + "loss": 0.5264, + "step": 1149 + }, + { + "epoch": 0.0004036606403935656, + "grad_norm": 0.41959401965141296, + "learning_rate": 0.0001236060100166945, + "loss": 0.6208, + "step": 1150 + }, + { + "epoch": 0.00040401164964608174, + "grad_norm": 0.2994483411312103, + "learning_rate": 0.00012353923205342236, + "loss": 0.5311, + "step": 1151 + }, + { + "epoch": 0.0004043626588985979, + "grad_norm": 0.28562021255493164, + "learning_rate": 0.00012347245409015026, + "loss": 0.4664, + "step": 1152 + }, + { + "epoch": 0.00040471366815111404, + "grad_norm": 0.3773499131202698, + "learning_rate": 0.00012340567612687813, + "loss": 0.6372, + "step": 1153 + }, + { + "epoch": 0.0004050646774036302, + "grad_norm": 0.3149654269218445, + "learning_rate": 0.00012333889816360603, + "loss": 0.5295, + "step": 1154 + }, + { + "epoch": 0.00040541568665614634, + "grad_norm": 0.345595121383667, + "learning_rate": 0.0001232721202003339, + "loss": 0.5568, + "step": 1155 + }, + { + "epoch": 0.00040576669590866246, + "grad_norm": 0.2795856297016144, + "learning_rate": 0.00012320534223706178, + "loss": 0.4909, + "step": 1156 + }, + { + "epoch": 0.00040611770516117864, + "grad_norm": 0.37467122077941895, + "learning_rate": 0.00012313856427378965, + "loss": 0.5733, + "step": 1157 + }, + { + "epoch": 0.00040646871441369476, + "grad_norm": 0.33086350560188293, + "learning_rate": 0.00012307178631051755, + "loss": 0.5371, + "step": 1158 + }, + { + "epoch": 0.00040681972366621094, + "grad_norm": 0.3587074279785156, + "learning_rate": 0.00012300500834724543, + "loss": 0.5555, + "step": 1159 + }, + { + "epoch": 0.00040717073291872706, + "grad_norm": 0.35360291600227356, + "learning_rate": 0.0001229382303839733, + "loss": 0.5686, + "step": 1160 + }, + { + "epoch": 0.0004075217421712432, + "grad_norm": 0.32877933979034424, + "learning_rate": 0.00012287145242070117, + "loss": 0.6232, + "step": 1161 + }, + { + "epoch": 0.00040787275142375936, + "grad_norm": 0.3402215540409088, + "learning_rate": 0.00012280467445742905, + "loss": 0.5923, + "step": 1162 + }, + { + "epoch": 0.0004082237606762755, + "grad_norm": 0.3712671399116516, + "learning_rate": 0.00012273789649415692, + "loss": 0.4405, + "step": 1163 + }, + { + "epoch": 0.00040857476992879166, + "grad_norm": 0.34966424107551575, + "learning_rate": 0.00012267111853088482, + "loss": 0.5987, + "step": 1164 + }, + { + "epoch": 0.0004089257791813078, + "grad_norm": 0.8779903650283813, + "learning_rate": 0.0001226043405676127, + "loss": 0.5677, + "step": 1165 + }, + { + "epoch": 0.0004092767884338239, + "grad_norm": 0.30721041560173035, + "learning_rate": 0.00012253756260434057, + "loss": 0.4803, + "step": 1166 + }, + { + "epoch": 0.0004096277976863401, + "grad_norm": 0.3509838879108429, + "learning_rate": 0.00012247078464106844, + "loss": 0.4216, + "step": 1167 + }, + { + "epoch": 0.0004099788069388562, + "grad_norm": 0.2961578071117401, + "learning_rate": 0.0001224040066777963, + "loss": 0.5599, + "step": 1168 + }, + { + "epoch": 0.0004103298161913724, + "grad_norm": 0.28842684626579285, + "learning_rate": 0.0001223372287145242, + "loss": 0.5023, + "step": 1169 + }, + { + "epoch": 0.0004106808254438885, + "grad_norm": 0.3395219147205353, + "learning_rate": 0.00012227045075125209, + "loss": 0.6371, + "step": 1170 + }, + { + "epoch": 0.0004110318346964046, + "grad_norm": 0.2860247492790222, + "learning_rate": 0.00012220367278797999, + "loss": 0.3881, + "step": 1171 + }, + { + "epoch": 0.0004113828439489208, + "grad_norm": 0.5463435053825378, + "learning_rate": 0.00012213689482470786, + "loss": 0.5751, + "step": 1172 + }, + { + "epoch": 0.0004117338532014369, + "grad_norm": 0.30383020639419556, + "learning_rate": 0.00012207011686143572, + "loss": 0.4892, + "step": 1173 + }, + { + "epoch": 0.0004120848624539531, + "grad_norm": 0.6111129522323608, + "learning_rate": 0.00012200333889816362, + "loss": 0.6786, + "step": 1174 + }, + { + "epoch": 0.0004124358717064692, + "grad_norm": 0.32131698727607727, + "learning_rate": 0.00012193656093489149, + "loss": 0.6301, + "step": 1175 + }, + { + "epoch": 0.00041278688095898534, + "grad_norm": 0.3574715256690979, + "learning_rate": 0.00012186978297161938, + "loss": 0.5705, + "step": 1176 + }, + { + "epoch": 0.0004131378902115015, + "grad_norm": 0.46258190274238586, + "learning_rate": 0.00012180300500834725, + "loss": 0.54, + "step": 1177 + }, + { + "epoch": 0.00041348889946401764, + "grad_norm": 0.385326623916626, + "learning_rate": 0.00012173622704507512, + "loss": 0.5792, + "step": 1178 + }, + { + "epoch": 0.0004138399087165338, + "grad_norm": 0.3880153000354767, + "learning_rate": 0.00012166944908180303, + "loss": 0.5396, + "step": 1179 + }, + { + "epoch": 0.00041419091796904994, + "grad_norm": 0.32916024327278137, + "learning_rate": 0.0001216026711185309, + "loss": 0.5632, + "step": 1180 + }, + { + "epoch": 0.00041454192722156606, + "grad_norm": 0.30234548449516296, + "learning_rate": 0.00012153589315525877, + "loss": 0.5162, + "step": 1181 + }, + { + "epoch": 0.00041489293647408224, + "grad_norm": 0.3654727339744568, + "learning_rate": 0.00012146911519198664, + "loss": 0.6333, + "step": 1182 + }, + { + "epoch": 0.00041524394572659836, + "grad_norm": 0.3166685700416565, + "learning_rate": 0.00012140233722871452, + "loss": 0.5276, + "step": 1183 + }, + { + "epoch": 0.00041559495497911454, + "grad_norm": 0.3722357153892517, + "learning_rate": 0.0001213355592654424, + "loss": 0.5771, + "step": 1184 + }, + { + "epoch": 0.00041594596423163066, + "grad_norm": 0.3407818377017975, + "learning_rate": 0.00012126878130217029, + "loss": 0.5998, + "step": 1185 + }, + { + "epoch": 0.0004162969734841468, + "grad_norm": 0.28665193915367126, + "learning_rate": 0.00012120200333889818, + "loss": 0.5457, + "step": 1186 + }, + { + "epoch": 0.00041664798273666296, + "grad_norm": 0.3052026629447937, + "learning_rate": 0.00012113522537562605, + "loss": 0.5204, + "step": 1187 + }, + { + "epoch": 0.0004169989919891791, + "grad_norm": 0.286080002784729, + "learning_rate": 0.00012106844741235392, + "loss": 0.4346, + "step": 1188 + }, + { + "epoch": 0.00041735000124169526, + "grad_norm": 0.306473970413208, + "learning_rate": 0.0001210016694490818, + "loss": 0.5544, + "step": 1189 + }, + { + "epoch": 0.0004177010104942114, + "grad_norm": 0.3347833454608917, + "learning_rate": 0.0001209348914858097, + "loss": 0.4619, + "step": 1190 + }, + { + "epoch": 0.0004180520197467275, + "grad_norm": 0.28040143847465515, + "learning_rate": 0.00012086811352253757, + "loss": 0.5492, + "step": 1191 + }, + { + "epoch": 0.0004184030289992437, + "grad_norm": 0.2940806448459625, + "learning_rate": 0.00012080133555926544, + "loss": 0.5653, + "step": 1192 + }, + { + "epoch": 0.0004187540382517598, + "grad_norm": 0.37384578585624695, + "learning_rate": 0.00012073455759599333, + "loss": 0.4931, + "step": 1193 + }, + { + "epoch": 0.000419105047504276, + "grad_norm": 0.28816068172454834, + "learning_rate": 0.0001206677796327212, + "loss": 0.5292, + "step": 1194 + }, + { + "epoch": 0.0004194560567567921, + "grad_norm": 0.31325826048851013, + "learning_rate": 0.0001206010016694491, + "loss": 0.5288, + "step": 1195 + }, + { + "epoch": 0.0004198070660093082, + "grad_norm": 0.30658552050590515, + "learning_rate": 0.00012053422370617698, + "loss": 0.5854, + "step": 1196 + }, + { + "epoch": 0.0004201580752618244, + "grad_norm": 0.341240257024765, + "learning_rate": 0.00012046744574290485, + "loss": 0.5358, + "step": 1197 + }, + { + "epoch": 0.0004205090845143405, + "grad_norm": 0.3595687747001648, + "learning_rate": 0.00012040066777963272, + "loss": 0.5944, + "step": 1198 + }, + { + "epoch": 0.00042086009376685664, + "grad_norm": 0.3249213397502899, + "learning_rate": 0.0001203338898163606, + "loss": 0.4873, + "step": 1199 + }, + { + "epoch": 0.0004212111030193728, + "grad_norm": 0.37282127141952515, + "learning_rate": 0.00012026711185308848, + "loss": 0.5173, + "step": 1200 + }, + { + "epoch": 0.00042156211227188894, + "grad_norm": 0.325110524892807, + "learning_rate": 0.00012020033388981637, + "loss": 0.4819, + "step": 1201 + }, + { + "epoch": 0.0004219131215244051, + "grad_norm": 0.313388466835022, + "learning_rate": 0.00012013355592654426, + "loss": 0.5613, + "step": 1202 + }, + { + "epoch": 0.00042226413077692124, + "grad_norm": 0.38384371995925903, + "learning_rate": 0.00012006677796327213, + "loss": 0.5711, + "step": 1203 + }, + { + "epoch": 0.00042261514002943736, + "grad_norm": 0.3431423008441925, + "learning_rate": 0.00012, + "loss": 0.5593, + "step": 1204 + }, + { + "epoch": 0.00042296614928195354, + "grad_norm": 0.3032066822052002, + "learning_rate": 0.00011993322203672788, + "loss": 0.559, + "step": 1205 + }, + { + "epoch": 0.00042331715853446966, + "grad_norm": 0.30639907717704773, + "learning_rate": 0.00011986644407345578, + "loss": 0.5727, + "step": 1206 + }, + { + "epoch": 0.00042366816778698584, + "grad_norm": 0.2970695197582245, + "learning_rate": 0.00011979966611018365, + "loss": 0.5933, + "step": 1207 + }, + { + "epoch": 0.00042401917703950196, + "grad_norm": 0.3868466317653656, + "learning_rate": 0.00011973288814691152, + "loss": 0.5779, + "step": 1208 + }, + { + "epoch": 0.0004243701862920181, + "grad_norm": 0.29085230827331543, + "learning_rate": 0.0001196661101836394, + "loss": 0.6558, + "step": 1209 + }, + { + "epoch": 0.00042472119554453426, + "grad_norm": 0.33766743540763855, + "learning_rate": 0.00011959933222036728, + "loss": 0.5809, + "step": 1210 + }, + { + "epoch": 0.0004250722047970504, + "grad_norm": 0.6739090085029602, + "learning_rate": 0.00011953255425709517, + "loss": 0.6085, + "step": 1211 + }, + { + "epoch": 0.00042542321404956656, + "grad_norm": 0.35693222284317017, + "learning_rate": 0.00011946577629382306, + "loss": 0.5855, + "step": 1212 + }, + { + "epoch": 0.0004257742233020827, + "grad_norm": 0.3087833523750305, + "learning_rate": 0.00011939899833055093, + "loss": 0.6379, + "step": 1213 + }, + { + "epoch": 0.0004261252325545988, + "grad_norm": 0.3548837900161743, + "learning_rate": 0.0001193322203672788, + "loss": 0.5303, + "step": 1214 + }, + { + "epoch": 0.000426476241807115, + "grad_norm": 0.46040648221969604, + "learning_rate": 0.00011926544240400668, + "loss": 0.5171, + "step": 1215 + }, + { + "epoch": 0.0004268272510596311, + "grad_norm": 0.5730584859848022, + "learning_rate": 0.00011919866444073455, + "loss": 0.615, + "step": 1216 + }, + { + "epoch": 0.0004271782603121473, + "grad_norm": 0.34618711471557617, + "learning_rate": 0.00011913188647746245, + "loss": 0.5605, + "step": 1217 + }, + { + "epoch": 0.0004275292695646634, + "grad_norm": 0.3499528169631958, + "learning_rate": 0.00011906510851419032, + "loss": 0.5184, + "step": 1218 + }, + { + "epoch": 0.0004278802788171795, + "grad_norm": 0.33638936281204224, + "learning_rate": 0.00011899833055091821, + "loss": 0.6276, + "step": 1219 + }, + { + "epoch": 0.0004282312880696957, + "grad_norm": 0.34646880626678467, + "learning_rate": 0.00011893155258764608, + "loss": 0.5737, + "step": 1220 + }, + { + "epoch": 0.0004285822973222118, + "grad_norm": 0.2783110439777374, + "learning_rate": 0.00011886477462437396, + "loss": 0.4424, + "step": 1221 + }, + { + "epoch": 0.000428933306574728, + "grad_norm": 0.33892807364463806, + "learning_rate": 0.00011879799666110186, + "loss": 0.5656, + "step": 1222 + }, + { + "epoch": 0.0004292843158272441, + "grad_norm": 0.2782565653324127, + "learning_rate": 0.00011873121869782973, + "loss": 0.5504, + "step": 1223 + }, + { + "epoch": 0.00042963532507976025, + "grad_norm": 0.3684981167316437, + "learning_rate": 0.0001186644407345576, + "loss": 0.5532, + "step": 1224 + }, + { + "epoch": 0.0004299863343322764, + "grad_norm": 0.4034316837787628, + "learning_rate": 0.00011859766277128547, + "loss": 0.5417, + "step": 1225 + }, + { + "epoch": 0.00043033734358479254, + "grad_norm": 0.5182071924209595, + "learning_rate": 0.00011853088480801335, + "loss": 0.6118, + "step": 1226 + }, + { + "epoch": 0.0004306883528373087, + "grad_norm": 0.3137674033641815, + "learning_rate": 0.00011846410684474125, + "loss": 0.6485, + "step": 1227 + }, + { + "epoch": 0.00043103936208982484, + "grad_norm": 0.4069771468639374, + "learning_rate": 0.00011839732888146912, + "loss": 0.5452, + "step": 1228 + }, + { + "epoch": 0.00043139037134234097, + "grad_norm": 0.5212397575378418, + "learning_rate": 0.00011833055091819701, + "loss": 0.5212, + "step": 1229 + }, + { + "epoch": 0.00043174138059485714, + "grad_norm": 0.3622184693813324, + "learning_rate": 0.00011826377295492488, + "loss": 0.4333, + "step": 1230 + }, + { + "epoch": 0.00043209238984737326, + "grad_norm": 0.335044801235199, + "learning_rate": 0.00011819699499165275, + "loss": 0.5606, + "step": 1231 + }, + { + "epoch": 0.00043244339909988944, + "grad_norm": 0.31680893898010254, + "learning_rate": 0.00011813021702838063, + "loss": 0.4988, + "step": 1232 + }, + { + "epoch": 0.00043279440835240556, + "grad_norm": 0.5272301435470581, + "learning_rate": 0.00011806343906510853, + "loss": 0.6024, + "step": 1233 + }, + { + "epoch": 0.0004331454176049217, + "grad_norm": 0.3663223385810852, + "learning_rate": 0.0001179966611018364, + "loss": 0.5964, + "step": 1234 + }, + { + "epoch": 0.00043349642685743786, + "grad_norm": 0.35138314962387085, + "learning_rate": 0.00011792988313856427, + "loss": 0.5908, + "step": 1235 + }, + { + "epoch": 0.000433847436109954, + "grad_norm": 0.3744595944881439, + "learning_rate": 0.00011786310517529216, + "loss": 0.551, + "step": 1236 + }, + { + "epoch": 0.00043419844536247016, + "grad_norm": 0.31489259004592896, + "learning_rate": 0.00011779632721202003, + "loss": 0.6431, + "step": 1237 + }, + { + "epoch": 0.0004345494546149863, + "grad_norm": 0.3356812298297882, + "learning_rate": 0.00011772954924874793, + "loss": 0.4507, + "step": 1238 + }, + { + "epoch": 0.0004349004638675024, + "grad_norm": 0.3018808364868164, + "learning_rate": 0.00011766277128547581, + "loss": 0.4796, + "step": 1239 + }, + { + "epoch": 0.0004352514731200186, + "grad_norm": 0.3201460540294647, + "learning_rate": 0.00011759599332220368, + "loss": 0.4768, + "step": 1240 + }, + { + "epoch": 0.0004356024823725347, + "grad_norm": 0.3269093334674835, + "learning_rate": 0.00011752921535893155, + "loss": 0.5419, + "step": 1241 + }, + { + "epoch": 0.0004359534916250509, + "grad_norm": 0.28690990805625916, + "learning_rate": 0.00011746243739565943, + "loss": 0.5088, + "step": 1242 + }, + { + "epoch": 0.000436304500877567, + "grad_norm": 0.32765012979507446, + "learning_rate": 0.00011739565943238733, + "loss": 0.4953, + "step": 1243 + }, + { + "epoch": 0.0004366555101300831, + "grad_norm": 0.28830674290657043, + "learning_rate": 0.0001173288814691152, + "loss": 0.5179, + "step": 1244 + }, + { + "epoch": 0.0004370065193825993, + "grad_norm": 0.37793827056884766, + "learning_rate": 0.00011726210350584307, + "loss": 0.5951, + "step": 1245 + }, + { + "epoch": 0.0004373575286351154, + "grad_norm": 0.37173348665237427, + "learning_rate": 0.00011719532554257096, + "loss": 0.6059, + "step": 1246 + }, + { + "epoch": 0.0004377085378876316, + "grad_norm": 0.5363826155662537, + "learning_rate": 0.00011712854757929883, + "loss": 0.5183, + "step": 1247 + }, + { + "epoch": 0.0004380595471401477, + "grad_norm": 0.31671205163002014, + "learning_rate": 0.0001170617696160267, + "loss": 0.5711, + "step": 1248 + }, + { + "epoch": 0.00043841055639266385, + "grad_norm": 0.3112623989582062, + "learning_rate": 0.0001169949916527546, + "loss": 0.5647, + "step": 1249 + }, + { + "epoch": 0.00043876156564518, + "grad_norm": 0.3153972923755646, + "learning_rate": 0.00011692821368948248, + "loss": 0.4939, + "step": 1250 + }, + { + "epoch": 0.00043911257489769615, + "grad_norm": 0.29940372705459595, + "learning_rate": 0.00011686143572621035, + "loss": 0.5509, + "step": 1251 + }, + { + "epoch": 0.0004394635841502123, + "grad_norm": 0.42540279030799866, + "learning_rate": 0.00011679465776293823, + "loss": 0.4104, + "step": 1252 + }, + { + "epoch": 0.00043981459340272844, + "grad_norm": 0.3222522437572479, + "learning_rate": 0.00011672787979966611, + "loss": 0.6237, + "step": 1253 + }, + { + "epoch": 0.00044016560265524457, + "grad_norm": 0.34896525740623474, + "learning_rate": 0.000116661101836394, + "loss": 0.5162, + "step": 1254 + }, + { + "epoch": 0.00044051661190776074, + "grad_norm": 0.29780149459838867, + "learning_rate": 0.00011659432387312189, + "loss": 0.5805, + "step": 1255 + }, + { + "epoch": 0.00044086762116027687, + "grad_norm": 0.3533996343612671, + "learning_rate": 0.00011652754590984976, + "loss": 0.5749, + "step": 1256 + }, + { + "epoch": 0.00044121863041279304, + "grad_norm": 0.30867093801498413, + "learning_rate": 0.00011646076794657763, + "loss": 0.479, + "step": 1257 + }, + { + "epoch": 0.00044156963966530917, + "grad_norm": 0.31176280975341797, + "learning_rate": 0.0001163939899833055, + "loss": 0.5007, + "step": 1258 + }, + { + "epoch": 0.0004419206489178253, + "grad_norm": 0.3480489253997803, + "learning_rate": 0.0001163272120200334, + "loss": 0.5595, + "step": 1259 + }, + { + "epoch": 0.00044227165817034146, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.00011626043405676128, + "loss": 0.5042, + "step": 1260 + }, + { + "epoch": 0.0004426226674228576, + "grad_norm": 0.3167501986026764, + "learning_rate": 0.00011619365609348915, + "loss": 0.5335, + "step": 1261 + }, + { + "epoch": 0.00044297367667537376, + "grad_norm": 0.31276339292526245, + "learning_rate": 0.00011612687813021703, + "loss": 0.5594, + "step": 1262 + }, + { + "epoch": 0.0004433246859278899, + "grad_norm": 0.42910438776016235, + "learning_rate": 0.00011606010016694491, + "loss": 0.4659, + "step": 1263 + }, + { + "epoch": 0.000443675695180406, + "grad_norm": 0.3169635534286499, + "learning_rate": 0.00011599332220367279, + "loss": 0.5463, + "step": 1264 + }, + { + "epoch": 0.0004440267044329222, + "grad_norm": 0.3419555425643921, + "learning_rate": 0.00011592654424040069, + "loss": 0.5091, + "step": 1265 + }, + { + "epoch": 0.0004443777136854383, + "grad_norm": 0.31462714076042175, + "learning_rate": 0.00011585976627712856, + "loss": 0.6233, + "step": 1266 + }, + { + "epoch": 0.0004447287229379545, + "grad_norm": 0.36186134815216064, + "learning_rate": 0.00011579298831385643, + "loss": 0.5634, + "step": 1267 + }, + { + "epoch": 0.0004450797321904706, + "grad_norm": 0.385903000831604, + "learning_rate": 0.0001157262103505843, + "loss": 0.5892, + "step": 1268 + }, + { + "epoch": 0.00044543074144298673, + "grad_norm": 0.28669610619544983, + "learning_rate": 0.00011565943238731218, + "loss": 0.4746, + "step": 1269 + }, + { + "epoch": 0.0004457817506955029, + "grad_norm": 0.37557515501976013, + "learning_rate": 0.00011559265442404008, + "loss": 0.5946, + "step": 1270 + }, + { + "epoch": 0.00044613275994801903, + "grad_norm": 0.30455920100212097, + "learning_rate": 0.00011552587646076795, + "loss": 0.4064, + "step": 1271 + }, + { + "epoch": 0.0004464837692005352, + "grad_norm": 0.36547228693962097, + "learning_rate": 0.00011545909849749584, + "loss": 0.4354, + "step": 1272 + }, + { + "epoch": 0.0004468347784530513, + "grad_norm": 0.3912973999977112, + "learning_rate": 0.00011539232053422371, + "loss": 0.544, + "step": 1273 + }, + { + "epoch": 0.00044718578770556745, + "grad_norm": 0.2993258237838745, + "learning_rate": 0.00011532554257095158, + "loss": 0.4623, + "step": 1274 + }, + { + "epoch": 0.0004475367969580836, + "grad_norm": 0.39676982164382935, + "learning_rate": 0.00011525876460767948, + "loss": 0.4735, + "step": 1275 + }, + { + "epoch": 0.00044788780621059975, + "grad_norm": 0.43738967180252075, + "learning_rate": 0.00011519198664440736, + "loss": 0.5639, + "step": 1276 + }, + { + "epoch": 0.0004482388154631159, + "grad_norm": 0.4572802186012268, + "learning_rate": 0.00011512520868113523, + "loss": 0.5043, + "step": 1277 + }, + { + "epoch": 0.00044858982471563205, + "grad_norm": 0.301929771900177, + "learning_rate": 0.0001150584307178631, + "loss": 0.3962, + "step": 1278 + }, + { + "epoch": 0.00044894083396814817, + "grad_norm": 0.42450666427612305, + "learning_rate": 0.00011499165275459098, + "loss": 0.5885, + "step": 1279 + }, + { + "epoch": 0.00044929184322066435, + "grad_norm": 0.3520278036594391, + "learning_rate": 0.00011492487479131886, + "loss": 0.5557, + "step": 1280 + }, + { + "epoch": 0.00044964285247318047, + "grad_norm": 0.32748425006866455, + "learning_rate": 0.00011485809682804675, + "loss": 0.5788, + "step": 1281 + }, + { + "epoch": 0.00044999386172569664, + "grad_norm": 0.3404058516025543, + "learning_rate": 0.00011479131886477464, + "loss": 0.431, + "step": 1282 + }, + { + "epoch": 0.00045034487097821277, + "grad_norm": 0.30703750252723694, + "learning_rate": 0.00011472454090150251, + "loss": 0.5603, + "step": 1283 + }, + { + "epoch": 0.0004506958802307289, + "grad_norm": 0.3476982116699219, + "learning_rate": 0.00011465776293823038, + "loss": 0.4984, + "step": 1284 + }, + { + "epoch": 0.00045104688948324507, + "grad_norm": 0.361433207988739, + "learning_rate": 0.00011459098497495826, + "loss": 0.4012, + "step": 1285 + }, + { + "epoch": 0.0004513978987357612, + "grad_norm": 0.31583985686302185, + "learning_rate": 0.00011452420701168616, + "loss": 0.5115, + "step": 1286 + }, + { + "epoch": 0.00045174890798827736, + "grad_norm": 0.3581843376159668, + "learning_rate": 0.00011445742904841403, + "loss": 0.5795, + "step": 1287 + }, + { + "epoch": 0.0004520999172407935, + "grad_norm": 0.30088526010513306, + "learning_rate": 0.0001143906510851419, + "loss": 0.4995, + "step": 1288 + }, + { + "epoch": 0.0004524509264933096, + "grad_norm": 0.34739211201667786, + "learning_rate": 0.00011432387312186979, + "loss": 0.5513, + "step": 1289 + }, + { + "epoch": 0.0004528019357458258, + "grad_norm": 0.3440413177013397, + "learning_rate": 0.00011425709515859766, + "loss": 0.626, + "step": 1290 + }, + { + "epoch": 0.0004531529449983419, + "grad_norm": 0.34715211391448975, + "learning_rate": 0.00011419031719532556, + "loss": 0.5567, + "step": 1291 + }, + { + "epoch": 0.0004535039542508581, + "grad_norm": 0.3141072690486908, + "learning_rate": 0.00011412353923205344, + "loss": 0.515, + "step": 1292 + }, + { + "epoch": 0.0004538549635033742, + "grad_norm": 0.3693056106567383, + "learning_rate": 0.00011405676126878131, + "loss": 0.6039, + "step": 1293 + }, + { + "epoch": 0.00045420597275589033, + "grad_norm": 0.2877582609653473, + "learning_rate": 0.00011398998330550918, + "loss": 0.627, + "step": 1294 + }, + { + "epoch": 0.0004545569820084065, + "grad_norm": 0.30727502703666687, + "learning_rate": 0.00011392320534223706, + "loss": 0.4439, + "step": 1295 + }, + { + "epoch": 0.00045490799126092263, + "grad_norm": 0.340834379196167, + "learning_rate": 0.00011385642737896493, + "loss": 0.6043, + "step": 1296 + }, + { + "epoch": 0.0004552590005134388, + "grad_norm": 0.37094762921333313, + "learning_rate": 0.00011378964941569283, + "loss": 0.5279, + "step": 1297 + }, + { + "epoch": 0.00045561000976595493, + "grad_norm": 0.352252721786499, + "learning_rate": 0.0001137228714524207, + "loss": 0.4534, + "step": 1298 + }, + { + "epoch": 0.00045596101901847105, + "grad_norm": 0.3592413663864136, + "learning_rate": 0.00011365609348914859, + "loss": 0.6009, + "step": 1299 + }, + { + "epoch": 0.0004563120282709872, + "grad_norm": 0.3028002679347992, + "learning_rate": 0.00011358931552587646, + "loss": 0.5451, + "step": 1300 + }, + { + "epoch": 0.00045666303752350335, + "grad_norm": 0.3545093238353729, + "learning_rate": 0.00011352253756260434, + "loss": 0.6022, + "step": 1301 + }, + { + "epoch": 0.0004570140467760195, + "grad_norm": 0.31239053606987, + "learning_rate": 0.00011345575959933224, + "loss": 0.5893, + "step": 1302 + }, + { + "epoch": 0.00045736505602853565, + "grad_norm": 0.2930079996585846, + "learning_rate": 0.00011338898163606011, + "loss": 0.6469, + "step": 1303 + }, + { + "epoch": 0.00045771606528105177, + "grad_norm": 0.3328670263290405, + "learning_rate": 0.00011332220367278798, + "loss": 0.551, + "step": 1304 + }, + { + "epoch": 0.00045806707453356795, + "grad_norm": 0.2958623766899109, + "learning_rate": 0.00011325542570951586, + "loss": 0.4699, + "step": 1305 + }, + { + "epoch": 0.00045841808378608407, + "grad_norm": 0.26540592312812805, + "learning_rate": 0.00011318864774624374, + "loss": 0.5651, + "step": 1306 + }, + { + "epoch": 0.00045876909303860025, + "grad_norm": 0.30372926592826843, + "learning_rate": 0.00011312186978297163, + "loss": 0.4466, + "step": 1307 + }, + { + "epoch": 0.00045912010229111637, + "grad_norm": 0.32394206523895264, + "learning_rate": 0.00011305509181969952, + "loss": 0.4651, + "step": 1308 + }, + { + "epoch": 0.0004594711115436325, + "grad_norm": 0.2792419493198395, + "learning_rate": 0.00011298831385642739, + "loss": 0.4761, + "step": 1309 + }, + { + "epoch": 0.00045982212079614867, + "grad_norm": 0.26445260643959045, + "learning_rate": 0.00011292153589315526, + "loss": 0.4564, + "step": 1310 + }, + { + "epoch": 0.0004601731300486648, + "grad_norm": 0.3601842224597931, + "learning_rate": 0.00011285475792988314, + "loss": 0.5397, + "step": 1311 + }, + { + "epoch": 0.00046052413930118097, + "grad_norm": 0.3574691712856293, + "learning_rate": 0.00011278797996661104, + "loss": 0.5961, + "step": 1312 + }, + { + "epoch": 0.0004608751485536971, + "grad_norm": 0.3000461161136627, + "learning_rate": 0.00011272120200333891, + "loss": 0.4527, + "step": 1313 + }, + { + "epoch": 0.0004612261578062132, + "grad_norm": 0.34302622079849243, + "learning_rate": 0.00011265442404006678, + "loss": 0.6379, + "step": 1314 + }, + { + "epoch": 0.0004615771670587294, + "grad_norm": 0.3945535123348236, + "learning_rate": 0.00011258764607679465, + "loss": 0.5631, + "step": 1315 + }, + { + "epoch": 0.0004619281763112455, + "grad_norm": 0.4170839786529541, + "learning_rate": 0.00011252086811352254, + "loss": 0.6339, + "step": 1316 + }, + { + "epoch": 0.0004622791855637617, + "grad_norm": 0.36513859033584595, + "learning_rate": 0.00011245409015025041, + "loss": 0.5528, + "step": 1317 + }, + { + "epoch": 0.0004626301948162778, + "grad_norm": 0.45692166686058044, + "learning_rate": 0.00011238731218697832, + "loss": 0.6315, + "step": 1318 + }, + { + "epoch": 0.00046298120406879393, + "grad_norm": 0.3772307336330414, + "learning_rate": 0.00011232053422370619, + "loss": 0.5349, + "step": 1319 + }, + { + "epoch": 0.0004633322133213101, + "grad_norm": 0.3114742636680603, + "learning_rate": 0.00011225375626043406, + "loss": 0.4121, + "step": 1320 + }, + { + "epoch": 0.00046368322257382623, + "grad_norm": 0.3508698344230652, + "learning_rate": 0.00011218697829716193, + "loss": 0.638, + "step": 1321 + }, + { + "epoch": 0.0004640342318263424, + "grad_norm": 0.34588712453842163, + "learning_rate": 0.00011212020033388981, + "loss": 0.4898, + "step": 1322 + }, + { + "epoch": 0.00046438524107885853, + "grad_norm": 0.2846747934818268, + "learning_rate": 0.00011205342237061771, + "loss": 0.5521, + "step": 1323 + }, + { + "epoch": 0.00046473625033137465, + "grad_norm": 0.31673532724380493, + "learning_rate": 0.00011198664440734558, + "loss": 0.4676, + "step": 1324 + }, + { + "epoch": 0.00046508725958389083, + "grad_norm": 0.3159814774990082, + "learning_rate": 0.00011191986644407347, + "loss": 0.508, + "step": 1325 + }, + { + "epoch": 0.00046543826883640695, + "grad_norm": 0.3438906967639923, + "learning_rate": 0.00011185308848080134, + "loss": 0.6521, + "step": 1326 + }, + { + "epoch": 0.00046578927808892313, + "grad_norm": 0.28350135684013367, + "learning_rate": 0.00011178631051752921, + "loss": 0.517, + "step": 1327 + }, + { + "epoch": 0.00046614028734143925, + "grad_norm": 0.3244381844997406, + "learning_rate": 0.00011171953255425711, + "loss": 0.4975, + "step": 1328 + }, + { + "epoch": 0.00046649129659395537, + "grad_norm": 0.32338446378707886, + "learning_rate": 0.00011165275459098499, + "loss": 0.5581, + "step": 1329 + }, + { + "epoch": 0.00046684230584647155, + "grad_norm": 0.3385190963745117, + "learning_rate": 0.00011158597662771286, + "loss": 0.5287, + "step": 1330 + }, + { + "epoch": 0.00046719331509898767, + "grad_norm": 0.30869290232658386, + "learning_rate": 0.00011151919866444073, + "loss": 0.5694, + "step": 1331 + }, + { + "epoch": 0.00046754432435150385, + "grad_norm": 0.39800670742988586, + "learning_rate": 0.00011145242070116862, + "loss": 0.6783, + "step": 1332 + }, + { + "epoch": 0.00046789533360401997, + "grad_norm": 0.3691728413105011, + "learning_rate": 0.0001113856427378965, + "loss": 0.5814, + "step": 1333 + }, + { + "epoch": 0.0004682463428565361, + "grad_norm": 0.34991732239723206, + "learning_rate": 0.0001113188647746244, + "loss": 0.414, + "step": 1334 + }, + { + "epoch": 0.00046859735210905227, + "grad_norm": 0.3095676302909851, + "learning_rate": 0.00011125208681135227, + "loss": 0.5982, + "step": 1335 + }, + { + "epoch": 0.0004689483613615684, + "grad_norm": 0.3367360830307007, + "learning_rate": 0.00011118530884808014, + "loss": 0.5794, + "step": 1336 + }, + { + "epoch": 0.00046929937061408457, + "grad_norm": 0.3058132529258728, + "learning_rate": 0.00011111853088480801, + "loss": 0.5001, + "step": 1337 + }, + { + "epoch": 0.0004696503798666007, + "grad_norm": 0.32190924882888794, + "learning_rate": 0.00011105175292153589, + "loss": 0.6184, + "step": 1338 + }, + { + "epoch": 0.0004700013891191168, + "grad_norm": 0.2544103264808655, + "learning_rate": 0.00011098497495826379, + "loss": 0.5338, + "step": 1339 + }, + { + "epoch": 0.000470352398371633, + "grad_norm": 0.3533720374107361, + "learning_rate": 0.00011091819699499166, + "loss": 0.5817, + "step": 1340 + }, + { + "epoch": 0.0004707034076241491, + "grad_norm": 0.29889243841171265, + "learning_rate": 0.00011085141903171953, + "loss": 0.4836, + "step": 1341 + }, + { + "epoch": 0.0004710544168766653, + "grad_norm": 0.3215756118297577, + "learning_rate": 0.00011078464106844742, + "loss": 0.5438, + "step": 1342 + }, + { + "epoch": 0.0004714054261291814, + "grad_norm": 0.3005795478820801, + "learning_rate": 0.00011071786310517529, + "loss": 0.5341, + "step": 1343 + }, + { + "epoch": 0.00047175643538169753, + "grad_norm": 0.31172803044319153, + "learning_rate": 0.0001106510851419032, + "loss": 0.5517, + "step": 1344 + }, + { + "epoch": 0.0004721074446342137, + "grad_norm": 0.3667462468147278, + "learning_rate": 0.00011058430717863107, + "loss": 0.5487, + "step": 1345 + }, + { + "epoch": 0.00047245845388672983, + "grad_norm": 0.3609708249568939, + "learning_rate": 0.00011051752921535894, + "loss": 0.5514, + "step": 1346 + }, + { + "epoch": 0.000472809463139246, + "grad_norm": 0.36390745639801025, + "learning_rate": 0.00011045075125208681, + "loss": 0.609, + "step": 1347 + }, + { + "epoch": 0.00047316047239176213, + "grad_norm": 0.3918192982673645, + "learning_rate": 0.00011038397328881469, + "loss": 0.5841, + "step": 1348 + }, + { + "epoch": 0.00047351148164427825, + "grad_norm": 0.3789425194263458, + "learning_rate": 0.00011031719532554257, + "loss": 0.5551, + "step": 1349 + }, + { + "epoch": 0.00047386249089679443, + "grad_norm": 0.31591498851776123, + "learning_rate": 0.00011025041736227046, + "loss": 0.5445, + "step": 1350 + }, + { + "epoch": 0.00047421350014931055, + "grad_norm": 0.3711070120334625, + "learning_rate": 0.00011018363939899835, + "loss": 0.6124, + "step": 1351 + }, + { + "epoch": 0.00047456450940182673, + "grad_norm": 0.3442644476890564, + "learning_rate": 0.00011011686143572622, + "loss": 0.5793, + "step": 1352 + }, + { + "epoch": 0.00047491551865434285, + "grad_norm": 0.2866378426551819, + "learning_rate": 0.00011005008347245409, + "loss": 0.5144, + "step": 1353 + }, + { + "epoch": 0.000475266527906859, + "grad_norm": 0.3127586841583252, + "learning_rate": 0.00010998330550918197, + "loss": 0.6036, + "step": 1354 + }, + { + "epoch": 0.00047561753715937515, + "grad_norm": 0.32305601239204407, + "learning_rate": 0.00010991652754590987, + "loss": 0.5215, + "step": 1355 + }, + { + "epoch": 0.00047596854641189127, + "grad_norm": 0.30483660101890564, + "learning_rate": 0.00010984974958263774, + "loss": 0.6094, + "step": 1356 + }, + { + "epoch": 0.00047631955566440745, + "grad_norm": 0.33019503951072693, + "learning_rate": 0.00010978297161936561, + "loss": 0.5646, + "step": 1357 + }, + { + "epoch": 0.00047667056491692357, + "grad_norm": 0.3414929509162903, + "learning_rate": 0.00010971619365609349, + "loss": 0.5262, + "step": 1358 + }, + { + "epoch": 0.0004770215741694397, + "grad_norm": 0.3471517860889435, + "learning_rate": 0.00010964941569282137, + "loss": 0.492, + "step": 1359 + }, + { + "epoch": 0.00047737258342195587, + "grad_norm": 0.3226645588874817, + "learning_rate": 0.00010958263772954926, + "loss": 0.6318, + "step": 1360 + }, + { + "epoch": 0.000477723592674472, + "grad_norm": 0.3425777852535248, + "learning_rate": 0.00010951585976627715, + "loss": 0.5878, + "step": 1361 + }, + { + "epoch": 0.00047807460192698817, + "grad_norm": 0.307462215423584, + "learning_rate": 0.00010944908180300502, + "loss": 0.4948, + "step": 1362 + }, + { + "epoch": 0.0004784256111795043, + "grad_norm": 0.34796106815338135, + "learning_rate": 0.00010938230383973289, + "loss": 0.5525, + "step": 1363 + }, + { + "epoch": 0.0004787766204320204, + "grad_norm": 0.2861281633377075, + "learning_rate": 0.00010931552587646076, + "loss": 0.4578, + "step": 1364 + }, + { + "epoch": 0.0004791276296845366, + "grad_norm": 0.2861836552619934, + "learning_rate": 0.00010924874791318864, + "loss": 0.5761, + "step": 1365 + }, + { + "epoch": 0.0004794786389370527, + "grad_norm": 0.3063654601573944, + "learning_rate": 0.00010918196994991654, + "loss": 0.5338, + "step": 1366 + }, + { + "epoch": 0.0004798296481895689, + "grad_norm": 0.3108372390270233, + "learning_rate": 0.00010911519198664441, + "loss": 0.4896, + "step": 1367 + }, + { + "epoch": 0.000480180657442085, + "grad_norm": 0.3263947069644928, + "learning_rate": 0.0001090484140233723, + "loss": 0.6142, + "step": 1368 + }, + { + "epoch": 0.00048053166669460113, + "grad_norm": 0.27663156390190125, + "learning_rate": 0.00010898163606010017, + "loss": 0.3852, + "step": 1369 + }, + { + "epoch": 0.0004808826759471173, + "grad_norm": 0.2791202962398529, + "learning_rate": 0.00010891485809682804, + "loss": 0.6032, + "step": 1370 + }, + { + "epoch": 0.00048123368519963343, + "grad_norm": 0.2715228199958801, + "learning_rate": 0.00010884808013355594, + "loss": 0.4717, + "step": 1371 + }, + { + "epoch": 0.0004815846944521496, + "grad_norm": 0.3232786953449249, + "learning_rate": 0.00010878130217028382, + "loss": 0.5511, + "step": 1372 + }, + { + "epoch": 0.00048193570370466573, + "grad_norm": 0.42948031425476074, + "learning_rate": 0.00010871452420701169, + "loss": 0.5223, + "step": 1373 + }, + { + "epoch": 0.00048228671295718185, + "grad_norm": 0.31973496079444885, + "learning_rate": 0.00010864774624373956, + "loss": 0.4532, + "step": 1374 + }, + { + "epoch": 0.00048263772220969803, + "grad_norm": 0.3149821162223816, + "learning_rate": 0.00010858096828046744, + "loss": 0.4894, + "step": 1375 + }, + { + "epoch": 0.00048298873146221415, + "grad_norm": 0.30229589343070984, + "learning_rate": 0.00010851419031719534, + "loss": 0.5039, + "step": 1376 + }, + { + "epoch": 0.00048333974071473033, + "grad_norm": 0.36127185821533203, + "learning_rate": 0.00010844741235392321, + "loss": 0.4379, + "step": 1377 + }, + { + "epoch": 0.00048369074996724645, + "grad_norm": 0.3135043978691101, + "learning_rate": 0.0001083806343906511, + "loss": 0.5172, + "step": 1378 + }, + { + "epoch": 0.0004840417592197626, + "grad_norm": 0.33123600482940674, + "learning_rate": 0.00010831385642737897, + "loss": 0.4959, + "step": 1379 + }, + { + "epoch": 0.00048439276847227875, + "grad_norm": 0.32165780663490295, + "learning_rate": 0.00010824707846410684, + "loss": 0.5152, + "step": 1380 + }, + { + "epoch": 0.0004847437777247949, + "grad_norm": 0.28580865263938904, + "learning_rate": 0.00010818030050083472, + "loss": 0.4879, + "step": 1381 + }, + { + "epoch": 0.00048509478697731105, + "grad_norm": 0.4019862711429596, + "learning_rate": 0.00010811352253756262, + "loss": 0.5475, + "step": 1382 + }, + { + "epoch": 0.0004854457962298272, + "grad_norm": 0.34479352831840515, + "learning_rate": 0.00010804674457429049, + "loss": 0.4279, + "step": 1383 + }, + { + "epoch": 0.0004857968054823433, + "grad_norm": 0.3664172887802124, + "learning_rate": 0.00010797996661101836, + "loss": 0.5815, + "step": 1384 + }, + { + "epoch": 0.00048614781473485947, + "grad_norm": 0.34667205810546875, + "learning_rate": 0.00010791318864774625, + "loss": 0.5453, + "step": 1385 + }, + { + "epoch": 0.0004864988239873756, + "grad_norm": 0.36878061294555664, + "learning_rate": 0.00010784641068447412, + "loss": 0.5464, + "step": 1386 + }, + { + "epoch": 0.00048684983323989177, + "grad_norm": 0.3552783727645874, + "learning_rate": 0.00010777963272120202, + "loss": 0.5668, + "step": 1387 + }, + { + "epoch": 0.0004872008424924079, + "grad_norm": 0.35390666127204895, + "learning_rate": 0.0001077128547579299, + "loss": 0.4799, + "step": 1388 + }, + { + "epoch": 0.000487551851744924, + "grad_norm": 0.3539852797985077, + "learning_rate": 0.00010764607679465777, + "loss": 0.6264, + "step": 1389 + }, + { + "epoch": 0.0004879028609974402, + "grad_norm": 0.3104274868965149, + "learning_rate": 0.00010757929883138564, + "loss": 0.4881, + "step": 1390 + }, + { + "epoch": 0.0004882538702499563, + "grad_norm": 0.29643991589546204, + "learning_rate": 0.00010751252086811352, + "loss": 0.5277, + "step": 1391 + }, + { + "epoch": 0.0004886048795024725, + "grad_norm": 0.3498566448688507, + "learning_rate": 0.00010744574290484142, + "loss": 0.4394, + "step": 1392 + }, + { + "epoch": 0.0004889558887549886, + "grad_norm": 0.31261810660362244, + "learning_rate": 0.00010737896494156929, + "loss": 0.4557, + "step": 1393 + }, + { + "epoch": 0.0004893068980075047, + "grad_norm": 0.301792711019516, + "learning_rate": 0.00010731218697829716, + "loss": 0.471, + "step": 1394 + }, + { + "epoch": 0.0004896579072600209, + "grad_norm": 0.34246626496315, + "learning_rate": 0.00010724540901502505, + "loss": 0.5917, + "step": 1395 + }, + { + "epoch": 0.0004900089165125371, + "grad_norm": 0.2901524305343628, + "learning_rate": 0.00010717863105175292, + "loss": 0.441, + "step": 1396 + }, + { + "epoch": 0.0004903599257650532, + "grad_norm": 0.3026966452598572, + "learning_rate": 0.0001071118530884808, + "loss": 0.5373, + "step": 1397 + }, + { + "epoch": 0.0004907109350175693, + "grad_norm": 0.29963356256484985, + "learning_rate": 0.0001070450751252087, + "loss": 0.4464, + "step": 1398 + }, + { + "epoch": 0.0004910619442700855, + "grad_norm": 0.26481980085372925, + "learning_rate": 0.00010697829716193657, + "loss": 0.5372, + "step": 1399 + }, + { + "epoch": 0.0004914129535226016, + "grad_norm": 0.26084020733833313, + "learning_rate": 0.00010691151919866444, + "loss": 0.5523, + "step": 1400 + }, + { + "epoch": 0.0004917639627751178, + "grad_norm": 0.34062638878822327, + "learning_rate": 0.00010684474123539232, + "loss": 0.5466, + "step": 1401 + }, + { + "epoch": 0.0004921149720276339, + "grad_norm": 0.3231668472290039, + "learning_rate": 0.0001067779632721202, + "loss": 0.5019, + "step": 1402 + }, + { + "epoch": 0.00049246598128015, + "grad_norm": 0.3362787961959839, + "learning_rate": 0.00010671118530884809, + "loss": 0.5251, + "step": 1403 + }, + { + "epoch": 0.0004928169905326662, + "grad_norm": 0.28928473591804504, + "learning_rate": 0.00010664440734557598, + "loss": 0.5346, + "step": 1404 + }, + { + "epoch": 0.0004931679997851824, + "grad_norm": 0.32969072461128235, + "learning_rate": 0.00010657762938230385, + "loss": 0.6131, + "step": 1405 + }, + { + "epoch": 0.0004935190090376985, + "grad_norm": 0.29733914136886597, + "learning_rate": 0.00010651085141903172, + "loss": 0.4406, + "step": 1406 + }, + { + "epoch": 0.0004938700182902146, + "grad_norm": 0.36437737941741943, + "learning_rate": 0.0001064440734557596, + "loss": 0.551, + "step": 1407 + }, + { + "epoch": 0.0004942210275427308, + "grad_norm": 0.33889076113700867, + "learning_rate": 0.0001063772954924875, + "loss": 0.5904, + "step": 1408 + }, + { + "epoch": 0.000494572036795247, + "grad_norm": 0.3446680009365082, + "learning_rate": 0.00010631051752921537, + "loss": 0.394, + "step": 1409 + }, + { + "epoch": 0.000494923046047763, + "grad_norm": 0.33298397064208984, + "learning_rate": 0.00010624373956594324, + "loss": 0.5048, + "step": 1410 + }, + { + "epoch": 0.0004952740553002792, + "grad_norm": 0.3153474032878876, + "learning_rate": 0.00010617696160267111, + "loss": 0.5314, + "step": 1411 + }, + { + "epoch": 0.0004956250645527954, + "grad_norm": 0.27105385065078735, + "learning_rate": 0.000106110183639399, + "loss": 0.5098, + "step": 1412 + }, + { + "epoch": 0.0004959760738053114, + "grad_norm": 0.3450585901737213, + "learning_rate": 0.00010604340567612687, + "loss": 0.5249, + "step": 1413 + }, + { + "epoch": 0.0004963270830578276, + "grad_norm": 0.35962969064712524, + "learning_rate": 0.00010597662771285477, + "loss": 0.4714, + "step": 1414 + }, + { + "epoch": 0.0004966780923103438, + "grad_norm": 0.33413732051849365, + "learning_rate": 0.00010590984974958265, + "loss": 0.5618, + "step": 1415 + }, + { + "epoch": 0.00049702910156286, + "grad_norm": 0.37907567620277405, + "learning_rate": 0.00010584307178631052, + "loss": 0.5751, + "step": 1416 + }, + { + "epoch": 0.000497380110815376, + "grad_norm": 0.3324087858200073, + "learning_rate": 0.0001057762938230384, + "loss": 0.5032, + "step": 1417 + }, + { + "epoch": 0.0004977311200678922, + "grad_norm": 0.2794540822505951, + "learning_rate": 0.00010570951585976627, + "loss": 0.4823, + "step": 1418 + }, + { + "epoch": 0.0004980821293204084, + "grad_norm": 0.31896448135375977, + "learning_rate": 0.00010564273789649417, + "loss": 0.5293, + "step": 1419 + }, + { + "epoch": 0.0004984331385729245, + "grad_norm": 0.39455580711364746, + "learning_rate": 0.00010557595993322204, + "loss": 0.6312, + "step": 1420 + }, + { + "epoch": 0.0004987841478254406, + "grad_norm": 0.3108445107936859, + "learning_rate": 0.00010550918196994993, + "loss": 0.4614, + "step": 1421 + }, + { + "epoch": 0.0004991351570779568, + "grad_norm": 0.2984072268009186, + "learning_rate": 0.0001054424040066778, + "loss": 0.5516, + "step": 1422 + }, + { + "epoch": 0.0004994861663304729, + "grad_norm": 0.3056257665157318, + "learning_rate": 0.00010537562604340567, + "loss": 0.5906, + "step": 1423 + }, + { + "epoch": 0.0004998371755829891, + "grad_norm": 0.29374566674232483, + "learning_rate": 0.00010530884808013357, + "loss": 0.599, + "step": 1424 + }, + { + "epoch": 0.0005001881848355052, + "grad_norm": 0.3665946424007416, + "learning_rate": 0.00010524207011686145, + "loss": 0.5599, + "step": 1425 + }, + { + "epoch": 0.0005005391940880214, + "grad_norm": 0.31262800097465515, + "learning_rate": 0.00010517529215358932, + "loss": 0.5566, + "step": 1426 + }, + { + "epoch": 0.0005008902033405375, + "grad_norm": 0.3117959797382355, + "learning_rate": 0.0001051085141903172, + "loss": 0.4372, + "step": 1427 + }, + { + "epoch": 0.0005012412125930537, + "grad_norm": 0.3499256670475006, + "learning_rate": 0.00010504173622704507, + "loss": 0.543, + "step": 1428 + }, + { + "epoch": 0.0005015922218455698, + "grad_norm": 0.3630000948905945, + "learning_rate": 0.00010497495826377295, + "loss": 0.5099, + "step": 1429 + }, + { + "epoch": 0.0005019432310980859, + "grad_norm": 0.3609743118286133, + "learning_rate": 0.00010490818030050084, + "loss": 0.5304, + "step": 1430 + }, + { + "epoch": 0.0005022942403506021, + "grad_norm": 0.3600139617919922, + "learning_rate": 0.00010484140233722873, + "loss": 0.4811, + "step": 1431 + }, + { + "epoch": 0.0005026452496031183, + "grad_norm": 0.30108320713043213, + "learning_rate": 0.0001047746243739566, + "loss": 0.6055, + "step": 1432 + }, + { + "epoch": 0.0005029962588556343, + "grad_norm": 0.34729886054992676, + "learning_rate": 0.00010470784641068447, + "loss": 0.5011, + "step": 1433 + }, + { + "epoch": 0.0005033472681081505, + "grad_norm": 0.33984988927841187, + "learning_rate": 0.00010464106844741235, + "loss": 0.5905, + "step": 1434 + }, + { + "epoch": 0.0005036982773606667, + "grad_norm": 0.3109802007675171, + "learning_rate": 0.00010457429048414025, + "loss": 0.5228, + "step": 1435 + }, + { + "epoch": 0.0005040492866131829, + "grad_norm": 0.37691593170166016, + "learning_rate": 0.00010450751252086812, + "loss": 0.5839, + "step": 1436 + }, + { + "epoch": 0.0005044002958656989, + "grad_norm": 0.3665965497493744, + "learning_rate": 0.00010444073455759599, + "loss": 0.5381, + "step": 1437 + }, + { + "epoch": 0.0005047513051182151, + "grad_norm": 0.29414570331573486, + "learning_rate": 0.00010437395659432388, + "loss": 0.6072, + "step": 1438 + }, + { + "epoch": 0.0005051023143707313, + "grad_norm": 0.3206839859485626, + "learning_rate": 0.00010430717863105175, + "loss": 0.5285, + "step": 1439 + }, + { + "epoch": 0.0005054533236232473, + "grad_norm": 0.3003496527671814, + "learning_rate": 0.00010424040066777965, + "loss": 0.4037, + "step": 1440 + }, + { + "epoch": 0.0005058043328757635, + "grad_norm": 0.2955014109611511, + "learning_rate": 0.00010417362270450753, + "loss": 0.4646, + "step": 1441 + }, + { + "epoch": 0.0005061553421282797, + "grad_norm": 0.3399007022380829, + "learning_rate": 0.0001041068447412354, + "loss": 0.5649, + "step": 1442 + }, + { + "epoch": 0.0005065063513807958, + "grad_norm": 0.3394736349582672, + "learning_rate": 0.00010404006677796327, + "loss": 0.5512, + "step": 1443 + }, + { + "epoch": 0.0005068573606333119, + "grad_norm": 0.31650441884994507, + "learning_rate": 0.00010397328881469115, + "loss": 0.4669, + "step": 1444 + }, + { + "epoch": 0.0005072083698858281, + "grad_norm": 0.3380611538887024, + "learning_rate": 0.00010390651085141905, + "loss": 0.6714, + "step": 1445 + }, + { + "epoch": 0.0005075593791383443, + "grad_norm": 0.29049673676490784, + "learning_rate": 0.00010383973288814692, + "loss": 0.5652, + "step": 1446 + }, + { + "epoch": 0.0005079103883908604, + "grad_norm": 0.37694746255874634, + "learning_rate": 0.0001037729549248748, + "loss": 0.4355, + "step": 1447 + }, + { + "epoch": 0.0005082613976433765, + "grad_norm": 0.36622750759124756, + "learning_rate": 0.00010370617696160268, + "loss": 0.4758, + "step": 1448 + }, + { + "epoch": 0.0005086124068958927, + "grad_norm": 0.3366115093231201, + "learning_rate": 0.00010363939899833055, + "loss": 0.5498, + "step": 1449 + }, + { + "epoch": 0.0005089634161484088, + "grad_norm": 0.2836514711380005, + "learning_rate": 0.00010357262103505843, + "loss": 0.5405, + "step": 1450 + }, + { + "epoch": 0.000509314425400925, + "grad_norm": 0.357666015625, + "learning_rate": 0.00010350584307178633, + "loss": 0.4738, + "step": 1451 + }, + { + "epoch": 0.0005096654346534411, + "grad_norm": 0.37991905212402344, + "learning_rate": 0.0001034390651085142, + "loss": 0.4932, + "step": 1452 + }, + { + "epoch": 0.0005100164439059572, + "grad_norm": 0.2862101197242737, + "learning_rate": 0.00010337228714524207, + "loss": 0.5387, + "step": 1453 + }, + { + "epoch": 0.0005103674531584734, + "grad_norm": 0.3000154197216034, + "learning_rate": 0.00010330550918196994, + "loss": 0.509, + "step": 1454 + }, + { + "epoch": 0.0005107184624109896, + "grad_norm": 0.29454153776168823, + "learning_rate": 0.00010323873121869783, + "loss": 0.3872, + "step": 1455 + }, + { + "epoch": 0.0005110694716635057, + "grad_norm": 0.305803507566452, + "learning_rate": 0.00010317195325542572, + "loss": 0.5, + "step": 1456 + }, + { + "epoch": 0.0005114204809160218, + "grad_norm": 0.3164152204990387, + "learning_rate": 0.0001031051752921536, + "loss": 0.5426, + "step": 1457 + }, + { + "epoch": 0.000511771490168538, + "grad_norm": 0.3026213049888611, + "learning_rate": 0.00010303839732888148, + "loss": 0.5783, + "step": 1458 + }, + { + "epoch": 0.0005121224994210542, + "grad_norm": 0.3170768618583679, + "learning_rate": 0.00010297161936560935, + "loss": 0.5701, + "step": 1459 + }, + { + "epoch": 0.0005124735086735702, + "grad_norm": 0.3275301456451416, + "learning_rate": 0.00010290484140233722, + "loss": 0.4884, + "step": 1460 + }, + { + "epoch": 0.0005128245179260864, + "grad_norm": 0.3446187973022461, + "learning_rate": 0.00010283806343906512, + "loss": 0.4516, + "step": 1461 + }, + { + "epoch": 0.0005131755271786026, + "grad_norm": 0.3188260495662689, + "learning_rate": 0.000102771285475793, + "loss": 0.561, + "step": 1462 + }, + { + "epoch": 0.0005135265364311186, + "grad_norm": 0.3547864258289337, + "learning_rate": 0.00010270450751252087, + "loss": 0.5768, + "step": 1463 + }, + { + "epoch": 0.0005138775456836348, + "grad_norm": 0.3740866482257843, + "learning_rate": 0.00010263772954924876, + "loss": 0.4197, + "step": 1464 + }, + { + "epoch": 0.000514228554936151, + "grad_norm": 0.38915491104125977, + "learning_rate": 0.00010257095158597663, + "loss": 0.553, + "step": 1465 + }, + { + "epoch": 0.0005145795641886672, + "grad_norm": 0.38494518399238586, + "learning_rate": 0.0001025041736227045, + "loss": 0.6247, + "step": 1466 + }, + { + "epoch": 0.0005149305734411832, + "grad_norm": 0.2716946303844452, + "learning_rate": 0.0001024373956594324, + "loss": 0.4426, + "step": 1467 + }, + { + "epoch": 0.0005152815826936994, + "grad_norm": 0.33764415979385376, + "learning_rate": 0.00010237061769616028, + "loss": 0.5939, + "step": 1468 + }, + { + "epoch": 0.0005156325919462156, + "grad_norm": 0.34384095668792725, + "learning_rate": 0.00010230383973288815, + "loss": 0.604, + "step": 1469 + }, + { + "epoch": 0.0005159836011987317, + "grad_norm": 0.3203445076942444, + "learning_rate": 0.00010223706176961602, + "loss": 0.5255, + "step": 1470 + }, + { + "epoch": 0.0005163346104512478, + "grad_norm": 0.2592601180076599, + "learning_rate": 0.0001021702838063439, + "loss": 0.4509, + "step": 1471 + }, + { + "epoch": 0.000516685619703764, + "grad_norm": 0.3425324261188507, + "learning_rate": 0.0001021035058430718, + "loss": 0.5498, + "step": 1472 + }, + { + "epoch": 0.0005170366289562801, + "grad_norm": 0.3077262341976166, + "learning_rate": 0.00010203672787979967, + "loss": 0.5364, + "step": 1473 + }, + { + "epoch": 0.0005173876382087963, + "grad_norm": 0.2831708788871765, + "learning_rate": 0.00010196994991652756, + "loss": 0.434, + "step": 1474 + }, + { + "epoch": 0.0005177386474613124, + "grad_norm": 0.29104581475257874, + "learning_rate": 0.00010190317195325543, + "loss": 0.5875, + "step": 1475 + }, + { + "epoch": 0.0005180896567138286, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0001018363939899833, + "loss": 0.4574, + "step": 1476 + }, + { + "epoch": 0.0005184406659663447, + "grad_norm": 0.41971537470817566, + "learning_rate": 0.0001017696160267112, + "loss": 0.5616, + "step": 1477 + }, + { + "epoch": 0.0005187916752188609, + "grad_norm": 0.3439647853374481, + "learning_rate": 0.00010170283806343908, + "loss": 0.4288, + "step": 1478 + }, + { + "epoch": 0.000519142684471377, + "grad_norm": 0.35867923498153687, + "learning_rate": 0.00010163606010016695, + "loss": 0.4415, + "step": 1479 + }, + { + "epoch": 0.0005194936937238931, + "grad_norm": 0.368987500667572, + "learning_rate": 0.00010156928213689482, + "loss": 0.5474, + "step": 1480 + }, + { + "epoch": 0.0005198447029764093, + "grad_norm": 0.30241629481315613, + "learning_rate": 0.00010150250417362271, + "loss": 0.4113, + "step": 1481 + }, + { + "epoch": 0.0005201957122289255, + "grad_norm": 0.31089895963668823, + "learning_rate": 0.00010143572621035058, + "loss": 0.4726, + "step": 1482 + }, + { + "epoch": 0.0005205467214814415, + "grad_norm": 0.2900741994380951, + "learning_rate": 0.00010136894824707848, + "loss": 0.4591, + "step": 1483 + }, + { + "epoch": 0.0005208977307339577, + "grad_norm": 0.2920607030391693, + "learning_rate": 0.00010130217028380636, + "loss": 0.508, + "step": 1484 + }, + { + "epoch": 0.0005212487399864739, + "grad_norm": 0.5145193338394165, + "learning_rate": 0.00010123539232053423, + "loss": 0.6125, + "step": 1485 + }, + { + "epoch": 0.0005215997492389901, + "grad_norm": 0.3466121554374695, + "learning_rate": 0.0001011686143572621, + "loss": 0.5236, + "step": 1486 + }, + { + "epoch": 0.0005219507584915061, + "grad_norm": 0.2820659577846527, + "learning_rate": 0.00010110183639398998, + "loss": 0.4886, + "step": 1487 + }, + { + "epoch": 0.0005223017677440223, + "grad_norm": 0.31797733902931213, + "learning_rate": 0.00010103505843071788, + "loss": 0.4605, + "step": 1488 + }, + { + "epoch": 0.0005226527769965385, + "grad_norm": 0.3547564148902893, + "learning_rate": 0.00010096828046744575, + "loss": 0.5559, + "step": 1489 + }, + { + "epoch": 0.0005230037862490545, + "grad_norm": 0.3584667146205902, + "learning_rate": 0.00010090150250417362, + "loss": 0.4402, + "step": 1490 + }, + { + "epoch": 0.0005233547955015707, + "grad_norm": 0.3230780065059662, + "learning_rate": 0.00010083472454090151, + "loss": 0.5187, + "step": 1491 + }, + { + "epoch": 0.0005237058047540869, + "grad_norm": 0.3932897448539734, + "learning_rate": 0.00010076794657762938, + "loss": 0.5758, + "step": 1492 + }, + { + "epoch": 0.000524056814006603, + "grad_norm": 0.39378783106803894, + "learning_rate": 0.00010070116861435728, + "loss": 0.5199, + "step": 1493 + }, + { + "epoch": 0.0005244078232591191, + "grad_norm": 0.33147481083869934, + "learning_rate": 0.00010063439065108516, + "loss": 0.4489, + "step": 1494 + }, + { + "epoch": 0.0005247588325116353, + "grad_norm": 0.3706863522529602, + "learning_rate": 0.00010056761268781303, + "loss": 0.4601, + "step": 1495 + }, + { + "epoch": 0.0005251098417641515, + "grad_norm": 0.45806849002838135, + "learning_rate": 0.0001005008347245409, + "loss": 0.4522, + "step": 1496 + }, + { + "epoch": 0.0005254608510166676, + "grad_norm": 0.2931700050830841, + "learning_rate": 0.00010043405676126878, + "loss": 0.3673, + "step": 1497 + }, + { + "epoch": 0.0005258118602691837, + "grad_norm": 0.31791719794273376, + "learning_rate": 0.00010036727879799666, + "loss": 0.497, + "step": 1498 + }, + { + "epoch": 0.0005261628695216999, + "grad_norm": 0.51285719871521, + "learning_rate": 0.00010030050083472455, + "loss": 0.4736, + "step": 1499 + }, + { + "epoch": 0.000526513878774216, + "grad_norm": 0.37526455521583557, + "learning_rate": 0.00010023372287145244, + "loss": 0.5242, + "step": 1500 + }, + { + "epoch": 0.0005268648880267322, + "grad_norm": 0.3161305785179138, + "learning_rate": 0.00010016694490818031, + "loss": 0.5977, + "step": 1501 + }, + { + "epoch": 0.0005272158972792483, + "grad_norm": 0.37831833958625793, + "learning_rate": 0.00010010016694490818, + "loss": 0.4086, + "step": 1502 + }, + { + "epoch": 0.0005275669065317644, + "grad_norm": 0.32192960381507874, + "learning_rate": 0.00010003338898163605, + "loss": 0.5963, + "step": 1503 + }, + { + "epoch": 0.0005279179157842806, + "grad_norm": 0.2514601945877075, + "learning_rate": 9.996661101836394e-05, + "loss": 0.4623, + "step": 1504 + }, + { + "epoch": 0.0005282689250367968, + "grad_norm": 0.2768949270248413, + "learning_rate": 9.989983305509183e-05, + "loss": 0.4104, + "step": 1505 + }, + { + "epoch": 0.0005286199342893129, + "grad_norm": 0.405597448348999, + "learning_rate": 9.98330550918197e-05, + "loss": 0.6432, + "step": 1506 + }, + { + "epoch": 0.000528970943541829, + "grad_norm": 0.36578214168548584, + "learning_rate": 9.976627712854757e-05, + "loss": 0.5242, + "step": 1507 + }, + { + "epoch": 0.0005293219527943452, + "grad_norm": 0.3062324821949005, + "learning_rate": 9.969949916527546e-05, + "loss": 0.5465, + "step": 1508 + }, + { + "epoch": 0.0005296729620468614, + "grad_norm": 0.35657453536987305, + "learning_rate": 9.963272120200335e-05, + "loss": 0.521, + "step": 1509 + }, + { + "epoch": 0.0005300239712993774, + "grad_norm": 0.4276399612426758, + "learning_rate": 9.956594323873122e-05, + "loss": 0.6213, + "step": 1510 + }, + { + "epoch": 0.0005303749805518936, + "grad_norm": 0.2819078862667084, + "learning_rate": 9.949916527545911e-05, + "loss": 0.5481, + "step": 1511 + }, + { + "epoch": 0.0005307259898044098, + "grad_norm": 0.31928518414497375, + "learning_rate": 9.943238731218698e-05, + "loss": 0.6391, + "step": 1512 + }, + { + "epoch": 0.0005310769990569258, + "grad_norm": 0.30502405762672424, + "learning_rate": 9.936560934891487e-05, + "loss": 0.6408, + "step": 1513 + }, + { + "epoch": 0.000531428008309442, + "grad_norm": 0.33620330691337585, + "learning_rate": 9.929883138564274e-05, + "loss": 0.5161, + "step": 1514 + }, + { + "epoch": 0.0005317790175619582, + "grad_norm": 0.34412580728530884, + "learning_rate": 9.923205342237061e-05, + "loss": 0.5942, + "step": 1515 + }, + { + "epoch": 0.0005321300268144744, + "grad_norm": 0.4236386716365814, + "learning_rate": 9.91652754590985e-05, + "loss": 0.6118, + "step": 1516 + }, + { + "epoch": 0.0005324810360669904, + "grad_norm": 0.3482692539691925, + "learning_rate": 9.909849749582639e-05, + "loss": 0.4935, + "step": 1517 + }, + { + "epoch": 0.0005328320453195066, + "grad_norm": 0.36736801266670227, + "learning_rate": 9.903171953255426e-05, + "loss": 0.5077, + "step": 1518 + }, + { + "epoch": 0.0005331830545720228, + "grad_norm": 0.3174130916595459, + "learning_rate": 9.896494156928215e-05, + "loss": 0.5046, + "step": 1519 + }, + { + "epoch": 0.0005335340638245389, + "grad_norm": 0.35202938318252563, + "learning_rate": 9.889816360601002e-05, + "loss": 0.5843, + "step": 1520 + }, + { + "epoch": 0.000533885073077055, + "grad_norm": 0.3530493974685669, + "learning_rate": 9.883138564273791e-05, + "loss": 0.4544, + "step": 1521 + }, + { + "epoch": 0.0005342360823295712, + "grad_norm": 0.36287322640419006, + "learning_rate": 9.876460767946578e-05, + "loss": 0.3369, + "step": 1522 + }, + { + "epoch": 0.0005345870915820873, + "grad_norm": 0.32286468148231506, + "learning_rate": 9.869782971619365e-05, + "loss": 0.4056, + "step": 1523 + }, + { + "epoch": 0.0005349381008346035, + "grad_norm": 0.34090831875801086, + "learning_rate": 9.863105175292154e-05, + "loss": 0.4746, + "step": 1524 + }, + { + "epoch": 0.0005352891100871196, + "grad_norm": 0.35454946756362915, + "learning_rate": 9.856427378964941e-05, + "loss": 0.5432, + "step": 1525 + }, + { + "epoch": 0.0005356401193396357, + "grad_norm": 0.3738378584384918, + "learning_rate": 9.84974958263773e-05, + "loss": 0.5179, + "step": 1526 + }, + { + "epoch": 0.0005359911285921519, + "grad_norm": 0.309709370136261, + "learning_rate": 9.843071786310519e-05, + "loss": 0.5872, + "step": 1527 + }, + { + "epoch": 0.0005363421378446681, + "grad_norm": 0.2821864187717438, + "learning_rate": 9.836393989983306e-05, + "loss": 0.5162, + "step": 1528 + }, + { + "epoch": 0.0005366931470971842, + "grad_norm": 0.46964001655578613, + "learning_rate": 9.829716193656095e-05, + "loss": 0.4713, + "step": 1529 + }, + { + "epoch": 0.0005370441563497003, + "grad_norm": 0.3433643877506256, + "learning_rate": 9.823038397328882e-05, + "loss": 0.516, + "step": 1530 + }, + { + "epoch": 0.0005373951656022165, + "grad_norm": 0.347112774848938, + "learning_rate": 9.816360601001669e-05, + "loss": 0.3772, + "step": 1531 + }, + { + "epoch": 0.0005377461748547327, + "grad_norm": 0.2924909293651581, + "learning_rate": 9.809682804674458e-05, + "loss": 0.5616, + "step": 1532 + }, + { + "epoch": 0.0005380971841072487, + "grad_norm": 0.36090362071990967, + "learning_rate": 9.803005008347245e-05, + "loss": 0.5715, + "step": 1533 + }, + { + "epoch": 0.0005384481933597649, + "grad_norm": 0.31504470109939575, + "learning_rate": 9.796327212020034e-05, + "loss": 0.5311, + "step": 1534 + }, + { + "epoch": 0.0005387992026122811, + "grad_norm": 0.34885862469673157, + "learning_rate": 9.789649415692823e-05, + "loss": 0.4626, + "step": 1535 + }, + { + "epoch": 0.0005391502118647971, + "grad_norm": 0.34042325615882874, + "learning_rate": 9.78297161936561e-05, + "loss": 0.4934, + "step": 1536 + }, + { + "epoch": 0.0005395012211173133, + "grad_norm": 0.39018404483795166, + "learning_rate": 9.776293823038399e-05, + "loss": 0.5814, + "step": 1537 + }, + { + "epoch": 0.0005398522303698295, + "grad_norm": 0.2676241397857666, + "learning_rate": 9.769616026711186e-05, + "loss": 0.438, + "step": 1538 + }, + { + "epoch": 0.0005402032396223457, + "grad_norm": 0.32380932569503784, + "learning_rate": 9.762938230383973e-05, + "loss": 0.6087, + "step": 1539 + }, + { + "epoch": 0.0005405542488748617, + "grad_norm": 0.35949036478996277, + "learning_rate": 9.756260434056762e-05, + "loss": 0.6076, + "step": 1540 + }, + { + "epoch": 0.0005409052581273779, + "grad_norm": 0.29408982396125793, + "learning_rate": 9.749582637729549e-05, + "loss": 0.5134, + "step": 1541 + }, + { + "epoch": 0.0005412562673798941, + "grad_norm": 0.30686628818511963, + "learning_rate": 9.742904841402337e-05, + "loss": 0.5617, + "step": 1542 + }, + { + "epoch": 0.0005416072766324102, + "grad_norm": 0.37297409772872925, + "learning_rate": 9.736227045075125e-05, + "loss": 0.4563, + "step": 1543 + }, + { + "epoch": 0.0005419582858849263, + "grad_norm": 0.3103518486022949, + "learning_rate": 9.729549248747914e-05, + "loss": 0.5235, + "step": 1544 + }, + { + "epoch": 0.0005423092951374425, + "grad_norm": 0.3941648602485657, + "learning_rate": 9.722871452420703e-05, + "loss": 0.6541, + "step": 1545 + }, + { + "epoch": 0.0005426603043899586, + "grad_norm": 0.30755361914634705, + "learning_rate": 9.71619365609349e-05, + "loss": 0.5612, + "step": 1546 + }, + { + "epoch": 0.0005430113136424748, + "grad_norm": 0.35478439927101135, + "learning_rate": 9.709515859766277e-05, + "loss": 0.6078, + "step": 1547 + }, + { + "epoch": 0.0005433623228949909, + "grad_norm": 0.30011776089668274, + "learning_rate": 9.702838063439066e-05, + "loss": 0.3989, + "step": 1548 + }, + { + "epoch": 0.0005437133321475071, + "grad_norm": 0.3524412214756012, + "learning_rate": 9.696160267111853e-05, + "loss": 0.554, + "step": 1549 + }, + { + "epoch": 0.0005440643414000232, + "grad_norm": 0.33379805088043213, + "learning_rate": 9.68948247078464e-05, + "loss": 0.4773, + "step": 1550 + }, + { + "epoch": 0.0005444153506525394, + "grad_norm": 0.3144623339176178, + "learning_rate": 9.682804674457429e-05, + "loss": 0.5397, + "step": 1551 + }, + { + "epoch": 0.0005447663599050555, + "grad_norm": 0.3189099431037903, + "learning_rate": 9.676126878130218e-05, + "loss": 0.5577, + "step": 1552 + }, + { + "epoch": 0.0005451173691575716, + "grad_norm": 0.2930092215538025, + "learning_rate": 9.669449081803006e-05, + "loss": 0.436, + "step": 1553 + }, + { + "epoch": 0.0005454683784100878, + "grad_norm": 0.30305665731430054, + "learning_rate": 9.662771285475794e-05, + "loss": 0.6663, + "step": 1554 + }, + { + "epoch": 0.000545819387662604, + "grad_norm": 0.31724509596824646, + "learning_rate": 9.656093489148581e-05, + "loss": 0.5232, + "step": 1555 + }, + { + "epoch": 0.00054617039691512, + "grad_norm": 0.3048739731311798, + "learning_rate": 9.64941569282137e-05, + "loss": 0.5975, + "step": 1556 + }, + { + "epoch": 0.0005465214061676362, + "grad_norm": 0.313481867313385, + "learning_rate": 9.642737896494157e-05, + "loss": 0.5658, + "step": 1557 + }, + { + "epoch": 0.0005468724154201524, + "grad_norm": 0.3365669548511505, + "learning_rate": 9.636060100166944e-05, + "loss": 0.5356, + "step": 1558 + }, + { + "epoch": 0.0005472234246726686, + "grad_norm": 0.29624179005622864, + "learning_rate": 9.629382303839733e-05, + "loss": 0.5596, + "step": 1559 + }, + { + "epoch": 0.0005475744339251846, + "grad_norm": 0.32584840059280396, + "learning_rate": 9.62270450751252e-05, + "loss": 0.6195, + "step": 1560 + }, + { + "epoch": 0.0005479254431777008, + "grad_norm": 0.3141777217388153, + "learning_rate": 9.616026711185309e-05, + "loss": 0.5428, + "step": 1561 + }, + { + "epoch": 0.000548276452430217, + "grad_norm": 0.49182063341140747, + "learning_rate": 9.609348914858098e-05, + "loss": 0.4425, + "step": 1562 + }, + { + "epoch": 0.000548627461682733, + "grad_norm": 0.3521610200405121, + "learning_rate": 9.602671118530885e-05, + "loss": 0.4566, + "step": 1563 + }, + { + "epoch": 0.0005489784709352492, + "grad_norm": 0.32009604573249817, + "learning_rate": 9.595993322203674e-05, + "loss": 0.4673, + "step": 1564 + }, + { + "epoch": 0.0005493294801877654, + "grad_norm": 0.4251219630241394, + "learning_rate": 9.589315525876461e-05, + "loss": 0.4582, + "step": 1565 + }, + { + "epoch": 0.0005496804894402815, + "grad_norm": 0.4044347107410431, + "learning_rate": 9.582637729549248e-05, + "loss": 0.6094, + "step": 1566 + }, + { + "epoch": 0.0005500314986927976, + "grad_norm": 0.37995630502700806, + "learning_rate": 9.575959933222037e-05, + "loss": 0.5798, + "step": 1567 + }, + { + "epoch": 0.0005503825079453138, + "grad_norm": 0.36014696955680847, + "learning_rate": 9.569282136894824e-05, + "loss": 0.5578, + "step": 1568 + }, + { + "epoch": 0.00055073351719783, + "grad_norm": 0.36085575819015503, + "learning_rate": 9.562604340567613e-05, + "loss": 0.5723, + "step": 1569 + }, + { + "epoch": 0.0005510845264503461, + "grad_norm": 0.34479430317878723, + "learning_rate": 9.555926544240402e-05, + "loss": 0.3701, + "step": 1570 + }, + { + "epoch": 0.0005514355357028622, + "grad_norm": 0.29680463671684265, + "learning_rate": 9.549248747913189e-05, + "loss": 0.443, + "step": 1571 + }, + { + "epoch": 0.0005517865449553784, + "grad_norm": 0.282615065574646, + "learning_rate": 9.542570951585978e-05, + "loss": 0.4408, + "step": 1572 + }, + { + "epoch": 0.0005521375542078945, + "grad_norm": 0.30851373076438904, + "learning_rate": 9.535893155258765e-05, + "loss": 0.5537, + "step": 1573 + }, + { + "epoch": 0.0005524885634604107, + "grad_norm": 0.41260892152786255, + "learning_rate": 9.529215358931554e-05, + "loss": 0.5401, + "step": 1574 + }, + { + "epoch": 0.0005528395727129268, + "grad_norm": 0.31149372458457947, + "learning_rate": 9.522537562604341e-05, + "loss": 0.4033, + "step": 1575 + }, + { + "epoch": 0.0005531905819654429, + "grad_norm": 0.33126652240753174, + "learning_rate": 9.515859766277128e-05, + "loss": 0.5386, + "step": 1576 + }, + { + "epoch": 0.0005535415912179591, + "grad_norm": 0.2965177297592163, + "learning_rate": 9.509181969949917e-05, + "loss": 0.4869, + "step": 1577 + }, + { + "epoch": 0.0005538926004704753, + "grad_norm": 0.28436359763145447, + "learning_rate": 9.502504173622706e-05, + "loss": 0.5632, + "step": 1578 + }, + { + "epoch": 0.0005542436097229914, + "grad_norm": 0.3518412113189697, + "learning_rate": 9.495826377295493e-05, + "loss": 0.4086, + "step": 1579 + }, + { + "epoch": 0.0005545946189755075, + "grad_norm": 0.3295888900756836, + "learning_rate": 9.489148580968282e-05, + "loss": 0.5742, + "step": 1580 + }, + { + "epoch": 0.0005549456282280237, + "grad_norm": 0.3147815763950348, + "learning_rate": 9.482470784641069e-05, + "loss": 0.5332, + "step": 1581 + }, + { + "epoch": 0.0005552966374805399, + "grad_norm": 0.30593639612197876, + "learning_rate": 9.475792988313858e-05, + "loss": 0.5496, + "step": 1582 + }, + { + "epoch": 0.0005556476467330559, + "grad_norm": 0.3162075877189636, + "learning_rate": 9.469115191986645e-05, + "loss": 0.5912, + "step": 1583 + }, + { + "epoch": 0.0005559986559855721, + "grad_norm": 0.32497403025627136, + "learning_rate": 9.462437395659432e-05, + "loss": 0.5494, + "step": 1584 + }, + { + "epoch": 0.0005563496652380883, + "grad_norm": 0.31055036187171936, + "learning_rate": 9.455759599332221e-05, + "loss": 0.6336, + "step": 1585 + }, + { + "epoch": 0.0005567006744906044, + "grad_norm": 0.33537331223487854, + "learning_rate": 9.449081803005008e-05, + "loss": 0.4221, + "step": 1586 + }, + { + "epoch": 0.0005570516837431205, + "grad_norm": 0.3572893440723419, + "learning_rate": 9.442404006677797e-05, + "loss": 0.5553, + "step": 1587 + }, + { + "epoch": 0.0005574026929956367, + "grad_norm": 0.3298802375793457, + "learning_rate": 9.435726210350586e-05, + "loss": 0.5121, + "step": 1588 + }, + { + "epoch": 0.0005577537022481529, + "grad_norm": 0.3529982268810272, + "learning_rate": 9.429048414023373e-05, + "loss": 0.3964, + "step": 1589 + }, + { + "epoch": 0.000558104711500669, + "grad_norm": 0.294223427772522, + "learning_rate": 9.422370617696162e-05, + "loss": 0.4495, + "step": 1590 + }, + { + "epoch": 0.0005584557207531851, + "grad_norm": 0.2953149676322937, + "learning_rate": 9.415692821368949e-05, + "loss": 0.4241, + "step": 1591 + }, + { + "epoch": 0.0005588067300057013, + "grad_norm": 0.31237637996673584, + "learning_rate": 9.409015025041736e-05, + "loss": 0.5025, + "step": 1592 + }, + { + "epoch": 0.0005591577392582174, + "grad_norm": 0.31202566623687744, + "learning_rate": 9.402337228714525e-05, + "loss": 0.5442, + "step": 1593 + }, + { + "epoch": 0.0005595087485107335, + "grad_norm": 0.34976473450660706, + "learning_rate": 9.395659432387312e-05, + "loss": 0.5314, + "step": 1594 + }, + { + "epoch": 0.0005598597577632497, + "grad_norm": 0.3305265009403229, + "learning_rate": 9.388981636060101e-05, + "loss": 0.4842, + "step": 1595 + }, + { + "epoch": 0.0005602107670157658, + "grad_norm": 0.30773475766181946, + "learning_rate": 9.38230383973289e-05, + "loss": 0.4621, + "step": 1596 + }, + { + "epoch": 0.000560561776268282, + "grad_norm": 0.35445886850357056, + "learning_rate": 9.375626043405677e-05, + "loss": 0.509, + "step": 1597 + }, + { + "epoch": 0.0005609127855207981, + "grad_norm": 0.46057018637657166, + "learning_rate": 9.368948247078465e-05, + "loss": 0.5471, + "step": 1598 + }, + { + "epoch": 0.0005612637947733143, + "grad_norm": 0.3413529396057129, + "learning_rate": 9.362270450751253e-05, + "loss": 0.5558, + "step": 1599 + }, + { + "epoch": 0.0005616148040258304, + "grad_norm": 0.36943134665489197, + "learning_rate": 9.35559265442404e-05, + "loss": 0.4718, + "step": 1600 + }, + { + "epoch": 0.0005619658132783466, + "grad_norm": 0.3529636263847351, + "learning_rate": 9.348914858096829e-05, + "loss": 0.4591, + "step": 1601 + }, + { + "epoch": 0.0005623168225308627, + "grad_norm": 0.3375125229358673, + "learning_rate": 9.342237061769616e-05, + "loss": 0.4971, + "step": 1602 + }, + { + "epoch": 0.0005626678317833788, + "grad_norm": 0.3923933804035187, + "learning_rate": 9.335559265442403e-05, + "loss": 0.545, + "step": 1603 + }, + { + "epoch": 0.000563018841035895, + "grad_norm": 0.3128841519355774, + "learning_rate": 9.328881469115192e-05, + "loss": 0.4374, + "step": 1604 + }, + { + "epoch": 0.0005633698502884112, + "grad_norm": 0.3729458153247833, + "learning_rate": 9.322203672787981e-05, + "loss": 0.581, + "step": 1605 + }, + { + "epoch": 0.0005637208595409272, + "grad_norm": 0.3644692003726959, + "learning_rate": 9.31552587646077e-05, + "loss": 0.5223, + "step": 1606 + }, + { + "epoch": 0.0005640718687934434, + "grad_norm": 0.365633100271225, + "learning_rate": 9.308848080133557e-05, + "loss": 0.3695, + "step": 1607 + }, + { + "epoch": 0.0005644228780459596, + "grad_norm": 0.3256838917732239, + "learning_rate": 9.302170283806344e-05, + "loss": 0.5484, + "step": 1608 + }, + { + "epoch": 0.0005647738872984758, + "grad_norm": 0.26042798161506653, + "learning_rate": 9.295492487479133e-05, + "loss": 0.529, + "step": 1609 + }, + { + "epoch": 0.0005651248965509918, + "grad_norm": 0.27954763174057007, + "learning_rate": 9.28881469115192e-05, + "loss": 0.5216, + "step": 1610 + }, + { + "epoch": 0.000565475905803508, + "grad_norm": 0.3117378354072571, + "learning_rate": 9.282136894824707e-05, + "loss": 0.4835, + "step": 1611 + }, + { + "epoch": 0.0005658269150560242, + "grad_norm": 0.3219063878059387, + "learning_rate": 9.275459098497496e-05, + "loss": 0.6403, + "step": 1612 + }, + { + "epoch": 0.0005661779243085403, + "grad_norm": 0.32121285796165466, + "learning_rate": 9.268781302170285e-05, + "loss": 0.5992, + "step": 1613 + }, + { + "epoch": 0.0005665289335610564, + "grad_norm": 0.2896992564201355, + "learning_rate": 9.262103505843073e-05, + "loss": 0.4995, + "step": 1614 + }, + { + "epoch": 0.0005668799428135726, + "grad_norm": 0.311301589012146, + "learning_rate": 9.255425709515861e-05, + "loss": 0.5201, + "step": 1615 + }, + { + "epoch": 0.0005672309520660887, + "grad_norm": 0.2977074682712555, + "learning_rate": 9.248747913188648e-05, + "loss": 0.5557, + "step": 1616 + }, + { + "epoch": 0.0005675819613186049, + "grad_norm": 0.315746009349823, + "learning_rate": 9.242070116861437e-05, + "loss": 0.5746, + "step": 1617 + }, + { + "epoch": 0.000567932970571121, + "grad_norm": 0.323231965303421, + "learning_rate": 9.235392320534224e-05, + "loss": 0.5714, + "step": 1618 + }, + { + "epoch": 0.0005682839798236372, + "grad_norm": 0.30381882190704346, + "learning_rate": 9.228714524207011e-05, + "loss": 0.5279, + "step": 1619 + }, + { + "epoch": 0.0005686349890761533, + "grad_norm": 0.3350276052951813, + "learning_rate": 9.2220367278798e-05, + "loss": 0.4504, + "step": 1620 + }, + { + "epoch": 0.0005689859983286694, + "grad_norm": 0.3821620047092438, + "learning_rate": 9.215358931552587e-05, + "loss": 0.4713, + "step": 1621 + }, + { + "epoch": 0.0005693370075811856, + "grad_norm": 0.299938827753067, + "learning_rate": 9.208681135225376e-05, + "loss": 0.5426, + "step": 1622 + }, + { + "epoch": 0.0005696880168337017, + "grad_norm": 0.3533617854118347, + "learning_rate": 9.202003338898165e-05, + "loss": 0.5947, + "step": 1623 + }, + { + "epoch": 0.0005700390260862179, + "grad_norm": 0.5132538080215454, + "learning_rate": 9.195325542570952e-05, + "loss": 0.4809, + "step": 1624 + }, + { + "epoch": 0.000570390035338734, + "grad_norm": 0.28735020756721497, + "learning_rate": 9.18864774624374e-05, + "loss": 0.5597, + "step": 1625 + }, + { + "epoch": 0.0005707410445912501, + "grad_norm": 0.3230040669441223, + "learning_rate": 9.181969949916528e-05, + "loss": 0.5099, + "step": 1626 + }, + { + "epoch": 0.0005710920538437663, + "grad_norm": 0.3185240924358368, + "learning_rate": 9.175292153589315e-05, + "loss": 0.5443, + "step": 1627 + }, + { + "epoch": 0.0005714430630962825, + "grad_norm": 0.3230789005756378, + "learning_rate": 9.168614357262104e-05, + "loss": 0.4757, + "step": 1628 + }, + { + "epoch": 0.0005717940723487986, + "grad_norm": 0.3181735873222351, + "learning_rate": 9.161936560934891e-05, + "loss": 0.4645, + "step": 1629 + }, + { + "epoch": 0.0005721450816013147, + "grad_norm": 0.31638282537460327, + "learning_rate": 9.15525876460768e-05, + "loss": 0.6041, + "step": 1630 + }, + { + "epoch": 0.0005724960908538309, + "grad_norm": 0.31525102257728577, + "learning_rate": 9.148580968280469e-05, + "loss": 0.5238, + "step": 1631 + }, + { + "epoch": 0.0005728471001063471, + "grad_norm": 0.27146804332733154, + "learning_rate": 9.141903171953256e-05, + "loss": 0.5115, + "step": 1632 + }, + { + "epoch": 0.0005731981093588631, + "grad_norm": 0.28801295161247253, + "learning_rate": 9.135225375626045e-05, + "loss": 0.4111, + "step": 1633 + }, + { + "epoch": 0.0005735491186113793, + "grad_norm": 0.3048948645591736, + "learning_rate": 9.128547579298832e-05, + "loss": 0.5293, + "step": 1634 + }, + { + "epoch": 0.0005739001278638955, + "grad_norm": 0.31797000765800476, + "learning_rate": 9.121869782971619e-05, + "loss": 0.5365, + "step": 1635 + }, + { + "epoch": 0.0005742511371164116, + "grad_norm": 0.3156517446041107, + "learning_rate": 9.115191986644408e-05, + "loss": 0.6072, + "step": 1636 + }, + { + "epoch": 0.0005746021463689277, + "grad_norm": 0.28218841552734375, + "learning_rate": 9.108514190317195e-05, + "loss": 0.5127, + "step": 1637 + }, + { + "epoch": 0.0005749531556214439, + "grad_norm": 0.34264588356018066, + "learning_rate": 9.101836393989984e-05, + "loss": 0.5442, + "step": 1638 + }, + { + "epoch": 0.0005753041648739601, + "grad_norm": 0.31075727939605713, + "learning_rate": 9.095158597662771e-05, + "loss": 0.4853, + "step": 1639 + }, + { + "epoch": 0.0005756551741264762, + "grad_norm": 0.34270209074020386, + "learning_rate": 9.08848080133556e-05, + "loss": 0.5188, + "step": 1640 + }, + { + "epoch": 0.0005760061833789923, + "grad_norm": 0.3420792520046234, + "learning_rate": 9.081803005008348e-05, + "loss": 0.552, + "step": 1641 + }, + { + "epoch": 0.0005763571926315085, + "grad_norm": 0.24184514582157135, + "learning_rate": 9.075125208681136e-05, + "loss": 0.4318, + "step": 1642 + }, + { + "epoch": 0.0005767082018840246, + "grad_norm": 0.27248474955558777, + "learning_rate": 9.068447412353923e-05, + "loss": 0.4984, + "step": 1643 + }, + { + "epoch": 0.0005770592111365408, + "grad_norm": 0.2861645817756653, + "learning_rate": 9.061769616026712e-05, + "loss": 0.4954, + "step": 1644 + }, + { + "epoch": 0.0005774102203890569, + "grad_norm": 0.3070414662361145, + "learning_rate": 9.055091819699499e-05, + "loss": 0.5734, + "step": 1645 + }, + { + "epoch": 0.000577761229641573, + "grad_norm": 0.32180657982826233, + "learning_rate": 9.048414023372288e-05, + "loss": 0.595, + "step": 1646 + }, + { + "epoch": 0.0005781122388940892, + "grad_norm": 0.29433441162109375, + "learning_rate": 9.041736227045075e-05, + "loss": 0.4721, + "step": 1647 + }, + { + "epoch": 0.0005784632481466053, + "grad_norm": 0.28735247254371643, + "learning_rate": 9.035058430717864e-05, + "loss": 0.5441, + "step": 1648 + }, + { + "epoch": 0.0005788142573991215, + "grad_norm": 0.38344794511795044, + "learning_rate": 9.028380634390652e-05, + "loss": 0.6197, + "step": 1649 + }, + { + "epoch": 0.0005791652666516376, + "grad_norm": 0.32271769642829895, + "learning_rate": 9.02170283806344e-05, + "loss": 0.5229, + "step": 1650 + }, + { + "epoch": 0.0005795162759041538, + "grad_norm": 0.27504557371139526, + "learning_rate": 9.015025041736227e-05, + "loss": 0.432, + "step": 1651 + }, + { + "epoch": 0.00057986728515667, + "grad_norm": 0.3397347033023834, + "learning_rate": 9.008347245409016e-05, + "loss": 0.5546, + "step": 1652 + }, + { + "epoch": 0.000580218294409186, + "grad_norm": 0.3478119671344757, + "learning_rate": 9.001669449081803e-05, + "loss": 0.5094, + "step": 1653 + }, + { + "epoch": 0.0005805693036617022, + "grad_norm": 0.3200027644634247, + "learning_rate": 8.994991652754592e-05, + "loss": 0.4964, + "step": 1654 + }, + { + "epoch": 0.0005809203129142184, + "grad_norm": 0.3458947539329529, + "learning_rate": 8.988313856427379e-05, + "loss": 0.5945, + "step": 1655 + }, + { + "epoch": 0.0005812713221667344, + "grad_norm": 0.30390462279319763, + "learning_rate": 8.981636060100166e-05, + "loss": 0.5664, + "step": 1656 + }, + { + "epoch": 0.0005816223314192506, + "grad_norm": 0.32214075326919556, + "learning_rate": 8.974958263772955e-05, + "loss": 0.464, + "step": 1657 + }, + { + "epoch": 0.0005819733406717668, + "grad_norm": 0.3261844217777252, + "learning_rate": 8.968280467445744e-05, + "loss": 0.6139, + "step": 1658 + }, + { + "epoch": 0.000582324349924283, + "grad_norm": 0.30164632201194763, + "learning_rate": 8.961602671118531e-05, + "loss": 0.4767, + "step": 1659 + }, + { + "epoch": 0.000582675359176799, + "grad_norm": 0.27412328124046326, + "learning_rate": 8.95492487479132e-05, + "loss": 0.4773, + "step": 1660 + }, + { + "epoch": 0.0005830263684293152, + "grad_norm": 0.3026188313961029, + "learning_rate": 8.948247078464107e-05, + "loss": 0.5091, + "step": 1661 + }, + { + "epoch": 0.0005833773776818314, + "grad_norm": 0.4182475507259369, + "learning_rate": 8.941569282136896e-05, + "loss": 0.4763, + "step": 1662 + }, + { + "epoch": 0.0005837283869343475, + "grad_norm": 0.32345879077911377, + "learning_rate": 8.934891485809683e-05, + "loss": 0.4365, + "step": 1663 + }, + { + "epoch": 0.0005840793961868636, + "grad_norm": 0.27278438210487366, + "learning_rate": 8.92821368948247e-05, + "loss": 0.4126, + "step": 1664 + }, + { + "epoch": 0.0005844304054393798, + "grad_norm": 0.2701342701911926, + "learning_rate": 8.921535893155259e-05, + "loss": 0.44, + "step": 1665 + }, + { + "epoch": 0.0005847814146918959, + "grad_norm": 0.33415308594703674, + "learning_rate": 8.914858096828048e-05, + "loss": 0.4873, + "step": 1666 + }, + { + "epoch": 0.0005851324239444121, + "grad_norm": 0.25953027606010437, + "learning_rate": 8.908180300500835e-05, + "loss": 0.5047, + "step": 1667 + }, + { + "epoch": 0.0005854834331969282, + "grad_norm": 0.2938767373561859, + "learning_rate": 8.901502504173624e-05, + "loss": 0.5472, + "step": 1668 + }, + { + "epoch": 0.0005858344424494444, + "grad_norm": 0.34639960527420044, + "learning_rate": 8.894824707846411e-05, + "loss": 0.4647, + "step": 1669 + }, + { + "epoch": 0.0005861854517019605, + "grad_norm": 0.30084356665611267, + "learning_rate": 8.8881469115192e-05, + "loss": 0.5751, + "step": 1670 + }, + { + "epoch": 0.0005865364609544767, + "grad_norm": 0.3419461250305176, + "learning_rate": 8.881469115191987e-05, + "loss": 0.5945, + "step": 1671 + }, + { + "epoch": 0.0005868874702069928, + "grad_norm": 0.30969375371932983, + "learning_rate": 8.874791318864774e-05, + "loss": 0.476, + "step": 1672 + }, + { + "epoch": 0.0005872384794595089, + "grad_norm": 0.2766319513320923, + "learning_rate": 8.868113522537563e-05, + "loss": 0.471, + "step": 1673 + }, + { + "epoch": 0.0005875894887120251, + "grad_norm": 0.2892490327358246, + "learning_rate": 8.86143572621035e-05, + "loss": 0.5525, + "step": 1674 + }, + { + "epoch": 0.0005879404979645413, + "grad_norm": 0.2913951575756073, + "learning_rate": 8.854757929883139e-05, + "loss": 0.5969, + "step": 1675 + }, + { + "epoch": 0.0005882915072170573, + "grad_norm": 0.3010789155960083, + "learning_rate": 8.848080133555928e-05, + "loss": 0.4817, + "step": 1676 + }, + { + "epoch": 0.0005886425164695735, + "grad_norm": 0.29977700114250183, + "learning_rate": 8.841402337228715e-05, + "loss": 0.4793, + "step": 1677 + }, + { + "epoch": 0.0005889935257220897, + "grad_norm": 0.3283400535583496, + "learning_rate": 8.834724540901504e-05, + "loss": 0.5056, + "step": 1678 + }, + { + "epoch": 0.0005893445349746058, + "grad_norm": 0.30444255471229553, + "learning_rate": 8.828046744574291e-05, + "loss": 0.4955, + "step": 1679 + }, + { + "epoch": 0.0005896955442271219, + "grad_norm": 0.3443448543548584, + "learning_rate": 8.821368948247078e-05, + "loss": 0.5143, + "step": 1680 + }, + { + "epoch": 0.0005900465534796381, + "grad_norm": 0.29445815086364746, + "learning_rate": 8.814691151919867e-05, + "loss": 0.5487, + "step": 1681 + }, + { + "epoch": 0.0005903975627321543, + "grad_norm": 0.2663688659667969, + "learning_rate": 8.808013355592654e-05, + "loss": 0.4625, + "step": 1682 + }, + { + "epoch": 0.0005907485719846703, + "grad_norm": 0.3313208222389221, + "learning_rate": 8.801335559265443e-05, + "loss": 0.5043, + "step": 1683 + }, + { + "epoch": 0.0005910995812371865, + "grad_norm": 0.33829203248023987, + "learning_rate": 8.794657762938232e-05, + "loss": 0.5575, + "step": 1684 + }, + { + "epoch": 0.0005914505904897027, + "grad_norm": 0.2788808047771454, + "learning_rate": 8.787979966611019e-05, + "loss": 0.3439, + "step": 1685 + }, + { + "epoch": 0.0005918015997422188, + "grad_norm": 0.2924749255180359, + "learning_rate": 8.781302170283808e-05, + "loss": 0.5249, + "step": 1686 + }, + { + "epoch": 0.0005921526089947349, + "grad_norm": 0.3375588357448578, + "learning_rate": 8.774624373956595e-05, + "loss": 0.5204, + "step": 1687 + }, + { + "epoch": 0.0005925036182472511, + "grad_norm": 0.31543827056884766, + "learning_rate": 8.767946577629382e-05, + "loss": 0.547, + "step": 1688 + }, + { + "epoch": 0.0005928546274997673, + "grad_norm": 0.29130932688713074, + "learning_rate": 8.761268781302171e-05, + "loss": 0.4064, + "step": 1689 + }, + { + "epoch": 0.0005932056367522834, + "grad_norm": 0.28948086500167847, + "learning_rate": 8.754590984974958e-05, + "loss": 0.4538, + "step": 1690 + }, + { + "epoch": 0.0005935566460047995, + "grad_norm": 0.3201799690723419, + "learning_rate": 8.747913188647745e-05, + "loss": 0.5578, + "step": 1691 + }, + { + "epoch": 0.0005939076552573157, + "grad_norm": 0.3169330954551697, + "learning_rate": 8.741235392320535e-05, + "loss": 0.5626, + "step": 1692 + }, + { + "epoch": 0.0005942586645098318, + "grad_norm": 0.34727850556373596, + "learning_rate": 8.734557595993323e-05, + "loss": 0.4209, + "step": 1693 + }, + { + "epoch": 0.000594609673762348, + "grad_norm": 0.3186934292316437, + "learning_rate": 8.727879799666111e-05, + "loss": 0.5401, + "step": 1694 + }, + { + "epoch": 0.0005949606830148641, + "grad_norm": 0.34129294753074646, + "learning_rate": 8.721202003338899e-05, + "loss": 0.4617, + "step": 1695 + }, + { + "epoch": 0.0005953116922673802, + "grad_norm": 0.3374929130077362, + "learning_rate": 8.714524207011686e-05, + "loss": 0.5558, + "step": 1696 + }, + { + "epoch": 0.0005956627015198964, + "grad_norm": 0.30274853110313416, + "learning_rate": 8.707846410684475e-05, + "loss": 0.4544, + "step": 1697 + }, + { + "epoch": 0.0005960137107724126, + "grad_norm": 0.3348468244075775, + "learning_rate": 8.701168614357262e-05, + "loss": 0.641, + "step": 1698 + }, + { + "epoch": 0.0005963647200249287, + "grad_norm": 0.2674828767776489, + "learning_rate": 8.694490818030051e-05, + "loss": 0.5649, + "step": 1699 + }, + { + "epoch": 0.0005967157292774448, + "grad_norm": 0.3447219729423523, + "learning_rate": 8.687813021702838e-05, + "loss": 0.4618, + "step": 1700 + }, + { + "epoch": 0.000597066738529961, + "grad_norm": 0.3155357241630554, + "learning_rate": 8.681135225375627e-05, + "loss": 0.4557, + "step": 1701 + }, + { + "epoch": 0.0005974177477824772, + "grad_norm": 0.2937457263469696, + "learning_rate": 8.674457429048415e-05, + "loss": 0.65, + "step": 1702 + }, + { + "epoch": 0.0005977687570349932, + "grad_norm": 0.287835031747818, + "learning_rate": 8.667779632721203e-05, + "loss": 0.4525, + "step": 1703 + }, + { + "epoch": 0.0005981197662875094, + "grad_norm": 0.3285943865776062, + "learning_rate": 8.66110183639399e-05, + "loss": 0.4686, + "step": 1704 + }, + { + "epoch": 0.0005984707755400256, + "grad_norm": 0.3463473618030548, + "learning_rate": 8.654424040066779e-05, + "loss": 0.5349, + "step": 1705 + }, + { + "epoch": 0.0005988217847925416, + "grad_norm": 0.3047028183937073, + "learning_rate": 8.647746243739566e-05, + "loss": 0.4551, + "step": 1706 + }, + { + "epoch": 0.0005991727940450578, + "grad_norm": 0.2832798361778259, + "learning_rate": 8.641068447412355e-05, + "loss": 0.4721, + "step": 1707 + }, + { + "epoch": 0.000599523803297574, + "grad_norm": 0.3024655878543854, + "learning_rate": 8.634390651085142e-05, + "loss": 0.4971, + "step": 1708 + }, + { + "epoch": 0.0005998748125500902, + "grad_norm": 0.2802872657775879, + "learning_rate": 8.62771285475793e-05, + "loss": 0.4598, + "step": 1709 + }, + { + "epoch": 0.0006002258218026062, + "grad_norm": 0.2773732841014862, + "learning_rate": 8.62103505843072e-05, + "loss": 0.4215, + "step": 1710 + }, + { + "epoch": 0.0006005768310551224, + "grad_norm": 0.3328293263912201, + "learning_rate": 8.614357262103507e-05, + "loss": 0.4502, + "step": 1711 + }, + { + "epoch": 0.0006009278403076386, + "grad_norm": 0.3046766519546509, + "learning_rate": 8.607679465776294e-05, + "loss": 0.4578, + "step": 1712 + }, + { + "epoch": 0.0006012788495601547, + "grad_norm": 0.33364781737327576, + "learning_rate": 8.601001669449083e-05, + "loss": 0.5184, + "step": 1713 + }, + { + "epoch": 0.0006016298588126708, + "grad_norm": 0.3627041280269623, + "learning_rate": 8.59432387312187e-05, + "loss": 0.5041, + "step": 1714 + }, + { + "epoch": 0.000601980868065187, + "grad_norm": 0.3411107361316681, + "learning_rate": 8.587646076794659e-05, + "loss": 0.4983, + "step": 1715 + }, + { + "epoch": 0.0006023318773177031, + "grad_norm": 0.3014586865901947, + "learning_rate": 8.580968280467446e-05, + "loss": 0.6105, + "step": 1716 + }, + { + "epoch": 0.0006026828865702193, + "grad_norm": 0.29484355449676514, + "learning_rate": 8.574290484140233e-05, + "loss": 0.4038, + "step": 1717 + }, + { + "epoch": 0.0006030338958227354, + "grad_norm": 0.37084364891052246, + "learning_rate": 8.567612687813022e-05, + "loss": 0.4537, + "step": 1718 + }, + { + "epoch": 0.0006033849050752516, + "grad_norm": 0.29114142060279846, + "learning_rate": 8.56093489148581e-05, + "loss": 0.5568, + "step": 1719 + }, + { + "epoch": 0.0006037359143277677, + "grad_norm": 0.3706299662590027, + "learning_rate": 8.554257095158598e-05, + "loss": 0.5678, + "step": 1720 + }, + { + "epoch": 0.0006040869235802839, + "grad_norm": 0.3251887857913971, + "learning_rate": 8.547579298831387e-05, + "loss": 0.573, + "step": 1721 + }, + { + "epoch": 0.0006044379328328, + "grad_norm": 0.28198716044425964, + "learning_rate": 8.540901502504174e-05, + "loss": 0.4209, + "step": 1722 + }, + { + "epoch": 0.0006047889420853161, + "grad_norm": 0.2896440029144287, + "learning_rate": 8.534223706176963e-05, + "loss": 0.4579, + "step": 1723 + }, + { + "epoch": 0.0006051399513378323, + "grad_norm": 0.3755309283733368, + "learning_rate": 8.52754590984975e-05, + "loss": 0.5178, + "step": 1724 + }, + { + "epoch": 0.0006054909605903485, + "grad_norm": 0.37272268533706665, + "learning_rate": 8.520868113522537e-05, + "loss": 0.5911, + "step": 1725 + }, + { + "epoch": 0.0006058419698428645, + "grad_norm": 0.29033470153808594, + "learning_rate": 8.514190317195326e-05, + "loss": 0.4492, + "step": 1726 + }, + { + "epoch": 0.0006061929790953807, + "grad_norm": 0.2940375804901123, + "learning_rate": 8.507512520868115e-05, + "loss": 0.4917, + "step": 1727 + }, + { + "epoch": 0.0006065439883478969, + "grad_norm": 0.3448154926300049, + "learning_rate": 8.500834724540902e-05, + "loss": 0.5053, + "step": 1728 + }, + { + "epoch": 0.000606894997600413, + "grad_norm": 0.30485787987709045, + "learning_rate": 8.49415692821369e-05, + "loss": 0.4761, + "step": 1729 + }, + { + "epoch": 0.0006072460068529291, + "grad_norm": 0.33083775639533997, + "learning_rate": 8.487479131886478e-05, + "loss": 0.5504, + "step": 1730 + }, + { + "epoch": 0.0006075970161054453, + "grad_norm": 0.2886825203895569, + "learning_rate": 8.480801335559267e-05, + "loss": 0.5908, + "step": 1731 + }, + { + "epoch": 0.0006079480253579615, + "grad_norm": 0.3262576758861542, + "learning_rate": 8.474123539232054e-05, + "loss": 0.4562, + "step": 1732 + }, + { + "epoch": 0.0006082990346104775, + "grad_norm": 0.31888243556022644, + "learning_rate": 8.467445742904841e-05, + "loss": 0.6006, + "step": 1733 + }, + { + "epoch": 0.0006086500438629937, + "grad_norm": 0.33102548122406006, + "learning_rate": 8.46076794657763e-05, + "loss": 0.5731, + "step": 1734 + }, + { + "epoch": 0.0006090010531155099, + "grad_norm": 0.31176602840423584, + "learning_rate": 8.454090150250417e-05, + "loss": 0.4979, + "step": 1735 + }, + { + "epoch": 0.000609352062368026, + "grad_norm": 0.30639031529426575, + "learning_rate": 8.447412353923206e-05, + "loss": 0.5953, + "step": 1736 + }, + { + "epoch": 0.0006097030716205421, + "grad_norm": 0.3576785922050476, + "learning_rate": 8.440734557595994e-05, + "loss": 0.612, + "step": 1737 + }, + { + "epoch": 0.0006100540808730583, + "grad_norm": 0.3325173854827881, + "learning_rate": 8.434056761268782e-05, + "loss": 0.5577, + "step": 1738 + }, + { + "epoch": 0.0006104050901255745, + "grad_norm": 0.3713616728782654, + "learning_rate": 8.42737896494157e-05, + "loss": 0.5457, + "step": 1739 + }, + { + "epoch": 0.0006107560993780906, + "grad_norm": 0.37327736616134644, + "learning_rate": 8.420701168614358e-05, + "loss": 0.4726, + "step": 1740 + }, + { + "epoch": 0.0006111071086306067, + "grad_norm": 0.3603207468986511, + "learning_rate": 8.414023372287145e-05, + "loss": 0.5489, + "step": 1741 + }, + { + "epoch": 0.0006114581178831229, + "grad_norm": 0.30581197142601013, + "learning_rate": 8.407345575959934e-05, + "loss": 0.4219, + "step": 1742 + }, + { + "epoch": 0.000611809127135639, + "grad_norm": 0.3137530982494354, + "learning_rate": 8.400667779632721e-05, + "loss": 0.4862, + "step": 1743 + }, + { + "epoch": 0.0006121601363881552, + "grad_norm": 0.28663527965545654, + "learning_rate": 8.39398998330551e-05, + "loss": 0.4886, + "step": 1744 + }, + { + "epoch": 0.0006125111456406713, + "grad_norm": 0.28816184401512146, + "learning_rate": 8.387312186978298e-05, + "loss": 0.4536, + "step": 1745 + }, + { + "epoch": 0.0006128621548931874, + "grad_norm": 0.36478331685066223, + "learning_rate": 8.380634390651086e-05, + "loss": 0.4393, + "step": 1746 + }, + { + "epoch": 0.0006132131641457036, + "grad_norm": 0.34497642517089844, + "learning_rate": 8.373956594323874e-05, + "loss": 0.4783, + "step": 1747 + }, + { + "epoch": 0.0006135641733982198, + "grad_norm": 0.34038984775543213, + "learning_rate": 8.367278797996662e-05, + "loss": 0.4024, + "step": 1748 + }, + { + "epoch": 0.0006139151826507358, + "grad_norm": 0.42788851261138916, + "learning_rate": 8.360601001669449e-05, + "loss": 0.4738, + "step": 1749 + }, + { + "epoch": 0.000614266191903252, + "grad_norm": 0.3174630105495453, + "learning_rate": 8.353923205342238e-05, + "loss": 0.571, + "step": 1750 + }, + { + "epoch": 0.0006146172011557682, + "grad_norm": 0.43922609090805054, + "learning_rate": 8.347245409015025e-05, + "loss": 0.6078, + "step": 1751 + }, + { + "epoch": 0.0006149682104082844, + "grad_norm": 0.3589128255844116, + "learning_rate": 8.340567612687812e-05, + "loss": 0.6748, + "step": 1752 + }, + { + "epoch": 0.0006153192196608004, + "grad_norm": 0.36477571725845337, + "learning_rate": 8.333889816360601e-05, + "loss": 0.4796, + "step": 1753 + }, + { + "epoch": 0.0006156702289133166, + "grad_norm": 0.3312797546386719, + "learning_rate": 8.32721202003339e-05, + "loss": 0.5847, + "step": 1754 + }, + { + "epoch": 0.0006160212381658328, + "grad_norm": 0.3113849461078644, + "learning_rate": 8.320534223706178e-05, + "loss": 0.5345, + "step": 1755 + }, + { + "epoch": 0.0006163722474183488, + "grad_norm": 0.3181850016117096, + "learning_rate": 8.313856427378966e-05, + "loss": 0.5949, + "step": 1756 + }, + { + "epoch": 0.000616723256670865, + "grad_norm": 0.44424140453338623, + "learning_rate": 8.307178631051753e-05, + "loss": 0.5702, + "step": 1757 + }, + { + "epoch": 0.0006170742659233812, + "grad_norm": 0.3985821604728699, + "learning_rate": 8.300500834724542e-05, + "loss": 0.5699, + "step": 1758 + }, + { + "epoch": 0.0006174252751758973, + "grad_norm": 0.3222169280052185, + "learning_rate": 8.293823038397329e-05, + "loss": 0.5349, + "step": 1759 + }, + { + "epoch": 0.0006177762844284134, + "grad_norm": 0.4233343303203583, + "learning_rate": 8.287145242070116e-05, + "loss": 0.5031, + "step": 1760 + }, + { + "epoch": 0.0006181272936809296, + "grad_norm": 0.3432156443595886, + "learning_rate": 8.280467445742905e-05, + "loss": 0.5084, + "step": 1761 + }, + { + "epoch": 0.0006184783029334458, + "grad_norm": 0.33886751532554626, + "learning_rate": 8.273789649415694e-05, + "loss": 0.4592, + "step": 1762 + }, + { + "epoch": 0.0006188293121859619, + "grad_norm": 0.3379828929901123, + "learning_rate": 8.267111853088482e-05, + "loss": 0.4691, + "step": 1763 + }, + { + "epoch": 0.000619180321438478, + "grad_norm": 0.2838027775287628, + "learning_rate": 8.26043405676127e-05, + "loss": 0.5345, + "step": 1764 + }, + { + "epoch": 0.0006195313306909942, + "grad_norm": 0.3198727071285248, + "learning_rate": 8.253756260434057e-05, + "loss": 0.6029, + "step": 1765 + }, + { + "epoch": 0.0006198823399435103, + "grad_norm": 0.37079837918281555, + "learning_rate": 8.247078464106846e-05, + "loss": 0.6643, + "step": 1766 + }, + { + "epoch": 0.0006202333491960265, + "grad_norm": 0.3130449652671814, + "learning_rate": 8.240400667779633e-05, + "loss": 0.5585, + "step": 1767 + }, + { + "epoch": 0.0006205843584485426, + "grad_norm": 0.29854029417037964, + "learning_rate": 8.23372287145242e-05, + "loss": 0.5202, + "step": 1768 + }, + { + "epoch": 0.0006209353677010587, + "grad_norm": 0.3536113202571869, + "learning_rate": 8.227045075125209e-05, + "loss": 0.5882, + "step": 1769 + }, + { + "epoch": 0.0006212863769535749, + "grad_norm": 0.2841801941394806, + "learning_rate": 8.220367278797996e-05, + "loss": 0.4227, + "step": 1770 + }, + { + "epoch": 0.0006216373862060911, + "grad_norm": 0.32225102186203003, + "learning_rate": 8.213689482470785e-05, + "loss": 0.5545, + "step": 1771 + }, + { + "epoch": 0.0006219883954586072, + "grad_norm": 0.3385821282863617, + "learning_rate": 8.207011686143574e-05, + "loss": 0.5307, + "step": 1772 + }, + { + "epoch": 0.0006223394047111233, + "grad_norm": 0.3400219976902008, + "learning_rate": 8.200333889816361e-05, + "loss": 0.5664, + "step": 1773 + }, + { + "epoch": 0.0006226904139636395, + "grad_norm": 0.4283548593521118, + "learning_rate": 8.19365609348915e-05, + "loss": 0.4957, + "step": 1774 + }, + { + "epoch": 0.0006230414232161557, + "grad_norm": 0.3625548779964447, + "learning_rate": 8.186978297161937e-05, + "loss": 0.4819, + "step": 1775 + }, + { + "epoch": 0.0006233924324686717, + "grad_norm": 0.34131062030792236, + "learning_rate": 8.180300500834724e-05, + "loss": 0.5277, + "step": 1776 + }, + { + "epoch": 0.0006237434417211879, + "grad_norm": 0.3383775055408478, + "learning_rate": 8.173622704507513e-05, + "loss": 0.5539, + "step": 1777 + }, + { + "epoch": 0.0006240944509737041, + "grad_norm": 0.2844056785106659, + "learning_rate": 8.1669449081803e-05, + "loss": 0.4959, + "step": 1778 + }, + { + "epoch": 0.0006244454602262201, + "grad_norm": 0.3345259428024292, + "learning_rate": 8.160267111853089e-05, + "loss": 0.5136, + "step": 1779 + }, + { + "epoch": 0.0006247964694787363, + "grad_norm": 0.32142356038093567, + "learning_rate": 8.153589315525877e-05, + "loss": 0.5348, + "step": 1780 + }, + { + "epoch": 0.0006251474787312525, + "grad_norm": 0.30291274189949036, + "learning_rate": 8.146911519198665e-05, + "loss": 0.5296, + "step": 1781 + }, + { + "epoch": 0.0006254984879837687, + "grad_norm": 0.36180031299591064, + "learning_rate": 8.140233722871453e-05, + "loss": 0.5498, + "step": 1782 + }, + { + "epoch": 0.0006258494972362847, + "grad_norm": 0.2952847182750702, + "learning_rate": 8.133555926544241e-05, + "loss": 0.5233, + "step": 1783 + }, + { + "epoch": 0.0006262005064888009, + "grad_norm": 0.2964370846748352, + "learning_rate": 8.126878130217028e-05, + "loss": 0.5787, + "step": 1784 + }, + { + "epoch": 0.0006265515157413171, + "grad_norm": 0.3017970323562622, + "learning_rate": 8.120200333889817e-05, + "loss": 0.5927, + "step": 1785 + }, + { + "epoch": 0.0006269025249938332, + "grad_norm": 0.32457467913627625, + "learning_rate": 8.113522537562604e-05, + "loss": 0.6207, + "step": 1786 + }, + { + "epoch": 0.0006272535342463493, + "grad_norm": 0.3024297058582306, + "learning_rate": 8.106844741235393e-05, + "loss": 0.5379, + "step": 1787 + }, + { + "epoch": 0.0006276045434988655, + "grad_norm": 0.2766537368297577, + "learning_rate": 8.10016694490818e-05, + "loss": 0.432, + "step": 1788 + }, + { + "epoch": 0.0006279555527513816, + "grad_norm": 0.3326070308685303, + "learning_rate": 8.093489148580969e-05, + "loss": 0.6633, + "step": 1789 + }, + { + "epoch": 0.0006283065620038978, + "grad_norm": 0.2948818802833557, + "learning_rate": 8.086811352253757e-05, + "loss": 0.4987, + "step": 1790 + }, + { + "epoch": 0.0006286575712564139, + "grad_norm": 0.28426218032836914, + "learning_rate": 8.080133555926545e-05, + "loss": 0.442, + "step": 1791 + }, + { + "epoch": 0.0006290085805089301, + "grad_norm": 0.30030035972595215, + "learning_rate": 8.073455759599332e-05, + "loss": 0.6064, + "step": 1792 + }, + { + "epoch": 0.0006293595897614462, + "grad_norm": 0.30664128065109253, + "learning_rate": 8.066777963272121e-05, + "loss": 0.4789, + "step": 1793 + }, + { + "epoch": 0.0006297105990139624, + "grad_norm": 0.30878594517707825, + "learning_rate": 8.060100166944908e-05, + "loss": 0.5365, + "step": 1794 + }, + { + "epoch": 0.0006300616082664785, + "grad_norm": 0.31132617592811584, + "learning_rate": 8.053422370617697e-05, + "loss": 0.5432, + "step": 1795 + }, + { + "epoch": 0.0006304126175189946, + "grad_norm": 0.3347366154193878, + "learning_rate": 8.046744574290484e-05, + "loss": 0.4208, + "step": 1796 + }, + { + "epoch": 0.0006307636267715108, + "grad_norm": 0.3419090509414673, + "learning_rate": 8.040066777963273e-05, + "loss": 0.4985, + "step": 1797 + }, + { + "epoch": 0.000631114636024027, + "grad_norm": 0.3174959719181061, + "learning_rate": 8.033388981636061e-05, + "loss": 0.4255, + "step": 1798 + }, + { + "epoch": 0.000631465645276543, + "grad_norm": 0.32764488458633423, + "learning_rate": 8.026711185308849e-05, + "loss": 0.6213, + "step": 1799 + }, + { + "epoch": 0.0006318166545290592, + "grad_norm": 0.3342370390892029, + "learning_rate": 8.020033388981636e-05, + "loss": 0.4789, + "step": 1800 + }, + { + "epoch": 0.0006321676637815754, + "grad_norm": 0.301438570022583, + "learning_rate": 8.013355592654425e-05, + "loss": 0.5937, + "step": 1801 + }, + { + "epoch": 0.0006325186730340916, + "grad_norm": 0.31911852955818176, + "learning_rate": 8.006677796327212e-05, + "loss": 0.5831, + "step": 1802 + }, + { + "epoch": 0.0006328696822866076, + "grad_norm": 0.2970680296421051, + "learning_rate": 8e-05, + "loss": 0.5223, + "step": 1803 + }, + { + "epoch": 0.0006332206915391238, + "grad_norm": 0.29310017824172974, + "learning_rate": 7.993322203672788e-05, + "loss": 0.5266, + "step": 1804 + }, + { + "epoch": 0.00063357170079164, + "grad_norm": 0.34701675176620483, + "learning_rate": 7.986644407345575e-05, + "loss": 0.4887, + "step": 1805 + }, + { + "epoch": 0.000633922710044156, + "grad_norm": 0.24955204129219055, + "learning_rate": 7.979966611018364e-05, + "loss": 0.437, + "step": 1806 + }, + { + "epoch": 0.0006342737192966722, + "grad_norm": 0.33152899146080017, + "learning_rate": 7.973288814691153e-05, + "loss": 0.5932, + "step": 1807 + }, + { + "epoch": 0.0006346247285491884, + "grad_norm": 0.2790103852748871, + "learning_rate": 7.96661101836394e-05, + "loss": 0.4585, + "step": 1808 + }, + { + "epoch": 0.0006349757378017045, + "grad_norm": 0.30877217650413513, + "learning_rate": 7.959933222036729e-05, + "loss": 0.5174, + "step": 1809 + }, + { + "epoch": 0.0006353267470542206, + "grad_norm": 0.38331231474876404, + "learning_rate": 7.953255425709516e-05, + "loss": 0.5696, + "step": 1810 + }, + { + "epoch": 0.0006356777563067368, + "grad_norm": 0.35821542143821716, + "learning_rate": 7.946577629382305e-05, + "loss": 0.4815, + "step": 1811 + }, + { + "epoch": 0.000636028765559253, + "grad_norm": 0.3109416365623474, + "learning_rate": 7.939899833055092e-05, + "loss": 0.5783, + "step": 1812 + }, + { + "epoch": 0.0006363797748117691, + "grad_norm": 0.3217208683490753, + "learning_rate": 7.933222036727879e-05, + "loss": 0.5606, + "step": 1813 + }, + { + "epoch": 0.0006367307840642852, + "grad_norm": 0.3818305432796478, + "learning_rate": 7.926544240400668e-05, + "loss": 0.5592, + "step": 1814 + }, + { + "epoch": 0.0006370817933168014, + "grad_norm": 0.29824909567832947, + "learning_rate": 7.919866444073457e-05, + "loss": 0.5157, + "step": 1815 + }, + { + "epoch": 0.0006374328025693175, + "grad_norm": 0.31353560090065, + "learning_rate": 7.913188647746244e-05, + "loss": 0.5991, + "step": 1816 + }, + { + "epoch": 0.0006377838118218337, + "grad_norm": 0.33129647374153137, + "learning_rate": 7.906510851419033e-05, + "loss": 0.54, + "step": 1817 + }, + { + "epoch": 0.0006381348210743498, + "grad_norm": 0.3199217915534973, + "learning_rate": 7.89983305509182e-05, + "loss": 0.4823, + "step": 1818 + }, + { + "epoch": 0.0006384858303268659, + "grad_norm": 0.2801882028579712, + "learning_rate": 7.893155258764609e-05, + "loss": 0.5379, + "step": 1819 + }, + { + "epoch": 0.0006388368395793821, + "grad_norm": 0.29676681756973267, + "learning_rate": 7.886477462437396e-05, + "loss": 0.5142, + "step": 1820 + }, + { + "epoch": 0.0006391878488318983, + "grad_norm": 0.3249494433403015, + "learning_rate": 7.879799666110183e-05, + "loss": 0.4743, + "step": 1821 + }, + { + "epoch": 0.0006395388580844144, + "grad_norm": 0.47364258766174316, + "learning_rate": 7.873121869782972e-05, + "loss": 0.5575, + "step": 1822 + }, + { + "epoch": 0.0006398898673369305, + "grad_norm": 0.310779869556427, + "learning_rate": 7.86644407345576e-05, + "loss": 0.5115, + "step": 1823 + }, + { + "epoch": 0.0006402408765894467, + "grad_norm": 0.26023536920547485, + "learning_rate": 7.859766277128548e-05, + "loss": 0.5084, + "step": 1824 + }, + { + "epoch": 0.0006405918858419629, + "grad_norm": 0.31088247895240784, + "learning_rate": 7.853088480801337e-05, + "loss": 0.513, + "step": 1825 + }, + { + "epoch": 0.0006409428950944789, + "grad_norm": 0.2561517357826233, + "learning_rate": 7.846410684474124e-05, + "loss": 0.4056, + "step": 1826 + }, + { + "epoch": 0.0006412939043469951, + "grad_norm": 0.28456807136535645, + "learning_rate": 7.839732888146912e-05, + "loss": 0.4895, + "step": 1827 + }, + { + "epoch": 0.0006416449135995113, + "grad_norm": 0.30845314264297485, + "learning_rate": 7.8330550918197e-05, + "loss": 0.5941, + "step": 1828 + }, + { + "epoch": 0.0006419959228520273, + "grad_norm": 0.30980512499809265, + "learning_rate": 7.826377295492487e-05, + "loss": 0.5307, + "step": 1829 + }, + { + "epoch": 0.0006423469321045435, + "grad_norm": 0.2923174500465393, + "learning_rate": 7.819699499165276e-05, + "loss": 0.4737, + "step": 1830 + }, + { + "epoch": 0.0006426979413570597, + "grad_norm": 0.3474715054035187, + "learning_rate": 7.813021702838063e-05, + "loss": 0.6606, + "step": 1831 + }, + { + "epoch": 0.0006430489506095759, + "grad_norm": 0.29576122760772705, + "learning_rate": 7.806343906510852e-05, + "loss": 0.4151, + "step": 1832 + }, + { + "epoch": 0.000643399959862092, + "grad_norm": 0.3127489686012268, + "learning_rate": 7.79966611018364e-05, + "loss": 0.5683, + "step": 1833 + }, + { + "epoch": 0.0006437509691146081, + "grad_norm": 0.32313060760498047, + "learning_rate": 7.792988313856428e-05, + "loss": 0.3911, + "step": 1834 + }, + { + "epoch": 0.0006441019783671243, + "grad_norm": 0.38172590732574463, + "learning_rate": 7.786310517529216e-05, + "loss": 0.4852, + "step": 1835 + }, + { + "epoch": 0.0006444529876196404, + "grad_norm": 0.38548141717910767, + "learning_rate": 7.779632721202004e-05, + "loss": 0.5238, + "step": 1836 + }, + { + "epoch": 0.0006448039968721565, + "grad_norm": 0.3326992392539978, + "learning_rate": 7.772954924874791e-05, + "loss": 0.5435, + "step": 1837 + }, + { + "epoch": 0.0006451550061246727, + "grad_norm": 0.2704392969608307, + "learning_rate": 7.76627712854758e-05, + "loss": 0.5049, + "step": 1838 + }, + { + "epoch": 0.0006455060153771888, + "grad_norm": 0.3688966929912567, + "learning_rate": 7.759599332220367e-05, + "loss": 0.5507, + "step": 1839 + }, + { + "epoch": 0.000645857024629705, + "grad_norm": 0.33513352274894714, + "learning_rate": 7.752921535893156e-05, + "loss": 0.59, + "step": 1840 + }, + { + "epoch": 0.0006462080338822211, + "grad_norm": 0.26873478293418884, + "learning_rate": 7.746243739565944e-05, + "loss": 0.4088, + "step": 1841 + }, + { + "epoch": 0.0006465590431347373, + "grad_norm": 0.41162189841270447, + "learning_rate": 7.739565943238732e-05, + "loss": 0.4159, + "step": 1842 + }, + { + "epoch": 0.0006469100523872534, + "grad_norm": 0.3542315661907196, + "learning_rate": 7.73288814691152e-05, + "loss": 0.6067, + "step": 1843 + }, + { + "epoch": 0.0006472610616397696, + "grad_norm": 0.39147111773490906, + "learning_rate": 7.726210350584308e-05, + "loss": 0.4139, + "step": 1844 + }, + { + "epoch": 0.0006476120708922857, + "grad_norm": 0.3200126588344574, + "learning_rate": 7.719532554257095e-05, + "loss": 0.4112, + "step": 1845 + }, + { + "epoch": 0.0006479630801448018, + "grad_norm": 0.34853747487068176, + "learning_rate": 7.712854757929884e-05, + "loss": 0.4983, + "step": 1846 + }, + { + "epoch": 0.000648314089397318, + "grad_norm": 0.2987789511680603, + "learning_rate": 7.706176961602671e-05, + "loss": 0.5186, + "step": 1847 + }, + { + "epoch": 0.0006486650986498342, + "grad_norm": 0.3692026436328888, + "learning_rate": 7.69949916527546e-05, + "loss": 0.4028, + "step": 1848 + }, + { + "epoch": 0.0006490161079023502, + "grad_norm": 0.26036712527275085, + "learning_rate": 7.692821368948247e-05, + "loss": 0.4971, + "step": 1849 + }, + { + "epoch": 0.0006493671171548664, + "grad_norm": 0.2928013801574707, + "learning_rate": 7.686143572621036e-05, + "loss": 0.549, + "step": 1850 + }, + { + "epoch": 0.0006497181264073826, + "grad_norm": 0.2794664204120636, + "learning_rate": 7.679465776293824e-05, + "loss": 0.4184, + "step": 1851 + }, + { + "epoch": 0.0006500691356598988, + "grad_norm": 0.282713919878006, + "learning_rate": 7.672787979966612e-05, + "loss": 0.4637, + "step": 1852 + }, + { + "epoch": 0.0006504201449124148, + "grad_norm": 0.3084028959274292, + "learning_rate": 7.666110183639399e-05, + "loss": 0.4423, + "step": 1853 + }, + { + "epoch": 0.000650771154164931, + "grad_norm": 0.35329973697662354, + "learning_rate": 7.659432387312188e-05, + "loss": 0.4868, + "step": 1854 + }, + { + "epoch": 0.0006511221634174472, + "grad_norm": 0.38975444436073303, + "learning_rate": 7.652754590984975e-05, + "loss": 0.3701, + "step": 1855 + }, + { + "epoch": 0.0006514731726699632, + "grad_norm": 0.2983016073703766, + "learning_rate": 7.646076794657764e-05, + "loss": 0.5407, + "step": 1856 + }, + { + "epoch": 0.0006518241819224794, + "grad_norm": 0.32849010825157166, + "learning_rate": 7.639398998330551e-05, + "loss": 0.548, + "step": 1857 + }, + { + "epoch": 0.0006521751911749956, + "grad_norm": 0.32322797179222107, + "learning_rate": 7.63272120200334e-05, + "loss": 0.4231, + "step": 1858 + }, + { + "epoch": 0.0006525262004275117, + "grad_norm": 0.2949173152446747, + "learning_rate": 7.626043405676128e-05, + "loss": 0.5777, + "step": 1859 + }, + { + "epoch": 0.0006528772096800278, + "grad_norm": 0.3120216727256775, + "learning_rate": 7.619365609348916e-05, + "loss": 0.4483, + "step": 1860 + }, + { + "epoch": 0.000653228218932544, + "grad_norm": 0.32363617420196533, + "learning_rate": 7.612687813021703e-05, + "loss": 0.5748, + "step": 1861 + }, + { + "epoch": 0.0006535792281850602, + "grad_norm": 0.3077629506587982, + "learning_rate": 7.606010016694492e-05, + "loss": 0.5135, + "step": 1862 + }, + { + "epoch": 0.0006539302374375763, + "grad_norm": 0.3201192319393158, + "learning_rate": 7.599332220367279e-05, + "loss": 0.6412, + "step": 1863 + }, + { + "epoch": 0.0006542812466900924, + "grad_norm": 0.3008538484573364, + "learning_rate": 7.592654424040068e-05, + "loss": 0.4858, + "step": 1864 + }, + { + "epoch": 0.0006546322559426086, + "grad_norm": 0.35019761323928833, + "learning_rate": 7.585976627712855e-05, + "loss": 0.4819, + "step": 1865 + }, + { + "epoch": 0.0006549832651951247, + "grad_norm": 0.39763036370277405, + "learning_rate": 7.579298831385642e-05, + "loss": 0.5775, + "step": 1866 + }, + { + "epoch": 0.0006553342744476409, + "grad_norm": 0.29005396366119385, + "learning_rate": 7.572621035058431e-05, + "loss": 0.4828, + "step": 1867 + }, + { + "epoch": 0.000655685283700157, + "grad_norm": 0.30613401532173157, + "learning_rate": 7.56594323873122e-05, + "loss": 0.4375, + "step": 1868 + }, + { + "epoch": 0.0006560362929526731, + "grad_norm": 0.3596465289592743, + "learning_rate": 7.559265442404007e-05, + "loss": 0.4468, + "step": 1869 + }, + { + "epoch": 0.0006563873022051893, + "grad_norm": 0.28737086057662964, + "learning_rate": 7.552587646076796e-05, + "loss": 0.5726, + "step": 1870 + }, + { + "epoch": 0.0006567383114577055, + "grad_norm": 0.38036370277404785, + "learning_rate": 7.545909849749583e-05, + "loss": 0.5747, + "step": 1871 + }, + { + "epoch": 0.0006570893207102216, + "grad_norm": 0.3192722499370575, + "learning_rate": 7.539232053422371e-05, + "loss": 0.5859, + "step": 1872 + }, + { + "epoch": 0.0006574403299627377, + "grad_norm": 0.2886595129966736, + "learning_rate": 7.532554257095159e-05, + "loss": 0.5099, + "step": 1873 + }, + { + "epoch": 0.0006577913392152539, + "grad_norm": 0.3017093241214752, + "learning_rate": 7.525876460767946e-05, + "loss": 0.4442, + "step": 1874 + }, + { + "epoch": 0.0006581423484677701, + "grad_norm": 0.3073802590370178, + "learning_rate": 7.519198664440735e-05, + "loss": 0.5022, + "step": 1875 + }, + { + "epoch": 0.0006584933577202861, + "grad_norm": 0.34113094210624695, + "learning_rate": 7.512520868113523e-05, + "loss": 0.5146, + "step": 1876 + }, + { + "epoch": 0.0006588443669728023, + "grad_norm": 0.32277509570121765, + "learning_rate": 7.505843071786311e-05, + "loss": 0.5743, + "step": 1877 + }, + { + "epoch": 0.0006591953762253185, + "grad_norm": 0.3168696463108063, + "learning_rate": 7.4991652754591e-05, + "loss": 0.417, + "step": 1878 + }, + { + "epoch": 0.0006595463854778346, + "grad_norm": 0.35164040327072144, + "learning_rate": 7.492487479131887e-05, + "loss": 0.5078, + "step": 1879 + }, + { + "epoch": 0.0006598973947303507, + "grad_norm": 0.3132971227169037, + "learning_rate": 7.485809682804675e-05, + "loss": 0.4293, + "step": 1880 + }, + { + "epoch": 0.0006602484039828669, + "grad_norm": 0.3158970773220062, + "learning_rate": 7.479131886477463e-05, + "loss": 0.5559, + "step": 1881 + }, + { + "epoch": 0.0006605994132353831, + "grad_norm": 0.3228873610496521, + "learning_rate": 7.47245409015025e-05, + "loss": 0.4935, + "step": 1882 + }, + { + "epoch": 0.0006609504224878992, + "grad_norm": 0.4734925925731659, + "learning_rate": 7.465776293823039e-05, + "loss": 0.3587, + "step": 1883 + }, + { + "epoch": 0.0006613014317404153, + "grad_norm": 0.33582058548927307, + "learning_rate": 7.459098497495826e-05, + "loss": 0.4987, + "step": 1884 + }, + { + "epoch": 0.0006616524409929315, + "grad_norm": 0.38209983706474304, + "learning_rate": 7.452420701168615e-05, + "loss": 0.4443, + "step": 1885 + }, + { + "epoch": 0.0006620034502454476, + "grad_norm": 0.3218359649181366, + "learning_rate": 7.445742904841403e-05, + "loss": 0.5087, + "step": 1886 + }, + { + "epoch": 0.0006623544594979637, + "grad_norm": 0.33005908131599426, + "learning_rate": 7.439065108514191e-05, + "loss": 0.5362, + "step": 1887 + }, + { + "epoch": 0.0006627054687504799, + "grad_norm": 0.4753172993659973, + "learning_rate": 7.43238731218698e-05, + "loss": 0.4474, + "step": 1888 + }, + { + "epoch": 0.000663056478002996, + "grad_norm": 0.3765251636505127, + "learning_rate": 7.425709515859767e-05, + "loss": 0.5993, + "step": 1889 + }, + { + "epoch": 0.0006634074872555122, + "grad_norm": 0.3113894462585449, + "learning_rate": 7.419031719532554e-05, + "loss": 0.4636, + "step": 1890 + }, + { + "epoch": 0.0006637584965080283, + "grad_norm": 0.30841702222824097, + "learning_rate": 7.412353923205343e-05, + "loss": 0.5326, + "step": 1891 + }, + { + "epoch": 0.0006641095057605445, + "grad_norm": 0.29381653666496277, + "learning_rate": 7.40567612687813e-05, + "loss": 0.325, + "step": 1892 + }, + { + "epoch": 0.0006644605150130606, + "grad_norm": 0.3482291102409363, + "learning_rate": 7.398998330550919e-05, + "loss": 0.4646, + "step": 1893 + }, + { + "epoch": 0.0006648115242655768, + "grad_norm": 0.2865064740180969, + "learning_rate": 7.392320534223707e-05, + "loss": 0.4789, + "step": 1894 + }, + { + "epoch": 0.0006651625335180929, + "grad_norm": 0.29580044746398926, + "learning_rate": 7.385642737896495e-05, + "loss": 0.5047, + "step": 1895 + }, + { + "epoch": 0.000665513542770609, + "grad_norm": 0.3370521068572998, + "learning_rate": 7.378964941569283e-05, + "loss": 0.5915, + "step": 1896 + }, + { + "epoch": 0.0006658645520231252, + "grad_norm": 0.2680570185184479, + "learning_rate": 7.37228714524207e-05, + "loss": 0.4602, + "step": 1897 + }, + { + "epoch": 0.0006662155612756414, + "grad_norm": 0.2855984568595886, + "learning_rate": 7.365609348914858e-05, + "loss": 0.5439, + "step": 1898 + }, + { + "epoch": 0.0006665665705281574, + "grad_norm": 0.28999075293540955, + "learning_rate": 7.358931552587647e-05, + "loss": 0.4828, + "step": 1899 + }, + { + "epoch": 0.0006669175797806736, + "grad_norm": 0.3230993151664734, + "learning_rate": 7.352253756260434e-05, + "loss": 0.5974, + "step": 1900 + }, + { + "epoch": 0.0006672685890331898, + "grad_norm": 0.28700417280197144, + "learning_rate": 7.345575959933221e-05, + "loss": 0.5179, + "step": 1901 + }, + { + "epoch": 0.000667619598285706, + "grad_norm": 0.2921486794948578, + "learning_rate": 7.33889816360601e-05, + "loss": 0.4727, + "step": 1902 + }, + { + "epoch": 0.000667970607538222, + "grad_norm": 0.3887636959552765, + "learning_rate": 7.332220367278799e-05, + "loss": 0.5334, + "step": 1903 + }, + { + "epoch": 0.0006683216167907382, + "grad_norm": 0.3640362322330475, + "learning_rate": 7.325542570951587e-05, + "loss": 0.5576, + "step": 1904 + }, + { + "epoch": 0.0006686726260432544, + "grad_norm": 0.2985169589519501, + "learning_rate": 7.318864774624375e-05, + "loss": 0.5544, + "step": 1905 + }, + { + "epoch": 0.0006690236352957705, + "grad_norm": 0.30294784903526306, + "learning_rate": 7.312186978297162e-05, + "loss": 0.5005, + "step": 1906 + }, + { + "epoch": 0.0006693746445482866, + "grad_norm": 0.2947355806827545, + "learning_rate": 7.30550918196995e-05, + "loss": 0.4879, + "step": 1907 + }, + { + "epoch": 0.0006697256538008028, + "grad_norm": 0.2764705419540405, + "learning_rate": 7.298831385642738e-05, + "loss": 0.4531, + "step": 1908 + }, + { + "epoch": 0.0006700766630533189, + "grad_norm": 0.4107155501842499, + "learning_rate": 7.292153589315525e-05, + "loss": 0.4532, + "step": 1909 + }, + { + "epoch": 0.000670427672305835, + "grad_norm": 0.28341203927993774, + "learning_rate": 7.285475792988314e-05, + "loss": 0.5424, + "step": 1910 + }, + { + "epoch": 0.0006707786815583512, + "grad_norm": 0.36663204431533813, + "learning_rate": 7.278797996661103e-05, + "loss": 0.599, + "step": 1911 + }, + { + "epoch": 0.0006711296908108674, + "grad_norm": 0.30708596110343933, + "learning_rate": 7.272120200333891e-05, + "loss": 0.5971, + "step": 1912 + }, + { + "epoch": 0.0006714807000633835, + "grad_norm": 0.3823882043361664, + "learning_rate": 7.265442404006679e-05, + "loss": 0.5367, + "step": 1913 + }, + { + "epoch": 0.0006718317093158996, + "grad_norm": 0.3780754804611206, + "learning_rate": 7.258764607679466e-05, + "loss": 0.5756, + "step": 1914 + }, + { + "epoch": 0.0006721827185684158, + "grad_norm": 0.31058263778686523, + "learning_rate": 7.252086811352255e-05, + "loss": 0.5966, + "step": 1915 + }, + { + "epoch": 0.0006725337278209319, + "grad_norm": 0.29191386699676514, + "learning_rate": 7.245409015025042e-05, + "loss": 0.6099, + "step": 1916 + }, + { + "epoch": 0.0006728847370734481, + "grad_norm": 0.3607024550437927, + "learning_rate": 7.238731218697829e-05, + "loss": 0.5779, + "step": 1917 + }, + { + "epoch": 0.0006732357463259642, + "grad_norm": 0.2735411524772644, + "learning_rate": 7.232053422370618e-05, + "loss": 0.5511, + "step": 1918 + }, + { + "epoch": 0.0006735867555784803, + "grad_norm": 0.37066903710365295, + "learning_rate": 7.225375626043405e-05, + "loss": 0.5984, + "step": 1919 + }, + { + "epoch": 0.0006739377648309965, + "grad_norm": 0.3535907566547394, + "learning_rate": 7.218697829716194e-05, + "loss": 0.5074, + "step": 1920 + }, + { + "epoch": 0.0006742887740835127, + "grad_norm": 0.2900503873825073, + "learning_rate": 7.212020033388982e-05, + "loss": 0.3989, + "step": 1921 + }, + { + "epoch": 0.0006746397833360288, + "grad_norm": 0.2970031201839447, + "learning_rate": 7.20534223706177e-05, + "loss": 0.5514, + "step": 1922 + }, + { + "epoch": 0.0006749907925885449, + "grad_norm": 0.30902254581451416, + "learning_rate": 7.198664440734558e-05, + "loss": 0.3982, + "step": 1923 + }, + { + "epoch": 0.0006753418018410611, + "grad_norm": 0.2622113823890686, + "learning_rate": 7.191986644407346e-05, + "loss": 0.4587, + "step": 1924 + }, + { + "epoch": 0.0006756928110935773, + "grad_norm": 0.30972495675086975, + "learning_rate": 7.185308848080133e-05, + "loss": 0.5435, + "step": 1925 + }, + { + "epoch": 0.0006760438203460933, + "grad_norm": 0.3070833384990692, + "learning_rate": 7.178631051752922e-05, + "loss": 0.5074, + "step": 1926 + }, + { + "epoch": 0.0006763948295986095, + "grad_norm": 0.3055395781993866, + "learning_rate": 7.171953255425709e-05, + "loss": 0.5999, + "step": 1927 + }, + { + "epoch": 0.0006767458388511257, + "grad_norm": 0.3127722144126892, + "learning_rate": 7.165275459098498e-05, + "loss": 0.5511, + "step": 1928 + }, + { + "epoch": 0.0006770968481036418, + "grad_norm": 0.3363809585571289, + "learning_rate": 7.158597662771286e-05, + "loss": 0.5415, + "step": 1929 + }, + { + "epoch": 0.0006774478573561579, + "grad_norm": 0.3258194625377655, + "learning_rate": 7.151919866444074e-05, + "loss": 0.5976, + "step": 1930 + }, + { + "epoch": 0.0006777988666086741, + "grad_norm": 0.3083065152168274, + "learning_rate": 7.145242070116862e-05, + "loss": 0.6067, + "step": 1931 + }, + { + "epoch": 0.0006781498758611903, + "grad_norm": 0.3474681079387665, + "learning_rate": 7.13856427378965e-05, + "loss": 0.5749, + "step": 1932 + }, + { + "epoch": 0.0006785008851137064, + "grad_norm": 0.3168641924858093, + "learning_rate": 7.131886477462437e-05, + "loss": 0.4242, + "step": 1933 + }, + { + "epoch": 0.0006788518943662225, + "grad_norm": 0.30177485942840576, + "learning_rate": 7.125208681135226e-05, + "loss": 0.4978, + "step": 1934 + }, + { + "epoch": 0.0006792029036187387, + "grad_norm": 0.3365834653377533, + "learning_rate": 7.118530884808013e-05, + "loss": 0.5994, + "step": 1935 + }, + { + "epoch": 0.0006795539128712548, + "grad_norm": 0.3282754123210907, + "learning_rate": 7.111853088480802e-05, + "loss": 0.615, + "step": 1936 + }, + { + "epoch": 0.000679904922123771, + "grad_norm": 0.24498236179351807, + "learning_rate": 7.105175292153589e-05, + "loss": 0.4254, + "step": 1937 + }, + { + "epoch": 0.0006802559313762871, + "grad_norm": 0.3450114130973816, + "learning_rate": 7.098497495826378e-05, + "loss": 0.5362, + "step": 1938 + }, + { + "epoch": 0.0006806069406288032, + "grad_norm": 0.28795021772384644, + "learning_rate": 7.091819699499166e-05, + "loss": 0.4984, + "step": 1939 + }, + { + "epoch": 0.0006809579498813194, + "grad_norm": 0.32352307438850403, + "learning_rate": 7.085141903171954e-05, + "loss": 0.4549, + "step": 1940 + }, + { + "epoch": 0.0006813089591338355, + "grad_norm": 0.34447386860847473, + "learning_rate": 7.078464106844741e-05, + "loss": 0.5349, + "step": 1941 + }, + { + "epoch": 0.0006816599683863517, + "grad_norm": 0.31918805837631226, + "learning_rate": 7.07178631051753e-05, + "loss": 0.5468, + "step": 1942 + }, + { + "epoch": 0.0006820109776388678, + "grad_norm": 0.3190132975578308, + "learning_rate": 7.065108514190317e-05, + "loss": 0.5348, + "step": 1943 + }, + { + "epoch": 0.000682361986891384, + "grad_norm": 0.32868409156799316, + "learning_rate": 7.058430717863106e-05, + "loss": 0.6209, + "step": 1944 + }, + { + "epoch": 0.0006827129961439001, + "grad_norm": 0.2713989317417145, + "learning_rate": 7.051752921535893e-05, + "loss": 0.4681, + "step": 1945 + }, + { + "epoch": 0.0006830640053964162, + "grad_norm": 0.35190147161483765, + "learning_rate": 7.045075125208682e-05, + "loss": 0.5415, + "step": 1946 + }, + { + "epoch": 0.0006834150146489324, + "grad_norm": 0.322889119386673, + "learning_rate": 7.03839732888147e-05, + "loss": 0.5586, + "step": 1947 + }, + { + "epoch": 0.0006837660239014486, + "grad_norm": 0.33939826488494873, + "learning_rate": 7.031719532554258e-05, + "loss": 0.5586, + "step": 1948 + }, + { + "epoch": 0.0006841170331539646, + "grad_norm": 0.3554326891899109, + "learning_rate": 7.025041736227045e-05, + "loss": 0.5386, + "step": 1949 + }, + { + "epoch": 0.0006844680424064808, + "grad_norm": 0.3021222949028015, + "learning_rate": 7.018363939899834e-05, + "loss": 0.5569, + "step": 1950 + }, + { + "epoch": 0.000684819051658997, + "grad_norm": 0.3286188244819641, + "learning_rate": 7.011686143572621e-05, + "loss": 0.5466, + "step": 1951 + }, + { + "epoch": 0.0006851700609115132, + "grad_norm": 0.302117258310318, + "learning_rate": 7.00500834724541e-05, + "loss": 0.4038, + "step": 1952 + }, + { + "epoch": 0.0006855210701640292, + "grad_norm": 0.3204907178878784, + "learning_rate": 6.998330550918197e-05, + "loss": 0.4429, + "step": 1953 + }, + { + "epoch": 0.0006858720794165454, + "grad_norm": 0.2782181203365326, + "learning_rate": 6.991652754590986e-05, + "loss": 0.4102, + "step": 1954 + }, + { + "epoch": 0.0006862230886690616, + "grad_norm": 0.31240731477737427, + "learning_rate": 6.984974958263774e-05, + "loss": 0.5353, + "step": 1955 + }, + { + "epoch": 0.0006865740979215777, + "grad_norm": 0.32677972316741943, + "learning_rate": 6.978297161936562e-05, + "loss": 0.4403, + "step": 1956 + }, + { + "epoch": 0.0006869251071740938, + "grad_norm": 0.33199426531791687, + "learning_rate": 6.971619365609349e-05, + "loss": 0.4433, + "step": 1957 + }, + { + "epoch": 0.00068727611642661, + "grad_norm": 0.2825728952884674, + "learning_rate": 6.964941569282138e-05, + "loss": 0.5624, + "step": 1958 + }, + { + "epoch": 0.0006876271256791261, + "grad_norm": 0.30743977427482605, + "learning_rate": 6.958263772954925e-05, + "loss": 0.565, + "step": 1959 + }, + { + "epoch": 0.0006879781349316423, + "grad_norm": 0.32357290387153625, + "learning_rate": 6.951585976627714e-05, + "loss": 0.596, + "step": 1960 + }, + { + "epoch": 0.0006883291441841584, + "grad_norm": 0.31747472286224365, + "learning_rate": 6.944908180300501e-05, + "loss": 0.5811, + "step": 1961 + }, + { + "epoch": 0.0006886801534366745, + "grad_norm": 0.3278048038482666, + "learning_rate": 6.938230383973288e-05, + "loss": 0.4468, + "step": 1962 + }, + { + "epoch": 0.0006890311626891907, + "grad_norm": 0.3308374285697937, + "learning_rate": 6.931552587646077e-05, + "loss": 0.6508, + "step": 1963 + }, + { + "epoch": 0.0006893821719417069, + "grad_norm": 0.3360099792480469, + "learning_rate": 6.924874791318865e-05, + "loss": 0.534, + "step": 1964 + }, + { + "epoch": 0.000689733181194223, + "grad_norm": 0.3039510250091553, + "learning_rate": 6.918196994991654e-05, + "loss": 0.5789, + "step": 1965 + }, + { + "epoch": 0.0006900841904467391, + "grad_norm": 0.3015453517436981, + "learning_rate": 6.911519198664441e-05, + "loss": 0.3639, + "step": 1966 + }, + { + "epoch": 0.0006904351996992553, + "grad_norm": 0.3157881498336792, + "learning_rate": 6.904841402337229e-05, + "loss": 0.5002, + "step": 1967 + }, + { + "epoch": 0.0006907862089517714, + "grad_norm": 0.28026652336120605, + "learning_rate": 6.898163606010017e-05, + "loss": 0.5289, + "step": 1968 + }, + { + "epoch": 0.0006911372182042875, + "grad_norm": 0.3170677125453949, + "learning_rate": 6.891485809682805e-05, + "loss": 0.6144, + "step": 1969 + }, + { + "epoch": 0.0006914882274568037, + "grad_norm": 0.3244359791278839, + "learning_rate": 6.884808013355592e-05, + "loss": 0.6176, + "step": 1970 + }, + { + "epoch": 0.0006918392367093199, + "grad_norm": 0.3142417371273041, + "learning_rate": 6.878130217028381e-05, + "loss": 0.6137, + "step": 1971 + }, + { + "epoch": 0.0006921902459618359, + "grad_norm": 0.3678075969219208, + "learning_rate": 6.87145242070117e-05, + "loss": 0.5228, + "step": 1972 + }, + { + "epoch": 0.0006925412552143521, + "grad_norm": 0.35631263256073, + "learning_rate": 6.864774624373958e-05, + "loss": 0.4831, + "step": 1973 + }, + { + "epoch": 0.0006928922644668683, + "grad_norm": 0.30589306354522705, + "learning_rate": 6.858096828046745e-05, + "loss": 0.47, + "step": 1974 + }, + { + "epoch": 0.0006932432737193845, + "grad_norm": 0.3037767708301544, + "learning_rate": 6.851419031719533e-05, + "loss": 0.5334, + "step": 1975 + }, + { + "epoch": 0.0006935942829719005, + "grad_norm": 0.3331162631511688, + "learning_rate": 6.844741235392321e-05, + "loss": 0.6051, + "step": 1976 + }, + { + "epoch": 0.0006939452922244167, + "grad_norm": 0.3342154622077942, + "learning_rate": 6.838063439065109e-05, + "loss": 0.5466, + "step": 1977 + }, + { + "epoch": 0.0006942963014769329, + "grad_norm": 0.3748263418674469, + "learning_rate": 6.831385642737896e-05, + "loss": 0.5265, + "step": 1978 + }, + { + "epoch": 0.000694647310729449, + "grad_norm": 0.33476313948631287, + "learning_rate": 6.824707846410685e-05, + "loss": 0.5298, + "step": 1979 + }, + { + "epoch": 0.0006949983199819651, + "grad_norm": 0.37101680040359497, + "learning_rate": 6.818030050083472e-05, + "loss": 0.5745, + "step": 1980 + }, + { + "epoch": 0.0006953493292344813, + "grad_norm": 0.3126341998577118, + "learning_rate": 6.811352253756261e-05, + "loss": 0.4874, + "step": 1981 + }, + { + "epoch": 0.0006957003384869974, + "grad_norm": 0.305896133184433, + "learning_rate": 6.80467445742905e-05, + "loss": 0.5187, + "step": 1982 + }, + { + "epoch": 0.0006960513477395136, + "grad_norm": 0.3486585319042206, + "learning_rate": 6.797996661101837e-05, + "loss": 0.5567, + "step": 1983 + }, + { + "epoch": 0.0006964023569920297, + "grad_norm": 0.33587202429771423, + "learning_rate": 6.791318864774625e-05, + "loss": 0.505, + "step": 1984 + }, + { + "epoch": 0.0006967533662445459, + "grad_norm": 0.32981690764427185, + "learning_rate": 6.784641068447413e-05, + "loss": 0.4372, + "step": 1985 + }, + { + "epoch": 0.000697104375497062, + "grad_norm": 0.30636945366859436, + "learning_rate": 6.7779632721202e-05, + "loss": 0.4731, + "step": 1986 + }, + { + "epoch": 0.0006974553847495782, + "grad_norm": 0.3573989272117615, + "learning_rate": 6.771285475792989e-05, + "loss": 0.6193, + "step": 1987 + }, + { + "epoch": 0.0006978063940020943, + "grad_norm": 0.3697716295719147, + "learning_rate": 6.764607679465776e-05, + "loss": 0.4243, + "step": 1988 + }, + { + "epoch": 0.0006981574032546104, + "grad_norm": 0.3072642385959625, + "learning_rate": 6.757929883138565e-05, + "loss": 0.5506, + "step": 1989 + }, + { + "epoch": 0.0006985084125071266, + "grad_norm": 0.3706247806549072, + "learning_rate": 6.751252086811353e-05, + "loss": 0.4897, + "step": 1990 + }, + { + "epoch": 0.0006988594217596428, + "grad_norm": 0.3179176449775696, + "learning_rate": 6.74457429048414e-05, + "loss": 0.582, + "step": 1991 + }, + { + "epoch": 0.0006992104310121588, + "grad_norm": 0.3597802519798279, + "learning_rate": 6.737896494156929e-05, + "loss": 0.5297, + "step": 1992 + }, + { + "epoch": 0.000699561440264675, + "grad_norm": 0.3542323410511017, + "learning_rate": 6.731218697829717e-05, + "loss": 0.5995, + "step": 1993 + }, + { + "epoch": 0.0006999124495171912, + "grad_norm": 0.3902435302734375, + "learning_rate": 6.724540901502504e-05, + "loss": 0.55, + "step": 1994 + }, + { + "epoch": 0.0007002634587697074, + "grad_norm": 0.433971107006073, + "learning_rate": 6.717863105175293e-05, + "loss": 0.5859, + "step": 1995 + }, + { + "epoch": 0.0007006144680222234, + "grad_norm": 0.30398884415626526, + "learning_rate": 6.71118530884808e-05, + "loss": 0.5749, + "step": 1996 + }, + { + "epoch": 0.0007009654772747396, + "grad_norm": 0.2854095995426178, + "learning_rate": 6.704507512520869e-05, + "loss": 0.5932, + "step": 1997 + }, + { + "epoch": 0.0007013164865272558, + "grad_norm": 0.3235953450202942, + "learning_rate": 6.697829716193656e-05, + "loss": 0.5921, + "step": 1998 + }, + { + "epoch": 0.0007016674957797718, + "grad_norm": 0.364388108253479, + "learning_rate": 6.691151919866445e-05, + "loss": 0.5932, + "step": 1999 + }, + { + "epoch": 0.000702018505032288, + "grad_norm": 0.2984377145767212, + "learning_rate": 6.684474123539233e-05, + "loss": 0.5099, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 3000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.69712584814592e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/marques/outputs/checkpoint-2000/training_args.bin b/marques/outputs/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd0ba520c124bb1ece608079704fa15e0236be45 --- /dev/null +++ b/marques/outputs/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09362706a3d58d219e41be1682b770b8f5069fcd630f7dbcadb71e4d4ce8859b +size 6289 diff --git a/marques/outputs/checkpoint-2500/README.md b/marques/outputs/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d90a96dfe2e51221657a6e936d376789e21081f9 --- /dev/null +++ b/marques/outputs/checkpoint-2500/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/marques/outputs/checkpoint-2500/adapter_config.json b/marques/outputs/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9930a191a30254256c9550b1bdffa58b8d7aee8 --- /dev/null +++ b/marques/outputs/checkpoint-2500/adapter_config.json @@ -0,0 +1,50 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "LlamaForCausalLM", + "parent_library": "transformers.models.llama.modeling_llama", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "gate_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/marques/outputs/checkpoint-2500/adapter_model.safetensors b/marques/outputs/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dfda0c2aa5eb7ea0254b59178f19fe8a3bd2a6d7 --- /dev/null +++ b/marques/outputs/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b559da449c45e32d7efc4b66aa78432336511496d950fedc6ecb00804695683c +size 167832240 diff --git a/marques/outputs/checkpoint-2500/optimizer.pt b/marques/outputs/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cc1e62566f0a7a1ee982eae717e0d0d40dbb91f --- /dev/null +++ b/marques/outputs/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4372d22a0956c45638d38a7987dacf8d7c39b9e3bf8b6e5f6fe85baaa8c1ae5 +size 85724133 diff --git a/marques/outputs/checkpoint-2500/rng_state.pth b/marques/outputs/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0 --- /dev/null +++ b/marques/outputs/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33 +size 14645 diff --git a/marques/outputs/checkpoint-2500/scheduler.pt b/marques/outputs/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c59bd6d60eed741f10acc7274d8126b95aeb69a6 --- /dev/null +++ b/marques/outputs/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aacf03c4147ab2f9ed283ccf05955a7bf4667d3e9f508a3442c413bd9884246 +size 1465 diff --git a/marques/outputs/checkpoint-2500/special_tokens_map.json b/marques/outputs/checkpoint-2500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68b10c7f0a479eae0c358eac6a14959b3f9acdf1 --- /dev/null +++ b/marques/outputs/checkpoint-2500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/marques/outputs/checkpoint-2500/tokenizer.json b/marques/outputs/checkpoint-2500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/marques/outputs/checkpoint-2500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/marques/outputs/checkpoint-2500/tokenizer_config.json b/marques/outputs/checkpoint-2500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..92b1d94e894e5474ebea1d171e14751be79ca3e5 --- /dev/null +++ b/marques/outputs/checkpoint-2500/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/marques/outputs/checkpoint-2500/trainer_state.json b/marques/outputs/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..063eb977e851f556deef3fad6ee122048b6efca9 --- /dev/null +++ b/marques/outputs/checkpoint-2500/trainer_state.json @@ -0,0 +1,17534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.00087752313129036, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.5100925251614403e-07, + "grad_norm": 0.53782719373703, + "learning_rate": 0.0, + "loss": 0.5835, + "step": 1 + }, + { + "epoch": 7.020185050322881e-07, + "grad_norm": 0.6201626062393188, + "learning_rate": 4e-05, + "loss": 0.5242, + "step": 2 + }, + { + "epoch": 1.053027757548432e-06, + "grad_norm": 0.7571901082992554, + "learning_rate": 8e-05, + "loss": 0.5642, + "step": 3 + }, + { + "epoch": 1.4040370100645761e-06, + "grad_norm": 0.5588695406913757, + "learning_rate": 0.00012, + "loss": 0.4859, + "step": 4 + }, + { + "epoch": 1.75504626258072e-06, + "grad_norm": 0.7208331227302551, + "learning_rate": 0.00016, + "loss": 0.4645, + "step": 5 + }, + { + "epoch": 2.106055515096864e-06, + "grad_norm": 0.8169743418693542, + "learning_rate": 0.0002, + "loss": 0.3702, + "step": 6 + }, + { + "epoch": 2.4570647676130083e-06, + "grad_norm": 2.051530599594116, + "learning_rate": 0.00019993322203672788, + "loss": 0.4856, + "step": 7 + }, + { + "epoch": 2.8080740201291522e-06, + "grad_norm": 1.2310550212860107, + "learning_rate": 0.00019986644407345576, + "loss": 0.5192, + "step": 8 + }, + { + "epoch": 3.1590832726452962e-06, + "grad_norm": 1.612046241760254, + "learning_rate": 0.00019979966611018366, + "loss": 0.4719, + "step": 9 + }, + { + "epoch": 3.51009252516144e-06, + "grad_norm": 1.4484680891036987, + "learning_rate": 0.00019973288814691153, + "loss": 0.4416, + "step": 10 + }, + { + "epoch": 3.861101777677584e-06, + "grad_norm": 1.4529719352722168, + "learning_rate": 0.0001996661101836394, + "loss": 0.6275, + "step": 11 + }, + { + "epoch": 4.212111030193728e-06, + "grad_norm": 1.3963671922683716, + "learning_rate": 0.00019959933222036728, + "loss": 0.5874, + "step": 12 + }, + { + "epoch": 4.563120282709872e-06, + "grad_norm": 1.4744153022766113, + "learning_rate": 0.00019953255425709515, + "loss": 0.6422, + "step": 13 + }, + { + "epoch": 4.9141295352260165e-06, + "grad_norm": 0.8640050888061523, + "learning_rate": 0.00019946577629382305, + "loss": 0.5064, + "step": 14 + }, + { + "epoch": 5.26513878774216e-06, + "grad_norm": 0.7137419581413269, + "learning_rate": 0.00019939899833055092, + "loss": 0.5218, + "step": 15 + }, + { + "epoch": 5.6161480402583045e-06, + "grad_norm": 0.7769026756286621, + "learning_rate": 0.00019933222036727882, + "loss": 0.5377, + "step": 16 + }, + { + "epoch": 5.967157292774448e-06, + "grad_norm": 0.7558479905128479, + "learning_rate": 0.0001992654424040067, + "loss": 0.5054, + "step": 17 + }, + { + "epoch": 6.3181665452905924e-06, + "grad_norm": 0.8237054347991943, + "learning_rate": 0.00019919866444073457, + "loss": 0.5094, + "step": 18 + }, + { + "epoch": 6.669175797806736e-06, + "grad_norm": 1.0375059843063354, + "learning_rate": 0.00019913188647746244, + "loss": 0.5751, + "step": 19 + }, + { + "epoch": 7.02018505032288e-06, + "grad_norm": 1.075869083404541, + "learning_rate": 0.00019906510851419034, + "loss": 0.594, + "step": 20 + }, + { + "epoch": 7.371194302839024e-06, + "grad_norm": 0.8041358590126038, + "learning_rate": 0.00019899833055091822, + "loss": 0.553, + "step": 21 + }, + { + "epoch": 7.722203555355168e-06, + "grad_norm": 0.9264736771583557, + "learning_rate": 0.0001989315525876461, + "loss": 0.5555, + "step": 22 + }, + { + "epoch": 8.073212807871313e-06, + "grad_norm": 1.0074031352996826, + "learning_rate": 0.00019886477462437396, + "loss": 0.5353, + "step": 23 + }, + { + "epoch": 8.424222060387455e-06, + "grad_norm": 0.8725020885467529, + "learning_rate": 0.00019879799666110183, + "loss": 0.5557, + "step": 24 + }, + { + "epoch": 8.7752313129036e-06, + "grad_norm": 0.8867582678794861, + "learning_rate": 0.00019873121869782974, + "loss": 0.5992, + "step": 25 + }, + { + "epoch": 9.126240565419744e-06, + "grad_norm": 0.9235608577728271, + "learning_rate": 0.0001986644407345576, + "loss": 0.516, + "step": 26 + }, + { + "epoch": 9.477249817935889e-06, + "grad_norm": 0.8653218150138855, + "learning_rate": 0.00019859766277128548, + "loss": 0.5249, + "step": 27 + }, + { + "epoch": 9.828259070452033e-06, + "grad_norm": 0.7479026913642883, + "learning_rate": 0.00019853088480801335, + "loss": 0.5037, + "step": 28 + }, + { + "epoch": 1.0179268322968176e-05, + "grad_norm": 0.9531452655792236, + "learning_rate": 0.00019846410684474123, + "loss": 0.5896, + "step": 29 + }, + { + "epoch": 1.053027757548432e-05, + "grad_norm": 1.1012492179870605, + "learning_rate": 0.00019839732888146913, + "loss": 0.5139, + "step": 30 + }, + { + "epoch": 1.0881286828000465e-05, + "grad_norm": 1.0198887586593628, + "learning_rate": 0.000198330550918197, + "loss": 0.5587, + "step": 31 + }, + { + "epoch": 1.1232296080516609e-05, + "grad_norm": 0.8081266283988953, + "learning_rate": 0.00019826377295492487, + "loss": 0.4762, + "step": 32 + }, + { + "epoch": 1.1583305333032752e-05, + "grad_norm": 1.1965891122817993, + "learning_rate": 0.00019819699499165277, + "loss": 0.5719, + "step": 33 + }, + { + "epoch": 1.1934314585548896e-05, + "grad_norm": 1.214903473854065, + "learning_rate": 0.00019813021702838065, + "loss": 0.5756, + "step": 34 + }, + { + "epoch": 1.228532383806504e-05, + "grad_norm": 0.8360006213188171, + "learning_rate": 0.00019806343906510852, + "loss": 0.5688, + "step": 35 + }, + { + "epoch": 1.2636333090581185e-05, + "grad_norm": 0.8328489065170288, + "learning_rate": 0.00019799666110183642, + "loss": 0.6418, + "step": 36 + }, + { + "epoch": 1.298734234309733e-05, + "grad_norm": 1.1427714824676514, + "learning_rate": 0.0001979298831385643, + "loss": 0.6531, + "step": 37 + }, + { + "epoch": 1.3338351595613472e-05, + "grad_norm": 1.0145376920700073, + "learning_rate": 0.00019786310517529217, + "loss": 0.6473, + "step": 38 + }, + { + "epoch": 1.3689360848129616e-05, + "grad_norm": 0.8427861928939819, + "learning_rate": 0.00019779632721202004, + "loss": 0.5882, + "step": 39 + }, + { + "epoch": 1.404037010064576e-05, + "grad_norm": 0.8792659044265747, + "learning_rate": 0.00019772954924874791, + "loss": 0.608, + "step": 40 + }, + { + "epoch": 1.4391379353161905e-05, + "grad_norm": 0.9338463544845581, + "learning_rate": 0.00019766277128547581, + "loss": 0.7118, + "step": 41 + }, + { + "epoch": 1.4742388605678048e-05, + "grad_norm": 0.7554420232772827, + "learning_rate": 0.0001975959933222037, + "loss": 0.5898, + "step": 42 + }, + { + "epoch": 1.5093397858194192e-05, + "grad_norm": 0.7700084447860718, + "learning_rate": 0.00019752921535893156, + "loss": 0.6466, + "step": 43 + }, + { + "epoch": 1.5444407110710337e-05, + "grad_norm": 0.8639333248138428, + "learning_rate": 0.00019746243739565943, + "loss": 0.7253, + "step": 44 + }, + { + "epoch": 1.579541636322648e-05, + "grad_norm": 0.7760612964630127, + "learning_rate": 0.0001973956594323873, + "loss": 0.7099, + "step": 45 + }, + { + "epoch": 1.6146425615742626e-05, + "grad_norm": 0.7319066524505615, + "learning_rate": 0.0001973288814691152, + "loss": 0.6664, + "step": 46 + }, + { + "epoch": 1.6497434868258768e-05, + "grad_norm": 0.7557100057601929, + "learning_rate": 0.00019726210350584308, + "loss": 0.6318, + "step": 47 + }, + { + "epoch": 1.684844412077491e-05, + "grad_norm": 0.6420389413833618, + "learning_rate": 0.00019719532554257095, + "loss": 0.6688, + "step": 48 + }, + { + "epoch": 1.7199453373291057e-05, + "grad_norm": 0.660383939743042, + "learning_rate": 0.00019712854757929883, + "loss": 0.6204, + "step": 49 + }, + { + "epoch": 1.75504626258072e-05, + "grad_norm": 0.5614909529685974, + "learning_rate": 0.00019706176961602673, + "loss": 0.664, + "step": 50 + }, + { + "epoch": 1.7901471878323346e-05, + "grad_norm": 0.502738356590271, + "learning_rate": 0.0001969949916527546, + "loss": 0.6918, + "step": 51 + }, + { + "epoch": 1.825248113083949e-05, + "grad_norm": 0.47578102350234985, + "learning_rate": 0.0001969282136894825, + "loss": 0.6747, + "step": 52 + }, + { + "epoch": 1.860349038335563e-05, + "grad_norm": 0.5528931617736816, + "learning_rate": 0.00019686143572621037, + "loss": 0.765, + "step": 53 + }, + { + "epoch": 1.8954499635871777e-05, + "grad_norm": 0.6176997423171997, + "learning_rate": 0.00019679465776293825, + "loss": 0.5959, + "step": 54 + }, + { + "epoch": 1.930550888838792e-05, + "grad_norm": 0.43425047397613525, + "learning_rate": 0.00019672787979966612, + "loss": 0.6437, + "step": 55 + }, + { + "epoch": 1.9656518140904066e-05, + "grad_norm": 0.5135884881019592, + "learning_rate": 0.000196661101836394, + "loss": 0.7019, + "step": 56 + }, + { + "epoch": 2.000752739342021e-05, + "grad_norm": 0.4628916084766388, + "learning_rate": 0.0001965943238731219, + "loss": 0.5722, + "step": 57 + }, + { + "epoch": 2.035853664593635e-05, + "grad_norm": 0.48201897740364075, + "learning_rate": 0.00019652754590984977, + "loss": 0.6288, + "step": 58 + }, + { + "epoch": 2.0709545898452498e-05, + "grad_norm": 0.5772811770439148, + "learning_rate": 0.00019646076794657764, + "loss": 0.6067, + "step": 59 + }, + { + "epoch": 2.106055515096864e-05, + "grad_norm": 0.4976802170276642, + "learning_rate": 0.0001963939899833055, + "loss": 0.4722, + "step": 60 + }, + { + "epoch": 2.1411564403484786e-05, + "grad_norm": 0.4842129051685333, + "learning_rate": 0.00019632721202003339, + "loss": 0.5876, + "step": 61 + }, + { + "epoch": 2.176257365600093e-05, + "grad_norm": 0.46149536967277527, + "learning_rate": 0.00019626043405676129, + "loss": 0.6373, + "step": 62 + }, + { + "epoch": 2.2113582908517072e-05, + "grad_norm": 0.47199445962905884, + "learning_rate": 0.00019619365609348916, + "loss": 0.5546, + "step": 63 + }, + { + "epoch": 2.2464592161033218e-05, + "grad_norm": 0.6109340190887451, + "learning_rate": 0.00019612687813021703, + "loss": 0.6069, + "step": 64 + }, + { + "epoch": 2.281560141354936e-05, + "grad_norm": 0.5529135465621948, + "learning_rate": 0.0001960601001669449, + "loss": 0.553, + "step": 65 + }, + { + "epoch": 2.3166610666065503e-05, + "grad_norm": 0.500245213508606, + "learning_rate": 0.00019599332220367278, + "loss": 0.6149, + "step": 66 + }, + { + "epoch": 2.351761991858165e-05, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00019592654424040068, + "loss": 0.6509, + "step": 67 + }, + { + "epoch": 2.3868629171097792e-05, + "grad_norm": 0.5308504104614258, + "learning_rate": 0.00019585976627712855, + "loss": 0.7017, + "step": 68 + }, + { + "epoch": 2.4219638423613938e-05, + "grad_norm": 0.5157874822616577, + "learning_rate": 0.00019579298831385645, + "loss": 0.7125, + "step": 69 + }, + { + "epoch": 2.457064767613008e-05, + "grad_norm": 0.47787800431251526, + "learning_rate": 0.00019572621035058433, + "loss": 0.5792, + "step": 70 + }, + { + "epoch": 2.4921656928646224e-05, + "grad_norm": 0.46792763471603394, + "learning_rate": 0.0001956594323873122, + "loss": 0.7, + "step": 71 + }, + { + "epoch": 2.527266618116237e-05, + "grad_norm": 0.5394675135612488, + "learning_rate": 0.00019559265442404007, + "loss": 0.5549, + "step": 72 + }, + { + "epoch": 2.5623675433678512e-05, + "grad_norm": 0.45065200328826904, + "learning_rate": 0.00019552587646076797, + "loss": 0.6663, + "step": 73 + }, + { + "epoch": 2.597468468619466e-05, + "grad_norm": 0.4026688039302826, + "learning_rate": 0.00019545909849749584, + "loss": 0.6315, + "step": 74 + }, + { + "epoch": 2.63256939387108e-05, + "grad_norm": 0.42353659868240356, + "learning_rate": 0.00019539232053422372, + "loss": 0.5419, + "step": 75 + }, + { + "epoch": 2.6676703191226944e-05, + "grad_norm": 0.45561954379081726, + "learning_rate": 0.0001953255425709516, + "loss": 0.6624, + "step": 76 + }, + { + "epoch": 2.702771244374309e-05, + "grad_norm": 0.3954075574874878, + "learning_rate": 0.00019525876460767946, + "loss": 0.5479, + "step": 77 + }, + { + "epoch": 2.7378721696259233e-05, + "grad_norm": 0.4994329512119293, + "learning_rate": 0.00019519198664440736, + "loss": 0.7224, + "step": 78 + }, + { + "epoch": 2.7729730948775375e-05, + "grad_norm": 0.41149672865867615, + "learning_rate": 0.00019512520868113524, + "loss": 0.5621, + "step": 79 + }, + { + "epoch": 2.808074020129152e-05, + "grad_norm": 0.4199008345603943, + "learning_rate": 0.0001950584307178631, + "loss": 0.7038, + "step": 80 + }, + { + "epoch": 2.8431749453807664e-05, + "grad_norm": 0.4378969371318817, + "learning_rate": 0.00019499165275459098, + "loss": 0.6654, + "step": 81 + }, + { + "epoch": 2.878275870632381e-05, + "grad_norm": 0.4653928279876709, + "learning_rate": 0.00019492487479131886, + "loss": 0.6241, + "step": 82 + }, + { + "epoch": 2.9133767958839953e-05, + "grad_norm": 0.5166454911231995, + "learning_rate": 0.00019485809682804673, + "loss": 0.5366, + "step": 83 + }, + { + "epoch": 2.9484777211356096e-05, + "grad_norm": 0.43180733919143677, + "learning_rate": 0.00019479131886477463, + "loss": 0.6178, + "step": 84 + }, + { + "epoch": 2.9835786463872242e-05, + "grad_norm": 0.44828200340270996, + "learning_rate": 0.0001947245409015025, + "loss": 0.6706, + "step": 85 + }, + { + "epoch": 3.0186795716388385e-05, + "grad_norm": 0.384175181388855, + "learning_rate": 0.0001946577629382304, + "loss": 0.5551, + "step": 86 + }, + { + "epoch": 3.053780496890453e-05, + "grad_norm": 0.4359772503376007, + "learning_rate": 0.00019459098497495828, + "loss": 0.5626, + "step": 87 + }, + { + "epoch": 3.0888814221420673e-05, + "grad_norm": 0.4177016615867615, + "learning_rate": 0.00019452420701168615, + "loss": 0.6023, + "step": 88 + }, + { + "epoch": 3.1239823473936816e-05, + "grad_norm": 0.43592438101768494, + "learning_rate": 0.00019445742904841405, + "loss": 0.682, + "step": 89 + }, + { + "epoch": 3.159083272645296e-05, + "grad_norm": 0.48027974367141724, + "learning_rate": 0.00019439065108514192, + "loss": 0.7596, + "step": 90 + }, + { + "epoch": 3.194184197896911e-05, + "grad_norm": 0.35989537835121155, + "learning_rate": 0.0001943238731218698, + "loss": 0.6018, + "step": 91 + }, + { + "epoch": 3.229285123148525e-05, + "grad_norm": 0.48477092385292053, + "learning_rate": 0.00019425709515859767, + "loss": 0.512, + "step": 92 + }, + { + "epoch": 3.2643860484001394e-05, + "grad_norm": 0.38858646154403687, + "learning_rate": 0.00019419031719532554, + "loss": 0.6371, + "step": 93 + }, + { + "epoch": 3.2994869736517536e-05, + "grad_norm": 0.5323147177696228, + "learning_rate": 0.00019412353923205344, + "loss": 0.5221, + "step": 94 + }, + { + "epoch": 3.334587898903368e-05, + "grad_norm": 0.3784274160861969, + "learning_rate": 0.00019405676126878132, + "loss": 0.6158, + "step": 95 + }, + { + "epoch": 3.369688824154982e-05, + "grad_norm": 0.4076334834098816, + "learning_rate": 0.0001939899833055092, + "loss": 0.5535, + "step": 96 + }, + { + "epoch": 3.404789749406597e-05, + "grad_norm": 0.43930479884147644, + "learning_rate": 0.00019392320534223706, + "loss": 0.6482, + "step": 97 + }, + { + "epoch": 3.4398906746582114e-05, + "grad_norm": 0.4266909658908844, + "learning_rate": 0.00019385642737896494, + "loss": 0.6, + "step": 98 + }, + { + "epoch": 3.474991599909826e-05, + "grad_norm": 0.45353513956069946, + "learning_rate": 0.0001937896494156928, + "loss": 0.6596, + "step": 99 + }, + { + "epoch": 3.51009252516144e-05, + "grad_norm": 0.3424838185310364, + "learning_rate": 0.0001937228714524207, + "loss": 0.555, + "step": 100 + }, + { + "epoch": 3.545193450413054e-05, + "grad_norm": 0.40126165747642517, + "learning_rate": 0.00019365609348914858, + "loss": 0.6921, + "step": 101 + }, + { + "epoch": 3.580294375664669e-05, + "grad_norm": 0.36572012305259705, + "learning_rate": 0.00019358931552587646, + "loss": 0.5485, + "step": 102 + }, + { + "epoch": 3.6153953009162834e-05, + "grad_norm": 0.3972407281398773, + "learning_rate": 0.00019352253756260436, + "loss": 0.5884, + "step": 103 + }, + { + "epoch": 3.650496226167898e-05, + "grad_norm": 0.3900579512119293, + "learning_rate": 0.00019345575959933223, + "loss": 0.6664, + "step": 104 + }, + { + "epoch": 3.685597151419512e-05, + "grad_norm": 0.31666621565818787, + "learning_rate": 0.00019338898163606013, + "loss": 0.5009, + "step": 105 + }, + { + "epoch": 3.720698076671126e-05, + "grad_norm": 0.5269597172737122, + "learning_rate": 0.000193322203672788, + "loss": 0.6292, + "step": 106 + }, + { + "epoch": 3.755799001922741e-05, + "grad_norm": 0.4645126163959503, + "learning_rate": 0.00019325542570951588, + "loss": 0.636, + "step": 107 + }, + { + "epoch": 3.7908999271743555e-05, + "grad_norm": 0.3900754153728485, + "learning_rate": 0.00019318864774624375, + "loss": 0.5367, + "step": 108 + }, + { + "epoch": 3.82600085242597e-05, + "grad_norm": 0.42533883452415466, + "learning_rate": 0.00019312186978297162, + "loss": 0.6862, + "step": 109 + }, + { + "epoch": 3.861101777677584e-05, + "grad_norm": 0.6809422969818115, + "learning_rate": 0.00019305509181969952, + "loss": 0.6434, + "step": 110 + }, + { + "epoch": 3.896202702929198e-05, + "grad_norm": 0.5127860307693481, + "learning_rate": 0.0001929883138564274, + "loss": 0.6266, + "step": 111 + }, + { + "epoch": 3.931303628180813e-05, + "grad_norm": 0.5254234671592712, + "learning_rate": 0.00019292153589315527, + "loss": 0.6982, + "step": 112 + }, + { + "epoch": 3.9664045534324275e-05, + "grad_norm": 0.3699031472206116, + "learning_rate": 0.00019285475792988314, + "loss": 0.6037, + "step": 113 + }, + { + "epoch": 4.001505478684042e-05, + "grad_norm": 0.3807130455970764, + "learning_rate": 0.00019278797996661101, + "loss": 0.5861, + "step": 114 + }, + { + "epoch": 4.036606403935656e-05, + "grad_norm": 0.4455645978450775, + "learning_rate": 0.0001927212020033389, + "loss": 0.5658, + "step": 115 + }, + { + "epoch": 4.07170732918727e-05, + "grad_norm": 0.3830210864543915, + "learning_rate": 0.0001926544240400668, + "loss": 0.606, + "step": 116 + }, + { + "epoch": 4.106808254438885e-05, + "grad_norm": 0.41419631242752075, + "learning_rate": 0.00019258764607679466, + "loss": 0.6095, + "step": 117 + }, + { + "epoch": 4.1419091796904995e-05, + "grad_norm": 0.3929574489593506, + "learning_rate": 0.00019252086811352253, + "loss": 0.6464, + "step": 118 + }, + { + "epoch": 4.177010104942114e-05, + "grad_norm": 0.35958629846572876, + "learning_rate": 0.0001924540901502504, + "loss": 0.5185, + "step": 119 + }, + { + "epoch": 4.212111030193728e-05, + "grad_norm": 0.3790556490421295, + "learning_rate": 0.0001923873121869783, + "loss": 0.5156, + "step": 120 + }, + { + "epoch": 4.2472119554453423e-05, + "grad_norm": 0.37452438473701477, + "learning_rate": 0.00019232053422370618, + "loss": 0.5711, + "step": 121 + }, + { + "epoch": 4.282312880696957e-05, + "grad_norm": 0.38976770639419556, + "learning_rate": 0.00019225375626043408, + "loss": 0.6075, + "step": 122 + }, + { + "epoch": 4.3174138059485716e-05, + "grad_norm": 0.4098513424396515, + "learning_rate": 0.00019218697829716195, + "loss": 0.5312, + "step": 123 + }, + { + "epoch": 4.352514731200186e-05, + "grad_norm": 0.33890047669410706, + "learning_rate": 0.00019212020033388983, + "loss": 0.4984, + "step": 124 + }, + { + "epoch": 4.3876156564518e-05, + "grad_norm": 0.49077001214027405, + "learning_rate": 0.0001920534223706177, + "loss": 0.7159, + "step": 125 + }, + { + "epoch": 4.4227165817034144e-05, + "grad_norm": 0.41653814911842346, + "learning_rate": 0.0001919866444073456, + "loss": 0.5642, + "step": 126 + }, + { + "epoch": 4.4578175069550286e-05, + "grad_norm": 0.45710283517837524, + "learning_rate": 0.00019191986644407347, + "loss": 0.6936, + "step": 127 + }, + { + "epoch": 4.4929184322066436e-05, + "grad_norm": 0.36976873874664307, + "learning_rate": 0.00019185308848080135, + "loss": 0.5407, + "step": 128 + }, + { + "epoch": 4.528019357458258e-05, + "grad_norm": 0.42852675914764404, + "learning_rate": 0.00019178631051752922, + "loss": 0.6731, + "step": 129 + }, + { + "epoch": 4.563120282709872e-05, + "grad_norm": 0.5426310300827026, + "learning_rate": 0.0001917195325542571, + "loss": 0.5775, + "step": 130 + }, + { + "epoch": 4.5982212079614864e-05, + "grad_norm": 0.38442543148994446, + "learning_rate": 0.00019165275459098497, + "loss": 0.5994, + "step": 131 + }, + { + "epoch": 4.633322133213101e-05, + "grad_norm": 0.4298035502433777, + "learning_rate": 0.00019158597662771287, + "loss": 0.5563, + "step": 132 + }, + { + "epoch": 4.6684230584647156e-05, + "grad_norm": 0.40397605299949646, + "learning_rate": 0.00019151919866444074, + "loss": 0.6924, + "step": 133 + }, + { + "epoch": 4.70352398371633e-05, + "grad_norm": 0.4338497519493103, + "learning_rate": 0.0001914524207011686, + "loss": 0.5739, + "step": 134 + }, + { + "epoch": 4.738624908967944e-05, + "grad_norm": 0.39713653922080994, + "learning_rate": 0.0001913856427378965, + "loss": 0.4529, + "step": 135 + }, + { + "epoch": 4.7737258342195584e-05, + "grad_norm": 0.31409478187561035, + "learning_rate": 0.0001913188647746244, + "loss": 0.562, + "step": 136 + }, + { + "epoch": 4.808826759471173e-05, + "grad_norm": 0.371624618768692, + "learning_rate": 0.00019125208681135226, + "loss": 0.5288, + "step": 137 + }, + { + "epoch": 4.8439276847227877e-05, + "grad_norm": 0.4600190818309784, + "learning_rate": 0.00019118530884808016, + "loss": 0.6215, + "step": 138 + }, + { + "epoch": 4.879028609974402e-05, + "grad_norm": 0.45351359248161316, + "learning_rate": 0.00019111853088480803, + "loss": 0.686, + "step": 139 + }, + { + "epoch": 4.914129535226016e-05, + "grad_norm": 0.42282962799072266, + "learning_rate": 0.0001910517529215359, + "loss": 0.5966, + "step": 140 + }, + { + "epoch": 4.9492304604776305e-05, + "grad_norm": 0.41479986906051636, + "learning_rate": 0.00019098497495826378, + "loss": 0.5948, + "step": 141 + }, + { + "epoch": 4.984331385729245e-05, + "grad_norm": 0.40453553199768066, + "learning_rate": 0.00019091819699499168, + "loss": 0.6411, + "step": 142 + }, + { + "epoch": 5.01943231098086e-05, + "grad_norm": 0.3939369320869446, + "learning_rate": 0.00019085141903171955, + "loss": 0.5513, + "step": 143 + }, + { + "epoch": 5.054533236232474e-05, + "grad_norm": 0.3700481653213501, + "learning_rate": 0.00019078464106844743, + "loss": 0.5459, + "step": 144 + }, + { + "epoch": 5.089634161484088e-05, + "grad_norm": 0.4377487897872925, + "learning_rate": 0.0001907178631051753, + "loss": 0.6076, + "step": 145 + }, + { + "epoch": 5.1247350867357025e-05, + "grad_norm": 0.37919673323631287, + "learning_rate": 0.00019065108514190317, + "loss": 0.5207, + "step": 146 + }, + { + "epoch": 5.159836011987317e-05, + "grad_norm": 0.3841630816459656, + "learning_rate": 0.00019058430717863107, + "loss": 0.614, + "step": 147 + }, + { + "epoch": 5.194936937238932e-05, + "grad_norm": 0.43541714549064636, + "learning_rate": 0.00019051752921535895, + "loss": 0.6283, + "step": 148 + }, + { + "epoch": 5.230037862490546e-05, + "grad_norm": 0.4853285253047943, + "learning_rate": 0.00019045075125208682, + "loss": 0.5807, + "step": 149 + }, + { + "epoch": 5.26513878774216e-05, + "grad_norm": 0.3572970926761627, + "learning_rate": 0.0001903839732888147, + "loss": 0.6866, + "step": 150 + }, + { + "epoch": 5.3002397129937745e-05, + "grad_norm": 0.3674347698688507, + "learning_rate": 0.00019031719532554257, + "loss": 0.5552, + "step": 151 + }, + { + "epoch": 5.335340638245389e-05, + "grad_norm": 0.37748461961746216, + "learning_rate": 0.00019025041736227044, + "loss": 0.6278, + "step": 152 + }, + { + "epoch": 5.370441563497003e-05, + "grad_norm": 0.3788503408432007, + "learning_rate": 0.00019018363939899834, + "loss": 0.622, + "step": 153 + }, + { + "epoch": 5.405542488748618e-05, + "grad_norm": 0.3736303150653839, + "learning_rate": 0.0001901168614357262, + "loss": 0.5822, + "step": 154 + }, + { + "epoch": 5.440643414000232e-05, + "grad_norm": 0.32680070400238037, + "learning_rate": 0.0001900500834724541, + "loss": 0.5715, + "step": 155 + }, + { + "epoch": 5.4757443392518466e-05, + "grad_norm": 0.34495192766189575, + "learning_rate": 0.00018998330550918199, + "loss": 0.6497, + "step": 156 + }, + { + "epoch": 5.510845264503461e-05, + "grad_norm": 0.4244193136692047, + "learning_rate": 0.00018991652754590986, + "loss": 0.5519, + "step": 157 + }, + { + "epoch": 5.545946189755075e-05, + "grad_norm": 0.4024031162261963, + "learning_rate": 0.00018984974958263776, + "loss": 0.5339, + "step": 158 + }, + { + "epoch": 5.58104711500669e-05, + "grad_norm": 0.46051299571990967, + "learning_rate": 0.00018978297161936563, + "loss": 0.5979, + "step": 159 + }, + { + "epoch": 5.616148040258304e-05, + "grad_norm": 0.49051615595817566, + "learning_rate": 0.0001897161936560935, + "loss": 0.5563, + "step": 160 + }, + { + "epoch": 5.6512489655099186e-05, + "grad_norm": 0.43045854568481445, + "learning_rate": 0.00018964941569282138, + "loss": 0.5984, + "step": 161 + }, + { + "epoch": 5.686349890761533e-05, + "grad_norm": 0.37778228521347046, + "learning_rate": 0.00018958263772954925, + "loss": 0.5955, + "step": 162 + }, + { + "epoch": 5.721450816013147e-05, + "grad_norm": 0.3736341893672943, + "learning_rate": 0.00018951585976627715, + "loss": 0.6438, + "step": 163 + }, + { + "epoch": 5.756551741264762e-05, + "grad_norm": 0.3940117061138153, + "learning_rate": 0.00018944908180300502, + "loss": 0.503, + "step": 164 + }, + { + "epoch": 5.7916526665163763e-05, + "grad_norm": 0.4193519055843353, + "learning_rate": 0.0001893823038397329, + "loss": 0.6324, + "step": 165 + }, + { + "epoch": 5.8267535917679906e-05, + "grad_norm": 0.34481996297836304, + "learning_rate": 0.00018931552587646077, + "loss": 0.5745, + "step": 166 + }, + { + "epoch": 5.861854517019605e-05, + "grad_norm": 0.38285771012306213, + "learning_rate": 0.00018924874791318864, + "loss": 0.639, + "step": 167 + }, + { + "epoch": 5.896955442271219e-05, + "grad_norm": 0.36933982372283936, + "learning_rate": 0.00018918196994991652, + "loss": 0.6681, + "step": 168 + }, + { + "epoch": 5.932056367522834e-05, + "grad_norm": 0.36970776319503784, + "learning_rate": 0.00018911519198664442, + "loss": 0.5626, + "step": 169 + }, + { + "epoch": 5.9671572927744484e-05, + "grad_norm": 0.38494783639907837, + "learning_rate": 0.0001890484140233723, + "loss": 0.6066, + "step": 170 + }, + { + "epoch": 6.0022582180260627e-05, + "grad_norm": 0.3446069061756134, + "learning_rate": 0.00018898163606010016, + "loss": 0.6354, + "step": 171 + }, + { + "epoch": 6.037359143277677e-05, + "grad_norm": 0.4466759264469147, + "learning_rate": 0.00018891485809682806, + "loss": 0.4737, + "step": 172 + }, + { + "epoch": 6.072460068529291e-05, + "grad_norm": 0.43630918860435486, + "learning_rate": 0.00018884808013355594, + "loss": 0.6839, + "step": 173 + }, + { + "epoch": 6.107560993780906e-05, + "grad_norm": 0.37083202600479126, + "learning_rate": 0.00018878130217028384, + "loss": 0.5372, + "step": 174 + }, + { + "epoch": 6.14266191903252e-05, + "grad_norm": 0.37066200375556946, + "learning_rate": 0.0001887145242070117, + "loss": 0.6653, + "step": 175 + }, + { + "epoch": 6.177762844284135e-05, + "grad_norm": 0.5191747546195984, + "learning_rate": 0.00018864774624373958, + "loss": 0.6677, + "step": 176 + }, + { + "epoch": 6.21286376953575e-05, + "grad_norm": 0.4235158860683441, + "learning_rate": 0.00018858096828046746, + "loss": 0.5971, + "step": 177 + }, + { + "epoch": 6.247964694787363e-05, + "grad_norm": 0.405074805021286, + "learning_rate": 0.00018851419031719533, + "loss": 0.5717, + "step": 178 + }, + { + "epoch": 6.283065620038978e-05, + "grad_norm": 0.45817336440086365, + "learning_rate": 0.00018844741235392323, + "loss": 0.5878, + "step": 179 + }, + { + "epoch": 6.318166545290592e-05, + "grad_norm": 0.6313037276268005, + "learning_rate": 0.0001883806343906511, + "loss": 0.62, + "step": 180 + }, + { + "epoch": 6.353267470542207e-05, + "grad_norm": 0.41896742582321167, + "learning_rate": 0.00018831385642737898, + "loss": 0.5565, + "step": 181 + }, + { + "epoch": 6.388368395793822e-05, + "grad_norm": 0.4143432676792145, + "learning_rate": 0.00018824707846410685, + "loss": 0.5552, + "step": 182 + }, + { + "epoch": 6.423469321045435e-05, + "grad_norm": 0.38745641708374023, + "learning_rate": 0.00018818030050083472, + "loss": 0.5949, + "step": 183 + }, + { + "epoch": 6.45857024629705e-05, + "grad_norm": 0.7472612261772156, + "learning_rate": 0.0001881135225375626, + "loss": 0.6708, + "step": 184 + }, + { + "epoch": 6.493671171548664e-05, + "grad_norm": 0.4416198432445526, + "learning_rate": 0.0001880467445742905, + "loss": 0.6069, + "step": 185 + }, + { + "epoch": 6.528772096800279e-05, + "grad_norm": 0.4312993884086609, + "learning_rate": 0.00018797996661101837, + "loss": 0.5778, + "step": 186 + }, + { + "epoch": 6.563873022051894e-05, + "grad_norm": 0.4524860978126526, + "learning_rate": 0.00018791318864774624, + "loss": 0.5091, + "step": 187 + }, + { + "epoch": 6.598973947303507e-05, + "grad_norm": 0.4320828914642334, + "learning_rate": 0.00018784641068447412, + "loss": 0.6557, + "step": 188 + }, + { + "epoch": 6.634074872555122e-05, + "grad_norm": 0.6967452168464661, + "learning_rate": 0.00018777963272120202, + "loss": 0.612, + "step": 189 + }, + { + "epoch": 6.669175797806736e-05, + "grad_norm": 0.4389924705028534, + "learning_rate": 0.0001877128547579299, + "loss": 0.6271, + "step": 190 + }, + { + "epoch": 6.704276723058351e-05, + "grad_norm": 0.3693922162055969, + "learning_rate": 0.0001876460767946578, + "loss": 0.6715, + "step": 191 + }, + { + "epoch": 6.739377648309964e-05, + "grad_norm": 0.32230404019355774, + "learning_rate": 0.00018757929883138566, + "loss": 0.6344, + "step": 192 + }, + { + "epoch": 6.774478573561579e-05, + "grad_norm": 0.4440002143383026, + "learning_rate": 0.00018751252086811354, + "loss": 0.6671, + "step": 193 + }, + { + "epoch": 6.809579498813194e-05, + "grad_norm": 0.5676587820053101, + "learning_rate": 0.0001874457429048414, + "loss": 0.6818, + "step": 194 + }, + { + "epoch": 6.844680424064808e-05, + "grad_norm": 0.36207348108291626, + "learning_rate": 0.0001873789649415693, + "loss": 0.5029, + "step": 195 + }, + { + "epoch": 6.879781349316423e-05, + "grad_norm": 0.35714131593704224, + "learning_rate": 0.00018731218697829718, + "loss": 0.6127, + "step": 196 + }, + { + "epoch": 6.914882274568036e-05, + "grad_norm": 0.4285273551940918, + "learning_rate": 0.00018724540901502506, + "loss": 0.6355, + "step": 197 + }, + { + "epoch": 6.949983199819651e-05, + "grad_norm": 0.42585939168930054, + "learning_rate": 0.00018717863105175293, + "loss": 0.6302, + "step": 198 + }, + { + "epoch": 6.985084125071266e-05, + "grad_norm": 0.524303138256073, + "learning_rate": 0.0001871118530884808, + "loss": 0.6683, + "step": 199 + }, + { + "epoch": 7.02018505032288e-05, + "grad_norm": 0.39635923504829407, + "learning_rate": 0.00018704507512520868, + "loss": 0.6694, + "step": 200 + }, + { + "epoch": 7.055285975574495e-05, + "grad_norm": 0.39712437987327576, + "learning_rate": 0.00018697829716193658, + "loss": 0.5794, + "step": 201 + }, + { + "epoch": 7.090386900826108e-05, + "grad_norm": 0.4115397334098816, + "learning_rate": 0.00018691151919866445, + "loss": 0.5579, + "step": 202 + }, + { + "epoch": 7.125487826077723e-05, + "grad_norm": 0.4776385724544525, + "learning_rate": 0.00018684474123539232, + "loss": 0.5589, + "step": 203 + }, + { + "epoch": 7.160588751329338e-05, + "grad_norm": 0.35574638843536377, + "learning_rate": 0.0001867779632721202, + "loss": 0.5311, + "step": 204 + }, + { + "epoch": 7.195689676580952e-05, + "grad_norm": 0.44872432947158813, + "learning_rate": 0.00018671118530884807, + "loss": 0.635, + "step": 205 + }, + { + "epoch": 7.230790601832567e-05, + "grad_norm": 0.3511079251766205, + "learning_rate": 0.00018664440734557597, + "loss": 0.5317, + "step": 206 + }, + { + "epoch": 7.26589152708418e-05, + "grad_norm": 0.39862194657325745, + "learning_rate": 0.00018657762938230384, + "loss": 0.6653, + "step": 207 + }, + { + "epoch": 7.300992452335795e-05, + "grad_norm": 0.4046575725078583, + "learning_rate": 0.00018651085141903174, + "loss": 0.6065, + "step": 208 + }, + { + "epoch": 7.33609337758741e-05, + "grad_norm": 0.4231868088245392, + "learning_rate": 0.00018644407345575962, + "loss": 0.7078, + "step": 209 + }, + { + "epoch": 7.371194302839024e-05, + "grad_norm": 0.364700049161911, + "learning_rate": 0.0001863772954924875, + "loss": 0.6309, + "step": 210 + }, + { + "epoch": 7.406295228090639e-05, + "grad_norm": 0.5385531187057495, + "learning_rate": 0.0001863105175292154, + "loss": 0.4233, + "step": 211 + }, + { + "epoch": 7.441396153342252e-05, + "grad_norm": 0.39415115118026733, + "learning_rate": 0.00018624373956594326, + "loss": 0.5928, + "step": 212 + }, + { + "epoch": 7.476497078593867e-05, + "grad_norm": 0.6021363735198975, + "learning_rate": 0.00018617696160267113, + "loss": 0.6611, + "step": 213 + }, + { + "epoch": 7.511598003845482e-05, + "grad_norm": 0.3709903061389923, + "learning_rate": 0.000186110183639399, + "loss": 0.6136, + "step": 214 + }, + { + "epoch": 7.546698929097096e-05, + "grad_norm": 0.36710435152053833, + "learning_rate": 0.00018604340567612688, + "loss": 0.5267, + "step": 215 + }, + { + "epoch": 7.581799854348711e-05, + "grad_norm": 0.4379352033138275, + "learning_rate": 0.00018597662771285475, + "loss": 0.6429, + "step": 216 + }, + { + "epoch": 7.616900779600325e-05, + "grad_norm": 0.3408482074737549, + "learning_rate": 0.00018590984974958265, + "loss": 0.5379, + "step": 217 + }, + { + "epoch": 7.65200170485194e-05, + "grad_norm": 0.4487043023109436, + "learning_rate": 0.00018584307178631053, + "loss": 0.6582, + "step": 218 + }, + { + "epoch": 7.687102630103554e-05, + "grad_norm": 0.42003679275512695, + "learning_rate": 0.0001857762938230384, + "loss": 0.5712, + "step": 219 + }, + { + "epoch": 7.722203555355168e-05, + "grad_norm": 0.4698665738105774, + "learning_rate": 0.00018570951585976627, + "loss": 0.5715, + "step": 220 + }, + { + "epoch": 7.757304480606783e-05, + "grad_norm": 0.3777780830860138, + "learning_rate": 0.00018564273789649415, + "loss": 0.4667, + "step": 221 + }, + { + "epoch": 7.792405405858397e-05, + "grad_norm": 0.36794212460517883, + "learning_rate": 0.00018557595993322205, + "loss": 0.5382, + "step": 222 + }, + { + "epoch": 7.827506331110012e-05, + "grad_norm": 0.4582989513874054, + "learning_rate": 0.00018550918196994992, + "loss": 0.6437, + "step": 223 + }, + { + "epoch": 7.862607256361626e-05, + "grad_norm": 0.4065852761268616, + "learning_rate": 0.0001854424040066778, + "loss": 0.6928, + "step": 224 + }, + { + "epoch": 7.89770818161324e-05, + "grad_norm": 0.3857649564743042, + "learning_rate": 0.0001853756260434057, + "loss": 0.5405, + "step": 225 + }, + { + "epoch": 7.932809106864855e-05, + "grad_norm": 0.40056589245796204, + "learning_rate": 0.00018530884808013357, + "loss": 0.6425, + "step": 226 + }, + { + "epoch": 7.967910032116469e-05, + "grad_norm": 0.43137016892433167, + "learning_rate": 0.00018524207011686147, + "loss": 0.5001, + "step": 227 + }, + { + "epoch": 8.003010957368084e-05, + "grad_norm": 0.3723987340927124, + "learning_rate": 0.00018517529215358934, + "loss": 0.5118, + "step": 228 + }, + { + "epoch": 8.038111882619698e-05, + "grad_norm": 0.34196361899375916, + "learning_rate": 0.00018510851419031721, + "loss": 0.5468, + "step": 229 + }, + { + "epoch": 8.073212807871312e-05, + "grad_norm": 0.4319117069244385, + "learning_rate": 0.0001850417362270451, + "loss": 0.5703, + "step": 230 + }, + { + "epoch": 8.108313733122927e-05, + "grad_norm": 0.4467247724533081, + "learning_rate": 0.00018497495826377296, + "loss": 0.6536, + "step": 231 + }, + { + "epoch": 8.14341465837454e-05, + "grad_norm": 0.3569909632205963, + "learning_rate": 0.00018490818030050083, + "loss": 0.5335, + "step": 232 + }, + { + "epoch": 8.178515583626156e-05, + "grad_norm": 0.33486437797546387, + "learning_rate": 0.00018484140233722873, + "loss": 0.6803, + "step": 233 + }, + { + "epoch": 8.21361650887777e-05, + "grad_norm": 0.3783140480518341, + "learning_rate": 0.0001847746243739566, + "loss": 0.6361, + "step": 234 + }, + { + "epoch": 8.248717434129384e-05, + "grad_norm": 0.4844662547111511, + "learning_rate": 0.00018470784641068448, + "loss": 0.5322, + "step": 235 + }, + { + "epoch": 8.283818359380999e-05, + "grad_norm": 0.508406400680542, + "learning_rate": 0.00018464106844741235, + "loss": 0.6676, + "step": 236 + }, + { + "epoch": 8.318919284632613e-05, + "grad_norm": 0.3710225820541382, + "learning_rate": 0.00018457429048414023, + "loss": 0.6656, + "step": 237 + }, + { + "epoch": 8.354020209884228e-05, + "grad_norm": 0.3757292628288269, + "learning_rate": 0.00018450751252086813, + "loss": 0.6095, + "step": 238 + }, + { + "epoch": 8.389121135135843e-05, + "grad_norm": 0.40651261806488037, + "learning_rate": 0.000184440734557596, + "loss": 0.6626, + "step": 239 + }, + { + "epoch": 8.424222060387456e-05, + "grad_norm": 0.40700778365135193, + "learning_rate": 0.00018437395659432387, + "loss": 0.5328, + "step": 240 + }, + { + "epoch": 8.459322985639071e-05, + "grad_norm": 0.5067440867424011, + "learning_rate": 0.00018430717863105175, + "loss": 0.4811, + "step": 241 + }, + { + "epoch": 8.494423910890685e-05, + "grad_norm": 0.3934602737426758, + "learning_rate": 0.00018424040066777965, + "loss": 0.5691, + "step": 242 + }, + { + "epoch": 8.5295248361423e-05, + "grad_norm": 0.3360019624233246, + "learning_rate": 0.00018417362270450752, + "loss": 0.5542, + "step": 243 + }, + { + "epoch": 8.564625761393915e-05, + "grad_norm": 0.4023631513118744, + "learning_rate": 0.00018410684474123542, + "loss": 0.5192, + "step": 244 + }, + { + "epoch": 8.599726686645528e-05, + "grad_norm": 0.41704171895980835, + "learning_rate": 0.0001840400667779633, + "loss": 0.5018, + "step": 245 + }, + { + "epoch": 8.634827611897143e-05, + "grad_norm": 0.361977756023407, + "learning_rate": 0.00018397328881469117, + "loss": 0.6193, + "step": 246 + }, + { + "epoch": 8.669928537148757e-05, + "grad_norm": 0.37774717807769775, + "learning_rate": 0.00018390651085141904, + "loss": 0.5552, + "step": 247 + }, + { + "epoch": 8.705029462400372e-05, + "grad_norm": 0.3408471941947937, + "learning_rate": 0.0001838397328881469, + "loss": 0.5876, + "step": 248 + }, + { + "epoch": 8.740130387651985e-05, + "grad_norm": 0.3892226815223694, + "learning_rate": 0.0001837729549248748, + "loss": 0.4227, + "step": 249 + }, + { + "epoch": 8.7752313129036e-05, + "grad_norm": 0.5315036177635193, + "learning_rate": 0.00018370617696160269, + "loss": 0.5826, + "step": 250 + }, + { + "epoch": 8.810332238155215e-05, + "grad_norm": 0.35433024168014526, + "learning_rate": 0.00018363939899833056, + "loss": 0.5992, + "step": 251 + }, + { + "epoch": 8.845433163406829e-05, + "grad_norm": 0.34777382016181946, + "learning_rate": 0.00018357262103505843, + "loss": 0.4973, + "step": 252 + }, + { + "epoch": 8.880534088658444e-05, + "grad_norm": 0.3936387002468109, + "learning_rate": 0.0001835058430717863, + "loss": 0.6254, + "step": 253 + }, + { + "epoch": 8.915635013910057e-05, + "grad_norm": 0.4009217917919159, + "learning_rate": 0.0001834390651085142, + "loss": 0.4843, + "step": 254 + }, + { + "epoch": 8.950735939161672e-05, + "grad_norm": 0.4863683879375458, + "learning_rate": 0.00018337228714524208, + "loss": 0.5204, + "step": 255 + }, + { + "epoch": 8.985836864413287e-05, + "grad_norm": 0.6100988984107971, + "learning_rate": 0.00018330550918196995, + "loss": 0.7296, + "step": 256 + }, + { + "epoch": 9.020937789664901e-05, + "grad_norm": 0.40949374437332153, + "learning_rate": 0.00018323873121869782, + "loss": 0.5707, + "step": 257 + }, + { + "epoch": 9.056038714916516e-05, + "grad_norm": 0.47316402196884155, + "learning_rate": 0.0001831719532554257, + "loss": 0.6655, + "step": 258 + }, + { + "epoch": 9.091139640168129e-05, + "grad_norm": 0.4053696393966675, + "learning_rate": 0.0001831051752921536, + "loss": 0.5822, + "step": 259 + }, + { + "epoch": 9.126240565419744e-05, + "grad_norm": 0.4582972228527069, + "learning_rate": 0.00018303839732888147, + "loss": 0.5475, + "step": 260 + }, + { + "epoch": 9.161341490671359e-05, + "grad_norm": 0.38666802644729614, + "learning_rate": 0.00018297161936560937, + "loss": 0.4744, + "step": 261 + }, + { + "epoch": 9.196442415922973e-05, + "grad_norm": 0.31954991817474365, + "learning_rate": 0.00018290484140233724, + "loss": 0.6337, + "step": 262 + }, + { + "epoch": 9.231543341174588e-05, + "grad_norm": 0.3590424358844757, + "learning_rate": 0.00018283806343906512, + "loss": 0.5683, + "step": 263 + }, + { + "epoch": 9.266644266426201e-05, + "grad_norm": 0.4042195975780487, + "learning_rate": 0.000182771285475793, + "loss": 0.6142, + "step": 264 + }, + { + "epoch": 9.301745191677816e-05, + "grad_norm": 0.3474234342575073, + "learning_rate": 0.0001827045075125209, + "loss": 0.6035, + "step": 265 + }, + { + "epoch": 9.336846116929431e-05, + "grad_norm": 0.337091326713562, + "learning_rate": 0.00018263772954924876, + "loss": 0.6107, + "step": 266 + }, + { + "epoch": 9.371947042181045e-05, + "grad_norm": 0.3313732445240021, + "learning_rate": 0.00018257095158597664, + "loss": 0.6491, + "step": 267 + }, + { + "epoch": 9.40704796743266e-05, + "grad_norm": 0.3931679129600525, + "learning_rate": 0.0001825041736227045, + "loss": 0.5492, + "step": 268 + }, + { + "epoch": 9.442148892684273e-05, + "grad_norm": 0.5848420262336731, + "learning_rate": 0.00018243739565943238, + "loss": 0.7091, + "step": 269 + }, + { + "epoch": 9.477249817935888e-05, + "grad_norm": 0.4851846992969513, + "learning_rate": 0.00018237061769616028, + "loss": 0.5856, + "step": 270 + }, + { + "epoch": 9.512350743187503e-05, + "grad_norm": 0.3434993326663971, + "learning_rate": 0.00018230383973288816, + "loss": 0.5085, + "step": 271 + }, + { + "epoch": 9.547451668439117e-05, + "grad_norm": 0.2978988587856293, + "learning_rate": 0.00018223706176961603, + "loss": 0.481, + "step": 272 + }, + { + "epoch": 9.582552593690732e-05, + "grad_norm": 0.34215858578681946, + "learning_rate": 0.0001821702838063439, + "loss": 0.5723, + "step": 273 + }, + { + "epoch": 9.617653518942345e-05, + "grad_norm": 0.43445509672164917, + "learning_rate": 0.00018210350584307178, + "loss": 0.5691, + "step": 274 + }, + { + "epoch": 9.65275444419396e-05, + "grad_norm": 0.36094945669174194, + "learning_rate": 0.00018203672787979968, + "loss": 0.5543, + "step": 275 + }, + { + "epoch": 9.687855369445575e-05, + "grad_norm": 0.386106014251709, + "learning_rate": 0.00018196994991652755, + "loss": 0.5561, + "step": 276 + }, + { + "epoch": 9.722956294697189e-05, + "grad_norm": 0.36676689982414246, + "learning_rate": 0.00018190317195325542, + "loss": 0.5479, + "step": 277 + }, + { + "epoch": 9.758057219948804e-05, + "grad_norm": 0.37988394498825073, + "learning_rate": 0.00018183639398998332, + "loss": 0.5772, + "step": 278 + }, + { + "epoch": 9.793158145200417e-05, + "grad_norm": 0.4024789035320282, + "learning_rate": 0.0001817696160267112, + "loss": 0.6065, + "step": 279 + }, + { + "epoch": 9.828259070452032e-05, + "grad_norm": 0.3697255551815033, + "learning_rate": 0.0001817028380634391, + "loss": 0.5021, + "step": 280 + }, + { + "epoch": 9.863359995703647e-05, + "grad_norm": 0.43579426407814026, + "learning_rate": 0.00018163606010016697, + "loss": 0.555, + "step": 281 + }, + { + "epoch": 9.898460920955261e-05, + "grad_norm": 0.4760832190513611, + "learning_rate": 0.00018156928213689484, + "loss": 0.6438, + "step": 282 + }, + { + "epoch": 9.933561846206876e-05, + "grad_norm": 0.45258408784866333, + "learning_rate": 0.00018150250417362272, + "loss": 0.4717, + "step": 283 + }, + { + "epoch": 9.96866277145849e-05, + "grad_norm": 0.428108274936676, + "learning_rate": 0.0001814357262103506, + "loss": 0.6029, + "step": 284 + }, + { + "epoch": 0.00010003763696710104, + "grad_norm": 0.3999852240085602, + "learning_rate": 0.00018136894824707846, + "loss": 0.4524, + "step": 285 + }, + { + "epoch": 0.0001003886462196172, + "grad_norm": 0.44319403171539307, + "learning_rate": 0.00018130217028380636, + "loss": 0.6619, + "step": 286 + }, + { + "epoch": 0.00010073965547213333, + "grad_norm": 0.43008357286453247, + "learning_rate": 0.00018123539232053424, + "loss": 0.6105, + "step": 287 + }, + { + "epoch": 0.00010109066472464948, + "grad_norm": 0.38037821650505066, + "learning_rate": 0.0001811686143572621, + "loss": 0.6649, + "step": 288 + }, + { + "epoch": 0.00010144167397716562, + "grad_norm": 0.3713517487049103, + "learning_rate": 0.00018110183639398998, + "loss": 0.6381, + "step": 289 + }, + { + "epoch": 0.00010179268322968176, + "grad_norm": 0.3437170386314392, + "learning_rate": 0.00018103505843071786, + "loss": 0.4563, + "step": 290 + }, + { + "epoch": 0.00010214369248219791, + "grad_norm": 0.3661468029022217, + "learning_rate": 0.00018096828046744576, + "loss": 0.606, + "step": 291 + }, + { + "epoch": 0.00010249470173471405, + "grad_norm": 0.36346200108528137, + "learning_rate": 0.00018090150250417363, + "loss": 0.5895, + "step": 292 + }, + { + "epoch": 0.0001028457109872302, + "grad_norm": 0.31052225828170776, + "learning_rate": 0.0001808347245409015, + "loss": 0.4409, + "step": 293 + }, + { + "epoch": 0.00010319672023974634, + "grad_norm": 0.37012970447540283, + "learning_rate": 0.00018076794657762938, + "loss": 0.505, + "step": 294 + }, + { + "epoch": 0.00010354772949226248, + "grad_norm": 0.3958667814731598, + "learning_rate": 0.00018070116861435728, + "loss": 0.5371, + "step": 295 + }, + { + "epoch": 0.00010389873874477863, + "grad_norm": 0.4892179071903229, + "learning_rate": 0.00018063439065108515, + "loss": 0.6737, + "step": 296 + }, + { + "epoch": 0.00010424974799729477, + "grad_norm": 0.41874751448631287, + "learning_rate": 0.00018056761268781305, + "loss": 0.651, + "step": 297 + }, + { + "epoch": 0.00010460075724981092, + "grad_norm": 0.4167911410331726, + "learning_rate": 0.00018050083472454092, + "loss": 0.5531, + "step": 298 + }, + { + "epoch": 0.00010495176650232706, + "grad_norm": 0.3758225440979004, + "learning_rate": 0.0001804340567612688, + "loss": 0.6285, + "step": 299 + }, + { + "epoch": 0.0001053027757548432, + "grad_norm": 0.3688598573207855, + "learning_rate": 0.00018036727879799667, + "loss": 0.5219, + "step": 300 + }, + { + "epoch": 0.00010565378500735934, + "grad_norm": 0.3501751124858856, + "learning_rate": 0.00018030050083472454, + "loss": 0.6351, + "step": 301 + }, + { + "epoch": 0.00010600479425987549, + "grad_norm": 0.42876511812210083, + "learning_rate": 0.00018023372287145244, + "loss": 0.544, + "step": 302 + }, + { + "epoch": 0.00010635580351239164, + "grad_norm": 0.47046172618865967, + "learning_rate": 0.00018016694490818031, + "loss": 0.6304, + "step": 303 + }, + { + "epoch": 0.00010670681276490778, + "grad_norm": 0.402271032333374, + "learning_rate": 0.0001801001669449082, + "loss": 0.5039, + "step": 304 + }, + { + "epoch": 0.00010705782201742393, + "grad_norm": 0.41232413053512573, + "learning_rate": 0.00018003338898163606, + "loss": 0.5892, + "step": 305 + }, + { + "epoch": 0.00010740883126994006, + "grad_norm": 0.3628154993057251, + "learning_rate": 0.00017996661101836393, + "loss": 0.5737, + "step": 306 + }, + { + "epoch": 0.00010775984052245621, + "grad_norm": 0.4291020631790161, + "learning_rate": 0.00017989983305509183, + "loss": 0.6597, + "step": 307 + }, + { + "epoch": 0.00010811084977497236, + "grad_norm": 0.33218181133270264, + "learning_rate": 0.0001798330550918197, + "loss": 0.5726, + "step": 308 + }, + { + "epoch": 0.0001084618590274885, + "grad_norm": 0.3439387381076813, + "learning_rate": 0.00017976627712854758, + "loss": 0.5615, + "step": 309 + }, + { + "epoch": 0.00010881286828000465, + "grad_norm": 0.3523644208908081, + "learning_rate": 0.00017969949916527545, + "loss": 0.4968, + "step": 310 + }, + { + "epoch": 0.00010916387753252078, + "grad_norm": 0.4045630991458893, + "learning_rate": 0.00017963272120200333, + "loss": 0.6425, + "step": 311 + }, + { + "epoch": 0.00010951488678503693, + "grad_norm": 0.3726767599582672, + "learning_rate": 0.00017956594323873123, + "loss": 0.6575, + "step": 312 + }, + { + "epoch": 0.00010986589603755308, + "grad_norm": 0.32131972908973694, + "learning_rate": 0.0001794991652754591, + "loss": 0.5146, + "step": 313 + }, + { + "epoch": 0.00011021690529006922, + "grad_norm": 0.5013764500617981, + "learning_rate": 0.000179432387312187, + "loss": 0.53, + "step": 314 + }, + { + "epoch": 0.00011056791454258537, + "grad_norm": 0.36830246448516846, + "learning_rate": 0.00017936560934891487, + "loss": 0.6291, + "step": 315 + }, + { + "epoch": 0.0001109189237951015, + "grad_norm": 0.3587378263473511, + "learning_rate": 0.00017929883138564275, + "loss": 0.4954, + "step": 316 + }, + { + "epoch": 0.00011126993304761765, + "grad_norm": 0.3480195105075836, + "learning_rate": 0.00017923205342237062, + "loss": 0.606, + "step": 317 + }, + { + "epoch": 0.0001116209423001338, + "grad_norm": 0.38415858149528503, + "learning_rate": 0.00017916527545909852, + "loss": 0.7281, + "step": 318 + }, + { + "epoch": 0.00011197195155264994, + "grad_norm": 0.35853826999664307, + "learning_rate": 0.0001790984974958264, + "loss": 0.5851, + "step": 319 + }, + { + "epoch": 0.00011232296080516609, + "grad_norm": 0.42092210054397583, + "learning_rate": 0.00017903171953255427, + "loss": 0.5324, + "step": 320 + }, + { + "epoch": 0.00011267397005768222, + "grad_norm": 0.34538987278938293, + "learning_rate": 0.00017896494156928214, + "loss": 0.6387, + "step": 321 + }, + { + "epoch": 0.00011302497931019837, + "grad_norm": 0.38299745321273804, + "learning_rate": 0.00017889816360601, + "loss": 0.6013, + "step": 322 + }, + { + "epoch": 0.00011337598856271452, + "grad_norm": 0.32100436091423035, + "learning_rate": 0.0001788313856427379, + "loss": 0.4627, + "step": 323 + }, + { + "epoch": 0.00011372699781523066, + "grad_norm": 0.3458426594734192, + "learning_rate": 0.0001787646076794658, + "loss": 0.5865, + "step": 324 + }, + { + "epoch": 0.0001140780070677468, + "grad_norm": 0.33228665590286255, + "learning_rate": 0.00017869782971619366, + "loss": 0.4611, + "step": 325 + }, + { + "epoch": 0.00011442901632026294, + "grad_norm": 0.38747021555900574, + "learning_rate": 0.00017863105175292153, + "loss": 0.5777, + "step": 326 + }, + { + "epoch": 0.00011478002557277909, + "grad_norm": 0.3888608515262604, + "learning_rate": 0.0001785642737896494, + "loss": 0.5664, + "step": 327 + }, + { + "epoch": 0.00011513103482529524, + "grad_norm": 0.4084737002849579, + "learning_rate": 0.0001784974958263773, + "loss": 0.5939, + "step": 328 + }, + { + "epoch": 0.00011548204407781138, + "grad_norm": 0.4964492917060852, + "learning_rate": 0.00017843071786310518, + "loss": 0.6256, + "step": 329 + }, + { + "epoch": 0.00011583305333032753, + "grad_norm": 0.37329745292663574, + "learning_rate": 0.00017836393989983305, + "loss": 0.5388, + "step": 330 + }, + { + "epoch": 0.00011618406258284366, + "grad_norm": 0.37680140137672424, + "learning_rate": 0.00017829716193656095, + "loss": 0.6203, + "step": 331 + }, + { + "epoch": 0.00011653507183535981, + "grad_norm": 0.4162957966327667, + "learning_rate": 0.00017823038397328883, + "loss": 0.6478, + "step": 332 + }, + { + "epoch": 0.00011688608108787596, + "grad_norm": 0.3473896086215973, + "learning_rate": 0.0001781636060100167, + "loss": 0.589, + "step": 333 + }, + { + "epoch": 0.0001172370903403921, + "grad_norm": 0.4039511978626251, + "learning_rate": 0.0001780968280467446, + "loss": 0.5681, + "step": 334 + }, + { + "epoch": 0.00011758809959290825, + "grad_norm": 0.3135715425014496, + "learning_rate": 0.00017803005008347247, + "loss": 0.5069, + "step": 335 + }, + { + "epoch": 0.00011793910884542438, + "grad_norm": 0.4296559989452362, + "learning_rate": 0.00017796327212020035, + "loss": 0.5413, + "step": 336 + }, + { + "epoch": 0.00011829011809794053, + "grad_norm": 0.4197536110877991, + "learning_rate": 0.00017789649415692822, + "loss": 0.694, + "step": 337 + }, + { + "epoch": 0.00011864112735045668, + "grad_norm": 0.3633468449115753, + "learning_rate": 0.0001778297161936561, + "loss": 0.5475, + "step": 338 + }, + { + "epoch": 0.00011899213660297282, + "grad_norm": 0.2867147922515869, + "learning_rate": 0.000177762938230384, + "loss": 0.485, + "step": 339 + }, + { + "epoch": 0.00011934314585548897, + "grad_norm": 0.3445490300655365, + "learning_rate": 0.00017769616026711187, + "loss": 0.6304, + "step": 340 + }, + { + "epoch": 0.0001196941551080051, + "grad_norm": 0.31692221760749817, + "learning_rate": 0.00017762938230383974, + "loss": 0.5804, + "step": 341 + }, + { + "epoch": 0.00012004516436052125, + "grad_norm": 0.31391167640686035, + "learning_rate": 0.0001775626043405676, + "loss": 0.5945, + "step": 342 + }, + { + "epoch": 0.0001203961736130374, + "grad_norm": 0.3484472632408142, + "learning_rate": 0.00017749582637729548, + "loss": 0.6577, + "step": 343 + }, + { + "epoch": 0.00012074718286555354, + "grad_norm": 0.37430596351623535, + "learning_rate": 0.00017742904841402339, + "loss": 0.6854, + "step": 344 + }, + { + "epoch": 0.00012109819211806969, + "grad_norm": 0.34305211901664734, + "learning_rate": 0.00017736227045075126, + "loss": 0.5123, + "step": 345 + }, + { + "epoch": 0.00012144920137058582, + "grad_norm": 0.3398534059524536, + "learning_rate": 0.00017729549248747913, + "loss": 0.5602, + "step": 346 + }, + { + "epoch": 0.00012180021062310197, + "grad_norm": 0.4278014600276947, + "learning_rate": 0.000177228714524207, + "loss": 0.5152, + "step": 347 + }, + { + "epoch": 0.00012215121987561812, + "grad_norm": 0.4011085629463196, + "learning_rate": 0.0001771619365609349, + "loss": 0.6217, + "step": 348 + }, + { + "epoch": 0.00012250222912813427, + "grad_norm": 0.3425695598125458, + "learning_rate": 0.00017709515859766278, + "loss": 0.5037, + "step": 349 + }, + { + "epoch": 0.0001228532383806504, + "grad_norm": 0.34036242961883545, + "learning_rate": 0.00017702838063439068, + "loss": 0.649, + "step": 350 + }, + { + "epoch": 0.00012320424763316654, + "grad_norm": 0.5631874203681946, + "learning_rate": 0.00017696160267111855, + "loss": 0.5656, + "step": 351 + }, + { + "epoch": 0.0001235552568856827, + "grad_norm": 0.4195176661014557, + "learning_rate": 0.00017689482470784642, + "loss": 0.6899, + "step": 352 + }, + { + "epoch": 0.00012390626613819884, + "grad_norm": 0.41814154386520386, + "learning_rate": 0.0001768280467445743, + "loss": 0.551, + "step": 353 + }, + { + "epoch": 0.000124257275390715, + "grad_norm": 0.3374340534210205, + "learning_rate": 0.00017676126878130217, + "loss": 0.7022, + "step": 354 + }, + { + "epoch": 0.00012460828464323112, + "grad_norm": 0.41464921832084656, + "learning_rate": 0.00017669449081803007, + "loss": 0.5301, + "step": 355 + }, + { + "epoch": 0.00012495929389574726, + "grad_norm": 0.4443178176879883, + "learning_rate": 0.00017662771285475794, + "loss": 0.5487, + "step": 356 + }, + { + "epoch": 0.00012531030314826341, + "grad_norm": 0.3389272093772888, + "learning_rate": 0.00017656093489148582, + "loss": 0.581, + "step": 357 + }, + { + "epoch": 0.00012566131240077956, + "grad_norm": 0.29650986194610596, + "learning_rate": 0.0001764941569282137, + "loss": 0.5801, + "step": 358 + }, + { + "epoch": 0.0001260123216532957, + "grad_norm": 0.40271905064582825, + "learning_rate": 0.00017642737896494156, + "loss": 0.6738, + "step": 359 + }, + { + "epoch": 0.00012636333090581184, + "grad_norm": 0.352225661277771, + "learning_rate": 0.00017636060100166946, + "loss": 0.5727, + "step": 360 + }, + { + "epoch": 0.00012671434015832798, + "grad_norm": 0.3469563126564026, + "learning_rate": 0.00017629382303839734, + "loss": 0.5188, + "step": 361 + }, + { + "epoch": 0.00012706534941084413, + "grad_norm": 0.30644670128822327, + "learning_rate": 0.0001762270450751252, + "loss": 0.497, + "step": 362 + }, + { + "epoch": 0.00012741635866336028, + "grad_norm": 0.3472917377948761, + "learning_rate": 0.00017616026711185308, + "loss": 0.6363, + "step": 363 + }, + { + "epoch": 0.00012776736791587643, + "grad_norm": 0.37184756994247437, + "learning_rate": 0.00017609348914858096, + "loss": 0.5223, + "step": 364 + }, + { + "epoch": 0.00012811837716839256, + "grad_norm": 0.3247138559818268, + "learning_rate": 0.00017602671118530886, + "loss": 0.5457, + "step": 365 + }, + { + "epoch": 0.0001284693864209087, + "grad_norm": 0.5236158967018127, + "learning_rate": 0.00017595993322203673, + "loss": 0.615, + "step": 366 + }, + { + "epoch": 0.00012882039567342485, + "grad_norm": 0.33708465099334717, + "learning_rate": 0.00017589315525876463, + "loss": 0.6163, + "step": 367 + }, + { + "epoch": 0.000129171404925941, + "grad_norm": 0.33848705887794495, + "learning_rate": 0.0001758263772954925, + "loss": 0.4229, + "step": 368 + }, + { + "epoch": 0.00012952241417845715, + "grad_norm": 0.5827682018280029, + "learning_rate": 0.00017575959933222038, + "loss": 0.5668, + "step": 369 + }, + { + "epoch": 0.00012987342343097328, + "grad_norm": 0.36217448115348816, + "learning_rate": 0.00017569282136894825, + "loss": 0.4983, + "step": 370 + }, + { + "epoch": 0.00013022443268348943, + "grad_norm": 0.329414963722229, + "learning_rate": 0.00017562604340567615, + "loss": 0.4281, + "step": 371 + }, + { + "epoch": 0.00013057544193600557, + "grad_norm": 0.36746612191200256, + "learning_rate": 0.00017555926544240402, + "loss": 0.6629, + "step": 372 + }, + { + "epoch": 0.00013092645118852172, + "grad_norm": 0.3954717516899109, + "learning_rate": 0.0001754924874791319, + "loss": 0.5784, + "step": 373 + }, + { + "epoch": 0.00013127746044103787, + "grad_norm": 0.41279932856559753, + "learning_rate": 0.00017542570951585977, + "loss": 0.5994, + "step": 374 + }, + { + "epoch": 0.000131628469693554, + "grad_norm": 0.3019951581954956, + "learning_rate": 0.00017535893155258764, + "loss": 0.5584, + "step": 375 + }, + { + "epoch": 0.00013197947894607015, + "grad_norm": 0.3079768121242523, + "learning_rate": 0.00017529215358931554, + "loss": 0.5904, + "step": 376 + }, + { + "epoch": 0.0001323304881985863, + "grad_norm": 0.5678027272224426, + "learning_rate": 0.00017522537562604342, + "loss": 0.6441, + "step": 377 + }, + { + "epoch": 0.00013268149745110244, + "grad_norm": 0.38624581694602966, + "learning_rate": 0.0001751585976627713, + "loss": 0.5582, + "step": 378 + }, + { + "epoch": 0.0001330325067036186, + "grad_norm": 0.4368002712726593, + "learning_rate": 0.00017509181969949916, + "loss": 0.686, + "step": 379 + }, + { + "epoch": 0.00013338351595613472, + "grad_norm": 0.3409269154071808, + "learning_rate": 0.00017502504173622704, + "loss": 0.582, + "step": 380 + }, + { + "epoch": 0.00013373452520865087, + "grad_norm": 0.3772698938846588, + "learning_rate": 0.0001749582637729549, + "loss": 0.5314, + "step": 381 + }, + { + "epoch": 0.00013408553446116702, + "grad_norm": 0.3791707158088684, + "learning_rate": 0.0001748914858096828, + "loss": 0.6143, + "step": 382 + }, + { + "epoch": 0.00013443654371368317, + "grad_norm": 0.4441101551055908, + "learning_rate": 0.0001748247078464107, + "loss": 0.5726, + "step": 383 + }, + { + "epoch": 0.0001347875529661993, + "grad_norm": 0.4160211980342865, + "learning_rate": 0.00017475792988313858, + "loss": 0.6003, + "step": 384 + }, + { + "epoch": 0.00013513856221871544, + "grad_norm": 0.41698628664016724, + "learning_rate": 0.00017469115191986646, + "loss": 0.4539, + "step": 385 + }, + { + "epoch": 0.00013548957147123159, + "grad_norm": 0.337007999420166, + "learning_rate": 0.00017462437395659433, + "loss": 0.5176, + "step": 386 + }, + { + "epoch": 0.00013584058072374774, + "grad_norm": 0.30926409363746643, + "learning_rate": 0.00017455759599332223, + "loss": 0.6072, + "step": 387 + }, + { + "epoch": 0.00013619158997626389, + "grad_norm": 0.3663052022457123, + "learning_rate": 0.0001744908180300501, + "loss": 0.538, + "step": 388 + }, + { + "epoch": 0.00013654259922878, + "grad_norm": 0.3410074710845947, + "learning_rate": 0.00017442404006677798, + "loss": 0.5687, + "step": 389 + }, + { + "epoch": 0.00013689360848129616, + "grad_norm": 0.5266095399856567, + "learning_rate": 0.00017435726210350585, + "loss": 0.6685, + "step": 390 + }, + { + "epoch": 0.0001372446177338123, + "grad_norm": 0.4020686149597168, + "learning_rate": 0.00017429048414023372, + "loss": 0.586, + "step": 391 + }, + { + "epoch": 0.00013759562698632846, + "grad_norm": 0.39995548129081726, + "learning_rate": 0.00017422370617696162, + "loss": 0.6958, + "step": 392 + }, + { + "epoch": 0.0001379466362388446, + "grad_norm": 0.4024721682071686, + "learning_rate": 0.0001741569282136895, + "loss": 0.6411, + "step": 393 + }, + { + "epoch": 0.00013829764549136073, + "grad_norm": 0.38193392753601074, + "learning_rate": 0.00017409015025041737, + "loss": 0.5857, + "step": 394 + }, + { + "epoch": 0.00013864865474387688, + "grad_norm": 0.39786526560783386, + "learning_rate": 0.00017402337228714524, + "loss": 0.5215, + "step": 395 + }, + { + "epoch": 0.00013899966399639303, + "grad_norm": 0.49223974347114563, + "learning_rate": 0.00017395659432387311, + "loss": 0.5881, + "step": 396 + }, + { + "epoch": 0.00013935067324890918, + "grad_norm": 0.3398894667625427, + "learning_rate": 0.00017388981636060101, + "loss": 0.5466, + "step": 397 + }, + { + "epoch": 0.00013970168250142533, + "grad_norm": 0.34891223907470703, + "learning_rate": 0.0001738230383973289, + "loss": 0.5901, + "step": 398 + }, + { + "epoch": 0.00014005269175394145, + "grad_norm": 0.47644108533859253, + "learning_rate": 0.00017375626043405676, + "loss": 0.5075, + "step": 399 + }, + { + "epoch": 0.0001404037010064576, + "grad_norm": 0.42530229687690735, + "learning_rate": 0.00017368948247078466, + "loss": 0.663, + "step": 400 + }, + { + "epoch": 0.00014075471025897375, + "grad_norm": 0.30858534574508667, + "learning_rate": 0.00017362270450751253, + "loss": 0.4724, + "step": 401 + }, + { + "epoch": 0.0001411057195114899, + "grad_norm": 0.42453449964523315, + "learning_rate": 0.0001735559265442404, + "loss": 0.6074, + "step": 402 + }, + { + "epoch": 0.00014145672876400605, + "grad_norm": 0.3964505195617676, + "learning_rate": 0.0001734891485809683, + "loss": 0.4913, + "step": 403 + }, + { + "epoch": 0.00014180773801652217, + "grad_norm": 0.3317703902721405, + "learning_rate": 0.00017342237061769618, + "loss": 0.5504, + "step": 404 + }, + { + "epoch": 0.00014215874726903832, + "grad_norm": 0.3912264108657837, + "learning_rate": 0.00017335559265442405, + "loss": 0.6301, + "step": 405 + }, + { + "epoch": 0.00014250975652155447, + "grad_norm": 0.3582877218723297, + "learning_rate": 0.00017328881469115193, + "loss": 0.6205, + "step": 406 + }, + { + "epoch": 0.00014286076577407062, + "grad_norm": 0.3691099286079407, + "learning_rate": 0.0001732220367278798, + "loss": 0.5348, + "step": 407 + }, + { + "epoch": 0.00014321177502658677, + "grad_norm": 0.35860803723335266, + "learning_rate": 0.0001731552587646077, + "loss": 0.6029, + "step": 408 + }, + { + "epoch": 0.0001435627842791029, + "grad_norm": 0.3640693426132202, + "learning_rate": 0.00017308848080133557, + "loss": 0.6673, + "step": 409 + }, + { + "epoch": 0.00014391379353161904, + "grad_norm": 0.3550623953342438, + "learning_rate": 0.00017302170283806345, + "loss": 0.4659, + "step": 410 + }, + { + "epoch": 0.0001442648027841352, + "grad_norm": 0.45885637402534485, + "learning_rate": 0.00017295492487479132, + "loss": 0.4781, + "step": 411 + }, + { + "epoch": 0.00014461581203665134, + "grad_norm": 0.3703556954860687, + "learning_rate": 0.0001728881469115192, + "loss": 0.4829, + "step": 412 + }, + { + "epoch": 0.0001449668212891675, + "grad_norm": 0.5436837077140808, + "learning_rate": 0.0001728213689482471, + "loss": 0.6056, + "step": 413 + }, + { + "epoch": 0.0001453178305416836, + "grad_norm": 0.3953244686126709, + "learning_rate": 0.00017275459098497497, + "loss": 0.4884, + "step": 414 + }, + { + "epoch": 0.00014566883979419976, + "grad_norm": 0.34003904461860657, + "learning_rate": 0.00017268781302170284, + "loss": 0.6014, + "step": 415 + }, + { + "epoch": 0.0001460198490467159, + "grad_norm": 0.3463648557662964, + "learning_rate": 0.0001726210350584307, + "loss": 0.603, + "step": 416 + }, + { + "epoch": 0.00014637085829923206, + "grad_norm": 0.4293590784072876, + "learning_rate": 0.0001725542570951586, + "loss": 0.6686, + "step": 417 + }, + { + "epoch": 0.0001467218675517482, + "grad_norm": 0.4243469834327698, + "learning_rate": 0.0001724874791318865, + "loss": 0.6422, + "step": 418 + }, + { + "epoch": 0.00014707287680426433, + "grad_norm": 0.38327839970588684, + "learning_rate": 0.0001724207011686144, + "loss": 0.5595, + "step": 419 + }, + { + "epoch": 0.00014742388605678048, + "grad_norm": 0.31334301829338074, + "learning_rate": 0.00017235392320534226, + "loss": 0.474, + "step": 420 + }, + { + "epoch": 0.00014777489530929663, + "grad_norm": 0.3335350453853607, + "learning_rate": 0.00017228714524207013, + "loss": 0.6172, + "step": 421 + }, + { + "epoch": 0.00014812590456181278, + "grad_norm": 0.373696506023407, + "learning_rate": 0.000172220367278798, + "loss": 0.6183, + "step": 422 + }, + { + "epoch": 0.00014847691381432893, + "grad_norm": 0.45814886689186096, + "learning_rate": 0.00017215358931552588, + "loss": 0.5059, + "step": 423 + }, + { + "epoch": 0.00014882792306684505, + "grad_norm": 0.3578277826309204, + "learning_rate": 0.00017208681135225378, + "loss": 0.5771, + "step": 424 + }, + { + "epoch": 0.0001491789323193612, + "grad_norm": 0.42081883549690247, + "learning_rate": 0.00017202003338898165, + "loss": 0.5604, + "step": 425 + }, + { + "epoch": 0.00014952994157187735, + "grad_norm": 0.3173503875732422, + "learning_rate": 0.00017195325542570953, + "loss": 0.5738, + "step": 426 + }, + { + "epoch": 0.0001498809508243935, + "grad_norm": 0.38292011618614197, + "learning_rate": 0.0001718864774624374, + "loss": 0.6067, + "step": 427 + }, + { + "epoch": 0.00015023196007690965, + "grad_norm": 0.3518977463245392, + "learning_rate": 0.00017181969949916527, + "loss": 0.5073, + "step": 428 + }, + { + "epoch": 0.00015058296932942577, + "grad_norm": 0.5157706141471863, + "learning_rate": 0.00017175292153589317, + "loss": 0.5496, + "step": 429 + }, + { + "epoch": 0.00015093397858194192, + "grad_norm": 0.32064110040664673, + "learning_rate": 0.00017168614357262105, + "loss": 0.4766, + "step": 430 + }, + { + "epoch": 0.00015128498783445807, + "grad_norm": 0.42229798436164856, + "learning_rate": 0.00017161936560934892, + "loss": 0.5953, + "step": 431 + }, + { + "epoch": 0.00015163599708697422, + "grad_norm": 0.4723895192146301, + "learning_rate": 0.0001715525876460768, + "loss": 0.4783, + "step": 432 + }, + { + "epoch": 0.00015198700633949037, + "grad_norm": 0.3841445744037628, + "learning_rate": 0.00017148580968280467, + "loss": 0.5003, + "step": 433 + }, + { + "epoch": 0.0001523380155920065, + "grad_norm": 0.38026461005210876, + "learning_rate": 0.00017141903171953257, + "loss": 0.5093, + "step": 434 + }, + { + "epoch": 0.00015268902484452264, + "grad_norm": 0.37034904956817627, + "learning_rate": 0.00017135225375626044, + "loss": 0.6158, + "step": 435 + }, + { + "epoch": 0.0001530400340970388, + "grad_norm": 0.3876091241836548, + "learning_rate": 0.00017128547579298834, + "loss": 0.5287, + "step": 436 + }, + { + "epoch": 0.00015339104334955494, + "grad_norm": 0.30055519938468933, + "learning_rate": 0.0001712186978297162, + "loss": 0.5018, + "step": 437 + }, + { + "epoch": 0.0001537420526020711, + "grad_norm": 0.36094966530799866, + "learning_rate": 0.00017115191986644409, + "loss": 0.4961, + "step": 438 + }, + { + "epoch": 0.0001540930618545872, + "grad_norm": 0.3300524055957794, + "learning_rate": 0.00017108514190317196, + "loss": 0.5246, + "step": 439 + }, + { + "epoch": 0.00015444407110710336, + "grad_norm": 0.40980783104896545, + "learning_rate": 0.00017101836393989986, + "loss": 0.5705, + "step": 440 + }, + { + "epoch": 0.0001547950803596195, + "grad_norm": 0.3442326784133911, + "learning_rate": 0.00017095158597662773, + "loss": 0.5595, + "step": 441 + }, + { + "epoch": 0.00015514608961213566, + "grad_norm": 0.48015034198760986, + "learning_rate": 0.0001708848080133556, + "loss": 0.5642, + "step": 442 + }, + { + "epoch": 0.0001554970988646518, + "grad_norm": 0.5570142269134521, + "learning_rate": 0.00017081803005008348, + "loss": 0.6111, + "step": 443 + }, + { + "epoch": 0.00015584810811716793, + "grad_norm": 0.30470094084739685, + "learning_rate": 0.00017075125208681135, + "loss": 0.5151, + "step": 444 + }, + { + "epoch": 0.00015619911736968408, + "grad_norm": 0.31946614384651184, + "learning_rate": 0.00017068447412353925, + "loss": 0.5265, + "step": 445 + }, + { + "epoch": 0.00015655012662220023, + "grad_norm": 0.38980719447135925, + "learning_rate": 0.00017061769616026712, + "loss": 0.575, + "step": 446 + }, + { + "epoch": 0.00015690113587471638, + "grad_norm": 0.4077732264995575, + "learning_rate": 0.000170550918196995, + "loss": 0.5729, + "step": 447 + }, + { + "epoch": 0.00015725214512723253, + "grad_norm": 0.38632732629776, + "learning_rate": 0.00017048414023372287, + "loss": 0.594, + "step": 448 + }, + { + "epoch": 0.00015760315437974865, + "grad_norm": 0.37193921208381653, + "learning_rate": 0.00017041736227045074, + "loss": 0.6062, + "step": 449 + }, + { + "epoch": 0.0001579541636322648, + "grad_norm": 0.399029016494751, + "learning_rate": 0.00017035058430717862, + "loss": 0.4538, + "step": 450 + }, + { + "epoch": 0.00015830517288478095, + "grad_norm": 0.37710487842559814, + "learning_rate": 0.00017028380634390652, + "loss": 0.5615, + "step": 451 + }, + { + "epoch": 0.0001586561821372971, + "grad_norm": 0.38591668009757996, + "learning_rate": 0.0001702170283806344, + "loss": 0.5316, + "step": 452 + }, + { + "epoch": 0.00015900719138981325, + "grad_norm": 0.3453538417816162, + "learning_rate": 0.0001701502504173623, + "loss": 0.4645, + "step": 453 + }, + { + "epoch": 0.00015935820064232937, + "grad_norm": 0.34171512722969055, + "learning_rate": 0.00017008347245409016, + "loss": 0.5856, + "step": 454 + }, + { + "epoch": 0.00015970920989484552, + "grad_norm": 0.39591720700263977, + "learning_rate": 0.00017001669449081804, + "loss": 0.573, + "step": 455 + }, + { + "epoch": 0.00016006021914736167, + "grad_norm": 0.4127822816371918, + "learning_rate": 0.00016994991652754594, + "loss": 0.5183, + "step": 456 + }, + { + "epoch": 0.00016041122839987782, + "grad_norm": 0.37893375754356384, + "learning_rate": 0.0001698831385642738, + "loss": 0.566, + "step": 457 + }, + { + "epoch": 0.00016076223765239397, + "grad_norm": 0.33429333567619324, + "learning_rate": 0.00016981636060100168, + "loss": 0.449, + "step": 458 + }, + { + "epoch": 0.0001611132469049101, + "grad_norm": 0.3333180546760559, + "learning_rate": 0.00016974958263772956, + "loss": 0.4441, + "step": 459 + }, + { + "epoch": 0.00016146425615742624, + "grad_norm": 0.3591359257698059, + "learning_rate": 0.00016968280467445743, + "loss": 0.55, + "step": 460 + }, + { + "epoch": 0.0001618152654099424, + "grad_norm": 0.35390427708625793, + "learning_rate": 0.00016961602671118533, + "loss": 0.6445, + "step": 461 + }, + { + "epoch": 0.00016216627466245854, + "grad_norm": 0.42036697268486023, + "learning_rate": 0.0001695492487479132, + "loss": 0.5411, + "step": 462 + }, + { + "epoch": 0.0001625172839149747, + "grad_norm": 0.42147770524024963, + "learning_rate": 0.00016948247078464108, + "loss": 0.6218, + "step": 463 + }, + { + "epoch": 0.0001628682931674908, + "grad_norm": 0.3960399329662323, + "learning_rate": 0.00016941569282136895, + "loss": 0.6608, + "step": 464 + }, + { + "epoch": 0.00016321930242000696, + "grad_norm": 0.39676985144615173, + "learning_rate": 0.00016934891485809682, + "loss": 0.5838, + "step": 465 + }, + { + "epoch": 0.0001635703116725231, + "grad_norm": 0.2839520573616028, + "learning_rate": 0.0001692821368948247, + "loss": 0.5334, + "step": 466 + }, + { + "epoch": 0.00016392132092503926, + "grad_norm": 0.3654347062110901, + "learning_rate": 0.0001692153589315526, + "loss": 0.6065, + "step": 467 + }, + { + "epoch": 0.0001642723301775554, + "grad_norm": 0.3709166646003723, + "learning_rate": 0.00016914858096828047, + "loss": 0.509, + "step": 468 + }, + { + "epoch": 0.00016462333943007153, + "grad_norm": 0.29224780201911926, + "learning_rate": 0.00016908180300500834, + "loss": 0.5372, + "step": 469 + }, + { + "epoch": 0.00016497434868258768, + "grad_norm": 0.34979283809661865, + "learning_rate": 0.00016901502504173624, + "loss": 0.3968, + "step": 470 + }, + { + "epoch": 0.00016532535793510383, + "grad_norm": 0.34580183029174805, + "learning_rate": 0.00016894824707846412, + "loss": 0.6032, + "step": 471 + }, + { + "epoch": 0.00016567636718761998, + "grad_norm": 0.39046213030815125, + "learning_rate": 0.00016888146911519202, + "loss": 0.5628, + "step": 472 + }, + { + "epoch": 0.00016602737644013613, + "grad_norm": 0.35301411151885986, + "learning_rate": 0.0001688146911519199, + "loss": 0.607, + "step": 473 + }, + { + "epoch": 0.00016637838569265225, + "grad_norm": 0.4572748839855194, + "learning_rate": 0.00016874791318864776, + "loss": 0.5018, + "step": 474 + }, + { + "epoch": 0.0001667293949451684, + "grad_norm": 0.38230374455451965, + "learning_rate": 0.00016868113522537564, + "loss": 0.5026, + "step": 475 + }, + { + "epoch": 0.00016708040419768455, + "grad_norm": 0.37066343426704407, + "learning_rate": 0.0001686143572621035, + "loss": 0.5819, + "step": 476 + }, + { + "epoch": 0.0001674314134502007, + "grad_norm": 0.3658660054206848, + "learning_rate": 0.0001685475792988314, + "loss": 0.6825, + "step": 477 + }, + { + "epoch": 0.00016778242270271685, + "grad_norm": 0.42174890637397766, + "learning_rate": 0.00016848080133555928, + "loss": 0.6065, + "step": 478 + }, + { + "epoch": 0.00016813343195523297, + "grad_norm": 0.3462882936000824, + "learning_rate": 0.00016841402337228716, + "loss": 0.5888, + "step": 479 + }, + { + "epoch": 0.00016848444120774912, + "grad_norm": 0.44681960344314575, + "learning_rate": 0.00016834724540901503, + "loss": 0.4987, + "step": 480 + }, + { + "epoch": 0.00016883545046026527, + "grad_norm": 0.3535650372505188, + "learning_rate": 0.0001682804674457429, + "loss": 0.6478, + "step": 481 + }, + { + "epoch": 0.00016918645971278142, + "grad_norm": 0.3357018232345581, + "learning_rate": 0.00016821368948247077, + "loss": 0.4949, + "step": 482 + }, + { + "epoch": 0.00016953746896529757, + "grad_norm": 0.42756739258766174, + "learning_rate": 0.00016814691151919868, + "loss": 0.6475, + "step": 483 + }, + { + "epoch": 0.0001698884782178137, + "grad_norm": 0.36174866557121277, + "learning_rate": 0.00016808013355592655, + "loss": 0.598, + "step": 484 + }, + { + "epoch": 0.00017023948747032984, + "grad_norm": 0.37115278840065, + "learning_rate": 0.00016801335559265442, + "loss": 0.6215, + "step": 485 + }, + { + "epoch": 0.000170590496722846, + "grad_norm": 0.340249627828598, + "learning_rate": 0.0001679465776293823, + "loss": 0.5702, + "step": 486 + }, + { + "epoch": 0.00017094150597536214, + "grad_norm": 0.31226348876953125, + "learning_rate": 0.0001678797996661102, + "loss": 0.6531, + "step": 487 + }, + { + "epoch": 0.0001712925152278783, + "grad_norm": 0.35571998357772827, + "learning_rate": 0.00016781302170283807, + "loss": 0.6406, + "step": 488 + }, + { + "epoch": 0.00017164352448039441, + "grad_norm": 0.4167378842830658, + "learning_rate": 0.00016774624373956597, + "loss": 0.5111, + "step": 489 + }, + { + "epoch": 0.00017199453373291056, + "grad_norm": 0.292304128408432, + "learning_rate": 0.00016767946577629384, + "loss": 0.6643, + "step": 490 + }, + { + "epoch": 0.0001723455429854267, + "grad_norm": 0.38789069652557373, + "learning_rate": 0.00016761268781302171, + "loss": 0.4542, + "step": 491 + }, + { + "epoch": 0.00017269655223794286, + "grad_norm": 0.33764714002609253, + "learning_rate": 0.0001675459098497496, + "loss": 0.4158, + "step": 492 + }, + { + "epoch": 0.00017304756149045898, + "grad_norm": 0.34849148988723755, + "learning_rate": 0.0001674791318864775, + "loss": 0.4737, + "step": 493 + }, + { + "epoch": 0.00017339857074297513, + "grad_norm": 0.2921352684497833, + "learning_rate": 0.00016741235392320536, + "loss": 0.679, + "step": 494 + }, + { + "epoch": 0.00017374957999549128, + "grad_norm": 0.33746641874313354, + "learning_rate": 0.00016734557595993323, + "loss": 0.4957, + "step": 495 + }, + { + "epoch": 0.00017410058924800743, + "grad_norm": 0.4029395878314972, + "learning_rate": 0.0001672787979966611, + "loss": 0.6708, + "step": 496 + }, + { + "epoch": 0.00017445159850052358, + "grad_norm": 0.440033882856369, + "learning_rate": 0.00016721202003338898, + "loss": 0.5889, + "step": 497 + }, + { + "epoch": 0.0001748026077530397, + "grad_norm": 0.330692857503891, + "learning_rate": 0.00016714524207011685, + "loss": 0.5942, + "step": 498 + }, + { + "epoch": 0.00017515361700555585, + "grad_norm": 0.3111809492111206, + "learning_rate": 0.00016707846410684475, + "loss": 0.5506, + "step": 499 + }, + { + "epoch": 0.000175504626258072, + "grad_norm": 0.38885676860809326, + "learning_rate": 0.00016701168614357263, + "loss": 0.4713, + "step": 500 + }, + { + "epoch": 0.00017585563551058815, + "grad_norm": 0.3697550296783447, + "learning_rate": 0.0001669449081803005, + "loss": 0.5955, + "step": 501 + }, + { + "epoch": 0.0001762066447631043, + "grad_norm": 0.35807061195373535, + "learning_rate": 0.00016687813021702837, + "loss": 0.555, + "step": 502 + }, + { + "epoch": 0.00017655765401562043, + "grad_norm": 0.44033464789390564, + "learning_rate": 0.00016681135225375625, + "loss": 0.5668, + "step": 503 + }, + { + "epoch": 0.00017690866326813657, + "grad_norm": 0.3363400399684906, + "learning_rate": 0.00016674457429048415, + "loss": 0.6176, + "step": 504 + }, + { + "epoch": 0.00017725967252065272, + "grad_norm": 0.31457507610321045, + "learning_rate": 0.00016667779632721202, + "loss": 0.6524, + "step": 505 + }, + { + "epoch": 0.00017761068177316887, + "grad_norm": 0.38115641474723816, + "learning_rate": 0.00016661101836393992, + "loss": 0.5848, + "step": 506 + }, + { + "epoch": 0.00017796169102568502, + "grad_norm": 0.3387603759765625, + "learning_rate": 0.0001665442404006678, + "loss": 0.6992, + "step": 507 + }, + { + "epoch": 0.00017831270027820115, + "grad_norm": 0.31671345233917236, + "learning_rate": 0.00016647746243739567, + "loss": 0.5744, + "step": 508 + }, + { + "epoch": 0.0001786637095307173, + "grad_norm": 0.3776471018791199, + "learning_rate": 0.00016641068447412357, + "loss": 0.622, + "step": 509 + }, + { + "epoch": 0.00017901471878323344, + "grad_norm": 0.37572941184043884, + "learning_rate": 0.00016634390651085144, + "loss": 0.5259, + "step": 510 + }, + { + "epoch": 0.0001793657280357496, + "grad_norm": 0.3335510194301605, + "learning_rate": 0.0001662771285475793, + "loss": 0.547, + "step": 511 + }, + { + "epoch": 0.00017971673728826574, + "grad_norm": 0.33241015672683716, + "learning_rate": 0.00016621035058430719, + "loss": 0.5827, + "step": 512 + }, + { + "epoch": 0.00018006774654078187, + "grad_norm": 0.3761122524738312, + "learning_rate": 0.00016614357262103506, + "loss": 0.6962, + "step": 513 + }, + { + "epoch": 0.00018041875579329802, + "grad_norm": 0.4172234833240509, + "learning_rate": 0.00016607679465776293, + "loss": 0.4922, + "step": 514 + }, + { + "epoch": 0.00018076976504581416, + "grad_norm": 0.45372599363327026, + "learning_rate": 0.00016601001669449083, + "loss": 0.5804, + "step": 515 + }, + { + "epoch": 0.00018112077429833031, + "grad_norm": 0.3854759931564331, + "learning_rate": 0.0001659432387312187, + "loss": 0.6026, + "step": 516 + }, + { + "epoch": 0.00018147178355084646, + "grad_norm": 0.3399171829223633, + "learning_rate": 0.00016587646076794658, + "loss": 0.4773, + "step": 517 + }, + { + "epoch": 0.00018182279280336259, + "grad_norm": 0.36649778485298157, + "learning_rate": 0.00016580968280467445, + "loss": 0.59, + "step": 518 + }, + { + "epoch": 0.00018217380205587874, + "grad_norm": 0.39988765120506287, + "learning_rate": 0.00016574290484140233, + "loss": 0.6094, + "step": 519 + }, + { + "epoch": 0.00018252481130839489, + "grad_norm": 0.34659436345100403, + "learning_rate": 0.00016567612687813023, + "loss": 0.4832, + "step": 520 + }, + { + "epoch": 0.00018287582056091103, + "grad_norm": 0.3742654025554657, + "learning_rate": 0.0001656093489148581, + "loss": 0.413, + "step": 521 + }, + { + "epoch": 0.00018322682981342718, + "grad_norm": 0.43068456649780273, + "learning_rate": 0.00016554257095158597, + "loss": 0.6576, + "step": 522 + }, + { + "epoch": 0.0001835778390659433, + "grad_norm": 0.42455193400382996, + "learning_rate": 0.00016547579298831387, + "loss": 0.5897, + "step": 523 + }, + { + "epoch": 0.00018392884831845946, + "grad_norm": 0.3290526568889618, + "learning_rate": 0.00016540901502504175, + "loss": 0.4022, + "step": 524 + }, + { + "epoch": 0.0001842798575709756, + "grad_norm": 0.3744141161441803, + "learning_rate": 0.00016534223706176965, + "loss": 0.5577, + "step": 525 + }, + { + "epoch": 0.00018463086682349176, + "grad_norm": 0.3516618609428406, + "learning_rate": 0.00016527545909849752, + "loss": 0.5481, + "step": 526 + }, + { + "epoch": 0.0001849818760760079, + "grad_norm": 0.3591526448726654, + "learning_rate": 0.0001652086811352254, + "loss": 0.6339, + "step": 527 + }, + { + "epoch": 0.00018533288532852403, + "grad_norm": 0.4024425745010376, + "learning_rate": 0.00016514190317195327, + "loss": 0.5268, + "step": 528 + }, + { + "epoch": 0.00018568389458104018, + "grad_norm": 0.3502136766910553, + "learning_rate": 0.00016507512520868114, + "loss": 0.5112, + "step": 529 + }, + { + "epoch": 0.00018603490383355633, + "grad_norm": 0.3338727056980133, + "learning_rate": 0.00016500834724540904, + "loss": 0.5623, + "step": 530 + }, + { + "epoch": 0.00018638591308607248, + "grad_norm": 0.43554845452308655, + "learning_rate": 0.0001649415692821369, + "loss": 0.5853, + "step": 531 + }, + { + "epoch": 0.00018673692233858862, + "grad_norm": 0.34424322843551636, + "learning_rate": 0.00016487479131886478, + "loss": 0.4951, + "step": 532 + }, + { + "epoch": 0.00018708793159110475, + "grad_norm": 0.4424237012863159, + "learning_rate": 0.00016480801335559266, + "loss": 0.4576, + "step": 533 + }, + { + "epoch": 0.0001874389408436209, + "grad_norm": 0.4616681933403015, + "learning_rate": 0.00016474123539232053, + "loss": 0.4974, + "step": 534 + }, + { + "epoch": 0.00018778995009613705, + "grad_norm": 0.3599206507205963, + "learning_rate": 0.0001646744574290484, + "loss": 0.5987, + "step": 535 + }, + { + "epoch": 0.0001881409593486532, + "grad_norm": 0.40468478202819824, + "learning_rate": 0.0001646076794657763, + "loss": 0.5914, + "step": 536 + }, + { + "epoch": 0.00018849196860116935, + "grad_norm": 0.5389227271080017, + "learning_rate": 0.00016454090150250418, + "loss": 0.6459, + "step": 537 + }, + { + "epoch": 0.00018884297785368547, + "grad_norm": 0.3493568003177643, + "learning_rate": 0.00016447412353923205, + "loss": 0.5191, + "step": 538 + }, + { + "epoch": 0.00018919398710620162, + "grad_norm": 0.31237804889678955, + "learning_rate": 0.00016440734557595992, + "loss": 0.4819, + "step": 539 + }, + { + "epoch": 0.00018954499635871777, + "grad_norm": 0.31142041087150574, + "learning_rate": 0.00016434056761268782, + "loss": 0.5659, + "step": 540 + }, + { + "epoch": 0.00018989600561123392, + "grad_norm": 0.3323245644569397, + "learning_rate": 0.0001642737896494157, + "loss": 0.5779, + "step": 541 + }, + { + "epoch": 0.00019024701486375007, + "grad_norm": 0.3679036498069763, + "learning_rate": 0.0001642070116861436, + "loss": 0.6919, + "step": 542 + }, + { + "epoch": 0.0001905980241162662, + "grad_norm": 0.3094903528690338, + "learning_rate": 0.00016414023372287147, + "loss": 0.4773, + "step": 543 + }, + { + "epoch": 0.00019094903336878234, + "grad_norm": 0.37995582818984985, + "learning_rate": 0.00016407345575959934, + "loss": 0.539, + "step": 544 + }, + { + "epoch": 0.0001913000426212985, + "grad_norm": 0.46415746212005615, + "learning_rate": 0.00016400667779632722, + "loss": 0.6708, + "step": 545 + }, + { + "epoch": 0.00019165105187381464, + "grad_norm": 0.3479398190975189, + "learning_rate": 0.00016393989983305512, + "loss": 0.5496, + "step": 546 + }, + { + "epoch": 0.00019200206112633079, + "grad_norm": 0.3740891218185425, + "learning_rate": 0.000163873121869783, + "loss": 0.6256, + "step": 547 + }, + { + "epoch": 0.0001923530703788469, + "grad_norm": 0.4934074878692627, + "learning_rate": 0.00016380634390651086, + "loss": 0.6788, + "step": 548 + }, + { + "epoch": 0.00019270407963136306, + "grad_norm": 0.42659157514572144, + "learning_rate": 0.00016373956594323874, + "loss": 0.5981, + "step": 549 + }, + { + "epoch": 0.0001930550888838792, + "grad_norm": 0.35727575421333313, + "learning_rate": 0.0001636727879799666, + "loss": 0.4095, + "step": 550 + }, + { + "epoch": 0.00019340609813639536, + "grad_norm": 0.4294300377368927, + "learning_rate": 0.00016360601001669448, + "loss": 0.5386, + "step": 551 + }, + { + "epoch": 0.0001937571073889115, + "grad_norm": 0.33482253551483154, + "learning_rate": 0.00016353923205342238, + "loss": 0.4901, + "step": 552 + }, + { + "epoch": 0.00019410811664142763, + "grad_norm": 0.3379746079444885, + "learning_rate": 0.00016347245409015026, + "loss": 0.5454, + "step": 553 + }, + { + "epoch": 0.00019445912589394378, + "grad_norm": 0.42393919825553894, + "learning_rate": 0.00016340567612687813, + "loss": 0.5959, + "step": 554 + }, + { + "epoch": 0.00019481013514645993, + "grad_norm": 0.31975501775741577, + "learning_rate": 0.000163338898163606, + "loss": 0.6048, + "step": 555 + }, + { + "epoch": 0.00019516114439897608, + "grad_norm": 0.43404972553253174, + "learning_rate": 0.00016327212020033388, + "loss": 0.6252, + "step": 556 + }, + { + "epoch": 0.00019551215365149223, + "grad_norm": 0.3559292256832123, + "learning_rate": 0.00016320534223706178, + "loss": 0.6036, + "step": 557 + }, + { + "epoch": 0.00019586316290400835, + "grad_norm": 0.3134891092777252, + "learning_rate": 0.00016313856427378965, + "loss": 0.5656, + "step": 558 + }, + { + "epoch": 0.0001962141721565245, + "grad_norm": 0.32056671380996704, + "learning_rate": 0.00016307178631051755, + "loss": 0.6509, + "step": 559 + }, + { + "epoch": 0.00019656518140904065, + "grad_norm": 0.46249130368232727, + "learning_rate": 0.00016300500834724542, + "loss": 0.6379, + "step": 560 + }, + { + "epoch": 0.0001969161906615568, + "grad_norm": 0.36366966366767883, + "learning_rate": 0.0001629382303839733, + "loss": 0.5334, + "step": 561 + }, + { + "epoch": 0.00019726719991407295, + "grad_norm": 0.4234124422073364, + "learning_rate": 0.0001628714524207012, + "loss": 0.4864, + "step": 562 + }, + { + "epoch": 0.00019761820916658907, + "grad_norm": 0.3687801659107208, + "learning_rate": 0.00016280467445742907, + "loss": 0.4855, + "step": 563 + }, + { + "epoch": 0.00019796921841910522, + "grad_norm": 0.37247028946876526, + "learning_rate": 0.00016273789649415694, + "loss": 0.6215, + "step": 564 + }, + { + "epoch": 0.00019832022767162137, + "grad_norm": 0.30445635318756104, + "learning_rate": 0.00016267111853088482, + "loss": 0.5741, + "step": 565 + }, + { + "epoch": 0.00019867123692413752, + "grad_norm": 0.3349187970161438, + "learning_rate": 0.0001626043405676127, + "loss": 0.4524, + "step": 566 + }, + { + "epoch": 0.00019902224617665367, + "grad_norm": 0.36938101053237915, + "learning_rate": 0.00016253756260434056, + "loss": 0.5046, + "step": 567 + }, + { + "epoch": 0.0001993732554291698, + "grad_norm": 0.37673529982566833, + "learning_rate": 0.00016247078464106846, + "loss": 0.5001, + "step": 568 + }, + { + "epoch": 0.00019972426468168594, + "grad_norm": 0.3571556508541107, + "learning_rate": 0.00016240400667779634, + "loss": 0.6419, + "step": 569 + }, + { + "epoch": 0.0002000752739342021, + "grad_norm": 0.35543423891067505, + "learning_rate": 0.0001623372287145242, + "loss": 0.6191, + "step": 570 + }, + { + "epoch": 0.00020042628318671824, + "grad_norm": 0.3096729516983032, + "learning_rate": 0.00016227045075125208, + "loss": 0.5373, + "step": 571 + }, + { + "epoch": 0.0002007772924392344, + "grad_norm": 0.30310383439064026, + "learning_rate": 0.00016220367278797996, + "loss": 0.558, + "step": 572 + }, + { + "epoch": 0.0002011283016917505, + "grad_norm": 0.3616211712360382, + "learning_rate": 0.00016213689482470786, + "loss": 0.6504, + "step": 573 + }, + { + "epoch": 0.00020147931094426666, + "grad_norm": 0.34818220138549805, + "learning_rate": 0.00016207011686143573, + "loss": 0.6136, + "step": 574 + }, + { + "epoch": 0.0002018303201967828, + "grad_norm": 0.36225444078445435, + "learning_rate": 0.0001620033388981636, + "loss": 0.4905, + "step": 575 + }, + { + "epoch": 0.00020218132944929896, + "grad_norm": 0.40039536356925964, + "learning_rate": 0.0001619365609348915, + "loss": 0.5997, + "step": 576 + }, + { + "epoch": 0.0002025323387018151, + "grad_norm": 0.33715930581092834, + "learning_rate": 0.00016186978297161938, + "loss": 0.5284, + "step": 577 + }, + { + "epoch": 0.00020288334795433123, + "grad_norm": 0.4137067198753357, + "learning_rate": 0.00016180300500834728, + "loss": 0.6873, + "step": 578 + }, + { + "epoch": 0.00020323435720684738, + "grad_norm": 0.41598305106163025, + "learning_rate": 0.00016173622704507515, + "loss": 0.491, + "step": 579 + }, + { + "epoch": 0.00020358536645936353, + "grad_norm": 0.5466423034667969, + "learning_rate": 0.00016166944908180302, + "loss": 0.6188, + "step": 580 + }, + { + "epoch": 0.00020393637571187968, + "grad_norm": 0.3718060851097107, + "learning_rate": 0.0001616026711185309, + "loss": 0.5573, + "step": 581 + }, + { + "epoch": 0.00020428738496439583, + "grad_norm": 0.33747225999832153, + "learning_rate": 0.00016153589315525877, + "loss": 0.4887, + "step": 582 + }, + { + "epoch": 0.00020463839421691195, + "grad_norm": 0.36478081345558167, + "learning_rate": 0.00016146911519198664, + "loss": 0.553, + "step": 583 + }, + { + "epoch": 0.0002049894034694281, + "grad_norm": 0.38441962003707886, + "learning_rate": 0.00016140233722871454, + "loss": 0.4833, + "step": 584 + }, + { + "epoch": 0.00020534041272194425, + "grad_norm": 0.45594358444213867, + "learning_rate": 0.00016133555926544241, + "loss": 0.5877, + "step": 585 + }, + { + "epoch": 0.0002056914219744604, + "grad_norm": 0.356517493724823, + "learning_rate": 0.0001612687813021703, + "loss": 0.5614, + "step": 586 + }, + { + "epoch": 0.00020604243122697655, + "grad_norm": 0.4051963686943054, + "learning_rate": 0.00016120200333889816, + "loss": 0.5208, + "step": 587 + }, + { + "epoch": 0.00020639344047949267, + "grad_norm": 0.36947959661483765, + "learning_rate": 0.00016113522537562603, + "loss": 0.4385, + "step": 588 + }, + { + "epoch": 0.00020674444973200882, + "grad_norm": 0.45947200059890747, + "learning_rate": 0.00016106844741235393, + "loss": 0.4972, + "step": 589 + }, + { + "epoch": 0.00020709545898452497, + "grad_norm": 0.40610602498054504, + "learning_rate": 0.0001610016694490818, + "loss": 0.4022, + "step": 590 + }, + { + "epoch": 0.00020744646823704112, + "grad_norm": 0.3529384732246399, + "learning_rate": 0.00016093489148580968, + "loss": 0.5222, + "step": 591 + }, + { + "epoch": 0.00020779747748955727, + "grad_norm": 0.35114821791648865, + "learning_rate": 0.00016086811352253755, + "loss": 0.6224, + "step": 592 + }, + { + "epoch": 0.0002081484867420734, + "grad_norm": 0.3596336841583252, + "learning_rate": 0.00016080133555926545, + "loss": 0.5081, + "step": 593 + }, + { + "epoch": 0.00020849949599458954, + "grad_norm": 0.4214174747467041, + "learning_rate": 0.00016073455759599333, + "loss": 0.5189, + "step": 594 + }, + { + "epoch": 0.0002088505052471057, + "grad_norm": 0.39635175466537476, + "learning_rate": 0.00016066777963272123, + "loss": 0.582, + "step": 595 + }, + { + "epoch": 0.00020920151449962184, + "grad_norm": 0.36160576343536377, + "learning_rate": 0.0001606010016694491, + "loss": 0.568, + "step": 596 + }, + { + "epoch": 0.000209552523752138, + "grad_norm": 0.4242927134037018, + "learning_rate": 0.00016053422370617697, + "loss": 0.6235, + "step": 597 + }, + { + "epoch": 0.0002099035330046541, + "grad_norm": 0.4257853925228119, + "learning_rate": 0.00016046744574290485, + "loss": 0.5294, + "step": 598 + }, + { + "epoch": 0.00021025454225717026, + "grad_norm": 0.3890500068664551, + "learning_rate": 0.00016040066777963272, + "loss": 0.6224, + "step": 599 + }, + { + "epoch": 0.0002106055515096864, + "grad_norm": 0.2971879541873932, + "learning_rate": 0.00016033388981636062, + "loss": 0.5951, + "step": 600 + }, + { + "epoch": 0.00021095656076220256, + "grad_norm": 0.29551970958709717, + "learning_rate": 0.0001602671118530885, + "loss": 0.6713, + "step": 601 + }, + { + "epoch": 0.00021130757001471868, + "grad_norm": 0.31588122248649597, + "learning_rate": 0.00016020033388981637, + "loss": 0.6384, + "step": 602 + }, + { + "epoch": 0.00021165857926723483, + "grad_norm": 0.3138657510280609, + "learning_rate": 0.00016013355592654424, + "loss": 0.5846, + "step": 603 + }, + { + "epoch": 0.00021200958851975098, + "grad_norm": 0.31286585330963135, + "learning_rate": 0.0001600667779632721, + "loss": 0.6236, + "step": 604 + }, + { + "epoch": 0.00021236059777226713, + "grad_norm": 0.32098105549812317, + "learning_rate": 0.00016, + "loss": 0.4926, + "step": 605 + }, + { + "epoch": 0.00021271160702478328, + "grad_norm": 0.371427446603775, + "learning_rate": 0.00015993322203672789, + "loss": 0.6205, + "step": 606 + }, + { + "epoch": 0.0002130626162772994, + "grad_norm": 0.28764042258262634, + "learning_rate": 0.00015986644407345576, + "loss": 0.449, + "step": 607 + }, + { + "epoch": 0.00021341362552981555, + "grad_norm": 0.35086238384246826, + "learning_rate": 0.00015979966611018363, + "loss": 0.549, + "step": 608 + }, + { + "epoch": 0.0002137646347823317, + "grad_norm": 0.3118048906326294, + "learning_rate": 0.0001597328881469115, + "loss": 0.6037, + "step": 609 + }, + { + "epoch": 0.00021411564403484785, + "grad_norm": 0.3894517123699188, + "learning_rate": 0.0001596661101836394, + "loss": 0.5989, + "step": 610 + }, + { + "epoch": 0.000214466653287364, + "grad_norm": 0.39642322063446045, + "learning_rate": 0.00015959933222036728, + "loss": 0.566, + "step": 611 + }, + { + "epoch": 0.00021481766253988012, + "grad_norm": 0.35333508253097534, + "learning_rate": 0.00015953255425709518, + "loss": 0.5055, + "step": 612 + }, + { + "epoch": 0.00021516867179239627, + "grad_norm": 0.39200490713119507, + "learning_rate": 0.00015946577629382305, + "loss": 0.5951, + "step": 613 + }, + { + "epoch": 0.00021551968104491242, + "grad_norm": 0.38436442613601685, + "learning_rate": 0.00015939899833055093, + "loss": 0.4876, + "step": 614 + }, + { + "epoch": 0.00021587069029742857, + "grad_norm": 0.3397504389286041, + "learning_rate": 0.0001593322203672788, + "loss": 0.6287, + "step": 615 + }, + { + "epoch": 0.00021622169954994472, + "grad_norm": 0.35870012640953064, + "learning_rate": 0.0001592654424040067, + "loss": 0.5857, + "step": 616 + }, + { + "epoch": 0.00021657270880246084, + "grad_norm": 0.31163597106933594, + "learning_rate": 0.00015919866444073457, + "loss": 0.4831, + "step": 617 + }, + { + "epoch": 0.000216923718054977, + "grad_norm": 0.35106539726257324, + "learning_rate": 0.00015913188647746245, + "loss": 0.5776, + "step": 618 + }, + { + "epoch": 0.00021727472730749314, + "grad_norm": 0.3639923334121704, + "learning_rate": 0.00015906510851419032, + "loss": 0.5039, + "step": 619 + }, + { + "epoch": 0.0002176257365600093, + "grad_norm": 0.3622918128967285, + "learning_rate": 0.0001589983305509182, + "loss": 0.6293, + "step": 620 + }, + { + "epoch": 0.00021797674581252544, + "grad_norm": 0.3899349868297577, + "learning_rate": 0.0001589315525876461, + "loss": 0.567, + "step": 621 + }, + { + "epoch": 0.00021832775506504156, + "grad_norm": 0.3834361732006073, + "learning_rate": 0.00015886477462437397, + "loss": 0.5106, + "step": 622 + }, + { + "epoch": 0.0002186787643175577, + "grad_norm": 0.34996962547302246, + "learning_rate": 0.00015879799666110184, + "loss": 0.5155, + "step": 623 + }, + { + "epoch": 0.00021902977357007386, + "grad_norm": 0.47908079624176025, + "learning_rate": 0.0001587312186978297, + "loss": 0.4529, + "step": 624 + }, + { + "epoch": 0.00021938078282259, + "grad_norm": 0.3167901635169983, + "learning_rate": 0.00015866444073455758, + "loss": 0.6075, + "step": 625 + }, + { + "epoch": 0.00021973179207510616, + "grad_norm": 0.4254927337169647, + "learning_rate": 0.00015859766277128548, + "loss": 0.6404, + "step": 626 + }, + { + "epoch": 0.00022008280132762228, + "grad_norm": 0.4317469000816345, + "learning_rate": 0.00015853088480801336, + "loss": 0.5881, + "step": 627 + }, + { + "epoch": 0.00022043381058013843, + "grad_norm": 0.4441644251346588, + "learning_rate": 0.00015846410684474123, + "loss": 0.5864, + "step": 628 + }, + { + "epoch": 0.00022078481983265458, + "grad_norm": 0.37883102893829346, + "learning_rate": 0.00015839732888146913, + "loss": 0.5664, + "step": 629 + }, + { + "epoch": 0.00022113582908517073, + "grad_norm": 0.35548868775367737, + "learning_rate": 0.000158330550918197, + "loss": 0.5712, + "step": 630 + }, + { + "epoch": 0.00022148683833768688, + "grad_norm": 0.31588616967201233, + "learning_rate": 0.00015826377295492488, + "loss": 0.4856, + "step": 631 + }, + { + "epoch": 0.000221837847590203, + "grad_norm": 0.3186424672603607, + "learning_rate": 0.00015819699499165278, + "loss": 0.542, + "step": 632 + }, + { + "epoch": 0.00022218885684271915, + "grad_norm": 0.41098466515541077, + "learning_rate": 0.00015813021702838065, + "loss": 0.6311, + "step": 633 + }, + { + "epoch": 0.0002225398660952353, + "grad_norm": 0.413401335477829, + "learning_rate": 0.00015806343906510852, + "loss": 0.5036, + "step": 634 + }, + { + "epoch": 0.00022289087534775145, + "grad_norm": 0.34203773736953735, + "learning_rate": 0.0001579966611018364, + "loss": 0.5508, + "step": 635 + }, + { + "epoch": 0.0002232418846002676, + "grad_norm": 0.34416648745536804, + "learning_rate": 0.00015792988313856427, + "loss": 0.5442, + "step": 636 + }, + { + "epoch": 0.00022359289385278372, + "grad_norm": 0.3439941704273224, + "learning_rate": 0.00015786310517529217, + "loss": 0.4969, + "step": 637 + }, + { + "epoch": 0.00022394390310529987, + "grad_norm": 0.3547762930393219, + "learning_rate": 0.00015779632721202004, + "loss": 0.5564, + "step": 638 + }, + { + "epoch": 0.00022429491235781602, + "grad_norm": 0.35666894912719727, + "learning_rate": 0.00015772954924874792, + "loss": 0.4759, + "step": 639 + }, + { + "epoch": 0.00022464592161033217, + "grad_norm": 0.3175058364868164, + "learning_rate": 0.0001576627712854758, + "loss": 0.5708, + "step": 640 + }, + { + "epoch": 0.00022499693086284832, + "grad_norm": 0.4329943358898163, + "learning_rate": 0.00015759599332220366, + "loss": 0.5293, + "step": 641 + }, + { + "epoch": 0.00022534794011536444, + "grad_norm": 0.5703821778297424, + "learning_rate": 0.00015752921535893156, + "loss": 0.6187, + "step": 642 + }, + { + "epoch": 0.0002256989493678806, + "grad_norm": 0.32244032621383667, + "learning_rate": 0.00015746243739565944, + "loss": 0.4847, + "step": 643 + }, + { + "epoch": 0.00022604995862039674, + "grad_norm": 0.36224085092544556, + "learning_rate": 0.0001573956594323873, + "loss": 0.6804, + "step": 644 + }, + { + "epoch": 0.0002264009678729129, + "grad_norm": 0.3316931426525116, + "learning_rate": 0.0001573288814691152, + "loss": 0.6413, + "step": 645 + }, + { + "epoch": 0.00022675197712542904, + "grad_norm": 0.38156425952911377, + "learning_rate": 0.00015726210350584308, + "loss": 0.5659, + "step": 646 + }, + { + "epoch": 0.00022710298637794516, + "grad_norm": 0.48353493213653564, + "learning_rate": 0.00015719532554257096, + "loss": 0.5788, + "step": 647 + }, + { + "epoch": 0.00022745399563046131, + "grad_norm": 0.3913673758506775, + "learning_rate": 0.00015712854757929886, + "loss": 0.6899, + "step": 648 + }, + { + "epoch": 0.00022780500488297746, + "grad_norm": 0.46836981177330017, + "learning_rate": 0.00015706176961602673, + "loss": 0.5712, + "step": 649 + }, + { + "epoch": 0.0002281560141354936, + "grad_norm": 0.34713172912597656, + "learning_rate": 0.0001569949916527546, + "loss": 0.381, + "step": 650 + }, + { + "epoch": 0.00022850702338800976, + "grad_norm": 0.3837398886680603, + "learning_rate": 0.00015692821368948248, + "loss": 0.5236, + "step": 651 + }, + { + "epoch": 0.00022885803264052589, + "grad_norm": 0.5181556940078735, + "learning_rate": 0.00015686143572621035, + "loss": 0.5889, + "step": 652 + }, + { + "epoch": 0.00022920904189304203, + "grad_norm": 0.42713961005210876, + "learning_rate": 0.00015679465776293825, + "loss": 0.5346, + "step": 653 + }, + { + "epoch": 0.00022956005114555818, + "grad_norm": 0.2868479788303375, + "learning_rate": 0.00015672787979966612, + "loss": 0.5546, + "step": 654 + }, + { + "epoch": 0.00022991106039807433, + "grad_norm": 0.31901800632476807, + "learning_rate": 0.000156661101836394, + "loss": 0.5014, + "step": 655 + }, + { + "epoch": 0.00023026206965059048, + "grad_norm": 0.41681963205337524, + "learning_rate": 0.00015659432387312187, + "loss": 0.5709, + "step": 656 + }, + { + "epoch": 0.0002306130789031066, + "grad_norm": 0.5942090749740601, + "learning_rate": 0.00015652754590984974, + "loss": 0.6022, + "step": 657 + }, + { + "epoch": 0.00023096408815562276, + "grad_norm": 0.405391126871109, + "learning_rate": 0.00015646076794657764, + "loss": 0.5363, + "step": 658 + }, + { + "epoch": 0.0002313150974081389, + "grad_norm": 0.3201390206813812, + "learning_rate": 0.00015639398998330552, + "loss": 0.6045, + "step": 659 + }, + { + "epoch": 0.00023166610666065505, + "grad_norm": 0.2989407479763031, + "learning_rate": 0.0001563272120200334, + "loss": 0.5604, + "step": 660 + }, + { + "epoch": 0.0002320171159131712, + "grad_norm": 0.3919268548488617, + "learning_rate": 0.00015626043405676126, + "loss": 0.5413, + "step": 661 + }, + { + "epoch": 0.00023236812516568733, + "grad_norm": 0.4080122709274292, + "learning_rate": 0.00015619365609348916, + "loss": 0.498, + "step": 662 + }, + { + "epoch": 0.00023271913441820348, + "grad_norm": 0.38974156975746155, + "learning_rate": 0.00015612687813021704, + "loss": 0.6149, + "step": 663 + }, + { + "epoch": 0.00023307014367071962, + "grad_norm": 0.3145015835762024, + "learning_rate": 0.00015606010016694494, + "loss": 0.4886, + "step": 664 + }, + { + "epoch": 0.00023342115292323577, + "grad_norm": 0.3009328246116638, + "learning_rate": 0.0001559933222036728, + "loss": 0.5534, + "step": 665 + }, + { + "epoch": 0.00023377216217575192, + "grad_norm": 0.4774717092514038, + "learning_rate": 0.00015592654424040068, + "loss": 0.6006, + "step": 666 + }, + { + "epoch": 0.00023412317142826805, + "grad_norm": 0.32965418696403503, + "learning_rate": 0.00015585976627712856, + "loss": 0.5463, + "step": 667 + }, + { + "epoch": 0.0002344741806807842, + "grad_norm": 0.3066554665565491, + "learning_rate": 0.00015579298831385643, + "loss": 0.5675, + "step": 668 + }, + { + "epoch": 0.00023482518993330035, + "grad_norm": 0.3879207372665405, + "learning_rate": 0.00015572621035058433, + "loss": 0.5825, + "step": 669 + }, + { + "epoch": 0.0002351761991858165, + "grad_norm": 0.3171943128108978, + "learning_rate": 0.0001556594323873122, + "loss": 0.5677, + "step": 670 + }, + { + "epoch": 0.00023552720843833264, + "grad_norm": 0.36982622742652893, + "learning_rate": 0.00015559265442404007, + "loss": 0.5885, + "step": 671 + }, + { + "epoch": 0.00023587821769084877, + "grad_norm": 0.30437183380126953, + "learning_rate": 0.00015552587646076795, + "loss": 0.6288, + "step": 672 + }, + { + "epoch": 0.00023622922694336492, + "grad_norm": 0.30654504895210266, + "learning_rate": 0.00015545909849749582, + "loss": 0.5924, + "step": 673 + }, + { + "epoch": 0.00023658023619588107, + "grad_norm": 0.3771214783191681, + "learning_rate": 0.00015539232053422372, + "loss": 0.4901, + "step": 674 + }, + { + "epoch": 0.00023693124544839721, + "grad_norm": 0.3018699884414673, + "learning_rate": 0.0001553255425709516, + "loss": 0.6159, + "step": 675 + }, + { + "epoch": 0.00023728225470091336, + "grad_norm": 0.32899734377861023, + "learning_rate": 0.00015525876460767947, + "loss": 0.6197, + "step": 676 + }, + { + "epoch": 0.0002376332639534295, + "grad_norm": 0.31837883591651917, + "learning_rate": 0.00015519198664440734, + "loss": 0.5449, + "step": 677 + }, + { + "epoch": 0.00023798427320594564, + "grad_norm": 0.35326528549194336, + "learning_rate": 0.00015512520868113521, + "loss": 0.6315, + "step": 678 + }, + { + "epoch": 0.00023833528245846179, + "grad_norm": 0.3714829385280609, + "learning_rate": 0.00015505843071786311, + "loss": 0.6352, + "step": 679 + }, + { + "epoch": 0.00023868629171097794, + "grad_norm": 0.4002094864845276, + "learning_rate": 0.000154991652754591, + "loss": 0.4235, + "step": 680 + }, + { + "epoch": 0.00023903730096349408, + "grad_norm": 0.3382783532142639, + "learning_rate": 0.0001549248747913189, + "loss": 0.5476, + "step": 681 + }, + { + "epoch": 0.0002393883102160102, + "grad_norm": 0.2985747158527374, + "learning_rate": 0.00015485809682804676, + "loss": 0.5684, + "step": 682 + }, + { + "epoch": 0.00023973931946852636, + "grad_norm": 0.3288929760456085, + "learning_rate": 0.00015479131886477463, + "loss": 0.5657, + "step": 683 + }, + { + "epoch": 0.0002400903287210425, + "grad_norm": 0.39641210436820984, + "learning_rate": 0.0001547245409015025, + "loss": 0.6283, + "step": 684 + }, + { + "epoch": 0.00024044133797355866, + "grad_norm": 0.37413230538368225, + "learning_rate": 0.0001546577629382304, + "loss": 0.5778, + "step": 685 + }, + { + "epoch": 0.0002407923472260748, + "grad_norm": 0.28837504982948303, + "learning_rate": 0.00015459098497495828, + "loss": 0.5079, + "step": 686 + }, + { + "epoch": 0.00024114335647859093, + "grad_norm": 0.32851526141166687, + "learning_rate": 0.00015452420701168615, + "loss": 0.649, + "step": 687 + }, + { + "epoch": 0.00024149436573110708, + "grad_norm": 0.3848758637905121, + "learning_rate": 0.00015445742904841403, + "loss": 0.6099, + "step": 688 + }, + { + "epoch": 0.00024184537498362323, + "grad_norm": 0.35494935512542725, + "learning_rate": 0.0001543906510851419, + "loss": 0.6498, + "step": 689 + }, + { + "epoch": 0.00024219638423613938, + "grad_norm": 0.3431280553340912, + "learning_rate": 0.0001543238731218698, + "loss": 0.4934, + "step": 690 + }, + { + "epoch": 0.00024254739348865553, + "grad_norm": 0.33980974555015564, + "learning_rate": 0.00015425709515859767, + "loss": 0.5556, + "step": 691 + }, + { + "epoch": 0.00024289840274117165, + "grad_norm": 0.3086068034172058, + "learning_rate": 0.00015419031719532555, + "loss": 0.5955, + "step": 692 + }, + { + "epoch": 0.0002432494119936878, + "grad_norm": 0.33093178272247314, + "learning_rate": 0.00015412353923205342, + "loss": 0.5926, + "step": 693 + }, + { + "epoch": 0.00024360042124620395, + "grad_norm": 0.3660534620285034, + "learning_rate": 0.0001540567612687813, + "loss": 0.5494, + "step": 694 + }, + { + "epoch": 0.0002439514304987201, + "grad_norm": 0.29803964495658875, + "learning_rate": 0.0001539899833055092, + "loss": 0.6074, + "step": 695 + }, + { + "epoch": 0.00024430243975123625, + "grad_norm": 0.36542224884033203, + "learning_rate": 0.00015392320534223707, + "loss": 0.59, + "step": 696 + }, + { + "epoch": 0.00024465344900375237, + "grad_norm": 0.34015166759490967, + "learning_rate": 0.00015385642737896494, + "loss": 0.6029, + "step": 697 + }, + { + "epoch": 0.00024500445825626854, + "grad_norm": 0.3211725950241089, + "learning_rate": 0.00015378964941569284, + "loss": 0.535, + "step": 698 + }, + { + "epoch": 0.00024535546750878467, + "grad_norm": 0.37027183175086975, + "learning_rate": 0.0001537228714524207, + "loss": 0.6265, + "step": 699 + }, + { + "epoch": 0.0002457064767613008, + "grad_norm": 0.3447396159172058, + "learning_rate": 0.00015365609348914859, + "loss": 0.6061, + "step": 700 + }, + { + "epoch": 0.00024605748601381697, + "grad_norm": 0.3344075679779053, + "learning_rate": 0.00015358931552587649, + "loss": 0.5412, + "step": 701 + }, + { + "epoch": 0.0002464084952663331, + "grad_norm": 0.29049620032310486, + "learning_rate": 0.00015352253756260436, + "loss": 0.5137, + "step": 702 + }, + { + "epoch": 0.00024675950451884926, + "grad_norm": 0.37048932909965515, + "learning_rate": 0.00015345575959933223, + "loss": 0.6118, + "step": 703 + }, + { + "epoch": 0.0002471105137713654, + "grad_norm": 0.38212522864341736, + "learning_rate": 0.0001533889816360601, + "loss": 0.466, + "step": 704 + }, + { + "epoch": 0.0002474615230238815, + "grad_norm": 0.3576483428478241, + "learning_rate": 0.00015332220367278798, + "loss": 0.561, + "step": 705 + }, + { + "epoch": 0.0002478125322763977, + "grad_norm": 0.3550293743610382, + "learning_rate": 0.00015325542570951588, + "loss": 0.5634, + "step": 706 + }, + { + "epoch": 0.0002481635415289138, + "grad_norm": 0.362474650144577, + "learning_rate": 0.00015318864774624375, + "loss": 0.5608, + "step": 707 + }, + { + "epoch": 0.00024851455078143, + "grad_norm": 0.39463603496551514, + "learning_rate": 0.00015312186978297163, + "loss": 0.64, + "step": 708 + }, + { + "epoch": 0.0002488655600339461, + "grad_norm": 0.3456307649612427, + "learning_rate": 0.0001530550918196995, + "loss": 0.4631, + "step": 709 + }, + { + "epoch": 0.00024921656928646223, + "grad_norm": 0.3300929367542267, + "learning_rate": 0.00015298831385642737, + "loss": 0.3984, + "step": 710 + }, + { + "epoch": 0.0002495675785389784, + "grad_norm": 0.35923343896865845, + "learning_rate": 0.00015292153589315527, + "loss": 0.6003, + "step": 711 + }, + { + "epoch": 0.00024991858779149453, + "grad_norm": 0.4047611653804779, + "learning_rate": 0.00015285475792988315, + "loss": 0.5715, + "step": 712 + }, + { + "epoch": 0.0002502695970440107, + "grad_norm": 0.43539851903915405, + "learning_rate": 0.00015278797996661102, + "loss": 0.571, + "step": 713 + }, + { + "epoch": 0.00025062060629652683, + "grad_norm": 0.34745046496391296, + "learning_rate": 0.0001527212020033389, + "loss": 0.622, + "step": 714 + }, + { + "epoch": 0.00025097161554904295, + "grad_norm": 0.3130028247833252, + "learning_rate": 0.0001526544240400668, + "loss": 0.507, + "step": 715 + }, + { + "epoch": 0.0002513226248015591, + "grad_norm": 0.3093617558479309, + "learning_rate": 0.00015258764607679466, + "loss": 0.4951, + "step": 716 + }, + { + "epoch": 0.00025167363405407525, + "grad_norm": 0.34299540519714355, + "learning_rate": 0.00015252086811352257, + "loss": 0.539, + "step": 717 + }, + { + "epoch": 0.0002520246433065914, + "grad_norm": 0.32698413729667664, + "learning_rate": 0.00015245409015025044, + "loss": 0.4588, + "step": 718 + }, + { + "epoch": 0.00025237565255910755, + "grad_norm": 0.37853989005088806, + "learning_rate": 0.0001523873121869783, + "loss": 0.6227, + "step": 719 + }, + { + "epoch": 0.00025272666181162367, + "grad_norm": 0.32887300848960876, + "learning_rate": 0.00015232053422370618, + "loss": 0.5893, + "step": 720 + }, + { + "epoch": 0.00025307767106413985, + "grad_norm": 0.43352028727531433, + "learning_rate": 0.00015225375626043406, + "loss": 0.5811, + "step": 721 + }, + { + "epoch": 0.00025342868031665597, + "grad_norm": 0.42844903469085693, + "learning_rate": 0.00015218697829716196, + "loss": 0.6196, + "step": 722 + }, + { + "epoch": 0.00025377968956917215, + "grad_norm": 0.39929670095443726, + "learning_rate": 0.00015212020033388983, + "loss": 0.6722, + "step": 723 + }, + { + "epoch": 0.00025413069882168827, + "grad_norm": 0.5063486695289612, + "learning_rate": 0.0001520534223706177, + "loss": 0.6086, + "step": 724 + }, + { + "epoch": 0.0002544817080742044, + "grad_norm": 0.3625267446041107, + "learning_rate": 0.00015198664440734558, + "loss": 0.6331, + "step": 725 + }, + { + "epoch": 0.00025483271732672057, + "grad_norm": 0.3452700078487396, + "learning_rate": 0.00015191986644407345, + "loss": 0.5812, + "step": 726 + }, + { + "epoch": 0.0002551837265792367, + "grad_norm": 0.31915003061294556, + "learning_rate": 0.00015185308848080135, + "loss": 0.5653, + "step": 727 + }, + { + "epoch": 0.00025553473583175287, + "grad_norm": 0.3085877299308777, + "learning_rate": 0.00015178631051752922, + "loss": 0.4702, + "step": 728 + }, + { + "epoch": 0.000255885745084269, + "grad_norm": 0.31519320607185364, + "learning_rate": 0.0001517195325542571, + "loss": 0.5096, + "step": 729 + }, + { + "epoch": 0.0002562367543367851, + "grad_norm": 0.3637699782848358, + "learning_rate": 0.00015165275459098497, + "loss": 0.6001, + "step": 730 + }, + { + "epoch": 0.0002565877635893013, + "grad_norm": 0.34056970477104187, + "learning_rate": 0.00015158597662771284, + "loss": 0.5546, + "step": 731 + }, + { + "epoch": 0.0002569387728418174, + "grad_norm": 0.37110257148742676, + "learning_rate": 0.00015151919866444074, + "loss": 0.5612, + "step": 732 + }, + { + "epoch": 0.0002572897820943336, + "grad_norm": 0.35854101181030273, + "learning_rate": 0.00015145242070116862, + "loss": 0.6364, + "step": 733 + }, + { + "epoch": 0.0002576407913468497, + "grad_norm": 0.4340030252933502, + "learning_rate": 0.00015138564273789652, + "loss": 0.5772, + "step": 734 + }, + { + "epoch": 0.00025799180059936583, + "grad_norm": 0.3807721436023712, + "learning_rate": 0.0001513188647746244, + "loss": 0.4986, + "step": 735 + }, + { + "epoch": 0.000258342809851882, + "grad_norm": 0.3522527813911438, + "learning_rate": 0.00015125208681135226, + "loss": 0.5982, + "step": 736 + }, + { + "epoch": 0.00025869381910439813, + "grad_norm": 0.31251296401023865, + "learning_rate": 0.00015118530884808014, + "loss": 0.5239, + "step": 737 + }, + { + "epoch": 0.0002590448283569143, + "grad_norm": 0.3460885286331177, + "learning_rate": 0.00015111853088480804, + "loss": 0.5881, + "step": 738 + }, + { + "epoch": 0.00025939583760943043, + "grad_norm": 0.33298879861831665, + "learning_rate": 0.0001510517529215359, + "loss": 0.5272, + "step": 739 + }, + { + "epoch": 0.00025974684686194655, + "grad_norm": 0.351468950510025, + "learning_rate": 0.00015098497495826378, + "loss": 0.6049, + "step": 740 + }, + { + "epoch": 0.00026009785611446273, + "grad_norm": 0.3449242413043976, + "learning_rate": 0.00015091819699499166, + "loss": 0.5983, + "step": 741 + }, + { + "epoch": 0.00026044886536697885, + "grad_norm": 0.34724265336990356, + "learning_rate": 0.00015085141903171953, + "loss": 0.5292, + "step": 742 + }, + { + "epoch": 0.00026079987461949503, + "grad_norm": 0.3525671660900116, + "learning_rate": 0.00015078464106844743, + "loss": 0.5391, + "step": 743 + }, + { + "epoch": 0.00026115088387201115, + "grad_norm": 0.33959653973579407, + "learning_rate": 0.0001507178631051753, + "loss": 0.5898, + "step": 744 + }, + { + "epoch": 0.00026150189312452727, + "grad_norm": 0.5051225423812866, + "learning_rate": 0.00015065108514190318, + "loss": 0.5408, + "step": 745 + }, + { + "epoch": 0.00026185290237704345, + "grad_norm": 0.3298085629940033, + "learning_rate": 0.00015058430717863105, + "loss": 0.557, + "step": 746 + }, + { + "epoch": 0.00026220391162955957, + "grad_norm": 0.3375703990459442, + "learning_rate": 0.00015051752921535892, + "loss": 0.5541, + "step": 747 + }, + { + "epoch": 0.00026255492088207575, + "grad_norm": 0.27896445989608765, + "learning_rate": 0.0001504507512520868, + "loss": 0.5273, + "step": 748 + }, + { + "epoch": 0.00026290593013459187, + "grad_norm": 0.30591917037963867, + "learning_rate": 0.0001503839732888147, + "loss": 0.5988, + "step": 749 + }, + { + "epoch": 0.000263256939387108, + "grad_norm": 0.41014084219932556, + "learning_rate": 0.00015031719532554257, + "loss": 0.555, + "step": 750 + }, + { + "epoch": 0.00026360794863962417, + "grad_norm": 0.2935464084148407, + "learning_rate": 0.00015025041736227047, + "loss": 0.625, + "step": 751 + }, + { + "epoch": 0.0002639589578921403, + "grad_norm": 0.46361032128334045, + "learning_rate": 0.00015018363939899834, + "loss": 0.4753, + "step": 752 + }, + { + "epoch": 0.00026430996714465647, + "grad_norm": 0.35808300971984863, + "learning_rate": 0.00015011686143572622, + "loss": 0.5531, + "step": 753 + }, + { + "epoch": 0.0002646609763971726, + "grad_norm": 0.3411274254322052, + "learning_rate": 0.00015005008347245412, + "loss": 0.5577, + "step": 754 + }, + { + "epoch": 0.0002650119856496887, + "grad_norm": 0.34169328212738037, + "learning_rate": 0.000149983305509182, + "loss": 0.4856, + "step": 755 + }, + { + "epoch": 0.0002653629949022049, + "grad_norm": 0.38024139404296875, + "learning_rate": 0.00014991652754590986, + "loss": 0.5203, + "step": 756 + }, + { + "epoch": 0.000265714004154721, + "grad_norm": 0.35004425048828125, + "learning_rate": 0.00014984974958263774, + "loss": 0.4999, + "step": 757 + }, + { + "epoch": 0.0002660650134072372, + "grad_norm": 0.47526153922080994, + "learning_rate": 0.0001497829716193656, + "loss": 0.5503, + "step": 758 + }, + { + "epoch": 0.0002664160226597533, + "grad_norm": 0.35096925497055054, + "learning_rate": 0.0001497161936560935, + "loss": 0.5812, + "step": 759 + }, + { + "epoch": 0.00026676703191226943, + "grad_norm": 0.4505446255207062, + "learning_rate": 0.00014964941569282138, + "loss": 0.6069, + "step": 760 + }, + { + "epoch": 0.0002671180411647856, + "grad_norm": 0.3261663019657135, + "learning_rate": 0.00014958263772954926, + "loss": 0.5601, + "step": 761 + }, + { + "epoch": 0.00026746905041730173, + "grad_norm": 0.3397548794746399, + "learning_rate": 0.00014951585976627713, + "loss": 0.5572, + "step": 762 + }, + { + "epoch": 0.00026782005966981785, + "grad_norm": 0.35547688603401184, + "learning_rate": 0.000149449081803005, + "loss": 0.5983, + "step": 763 + }, + { + "epoch": 0.00026817106892233403, + "grad_norm": 0.41515079140663147, + "learning_rate": 0.00014938230383973287, + "loss": 0.6106, + "step": 764 + }, + { + "epoch": 0.00026852207817485015, + "grad_norm": 0.3840051591396332, + "learning_rate": 0.00014931552587646077, + "loss": 0.5328, + "step": 765 + }, + { + "epoch": 0.00026887308742736633, + "grad_norm": 0.3401285707950592, + "learning_rate": 0.00014924874791318865, + "loss": 0.4666, + "step": 766 + }, + { + "epoch": 0.00026922409667988245, + "grad_norm": 0.32983794808387756, + "learning_rate": 0.00014918196994991652, + "loss": 0.5214, + "step": 767 + }, + { + "epoch": 0.0002695751059323986, + "grad_norm": 0.30202198028564453, + "learning_rate": 0.00014911519198664442, + "loss": 0.4969, + "step": 768 + }, + { + "epoch": 0.00026992611518491475, + "grad_norm": 0.3222092092037201, + "learning_rate": 0.0001490484140233723, + "loss": 0.5093, + "step": 769 + }, + { + "epoch": 0.0002702771244374309, + "grad_norm": 0.4211997091770172, + "learning_rate": 0.0001489816360601002, + "loss": 0.6295, + "step": 770 + }, + { + "epoch": 0.00027062813368994705, + "grad_norm": 0.32112184166908264, + "learning_rate": 0.00014891485809682807, + "loss": 0.5611, + "step": 771 + }, + { + "epoch": 0.00027097914294246317, + "grad_norm": 0.3272956609725952, + "learning_rate": 0.00014884808013355594, + "loss": 0.6438, + "step": 772 + }, + { + "epoch": 0.0002713301521949793, + "grad_norm": 0.39423295855522156, + "learning_rate": 0.00014878130217028381, + "loss": 0.6029, + "step": 773 + }, + { + "epoch": 0.00027168116144749547, + "grad_norm": 0.3053528070449829, + "learning_rate": 0.0001487145242070117, + "loss": 0.4978, + "step": 774 + }, + { + "epoch": 0.0002720321707000116, + "grad_norm": 0.312774658203125, + "learning_rate": 0.0001486477462437396, + "loss": 0.5753, + "step": 775 + }, + { + "epoch": 0.00027238317995252777, + "grad_norm": 0.343964546918869, + "learning_rate": 0.00014858096828046746, + "loss": 0.5173, + "step": 776 + }, + { + "epoch": 0.0002727341892050439, + "grad_norm": 0.39104631543159485, + "learning_rate": 0.00014851419031719533, + "loss": 0.6381, + "step": 777 + }, + { + "epoch": 0.00027308519845756, + "grad_norm": 0.3958207070827484, + "learning_rate": 0.0001484474123539232, + "loss": 0.6046, + "step": 778 + }, + { + "epoch": 0.0002734362077100762, + "grad_norm": 0.36198097467422485, + "learning_rate": 0.00014838063439065108, + "loss": 0.6066, + "step": 779 + }, + { + "epoch": 0.0002737872169625923, + "grad_norm": 0.29619571566581726, + "learning_rate": 0.00014831385642737895, + "loss": 0.5131, + "step": 780 + }, + { + "epoch": 0.0002741382262151085, + "grad_norm": 0.344784677028656, + "learning_rate": 0.00014824707846410685, + "loss": 0.5626, + "step": 781 + }, + { + "epoch": 0.0002744892354676246, + "grad_norm": 0.35641250014305115, + "learning_rate": 0.00014818030050083473, + "loss": 0.5451, + "step": 782 + }, + { + "epoch": 0.00027484024472014074, + "grad_norm": 0.3496847152709961, + "learning_rate": 0.0001481135225375626, + "loss": 0.4814, + "step": 783 + }, + { + "epoch": 0.0002751912539726569, + "grad_norm": 0.3726658821105957, + "learning_rate": 0.00014804674457429047, + "loss": 0.6244, + "step": 784 + }, + { + "epoch": 0.00027554226322517303, + "grad_norm": 0.3317565619945526, + "learning_rate": 0.00014797996661101837, + "loss": 0.562, + "step": 785 + }, + { + "epoch": 0.0002758932724776892, + "grad_norm": 0.3478979468345642, + "learning_rate": 0.00014791318864774625, + "loss": 0.613, + "step": 786 + }, + { + "epoch": 0.00027624428173020533, + "grad_norm": 0.3572550415992737, + "learning_rate": 0.00014784641068447415, + "loss": 0.4841, + "step": 787 + }, + { + "epoch": 0.00027659529098272146, + "grad_norm": 0.34030210971832275, + "learning_rate": 0.00014777963272120202, + "loss": 0.4879, + "step": 788 + }, + { + "epoch": 0.00027694630023523763, + "grad_norm": 0.378203421831131, + "learning_rate": 0.0001477128547579299, + "loss": 0.6086, + "step": 789 + }, + { + "epoch": 0.00027729730948775375, + "grad_norm": 0.3390562832355499, + "learning_rate": 0.00014764607679465777, + "loss": 0.586, + "step": 790 + }, + { + "epoch": 0.00027764831874026993, + "grad_norm": 0.4986645579338074, + "learning_rate": 0.00014757929883138567, + "loss": 0.5592, + "step": 791 + }, + { + "epoch": 0.00027799932799278605, + "grad_norm": 0.3361869156360626, + "learning_rate": 0.00014751252086811354, + "loss": 0.4632, + "step": 792 + }, + { + "epoch": 0.0002783503372453022, + "grad_norm": 0.3726123571395874, + "learning_rate": 0.0001474457429048414, + "loss": 0.4915, + "step": 793 + }, + { + "epoch": 0.00027870134649781835, + "grad_norm": 0.3358845114707947, + "learning_rate": 0.00014737896494156929, + "loss": 0.5593, + "step": 794 + }, + { + "epoch": 0.0002790523557503345, + "grad_norm": 0.30473607778549194, + "learning_rate": 0.00014731218697829716, + "loss": 0.3672, + "step": 795 + }, + { + "epoch": 0.00027940336500285065, + "grad_norm": 0.33929023146629333, + "learning_rate": 0.00014724540901502506, + "loss": 0.5404, + "step": 796 + }, + { + "epoch": 0.0002797543742553668, + "grad_norm": 0.30778205394744873, + "learning_rate": 0.00014717863105175293, + "loss": 0.4379, + "step": 797 + }, + { + "epoch": 0.0002801053835078829, + "grad_norm": 0.286443829536438, + "learning_rate": 0.0001471118530884808, + "loss": 0.5579, + "step": 798 + }, + { + "epoch": 0.0002804563927603991, + "grad_norm": 0.4246799051761627, + "learning_rate": 0.00014704507512520868, + "loss": 0.536, + "step": 799 + }, + { + "epoch": 0.0002808074020129152, + "grad_norm": 0.4085538983345032, + "learning_rate": 0.00014697829716193655, + "loss": 0.5309, + "step": 800 + }, + { + "epoch": 0.00028115841126543137, + "grad_norm": 0.35396453738212585, + "learning_rate": 0.00014691151919866443, + "loss": 0.5307, + "step": 801 + }, + { + "epoch": 0.0002815094205179475, + "grad_norm": 0.45588648319244385, + "learning_rate": 0.00014684474123539233, + "loss": 0.5905, + "step": 802 + }, + { + "epoch": 0.0002818604297704636, + "grad_norm": 0.3353815972805023, + "learning_rate": 0.0001467779632721202, + "loss": 0.612, + "step": 803 + }, + { + "epoch": 0.0002822114390229798, + "grad_norm": 0.4152653217315674, + "learning_rate": 0.0001467111853088481, + "loss": 0.592, + "step": 804 + }, + { + "epoch": 0.0002825624482754959, + "grad_norm": 0.3651511073112488, + "learning_rate": 0.00014664440734557597, + "loss": 0.5909, + "step": 805 + }, + { + "epoch": 0.0002829134575280121, + "grad_norm": 0.3518235385417938, + "learning_rate": 0.00014657762938230385, + "loss": 0.5684, + "step": 806 + }, + { + "epoch": 0.0002832644667805282, + "grad_norm": 0.33562156558036804, + "learning_rate": 0.00014651085141903175, + "loss": 0.5165, + "step": 807 + }, + { + "epoch": 0.00028361547603304434, + "grad_norm": 0.3648052513599396, + "learning_rate": 0.00014644407345575962, + "loss": 0.5451, + "step": 808 + }, + { + "epoch": 0.0002839664852855605, + "grad_norm": 0.44342300295829773, + "learning_rate": 0.0001463772954924875, + "loss": 0.5907, + "step": 809 + }, + { + "epoch": 0.00028431749453807664, + "grad_norm": 0.33331966400146484, + "learning_rate": 0.00014631051752921536, + "loss": 0.4254, + "step": 810 + }, + { + "epoch": 0.0002846685037905928, + "grad_norm": 0.3444873094558716, + "learning_rate": 0.00014624373956594324, + "loss": 0.5201, + "step": 811 + }, + { + "epoch": 0.00028501951304310894, + "grad_norm": 0.4239615201950073, + "learning_rate": 0.00014617696160267114, + "loss": 0.5098, + "step": 812 + }, + { + "epoch": 0.00028537052229562506, + "grad_norm": 0.47895997762680054, + "learning_rate": 0.000146110183639399, + "loss": 0.6243, + "step": 813 + }, + { + "epoch": 0.00028572153154814123, + "grad_norm": 0.47322046756744385, + "learning_rate": 0.00014604340567612688, + "loss": 0.6841, + "step": 814 + }, + { + "epoch": 0.00028607254080065736, + "grad_norm": 0.35017871856689453, + "learning_rate": 0.00014597662771285476, + "loss": 0.5313, + "step": 815 + }, + { + "epoch": 0.00028642355005317353, + "grad_norm": 0.4342300295829773, + "learning_rate": 0.00014590984974958263, + "loss": 0.4363, + "step": 816 + }, + { + "epoch": 0.00028677455930568966, + "grad_norm": 0.2966228723526001, + "learning_rate": 0.0001458430717863105, + "loss": 0.6428, + "step": 817 + }, + { + "epoch": 0.0002871255685582058, + "grad_norm": 0.3320361375808716, + "learning_rate": 0.0001457762938230384, + "loss": 0.5266, + "step": 818 + }, + { + "epoch": 0.00028747657781072195, + "grad_norm": 0.3318590223789215, + "learning_rate": 0.00014570951585976628, + "loss": 0.5676, + "step": 819 + }, + { + "epoch": 0.0002878275870632381, + "grad_norm": 0.38573157787323, + "learning_rate": 0.00014564273789649415, + "loss": 0.7083, + "step": 820 + }, + { + "epoch": 0.00028817859631575425, + "grad_norm": 0.3731164038181305, + "learning_rate": 0.00014557595993322205, + "loss": 0.578, + "step": 821 + }, + { + "epoch": 0.0002885296055682704, + "grad_norm": 0.33610039949417114, + "learning_rate": 0.00014550918196994992, + "loss": 0.5923, + "step": 822 + }, + { + "epoch": 0.0002888806148207865, + "grad_norm": 0.3393179476261139, + "learning_rate": 0.00014544240400667782, + "loss": 0.5162, + "step": 823 + }, + { + "epoch": 0.0002892316240733027, + "grad_norm": 0.35552918910980225, + "learning_rate": 0.0001453756260434057, + "loss": 0.556, + "step": 824 + }, + { + "epoch": 0.0002895826333258188, + "grad_norm": 0.32425832748413086, + "learning_rate": 0.00014530884808013357, + "loss": 0.5157, + "step": 825 + }, + { + "epoch": 0.000289933642578335, + "grad_norm": 0.3353455662727356, + "learning_rate": 0.00014524207011686144, + "loss": 0.483, + "step": 826 + }, + { + "epoch": 0.0002902846518308511, + "grad_norm": 0.46254628896713257, + "learning_rate": 0.00014517529215358932, + "loss": 0.633, + "step": 827 + }, + { + "epoch": 0.0002906356610833672, + "grad_norm": 0.3275732100009918, + "learning_rate": 0.00014510851419031722, + "loss": 0.5502, + "step": 828 + }, + { + "epoch": 0.0002909866703358834, + "grad_norm": 0.3495190441608429, + "learning_rate": 0.0001450417362270451, + "loss": 0.368, + "step": 829 + }, + { + "epoch": 0.0002913376795883995, + "grad_norm": 0.35350501537323, + "learning_rate": 0.00014497495826377296, + "loss": 0.5819, + "step": 830 + }, + { + "epoch": 0.0002916886888409157, + "grad_norm": 0.37886378169059753, + "learning_rate": 0.00014490818030050084, + "loss": 0.5418, + "step": 831 + }, + { + "epoch": 0.0002920396980934318, + "grad_norm": 0.4279928505420685, + "learning_rate": 0.0001448414023372287, + "loss": 0.5199, + "step": 832 + }, + { + "epoch": 0.00029239070734594794, + "grad_norm": 0.33105382323265076, + "learning_rate": 0.00014477462437395658, + "loss": 0.5952, + "step": 833 + }, + { + "epoch": 0.0002927417165984641, + "grad_norm": 0.40114086866378784, + "learning_rate": 0.00014470784641068448, + "loss": 0.4611, + "step": 834 + }, + { + "epoch": 0.00029309272585098024, + "grad_norm": 0.3294037878513336, + "learning_rate": 0.00014464106844741236, + "loss": 0.5562, + "step": 835 + }, + { + "epoch": 0.0002934437351034964, + "grad_norm": 0.3391546607017517, + "learning_rate": 0.00014457429048414023, + "loss": 0.5748, + "step": 836 + }, + { + "epoch": 0.00029379474435601254, + "grad_norm": 0.4093922972679138, + "learning_rate": 0.0001445075125208681, + "loss": 0.4607, + "step": 837 + }, + { + "epoch": 0.00029414575360852866, + "grad_norm": 0.3331819176673889, + "learning_rate": 0.000144440734557596, + "loss": 0.5874, + "step": 838 + }, + { + "epoch": 0.00029449676286104484, + "grad_norm": 0.43205946683883667, + "learning_rate": 0.00014437395659432388, + "loss": 0.6152, + "step": 839 + }, + { + "epoch": 0.00029484777211356096, + "grad_norm": 0.36046868562698364, + "learning_rate": 0.00014430717863105178, + "loss": 0.4781, + "step": 840 + }, + { + "epoch": 0.00029519878136607713, + "grad_norm": 0.35514524579048157, + "learning_rate": 0.00014424040066777965, + "loss": 0.568, + "step": 841 + }, + { + "epoch": 0.00029554979061859326, + "grad_norm": 0.40260326862335205, + "learning_rate": 0.00014417362270450752, + "loss": 0.6075, + "step": 842 + }, + { + "epoch": 0.0002959007998711094, + "grad_norm": 0.3102671205997467, + "learning_rate": 0.0001441068447412354, + "loss": 0.4927, + "step": 843 + }, + { + "epoch": 0.00029625180912362556, + "grad_norm": 0.30940982699394226, + "learning_rate": 0.0001440400667779633, + "loss": 0.5549, + "step": 844 + }, + { + "epoch": 0.0002966028183761417, + "grad_norm": 0.3652762174606323, + "learning_rate": 0.00014397328881469117, + "loss": 0.6085, + "step": 845 + }, + { + "epoch": 0.00029695382762865786, + "grad_norm": 0.43056777119636536, + "learning_rate": 0.00014390651085141904, + "loss": 0.494, + "step": 846 + }, + { + "epoch": 0.000297304836881174, + "grad_norm": 0.3112967014312744, + "learning_rate": 0.00014383973288814692, + "loss": 0.5141, + "step": 847 + }, + { + "epoch": 0.0002976558461336901, + "grad_norm": 0.36729326844215393, + "learning_rate": 0.0001437729549248748, + "loss": 0.5435, + "step": 848 + }, + { + "epoch": 0.0002980068553862063, + "grad_norm": 0.3128114938735962, + "learning_rate": 0.00014370617696160266, + "loss": 0.5419, + "step": 849 + }, + { + "epoch": 0.0002983578646387224, + "grad_norm": 0.4030589163303375, + "learning_rate": 0.00014363939899833056, + "loss": 0.5959, + "step": 850 + }, + { + "epoch": 0.0002987088738912386, + "grad_norm": 0.39571288228034973, + "learning_rate": 0.00014357262103505844, + "loss": 0.6798, + "step": 851 + }, + { + "epoch": 0.0002990598831437547, + "grad_norm": 0.3388408422470093, + "learning_rate": 0.0001435058430717863, + "loss": 0.4887, + "step": 852 + }, + { + "epoch": 0.0002994108923962708, + "grad_norm": 0.39615562558174133, + "learning_rate": 0.00014343906510851418, + "loss": 0.5654, + "step": 853 + }, + { + "epoch": 0.000299761901648787, + "grad_norm": 0.3967401683330536, + "learning_rate": 0.00014337228714524205, + "loss": 0.6192, + "step": 854 + }, + { + "epoch": 0.0003001129109013031, + "grad_norm": 0.5597772002220154, + "learning_rate": 0.00014330550918196995, + "loss": 0.5808, + "step": 855 + }, + { + "epoch": 0.0003004639201538193, + "grad_norm": 0.36231061816215515, + "learning_rate": 0.00014323873121869783, + "loss": 0.4936, + "step": 856 + }, + { + "epoch": 0.0003008149294063354, + "grad_norm": 0.3775942027568817, + "learning_rate": 0.00014317195325542573, + "loss": 0.5706, + "step": 857 + }, + { + "epoch": 0.00030116593865885154, + "grad_norm": 0.4139408767223358, + "learning_rate": 0.0001431051752921536, + "loss": 0.5784, + "step": 858 + }, + { + "epoch": 0.0003015169479113677, + "grad_norm": 0.4101429879665375, + "learning_rate": 0.00014303839732888147, + "loss": 0.5937, + "step": 859 + }, + { + "epoch": 0.00030186795716388384, + "grad_norm": 0.5272162556648254, + "learning_rate": 0.00014297161936560937, + "loss": 0.5244, + "step": 860 + }, + { + "epoch": 0.0003022189664164, + "grad_norm": 0.3587292730808258, + "learning_rate": 0.00014290484140233725, + "loss": 0.6333, + "step": 861 + }, + { + "epoch": 0.00030256997566891614, + "grad_norm": 0.3284890353679657, + "learning_rate": 0.00014283806343906512, + "loss": 0.5414, + "step": 862 + }, + { + "epoch": 0.00030292098492143226, + "grad_norm": 0.414974182844162, + "learning_rate": 0.000142771285475793, + "loss": 0.6116, + "step": 863 + }, + { + "epoch": 0.00030327199417394844, + "grad_norm": 0.33619245886802673, + "learning_rate": 0.00014270450751252087, + "loss": 0.5506, + "step": 864 + }, + { + "epoch": 0.00030362300342646456, + "grad_norm": 0.45475640892982483, + "learning_rate": 0.00014263772954924874, + "loss": 0.6347, + "step": 865 + }, + { + "epoch": 0.00030397401267898074, + "grad_norm": 0.2695920765399933, + "learning_rate": 0.00014257095158597664, + "loss": 0.4529, + "step": 866 + }, + { + "epoch": 0.00030432502193149686, + "grad_norm": 0.3314480781555176, + "learning_rate": 0.00014250417362270451, + "loss": 0.5812, + "step": 867 + }, + { + "epoch": 0.000304676031184013, + "grad_norm": 0.31949582695961, + "learning_rate": 0.0001424373956594324, + "loss": 0.5213, + "step": 868 + }, + { + "epoch": 0.00030502704043652916, + "grad_norm": 0.34049752354621887, + "learning_rate": 0.00014237061769616026, + "loss": 0.4645, + "step": 869 + }, + { + "epoch": 0.0003053780496890453, + "grad_norm": 0.4304719567298889, + "learning_rate": 0.00014230383973288813, + "loss": 0.5065, + "step": 870 + }, + { + "epoch": 0.00030572905894156146, + "grad_norm": 0.32379043102264404, + "learning_rate": 0.00014223706176961603, + "loss": 0.553, + "step": 871 + }, + { + "epoch": 0.0003060800681940776, + "grad_norm": 0.33285439014434814, + "learning_rate": 0.0001421702838063439, + "loss": 0.5092, + "step": 872 + }, + { + "epoch": 0.0003064310774465937, + "grad_norm": 0.336795449256897, + "learning_rate": 0.00014210350584307178, + "loss": 0.4967, + "step": 873 + }, + { + "epoch": 0.0003067820866991099, + "grad_norm": 0.34653040766716003, + "learning_rate": 0.00014203672787979968, + "loss": 0.5353, + "step": 874 + }, + { + "epoch": 0.000307133095951626, + "grad_norm": 0.3352467715740204, + "learning_rate": 0.00014196994991652755, + "loss": 0.5594, + "step": 875 + }, + { + "epoch": 0.0003074841052041422, + "grad_norm": 0.38723453879356384, + "learning_rate": 0.00014190317195325545, + "loss": 0.5897, + "step": 876 + }, + { + "epoch": 0.0003078351144566583, + "grad_norm": 0.3987238109111786, + "learning_rate": 0.00014183639398998333, + "loss": 0.4647, + "step": 877 + }, + { + "epoch": 0.0003081861237091744, + "grad_norm": 0.3452693223953247, + "learning_rate": 0.0001417696160267112, + "loss": 0.5687, + "step": 878 + }, + { + "epoch": 0.0003085371329616906, + "grad_norm": 0.3561328649520874, + "learning_rate": 0.00014170283806343907, + "loss": 0.5845, + "step": 879 + }, + { + "epoch": 0.0003088881422142067, + "grad_norm": 0.29658418893814087, + "learning_rate": 0.00014163606010016695, + "loss": 0.5202, + "step": 880 + }, + { + "epoch": 0.0003092391514667229, + "grad_norm": 0.3908213973045349, + "learning_rate": 0.00014156928213689482, + "loss": 0.4439, + "step": 881 + }, + { + "epoch": 0.000309590160719239, + "grad_norm": 0.35816919803619385, + "learning_rate": 0.00014150250417362272, + "loss": 0.5384, + "step": 882 + }, + { + "epoch": 0.00030994116997175514, + "grad_norm": 0.3681255877017975, + "learning_rate": 0.0001414357262103506, + "loss": 0.5999, + "step": 883 + }, + { + "epoch": 0.0003102921792242713, + "grad_norm": 0.31137388944625854, + "learning_rate": 0.00014136894824707847, + "loss": 0.4495, + "step": 884 + }, + { + "epoch": 0.00031064318847678744, + "grad_norm": 0.2831423878669739, + "learning_rate": 0.00014130217028380634, + "loss": 0.4576, + "step": 885 + }, + { + "epoch": 0.0003109941977293036, + "grad_norm": 0.25953516364097595, + "learning_rate": 0.0001412353923205342, + "loss": 0.5606, + "step": 886 + }, + { + "epoch": 0.00031134520698181974, + "grad_norm": 0.31105297803878784, + "learning_rate": 0.0001411686143572621, + "loss": 0.5986, + "step": 887 + }, + { + "epoch": 0.00031169621623433586, + "grad_norm": 0.35177484154701233, + "learning_rate": 0.00014110183639398999, + "loss": 0.3394, + "step": 888 + }, + { + "epoch": 0.00031204722548685204, + "grad_norm": 0.373470276594162, + "learning_rate": 0.00014103505843071786, + "loss": 0.5862, + "step": 889 + }, + { + "epoch": 0.00031239823473936816, + "grad_norm": 0.37227189540863037, + "learning_rate": 0.00014096828046744576, + "loss": 0.4677, + "step": 890 + }, + { + "epoch": 0.00031274924399188434, + "grad_norm": 0.3799666464328766, + "learning_rate": 0.00014090150250417363, + "loss": 0.5255, + "step": 891 + }, + { + "epoch": 0.00031310025324440046, + "grad_norm": 0.3630129098892212, + "learning_rate": 0.00014083472454090153, + "loss": 0.5111, + "step": 892 + }, + { + "epoch": 0.0003134512624969166, + "grad_norm": 0.5131457448005676, + "learning_rate": 0.0001407679465776294, + "loss": 0.5207, + "step": 893 + }, + { + "epoch": 0.00031380227174943276, + "grad_norm": 0.3759867548942566, + "learning_rate": 0.00014070116861435728, + "loss": 0.6678, + "step": 894 + }, + { + "epoch": 0.0003141532810019489, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00014063439065108515, + "loss": 0.62, + "step": 895 + }, + { + "epoch": 0.00031450429025446506, + "grad_norm": 0.2789120376110077, + "learning_rate": 0.00014056761268781303, + "loss": 0.4204, + "step": 896 + }, + { + "epoch": 0.0003148552995069812, + "grad_norm": 0.2897239327430725, + "learning_rate": 0.0001405008347245409, + "loss": 0.432, + "step": 897 + }, + { + "epoch": 0.0003152063087594973, + "grad_norm": 0.3552323579788208, + "learning_rate": 0.0001404340567612688, + "loss": 0.5512, + "step": 898 + }, + { + "epoch": 0.0003155573180120135, + "grad_norm": 0.49963894486427307, + "learning_rate": 0.00014036727879799667, + "loss": 0.5868, + "step": 899 + }, + { + "epoch": 0.0003159083272645296, + "grad_norm": 0.37479934096336365, + "learning_rate": 0.00014030050083472454, + "loss": 0.6682, + "step": 900 + }, + { + "epoch": 0.0003162593365170458, + "grad_norm": 0.3415648639202118, + "learning_rate": 0.00014023372287145242, + "loss": 0.5301, + "step": 901 + }, + { + "epoch": 0.0003166103457695619, + "grad_norm": 0.37530943751335144, + "learning_rate": 0.0001401669449081803, + "loss": 0.5409, + "step": 902 + }, + { + "epoch": 0.000316961355022078, + "grad_norm": 0.37487658858299255, + "learning_rate": 0.0001401001669449082, + "loss": 0.5976, + "step": 903 + }, + { + "epoch": 0.0003173123642745942, + "grad_norm": 0.37174728512763977, + "learning_rate": 0.00014003338898163606, + "loss": 0.5933, + "step": 904 + }, + { + "epoch": 0.0003176633735271103, + "grad_norm": 0.491584450006485, + "learning_rate": 0.00013996661101836394, + "loss": 0.5112, + "step": 905 + }, + { + "epoch": 0.0003180143827796265, + "grad_norm": 0.38381487131118774, + "learning_rate": 0.0001398998330550918, + "loss": 0.6486, + "step": 906 + }, + { + "epoch": 0.0003183653920321426, + "grad_norm": 0.2867659330368042, + "learning_rate": 0.0001398330550918197, + "loss": 0.5033, + "step": 907 + }, + { + "epoch": 0.00031871640128465874, + "grad_norm": 0.3146355450153351, + "learning_rate": 0.00013976627712854758, + "loss": 0.5878, + "step": 908 + }, + { + "epoch": 0.0003190674105371749, + "grad_norm": 0.3454856276512146, + "learning_rate": 0.00013969949916527548, + "loss": 0.4751, + "step": 909 + }, + { + "epoch": 0.00031941841978969104, + "grad_norm": 0.32241204380989075, + "learning_rate": 0.00013963272120200336, + "loss": 0.6378, + "step": 910 + }, + { + "epoch": 0.0003197694290422072, + "grad_norm": 0.33703315258026123, + "learning_rate": 0.00013956594323873123, + "loss": 0.4634, + "step": 911 + }, + { + "epoch": 0.00032012043829472334, + "grad_norm": 0.3781648576259613, + "learning_rate": 0.0001394991652754591, + "loss": 0.5218, + "step": 912 + }, + { + "epoch": 0.00032047144754723946, + "grad_norm": 0.4124391973018646, + "learning_rate": 0.00013943238731218698, + "loss": 0.4958, + "step": 913 + }, + { + "epoch": 0.00032082245679975564, + "grad_norm": 0.3970220685005188, + "learning_rate": 0.00013936560934891488, + "loss": 0.5624, + "step": 914 + }, + { + "epoch": 0.00032117346605227176, + "grad_norm": 0.43682703375816345, + "learning_rate": 0.00013929883138564275, + "loss": 0.544, + "step": 915 + }, + { + "epoch": 0.00032152447530478794, + "grad_norm": 0.3476586639881134, + "learning_rate": 0.00013923205342237062, + "loss": 0.4418, + "step": 916 + }, + { + "epoch": 0.00032187548455730406, + "grad_norm": 0.36963552236557007, + "learning_rate": 0.0001391652754590985, + "loss": 0.5946, + "step": 917 + }, + { + "epoch": 0.0003222264938098202, + "grad_norm": 0.3445582985877991, + "learning_rate": 0.00013909849749582637, + "loss": 0.5879, + "step": 918 + }, + { + "epoch": 0.00032257750306233636, + "grad_norm": 0.39813530445098877, + "learning_rate": 0.00013903171953255427, + "loss": 0.5759, + "step": 919 + }, + { + "epoch": 0.0003229285123148525, + "grad_norm": 0.3314265012741089, + "learning_rate": 0.00013896494156928214, + "loss": 0.6165, + "step": 920 + }, + { + "epoch": 0.00032327952156736866, + "grad_norm": 0.4094330072402954, + "learning_rate": 0.00013889816360601002, + "loss": 0.5787, + "step": 921 + }, + { + "epoch": 0.0003236305308198848, + "grad_norm": 0.36821484565734863, + "learning_rate": 0.0001388313856427379, + "loss": 0.5303, + "step": 922 + }, + { + "epoch": 0.0003239815400724009, + "grad_norm": 0.3517453968524933, + "learning_rate": 0.00013876460767946576, + "loss": 0.4586, + "step": 923 + }, + { + "epoch": 0.0003243325493249171, + "grad_norm": 0.2959018647670746, + "learning_rate": 0.00013869782971619366, + "loss": 0.5225, + "step": 924 + }, + { + "epoch": 0.0003246835585774332, + "grad_norm": 0.3286895751953125, + "learning_rate": 0.00013863105175292154, + "loss": 0.5353, + "step": 925 + }, + { + "epoch": 0.0003250345678299494, + "grad_norm": 0.3328275680541992, + "learning_rate": 0.00013856427378964944, + "loss": 0.5915, + "step": 926 + }, + { + "epoch": 0.0003253855770824655, + "grad_norm": 0.3400813937187195, + "learning_rate": 0.0001384974958263773, + "loss": 0.4598, + "step": 927 + }, + { + "epoch": 0.0003257365863349816, + "grad_norm": 0.2876541018486023, + "learning_rate": 0.00013843071786310518, + "loss": 0.4835, + "step": 928 + }, + { + "epoch": 0.0003260875955874978, + "grad_norm": 0.3401765525341034, + "learning_rate": 0.00013836393989983308, + "loss": 0.56, + "step": 929 + }, + { + "epoch": 0.0003264386048400139, + "grad_norm": 0.34506598114967346, + "learning_rate": 0.00013829716193656096, + "loss": 0.6234, + "step": 930 + }, + { + "epoch": 0.0003267896140925301, + "grad_norm": 0.33732855319976807, + "learning_rate": 0.00013823038397328883, + "loss": 0.5686, + "step": 931 + }, + { + "epoch": 0.0003271406233450462, + "grad_norm": 0.34300100803375244, + "learning_rate": 0.0001381636060100167, + "loss": 0.6091, + "step": 932 + }, + { + "epoch": 0.00032749163259756235, + "grad_norm": 0.30349200963974, + "learning_rate": 0.00013809682804674458, + "loss": 0.4836, + "step": 933 + }, + { + "epoch": 0.0003278426418500785, + "grad_norm": 0.35742175579071045, + "learning_rate": 0.00013803005008347245, + "loss": 0.6443, + "step": 934 + }, + { + "epoch": 0.00032819365110259464, + "grad_norm": 0.33582496643066406, + "learning_rate": 0.00013796327212020035, + "loss": 0.6361, + "step": 935 + }, + { + "epoch": 0.0003285446603551108, + "grad_norm": 0.33403804898262024, + "learning_rate": 0.00013789649415692822, + "loss": 0.5911, + "step": 936 + }, + { + "epoch": 0.00032889566960762694, + "grad_norm": 0.4263191521167755, + "learning_rate": 0.0001378297161936561, + "loss": 0.5243, + "step": 937 + }, + { + "epoch": 0.00032924667886014307, + "grad_norm": 0.31543296575546265, + "learning_rate": 0.00013776293823038397, + "loss": 0.554, + "step": 938 + }, + { + "epoch": 0.00032959768811265924, + "grad_norm": 0.38975203037261963, + "learning_rate": 0.00013769616026711184, + "loss": 0.5358, + "step": 939 + }, + { + "epoch": 0.00032994869736517536, + "grad_norm": 0.3175157904624939, + "learning_rate": 0.00013762938230383974, + "loss": 0.5385, + "step": 940 + }, + { + "epoch": 0.00033029970661769154, + "grad_norm": 0.32753151655197144, + "learning_rate": 0.00013756260434056762, + "loss": 0.5191, + "step": 941 + }, + { + "epoch": 0.00033065071587020766, + "grad_norm": 0.2516227066516876, + "learning_rate": 0.0001374958263772955, + "loss": 0.3496, + "step": 942 + }, + { + "epoch": 0.0003310017251227238, + "grad_norm": 0.275806188583374, + "learning_rate": 0.0001374290484140234, + "loss": 0.4197, + "step": 943 + }, + { + "epoch": 0.00033135273437523996, + "grad_norm": 0.30234864354133606, + "learning_rate": 0.00013736227045075126, + "loss": 0.4909, + "step": 944 + }, + { + "epoch": 0.0003317037436277561, + "grad_norm": 0.32561683654785156, + "learning_rate": 0.00013729549248747916, + "loss": 0.5865, + "step": 945 + }, + { + "epoch": 0.00033205475288027226, + "grad_norm": 0.32075145840644836, + "learning_rate": 0.00013722871452420704, + "loss": 0.5957, + "step": 946 + }, + { + "epoch": 0.0003324057621327884, + "grad_norm": 0.3077705204486847, + "learning_rate": 0.0001371619365609349, + "loss": 0.6026, + "step": 947 + }, + { + "epoch": 0.0003327567713853045, + "grad_norm": 0.3092177212238312, + "learning_rate": 0.00013709515859766278, + "loss": 0.553, + "step": 948 + }, + { + "epoch": 0.0003331077806378207, + "grad_norm": 0.3611501157283783, + "learning_rate": 0.00013702838063439065, + "loss": 0.5707, + "step": 949 + }, + { + "epoch": 0.0003334587898903368, + "grad_norm": 0.3343827724456787, + "learning_rate": 0.00013696160267111853, + "loss": 0.5626, + "step": 950 + }, + { + "epoch": 0.000333809799142853, + "grad_norm": 0.3330281376838684, + "learning_rate": 0.00013689482470784643, + "loss": 0.6353, + "step": 951 + }, + { + "epoch": 0.0003341608083953691, + "grad_norm": 0.4045816957950592, + "learning_rate": 0.0001368280467445743, + "loss": 0.5781, + "step": 952 + }, + { + "epoch": 0.0003345118176478852, + "grad_norm": 0.3618166446685791, + "learning_rate": 0.00013676126878130217, + "loss": 0.6702, + "step": 953 + }, + { + "epoch": 0.0003348628269004014, + "grad_norm": 0.2836553752422333, + "learning_rate": 0.00013669449081803005, + "loss": 0.4371, + "step": 954 + }, + { + "epoch": 0.0003352138361529175, + "grad_norm": 0.3100498914718628, + "learning_rate": 0.00013662771285475792, + "loss": 0.5184, + "step": 955 + }, + { + "epoch": 0.0003355648454054337, + "grad_norm": 0.34877723455429077, + "learning_rate": 0.00013656093489148582, + "loss": 0.4778, + "step": 956 + }, + { + "epoch": 0.0003359158546579498, + "grad_norm": 0.27756938338279724, + "learning_rate": 0.0001364941569282137, + "loss": 0.4314, + "step": 957 + }, + { + "epoch": 0.00033626686391046595, + "grad_norm": 0.36129051446914673, + "learning_rate": 0.00013642737896494157, + "loss": 0.5837, + "step": 958 + }, + { + "epoch": 0.0003366178731629821, + "grad_norm": 0.35625776648521423, + "learning_rate": 0.00013636060100166944, + "loss": 0.5579, + "step": 959 + }, + { + "epoch": 0.00033696888241549825, + "grad_norm": 0.3735104501247406, + "learning_rate": 0.00013629382303839734, + "loss": 0.5283, + "step": 960 + }, + { + "epoch": 0.0003373198916680144, + "grad_norm": 0.34185606241226196, + "learning_rate": 0.00013622704507512521, + "loss": 0.5669, + "step": 961 + }, + { + "epoch": 0.00033767090092053054, + "grad_norm": 0.29324260354042053, + "learning_rate": 0.00013616026711185311, + "loss": 0.4468, + "step": 962 + }, + { + "epoch": 0.00033802191017304667, + "grad_norm": 0.3439052700996399, + "learning_rate": 0.000136093489148581, + "loss": 0.5196, + "step": 963 + }, + { + "epoch": 0.00033837291942556284, + "grad_norm": 0.3536570370197296, + "learning_rate": 0.00013602671118530886, + "loss": 0.5251, + "step": 964 + }, + { + "epoch": 0.00033872392867807897, + "grad_norm": 0.4759911298751831, + "learning_rate": 0.00013595993322203673, + "loss": 0.7017, + "step": 965 + }, + { + "epoch": 0.00033907493793059514, + "grad_norm": 0.2958674728870392, + "learning_rate": 0.0001358931552587646, + "loss": 0.4936, + "step": 966 + }, + { + "epoch": 0.00033942594718311126, + "grad_norm": 0.32770562171936035, + "learning_rate": 0.0001358263772954925, + "loss": 0.5741, + "step": 967 + }, + { + "epoch": 0.0003397769564356274, + "grad_norm": 0.35697153210639954, + "learning_rate": 0.00013575959933222038, + "loss": 0.428, + "step": 968 + }, + { + "epoch": 0.00034012796568814356, + "grad_norm": 0.3409043252468109, + "learning_rate": 0.00013569282136894825, + "loss": 0.6142, + "step": 969 + }, + { + "epoch": 0.0003404789749406597, + "grad_norm": 0.47055551409721375, + "learning_rate": 0.00013562604340567613, + "loss": 0.463, + "step": 970 + }, + { + "epoch": 0.00034082998419317586, + "grad_norm": 0.38270413875579834, + "learning_rate": 0.000135559265442404, + "loss": 0.462, + "step": 971 + }, + { + "epoch": 0.000341180993445692, + "grad_norm": 0.26209867000579834, + "learning_rate": 0.0001354924874791319, + "loss": 0.5341, + "step": 972 + }, + { + "epoch": 0.0003415320026982081, + "grad_norm": 0.37498748302459717, + "learning_rate": 0.00013542570951585977, + "loss": 0.5196, + "step": 973 + }, + { + "epoch": 0.0003418830119507243, + "grad_norm": 0.36789608001708984, + "learning_rate": 0.00013535893155258765, + "loss": 0.4723, + "step": 974 + }, + { + "epoch": 0.0003422340212032404, + "grad_norm": 0.33915975689888, + "learning_rate": 0.00013529215358931552, + "loss": 0.5511, + "step": 975 + }, + { + "epoch": 0.0003425850304557566, + "grad_norm": 0.43045058846473694, + "learning_rate": 0.0001352253756260434, + "loss": 0.5667, + "step": 976 + }, + { + "epoch": 0.0003429360397082727, + "grad_norm": 0.2948949933052063, + "learning_rate": 0.0001351585976627713, + "loss": 0.4804, + "step": 977 + }, + { + "epoch": 0.00034328704896078883, + "grad_norm": 0.3249470889568329, + "learning_rate": 0.00013509181969949917, + "loss": 0.6041, + "step": 978 + }, + { + "epoch": 0.000343638058213305, + "grad_norm": 0.2865908741950989, + "learning_rate": 0.00013502504173622707, + "loss": 0.5617, + "step": 979 + }, + { + "epoch": 0.0003439890674658211, + "grad_norm": 0.3190818428993225, + "learning_rate": 0.00013495826377295494, + "loss": 0.4902, + "step": 980 + }, + { + "epoch": 0.00034434007671833725, + "grad_norm": 0.3111664950847626, + "learning_rate": 0.0001348914858096828, + "loss": 0.5504, + "step": 981 + }, + { + "epoch": 0.0003446910859708534, + "grad_norm": 0.3255857229232788, + "learning_rate": 0.00013482470784641069, + "loss": 0.5592, + "step": 982 + }, + { + "epoch": 0.00034504209522336955, + "grad_norm": 0.30806589126586914, + "learning_rate": 0.00013475792988313859, + "loss": 0.5567, + "step": 983 + }, + { + "epoch": 0.0003453931044758857, + "grad_norm": 0.33785945177078247, + "learning_rate": 0.00013469115191986646, + "loss": 0.5881, + "step": 984 + }, + { + "epoch": 0.00034574411372840185, + "grad_norm": 0.34626781940460205, + "learning_rate": 0.00013462437395659433, + "loss": 0.578, + "step": 985 + }, + { + "epoch": 0.00034609512298091797, + "grad_norm": 0.367034912109375, + "learning_rate": 0.0001345575959933222, + "loss": 0.5893, + "step": 986 + }, + { + "epoch": 0.00034644613223343415, + "grad_norm": 0.37824952602386475, + "learning_rate": 0.00013449081803005008, + "loss": 0.5681, + "step": 987 + }, + { + "epoch": 0.00034679714148595027, + "grad_norm": 0.4054035544395447, + "learning_rate": 0.00013442404006677798, + "loss": 0.6108, + "step": 988 + }, + { + "epoch": 0.00034714815073846645, + "grad_norm": 0.4374067485332489, + "learning_rate": 0.00013435726210350585, + "loss": 0.6002, + "step": 989 + }, + { + "epoch": 0.00034749915999098257, + "grad_norm": 0.3554278016090393, + "learning_rate": 0.00013429048414023373, + "loss": 0.6444, + "step": 990 + }, + { + "epoch": 0.0003478501692434987, + "grad_norm": 0.3428646922111511, + "learning_rate": 0.0001342237061769616, + "loss": 0.6527, + "step": 991 + }, + { + "epoch": 0.00034820117849601487, + "grad_norm": 0.25603657960891724, + "learning_rate": 0.00013415692821368947, + "loss": 0.5244, + "step": 992 + }, + { + "epoch": 0.000348552187748531, + "grad_norm": 0.35237595438957214, + "learning_rate": 0.00013409015025041737, + "loss": 0.557, + "step": 993 + }, + { + "epoch": 0.00034890319700104717, + "grad_norm": 0.33666110038757324, + "learning_rate": 0.00013402337228714524, + "loss": 0.5674, + "step": 994 + }, + { + "epoch": 0.0003492542062535633, + "grad_norm": 0.30283182859420776, + "learning_rate": 0.00013395659432387312, + "loss": 0.6081, + "step": 995 + }, + { + "epoch": 0.0003496052155060794, + "grad_norm": 0.30893146991729736, + "learning_rate": 0.00013388981636060102, + "loss": 0.6089, + "step": 996 + }, + { + "epoch": 0.0003499562247585956, + "grad_norm": 0.2617473304271698, + "learning_rate": 0.0001338230383973289, + "loss": 0.6104, + "step": 997 + }, + { + "epoch": 0.0003503072340111117, + "grad_norm": 0.29493093490600586, + "learning_rate": 0.00013375626043405676, + "loss": 0.5047, + "step": 998 + }, + { + "epoch": 0.0003506582432636279, + "grad_norm": 0.3991663157939911, + "learning_rate": 0.00013368948247078466, + "loss": 0.5137, + "step": 999 + }, + { + "epoch": 0.000351009252516144, + "grad_norm": 0.31760329008102417, + "learning_rate": 0.00013362270450751254, + "loss": 0.4371, + "step": 1000 + }, + { + "epoch": 0.00035136026176866013, + "grad_norm": 0.35144907236099243, + "learning_rate": 0.0001335559265442404, + "loss": 0.5085, + "step": 1001 + }, + { + "epoch": 0.0003517112710211763, + "grad_norm": 0.3597724735736847, + "learning_rate": 0.00013348914858096828, + "loss": 0.593, + "step": 1002 + }, + { + "epoch": 0.00035206228027369243, + "grad_norm": 0.33647072315216064, + "learning_rate": 0.00013342237061769616, + "loss": 0.6011, + "step": 1003 + }, + { + "epoch": 0.0003524132895262086, + "grad_norm": 0.3377489745616913, + "learning_rate": 0.00013335559265442406, + "loss": 0.6285, + "step": 1004 + }, + { + "epoch": 0.00035276429877872473, + "grad_norm": 0.3210775852203369, + "learning_rate": 0.00013328881469115193, + "loss": 0.5214, + "step": 1005 + }, + { + "epoch": 0.00035311530803124085, + "grad_norm": 0.33832573890686035, + "learning_rate": 0.0001332220367278798, + "loss": 0.5788, + "step": 1006 + }, + { + "epoch": 0.00035346631728375703, + "grad_norm": 0.3025464117527008, + "learning_rate": 0.00013315525876460768, + "loss": 0.3762, + "step": 1007 + }, + { + "epoch": 0.00035381732653627315, + "grad_norm": 0.33917921781539917, + "learning_rate": 0.00013308848080133555, + "loss": 0.5816, + "step": 1008 + }, + { + "epoch": 0.0003541683357887893, + "grad_norm": 0.3070494830608368, + "learning_rate": 0.00013302170283806345, + "loss": 0.522, + "step": 1009 + }, + { + "epoch": 0.00035451934504130545, + "grad_norm": 0.31389573216438293, + "learning_rate": 0.00013295492487479132, + "loss": 0.5966, + "step": 1010 + }, + { + "epoch": 0.00035487035429382157, + "grad_norm": 0.33663564920425415, + "learning_rate": 0.0001328881469115192, + "loss": 0.5857, + "step": 1011 + }, + { + "epoch": 0.00035522136354633775, + "grad_norm": 0.3280203640460968, + "learning_rate": 0.00013282136894824707, + "loss": 0.562, + "step": 1012 + }, + { + "epoch": 0.00035557237279885387, + "grad_norm": 0.3307760953903198, + "learning_rate": 0.00013275459098497497, + "loss": 0.6258, + "step": 1013 + }, + { + "epoch": 0.00035592338205137005, + "grad_norm": 0.34378358721733093, + "learning_rate": 0.00013268781302170284, + "loss": 0.5026, + "step": 1014 + }, + { + "epoch": 0.00035627439130388617, + "grad_norm": 0.32818603515625, + "learning_rate": 0.00013262103505843074, + "loss": 0.513, + "step": 1015 + }, + { + "epoch": 0.0003566254005564023, + "grad_norm": 0.3015523850917816, + "learning_rate": 0.00013255425709515862, + "loss": 0.5448, + "step": 1016 + }, + { + "epoch": 0.00035697640980891847, + "grad_norm": 0.2927173674106598, + "learning_rate": 0.0001324874791318865, + "loss": 0.6565, + "step": 1017 + }, + { + "epoch": 0.0003573274190614346, + "grad_norm": 0.3502102196216583, + "learning_rate": 0.00013242070116861436, + "loss": 0.6235, + "step": 1018 + }, + { + "epoch": 0.00035767842831395077, + "grad_norm": 0.32151371240615845, + "learning_rate": 0.00013235392320534224, + "loss": 0.5613, + "step": 1019 + }, + { + "epoch": 0.0003580294375664669, + "grad_norm": 0.31253233551979065, + "learning_rate": 0.00013228714524207014, + "loss": 0.4744, + "step": 1020 + }, + { + "epoch": 0.000358380446818983, + "grad_norm": 0.2831304669380188, + "learning_rate": 0.000132220367278798, + "loss": 0.5385, + "step": 1021 + }, + { + "epoch": 0.0003587314560714992, + "grad_norm": 0.32526761293411255, + "learning_rate": 0.00013215358931552588, + "loss": 0.6316, + "step": 1022 + }, + { + "epoch": 0.0003590824653240153, + "grad_norm": 0.3305005729198456, + "learning_rate": 0.00013208681135225376, + "loss": 0.5287, + "step": 1023 + }, + { + "epoch": 0.0003594334745765315, + "grad_norm": 0.29515331983566284, + "learning_rate": 0.00013202003338898163, + "loss": 0.5478, + "step": 1024 + }, + { + "epoch": 0.0003597844838290476, + "grad_norm": 0.32527396082878113, + "learning_rate": 0.00013195325542570953, + "loss": 0.6309, + "step": 1025 + }, + { + "epoch": 0.00036013549308156373, + "grad_norm": 0.3407800793647766, + "learning_rate": 0.0001318864774624374, + "loss": 0.5958, + "step": 1026 + }, + { + "epoch": 0.0003604865023340799, + "grad_norm": 0.40766170620918274, + "learning_rate": 0.00013181969949916528, + "loss": 0.5281, + "step": 1027 + }, + { + "epoch": 0.00036083751158659603, + "grad_norm": 0.3853365480899811, + "learning_rate": 0.00013175292153589315, + "loss": 0.6349, + "step": 1028 + }, + { + "epoch": 0.0003611885208391122, + "grad_norm": 0.2854768633842468, + "learning_rate": 0.00013168614357262102, + "loss": 0.4515, + "step": 1029 + }, + { + "epoch": 0.00036153953009162833, + "grad_norm": 0.3713400065898895, + "learning_rate": 0.00013161936560934892, + "loss": 0.5256, + "step": 1030 + }, + { + "epoch": 0.00036189053934414445, + "grad_norm": 0.3738803565502167, + "learning_rate": 0.0001315525876460768, + "loss": 0.647, + "step": 1031 + }, + { + "epoch": 0.00036224154859666063, + "grad_norm": 0.3904534578323364, + "learning_rate": 0.0001314858096828047, + "loss": 0.6047, + "step": 1032 + }, + { + "epoch": 0.00036259255784917675, + "grad_norm": 0.3647315204143524, + "learning_rate": 0.00013141903171953257, + "loss": 0.5027, + "step": 1033 + }, + { + "epoch": 0.00036294356710169293, + "grad_norm": 0.3410654366016388, + "learning_rate": 0.00013135225375626044, + "loss": 0.6187, + "step": 1034 + }, + { + "epoch": 0.00036329457635420905, + "grad_norm": 0.3227837383747101, + "learning_rate": 0.00013128547579298832, + "loss": 0.4749, + "step": 1035 + }, + { + "epoch": 0.00036364558560672517, + "grad_norm": 0.2792038917541504, + "learning_rate": 0.00013121869782971622, + "loss": 0.4981, + "step": 1036 + }, + { + "epoch": 0.00036399659485924135, + "grad_norm": 0.339101642370224, + "learning_rate": 0.0001311519198664441, + "loss": 0.5875, + "step": 1037 + }, + { + "epoch": 0.00036434760411175747, + "grad_norm": 0.369004487991333, + "learning_rate": 0.00013108514190317196, + "loss": 0.4854, + "step": 1038 + }, + { + "epoch": 0.00036469861336427365, + "grad_norm": 0.39061155915260315, + "learning_rate": 0.00013101836393989983, + "loss": 0.5887, + "step": 1039 + }, + { + "epoch": 0.00036504962261678977, + "grad_norm": 0.3913773000240326, + "learning_rate": 0.0001309515859766277, + "loss": 0.5388, + "step": 1040 + }, + { + "epoch": 0.0003654006318693059, + "grad_norm": 0.27972474694252014, + "learning_rate": 0.0001308848080133556, + "loss": 0.3841, + "step": 1041 + }, + { + "epoch": 0.00036575164112182207, + "grad_norm": 0.3185168504714966, + "learning_rate": 0.00013081803005008348, + "loss": 0.4955, + "step": 1042 + }, + { + "epoch": 0.0003661026503743382, + "grad_norm": 0.6088166236877441, + "learning_rate": 0.00013075125208681135, + "loss": 0.5242, + "step": 1043 + }, + { + "epoch": 0.00036645365962685437, + "grad_norm": 0.4608970582485199, + "learning_rate": 0.00013068447412353923, + "loss": 0.5375, + "step": 1044 + }, + { + "epoch": 0.0003668046688793705, + "grad_norm": 0.38970229029655457, + "learning_rate": 0.0001306176961602671, + "loss": 0.5227, + "step": 1045 + }, + { + "epoch": 0.0003671556781318866, + "grad_norm": 0.3537042438983917, + "learning_rate": 0.00013055091819699497, + "loss": 0.5022, + "step": 1046 + }, + { + "epoch": 0.0003675066873844028, + "grad_norm": 0.3243977725505829, + "learning_rate": 0.00013048414023372287, + "loss": 0.4638, + "step": 1047 + }, + { + "epoch": 0.0003678576966369189, + "grad_norm": 0.5033393502235413, + "learning_rate": 0.00013041736227045075, + "loss": 0.6124, + "step": 1048 + }, + { + "epoch": 0.0003682087058894351, + "grad_norm": 0.3304978907108307, + "learning_rate": 0.00013035058430717865, + "loss": 0.5645, + "step": 1049 + }, + { + "epoch": 0.0003685597151419512, + "grad_norm": 0.36042529344558716, + "learning_rate": 0.00013028380634390652, + "loss": 0.4484, + "step": 1050 + }, + { + "epoch": 0.00036891072439446733, + "grad_norm": 0.4284050166606903, + "learning_rate": 0.0001302170283806344, + "loss": 0.6074, + "step": 1051 + }, + { + "epoch": 0.0003692617336469835, + "grad_norm": 0.28319039940834045, + "learning_rate": 0.0001301502504173623, + "loss": 0.563, + "step": 1052 + }, + { + "epoch": 0.00036961274289949963, + "grad_norm": 0.35593390464782715, + "learning_rate": 0.00013008347245409017, + "loss": 0.5548, + "step": 1053 + }, + { + "epoch": 0.0003699637521520158, + "grad_norm": 0.3092995285987854, + "learning_rate": 0.00013001669449081804, + "loss": 0.5512, + "step": 1054 + }, + { + "epoch": 0.00037031476140453193, + "grad_norm": 0.39928558468818665, + "learning_rate": 0.00012994991652754591, + "loss": 0.5828, + "step": 1055 + }, + { + "epoch": 0.00037066577065704805, + "grad_norm": 0.3541167974472046, + "learning_rate": 0.0001298831385642738, + "loss": 0.5943, + "step": 1056 + }, + { + "epoch": 0.00037101677990956423, + "grad_norm": 0.3520177900791168, + "learning_rate": 0.0001298163606010017, + "loss": 0.5629, + "step": 1057 + }, + { + "epoch": 0.00037136778916208035, + "grad_norm": 0.26769620180130005, + "learning_rate": 0.00012974958263772956, + "loss": 0.4686, + "step": 1058 + }, + { + "epoch": 0.00037171879841459653, + "grad_norm": 0.4143349528312683, + "learning_rate": 0.00012968280467445743, + "loss": 0.5898, + "step": 1059 + }, + { + "epoch": 0.00037206980766711265, + "grad_norm": 0.29856693744659424, + "learning_rate": 0.0001296160267111853, + "loss": 0.5795, + "step": 1060 + }, + { + "epoch": 0.0003724208169196288, + "grad_norm": 0.3835422396659851, + "learning_rate": 0.00012954924874791318, + "loss": 0.657, + "step": 1061 + }, + { + "epoch": 0.00037277182617214495, + "grad_norm": 0.3311139941215515, + "learning_rate": 0.00012948247078464108, + "loss": 0.5206, + "step": 1062 + }, + { + "epoch": 0.0003731228354246611, + "grad_norm": 0.38118553161621094, + "learning_rate": 0.00012941569282136895, + "loss": 0.6101, + "step": 1063 + }, + { + "epoch": 0.00037347384467717725, + "grad_norm": 0.3357555568218231, + "learning_rate": 0.00012934891485809683, + "loss": 0.4583, + "step": 1064 + }, + { + "epoch": 0.00037382485392969337, + "grad_norm": 0.3239798843860626, + "learning_rate": 0.0001292821368948247, + "loss": 0.5717, + "step": 1065 + }, + { + "epoch": 0.0003741758631822095, + "grad_norm": 0.31502071022987366, + "learning_rate": 0.0001292153589315526, + "loss": 0.5528, + "step": 1066 + }, + { + "epoch": 0.00037452687243472567, + "grad_norm": 0.35177144408226013, + "learning_rate": 0.00012914858096828047, + "loss": 0.5404, + "step": 1067 + }, + { + "epoch": 0.0003748778816872418, + "grad_norm": 0.3457860052585602, + "learning_rate": 0.00012908180300500837, + "loss": 0.5311, + "step": 1068 + }, + { + "epoch": 0.00037522889093975797, + "grad_norm": 0.31016480922698975, + "learning_rate": 0.00012901502504173625, + "loss": 0.521, + "step": 1069 + }, + { + "epoch": 0.0003755799001922741, + "grad_norm": 0.2800024151802063, + "learning_rate": 0.00012894824707846412, + "loss": 0.4831, + "step": 1070 + }, + { + "epoch": 0.0003759309094447902, + "grad_norm": 0.3560345470905304, + "learning_rate": 0.000128881469115192, + "loss": 0.4771, + "step": 1071 + }, + { + "epoch": 0.0003762819186973064, + "grad_norm": 0.28846535086631775, + "learning_rate": 0.00012881469115191987, + "loss": 0.4444, + "step": 1072 + }, + { + "epoch": 0.0003766329279498225, + "grad_norm": 0.29720595479011536, + "learning_rate": 0.00012874791318864777, + "loss": 0.5048, + "step": 1073 + }, + { + "epoch": 0.0003769839372023387, + "grad_norm": 0.40147536993026733, + "learning_rate": 0.00012868113522537564, + "loss": 0.5521, + "step": 1074 + }, + { + "epoch": 0.0003773349464548548, + "grad_norm": 0.36368894577026367, + "learning_rate": 0.0001286143572621035, + "loss": 0.5211, + "step": 1075 + }, + { + "epoch": 0.00037768595570737094, + "grad_norm": 0.34239786863327026, + "learning_rate": 0.00012854757929883139, + "loss": 0.4327, + "step": 1076 + }, + { + "epoch": 0.0003780369649598871, + "grad_norm": 0.3420031666755676, + "learning_rate": 0.00012848080133555926, + "loss": 0.5377, + "step": 1077 + }, + { + "epoch": 0.00037838797421240323, + "grad_norm": 0.32050299644470215, + "learning_rate": 0.00012841402337228716, + "loss": 0.6428, + "step": 1078 + }, + { + "epoch": 0.0003787389834649194, + "grad_norm": 0.31478747725486755, + "learning_rate": 0.00012834724540901503, + "loss": 0.4042, + "step": 1079 + }, + { + "epoch": 0.00037908999271743553, + "grad_norm": 0.4019688367843628, + "learning_rate": 0.0001282804674457429, + "loss": 0.5806, + "step": 1080 + }, + { + "epoch": 0.00037944100196995166, + "grad_norm": 0.3169090151786804, + "learning_rate": 0.00012821368948247078, + "loss": 0.6143, + "step": 1081 + }, + { + "epoch": 0.00037979201122246783, + "grad_norm": 0.3160766363143921, + "learning_rate": 0.00012814691151919865, + "loss": 0.4358, + "step": 1082 + }, + { + "epoch": 0.00038014302047498395, + "grad_norm": 0.30607977509498596, + "learning_rate": 0.00012808013355592655, + "loss": 0.611, + "step": 1083 + }, + { + "epoch": 0.00038049402972750013, + "grad_norm": 0.3392901122570038, + "learning_rate": 0.00012801335559265442, + "loss": 0.4677, + "step": 1084 + }, + { + "epoch": 0.00038084503898001625, + "grad_norm": 0.3608296513557434, + "learning_rate": 0.00012794657762938233, + "loss": 0.4681, + "step": 1085 + }, + { + "epoch": 0.0003811960482325324, + "grad_norm": 0.35469377040863037, + "learning_rate": 0.0001278797996661102, + "loss": 0.5122, + "step": 1086 + }, + { + "epoch": 0.00038154705748504855, + "grad_norm": 0.42851918935775757, + "learning_rate": 0.00012781302170283807, + "loss": 0.511, + "step": 1087 + }, + { + "epoch": 0.0003818980667375647, + "grad_norm": 0.31718799471855164, + "learning_rate": 0.00012774624373956594, + "loss": 0.5504, + "step": 1088 + }, + { + "epoch": 0.00038224907599008085, + "grad_norm": 0.31201183795928955, + "learning_rate": 0.00012767946577629384, + "loss": 0.5846, + "step": 1089 + }, + { + "epoch": 0.000382600085242597, + "grad_norm": 0.44880107045173645, + "learning_rate": 0.00012761268781302172, + "loss": 0.6351, + "step": 1090 + }, + { + "epoch": 0.0003829510944951131, + "grad_norm": 0.3685932755470276, + "learning_rate": 0.0001275459098497496, + "loss": 0.4946, + "step": 1091 + }, + { + "epoch": 0.00038330210374762927, + "grad_norm": 0.38342320919036865, + "learning_rate": 0.00012747913188647746, + "loss": 0.4357, + "step": 1092 + }, + { + "epoch": 0.0003836531130001454, + "grad_norm": 0.2710161805152893, + "learning_rate": 0.00012741235392320534, + "loss": 0.4635, + "step": 1093 + }, + { + "epoch": 0.00038400412225266157, + "grad_norm": 0.3405950963497162, + "learning_rate": 0.00012734557595993324, + "loss": 0.4272, + "step": 1094 + }, + { + "epoch": 0.0003843551315051777, + "grad_norm": 0.3414493203163147, + "learning_rate": 0.0001272787979966611, + "loss": 0.5387, + "step": 1095 + }, + { + "epoch": 0.0003847061407576938, + "grad_norm": 0.30659371614456177, + "learning_rate": 0.00012721202003338898, + "loss": 0.451, + "step": 1096 + }, + { + "epoch": 0.00038505715001021, + "grad_norm": 0.33229631185531616, + "learning_rate": 0.00012714524207011686, + "loss": 0.6062, + "step": 1097 + }, + { + "epoch": 0.0003854081592627261, + "grad_norm": 0.29991772770881653, + "learning_rate": 0.00012707846410684473, + "loss": 0.5812, + "step": 1098 + }, + { + "epoch": 0.0003857591685152423, + "grad_norm": 0.2937552332878113, + "learning_rate": 0.0001270116861435726, + "loss": 0.4762, + "step": 1099 + }, + { + "epoch": 0.0003861101777677584, + "grad_norm": 0.3993151783943176, + "learning_rate": 0.0001269449081803005, + "loss": 0.5288, + "step": 1100 + }, + { + "epoch": 0.00038646118702027454, + "grad_norm": 0.34012341499328613, + "learning_rate": 0.00012687813021702838, + "loss": 0.5858, + "step": 1101 + }, + { + "epoch": 0.0003868121962727907, + "grad_norm": 0.31721460819244385, + "learning_rate": 0.00012681135225375628, + "loss": 0.4543, + "step": 1102 + }, + { + "epoch": 0.00038716320552530684, + "grad_norm": 0.404480904340744, + "learning_rate": 0.00012674457429048415, + "loss": 0.6425, + "step": 1103 + }, + { + "epoch": 0.000387514214777823, + "grad_norm": 0.2888083755970001, + "learning_rate": 0.00012667779632721202, + "loss": 0.5737, + "step": 1104 + }, + { + "epoch": 0.00038786522403033913, + "grad_norm": 0.316724568605423, + "learning_rate": 0.00012661101836393992, + "loss": 0.4774, + "step": 1105 + }, + { + "epoch": 0.00038821623328285526, + "grad_norm": 0.34277236461639404, + "learning_rate": 0.0001265442404006678, + "loss": 0.5722, + "step": 1106 + }, + { + "epoch": 0.00038856724253537143, + "grad_norm": 0.3688976764678955, + "learning_rate": 0.00012647746243739567, + "loss": 0.478, + "step": 1107 + }, + { + "epoch": 0.00038891825178788756, + "grad_norm": 0.30905240774154663, + "learning_rate": 0.00012641068447412354, + "loss": 0.5578, + "step": 1108 + }, + { + "epoch": 0.00038926926104040373, + "grad_norm": 0.31679004430770874, + "learning_rate": 0.00012634390651085142, + "loss": 0.5564, + "step": 1109 + }, + { + "epoch": 0.00038962027029291985, + "grad_norm": 0.31234732270240784, + "learning_rate": 0.00012627712854757932, + "loss": 0.5403, + "step": 1110 + }, + { + "epoch": 0.000389971279545436, + "grad_norm": 0.2693454921245575, + "learning_rate": 0.0001262103505843072, + "loss": 0.577, + "step": 1111 + }, + { + "epoch": 0.00039032228879795215, + "grad_norm": 0.36127611994743347, + "learning_rate": 0.00012614357262103506, + "loss": 0.5558, + "step": 1112 + }, + { + "epoch": 0.0003906732980504683, + "grad_norm": 0.3124391436576843, + "learning_rate": 0.00012607679465776294, + "loss": 0.5198, + "step": 1113 + }, + { + "epoch": 0.00039102430730298445, + "grad_norm": 0.339495986700058, + "learning_rate": 0.0001260100166944908, + "loss": 0.4415, + "step": 1114 + }, + { + "epoch": 0.0003913753165555006, + "grad_norm": 0.3561634421348572, + "learning_rate": 0.00012594323873121868, + "loss": 0.5413, + "step": 1115 + }, + { + "epoch": 0.0003917263258080167, + "grad_norm": 0.30160975456237793, + "learning_rate": 0.00012587646076794658, + "loss": 0.5754, + "step": 1116 + }, + { + "epoch": 0.0003920773350605329, + "grad_norm": 0.583508312702179, + "learning_rate": 0.00012580968280467446, + "loss": 0.5645, + "step": 1117 + }, + { + "epoch": 0.000392428344313049, + "grad_norm": 0.3197818100452423, + "learning_rate": 0.00012574290484140233, + "loss": 0.5326, + "step": 1118 + }, + { + "epoch": 0.0003927793535655652, + "grad_norm": 0.3258291482925415, + "learning_rate": 0.00012567612687813023, + "loss": 0.5504, + "step": 1119 + }, + { + "epoch": 0.0003931303628180813, + "grad_norm": 0.2790183424949646, + "learning_rate": 0.0001256093489148581, + "loss": 0.4691, + "step": 1120 + }, + { + "epoch": 0.0003934813720705974, + "grad_norm": 0.4802376627922058, + "learning_rate": 0.000125542570951586, + "loss": 0.5689, + "step": 1121 + }, + { + "epoch": 0.0003938323813231136, + "grad_norm": 0.42296934127807617, + "learning_rate": 0.00012547579298831388, + "loss": 0.5082, + "step": 1122 + }, + { + "epoch": 0.0003941833905756297, + "grad_norm": 0.4018993377685547, + "learning_rate": 0.00012540901502504175, + "loss": 0.5967, + "step": 1123 + }, + { + "epoch": 0.0003945343998281459, + "grad_norm": 0.2756693661212921, + "learning_rate": 0.00012534223706176962, + "loss": 0.5071, + "step": 1124 + }, + { + "epoch": 0.000394885409080662, + "grad_norm": 0.28827816247940063, + "learning_rate": 0.0001252754590984975, + "loss": 0.446, + "step": 1125 + }, + { + "epoch": 0.00039523641833317814, + "grad_norm": 0.33188387751579285, + "learning_rate": 0.0001252086811352254, + "loss": 0.59, + "step": 1126 + }, + { + "epoch": 0.0003955874275856943, + "grad_norm": 0.3057992458343506, + "learning_rate": 0.00012514190317195327, + "loss": 0.4665, + "step": 1127 + }, + { + "epoch": 0.00039593843683821044, + "grad_norm": 0.423970103263855, + "learning_rate": 0.00012507512520868114, + "loss": 0.5603, + "step": 1128 + }, + { + "epoch": 0.0003962894460907266, + "grad_norm": 0.4346948266029358, + "learning_rate": 0.00012500834724540902, + "loss": 0.7188, + "step": 1129 + }, + { + "epoch": 0.00039664045534324274, + "grad_norm": 0.3196350932121277, + "learning_rate": 0.0001249415692821369, + "loss": 0.499, + "step": 1130 + }, + { + "epoch": 0.00039699146459575886, + "grad_norm": 0.32787612080574036, + "learning_rate": 0.00012487479131886476, + "loss": 0.562, + "step": 1131 + }, + { + "epoch": 0.00039734247384827504, + "grad_norm": 0.3701760768890381, + "learning_rate": 0.00012480801335559266, + "loss": 0.5906, + "step": 1132 + }, + { + "epoch": 0.00039769348310079116, + "grad_norm": 0.2836174964904785, + "learning_rate": 0.00012474123539232053, + "loss": 0.5241, + "step": 1133 + }, + { + "epoch": 0.00039804449235330733, + "grad_norm": 0.3123319745063782, + "learning_rate": 0.0001246744574290484, + "loss": 0.5591, + "step": 1134 + }, + { + "epoch": 0.00039839550160582346, + "grad_norm": 0.2965394854545593, + "learning_rate": 0.0001246076794657763, + "loss": 0.5522, + "step": 1135 + }, + { + "epoch": 0.0003987465108583396, + "grad_norm": 0.3452530801296234, + "learning_rate": 0.00012454090150250418, + "loss": 0.5572, + "step": 1136 + }, + { + "epoch": 0.00039909752011085576, + "grad_norm": 0.3368155062198639, + "learning_rate": 0.00012447412353923208, + "loss": 0.4947, + "step": 1137 + }, + { + "epoch": 0.0003994485293633719, + "grad_norm": 0.31308281421661377, + "learning_rate": 0.00012440734557595995, + "loss": 0.5395, + "step": 1138 + }, + { + "epoch": 0.00039979953861588805, + "grad_norm": 0.36880385875701904, + "learning_rate": 0.00012434056761268783, + "loss": 0.5449, + "step": 1139 + }, + { + "epoch": 0.0004001505478684042, + "grad_norm": 0.3276751935482025, + "learning_rate": 0.0001242737896494157, + "loss": 0.5714, + "step": 1140 + }, + { + "epoch": 0.0004005015571209203, + "grad_norm": 0.34474796056747437, + "learning_rate": 0.00012420701168614357, + "loss": 0.5579, + "step": 1141 + }, + { + "epoch": 0.0004008525663734365, + "grad_norm": 0.3203624188899994, + "learning_rate": 0.00012414023372287147, + "loss": 0.5848, + "step": 1142 + }, + { + "epoch": 0.0004012035756259526, + "grad_norm": 0.33093470335006714, + "learning_rate": 0.00012407345575959935, + "loss": 0.5515, + "step": 1143 + }, + { + "epoch": 0.0004015545848784688, + "grad_norm": 0.2994841933250427, + "learning_rate": 0.00012400667779632722, + "loss": 0.4696, + "step": 1144 + }, + { + "epoch": 0.0004019055941309849, + "grad_norm": 0.43979793787002563, + "learning_rate": 0.0001239398998330551, + "loss": 0.5531, + "step": 1145 + }, + { + "epoch": 0.000402256603383501, + "grad_norm": 0.33747658133506775, + "learning_rate": 0.00012387312186978297, + "loss": 0.5442, + "step": 1146 + }, + { + "epoch": 0.0004026076126360172, + "grad_norm": 0.3129333257675171, + "learning_rate": 0.00012380634390651084, + "loss": 0.5812, + "step": 1147 + }, + { + "epoch": 0.0004029586218885333, + "grad_norm": 0.27842286229133606, + "learning_rate": 0.00012373956594323874, + "loss": 0.5571, + "step": 1148 + }, + { + "epoch": 0.0004033096311410495, + "grad_norm": 0.30332496762275696, + "learning_rate": 0.00012367278797996661, + "loss": 0.5264, + "step": 1149 + }, + { + "epoch": 0.0004036606403935656, + "grad_norm": 0.41959401965141296, + "learning_rate": 0.0001236060100166945, + "loss": 0.6208, + "step": 1150 + }, + { + "epoch": 0.00040401164964608174, + "grad_norm": 0.2994483411312103, + "learning_rate": 0.00012353923205342236, + "loss": 0.5311, + "step": 1151 + }, + { + "epoch": 0.0004043626588985979, + "grad_norm": 0.28562021255493164, + "learning_rate": 0.00012347245409015026, + "loss": 0.4664, + "step": 1152 + }, + { + "epoch": 0.00040471366815111404, + "grad_norm": 0.3773499131202698, + "learning_rate": 0.00012340567612687813, + "loss": 0.6372, + "step": 1153 + }, + { + "epoch": 0.0004050646774036302, + "grad_norm": 0.3149654269218445, + "learning_rate": 0.00012333889816360603, + "loss": 0.5295, + "step": 1154 + }, + { + "epoch": 0.00040541568665614634, + "grad_norm": 0.345595121383667, + "learning_rate": 0.0001232721202003339, + "loss": 0.5568, + "step": 1155 + }, + { + "epoch": 0.00040576669590866246, + "grad_norm": 0.2795856297016144, + "learning_rate": 0.00012320534223706178, + "loss": 0.4909, + "step": 1156 + }, + { + "epoch": 0.00040611770516117864, + "grad_norm": 0.37467122077941895, + "learning_rate": 0.00012313856427378965, + "loss": 0.5733, + "step": 1157 + }, + { + "epoch": 0.00040646871441369476, + "grad_norm": 0.33086350560188293, + "learning_rate": 0.00012307178631051755, + "loss": 0.5371, + "step": 1158 + }, + { + "epoch": 0.00040681972366621094, + "grad_norm": 0.3587074279785156, + "learning_rate": 0.00012300500834724543, + "loss": 0.5555, + "step": 1159 + }, + { + "epoch": 0.00040717073291872706, + "grad_norm": 0.35360291600227356, + "learning_rate": 0.0001229382303839733, + "loss": 0.5686, + "step": 1160 + }, + { + "epoch": 0.0004075217421712432, + "grad_norm": 0.32877933979034424, + "learning_rate": 0.00012287145242070117, + "loss": 0.6232, + "step": 1161 + }, + { + "epoch": 0.00040787275142375936, + "grad_norm": 0.3402215540409088, + "learning_rate": 0.00012280467445742905, + "loss": 0.5923, + "step": 1162 + }, + { + "epoch": 0.0004082237606762755, + "grad_norm": 0.3712671399116516, + "learning_rate": 0.00012273789649415692, + "loss": 0.4405, + "step": 1163 + }, + { + "epoch": 0.00040857476992879166, + "grad_norm": 0.34966424107551575, + "learning_rate": 0.00012267111853088482, + "loss": 0.5987, + "step": 1164 + }, + { + "epoch": 0.0004089257791813078, + "grad_norm": 0.8779903650283813, + "learning_rate": 0.0001226043405676127, + "loss": 0.5677, + "step": 1165 + }, + { + "epoch": 0.0004092767884338239, + "grad_norm": 0.30721041560173035, + "learning_rate": 0.00012253756260434057, + "loss": 0.4803, + "step": 1166 + }, + { + "epoch": 0.0004096277976863401, + "grad_norm": 0.3509838879108429, + "learning_rate": 0.00012247078464106844, + "loss": 0.4216, + "step": 1167 + }, + { + "epoch": 0.0004099788069388562, + "grad_norm": 0.2961578071117401, + "learning_rate": 0.0001224040066777963, + "loss": 0.5599, + "step": 1168 + }, + { + "epoch": 0.0004103298161913724, + "grad_norm": 0.28842684626579285, + "learning_rate": 0.0001223372287145242, + "loss": 0.5023, + "step": 1169 + }, + { + "epoch": 0.0004106808254438885, + "grad_norm": 0.3395219147205353, + "learning_rate": 0.00012227045075125209, + "loss": 0.6371, + "step": 1170 + }, + { + "epoch": 0.0004110318346964046, + "grad_norm": 0.2860247492790222, + "learning_rate": 0.00012220367278797999, + "loss": 0.3881, + "step": 1171 + }, + { + "epoch": 0.0004113828439489208, + "grad_norm": 0.5463435053825378, + "learning_rate": 0.00012213689482470786, + "loss": 0.5751, + "step": 1172 + }, + { + "epoch": 0.0004117338532014369, + "grad_norm": 0.30383020639419556, + "learning_rate": 0.00012207011686143572, + "loss": 0.4892, + "step": 1173 + }, + { + "epoch": 0.0004120848624539531, + "grad_norm": 0.6111129522323608, + "learning_rate": 0.00012200333889816362, + "loss": 0.6786, + "step": 1174 + }, + { + "epoch": 0.0004124358717064692, + "grad_norm": 0.32131698727607727, + "learning_rate": 0.00012193656093489149, + "loss": 0.6301, + "step": 1175 + }, + { + "epoch": 0.00041278688095898534, + "grad_norm": 0.3574715256690979, + "learning_rate": 0.00012186978297161938, + "loss": 0.5705, + "step": 1176 + }, + { + "epoch": 0.0004131378902115015, + "grad_norm": 0.46258190274238586, + "learning_rate": 0.00012180300500834725, + "loss": 0.54, + "step": 1177 + }, + { + "epoch": 0.00041348889946401764, + "grad_norm": 0.385326623916626, + "learning_rate": 0.00012173622704507512, + "loss": 0.5792, + "step": 1178 + }, + { + "epoch": 0.0004138399087165338, + "grad_norm": 0.3880153000354767, + "learning_rate": 0.00012166944908180303, + "loss": 0.5396, + "step": 1179 + }, + { + "epoch": 0.00041419091796904994, + "grad_norm": 0.32916024327278137, + "learning_rate": 0.0001216026711185309, + "loss": 0.5632, + "step": 1180 + }, + { + "epoch": 0.00041454192722156606, + "grad_norm": 0.30234548449516296, + "learning_rate": 0.00012153589315525877, + "loss": 0.5162, + "step": 1181 + }, + { + "epoch": 0.00041489293647408224, + "grad_norm": 0.3654727339744568, + "learning_rate": 0.00012146911519198664, + "loss": 0.6333, + "step": 1182 + }, + { + "epoch": 0.00041524394572659836, + "grad_norm": 0.3166685700416565, + "learning_rate": 0.00012140233722871452, + "loss": 0.5276, + "step": 1183 + }, + { + "epoch": 0.00041559495497911454, + "grad_norm": 0.3722357153892517, + "learning_rate": 0.0001213355592654424, + "loss": 0.5771, + "step": 1184 + }, + { + "epoch": 0.00041594596423163066, + "grad_norm": 0.3407818377017975, + "learning_rate": 0.00012126878130217029, + "loss": 0.5998, + "step": 1185 + }, + { + "epoch": 0.0004162969734841468, + "grad_norm": 0.28665193915367126, + "learning_rate": 0.00012120200333889818, + "loss": 0.5457, + "step": 1186 + }, + { + "epoch": 0.00041664798273666296, + "grad_norm": 0.3052026629447937, + "learning_rate": 0.00012113522537562605, + "loss": 0.5204, + "step": 1187 + }, + { + "epoch": 0.0004169989919891791, + "grad_norm": 0.286080002784729, + "learning_rate": 0.00012106844741235392, + "loss": 0.4346, + "step": 1188 + }, + { + "epoch": 0.00041735000124169526, + "grad_norm": 0.306473970413208, + "learning_rate": 0.0001210016694490818, + "loss": 0.5544, + "step": 1189 + }, + { + "epoch": 0.0004177010104942114, + "grad_norm": 0.3347833454608917, + "learning_rate": 0.0001209348914858097, + "loss": 0.4619, + "step": 1190 + }, + { + "epoch": 0.0004180520197467275, + "grad_norm": 0.28040143847465515, + "learning_rate": 0.00012086811352253757, + "loss": 0.5492, + "step": 1191 + }, + { + "epoch": 0.0004184030289992437, + "grad_norm": 0.2940806448459625, + "learning_rate": 0.00012080133555926544, + "loss": 0.5653, + "step": 1192 + }, + { + "epoch": 0.0004187540382517598, + "grad_norm": 0.37384578585624695, + "learning_rate": 0.00012073455759599333, + "loss": 0.4931, + "step": 1193 + }, + { + "epoch": 0.000419105047504276, + "grad_norm": 0.28816068172454834, + "learning_rate": 0.0001206677796327212, + "loss": 0.5292, + "step": 1194 + }, + { + "epoch": 0.0004194560567567921, + "grad_norm": 0.31325826048851013, + "learning_rate": 0.0001206010016694491, + "loss": 0.5288, + "step": 1195 + }, + { + "epoch": 0.0004198070660093082, + "grad_norm": 0.30658552050590515, + "learning_rate": 0.00012053422370617698, + "loss": 0.5854, + "step": 1196 + }, + { + "epoch": 0.0004201580752618244, + "grad_norm": 0.341240257024765, + "learning_rate": 0.00012046744574290485, + "loss": 0.5358, + "step": 1197 + }, + { + "epoch": 0.0004205090845143405, + "grad_norm": 0.3595687747001648, + "learning_rate": 0.00012040066777963272, + "loss": 0.5944, + "step": 1198 + }, + { + "epoch": 0.00042086009376685664, + "grad_norm": 0.3249213397502899, + "learning_rate": 0.0001203338898163606, + "loss": 0.4873, + "step": 1199 + }, + { + "epoch": 0.0004212111030193728, + "grad_norm": 0.37282127141952515, + "learning_rate": 0.00012026711185308848, + "loss": 0.5173, + "step": 1200 + }, + { + "epoch": 0.00042156211227188894, + "grad_norm": 0.325110524892807, + "learning_rate": 0.00012020033388981637, + "loss": 0.4819, + "step": 1201 + }, + { + "epoch": 0.0004219131215244051, + "grad_norm": 0.313388466835022, + "learning_rate": 0.00012013355592654426, + "loss": 0.5613, + "step": 1202 + }, + { + "epoch": 0.00042226413077692124, + "grad_norm": 0.38384371995925903, + "learning_rate": 0.00012006677796327213, + "loss": 0.5711, + "step": 1203 + }, + { + "epoch": 0.00042261514002943736, + "grad_norm": 0.3431423008441925, + "learning_rate": 0.00012, + "loss": 0.5593, + "step": 1204 + }, + { + "epoch": 0.00042296614928195354, + "grad_norm": 0.3032066822052002, + "learning_rate": 0.00011993322203672788, + "loss": 0.559, + "step": 1205 + }, + { + "epoch": 0.00042331715853446966, + "grad_norm": 0.30639907717704773, + "learning_rate": 0.00011986644407345578, + "loss": 0.5727, + "step": 1206 + }, + { + "epoch": 0.00042366816778698584, + "grad_norm": 0.2970695197582245, + "learning_rate": 0.00011979966611018365, + "loss": 0.5933, + "step": 1207 + }, + { + "epoch": 0.00042401917703950196, + "grad_norm": 0.3868466317653656, + "learning_rate": 0.00011973288814691152, + "loss": 0.5779, + "step": 1208 + }, + { + "epoch": 0.0004243701862920181, + "grad_norm": 0.29085230827331543, + "learning_rate": 0.0001196661101836394, + "loss": 0.6558, + "step": 1209 + }, + { + "epoch": 0.00042472119554453426, + "grad_norm": 0.33766743540763855, + "learning_rate": 0.00011959933222036728, + "loss": 0.5809, + "step": 1210 + }, + { + "epoch": 0.0004250722047970504, + "grad_norm": 0.6739090085029602, + "learning_rate": 0.00011953255425709517, + "loss": 0.6085, + "step": 1211 + }, + { + "epoch": 0.00042542321404956656, + "grad_norm": 0.35693222284317017, + "learning_rate": 0.00011946577629382306, + "loss": 0.5855, + "step": 1212 + }, + { + "epoch": 0.0004257742233020827, + "grad_norm": 0.3087833523750305, + "learning_rate": 0.00011939899833055093, + "loss": 0.6379, + "step": 1213 + }, + { + "epoch": 0.0004261252325545988, + "grad_norm": 0.3548837900161743, + "learning_rate": 0.0001193322203672788, + "loss": 0.5303, + "step": 1214 + }, + { + "epoch": 0.000426476241807115, + "grad_norm": 0.46040648221969604, + "learning_rate": 0.00011926544240400668, + "loss": 0.5171, + "step": 1215 + }, + { + "epoch": 0.0004268272510596311, + "grad_norm": 0.5730584859848022, + "learning_rate": 0.00011919866444073455, + "loss": 0.615, + "step": 1216 + }, + { + "epoch": 0.0004271782603121473, + "grad_norm": 0.34618711471557617, + "learning_rate": 0.00011913188647746245, + "loss": 0.5605, + "step": 1217 + }, + { + "epoch": 0.0004275292695646634, + "grad_norm": 0.3499528169631958, + "learning_rate": 0.00011906510851419032, + "loss": 0.5184, + "step": 1218 + }, + { + "epoch": 0.0004278802788171795, + "grad_norm": 0.33638936281204224, + "learning_rate": 0.00011899833055091821, + "loss": 0.6276, + "step": 1219 + }, + { + "epoch": 0.0004282312880696957, + "grad_norm": 0.34646880626678467, + "learning_rate": 0.00011893155258764608, + "loss": 0.5737, + "step": 1220 + }, + { + "epoch": 0.0004285822973222118, + "grad_norm": 0.2783110439777374, + "learning_rate": 0.00011886477462437396, + "loss": 0.4424, + "step": 1221 + }, + { + "epoch": 0.000428933306574728, + "grad_norm": 0.33892807364463806, + "learning_rate": 0.00011879799666110186, + "loss": 0.5656, + "step": 1222 + }, + { + "epoch": 0.0004292843158272441, + "grad_norm": 0.2782565653324127, + "learning_rate": 0.00011873121869782973, + "loss": 0.5504, + "step": 1223 + }, + { + "epoch": 0.00042963532507976025, + "grad_norm": 0.3684981167316437, + "learning_rate": 0.0001186644407345576, + "loss": 0.5532, + "step": 1224 + }, + { + "epoch": 0.0004299863343322764, + "grad_norm": 0.4034316837787628, + "learning_rate": 0.00011859766277128547, + "loss": 0.5417, + "step": 1225 + }, + { + "epoch": 0.00043033734358479254, + "grad_norm": 0.5182071924209595, + "learning_rate": 0.00011853088480801335, + "loss": 0.6118, + "step": 1226 + }, + { + "epoch": 0.0004306883528373087, + "grad_norm": 0.3137674033641815, + "learning_rate": 0.00011846410684474125, + "loss": 0.6485, + "step": 1227 + }, + { + "epoch": 0.00043103936208982484, + "grad_norm": 0.4069771468639374, + "learning_rate": 0.00011839732888146912, + "loss": 0.5452, + "step": 1228 + }, + { + "epoch": 0.00043139037134234097, + "grad_norm": 0.5212397575378418, + "learning_rate": 0.00011833055091819701, + "loss": 0.5212, + "step": 1229 + }, + { + "epoch": 0.00043174138059485714, + "grad_norm": 0.3622184693813324, + "learning_rate": 0.00011826377295492488, + "loss": 0.4333, + "step": 1230 + }, + { + "epoch": 0.00043209238984737326, + "grad_norm": 0.335044801235199, + "learning_rate": 0.00011819699499165275, + "loss": 0.5606, + "step": 1231 + }, + { + "epoch": 0.00043244339909988944, + "grad_norm": 0.31680893898010254, + "learning_rate": 0.00011813021702838063, + "loss": 0.4988, + "step": 1232 + }, + { + "epoch": 0.00043279440835240556, + "grad_norm": 0.5272301435470581, + "learning_rate": 0.00011806343906510853, + "loss": 0.6024, + "step": 1233 + }, + { + "epoch": 0.0004331454176049217, + "grad_norm": 0.3663223385810852, + "learning_rate": 0.0001179966611018364, + "loss": 0.5964, + "step": 1234 + }, + { + "epoch": 0.00043349642685743786, + "grad_norm": 0.35138314962387085, + "learning_rate": 0.00011792988313856427, + "loss": 0.5908, + "step": 1235 + }, + { + "epoch": 0.000433847436109954, + "grad_norm": 0.3744595944881439, + "learning_rate": 0.00011786310517529216, + "loss": 0.551, + "step": 1236 + }, + { + "epoch": 0.00043419844536247016, + "grad_norm": 0.31489259004592896, + "learning_rate": 0.00011779632721202003, + "loss": 0.6431, + "step": 1237 + }, + { + "epoch": 0.0004345494546149863, + "grad_norm": 0.3356812298297882, + "learning_rate": 0.00011772954924874793, + "loss": 0.4507, + "step": 1238 + }, + { + "epoch": 0.0004349004638675024, + "grad_norm": 0.3018808364868164, + "learning_rate": 0.00011766277128547581, + "loss": 0.4796, + "step": 1239 + }, + { + "epoch": 0.0004352514731200186, + "grad_norm": 0.3201460540294647, + "learning_rate": 0.00011759599332220368, + "loss": 0.4768, + "step": 1240 + }, + { + "epoch": 0.0004356024823725347, + "grad_norm": 0.3269093334674835, + "learning_rate": 0.00011752921535893155, + "loss": 0.5419, + "step": 1241 + }, + { + "epoch": 0.0004359534916250509, + "grad_norm": 0.28690990805625916, + "learning_rate": 0.00011746243739565943, + "loss": 0.5088, + "step": 1242 + }, + { + "epoch": 0.000436304500877567, + "grad_norm": 0.32765012979507446, + "learning_rate": 0.00011739565943238733, + "loss": 0.4953, + "step": 1243 + }, + { + "epoch": 0.0004366555101300831, + "grad_norm": 0.28830674290657043, + "learning_rate": 0.0001173288814691152, + "loss": 0.5179, + "step": 1244 + }, + { + "epoch": 0.0004370065193825993, + "grad_norm": 0.37793827056884766, + "learning_rate": 0.00011726210350584307, + "loss": 0.5951, + "step": 1245 + }, + { + "epoch": 0.0004373575286351154, + "grad_norm": 0.37173348665237427, + "learning_rate": 0.00011719532554257096, + "loss": 0.6059, + "step": 1246 + }, + { + "epoch": 0.0004377085378876316, + "grad_norm": 0.5363826155662537, + "learning_rate": 0.00011712854757929883, + "loss": 0.5183, + "step": 1247 + }, + { + "epoch": 0.0004380595471401477, + "grad_norm": 0.31671205163002014, + "learning_rate": 0.0001170617696160267, + "loss": 0.5711, + "step": 1248 + }, + { + "epoch": 0.00043841055639266385, + "grad_norm": 0.3112623989582062, + "learning_rate": 0.0001169949916527546, + "loss": 0.5647, + "step": 1249 + }, + { + "epoch": 0.00043876156564518, + "grad_norm": 0.3153972923755646, + "learning_rate": 0.00011692821368948248, + "loss": 0.4939, + "step": 1250 + }, + { + "epoch": 0.00043911257489769615, + "grad_norm": 0.29940372705459595, + "learning_rate": 0.00011686143572621035, + "loss": 0.5509, + "step": 1251 + }, + { + "epoch": 0.0004394635841502123, + "grad_norm": 0.42540279030799866, + "learning_rate": 0.00011679465776293823, + "loss": 0.4104, + "step": 1252 + }, + { + "epoch": 0.00043981459340272844, + "grad_norm": 0.3222522437572479, + "learning_rate": 0.00011672787979966611, + "loss": 0.6237, + "step": 1253 + }, + { + "epoch": 0.00044016560265524457, + "grad_norm": 0.34896525740623474, + "learning_rate": 0.000116661101836394, + "loss": 0.5162, + "step": 1254 + }, + { + "epoch": 0.00044051661190776074, + "grad_norm": 0.29780149459838867, + "learning_rate": 0.00011659432387312189, + "loss": 0.5805, + "step": 1255 + }, + { + "epoch": 0.00044086762116027687, + "grad_norm": 0.3533996343612671, + "learning_rate": 0.00011652754590984976, + "loss": 0.5749, + "step": 1256 + }, + { + "epoch": 0.00044121863041279304, + "grad_norm": 0.30867093801498413, + "learning_rate": 0.00011646076794657763, + "loss": 0.479, + "step": 1257 + }, + { + "epoch": 0.00044156963966530917, + "grad_norm": 0.31176280975341797, + "learning_rate": 0.0001163939899833055, + "loss": 0.5007, + "step": 1258 + }, + { + "epoch": 0.0004419206489178253, + "grad_norm": 0.3480489253997803, + "learning_rate": 0.0001163272120200334, + "loss": 0.5595, + "step": 1259 + }, + { + "epoch": 0.00044227165817034146, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.00011626043405676128, + "loss": 0.5042, + "step": 1260 + }, + { + "epoch": 0.0004426226674228576, + "grad_norm": 0.3167501986026764, + "learning_rate": 0.00011619365609348915, + "loss": 0.5335, + "step": 1261 + }, + { + "epoch": 0.00044297367667537376, + "grad_norm": 0.31276339292526245, + "learning_rate": 0.00011612687813021703, + "loss": 0.5594, + "step": 1262 + }, + { + "epoch": 0.0004433246859278899, + "grad_norm": 0.42910438776016235, + "learning_rate": 0.00011606010016694491, + "loss": 0.4659, + "step": 1263 + }, + { + "epoch": 0.000443675695180406, + "grad_norm": 0.3169635534286499, + "learning_rate": 0.00011599332220367279, + "loss": 0.5463, + "step": 1264 + }, + { + "epoch": 0.0004440267044329222, + "grad_norm": 0.3419555425643921, + "learning_rate": 0.00011592654424040069, + "loss": 0.5091, + "step": 1265 + }, + { + "epoch": 0.0004443777136854383, + "grad_norm": 0.31462714076042175, + "learning_rate": 0.00011585976627712856, + "loss": 0.6233, + "step": 1266 + }, + { + "epoch": 0.0004447287229379545, + "grad_norm": 0.36186134815216064, + "learning_rate": 0.00011579298831385643, + "loss": 0.5634, + "step": 1267 + }, + { + "epoch": 0.0004450797321904706, + "grad_norm": 0.385903000831604, + "learning_rate": 0.0001157262103505843, + "loss": 0.5892, + "step": 1268 + }, + { + "epoch": 0.00044543074144298673, + "grad_norm": 0.28669610619544983, + "learning_rate": 0.00011565943238731218, + "loss": 0.4746, + "step": 1269 + }, + { + "epoch": 0.0004457817506955029, + "grad_norm": 0.37557515501976013, + "learning_rate": 0.00011559265442404008, + "loss": 0.5946, + "step": 1270 + }, + { + "epoch": 0.00044613275994801903, + "grad_norm": 0.30455920100212097, + "learning_rate": 0.00011552587646076795, + "loss": 0.4064, + "step": 1271 + }, + { + "epoch": 0.0004464837692005352, + "grad_norm": 0.36547228693962097, + "learning_rate": 0.00011545909849749584, + "loss": 0.4354, + "step": 1272 + }, + { + "epoch": 0.0004468347784530513, + "grad_norm": 0.3912973999977112, + "learning_rate": 0.00011539232053422371, + "loss": 0.544, + "step": 1273 + }, + { + "epoch": 0.00044718578770556745, + "grad_norm": 0.2993258237838745, + "learning_rate": 0.00011532554257095158, + "loss": 0.4623, + "step": 1274 + }, + { + "epoch": 0.0004475367969580836, + "grad_norm": 0.39676982164382935, + "learning_rate": 0.00011525876460767948, + "loss": 0.4735, + "step": 1275 + }, + { + "epoch": 0.00044788780621059975, + "grad_norm": 0.43738967180252075, + "learning_rate": 0.00011519198664440736, + "loss": 0.5639, + "step": 1276 + }, + { + "epoch": 0.0004482388154631159, + "grad_norm": 0.4572802186012268, + "learning_rate": 0.00011512520868113523, + "loss": 0.5043, + "step": 1277 + }, + { + "epoch": 0.00044858982471563205, + "grad_norm": 0.301929771900177, + "learning_rate": 0.0001150584307178631, + "loss": 0.3962, + "step": 1278 + }, + { + "epoch": 0.00044894083396814817, + "grad_norm": 0.42450666427612305, + "learning_rate": 0.00011499165275459098, + "loss": 0.5885, + "step": 1279 + }, + { + "epoch": 0.00044929184322066435, + "grad_norm": 0.3520278036594391, + "learning_rate": 0.00011492487479131886, + "loss": 0.5557, + "step": 1280 + }, + { + "epoch": 0.00044964285247318047, + "grad_norm": 0.32748425006866455, + "learning_rate": 0.00011485809682804675, + "loss": 0.5788, + "step": 1281 + }, + { + "epoch": 0.00044999386172569664, + "grad_norm": 0.3404058516025543, + "learning_rate": 0.00011479131886477464, + "loss": 0.431, + "step": 1282 + }, + { + "epoch": 0.00045034487097821277, + "grad_norm": 0.30703750252723694, + "learning_rate": 0.00011472454090150251, + "loss": 0.5603, + "step": 1283 + }, + { + "epoch": 0.0004506958802307289, + "grad_norm": 0.3476982116699219, + "learning_rate": 0.00011465776293823038, + "loss": 0.4984, + "step": 1284 + }, + { + "epoch": 0.00045104688948324507, + "grad_norm": 0.361433207988739, + "learning_rate": 0.00011459098497495826, + "loss": 0.4012, + "step": 1285 + }, + { + "epoch": 0.0004513978987357612, + "grad_norm": 0.31583985686302185, + "learning_rate": 0.00011452420701168616, + "loss": 0.5115, + "step": 1286 + }, + { + "epoch": 0.00045174890798827736, + "grad_norm": 0.3581843376159668, + "learning_rate": 0.00011445742904841403, + "loss": 0.5795, + "step": 1287 + }, + { + "epoch": 0.0004520999172407935, + "grad_norm": 0.30088526010513306, + "learning_rate": 0.0001143906510851419, + "loss": 0.4995, + "step": 1288 + }, + { + "epoch": 0.0004524509264933096, + "grad_norm": 0.34739211201667786, + "learning_rate": 0.00011432387312186979, + "loss": 0.5513, + "step": 1289 + }, + { + "epoch": 0.0004528019357458258, + "grad_norm": 0.3440413177013397, + "learning_rate": 0.00011425709515859766, + "loss": 0.626, + "step": 1290 + }, + { + "epoch": 0.0004531529449983419, + "grad_norm": 0.34715211391448975, + "learning_rate": 0.00011419031719532556, + "loss": 0.5567, + "step": 1291 + }, + { + "epoch": 0.0004535039542508581, + "grad_norm": 0.3141072690486908, + "learning_rate": 0.00011412353923205344, + "loss": 0.515, + "step": 1292 + }, + { + "epoch": 0.0004538549635033742, + "grad_norm": 0.3693056106567383, + "learning_rate": 0.00011405676126878131, + "loss": 0.6039, + "step": 1293 + }, + { + "epoch": 0.00045420597275589033, + "grad_norm": 0.2877582609653473, + "learning_rate": 0.00011398998330550918, + "loss": 0.627, + "step": 1294 + }, + { + "epoch": 0.0004545569820084065, + "grad_norm": 0.30727502703666687, + "learning_rate": 0.00011392320534223706, + "loss": 0.4439, + "step": 1295 + }, + { + "epoch": 0.00045490799126092263, + "grad_norm": 0.340834379196167, + "learning_rate": 0.00011385642737896493, + "loss": 0.6043, + "step": 1296 + }, + { + "epoch": 0.0004552590005134388, + "grad_norm": 0.37094762921333313, + "learning_rate": 0.00011378964941569283, + "loss": 0.5279, + "step": 1297 + }, + { + "epoch": 0.00045561000976595493, + "grad_norm": 0.352252721786499, + "learning_rate": 0.0001137228714524207, + "loss": 0.4534, + "step": 1298 + }, + { + "epoch": 0.00045596101901847105, + "grad_norm": 0.3592413663864136, + "learning_rate": 0.00011365609348914859, + "loss": 0.6009, + "step": 1299 + }, + { + "epoch": 0.0004563120282709872, + "grad_norm": 0.3028002679347992, + "learning_rate": 0.00011358931552587646, + "loss": 0.5451, + "step": 1300 + }, + { + "epoch": 0.00045666303752350335, + "grad_norm": 0.3545093238353729, + "learning_rate": 0.00011352253756260434, + "loss": 0.6022, + "step": 1301 + }, + { + "epoch": 0.0004570140467760195, + "grad_norm": 0.31239053606987, + "learning_rate": 0.00011345575959933224, + "loss": 0.5893, + "step": 1302 + }, + { + "epoch": 0.00045736505602853565, + "grad_norm": 0.2930079996585846, + "learning_rate": 0.00011338898163606011, + "loss": 0.6469, + "step": 1303 + }, + { + "epoch": 0.00045771606528105177, + "grad_norm": 0.3328670263290405, + "learning_rate": 0.00011332220367278798, + "loss": 0.551, + "step": 1304 + }, + { + "epoch": 0.00045806707453356795, + "grad_norm": 0.2958623766899109, + "learning_rate": 0.00011325542570951586, + "loss": 0.4699, + "step": 1305 + }, + { + "epoch": 0.00045841808378608407, + "grad_norm": 0.26540592312812805, + "learning_rate": 0.00011318864774624374, + "loss": 0.5651, + "step": 1306 + }, + { + "epoch": 0.00045876909303860025, + "grad_norm": 0.30372926592826843, + "learning_rate": 0.00011312186978297163, + "loss": 0.4466, + "step": 1307 + }, + { + "epoch": 0.00045912010229111637, + "grad_norm": 0.32394206523895264, + "learning_rate": 0.00011305509181969952, + "loss": 0.4651, + "step": 1308 + }, + { + "epoch": 0.0004594711115436325, + "grad_norm": 0.2792419493198395, + "learning_rate": 0.00011298831385642739, + "loss": 0.4761, + "step": 1309 + }, + { + "epoch": 0.00045982212079614867, + "grad_norm": 0.26445260643959045, + "learning_rate": 0.00011292153589315526, + "loss": 0.4564, + "step": 1310 + }, + { + "epoch": 0.0004601731300486648, + "grad_norm": 0.3601842224597931, + "learning_rate": 0.00011285475792988314, + "loss": 0.5397, + "step": 1311 + }, + { + "epoch": 0.00046052413930118097, + "grad_norm": 0.3574691712856293, + "learning_rate": 0.00011278797996661104, + "loss": 0.5961, + "step": 1312 + }, + { + "epoch": 0.0004608751485536971, + "grad_norm": 0.3000461161136627, + "learning_rate": 0.00011272120200333891, + "loss": 0.4527, + "step": 1313 + }, + { + "epoch": 0.0004612261578062132, + "grad_norm": 0.34302622079849243, + "learning_rate": 0.00011265442404006678, + "loss": 0.6379, + "step": 1314 + }, + { + "epoch": 0.0004615771670587294, + "grad_norm": 0.3945535123348236, + "learning_rate": 0.00011258764607679465, + "loss": 0.5631, + "step": 1315 + }, + { + "epoch": 0.0004619281763112455, + "grad_norm": 0.4170839786529541, + "learning_rate": 0.00011252086811352254, + "loss": 0.6339, + "step": 1316 + }, + { + "epoch": 0.0004622791855637617, + "grad_norm": 0.36513859033584595, + "learning_rate": 0.00011245409015025041, + "loss": 0.5528, + "step": 1317 + }, + { + "epoch": 0.0004626301948162778, + "grad_norm": 0.45692166686058044, + "learning_rate": 0.00011238731218697832, + "loss": 0.6315, + "step": 1318 + }, + { + "epoch": 0.00046298120406879393, + "grad_norm": 0.3772307336330414, + "learning_rate": 0.00011232053422370619, + "loss": 0.5349, + "step": 1319 + }, + { + "epoch": 0.0004633322133213101, + "grad_norm": 0.3114742636680603, + "learning_rate": 0.00011225375626043406, + "loss": 0.4121, + "step": 1320 + }, + { + "epoch": 0.00046368322257382623, + "grad_norm": 0.3508698344230652, + "learning_rate": 0.00011218697829716193, + "loss": 0.638, + "step": 1321 + }, + { + "epoch": 0.0004640342318263424, + "grad_norm": 0.34588712453842163, + "learning_rate": 0.00011212020033388981, + "loss": 0.4898, + "step": 1322 + }, + { + "epoch": 0.00046438524107885853, + "grad_norm": 0.2846747934818268, + "learning_rate": 0.00011205342237061771, + "loss": 0.5521, + "step": 1323 + }, + { + "epoch": 0.00046473625033137465, + "grad_norm": 0.31673532724380493, + "learning_rate": 0.00011198664440734558, + "loss": 0.4676, + "step": 1324 + }, + { + "epoch": 0.00046508725958389083, + "grad_norm": 0.3159814774990082, + "learning_rate": 0.00011191986644407347, + "loss": 0.508, + "step": 1325 + }, + { + "epoch": 0.00046543826883640695, + "grad_norm": 0.3438906967639923, + "learning_rate": 0.00011185308848080134, + "loss": 0.6521, + "step": 1326 + }, + { + "epoch": 0.00046578927808892313, + "grad_norm": 0.28350135684013367, + "learning_rate": 0.00011178631051752921, + "loss": 0.517, + "step": 1327 + }, + { + "epoch": 0.00046614028734143925, + "grad_norm": 0.3244381844997406, + "learning_rate": 0.00011171953255425711, + "loss": 0.4975, + "step": 1328 + }, + { + "epoch": 0.00046649129659395537, + "grad_norm": 0.32338446378707886, + "learning_rate": 0.00011165275459098499, + "loss": 0.5581, + "step": 1329 + }, + { + "epoch": 0.00046684230584647155, + "grad_norm": 0.3385190963745117, + "learning_rate": 0.00011158597662771286, + "loss": 0.5287, + "step": 1330 + }, + { + "epoch": 0.00046719331509898767, + "grad_norm": 0.30869290232658386, + "learning_rate": 0.00011151919866444073, + "loss": 0.5694, + "step": 1331 + }, + { + "epoch": 0.00046754432435150385, + "grad_norm": 0.39800670742988586, + "learning_rate": 0.00011145242070116862, + "loss": 0.6783, + "step": 1332 + }, + { + "epoch": 0.00046789533360401997, + "grad_norm": 0.3691728413105011, + "learning_rate": 0.0001113856427378965, + "loss": 0.5814, + "step": 1333 + }, + { + "epoch": 0.0004682463428565361, + "grad_norm": 0.34991732239723206, + "learning_rate": 0.0001113188647746244, + "loss": 0.414, + "step": 1334 + }, + { + "epoch": 0.00046859735210905227, + "grad_norm": 0.3095676302909851, + "learning_rate": 0.00011125208681135227, + "loss": 0.5982, + "step": 1335 + }, + { + "epoch": 0.0004689483613615684, + "grad_norm": 0.3367360830307007, + "learning_rate": 0.00011118530884808014, + "loss": 0.5794, + "step": 1336 + }, + { + "epoch": 0.00046929937061408457, + "grad_norm": 0.3058132529258728, + "learning_rate": 0.00011111853088480801, + "loss": 0.5001, + "step": 1337 + }, + { + "epoch": 0.0004696503798666007, + "grad_norm": 0.32190924882888794, + "learning_rate": 0.00011105175292153589, + "loss": 0.6184, + "step": 1338 + }, + { + "epoch": 0.0004700013891191168, + "grad_norm": 0.2544103264808655, + "learning_rate": 0.00011098497495826379, + "loss": 0.5338, + "step": 1339 + }, + { + "epoch": 0.000470352398371633, + "grad_norm": 0.3533720374107361, + "learning_rate": 0.00011091819699499166, + "loss": 0.5817, + "step": 1340 + }, + { + "epoch": 0.0004707034076241491, + "grad_norm": 0.29889243841171265, + "learning_rate": 0.00011085141903171953, + "loss": 0.4836, + "step": 1341 + }, + { + "epoch": 0.0004710544168766653, + "grad_norm": 0.3215756118297577, + "learning_rate": 0.00011078464106844742, + "loss": 0.5438, + "step": 1342 + }, + { + "epoch": 0.0004714054261291814, + "grad_norm": 0.3005795478820801, + "learning_rate": 0.00011071786310517529, + "loss": 0.5341, + "step": 1343 + }, + { + "epoch": 0.00047175643538169753, + "grad_norm": 0.31172803044319153, + "learning_rate": 0.0001106510851419032, + "loss": 0.5517, + "step": 1344 + }, + { + "epoch": 0.0004721074446342137, + "grad_norm": 0.3667462468147278, + "learning_rate": 0.00011058430717863107, + "loss": 0.5487, + "step": 1345 + }, + { + "epoch": 0.00047245845388672983, + "grad_norm": 0.3609708249568939, + "learning_rate": 0.00011051752921535894, + "loss": 0.5514, + "step": 1346 + }, + { + "epoch": 0.000472809463139246, + "grad_norm": 0.36390745639801025, + "learning_rate": 0.00011045075125208681, + "loss": 0.609, + "step": 1347 + }, + { + "epoch": 0.00047316047239176213, + "grad_norm": 0.3918192982673645, + "learning_rate": 0.00011038397328881469, + "loss": 0.5841, + "step": 1348 + }, + { + "epoch": 0.00047351148164427825, + "grad_norm": 0.3789425194263458, + "learning_rate": 0.00011031719532554257, + "loss": 0.5551, + "step": 1349 + }, + { + "epoch": 0.00047386249089679443, + "grad_norm": 0.31591498851776123, + "learning_rate": 0.00011025041736227046, + "loss": 0.5445, + "step": 1350 + }, + { + "epoch": 0.00047421350014931055, + "grad_norm": 0.3711070120334625, + "learning_rate": 0.00011018363939899835, + "loss": 0.6124, + "step": 1351 + }, + { + "epoch": 0.00047456450940182673, + "grad_norm": 0.3442644476890564, + "learning_rate": 0.00011011686143572622, + "loss": 0.5793, + "step": 1352 + }, + { + "epoch": 0.00047491551865434285, + "grad_norm": 0.2866378426551819, + "learning_rate": 0.00011005008347245409, + "loss": 0.5144, + "step": 1353 + }, + { + "epoch": 0.000475266527906859, + "grad_norm": 0.3127586841583252, + "learning_rate": 0.00010998330550918197, + "loss": 0.6036, + "step": 1354 + }, + { + "epoch": 0.00047561753715937515, + "grad_norm": 0.32305601239204407, + "learning_rate": 0.00010991652754590987, + "loss": 0.5215, + "step": 1355 + }, + { + "epoch": 0.00047596854641189127, + "grad_norm": 0.30483660101890564, + "learning_rate": 0.00010984974958263774, + "loss": 0.6094, + "step": 1356 + }, + { + "epoch": 0.00047631955566440745, + "grad_norm": 0.33019503951072693, + "learning_rate": 0.00010978297161936561, + "loss": 0.5646, + "step": 1357 + }, + { + "epoch": 0.00047667056491692357, + "grad_norm": 0.3414929509162903, + "learning_rate": 0.00010971619365609349, + "loss": 0.5262, + "step": 1358 + }, + { + "epoch": 0.0004770215741694397, + "grad_norm": 0.3471517860889435, + "learning_rate": 0.00010964941569282137, + "loss": 0.492, + "step": 1359 + }, + { + "epoch": 0.00047737258342195587, + "grad_norm": 0.3226645588874817, + "learning_rate": 0.00010958263772954926, + "loss": 0.6318, + "step": 1360 + }, + { + "epoch": 0.000477723592674472, + "grad_norm": 0.3425777852535248, + "learning_rate": 0.00010951585976627715, + "loss": 0.5878, + "step": 1361 + }, + { + "epoch": 0.00047807460192698817, + "grad_norm": 0.307462215423584, + "learning_rate": 0.00010944908180300502, + "loss": 0.4948, + "step": 1362 + }, + { + "epoch": 0.0004784256111795043, + "grad_norm": 0.34796106815338135, + "learning_rate": 0.00010938230383973289, + "loss": 0.5525, + "step": 1363 + }, + { + "epoch": 0.0004787766204320204, + "grad_norm": 0.2861281633377075, + "learning_rate": 0.00010931552587646076, + "loss": 0.4578, + "step": 1364 + }, + { + "epoch": 0.0004791276296845366, + "grad_norm": 0.2861836552619934, + "learning_rate": 0.00010924874791318864, + "loss": 0.5761, + "step": 1365 + }, + { + "epoch": 0.0004794786389370527, + "grad_norm": 0.3063654601573944, + "learning_rate": 0.00010918196994991654, + "loss": 0.5338, + "step": 1366 + }, + { + "epoch": 0.0004798296481895689, + "grad_norm": 0.3108372390270233, + "learning_rate": 0.00010911519198664441, + "loss": 0.4896, + "step": 1367 + }, + { + "epoch": 0.000480180657442085, + "grad_norm": 0.3263947069644928, + "learning_rate": 0.0001090484140233723, + "loss": 0.6142, + "step": 1368 + }, + { + "epoch": 0.00048053166669460113, + "grad_norm": 0.27663156390190125, + "learning_rate": 0.00010898163606010017, + "loss": 0.3852, + "step": 1369 + }, + { + "epoch": 0.0004808826759471173, + "grad_norm": 0.2791202962398529, + "learning_rate": 0.00010891485809682804, + "loss": 0.6032, + "step": 1370 + }, + { + "epoch": 0.00048123368519963343, + "grad_norm": 0.2715228199958801, + "learning_rate": 0.00010884808013355594, + "loss": 0.4717, + "step": 1371 + }, + { + "epoch": 0.0004815846944521496, + "grad_norm": 0.3232786953449249, + "learning_rate": 0.00010878130217028382, + "loss": 0.5511, + "step": 1372 + }, + { + "epoch": 0.00048193570370466573, + "grad_norm": 0.42948031425476074, + "learning_rate": 0.00010871452420701169, + "loss": 0.5223, + "step": 1373 + }, + { + "epoch": 0.00048228671295718185, + "grad_norm": 0.31973496079444885, + "learning_rate": 0.00010864774624373956, + "loss": 0.4532, + "step": 1374 + }, + { + "epoch": 0.00048263772220969803, + "grad_norm": 0.3149821162223816, + "learning_rate": 0.00010858096828046744, + "loss": 0.4894, + "step": 1375 + }, + { + "epoch": 0.00048298873146221415, + "grad_norm": 0.30229589343070984, + "learning_rate": 0.00010851419031719534, + "loss": 0.5039, + "step": 1376 + }, + { + "epoch": 0.00048333974071473033, + "grad_norm": 0.36127185821533203, + "learning_rate": 0.00010844741235392321, + "loss": 0.4379, + "step": 1377 + }, + { + "epoch": 0.00048369074996724645, + "grad_norm": 0.3135043978691101, + "learning_rate": 0.0001083806343906511, + "loss": 0.5172, + "step": 1378 + }, + { + "epoch": 0.0004840417592197626, + "grad_norm": 0.33123600482940674, + "learning_rate": 0.00010831385642737897, + "loss": 0.4959, + "step": 1379 + }, + { + "epoch": 0.00048439276847227875, + "grad_norm": 0.32165780663490295, + "learning_rate": 0.00010824707846410684, + "loss": 0.5152, + "step": 1380 + }, + { + "epoch": 0.0004847437777247949, + "grad_norm": 0.28580865263938904, + "learning_rate": 0.00010818030050083472, + "loss": 0.4879, + "step": 1381 + }, + { + "epoch": 0.00048509478697731105, + "grad_norm": 0.4019862711429596, + "learning_rate": 0.00010811352253756262, + "loss": 0.5475, + "step": 1382 + }, + { + "epoch": 0.0004854457962298272, + "grad_norm": 0.34479352831840515, + "learning_rate": 0.00010804674457429049, + "loss": 0.4279, + "step": 1383 + }, + { + "epoch": 0.0004857968054823433, + "grad_norm": 0.3664172887802124, + "learning_rate": 0.00010797996661101836, + "loss": 0.5815, + "step": 1384 + }, + { + "epoch": 0.00048614781473485947, + "grad_norm": 0.34667205810546875, + "learning_rate": 0.00010791318864774625, + "loss": 0.5453, + "step": 1385 + }, + { + "epoch": 0.0004864988239873756, + "grad_norm": 0.36878061294555664, + "learning_rate": 0.00010784641068447412, + "loss": 0.5464, + "step": 1386 + }, + { + "epoch": 0.00048684983323989177, + "grad_norm": 0.3552783727645874, + "learning_rate": 0.00010777963272120202, + "loss": 0.5668, + "step": 1387 + }, + { + "epoch": 0.0004872008424924079, + "grad_norm": 0.35390666127204895, + "learning_rate": 0.0001077128547579299, + "loss": 0.4799, + "step": 1388 + }, + { + "epoch": 0.000487551851744924, + "grad_norm": 0.3539852797985077, + "learning_rate": 0.00010764607679465777, + "loss": 0.6264, + "step": 1389 + }, + { + "epoch": 0.0004879028609974402, + "grad_norm": 0.3104274868965149, + "learning_rate": 0.00010757929883138564, + "loss": 0.4881, + "step": 1390 + }, + { + "epoch": 0.0004882538702499563, + "grad_norm": 0.29643991589546204, + "learning_rate": 0.00010751252086811352, + "loss": 0.5277, + "step": 1391 + }, + { + "epoch": 0.0004886048795024725, + "grad_norm": 0.3498566448688507, + "learning_rate": 0.00010744574290484142, + "loss": 0.4394, + "step": 1392 + }, + { + "epoch": 0.0004889558887549886, + "grad_norm": 0.31261810660362244, + "learning_rate": 0.00010737896494156929, + "loss": 0.4557, + "step": 1393 + }, + { + "epoch": 0.0004893068980075047, + "grad_norm": 0.301792711019516, + "learning_rate": 0.00010731218697829716, + "loss": 0.471, + "step": 1394 + }, + { + "epoch": 0.0004896579072600209, + "grad_norm": 0.34246626496315, + "learning_rate": 0.00010724540901502505, + "loss": 0.5917, + "step": 1395 + }, + { + "epoch": 0.0004900089165125371, + "grad_norm": 0.2901524305343628, + "learning_rate": 0.00010717863105175292, + "loss": 0.441, + "step": 1396 + }, + { + "epoch": 0.0004903599257650532, + "grad_norm": 0.3026966452598572, + "learning_rate": 0.0001071118530884808, + "loss": 0.5373, + "step": 1397 + }, + { + "epoch": 0.0004907109350175693, + "grad_norm": 0.29963356256484985, + "learning_rate": 0.0001070450751252087, + "loss": 0.4464, + "step": 1398 + }, + { + "epoch": 0.0004910619442700855, + "grad_norm": 0.26481980085372925, + "learning_rate": 0.00010697829716193657, + "loss": 0.5372, + "step": 1399 + }, + { + "epoch": 0.0004914129535226016, + "grad_norm": 0.26084020733833313, + "learning_rate": 0.00010691151919866444, + "loss": 0.5523, + "step": 1400 + }, + { + "epoch": 0.0004917639627751178, + "grad_norm": 0.34062638878822327, + "learning_rate": 0.00010684474123539232, + "loss": 0.5466, + "step": 1401 + }, + { + "epoch": 0.0004921149720276339, + "grad_norm": 0.3231668472290039, + "learning_rate": 0.0001067779632721202, + "loss": 0.5019, + "step": 1402 + }, + { + "epoch": 0.00049246598128015, + "grad_norm": 0.3362787961959839, + "learning_rate": 0.00010671118530884809, + "loss": 0.5251, + "step": 1403 + }, + { + "epoch": 0.0004928169905326662, + "grad_norm": 0.28928473591804504, + "learning_rate": 0.00010664440734557598, + "loss": 0.5346, + "step": 1404 + }, + { + "epoch": 0.0004931679997851824, + "grad_norm": 0.32969072461128235, + "learning_rate": 0.00010657762938230385, + "loss": 0.6131, + "step": 1405 + }, + { + "epoch": 0.0004935190090376985, + "grad_norm": 0.29733914136886597, + "learning_rate": 0.00010651085141903172, + "loss": 0.4406, + "step": 1406 + }, + { + "epoch": 0.0004938700182902146, + "grad_norm": 0.36437737941741943, + "learning_rate": 0.0001064440734557596, + "loss": 0.551, + "step": 1407 + }, + { + "epoch": 0.0004942210275427308, + "grad_norm": 0.33889076113700867, + "learning_rate": 0.0001063772954924875, + "loss": 0.5904, + "step": 1408 + }, + { + "epoch": 0.000494572036795247, + "grad_norm": 0.3446680009365082, + "learning_rate": 0.00010631051752921537, + "loss": 0.394, + "step": 1409 + }, + { + "epoch": 0.000494923046047763, + "grad_norm": 0.33298397064208984, + "learning_rate": 0.00010624373956594324, + "loss": 0.5048, + "step": 1410 + }, + { + "epoch": 0.0004952740553002792, + "grad_norm": 0.3153474032878876, + "learning_rate": 0.00010617696160267111, + "loss": 0.5314, + "step": 1411 + }, + { + "epoch": 0.0004956250645527954, + "grad_norm": 0.27105385065078735, + "learning_rate": 0.000106110183639399, + "loss": 0.5098, + "step": 1412 + }, + { + "epoch": 0.0004959760738053114, + "grad_norm": 0.3450585901737213, + "learning_rate": 0.00010604340567612687, + "loss": 0.5249, + "step": 1413 + }, + { + "epoch": 0.0004963270830578276, + "grad_norm": 0.35962969064712524, + "learning_rate": 0.00010597662771285477, + "loss": 0.4714, + "step": 1414 + }, + { + "epoch": 0.0004966780923103438, + "grad_norm": 0.33413732051849365, + "learning_rate": 0.00010590984974958265, + "loss": 0.5618, + "step": 1415 + }, + { + "epoch": 0.00049702910156286, + "grad_norm": 0.37907567620277405, + "learning_rate": 0.00010584307178631052, + "loss": 0.5751, + "step": 1416 + }, + { + "epoch": 0.000497380110815376, + "grad_norm": 0.3324087858200073, + "learning_rate": 0.0001057762938230384, + "loss": 0.5032, + "step": 1417 + }, + { + "epoch": 0.0004977311200678922, + "grad_norm": 0.2794540822505951, + "learning_rate": 0.00010570951585976627, + "loss": 0.4823, + "step": 1418 + }, + { + "epoch": 0.0004980821293204084, + "grad_norm": 0.31896448135375977, + "learning_rate": 0.00010564273789649417, + "loss": 0.5293, + "step": 1419 + }, + { + "epoch": 0.0004984331385729245, + "grad_norm": 0.39455580711364746, + "learning_rate": 0.00010557595993322204, + "loss": 0.6312, + "step": 1420 + }, + { + "epoch": 0.0004987841478254406, + "grad_norm": 0.3108445107936859, + "learning_rate": 0.00010550918196994993, + "loss": 0.4614, + "step": 1421 + }, + { + "epoch": 0.0004991351570779568, + "grad_norm": 0.2984072268009186, + "learning_rate": 0.0001054424040066778, + "loss": 0.5516, + "step": 1422 + }, + { + "epoch": 0.0004994861663304729, + "grad_norm": 0.3056257665157318, + "learning_rate": 0.00010537562604340567, + "loss": 0.5906, + "step": 1423 + }, + { + "epoch": 0.0004998371755829891, + "grad_norm": 0.29374566674232483, + "learning_rate": 0.00010530884808013357, + "loss": 0.599, + "step": 1424 + }, + { + "epoch": 0.0005001881848355052, + "grad_norm": 0.3665946424007416, + "learning_rate": 0.00010524207011686145, + "loss": 0.5599, + "step": 1425 + }, + { + "epoch": 0.0005005391940880214, + "grad_norm": 0.31262800097465515, + "learning_rate": 0.00010517529215358932, + "loss": 0.5566, + "step": 1426 + }, + { + "epoch": 0.0005008902033405375, + "grad_norm": 0.3117959797382355, + "learning_rate": 0.0001051085141903172, + "loss": 0.4372, + "step": 1427 + }, + { + "epoch": 0.0005012412125930537, + "grad_norm": 0.3499256670475006, + "learning_rate": 0.00010504173622704507, + "loss": 0.543, + "step": 1428 + }, + { + "epoch": 0.0005015922218455698, + "grad_norm": 0.3630000948905945, + "learning_rate": 0.00010497495826377295, + "loss": 0.5099, + "step": 1429 + }, + { + "epoch": 0.0005019432310980859, + "grad_norm": 0.3609743118286133, + "learning_rate": 0.00010490818030050084, + "loss": 0.5304, + "step": 1430 + }, + { + "epoch": 0.0005022942403506021, + "grad_norm": 0.3600139617919922, + "learning_rate": 0.00010484140233722873, + "loss": 0.4811, + "step": 1431 + }, + { + "epoch": 0.0005026452496031183, + "grad_norm": 0.30108320713043213, + "learning_rate": 0.0001047746243739566, + "loss": 0.6055, + "step": 1432 + }, + { + "epoch": 0.0005029962588556343, + "grad_norm": 0.34729886054992676, + "learning_rate": 0.00010470784641068447, + "loss": 0.5011, + "step": 1433 + }, + { + "epoch": 0.0005033472681081505, + "grad_norm": 0.33984988927841187, + "learning_rate": 0.00010464106844741235, + "loss": 0.5905, + "step": 1434 + }, + { + "epoch": 0.0005036982773606667, + "grad_norm": 0.3109802007675171, + "learning_rate": 0.00010457429048414025, + "loss": 0.5228, + "step": 1435 + }, + { + "epoch": 0.0005040492866131829, + "grad_norm": 0.37691593170166016, + "learning_rate": 0.00010450751252086812, + "loss": 0.5839, + "step": 1436 + }, + { + "epoch": 0.0005044002958656989, + "grad_norm": 0.3665965497493744, + "learning_rate": 0.00010444073455759599, + "loss": 0.5381, + "step": 1437 + }, + { + "epoch": 0.0005047513051182151, + "grad_norm": 0.29414570331573486, + "learning_rate": 0.00010437395659432388, + "loss": 0.6072, + "step": 1438 + }, + { + "epoch": 0.0005051023143707313, + "grad_norm": 0.3206839859485626, + "learning_rate": 0.00010430717863105175, + "loss": 0.5285, + "step": 1439 + }, + { + "epoch": 0.0005054533236232473, + "grad_norm": 0.3003496527671814, + "learning_rate": 0.00010424040066777965, + "loss": 0.4037, + "step": 1440 + }, + { + "epoch": 0.0005058043328757635, + "grad_norm": 0.2955014109611511, + "learning_rate": 0.00010417362270450753, + "loss": 0.4646, + "step": 1441 + }, + { + "epoch": 0.0005061553421282797, + "grad_norm": 0.3399007022380829, + "learning_rate": 0.0001041068447412354, + "loss": 0.5649, + "step": 1442 + }, + { + "epoch": 0.0005065063513807958, + "grad_norm": 0.3394736349582672, + "learning_rate": 0.00010404006677796327, + "loss": 0.5512, + "step": 1443 + }, + { + "epoch": 0.0005068573606333119, + "grad_norm": 0.31650441884994507, + "learning_rate": 0.00010397328881469115, + "loss": 0.4669, + "step": 1444 + }, + { + "epoch": 0.0005072083698858281, + "grad_norm": 0.3380611538887024, + "learning_rate": 0.00010390651085141905, + "loss": 0.6714, + "step": 1445 + }, + { + "epoch": 0.0005075593791383443, + "grad_norm": 0.29049673676490784, + "learning_rate": 0.00010383973288814692, + "loss": 0.5652, + "step": 1446 + }, + { + "epoch": 0.0005079103883908604, + "grad_norm": 0.37694746255874634, + "learning_rate": 0.0001037729549248748, + "loss": 0.4355, + "step": 1447 + }, + { + "epoch": 0.0005082613976433765, + "grad_norm": 0.36622750759124756, + "learning_rate": 0.00010370617696160268, + "loss": 0.4758, + "step": 1448 + }, + { + "epoch": 0.0005086124068958927, + "grad_norm": 0.3366115093231201, + "learning_rate": 0.00010363939899833055, + "loss": 0.5498, + "step": 1449 + }, + { + "epoch": 0.0005089634161484088, + "grad_norm": 0.2836514711380005, + "learning_rate": 0.00010357262103505843, + "loss": 0.5405, + "step": 1450 + }, + { + "epoch": 0.000509314425400925, + "grad_norm": 0.357666015625, + "learning_rate": 0.00010350584307178633, + "loss": 0.4738, + "step": 1451 + }, + { + "epoch": 0.0005096654346534411, + "grad_norm": 0.37991905212402344, + "learning_rate": 0.0001034390651085142, + "loss": 0.4932, + "step": 1452 + }, + { + "epoch": 0.0005100164439059572, + "grad_norm": 0.2862101197242737, + "learning_rate": 0.00010337228714524207, + "loss": 0.5387, + "step": 1453 + }, + { + "epoch": 0.0005103674531584734, + "grad_norm": 0.3000154197216034, + "learning_rate": 0.00010330550918196994, + "loss": 0.509, + "step": 1454 + }, + { + "epoch": 0.0005107184624109896, + "grad_norm": 0.29454153776168823, + "learning_rate": 0.00010323873121869783, + "loss": 0.3872, + "step": 1455 + }, + { + "epoch": 0.0005110694716635057, + "grad_norm": 0.305803507566452, + "learning_rate": 0.00010317195325542572, + "loss": 0.5, + "step": 1456 + }, + { + "epoch": 0.0005114204809160218, + "grad_norm": 0.3164152204990387, + "learning_rate": 0.0001031051752921536, + "loss": 0.5426, + "step": 1457 + }, + { + "epoch": 0.000511771490168538, + "grad_norm": 0.3026213049888611, + "learning_rate": 0.00010303839732888148, + "loss": 0.5783, + "step": 1458 + }, + { + "epoch": 0.0005121224994210542, + "grad_norm": 0.3170768618583679, + "learning_rate": 0.00010297161936560935, + "loss": 0.5701, + "step": 1459 + }, + { + "epoch": 0.0005124735086735702, + "grad_norm": 0.3275301456451416, + "learning_rate": 0.00010290484140233722, + "loss": 0.4884, + "step": 1460 + }, + { + "epoch": 0.0005128245179260864, + "grad_norm": 0.3446187973022461, + "learning_rate": 0.00010283806343906512, + "loss": 0.4516, + "step": 1461 + }, + { + "epoch": 0.0005131755271786026, + "grad_norm": 0.3188260495662689, + "learning_rate": 0.000102771285475793, + "loss": 0.561, + "step": 1462 + }, + { + "epoch": 0.0005135265364311186, + "grad_norm": 0.3547864258289337, + "learning_rate": 0.00010270450751252087, + "loss": 0.5768, + "step": 1463 + }, + { + "epoch": 0.0005138775456836348, + "grad_norm": 0.3740866482257843, + "learning_rate": 0.00010263772954924876, + "loss": 0.4197, + "step": 1464 + }, + { + "epoch": 0.000514228554936151, + "grad_norm": 0.38915491104125977, + "learning_rate": 0.00010257095158597663, + "loss": 0.553, + "step": 1465 + }, + { + "epoch": 0.0005145795641886672, + "grad_norm": 0.38494518399238586, + "learning_rate": 0.0001025041736227045, + "loss": 0.6247, + "step": 1466 + }, + { + "epoch": 0.0005149305734411832, + "grad_norm": 0.2716946303844452, + "learning_rate": 0.0001024373956594324, + "loss": 0.4426, + "step": 1467 + }, + { + "epoch": 0.0005152815826936994, + "grad_norm": 0.33764415979385376, + "learning_rate": 0.00010237061769616028, + "loss": 0.5939, + "step": 1468 + }, + { + "epoch": 0.0005156325919462156, + "grad_norm": 0.34384095668792725, + "learning_rate": 0.00010230383973288815, + "loss": 0.604, + "step": 1469 + }, + { + "epoch": 0.0005159836011987317, + "grad_norm": 0.3203445076942444, + "learning_rate": 0.00010223706176961602, + "loss": 0.5255, + "step": 1470 + }, + { + "epoch": 0.0005163346104512478, + "grad_norm": 0.2592601180076599, + "learning_rate": 0.0001021702838063439, + "loss": 0.4509, + "step": 1471 + }, + { + "epoch": 0.000516685619703764, + "grad_norm": 0.3425324261188507, + "learning_rate": 0.0001021035058430718, + "loss": 0.5498, + "step": 1472 + }, + { + "epoch": 0.0005170366289562801, + "grad_norm": 0.3077262341976166, + "learning_rate": 0.00010203672787979967, + "loss": 0.5364, + "step": 1473 + }, + { + "epoch": 0.0005173876382087963, + "grad_norm": 0.2831708788871765, + "learning_rate": 0.00010196994991652756, + "loss": 0.434, + "step": 1474 + }, + { + "epoch": 0.0005177386474613124, + "grad_norm": 0.29104581475257874, + "learning_rate": 0.00010190317195325543, + "loss": 0.5875, + "step": 1475 + }, + { + "epoch": 0.0005180896567138286, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0001018363939899833, + "loss": 0.4574, + "step": 1476 + }, + { + "epoch": 0.0005184406659663447, + "grad_norm": 0.41971537470817566, + "learning_rate": 0.0001017696160267112, + "loss": 0.5616, + "step": 1477 + }, + { + "epoch": 0.0005187916752188609, + "grad_norm": 0.3439647853374481, + "learning_rate": 0.00010170283806343908, + "loss": 0.4288, + "step": 1478 + }, + { + "epoch": 0.000519142684471377, + "grad_norm": 0.35867923498153687, + "learning_rate": 0.00010163606010016695, + "loss": 0.4415, + "step": 1479 + }, + { + "epoch": 0.0005194936937238931, + "grad_norm": 0.368987500667572, + "learning_rate": 0.00010156928213689482, + "loss": 0.5474, + "step": 1480 + }, + { + "epoch": 0.0005198447029764093, + "grad_norm": 0.30241629481315613, + "learning_rate": 0.00010150250417362271, + "loss": 0.4113, + "step": 1481 + }, + { + "epoch": 0.0005201957122289255, + "grad_norm": 0.31089895963668823, + "learning_rate": 0.00010143572621035058, + "loss": 0.4726, + "step": 1482 + }, + { + "epoch": 0.0005205467214814415, + "grad_norm": 0.2900741994380951, + "learning_rate": 0.00010136894824707848, + "loss": 0.4591, + "step": 1483 + }, + { + "epoch": 0.0005208977307339577, + "grad_norm": 0.2920607030391693, + "learning_rate": 0.00010130217028380636, + "loss": 0.508, + "step": 1484 + }, + { + "epoch": 0.0005212487399864739, + "grad_norm": 0.5145193338394165, + "learning_rate": 0.00010123539232053423, + "loss": 0.6125, + "step": 1485 + }, + { + "epoch": 0.0005215997492389901, + "grad_norm": 0.3466121554374695, + "learning_rate": 0.0001011686143572621, + "loss": 0.5236, + "step": 1486 + }, + { + "epoch": 0.0005219507584915061, + "grad_norm": 0.2820659577846527, + "learning_rate": 0.00010110183639398998, + "loss": 0.4886, + "step": 1487 + }, + { + "epoch": 0.0005223017677440223, + "grad_norm": 0.31797733902931213, + "learning_rate": 0.00010103505843071788, + "loss": 0.4605, + "step": 1488 + }, + { + "epoch": 0.0005226527769965385, + "grad_norm": 0.3547564148902893, + "learning_rate": 0.00010096828046744575, + "loss": 0.5559, + "step": 1489 + }, + { + "epoch": 0.0005230037862490545, + "grad_norm": 0.3584667146205902, + "learning_rate": 0.00010090150250417362, + "loss": 0.4402, + "step": 1490 + }, + { + "epoch": 0.0005233547955015707, + "grad_norm": 0.3230780065059662, + "learning_rate": 0.00010083472454090151, + "loss": 0.5187, + "step": 1491 + }, + { + "epoch": 0.0005237058047540869, + "grad_norm": 0.3932897448539734, + "learning_rate": 0.00010076794657762938, + "loss": 0.5758, + "step": 1492 + }, + { + "epoch": 0.000524056814006603, + "grad_norm": 0.39378783106803894, + "learning_rate": 0.00010070116861435728, + "loss": 0.5199, + "step": 1493 + }, + { + "epoch": 0.0005244078232591191, + "grad_norm": 0.33147481083869934, + "learning_rate": 0.00010063439065108516, + "loss": 0.4489, + "step": 1494 + }, + { + "epoch": 0.0005247588325116353, + "grad_norm": 0.3706863522529602, + "learning_rate": 0.00010056761268781303, + "loss": 0.4601, + "step": 1495 + }, + { + "epoch": 0.0005251098417641515, + "grad_norm": 0.45806849002838135, + "learning_rate": 0.0001005008347245409, + "loss": 0.4522, + "step": 1496 + }, + { + "epoch": 0.0005254608510166676, + "grad_norm": 0.2931700050830841, + "learning_rate": 0.00010043405676126878, + "loss": 0.3673, + "step": 1497 + }, + { + "epoch": 0.0005258118602691837, + "grad_norm": 0.31791719794273376, + "learning_rate": 0.00010036727879799666, + "loss": 0.497, + "step": 1498 + }, + { + "epoch": 0.0005261628695216999, + "grad_norm": 0.51285719871521, + "learning_rate": 0.00010030050083472455, + "loss": 0.4736, + "step": 1499 + }, + { + "epoch": 0.000526513878774216, + "grad_norm": 0.37526455521583557, + "learning_rate": 0.00010023372287145244, + "loss": 0.5242, + "step": 1500 + }, + { + "epoch": 0.0005268648880267322, + "grad_norm": 0.3161305785179138, + "learning_rate": 0.00010016694490818031, + "loss": 0.5977, + "step": 1501 + }, + { + "epoch": 0.0005272158972792483, + "grad_norm": 0.37831833958625793, + "learning_rate": 0.00010010016694490818, + "loss": 0.4086, + "step": 1502 + }, + { + "epoch": 0.0005275669065317644, + "grad_norm": 0.32192960381507874, + "learning_rate": 0.00010003338898163605, + "loss": 0.5963, + "step": 1503 + }, + { + "epoch": 0.0005279179157842806, + "grad_norm": 0.2514601945877075, + "learning_rate": 9.996661101836394e-05, + "loss": 0.4623, + "step": 1504 + }, + { + "epoch": 0.0005282689250367968, + "grad_norm": 0.2768949270248413, + "learning_rate": 9.989983305509183e-05, + "loss": 0.4104, + "step": 1505 + }, + { + "epoch": 0.0005286199342893129, + "grad_norm": 0.405597448348999, + "learning_rate": 9.98330550918197e-05, + "loss": 0.6432, + "step": 1506 + }, + { + "epoch": 0.000528970943541829, + "grad_norm": 0.36578214168548584, + "learning_rate": 9.976627712854757e-05, + "loss": 0.5242, + "step": 1507 + }, + { + "epoch": 0.0005293219527943452, + "grad_norm": 0.3062324821949005, + "learning_rate": 9.969949916527546e-05, + "loss": 0.5465, + "step": 1508 + }, + { + "epoch": 0.0005296729620468614, + "grad_norm": 0.35657453536987305, + "learning_rate": 9.963272120200335e-05, + "loss": 0.521, + "step": 1509 + }, + { + "epoch": 0.0005300239712993774, + "grad_norm": 0.4276399612426758, + "learning_rate": 9.956594323873122e-05, + "loss": 0.6213, + "step": 1510 + }, + { + "epoch": 0.0005303749805518936, + "grad_norm": 0.2819078862667084, + "learning_rate": 9.949916527545911e-05, + "loss": 0.5481, + "step": 1511 + }, + { + "epoch": 0.0005307259898044098, + "grad_norm": 0.31928518414497375, + "learning_rate": 9.943238731218698e-05, + "loss": 0.6391, + "step": 1512 + }, + { + "epoch": 0.0005310769990569258, + "grad_norm": 0.30502405762672424, + "learning_rate": 9.936560934891487e-05, + "loss": 0.6408, + "step": 1513 + }, + { + "epoch": 0.000531428008309442, + "grad_norm": 0.33620330691337585, + "learning_rate": 9.929883138564274e-05, + "loss": 0.5161, + "step": 1514 + }, + { + "epoch": 0.0005317790175619582, + "grad_norm": 0.34412580728530884, + "learning_rate": 9.923205342237061e-05, + "loss": 0.5942, + "step": 1515 + }, + { + "epoch": 0.0005321300268144744, + "grad_norm": 0.4236386716365814, + "learning_rate": 9.91652754590985e-05, + "loss": 0.6118, + "step": 1516 + }, + { + "epoch": 0.0005324810360669904, + "grad_norm": 0.3482692539691925, + "learning_rate": 9.909849749582639e-05, + "loss": 0.4935, + "step": 1517 + }, + { + "epoch": 0.0005328320453195066, + "grad_norm": 0.36736801266670227, + "learning_rate": 9.903171953255426e-05, + "loss": 0.5077, + "step": 1518 + }, + { + "epoch": 0.0005331830545720228, + "grad_norm": 0.3174130916595459, + "learning_rate": 9.896494156928215e-05, + "loss": 0.5046, + "step": 1519 + }, + { + "epoch": 0.0005335340638245389, + "grad_norm": 0.35202938318252563, + "learning_rate": 9.889816360601002e-05, + "loss": 0.5843, + "step": 1520 + }, + { + "epoch": 0.000533885073077055, + "grad_norm": 0.3530493974685669, + "learning_rate": 9.883138564273791e-05, + "loss": 0.4544, + "step": 1521 + }, + { + "epoch": 0.0005342360823295712, + "grad_norm": 0.36287322640419006, + "learning_rate": 9.876460767946578e-05, + "loss": 0.3369, + "step": 1522 + }, + { + "epoch": 0.0005345870915820873, + "grad_norm": 0.32286468148231506, + "learning_rate": 9.869782971619365e-05, + "loss": 0.4056, + "step": 1523 + }, + { + "epoch": 0.0005349381008346035, + "grad_norm": 0.34090831875801086, + "learning_rate": 9.863105175292154e-05, + "loss": 0.4746, + "step": 1524 + }, + { + "epoch": 0.0005352891100871196, + "grad_norm": 0.35454946756362915, + "learning_rate": 9.856427378964941e-05, + "loss": 0.5432, + "step": 1525 + }, + { + "epoch": 0.0005356401193396357, + "grad_norm": 0.3738378584384918, + "learning_rate": 9.84974958263773e-05, + "loss": 0.5179, + "step": 1526 + }, + { + "epoch": 0.0005359911285921519, + "grad_norm": 0.309709370136261, + "learning_rate": 9.843071786310519e-05, + "loss": 0.5872, + "step": 1527 + }, + { + "epoch": 0.0005363421378446681, + "grad_norm": 0.2821864187717438, + "learning_rate": 9.836393989983306e-05, + "loss": 0.5162, + "step": 1528 + }, + { + "epoch": 0.0005366931470971842, + "grad_norm": 0.46964001655578613, + "learning_rate": 9.829716193656095e-05, + "loss": 0.4713, + "step": 1529 + }, + { + "epoch": 0.0005370441563497003, + "grad_norm": 0.3433643877506256, + "learning_rate": 9.823038397328882e-05, + "loss": 0.516, + "step": 1530 + }, + { + "epoch": 0.0005373951656022165, + "grad_norm": 0.347112774848938, + "learning_rate": 9.816360601001669e-05, + "loss": 0.3772, + "step": 1531 + }, + { + "epoch": 0.0005377461748547327, + "grad_norm": 0.2924909293651581, + "learning_rate": 9.809682804674458e-05, + "loss": 0.5616, + "step": 1532 + }, + { + "epoch": 0.0005380971841072487, + "grad_norm": 0.36090362071990967, + "learning_rate": 9.803005008347245e-05, + "loss": 0.5715, + "step": 1533 + }, + { + "epoch": 0.0005384481933597649, + "grad_norm": 0.31504470109939575, + "learning_rate": 9.796327212020034e-05, + "loss": 0.5311, + "step": 1534 + }, + { + "epoch": 0.0005387992026122811, + "grad_norm": 0.34885862469673157, + "learning_rate": 9.789649415692823e-05, + "loss": 0.4626, + "step": 1535 + }, + { + "epoch": 0.0005391502118647971, + "grad_norm": 0.34042325615882874, + "learning_rate": 9.78297161936561e-05, + "loss": 0.4934, + "step": 1536 + }, + { + "epoch": 0.0005395012211173133, + "grad_norm": 0.39018404483795166, + "learning_rate": 9.776293823038399e-05, + "loss": 0.5814, + "step": 1537 + }, + { + "epoch": 0.0005398522303698295, + "grad_norm": 0.2676241397857666, + "learning_rate": 9.769616026711186e-05, + "loss": 0.438, + "step": 1538 + }, + { + "epoch": 0.0005402032396223457, + "grad_norm": 0.32380932569503784, + "learning_rate": 9.762938230383973e-05, + "loss": 0.6087, + "step": 1539 + }, + { + "epoch": 0.0005405542488748617, + "grad_norm": 0.35949036478996277, + "learning_rate": 9.756260434056762e-05, + "loss": 0.6076, + "step": 1540 + }, + { + "epoch": 0.0005409052581273779, + "grad_norm": 0.29408982396125793, + "learning_rate": 9.749582637729549e-05, + "loss": 0.5134, + "step": 1541 + }, + { + "epoch": 0.0005412562673798941, + "grad_norm": 0.30686628818511963, + "learning_rate": 9.742904841402337e-05, + "loss": 0.5617, + "step": 1542 + }, + { + "epoch": 0.0005416072766324102, + "grad_norm": 0.37297409772872925, + "learning_rate": 9.736227045075125e-05, + "loss": 0.4563, + "step": 1543 + }, + { + "epoch": 0.0005419582858849263, + "grad_norm": 0.3103518486022949, + "learning_rate": 9.729549248747914e-05, + "loss": 0.5235, + "step": 1544 + }, + { + "epoch": 0.0005423092951374425, + "grad_norm": 0.3941648602485657, + "learning_rate": 9.722871452420703e-05, + "loss": 0.6541, + "step": 1545 + }, + { + "epoch": 0.0005426603043899586, + "grad_norm": 0.30755361914634705, + "learning_rate": 9.71619365609349e-05, + "loss": 0.5612, + "step": 1546 + }, + { + "epoch": 0.0005430113136424748, + "grad_norm": 0.35478439927101135, + "learning_rate": 9.709515859766277e-05, + "loss": 0.6078, + "step": 1547 + }, + { + "epoch": 0.0005433623228949909, + "grad_norm": 0.30011776089668274, + "learning_rate": 9.702838063439066e-05, + "loss": 0.3989, + "step": 1548 + }, + { + "epoch": 0.0005437133321475071, + "grad_norm": 0.3524412214756012, + "learning_rate": 9.696160267111853e-05, + "loss": 0.554, + "step": 1549 + }, + { + "epoch": 0.0005440643414000232, + "grad_norm": 0.33379805088043213, + "learning_rate": 9.68948247078464e-05, + "loss": 0.4773, + "step": 1550 + }, + { + "epoch": 0.0005444153506525394, + "grad_norm": 0.3144623339176178, + "learning_rate": 9.682804674457429e-05, + "loss": 0.5397, + "step": 1551 + }, + { + "epoch": 0.0005447663599050555, + "grad_norm": 0.3189099431037903, + "learning_rate": 9.676126878130218e-05, + "loss": 0.5577, + "step": 1552 + }, + { + "epoch": 0.0005451173691575716, + "grad_norm": 0.2930092215538025, + "learning_rate": 9.669449081803006e-05, + "loss": 0.436, + "step": 1553 + }, + { + "epoch": 0.0005454683784100878, + "grad_norm": 0.30305665731430054, + "learning_rate": 9.662771285475794e-05, + "loss": 0.6663, + "step": 1554 + }, + { + "epoch": 0.000545819387662604, + "grad_norm": 0.31724509596824646, + "learning_rate": 9.656093489148581e-05, + "loss": 0.5232, + "step": 1555 + }, + { + "epoch": 0.00054617039691512, + "grad_norm": 0.3048739731311798, + "learning_rate": 9.64941569282137e-05, + "loss": 0.5975, + "step": 1556 + }, + { + "epoch": 0.0005465214061676362, + "grad_norm": 0.313481867313385, + "learning_rate": 9.642737896494157e-05, + "loss": 0.5658, + "step": 1557 + }, + { + "epoch": 0.0005468724154201524, + "grad_norm": 0.3365669548511505, + "learning_rate": 9.636060100166944e-05, + "loss": 0.5356, + "step": 1558 + }, + { + "epoch": 0.0005472234246726686, + "grad_norm": 0.29624179005622864, + "learning_rate": 9.629382303839733e-05, + "loss": 0.5596, + "step": 1559 + }, + { + "epoch": 0.0005475744339251846, + "grad_norm": 0.32584840059280396, + "learning_rate": 9.62270450751252e-05, + "loss": 0.6195, + "step": 1560 + }, + { + "epoch": 0.0005479254431777008, + "grad_norm": 0.3141777217388153, + "learning_rate": 9.616026711185309e-05, + "loss": 0.5428, + "step": 1561 + }, + { + "epoch": 0.000548276452430217, + "grad_norm": 0.49182063341140747, + "learning_rate": 9.609348914858098e-05, + "loss": 0.4425, + "step": 1562 + }, + { + "epoch": 0.000548627461682733, + "grad_norm": 0.3521610200405121, + "learning_rate": 9.602671118530885e-05, + "loss": 0.4566, + "step": 1563 + }, + { + "epoch": 0.0005489784709352492, + "grad_norm": 0.32009604573249817, + "learning_rate": 9.595993322203674e-05, + "loss": 0.4673, + "step": 1564 + }, + { + "epoch": 0.0005493294801877654, + "grad_norm": 0.4251219630241394, + "learning_rate": 9.589315525876461e-05, + "loss": 0.4582, + "step": 1565 + }, + { + "epoch": 0.0005496804894402815, + "grad_norm": 0.4044347107410431, + "learning_rate": 9.582637729549248e-05, + "loss": 0.6094, + "step": 1566 + }, + { + "epoch": 0.0005500314986927976, + "grad_norm": 0.37995630502700806, + "learning_rate": 9.575959933222037e-05, + "loss": 0.5798, + "step": 1567 + }, + { + "epoch": 0.0005503825079453138, + "grad_norm": 0.36014696955680847, + "learning_rate": 9.569282136894824e-05, + "loss": 0.5578, + "step": 1568 + }, + { + "epoch": 0.00055073351719783, + "grad_norm": 0.36085575819015503, + "learning_rate": 9.562604340567613e-05, + "loss": 0.5723, + "step": 1569 + }, + { + "epoch": 0.0005510845264503461, + "grad_norm": 0.34479430317878723, + "learning_rate": 9.555926544240402e-05, + "loss": 0.3701, + "step": 1570 + }, + { + "epoch": 0.0005514355357028622, + "grad_norm": 0.29680463671684265, + "learning_rate": 9.549248747913189e-05, + "loss": 0.443, + "step": 1571 + }, + { + "epoch": 0.0005517865449553784, + "grad_norm": 0.282615065574646, + "learning_rate": 9.542570951585978e-05, + "loss": 0.4408, + "step": 1572 + }, + { + "epoch": 0.0005521375542078945, + "grad_norm": 0.30851373076438904, + "learning_rate": 9.535893155258765e-05, + "loss": 0.5537, + "step": 1573 + }, + { + "epoch": 0.0005524885634604107, + "grad_norm": 0.41260892152786255, + "learning_rate": 9.529215358931554e-05, + "loss": 0.5401, + "step": 1574 + }, + { + "epoch": 0.0005528395727129268, + "grad_norm": 0.31149372458457947, + "learning_rate": 9.522537562604341e-05, + "loss": 0.4033, + "step": 1575 + }, + { + "epoch": 0.0005531905819654429, + "grad_norm": 0.33126652240753174, + "learning_rate": 9.515859766277128e-05, + "loss": 0.5386, + "step": 1576 + }, + { + "epoch": 0.0005535415912179591, + "grad_norm": 0.2965177297592163, + "learning_rate": 9.509181969949917e-05, + "loss": 0.4869, + "step": 1577 + }, + { + "epoch": 0.0005538926004704753, + "grad_norm": 0.28436359763145447, + "learning_rate": 9.502504173622706e-05, + "loss": 0.5632, + "step": 1578 + }, + { + "epoch": 0.0005542436097229914, + "grad_norm": 0.3518412113189697, + "learning_rate": 9.495826377295493e-05, + "loss": 0.4086, + "step": 1579 + }, + { + "epoch": 0.0005545946189755075, + "grad_norm": 0.3295888900756836, + "learning_rate": 9.489148580968282e-05, + "loss": 0.5742, + "step": 1580 + }, + { + "epoch": 0.0005549456282280237, + "grad_norm": 0.3147815763950348, + "learning_rate": 9.482470784641069e-05, + "loss": 0.5332, + "step": 1581 + }, + { + "epoch": 0.0005552966374805399, + "grad_norm": 0.30593639612197876, + "learning_rate": 9.475792988313858e-05, + "loss": 0.5496, + "step": 1582 + }, + { + "epoch": 0.0005556476467330559, + "grad_norm": 0.3162075877189636, + "learning_rate": 9.469115191986645e-05, + "loss": 0.5912, + "step": 1583 + }, + { + "epoch": 0.0005559986559855721, + "grad_norm": 0.32497403025627136, + "learning_rate": 9.462437395659432e-05, + "loss": 0.5494, + "step": 1584 + }, + { + "epoch": 0.0005563496652380883, + "grad_norm": 0.31055036187171936, + "learning_rate": 9.455759599332221e-05, + "loss": 0.6336, + "step": 1585 + }, + { + "epoch": 0.0005567006744906044, + "grad_norm": 0.33537331223487854, + "learning_rate": 9.449081803005008e-05, + "loss": 0.4221, + "step": 1586 + }, + { + "epoch": 0.0005570516837431205, + "grad_norm": 0.3572893440723419, + "learning_rate": 9.442404006677797e-05, + "loss": 0.5553, + "step": 1587 + }, + { + "epoch": 0.0005574026929956367, + "grad_norm": 0.3298802375793457, + "learning_rate": 9.435726210350586e-05, + "loss": 0.5121, + "step": 1588 + }, + { + "epoch": 0.0005577537022481529, + "grad_norm": 0.3529982268810272, + "learning_rate": 9.429048414023373e-05, + "loss": 0.3964, + "step": 1589 + }, + { + "epoch": 0.000558104711500669, + "grad_norm": 0.294223427772522, + "learning_rate": 9.422370617696162e-05, + "loss": 0.4495, + "step": 1590 + }, + { + "epoch": 0.0005584557207531851, + "grad_norm": 0.2953149676322937, + "learning_rate": 9.415692821368949e-05, + "loss": 0.4241, + "step": 1591 + }, + { + "epoch": 0.0005588067300057013, + "grad_norm": 0.31237637996673584, + "learning_rate": 9.409015025041736e-05, + "loss": 0.5025, + "step": 1592 + }, + { + "epoch": 0.0005591577392582174, + "grad_norm": 0.31202566623687744, + "learning_rate": 9.402337228714525e-05, + "loss": 0.5442, + "step": 1593 + }, + { + "epoch": 0.0005595087485107335, + "grad_norm": 0.34976473450660706, + "learning_rate": 9.395659432387312e-05, + "loss": 0.5314, + "step": 1594 + }, + { + "epoch": 0.0005598597577632497, + "grad_norm": 0.3305265009403229, + "learning_rate": 9.388981636060101e-05, + "loss": 0.4842, + "step": 1595 + }, + { + "epoch": 0.0005602107670157658, + "grad_norm": 0.30773475766181946, + "learning_rate": 9.38230383973289e-05, + "loss": 0.4621, + "step": 1596 + }, + { + "epoch": 0.000560561776268282, + "grad_norm": 0.35445886850357056, + "learning_rate": 9.375626043405677e-05, + "loss": 0.509, + "step": 1597 + }, + { + "epoch": 0.0005609127855207981, + "grad_norm": 0.46057018637657166, + "learning_rate": 9.368948247078465e-05, + "loss": 0.5471, + "step": 1598 + }, + { + "epoch": 0.0005612637947733143, + "grad_norm": 0.3413529396057129, + "learning_rate": 9.362270450751253e-05, + "loss": 0.5558, + "step": 1599 + }, + { + "epoch": 0.0005616148040258304, + "grad_norm": 0.36943134665489197, + "learning_rate": 9.35559265442404e-05, + "loss": 0.4718, + "step": 1600 + }, + { + "epoch": 0.0005619658132783466, + "grad_norm": 0.3529636263847351, + "learning_rate": 9.348914858096829e-05, + "loss": 0.4591, + "step": 1601 + }, + { + "epoch": 0.0005623168225308627, + "grad_norm": 0.3375125229358673, + "learning_rate": 9.342237061769616e-05, + "loss": 0.4971, + "step": 1602 + }, + { + "epoch": 0.0005626678317833788, + "grad_norm": 0.3923933804035187, + "learning_rate": 9.335559265442403e-05, + "loss": 0.545, + "step": 1603 + }, + { + "epoch": 0.000563018841035895, + "grad_norm": 0.3128841519355774, + "learning_rate": 9.328881469115192e-05, + "loss": 0.4374, + "step": 1604 + }, + { + "epoch": 0.0005633698502884112, + "grad_norm": 0.3729458153247833, + "learning_rate": 9.322203672787981e-05, + "loss": 0.581, + "step": 1605 + }, + { + "epoch": 0.0005637208595409272, + "grad_norm": 0.3644692003726959, + "learning_rate": 9.31552587646077e-05, + "loss": 0.5223, + "step": 1606 + }, + { + "epoch": 0.0005640718687934434, + "grad_norm": 0.365633100271225, + "learning_rate": 9.308848080133557e-05, + "loss": 0.3695, + "step": 1607 + }, + { + "epoch": 0.0005644228780459596, + "grad_norm": 0.3256838917732239, + "learning_rate": 9.302170283806344e-05, + "loss": 0.5484, + "step": 1608 + }, + { + "epoch": 0.0005647738872984758, + "grad_norm": 0.26042798161506653, + "learning_rate": 9.295492487479133e-05, + "loss": 0.529, + "step": 1609 + }, + { + "epoch": 0.0005651248965509918, + "grad_norm": 0.27954763174057007, + "learning_rate": 9.28881469115192e-05, + "loss": 0.5216, + "step": 1610 + }, + { + "epoch": 0.000565475905803508, + "grad_norm": 0.3117378354072571, + "learning_rate": 9.282136894824707e-05, + "loss": 0.4835, + "step": 1611 + }, + { + "epoch": 0.0005658269150560242, + "grad_norm": 0.3219063878059387, + "learning_rate": 9.275459098497496e-05, + "loss": 0.6403, + "step": 1612 + }, + { + "epoch": 0.0005661779243085403, + "grad_norm": 0.32121285796165466, + "learning_rate": 9.268781302170285e-05, + "loss": 0.5992, + "step": 1613 + }, + { + "epoch": 0.0005665289335610564, + "grad_norm": 0.2896992564201355, + "learning_rate": 9.262103505843073e-05, + "loss": 0.4995, + "step": 1614 + }, + { + "epoch": 0.0005668799428135726, + "grad_norm": 0.311301589012146, + "learning_rate": 9.255425709515861e-05, + "loss": 0.5201, + "step": 1615 + }, + { + "epoch": 0.0005672309520660887, + "grad_norm": 0.2977074682712555, + "learning_rate": 9.248747913188648e-05, + "loss": 0.5557, + "step": 1616 + }, + { + "epoch": 0.0005675819613186049, + "grad_norm": 0.315746009349823, + "learning_rate": 9.242070116861437e-05, + "loss": 0.5746, + "step": 1617 + }, + { + "epoch": 0.000567932970571121, + "grad_norm": 0.323231965303421, + "learning_rate": 9.235392320534224e-05, + "loss": 0.5714, + "step": 1618 + }, + { + "epoch": 0.0005682839798236372, + "grad_norm": 0.30381882190704346, + "learning_rate": 9.228714524207011e-05, + "loss": 0.5279, + "step": 1619 + }, + { + "epoch": 0.0005686349890761533, + "grad_norm": 0.3350276052951813, + "learning_rate": 9.2220367278798e-05, + "loss": 0.4504, + "step": 1620 + }, + { + "epoch": 0.0005689859983286694, + "grad_norm": 0.3821620047092438, + "learning_rate": 9.215358931552587e-05, + "loss": 0.4713, + "step": 1621 + }, + { + "epoch": 0.0005693370075811856, + "grad_norm": 0.299938827753067, + "learning_rate": 9.208681135225376e-05, + "loss": 0.5426, + "step": 1622 + }, + { + "epoch": 0.0005696880168337017, + "grad_norm": 0.3533617854118347, + "learning_rate": 9.202003338898165e-05, + "loss": 0.5947, + "step": 1623 + }, + { + "epoch": 0.0005700390260862179, + "grad_norm": 0.5132538080215454, + "learning_rate": 9.195325542570952e-05, + "loss": 0.4809, + "step": 1624 + }, + { + "epoch": 0.000570390035338734, + "grad_norm": 0.28735020756721497, + "learning_rate": 9.18864774624374e-05, + "loss": 0.5597, + "step": 1625 + }, + { + "epoch": 0.0005707410445912501, + "grad_norm": 0.3230040669441223, + "learning_rate": 9.181969949916528e-05, + "loss": 0.5099, + "step": 1626 + }, + { + "epoch": 0.0005710920538437663, + "grad_norm": 0.3185240924358368, + "learning_rate": 9.175292153589315e-05, + "loss": 0.5443, + "step": 1627 + }, + { + "epoch": 0.0005714430630962825, + "grad_norm": 0.3230789005756378, + "learning_rate": 9.168614357262104e-05, + "loss": 0.4757, + "step": 1628 + }, + { + "epoch": 0.0005717940723487986, + "grad_norm": 0.3181735873222351, + "learning_rate": 9.161936560934891e-05, + "loss": 0.4645, + "step": 1629 + }, + { + "epoch": 0.0005721450816013147, + "grad_norm": 0.31638282537460327, + "learning_rate": 9.15525876460768e-05, + "loss": 0.6041, + "step": 1630 + }, + { + "epoch": 0.0005724960908538309, + "grad_norm": 0.31525102257728577, + "learning_rate": 9.148580968280469e-05, + "loss": 0.5238, + "step": 1631 + }, + { + "epoch": 0.0005728471001063471, + "grad_norm": 0.27146804332733154, + "learning_rate": 9.141903171953256e-05, + "loss": 0.5115, + "step": 1632 + }, + { + "epoch": 0.0005731981093588631, + "grad_norm": 0.28801295161247253, + "learning_rate": 9.135225375626045e-05, + "loss": 0.4111, + "step": 1633 + }, + { + "epoch": 0.0005735491186113793, + "grad_norm": 0.3048948645591736, + "learning_rate": 9.128547579298832e-05, + "loss": 0.5293, + "step": 1634 + }, + { + "epoch": 0.0005739001278638955, + "grad_norm": 0.31797000765800476, + "learning_rate": 9.121869782971619e-05, + "loss": 0.5365, + "step": 1635 + }, + { + "epoch": 0.0005742511371164116, + "grad_norm": 0.3156517446041107, + "learning_rate": 9.115191986644408e-05, + "loss": 0.6072, + "step": 1636 + }, + { + "epoch": 0.0005746021463689277, + "grad_norm": 0.28218841552734375, + "learning_rate": 9.108514190317195e-05, + "loss": 0.5127, + "step": 1637 + }, + { + "epoch": 0.0005749531556214439, + "grad_norm": 0.34264588356018066, + "learning_rate": 9.101836393989984e-05, + "loss": 0.5442, + "step": 1638 + }, + { + "epoch": 0.0005753041648739601, + "grad_norm": 0.31075727939605713, + "learning_rate": 9.095158597662771e-05, + "loss": 0.4853, + "step": 1639 + }, + { + "epoch": 0.0005756551741264762, + "grad_norm": 0.34270209074020386, + "learning_rate": 9.08848080133556e-05, + "loss": 0.5188, + "step": 1640 + }, + { + "epoch": 0.0005760061833789923, + "grad_norm": 0.3420792520046234, + "learning_rate": 9.081803005008348e-05, + "loss": 0.552, + "step": 1641 + }, + { + "epoch": 0.0005763571926315085, + "grad_norm": 0.24184514582157135, + "learning_rate": 9.075125208681136e-05, + "loss": 0.4318, + "step": 1642 + }, + { + "epoch": 0.0005767082018840246, + "grad_norm": 0.27248474955558777, + "learning_rate": 9.068447412353923e-05, + "loss": 0.4984, + "step": 1643 + }, + { + "epoch": 0.0005770592111365408, + "grad_norm": 0.2861645817756653, + "learning_rate": 9.061769616026712e-05, + "loss": 0.4954, + "step": 1644 + }, + { + "epoch": 0.0005774102203890569, + "grad_norm": 0.3070414662361145, + "learning_rate": 9.055091819699499e-05, + "loss": 0.5734, + "step": 1645 + }, + { + "epoch": 0.000577761229641573, + "grad_norm": 0.32180657982826233, + "learning_rate": 9.048414023372288e-05, + "loss": 0.595, + "step": 1646 + }, + { + "epoch": 0.0005781122388940892, + "grad_norm": 0.29433441162109375, + "learning_rate": 9.041736227045075e-05, + "loss": 0.4721, + "step": 1647 + }, + { + "epoch": 0.0005784632481466053, + "grad_norm": 0.28735247254371643, + "learning_rate": 9.035058430717864e-05, + "loss": 0.5441, + "step": 1648 + }, + { + "epoch": 0.0005788142573991215, + "grad_norm": 0.38344794511795044, + "learning_rate": 9.028380634390652e-05, + "loss": 0.6197, + "step": 1649 + }, + { + "epoch": 0.0005791652666516376, + "grad_norm": 0.32271769642829895, + "learning_rate": 9.02170283806344e-05, + "loss": 0.5229, + "step": 1650 + }, + { + "epoch": 0.0005795162759041538, + "grad_norm": 0.27504557371139526, + "learning_rate": 9.015025041736227e-05, + "loss": 0.432, + "step": 1651 + }, + { + "epoch": 0.00057986728515667, + "grad_norm": 0.3397347033023834, + "learning_rate": 9.008347245409016e-05, + "loss": 0.5546, + "step": 1652 + }, + { + "epoch": 0.000580218294409186, + "grad_norm": 0.3478119671344757, + "learning_rate": 9.001669449081803e-05, + "loss": 0.5094, + "step": 1653 + }, + { + "epoch": 0.0005805693036617022, + "grad_norm": 0.3200027644634247, + "learning_rate": 8.994991652754592e-05, + "loss": 0.4964, + "step": 1654 + }, + { + "epoch": 0.0005809203129142184, + "grad_norm": 0.3458947539329529, + "learning_rate": 8.988313856427379e-05, + "loss": 0.5945, + "step": 1655 + }, + { + "epoch": 0.0005812713221667344, + "grad_norm": 0.30390462279319763, + "learning_rate": 8.981636060100166e-05, + "loss": 0.5664, + "step": 1656 + }, + { + "epoch": 0.0005816223314192506, + "grad_norm": 0.32214075326919556, + "learning_rate": 8.974958263772955e-05, + "loss": 0.464, + "step": 1657 + }, + { + "epoch": 0.0005819733406717668, + "grad_norm": 0.3261844217777252, + "learning_rate": 8.968280467445744e-05, + "loss": 0.6139, + "step": 1658 + }, + { + "epoch": 0.000582324349924283, + "grad_norm": 0.30164632201194763, + "learning_rate": 8.961602671118531e-05, + "loss": 0.4767, + "step": 1659 + }, + { + "epoch": 0.000582675359176799, + "grad_norm": 0.27412328124046326, + "learning_rate": 8.95492487479132e-05, + "loss": 0.4773, + "step": 1660 + }, + { + "epoch": 0.0005830263684293152, + "grad_norm": 0.3026188313961029, + "learning_rate": 8.948247078464107e-05, + "loss": 0.5091, + "step": 1661 + }, + { + "epoch": 0.0005833773776818314, + "grad_norm": 0.4182475507259369, + "learning_rate": 8.941569282136896e-05, + "loss": 0.4763, + "step": 1662 + }, + { + "epoch": 0.0005837283869343475, + "grad_norm": 0.32345879077911377, + "learning_rate": 8.934891485809683e-05, + "loss": 0.4365, + "step": 1663 + }, + { + "epoch": 0.0005840793961868636, + "grad_norm": 0.27278438210487366, + "learning_rate": 8.92821368948247e-05, + "loss": 0.4126, + "step": 1664 + }, + { + "epoch": 0.0005844304054393798, + "grad_norm": 0.2701342701911926, + "learning_rate": 8.921535893155259e-05, + "loss": 0.44, + "step": 1665 + }, + { + "epoch": 0.0005847814146918959, + "grad_norm": 0.33415308594703674, + "learning_rate": 8.914858096828048e-05, + "loss": 0.4873, + "step": 1666 + }, + { + "epoch": 0.0005851324239444121, + "grad_norm": 0.25953027606010437, + "learning_rate": 8.908180300500835e-05, + "loss": 0.5047, + "step": 1667 + }, + { + "epoch": 0.0005854834331969282, + "grad_norm": 0.2938767373561859, + "learning_rate": 8.901502504173624e-05, + "loss": 0.5472, + "step": 1668 + }, + { + "epoch": 0.0005858344424494444, + "grad_norm": 0.34639960527420044, + "learning_rate": 8.894824707846411e-05, + "loss": 0.4647, + "step": 1669 + }, + { + "epoch": 0.0005861854517019605, + "grad_norm": 0.30084356665611267, + "learning_rate": 8.8881469115192e-05, + "loss": 0.5751, + "step": 1670 + }, + { + "epoch": 0.0005865364609544767, + "grad_norm": 0.3419461250305176, + "learning_rate": 8.881469115191987e-05, + "loss": 0.5945, + "step": 1671 + }, + { + "epoch": 0.0005868874702069928, + "grad_norm": 0.30969375371932983, + "learning_rate": 8.874791318864774e-05, + "loss": 0.476, + "step": 1672 + }, + { + "epoch": 0.0005872384794595089, + "grad_norm": 0.2766319513320923, + "learning_rate": 8.868113522537563e-05, + "loss": 0.471, + "step": 1673 + }, + { + "epoch": 0.0005875894887120251, + "grad_norm": 0.2892490327358246, + "learning_rate": 8.86143572621035e-05, + "loss": 0.5525, + "step": 1674 + }, + { + "epoch": 0.0005879404979645413, + "grad_norm": 0.2913951575756073, + "learning_rate": 8.854757929883139e-05, + "loss": 0.5969, + "step": 1675 + }, + { + "epoch": 0.0005882915072170573, + "grad_norm": 0.3010789155960083, + "learning_rate": 8.848080133555928e-05, + "loss": 0.4817, + "step": 1676 + }, + { + "epoch": 0.0005886425164695735, + "grad_norm": 0.29977700114250183, + "learning_rate": 8.841402337228715e-05, + "loss": 0.4793, + "step": 1677 + }, + { + "epoch": 0.0005889935257220897, + "grad_norm": 0.3283400535583496, + "learning_rate": 8.834724540901504e-05, + "loss": 0.5056, + "step": 1678 + }, + { + "epoch": 0.0005893445349746058, + "grad_norm": 0.30444255471229553, + "learning_rate": 8.828046744574291e-05, + "loss": 0.4955, + "step": 1679 + }, + { + "epoch": 0.0005896955442271219, + "grad_norm": 0.3443448543548584, + "learning_rate": 8.821368948247078e-05, + "loss": 0.5143, + "step": 1680 + }, + { + "epoch": 0.0005900465534796381, + "grad_norm": 0.29445815086364746, + "learning_rate": 8.814691151919867e-05, + "loss": 0.5487, + "step": 1681 + }, + { + "epoch": 0.0005903975627321543, + "grad_norm": 0.2663688659667969, + "learning_rate": 8.808013355592654e-05, + "loss": 0.4625, + "step": 1682 + }, + { + "epoch": 0.0005907485719846703, + "grad_norm": 0.3313208222389221, + "learning_rate": 8.801335559265443e-05, + "loss": 0.5043, + "step": 1683 + }, + { + "epoch": 0.0005910995812371865, + "grad_norm": 0.33829203248023987, + "learning_rate": 8.794657762938232e-05, + "loss": 0.5575, + "step": 1684 + }, + { + "epoch": 0.0005914505904897027, + "grad_norm": 0.2788808047771454, + "learning_rate": 8.787979966611019e-05, + "loss": 0.3439, + "step": 1685 + }, + { + "epoch": 0.0005918015997422188, + "grad_norm": 0.2924749255180359, + "learning_rate": 8.781302170283808e-05, + "loss": 0.5249, + "step": 1686 + }, + { + "epoch": 0.0005921526089947349, + "grad_norm": 0.3375588357448578, + "learning_rate": 8.774624373956595e-05, + "loss": 0.5204, + "step": 1687 + }, + { + "epoch": 0.0005925036182472511, + "grad_norm": 0.31543827056884766, + "learning_rate": 8.767946577629382e-05, + "loss": 0.547, + "step": 1688 + }, + { + "epoch": 0.0005928546274997673, + "grad_norm": 0.29130932688713074, + "learning_rate": 8.761268781302171e-05, + "loss": 0.4064, + "step": 1689 + }, + { + "epoch": 0.0005932056367522834, + "grad_norm": 0.28948086500167847, + "learning_rate": 8.754590984974958e-05, + "loss": 0.4538, + "step": 1690 + }, + { + "epoch": 0.0005935566460047995, + "grad_norm": 0.3201799690723419, + "learning_rate": 8.747913188647745e-05, + "loss": 0.5578, + "step": 1691 + }, + { + "epoch": 0.0005939076552573157, + "grad_norm": 0.3169330954551697, + "learning_rate": 8.741235392320535e-05, + "loss": 0.5626, + "step": 1692 + }, + { + "epoch": 0.0005942586645098318, + "grad_norm": 0.34727850556373596, + "learning_rate": 8.734557595993323e-05, + "loss": 0.4209, + "step": 1693 + }, + { + "epoch": 0.000594609673762348, + "grad_norm": 0.3186934292316437, + "learning_rate": 8.727879799666111e-05, + "loss": 0.5401, + "step": 1694 + }, + { + "epoch": 0.0005949606830148641, + "grad_norm": 0.34129294753074646, + "learning_rate": 8.721202003338899e-05, + "loss": 0.4617, + "step": 1695 + }, + { + "epoch": 0.0005953116922673802, + "grad_norm": 0.3374929130077362, + "learning_rate": 8.714524207011686e-05, + "loss": 0.5558, + "step": 1696 + }, + { + "epoch": 0.0005956627015198964, + "grad_norm": 0.30274853110313416, + "learning_rate": 8.707846410684475e-05, + "loss": 0.4544, + "step": 1697 + }, + { + "epoch": 0.0005960137107724126, + "grad_norm": 0.3348468244075775, + "learning_rate": 8.701168614357262e-05, + "loss": 0.641, + "step": 1698 + }, + { + "epoch": 0.0005963647200249287, + "grad_norm": 0.2674828767776489, + "learning_rate": 8.694490818030051e-05, + "loss": 0.5649, + "step": 1699 + }, + { + "epoch": 0.0005967157292774448, + "grad_norm": 0.3447219729423523, + "learning_rate": 8.687813021702838e-05, + "loss": 0.4618, + "step": 1700 + }, + { + "epoch": 0.000597066738529961, + "grad_norm": 0.3155357241630554, + "learning_rate": 8.681135225375627e-05, + "loss": 0.4557, + "step": 1701 + }, + { + "epoch": 0.0005974177477824772, + "grad_norm": 0.2937457263469696, + "learning_rate": 8.674457429048415e-05, + "loss": 0.65, + "step": 1702 + }, + { + "epoch": 0.0005977687570349932, + "grad_norm": 0.287835031747818, + "learning_rate": 8.667779632721203e-05, + "loss": 0.4525, + "step": 1703 + }, + { + "epoch": 0.0005981197662875094, + "grad_norm": 0.3285943865776062, + "learning_rate": 8.66110183639399e-05, + "loss": 0.4686, + "step": 1704 + }, + { + "epoch": 0.0005984707755400256, + "grad_norm": 0.3463473618030548, + "learning_rate": 8.654424040066779e-05, + "loss": 0.5349, + "step": 1705 + }, + { + "epoch": 0.0005988217847925416, + "grad_norm": 0.3047028183937073, + "learning_rate": 8.647746243739566e-05, + "loss": 0.4551, + "step": 1706 + }, + { + "epoch": 0.0005991727940450578, + "grad_norm": 0.2832798361778259, + "learning_rate": 8.641068447412355e-05, + "loss": 0.4721, + "step": 1707 + }, + { + "epoch": 0.000599523803297574, + "grad_norm": 0.3024655878543854, + "learning_rate": 8.634390651085142e-05, + "loss": 0.4971, + "step": 1708 + }, + { + "epoch": 0.0005998748125500902, + "grad_norm": 0.2802872657775879, + "learning_rate": 8.62771285475793e-05, + "loss": 0.4598, + "step": 1709 + }, + { + "epoch": 0.0006002258218026062, + "grad_norm": 0.2773732841014862, + "learning_rate": 8.62103505843072e-05, + "loss": 0.4215, + "step": 1710 + }, + { + "epoch": 0.0006005768310551224, + "grad_norm": 0.3328293263912201, + "learning_rate": 8.614357262103507e-05, + "loss": 0.4502, + "step": 1711 + }, + { + "epoch": 0.0006009278403076386, + "grad_norm": 0.3046766519546509, + "learning_rate": 8.607679465776294e-05, + "loss": 0.4578, + "step": 1712 + }, + { + "epoch": 0.0006012788495601547, + "grad_norm": 0.33364781737327576, + "learning_rate": 8.601001669449083e-05, + "loss": 0.5184, + "step": 1713 + }, + { + "epoch": 0.0006016298588126708, + "grad_norm": 0.3627041280269623, + "learning_rate": 8.59432387312187e-05, + "loss": 0.5041, + "step": 1714 + }, + { + "epoch": 0.000601980868065187, + "grad_norm": 0.3411107361316681, + "learning_rate": 8.587646076794659e-05, + "loss": 0.4983, + "step": 1715 + }, + { + "epoch": 0.0006023318773177031, + "grad_norm": 0.3014586865901947, + "learning_rate": 8.580968280467446e-05, + "loss": 0.6105, + "step": 1716 + }, + { + "epoch": 0.0006026828865702193, + "grad_norm": 0.29484355449676514, + "learning_rate": 8.574290484140233e-05, + "loss": 0.4038, + "step": 1717 + }, + { + "epoch": 0.0006030338958227354, + "grad_norm": 0.37084364891052246, + "learning_rate": 8.567612687813022e-05, + "loss": 0.4537, + "step": 1718 + }, + { + "epoch": 0.0006033849050752516, + "grad_norm": 0.29114142060279846, + "learning_rate": 8.56093489148581e-05, + "loss": 0.5568, + "step": 1719 + }, + { + "epoch": 0.0006037359143277677, + "grad_norm": 0.3706299662590027, + "learning_rate": 8.554257095158598e-05, + "loss": 0.5678, + "step": 1720 + }, + { + "epoch": 0.0006040869235802839, + "grad_norm": 0.3251887857913971, + "learning_rate": 8.547579298831387e-05, + "loss": 0.573, + "step": 1721 + }, + { + "epoch": 0.0006044379328328, + "grad_norm": 0.28198716044425964, + "learning_rate": 8.540901502504174e-05, + "loss": 0.4209, + "step": 1722 + }, + { + "epoch": 0.0006047889420853161, + "grad_norm": 0.2896440029144287, + "learning_rate": 8.534223706176963e-05, + "loss": 0.4579, + "step": 1723 + }, + { + "epoch": 0.0006051399513378323, + "grad_norm": 0.3755309283733368, + "learning_rate": 8.52754590984975e-05, + "loss": 0.5178, + "step": 1724 + }, + { + "epoch": 0.0006054909605903485, + "grad_norm": 0.37272268533706665, + "learning_rate": 8.520868113522537e-05, + "loss": 0.5911, + "step": 1725 + }, + { + "epoch": 0.0006058419698428645, + "grad_norm": 0.29033470153808594, + "learning_rate": 8.514190317195326e-05, + "loss": 0.4492, + "step": 1726 + }, + { + "epoch": 0.0006061929790953807, + "grad_norm": 0.2940375804901123, + "learning_rate": 8.507512520868115e-05, + "loss": 0.4917, + "step": 1727 + }, + { + "epoch": 0.0006065439883478969, + "grad_norm": 0.3448154926300049, + "learning_rate": 8.500834724540902e-05, + "loss": 0.5053, + "step": 1728 + }, + { + "epoch": 0.000606894997600413, + "grad_norm": 0.30485787987709045, + "learning_rate": 8.49415692821369e-05, + "loss": 0.4761, + "step": 1729 + }, + { + "epoch": 0.0006072460068529291, + "grad_norm": 0.33083775639533997, + "learning_rate": 8.487479131886478e-05, + "loss": 0.5504, + "step": 1730 + }, + { + "epoch": 0.0006075970161054453, + "grad_norm": 0.2886825203895569, + "learning_rate": 8.480801335559267e-05, + "loss": 0.5908, + "step": 1731 + }, + { + "epoch": 0.0006079480253579615, + "grad_norm": 0.3262576758861542, + "learning_rate": 8.474123539232054e-05, + "loss": 0.4562, + "step": 1732 + }, + { + "epoch": 0.0006082990346104775, + "grad_norm": 0.31888243556022644, + "learning_rate": 8.467445742904841e-05, + "loss": 0.6006, + "step": 1733 + }, + { + "epoch": 0.0006086500438629937, + "grad_norm": 0.33102548122406006, + "learning_rate": 8.46076794657763e-05, + "loss": 0.5731, + "step": 1734 + }, + { + "epoch": 0.0006090010531155099, + "grad_norm": 0.31176602840423584, + "learning_rate": 8.454090150250417e-05, + "loss": 0.4979, + "step": 1735 + }, + { + "epoch": 0.000609352062368026, + "grad_norm": 0.30639031529426575, + "learning_rate": 8.447412353923206e-05, + "loss": 0.5953, + "step": 1736 + }, + { + "epoch": 0.0006097030716205421, + "grad_norm": 0.3576785922050476, + "learning_rate": 8.440734557595994e-05, + "loss": 0.612, + "step": 1737 + }, + { + "epoch": 0.0006100540808730583, + "grad_norm": 0.3325173854827881, + "learning_rate": 8.434056761268782e-05, + "loss": 0.5577, + "step": 1738 + }, + { + "epoch": 0.0006104050901255745, + "grad_norm": 0.3713616728782654, + "learning_rate": 8.42737896494157e-05, + "loss": 0.5457, + "step": 1739 + }, + { + "epoch": 0.0006107560993780906, + "grad_norm": 0.37327736616134644, + "learning_rate": 8.420701168614358e-05, + "loss": 0.4726, + "step": 1740 + }, + { + "epoch": 0.0006111071086306067, + "grad_norm": 0.3603207468986511, + "learning_rate": 8.414023372287145e-05, + "loss": 0.5489, + "step": 1741 + }, + { + "epoch": 0.0006114581178831229, + "grad_norm": 0.30581197142601013, + "learning_rate": 8.407345575959934e-05, + "loss": 0.4219, + "step": 1742 + }, + { + "epoch": 0.000611809127135639, + "grad_norm": 0.3137530982494354, + "learning_rate": 8.400667779632721e-05, + "loss": 0.4862, + "step": 1743 + }, + { + "epoch": 0.0006121601363881552, + "grad_norm": 0.28663527965545654, + "learning_rate": 8.39398998330551e-05, + "loss": 0.4886, + "step": 1744 + }, + { + "epoch": 0.0006125111456406713, + "grad_norm": 0.28816184401512146, + "learning_rate": 8.387312186978298e-05, + "loss": 0.4536, + "step": 1745 + }, + { + "epoch": 0.0006128621548931874, + "grad_norm": 0.36478331685066223, + "learning_rate": 8.380634390651086e-05, + "loss": 0.4393, + "step": 1746 + }, + { + "epoch": 0.0006132131641457036, + "grad_norm": 0.34497642517089844, + "learning_rate": 8.373956594323874e-05, + "loss": 0.4783, + "step": 1747 + }, + { + "epoch": 0.0006135641733982198, + "grad_norm": 0.34038984775543213, + "learning_rate": 8.367278797996662e-05, + "loss": 0.4024, + "step": 1748 + }, + { + "epoch": 0.0006139151826507358, + "grad_norm": 0.42788851261138916, + "learning_rate": 8.360601001669449e-05, + "loss": 0.4738, + "step": 1749 + }, + { + "epoch": 0.000614266191903252, + "grad_norm": 0.3174630105495453, + "learning_rate": 8.353923205342238e-05, + "loss": 0.571, + "step": 1750 + }, + { + "epoch": 0.0006146172011557682, + "grad_norm": 0.43922609090805054, + "learning_rate": 8.347245409015025e-05, + "loss": 0.6078, + "step": 1751 + }, + { + "epoch": 0.0006149682104082844, + "grad_norm": 0.3589128255844116, + "learning_rate": 8.340567612687812e-05, + "loss": 0.6748, + "step": 1752 + }, + { + "epoch": 0.0006153192196608004, + "grad_norm": 0.36477571725845337, + "learning_rate": 8.333889816360601e-05, + "loss": 0.4796, + "step": 1753 + }, + { + "epoch": 0.0006156702289133166, + "grad_norm": 0.3312797546386719, + "learning_rate": 8.32721202003339e-05, + "loss": 0.5847, + "step": 1754 + }, + { + "epoch": 0.0006160212381658328, + "grad_norm": 0.3113849461078644, + "learning_rate": 8.320534223706178e-05, + "loss": 0.5345, + "step": 1755 + }, + { + "epoch": 0.0006163722474183488, + "grad_norm": 0.3181850016117096, + "learning_rate": 8.313856427378966e-05, + "loss": 0.5949, + "step": 1756 + }, + { + "epoch": 0.000616723256670865, + "grad_norm": 0.44424140453338623, + "learning_rate": 8.307178631051753e-05, + "loss": 0.5702, + "step": 1757 + }, + { + "epoch": 0.0006170742659233812, + "grad_norm": 0.3985821604728699, + "learning_rate": 8.300500834724542e-05, + "loss": 0.5699, + "step": 1758 + }, + { + "epoch": 0.0006174252751758973, + "grad_norm": 0.3222169280052185, + "learning_rate": 8.293823038397329e-05, + "loss": 0.5349, + "step": 1759 + }, + { + "epoch": 0.0006177762844284134, + "grad_norm": 0.4233343303203583, + "learning_rate": 8.287145242070116e-05, + "loss": 0.5031, + "step": 1760 + }, + { + "epoch": 0.0006181272936809296, + "grad_norm": 0.3432156443595886, + "learning_rate": 8.280467445742905e-05, + "loss": 0.5084, + "step": 1761 + }, + { + "epoch": 0.0006184783029334458, + "grad_norm": 0.33886751532554626, + "learning_rate": 8.273789649415694e-05, + "loss": 0.4592, + "step": 1762 + }, + { + "epoch": 0.0006188293121859619, + "grad_norm": 0.3379828929901123, + "learning_rate": 8.267111853088482e-05, + "loss": 0.4691, + "step": 1763 + }, + { + "epoch": 0.000619180321438478, + "grad_norm": 0.2838027775287628, + "learning_rate": 8.26043405676127e-05, + "loss": 0.5345, + "step": 1764 + }, + { + "epoch": 0.0006195313306909942, + "grad_norm": 0.3198727071285248, + "learning_rate": 8.253756260434057e-05, + "loss": 0.6029, + "step": 1765 + }, + { + "epoch": 0.0006198823399435103, + "grad_norm": 0.37079837918281555, + "learning_rate": 8.247078464106846e-05, + "loss": 0.6643, + "step": 1766 + }, + { + "epoch": 0.0006202333491960265, + "grad_norm": 0.3130449652671814, + "learning_rate": 8.240400667779633e-05, + "loss": 0.5585, + "step": 1767 + }, + { + "epoch": 0.0006205843584485426, + "grad_norm": 0.29854029417037964, + "learning_rate": 8.23372287145242e-05, + "loss": 0.5202, + "step": 1768 + }, + { + "epoch": 0.0006209353677010587, + "grad_norm": 0.3536113202571869, + "learning_rate": 8.227045075125209e-05, + "loss": 0.5882, + "step": 1769 + }, + { + "epoch": 0.0006212863769535749, + "grad_norm": 0.2841801941394806, + "learning_rate": 8.220367278797996e-05, + "loss": 0.4227, + "step": 1770 + }, + { + "epoch": 0.0006216373862060911, + "grad_norm": 0.32225102186203003, + "learning_rate": 8.213689482470785e-05, + "loss": 0.5545, + "step": 1771 + }, + { + "epoch": 0.0006219883954586072, + "grad_norm": 0.3385821282863617, + "learning_rate": 8.207011686143574e-05, + "loss": 0.5307, + "step": 1772 + }, + { + "epoch": 0.0006223394047111233, + "grad_norm": 0.3400219976902008, + "learning_rate": 8.200333889816361e-05, + "loss": 0.5664, + "step": 1773 + }, + { + "epoch": 0.0006226904139636395, + "grad_norm": 0.4283548593521118, + "learning_rate": 8.19365609348915e-05, + "loss": 0.4957, + "step": 1774 + }, + { + "epoch": 0.0006230414232161557, + "grad_norm": 0.3625548779964447, + "learning_rate": 8.186978297161937e-05, + "loss": 0.4819, + "step": 1775 + }, + { + "epoch": 0.0006233924324686717, + "grad_norm": 0.34131062030792236, + "learning_rate": 8.180300500834724e-05, + "loss": 0.5277, + "step": 1776 + }, + { + "epoch": 0.0006237434417211879, + "grad_norm": 0.3383775055408478, + "learning_rate": 8.173622704507513e-05, + "loss": 0.5539, + "step": 1777 + }, + { + "epoch": 0.0006240944509737041, + "grad_norm": 0.2844056785106659, + "learning_rate": 8.1669449081803e-05, + "loss": 0.4959, + "step": 1778 + }, + { + "epoch": 0.0006244454602262201, + "grad_norm": 0.3345259428024292, + "learning_rate": 8.160267111853089e-05, + "loss": 0.5136, + "step": 1779 + }, + { + "epoch": 0.0006247964694787363, + "grad_norm": 0.32142356038093567, + "learning_rate": 8.153589315525877e-05, + "loss": 0.5348, + "step": 1780 + }, + { + "epoch": 0.0006251474787312525, + "grad_norm": 0.30291274189949036, + "learning_rate": 8.146911519198665e-05, + "loss": 0.5296, + "step": 1781 + }, + { + "epoch": 0.0006254984879837687, + "grad_norm": 0.36180031299591064, + "learning_rate": 8.140233722871453e-05, + "loss": 0.5498, + "step": 1782 + }, + { + "epoch": 0.0006258494972362847, + "grad_norm": 0.2952847182750702, + "learning_rate": 8.133555926544241e-05, + "loss": 0.5233, + "step": 1783 + }, + { + "epoch": 0.0006262005064888009, + "grad_norm": 0.2964370846748352, + "learning_rate": 8.126878130217028e-05, + "loss": 0.5787, + "step": 1784 + }, + { + "epoch": 0.0006265515157413171, + "grad_norm": 0.3017970323562622, + "learning_rate": 8.120200333889817e-05, + "loss": 0.5927, + "step": 1785 + }, + { + "epoch": 0.0006269025249938332, + "grad_norm": 0.32457467913627625, + "learning_rate": 8.113522537562604e-05, + "loss": 0.6207, + "step": 1786 + }, + { + "epoch": 0.0006272535342463493, + "grad_norm": 0.3024297058582306, + "learning_rate": 8.106844741235393e-05, + "loss": 0.5379, + "step": 1787 + }, + { + "epoch": 0.0006276045434988655, + "grad_norm": 0.2766537368297577, + "learning_rate": 8.10016694490818e-05, + "loss": 0.432, + "step": 1788 + }, + { + "epoch": 0.0006279555527513816, + "grad_norm": 0.3326070308685303, + "learning_rate": 8.093489148580969e-05, + "loss": 0.6633, + "step": 1789 + }, + { + "epoch": 0.0006283065620038978, + "grad_norm": 0.2948818802833557, + "learning_rate": 8.086811352253757e-05, + "loss": 0.4987, + "step": 1790 + }, + { + "epoch": 0.0006286575712564139, + "grad_norm": 0.28426218032836914, + "learning_rate": 8.080133555926545e-05, + "loss": 0.442, + "step": 1791 + }, + { + "epoch": 0.0006290085805089301, + "grad_norm": 0.30030035972595215, + "learning_rate": 8.073455759599332e-05, + "loss": 0.6064, + "step": 1792 + }, + { + "epoch": 0.0006293595897614462, + "grad_norm": 0.30664128065109253, + "learning_rate": 8.066777963272121e-05, + "loss": 0.4789, + "step": 1793 + }, + { + "epoch": 0.0006297105990139624, + "grad_norm": 0.30878594517707825, + "learning_rate": 8.060100166944908e-05, + "loss": 0.5365, + "step": 1794 + }, + { + "epoch": 0.0006300616082664785, + "grad_norm": 0.31132617592811584, + "learning_rate": 8.053422370617697e-05, + "loss": 0.5432, + "step": 1795 + }, + { + "epoch": 0.0006304126175189946, + "grad_norm": 0.3347366154193878, + "learning_rate": 8.046744574290484e-05, + "loss": 0.4208, + "step": 1796 + }, + { + "epoch": 0.0006307636267715108, + "grad_norm": 0.3419090509414673, + "learning_rate": 8.040066777963273e-05, + "loss": 0.4985, + "step": 1797 + }, + { + "epoch": 0.000631114636024027, + "grad_norm": 0.3174959719181061, + "learning_rate": 8.033388981636061e-05, + "loss": 0.4255, + "step": 1798 + }, + { + "epoch": 0.000631465645276543, + "grad_norm": 0.32764488458633423, + "learning_rate": 8.026711185308849e-05, + "loss": 0.6213, + "step": 1799 + }, + { + "epoch": 0.0006318166545290592, + "grad_norm": 0.3342370390892029, + "learning_rate": 8.020033388981636e-05, + "loss": 0.4789, + "step": 1800 + }, + { + "epoch": 0.0006321676637815754, + "grad_norm": 0.301438570022583, + "learning_rate": 8.013355592654425e-05, + "loss": 0.5937, + "step": 1801 + }, + { + "epoch": 0.0006325186730340916, + "grad_norm": 0.31911852955818176, + "learning_rate": 8.006677796327212e-05, + "loss": 0.5831, + "step": 1802 + }, + { + "epoch": 0.0006328696822866076, + "grad_norm": 0.2970680296421051, + "learning_rate": 8e-05, + "loss": 0.5223, + "step": 1803 + }, + { + "epoch": 0.0006332206915391238, + "grad_norm": 0.29310017824172974, + "learning_rate": 7.993322203672788e-05, + "loss": 0.5266, + "step": 1804 + }, + { + "epoch": 0.00063357170079164, + "grad_norm": 0.34701675176620483, + "learning_rate": 7.986644407345575e-05, + "loss": 0.4887, + "step": 1805 + }, + { + "epoch": 0.000633922710044156, + "grad_norm": 0.24955204129219055, + "learning_rate": 7.979966611018364e-05, + "loss": 0.437, + "step": 1806 + }, + { + "epoch": 0.0006342737192966722, + "grad_norm": 0.33152899146080017, + "learning_rate": 7.973288814691153e-05, + "loss": 0.5932, + "step": 1807 + }, + { + "epoch": 0.0006346247285491884, + "grad_norm": 0.2790103852748871, + "learning_rate": 7.96661101836394e-05, + "loss": 0.4585, + "step": 1808 + }, + { + "epoch": 0.0006349757378017045, + "grad_norm": 0.30877217650413513, + "learning_rate": 7.959933222036729e-05, + "loss": 0.5174, + "step": 1809 + }, + { + "epoch": 0.0006353267470542206, + "grad_norm": 0.38331231474876404, + "learning_rate": 7.953255425709516e-05, + "loss": 0.5696, + "step": 1810 + }, + { + "epoch": 0.0006356777563067368, + "grad_norm": 0.35821542143821716, + "learning_rate": 7.946577629382305e-05, + "loss": 0.4815, + "step": 1811 + }, + { + "epoch": 0.000636028765559253, + "grad_norm": 0.3109416365623474, + "learning_rate": 7.939899833055092e-05, + "loss": 0.5783, + "step": 1812 + }, + { + "epoch": 0.0006363797748117691, + "grad_norm": 0.3217208683490753, + "learning_rate": 7.933222036727879e-05, + "loss": 0.5606, + "step": 1813 + }, + { + "epoch": 0.0006367307840642852, + "grad_norm": 0.3818305432796478, + "learning_rate": 7.926544240400668e-05, + "loss": 0.5592, + "step": 1814 + }, + { + "epoch": 0.0006370817933168014, + "grad_norm": 0.29824909567832947, + "learning_rate": 7.919866444073457e-05, + "loss": 0.5157, + "step": 1815 + }, + { + "epoch": 0.0006374328025693175, + "grad_norm": 0.31353560090065, + "learning_rate": 7.913188647746244e-05, + "loss": 0.5991, + "step": 1816 + }, + { + "epoch": 0.0006377838118218337, + "grad_norm": 0.33129647374153137, + "learning_rate": 7.906510851419033e-05, + "loss": 0.54, + "step": 1817 + }, + { + "epoch": 0.0006381348210743498, + "grad_norm": 0.3199217915534973, + "learning_rate": 7.89983305509182e-05, + "loss": 0.4823, + "step": 1818 + }, + { + "epoch": 0.0006384858303268659, + "grad_norm": 0.2801882028579712, + "learning_rate": 7.893155258764609e-05, + "loss": 0.5379, + "step": 1819 + }, + { + "epoch": 0.0006388368395793821, + "grad_norm": 0.29676681756973267, + "learning_rate": 7.886477462437396e-05, + "loss": 0.5142, + "step": 1820 + }, + { + "epoch": 0.0006391878488318983, + "grad_norm": 0.3249494433403015, + "learning_rate": 7.879799666110183e-05, + "loss": 0.4743, + "step": 1821 + }, + { + "epoch": 0.0006395388580844144, + "grad_norm": 0.47364258766174316, + "learning_rate": 7.873121869782972e-05, + "loss": 0.5575, + "step": 1822 + }, + { + "epoch": 0.0006398898673369305, + "grad_norm": 0.310779869556427, + "learning_rate": 7.86644407345576e-05, + "loss": 0.5115, + "step": 1823 + }, + { + "epoch": 0.0006402408765894467, + "grad_norm": 0.26023536920547485, + "learning_rate": 7.859766277128548e-05, + "loss": 0.5084, + "step": 1824 + }, + { + "epoch": 0.0006405918858419629, + "grad_norm": 0.31088247895240784, + "learning_rate": 7.853088480801337e-05, + "loss": 0.513, + "step": 1825 + }, + { + "epoch": 0.0006409428950944789, + "grad_norm": 0.2561517357826233, + "learning_rate": 7.846410684474124e-05, + "loss": 0.4056, + "step": 1826 + }, + { + "epoch": 0.0006412939043469951, + "grad_norm": 0.28456807136535645, + "learning_rate": 7.839732888146912e-05, + "loss": 0.4895, + "step": 1827 + }, + { + "epoch": 0.0006416449135995113, + "grad_norm": 0.30845314264297485, + "learning_rate": 7.8330550918197e-05, + "loss": 0.5941, + "step": 1828 + }, + { + "epoch": 0.0006419959228520273, + "grad_norm": 0.30980512499809265, + "learning_rate": 7.826377295492487e-05, + "loss": 0.5307, + "step": 1829 + }, + { + "epoch": 0.0006423469321045435, + "grad_norm": 0.2923174500465393, + "learning_rate": 7.819699499165276e-05, + "loss": 0.4737, + "step": 1830 + }, + { + "epoch": 0.0006426979413570597, + "grad_norm": 0.3474715054035187, + "learning_rate": 7.813021702838063e-05, + "loss": 0.6606, + "step": 1831 + }, + { + "epoch": 0.0006430489506095759, + "grad_norm": 0.29576122760772705, + "learning_rate": 7.806343906510852e-05, + "loss": 0.4151, + "step": 1832 + }, + { + "epoch": 0.000643399959862092, + "grad_norm": 0.3127489686012268, + "learning_rate": 7.79966611018364e-05, + "loss": 0.5683, + "step": 1833 + }, + { + "epoch": 0.0006437509691146081, + "grad_norm": 0.32313060760498047, + "learning_rate": 7.792988313856428e-05, + "loss": 0.3911, + "step": 1834 + }, + { + "epoch": 0.0006441019783671243, + "grad_norm": 0.38172590732574463, + "learning_rate": 7.786310517529216e-05, + "loss": 0.4852, + "step": 1835 + }, + { + "epoch": 0.0006444529876196404, + "grad_norm": 0.38548141717910767, + "learning_rate": 7.779632721202004e-05, + "loss": 0.5238, + "step": 1836 + }, + { + "epoch": 0.0006448039968721565, + "grad_norm": 0.3326992392539978, + "learning_rate": 7.772954924874791e-05, + "loss": 0.5435, + "step": 1837 + }, + { + "epoch": 0.0006451550061246727, + "grad_norm": 0.2704392969608307, + "learning_rate": 7.76627712854758e-05, + "loss": 0.5049, + "step": 1838 + }, + { + "epoch": 0.0006455060153771888, + "grad_norm": 0.3688966929912567, + "learning_rate": 7.759599332220367e-05, + "loss": 0.5507, + "step": 1839 + }, + { + "epoch": 0.000645857024629705, + "grad_norm": 0.33513352274894714, + "learning_rate": 7.752921535893156e-05, + "loss": 0.59, + "step": 1840 + }, + { + "epoch": 0.0006462080338822211, + "grad_norm": 0.26873478293418884, + "learning_rate": 7.746243739565944e-05, + "loss": 0.4088, + "step": 1841 + }, + { + "epoch": 0.0006465590431347373, + "grad_norm": 0.41162189841270447, + "learning_rate": 7.739565943238732e-05, + "loss": 0.4159, + "step": 1842 + }, + { + "epoch": 0.0006469100523872534, + "grad_norm": 0.3542315661907196, + "learning_rate": 7.73288814691152e-05, + "loss": 0.6067, + "step": 1843 + }, + { + "epoch": 0.0006472610616397696, + "grad_norm": 0.39147111773490906, + "learning_rate": 7.726210350584308e-05, + "loss": 0.4139, + "step": 1844 + }, + { + "epoch": 0.0006476120708922857, + "grad_norm": 0.3200126588344574, + "learning_rate": 7.719532554257095e-05, + "loss": 0.4112, + "step": 1845 + }, + { + "epoch": 0.0006479630801448018, + "grad_norm": 0.34853747487068176, + "learning_rate": 7.712854757929884e-05, + "loss": 0.4983, + "step": 1846 + }, + { + "epoch": 0.000648314089397318, + "grad_norm": 0.2987789511680603, + "learning_rate": 7.706176961602671e-05, + "loss": 0.5186, + "step": 1847 + }, + { + "epoch": 0.0006486650986498342, + "grad_norm": 0.3692026436328888, + "learning_rate": 7.69949916527546e-05, + "loss": 0.4028, + "step": 1848 + }, + { + "epoch": 0.0006490161079023502, + "grad_norm": 0.26036712527275085, + "learning_rate": 7.692821368948247e-05, + "loss": 0.4971, + "step": 1849 + }, + { + "epoch": 0.0006493671171548664, + "grad_norm": 0.2928013801574707, + "learning_rate": 7.686143572621036e-05, + "loss": 0.549, + "step": 1850 + }, + { + "epoch": 0.0006497181264073826, + "grad_norm": 0.2794664204120636, + "learning_rate": 7.679465776293824e-05, + "loss": 0.4184, + "step": 1851 + }, + { + "epoch": 0.0006500691356598988, + "grad_norm": 0.282713919878006, + "learning_rate": 7.672787979966612e-05, + "loss": 0.4637, + "step": 1852 + }, + { + "epoch": 0.0006504201449124148, + "grad_norm": 0.3084028959274292, + "learning_rate": 7.666110183639399e-05, + "loss": 0.4423, + "step": 1853 + }, + { + "epoch": 0.000650771154164931, + "grad_norm": 0.35329973697662354, + "learning_rate": 7.659432387312188e-05, + "loss": 0.4868, + "step": 1854 + }, + { + "epoch": 0.0006511221634174472, + "grad_norm": 0.38975444436073303, + "learning_rate": 7.652754590984975e-05, + "loss": 0.3701, + "step": 1855 + }, + { + "epoch": 0.0006514731726699632, + "grad_norm": 0.2983016073703766, + "learning_rate": 7.646076794657764e-05, + "loss": 0.5407, + "step": 1856 + }, + { + "epoch": 0.0006518241819224794, + "grad_norm": 0.32849010825157166, + "learning_rate": 7.639398998330551e-05, + "loss": 0.548, + "step": 1857 + }, + { + "epoch": 0.0006521751911749956, + "grad_norm": 0.32322797179222107, + "learning_rate": 7.63272120200334e-05, + "loss": 0.4231, + "step": 1858 + }, + { + "epoch": 0.0006525262004275117, + "grad_norm": 0.2949173152446747, + "learning_rate": 7.626043405676128e-05, + "loss": 0.5777, + "step": 1859 + }, + { + "epoch": 0.0006528772096800278, + "grad_norm": 0.3120216727256775, + "learning_rate": 7.619365609348916e-05, + "loss": 0.4483, + "step": 1860 + }, + { + "epoch": 0.000653228218932544, + "grad_norm": 0.32363617420196533, + "learning_rate": 7.612687813021703e-05, + "loss": 0.5748, + "step": 1861 + }, + { + "epoch": 0.0006535792281850602, + "grad_norm": 0.3077629506587982, + "learning_rate": 7.606010016694492e-05, + "loss": 0.5135, + "step": 1862 + }, + { + "epoch": 0.0006539302374375763, + "grad_norm": 0.3201192319393158, + "learning_rate": 7.599332220367279e-05, + "loss": 0.6412, + "step": 1863 + }, + { + "epoch": 0.0006542812466900924, + "grad_norm": 0.3008538484573364, + "learning_rate": 7.592654424040068e-05, + "loss": 0.4858, + "step": 1864 + }, + { + "epoch": 0.0006546322559426086, + "grad_norm": 0.35019761323928833, + "learning_rate": 7.585976627712855e-05, + "loss": 0.4819, + "step": 1865 + }, + { + "epoch": 0.0006549832651951247, + "grad_norm": 0.39763036370277405, + "learning_rate": 7.579298831385642e-05, + "loss": 0.5775, + "step": 1866 + }, + { + "epoch": 0.0006553342744476409, + "grad_norm": 0.29005396366119385, + "learning_rate": 7.572621035058431e-05, + "loss": 0.4828, + "step": 1867 + }, + { + "epoch": 0.000655685283700157, + "grad_norm": 0.30613401532173157, + "learning_rate": 7.56594323873122e-05, + "loss": 0.4375, + "step": 1868 + }, + { + "epoch": 0.0006560362929526731, + "grad_norm": 0.3596465289592743, + "learning_rate": 7.559265442404007e-05, + "loss": 0.4468, + "step": 1869 + }, + { + "epoch": 0.0006563873022051893, + "grad_norm": 0.28737086057662964, + "learning_rate": 7.552587646076796e-05, + "loss": 0.5726, + "step": 1870 + }, + { + "epoch": 0.0006567383114577055, + "grad_norm": 0.38036370277404785, + "learning_rate": 7.545909849749583e-05, + "loss": 0.5747, + "step": 1871 + }, + { + "epoch": 0.0006570893207102216, + "grad_norm": 0.3192722499370575, + "learning_rate": 7.539232053422371e-05, + "loss": 0.5859, + "step": 1872 + }, + { + "epoch": 0.0006574403299627377, + "grad_norm": 0.2886595129966736, + "learning_rate": 7.532554257095159e-05, + "loss": 0.5099, + "step": 1873 + }, + { + "epoch": 0.0006577913392152539, + "grad_norm": 0.3017093241214752, + "learning_rate": 7.525876460767946e-05, + "loss": 0.4442, + "step": 1874 + }, + { + "epoch": 0.0006581423484677701, + "grad_norm": 0.3073802590370178, + "learning_rate": 7.519198664440735e-05, + "loss": 0.5022, + "step": 1875 + }, + { + "epoch": 0.0006584933577202861, + "grad_norm": 0.34113094210624695, + "learning_rate": 7.512520868113523e-05, + "loss": 0.5146, + "step": 1876 + }, + { + "epoch": 0.0006588443669728023, + "grad_norm": 0.32277509570121765, + "learning_rate": 7.505843071786311e-05, + "loss": 0.5743, + "step": 1877 + }, + { + "epoch": 0.0006591953762253185, + "grad_norm": 0.3168696463108063, + "learning_rate": 7.4991652754591e-05, + "loss": 0.417, + "step": 1878 + }, + { + "epoch": 0.0006595463854778346, + "grad_norm": 0.35164040327072144, + "learning_rate": 7.492487479131887e-05, + "loss": 0.5078, + "step": 1879 + }, + { + "epoch": 0.0006598973947303507, + "grad_norm": 0.3132971227169037, + "learning_rate": 7.485809682804675e-05, + "loss": 0.4293, + "step": 1880 + }, + { + "epoch": 0.0006602484039828669, + "grad_norm": 0.3158970773220062, + "learning_rate": 7.479131886477463e-05, + "loss": 0.5559, + "step": 1881 + }, + { + "epoch": 0.0006605994132353831, + "grad_norm": 0.3228873610496521, + "learning_rate": 7.47245409015025e-05, + "loss": 0.4935, + "step": 1882 + }, + { + "epoch": 0.0006609504224878992, + "grad_norm": 0.4734925925731659, + "learning_rate": 7.465776293823039e-05, + "loss": 0.3587, + "step": 1883 + }, + { + "epoch": 0.0006613014317404153, + "grad_norm": 0.33582058548927307, + "learning_rate": 7.459098497495826e-05, + "loss": 0.4987, + "step": 1884 + }, + { + "epoch": 0.0006616524409929315, + "grad_norm": 0.38209983706474304, + "learning_rate": 7.452420701168615e-05, + "loss": 0.4443, + "step": 1885 + }, + { + "epoch": 0.0006620034502454476, + "grad_norm": 0.3218359649181366, + "learning_rate": 7.445742904841403e-05, + "loss": 0.5087, + "step": 1886 + }, + { + "epoch": 0.0006623544594979637, + "grad_norm": 0.33005908131599426, + "learning_rate": 7.439065108514191e-05, + "loss": 0.5362, + "step": 1887 + }, + { + "epoch": 0.0006627054687504799, + "grad_norm": 0.4753172993659973, + "learning_rate": 7.43238731218698e-05, + "loss": 0.4474, + "step": 1888 + }, + { + "epoch": 0.000663056478002996, + "grad_norm": 0.3765251636505127, + "learning_rate": 7.425709515859767e-05, + "loss": 0.5993, + "step": 1889 + }, + { + "epoch": 0.0006634074872555122, + "grad_norm": 0.3113894462585449, + "learning_rate": 7.419031719532554e-05, + "loss": 0.4636, + "step": 1890 + }, + { + "epoch": 0.0006637584965080283, + "grad_norm": 0.30841702222824097, + "learning_rate": 7.412353923205343e-05, + "loss": 0.5326, + "step": 1891 + }, + { + "epoch": 0.0006641095057605445, + "grad_norm": 0.29381653666496277, + "learning_rate": 7.40567612687813e-05, + "loss": 0.325, + "step": 1892 + }, + { + "epoch": 0.0006644605150130606, + "grad_norm": 0.3482291102409363, + "learning_rate": 7.398998330550919e-05, + "loss": 0.4646, + "step": 1893 + }, + { + "epoch": 0.0006648115242655768, + "grad_norm": 0.2865064740180969, + "learning_rate": 7.392320534223707e-05, + "loss": 0.4789, + "step": 1894 + }, + { + "epoch": 0.0006651625335180929, + "grad_norm": 0.29580044746398926, + "learning_rate": 7.385642737896495e-05, + "loss": 0.5047, + "step": 1895 + }, + { + "epoch": 0.000665513542770609, + "grad_norm": 0.3370521068572998, + "learning_rate": 7.378964941569283e-05, + "loss": 0.5915, + "step": 1896 + }, + { + "epoch": 0.0006658645520231252, + "grad_norm": 0.2680570185184479, + "learning_rate": 7.37228714524207e-05, + "loss": 0.4602, + "step": 1897 + }, + { + "epoch": 0.0006662155612756414, + "grad_norm": 0.2855984568595886, + "learning_rate": 7.365609348914858e-05, + "loss": 0.5439, + "step": 1898 + }, + { + "epoch": 0.0006665665705281574, + "grad_norm": 0.28999075293540955, + "learning_rate": 7.358931552587647e-05, + "loss": 0.4828, + "step": 1899 + }, + { + "epoch": 0.0006669175797806736, + "grad_norm": 0.3230993151664734, + "learning_rate": 7.352253756260434e-05, + "loss": 0.5974, + "step": 1900 + }, + { + "epoch": 0.0006672685890331898, + "grad_norm": 0.28700417280197144, + "learning_rate": 7.345575959933221e-05, + "loss": 0.5179, + "step": 1901 + }, + { + "epoch": 0.000667619598285706, + "grad_norm": 0.2921486794948578, + "learning_rate": 7.33889816360601e-05, + "loss": 0.4727, + "step": 1902 + }, + { + "epoch": 0.000667970607538222, + "grad_norm": 0.3887636959552765, + "learning_rate": 7.332220367278799e-05, + "loss": 0.5334, + "step": 1903 + }, + { + "epoch": 0.0006683216167907382, + "grad_norm": 0.3640362322330475, + "learning_rate": 7.325542570951587e-05, + "loss": 0.5576, + "step": 1904 + }, + { + "epoch": 0.0006686726260432544, + "grad_norm": 0.2985169589519501, + "learning_rate": 7.318864774624375e-05, + "loss": 0.5544, + "step": 1905 + }, + { + "epoch": 0.0006690236352957705, + "grad_norm": 0.30294784903526306, + "learning_rate": 7.312186978297162e-05, + "loss": 0.5005, + "step": 1906 + }, + { + "epoch": 0.0006693746445482866, + "grad_norm": 0.2947355806827545, + "learning_rate": 7.30550918196995e-05, + "loss": 0.4879, + "step": 1907 + }, + { + "epoch": 0.0006697256538008028, + "grad_norm": 0.2764705419540405, + "learning_rate": 7.298831385642738e-05, + "loss": 0.4531, + "step": 1908 + }, + { + "epoch": 0.0006700766630533189, + "grad_norm": 0.4107155501842499, + "learning_rate": 7.292153589315525e-05, + "loss": 0.4532, + "step": 1909 + }, + { + "epoch": 0.000670427672305835, + "grad_norm": 0.28341203927993774, + "learning_rate": 7.285475792988314e-05, + "loss": 0.5424, + "step": 1910 + }, + { + "epoch": 0.0006707786815583512, + "grad_norm": 0.36663204431533813, + "learning_rate": 7.278797996661103e-05, + "loss": 0.599, + "step": 1911 + }, + { + "epoch": 0.0006711296908108674, + "grad_norm": 0.30708596110343933, + "learning_rate": 7.272120200333891e-05, + "loss": 0.5971, + "step": 1912 + }, + { + "epoch": 0.0006714807000633835, + "grad_norm": 0.3823882043361664, + "learning_rate": 7.265442404006679e-05, + "loss": 0.5367, + "step": 1913 + }, + { + "epoch": 0.0006718317093158996, + "grad_norm": 0.3780754804611206, + "learning_rate": 7.258764607679466e-05, + "loss": 0.5756, + "step": 1914 + }, + { + "epoch": 0.0006721827185684158, + "grad_norm": 0.31058263778686523, + "learning_rate": 7.252086811352255e-05, + "loss": 0.5966, + "step": 1915 + }, + { + "epoch": 0.0006725337278209319, + "grad_norm": 0.29191386699676514, + "learning_rate": 7.245409015025042e-05, + "loss": 0.6099, + "step": 1916 + }, + { + "epoch": 0.0006728847370734481, + "grad_norm": 0.3607024550437927, + "learning_rate": 7.238731218697829e-05, + "loss": 0.5779, + "step": 1917 + }, + { + "epoch": 0.0006732357463259642, + "grad_norm": 0.2735411524772644, + "learning_rate": 7.232053422370618e-05, + "loss": 0.5511, + "step": 1918 + }, + { + "epoch": 0.0006735867555784803, + "grad_norm": 0.37066903710365295, + "learning_rate": 7.225375626043405e-05, + "loss": 0.5984, + "step": 1919 + }, + { + "epoch": 0.0006739377648309965, + "grad_norm": 0.3535907566547394, + "learning_rate": 7.218697829716194e-05, + "loss": 0.5074, + "step": 1920 + }, + { + "epoch": 0.0006742887740835127, + "grad_norm": 0.2900503873825073, + "learning_rate": 7.212020033388982e-05, + "loss": 0.3989, + "step": 1921 + }, + { + "epoch": 0.0006746397833360288, + "grad_norm": 0.2970031201839447, + "learning_rate": 7.20534223706177e-05, + "loss": 0.5514, + "step": 1922 + }, + { + "epoch": 0.0006749907925885449, + "grad_norm": 0.30902254581451416, + "learning_rate": 7.198664440734558e-05, + "loss": 0.3982, + "step": 1923 + }, + { + "epoch": 0.0006753418018410611, + "grad_norm": 0.2622113823890686, + "learning_rate": 7.191986644407346e-05, + "loss": 0.4587, + "step": 1924 + }, + { + "epoch": 0.0006756928110935773, + "grad_norm": 0.30972495675086975, + "learning_rate": 7.185308848080133e-05, + "loss": 0.5435, + "step": 1925 + }, + { + "epoch": 0.0006760438203460933, + "grad_norm": 0.3070833384990692, + "learning_rate": 7.178631051752922e-05, + "loss": 0.5074, + "step": 1926 + }, + { + "epoch": 0.0006763948295986095, + "grad_norm": 0.3055395781993866, + "learning_rate": 7.171953255425709e-05, + "loss": 0.5999, + "step": 1927 + }, + { + "epoch": 0.0006767458388511257, + "grad_norm": 0.3127722144126892, + "learning_rate": 7.165275459098498e-05, + "loss": 0.5511, + "step": 1928 + }, + { + "epoch": 0.0006770968481036418, + "grad_norm": 0.3363809585571289, + "learning_rate": 7.158597662771286e-05, + "loss": 0.5415, + "step": 1929 + }, + { + "epoch": 0.0006774478573561579, + "grad_norm": 0.3258194625377655, + "learning_rate": 7.151919866444074e-05, + "loss": 0.5976, + "step": 1930 + }, + { + "epoch": 0.0006777988666086741, + "grad_norm": 0.3083065152168274, + "learning_rate": 7.145242070116862e-05, + "loss": 0.6067, + "step": 1931 + }, + { + "epoch": 0.0006781498758611903, + "grad_norm": 0.3474681079387665, + "learning_rate": 7.13856427378965e-05, + "loss": 0.5749, + "step": 1932 + }, + { + "epoch": 0.0006785008851137064, + "grad_norm": 0.3168641924858093, + "learning_rate": 7.131886477462437e-05, + "loss": 0.4242, + "step": 1933 + }, + { + "epoch": 0.0006788518943662225, + "grad_norm": 0.30177485942840576, + "learning_rate": 7.125208681135226e-05, + "loss": 0.4978, + "step": 1934 + }, + { + "epoch": 0.0006792029036187387, + "grad_norm": 0.3365834653377533, + "learning_rate": 7.118530884808013e-05, + "loss": 0.5994, + "step": 1935 + }, + { + "epoch": 0.0006795539128712548, + "grad_norm": 0.3282754123210907, + "learning_rate": 7.111853088480802e-05, + "loss": 0.615, + "step": 1936 + }, + { + "epoch": 0.000679904922123771, + "grad_norm": 0.24498236179351807, + "learning_rate": 7.105175292153589e-05, + "loss": 0.4254, + "step": 1937 + }, + { + "epoch": 0.0006802559313762871, + "grad_norm": 0.3450114130973816, + "learning_rate": 7.098497495826378e-05, + "loss": 0.5362, + "step": 1938 + }, + { + "epoch": 0.0006806069406288032, + "grad_norm": 0.28795021772384644, + "learning_rate": 7.091819699499166e-05, + "loss": 0.4984, + "step": 1939 + }, + { + "epoch": 0.0006809579498813194, + "grad_norm": 0.32352307438850403, + "learning_rate": 7.085141903171954e-05, + "loss": 0.4549, + "step": 1940 + }, + { + "epoch": 0.0006813089591338355, + "grad_norm": 0.34447386860847473, + "learning_rate": 7.078464106844741e-05, + "loss": 0.5349, + "step": 1941 + }, + { + "epoch": 0.0006816599683863517, + "grad_norm": 0.31918805837631226, + "learning_rate": 7.07178631051753e-05, + "loss": 0.5468, + "step": 1942 + }, + { + "epoch": 0.0006820109776388678, + "grad_norm": 0.3190132975578308, + "learning_rate": 7.065108514190317e-05, + "loss": 0.5348, + "step": 1943 + }, + { + "epoch": 0.000682361986891384, + "grad_norm": 0.32868409156799316, + "learning_rate": 7.058430717863106e-05, + "loss": 0.6209, + "step": 1944 + }, + { + "epoch": 0.0006827129961439001, + "grad_norm": 0.2713989317417145, + "learning_rate": 7.051752921535893e-05, + "loss": 0.4681, + "step": 1945 + }, + { + "epoch": 0.0006830640053964162, + "grad_norm": 0.35190147161483765, + "learning_rate": 7.045075125208682e-05, + "loss": 0.5415, + "step": 1946 + }, + { + "epoch": 0.0006834150146489324, + "grad_norm": 0.322889119386673, + "learning_rate": 7.03839732888147e-05, + "loss": 0.5586, + "step": 1947 + }, + { + "epoch": 0.0006837660239014486, + "grad_norm": 0.33939826488494873, + "learning_rate": 7.031719532554258e-05, + "loss": 0.5586, + "step": 1948 + }, + { + "epoch": 0.0006841170331539646, + "grad_norm": 0.3554326891899109, + "learning_rate": 7.025041736227045e-05, + "loss": 0.5386, + "step": 1949 + }, + { + "epoch": 0.0006844680424064808, + "grad_norm": 0.3021222949028015, + "learning_rate": 7.018363939899834e-05, + "loss": 0.5569, + "step": 1950 + }, + { + "epoch": 0.000684819051658997, + "grad_norm": 0.3286188244819641, + "learning_rate": 7.011686143572621e-05, + "loss": 0.5466, + "step": 1951 + }, + { + "epoch": 0.0006851700609115132, + "grad_norm": 0.302117258310318, + "learning_rate": 7.00500834724541e-05, + "loss": 0.4038, + "step": 1952 + }, + { + "epoch": 0.0006855210701640292, + "grad_norm": 0.3204907178878784, + "learning_rate": 6.998330550918197e-05, + "loss": 0.4429, + "step": 1953 + }, + { + "epoch": 0.0006858720794165454, + "grad_norm": 0.2782181203365326, + "learning_rate": 6.991652754590986e-05, + "loss": 0.4102, + "step": 1954 + }, + { + "epoch": 0.0006862230886690616, + "grad_norm": 0.31240731477737427, + "learning_rate": 6.984974958263774e-05, + "loss": 0.5353, + "step": 1955 + }, + { + "epoch": 0.0006865740979215777, + "grad_norm": 0.32677972316741943, + "learning_rate": 6.978297161936562e-05, + "loss": 0.4403, + "step": 1956 + }, + { + "epoch": 0.0006869251071740938, + "grad_norm": 0.33199426531791687, + "learning_rate": 6.971619365609349e-05, + "loss": 0.4433, + "step": 1957 + }, + { + "epoch": 0.00068727611642661, + "grad_norm": 0.2825728952884674, + "learning_rate": 6.964941569282138e-05, + "loss": 0.5624, + "step": 1958 + }, + { + "epoch": 0.0006876271256791261, + "grad_norm": 0.30743977427482605, + "learning_rate": 6.958263772954925e-05, + "loss": 0.565, + "step": 1959 + }, + { + "epoch": 0.0006879781349316423, + "grad_norm": 0.32357290387153625, + "learning_rate": 6.951585976627714e-05, + "loss": 0.596, + "step": 1960 + }, + { + "epoch": 0.0006883291441841584, + "grad_norm": 0.31747472286224365, + "learning_rate": 6.944908180300501e-05, + "loss": 0.5811, + "step": 1961 + }, + { + "epoch": 0.0006886801534366745, + "grad_norm": 0.3278048038482666, + "learning_rate": 6.938230383973288e-05, + "loss": 0.4468, + "step": 1962 + }, + { + "epoch": 0.0006890311626891907, + "grad_norm": 0.3308374285697937, + "learning_rate": 6.931552587646077e-05, + "loss": 0.6508, + "step": 1963 + }, + { + "epoch": 0.0006893821719417069, + "grad_norm": 0.3360099792480469, + "learning_rate": 6.924874791318865e-05, + "loss": 0.534, + "step": 1964 + }, + { + "epoch": 0.000689733181194223, + "grad_norm": 0.3039510250091553, + "learning_rate": 6.918196994991654e-05, + "loss": 0.5789, + "step": 1965 + }, + { + "epoch": 0.0006900841904467391, + "grad_norm": 0.3015453517436981, + "learning_rate": 6.911519198664441e-05, + "loss": 0.3639, + "step": 1966 + }, + { + "epoch": 0.0006904351996992553, + "grad_norm": 0.3157881498336792, + "learning_rate": 6.904841402337229e-05, + "loss": 0.5002, + "step": 1967 + }, + { + "epoch": 0.0006907862089517714, + "grad_norm": 0.28026652336120605, + "learning_rate": 6.898163606010017e-05, + "loss": 0.5289, + "step": 1968 + }, + { + "epoch": 0.0006911372182042875, + "grad_norm": 0.3170677125453949, + "learning_rate": 6.891485809682805e-05, + "loss": 0.6144, + "step": 1969 + }, + { + "epoch": 0.0006914882274568037, + "grad_norm": 0.3244359791278839, + "learning_rate": 6.884808013355592e-05, + "loss": 0.6176, + "step": 1970 + }, + { + "epoch": 0.0006918392367093199, + "grad_norm": 0.3142417371273041, + "learning_rate": 6.878130217028381e-05, + "loss": 0.6137, + "step": 1971 + }, + { + "epoch": 0.0006921902459618359, + "grad_norm": 0.3678075969219208, + "learning_rate": 6.87145242070117e-05, + "loss": 0.5228, + "step": 1972 + }, + { + "epoch": 0.0006925412552143521, + "grad_norm": 0.35631263256073, + "learning_rate": 6.864774624373958e-05, + "loss": 0.4831, + "step": 1973 + }, + { + "epoch": 0.0006928922644668683, + "grad_norm": 0.30589306354522705, + "learning_rate": 6.858096828046745e-05, + "loss": 0.47, + "step": 1974 + }, + { + "epoch": 0.0006932432737193845, + "grad_norm": 0.3037767708301544, + "learning_rate": 6.851419031719533e-05, + "loss": 0.5334, + "step": 1975 + }, + { + "epoch": 0.0006935942829719005, + "grad_norm": 0.3331162631511688, + "learning_rate": 6.844741235392321e-05, + "loss": 0.6051, + "step": 1976 + }, + { + "epoch": 0.0006939452922244167, + "grad_norm": 0.3342154622077942, + "learning_rate": 6.838063439065109e-05, + "loss": 0.5466, + "step": 1977 + }, + { + "epoch": 0.0006942963014769329, + "grad_norm": 0.3748263418674469, + "learning_rate": 6.831385642737896e-05, + "loss": 0.5265, + "step": 1978 + }, + { + "epoch": 0.000694647310729449, + "grad_norm": 0.33476313948631287, + "learning_rate": 6.824707846410685e-05, + "loss": 0.5298, + "step": 1979 + }, + { + "epoch": 0.0006949983199819651, + "grad_norm": 0.37101680040359497, + "learning_rate": 6.818030050083472e-05, + "loss": 0.5745, + "step": 1980 + }, + { + "epoch": 0.0006953493292344813, + "grad_norm": 0.3126341998577118, + "learning_rate": 6.811352253756261e-05, + "loss": 0.4874, + "step": 1981 + }, + { + "epoch": 0.0006957003384869974, + "grad_norm": 0.305896133184433, + "learning_rate": 6.80467445742905e-05, + "loss": 0.5187, + "step": 1982 + }, + { + "epoch": 0.0006960513477395136, + "grad_norm": 0.3486585319042206, + "learning_rate": 6.797996661101837e-05, + "loss": 0.5567, + "step": 1983 + }, + { + "epoch": 0.0006964023569920297, + "grad_norm": 0.33587202429771423, + "learning_rate": 6.791318864774625e-05, + "loss": 0.505, + "step": 1984 + }, + { + "epoch": 0.0006967533662445459, + "grad_norm": 0.32981690764427185, + "learning_rate": 6.784641068447413e-05, + "loss": 0.4372, + "step": 1985 + }, + { + "epoch": 0.000697104375497062, + "grad_norm": 0.30636945366859436, + "learning_rate": 6.7779632721202e-05, + "loss": 0.4731, + "step": 1986 + }, + { + "epoch": 0.0006974553847495782, + "grad_norm": 0.3573989272117615, + "learning_rate": 6.771285475792989e-05, + "loss": 0.6193, + "step": 1987 + }, + { + "epoch": 0.0006978063940020943, + "grad_norm": 0.3697716295719147, + "learning_rate": 6.764607679465776e-05, + "loss": 0.4243, + "step": 1988 + }, + { + "epoch": 0.0006981574032546104, + "grad_norm": 0.3072642385959625, + "learning_rate": 6.757929883138565e-05, + "loss": 0.5506, + "step": 1989 + }, + { + "epoch": 0.0006985084125071266, + "grad_norm": 0.3706247806549072, + "learning_rate": 6.751252086811353e-05, + "loss": 0.4897, + "step": 1990 + }, + { + "epoch": 0.0006988594217596428, + "grad_norm": 0.3179176449775696, + "learning_rate": 6.74457429048414e-05, + "loss": 0.582, + "step": 1991 + }, + { + "epoch": 0.0006992104310121588, + "grad_norm": 0.3597802519798279, + "learning_rate": 6.737896494156929e-05, + "loss": 0.5297, + "step": 1992 + }, + { + "epoch": 0.000699561440264675, + "grad_norm": 0.3542323410511017, + "learning_rate": 6.731218697829717e-05, + "loss": 0.5995, + "step": 1993 + }, + { + "epoch": 0.0006999124495171912, + "grad_norm": 0.3902435302734375, + "learning_rate": 6.724540901502504e-05, + "loss": 0.55, + "step": 1994 + }, + { + "epoch": 0.0007002634587697074, + "grad_norm": 0.433971107006073, + "learning_rate": 6.717863105175293e-05, + "loss": 0.5859, + "step": 1995 + }, + { + "epoch": 0.0007006144680222234, + "grad_norm": 0.30398884415626526, + "learning_rate": 6.71118530884808e-05, + "loss": 0.5749, + "step": 1996 + }, + { + "epoch": 0.0007009654772747396, + "grad_norm": 0.2854095995426178, + "learning_rate": 6.704507512520869e-05, + "loss": 0.5932, + "step": 1997 + }, + { + "epoch": 0.0007013164865272558, + "grad_norm": 0.3235953450202942, + "learning_rate": 6.697829716193656e-05, + "loss": 0.5921, + "step": 1998 + }, + { + "epoch": 0.0007016674957797718, + "grad_norm": 0.364388108253479, + "learning_rate": 6.691151919866445e-05, + "loss": 0.5932, + "step": 1999 + }, + { + "epoch": 0.000702018505032288, + "grad_norm": 0.2984377145767212, + "learning_rate": 6.684474123539233e-05, + "loss": 0.5099, + "step": 2000 + }, + { + "epoch": 0.0007023695142848042, + "grad_norm": 0.6035982370376587, + "learning_rate": 6.67779632721202e-05, + "loss": 0.5385, + "step": 2001 + }, + { + "epoch": 0.0007027205235373203, + "grad_norm": 0.3442158102989197, + "learning_rate": 6.671118530884808e-05, + "loss": 0.6262, + "step": 2002 + }, + { + "epoch": 0.0007030715327898364, + "grad_norm": 0.32627010345458984, + "learning_rate": 6.664440734557597e-05, + "loss": 0.5052, + "step": 2003 + }, + { + "epoch": 0.0007034225420423526, + "grad_norm": 0.2829074561595917, + "learning_rate": 6.657762938230384e-05, + "loss": 0.5151, + "step": 2004 + }, + { + "epoch": 0.0007037735512948688, + "grad_norm": 0.29303300380706787, + "learning_rate": 6.651085141903173e-05, + "loss": 0.4857, + "step": 2005 + }, + { + "epoch": 0.0007041245605473849, + "grad_norm": 0.2904541492462158, + "learning_rate": 6.64440734557596e-05, + "loss": 0.5264, + "step": 2006 + }, + { + "epoch": 0.000704475569799901, + "grad_norm": 0.36056920886039734, + "learning_rate": 6.637729549248749e-05, + "loss": 0.5662, + "step": 2007 + }, + { + "epoch": 0.0007048265790524172, + "grad_norm": 0.4564721882343292, + "learning_rate": 6.631051752921537e-05, + "loss": 0.523, + "step": 2008 + }, + { + "epoch": 0.0007051775883049333, + "grad_norm": 0.31811806559562683, + "learning_rate": 6.624373956594325e-05, + "loss": 0.5568, + "step": 2009 + }, + { + "epoch": 0.0007055285975574495, + "grad_norm": 0.30878302454948425, + "learning_rate": 6.617696160267112e-05, + "loss": 0.5579, + "step": 2010 + }, + { + "epoch": 0.0007058796068099656, + "grad_norm": 0.28648054599761963, + "learning_rate": 6.6110183639399e-05, + "loss": 0.4876, + "step": 2011 + }, + { + "epoch": 0.0007062306160624817, + "grad_norm": 0.30796393752098083, + "learning_rate": 6.604340567612688e-05, + "loss": 0.5471, + "step": 2012 + }, + { + "epoch": 0.0007065816253149979, + "grad_norm": 0.31019923090934753, + "learning_rate": 6.597662771285476e-05, + "loss": 0.5254, + "step": 2013 + }, + { + "epoch": 0.0007069326345675141, + "grad_norm": 0.2949763834476471, + "learning_rate": 6.590984974958264e-05, + "loss": 0.517, + "step": 2014 + }, + { + "epoch": 0.0007072836438200302, + "grad_norm": 0.28508061170578003, + "learning_rate": 6.584307178631051e-05, + "loss": 0.4345, + "step": 2015 + }, + { + "epoch": 0.0007076346530725463, + "grad_norm": 0.2776021957397461, + "learning_rate": 6.57762938230384e-05, + "loss": 0.4588, + "step": 2016 + }, + { + "epoch": 0.0007079856623250625, + "grad_norm": 0.31046923995018005, + "learning_rate": 6.570951585976628e-05, + "loss": 0.544, + "step": 2017 + }, + { + "epoch": 0.0007083366715775787, + "grad_norm": 0.25672435760498047, + "learning_rate": 6.564273789649416e-05, + "loss": 0.4267, + "step": 2018 + }, + { + "epoch": 0.0007086876808300947, + "grad_norm": 0.3223148286342621, + "learning_rate": 6.557595993322204e-05, + "loss": 0.4973, + "step": 2019 + }, + { + "epoch": 0.0007090386900826109, + "grad_norm": 0.3279174864292145, + "learning_rate": 6.550918196994992e-05, + "loss": 0.581, + "step": 2020 + }, + { + "epoch": 0.0007093896993351271, + "grad_norm": 0.29168081283569336, + "learning_rate": 6.54424040066778e-05, + "loss": 0.4743, + "step": 2021 + }, + { + "epoch": 0.0007097407085876431, + "grad_norm": 0.3340432345867157, + "learning_rate": 6.537562604340568e-05, + "loss": 0.5041, + "step": 2022 + }, + { + "epoch": 0.0007100917178401593, + "grad_norm": 0.32505378127098083, + "learning_rate": 6.530884808013355e-05, + "loss": 0.4696, + "step": 2023 + }, + { + "epoch": 0.0007104427270926755, + "grad_norm": 0.3208444118499756, + "learning_rate": 6.524207011686144e-05, + "loss": 0.4521, + "step": 2024 + }, + { + "epoch": 0.0007107937363451917, + "grad_norm": 0.3270561397075653, + "learning_rate": 6.517529215358932e-05, + "loss": 0.5442, + "step": 2025 + }, + { + "epoch": 0.0007111447455977077, + "grad_norm": 0.29020506143569946, + "learning_rate": 6.51085141903172e-05, + "loss": 0.5404, + "step": 2026 + }, + { + "epoch": 0.0007114957548502239, + "grad_norm": 0.364835262298584, + "learning_rate": 6.504173622704508e-05, + "loss": 0.499, + "step": 2027 + }, + { + "epoch": 0.0007118467641027401, + "grad_norm": 0.3265811502933502, + "learning_rate": 6.497495826377296e-05, + "loss": 0.4242, + "step": 2028 + }, + { + "epoch": 0.0007121977733552562, + "grad_norm": 0.2921433448791504, + "learning_rate": 6.490818030050084e-05, + "loss": 0.3823, + "step": 2029 + }, + { + "epoch": 0.0007125487826077723, + "grad_norm": 0.35920029878616333, + "learning_rate": 6.484140233722872e-05, + "loss": 0.4331, + "step": 2030 + }, + { + "epoch": 0.0007128997918602885, + "grad_norm": 0.3468065559864044, + "learning_rate": 6.477462437395659e-05, + "loss": 0.6026, + "step": 2031 + }, + { + "epoch": 0.0007132508011128046, + "grad_norm": 0.3000637888908386, + "learning_rate": 6.470784641068448e-05, + "loss": 0.3499, + "step": 2032 + }, + { + "epoch": 0.0007136018103653208, + "grad_norm": 0.34014737606048584, + "learning_rate": 6.464106844741235e-05, + "loss": 0.513, + "step": 2033 + }, + { + "epoch": 0.0007139528196178369, + "grad_norm": 0.32227322459220886, + "learning_rate": 6.457429048414024e-05, + "loss": 0.6163, + "step": 2034 + }, + { + "epoch": 0.0007143038288703531, + "grad_norm": 0.2845328450202942, + "learning_rate": 6.450751252086812e-05, + "loss": 0.4621, + "step": 2035 + }, + { + "epoch": 0.0007146548381228692, + "grad_norm": 0.3255940079689026, + "learning_rate": 6.4440734557596e-05, + "loss": 0.5099, + "step": 2036 + }, + { + "epoch": 0.0007150058473753854, + "grad_norm": 0.34483012557029724, + "learning_rate": 6.437395659432388e-05, + "loss": 0.6406, + "step": 2037 + }, + { + "epoch": 0.0007153568566279015, + "grad_norm": 0.32495322823524475, + "learning_rate": 6.430717863105176e-05, + "loss": 0.5417, + "step": 2038 + }, + { + "epoch": 0.0007157078658804176, + "grad_norm": 0.35258588194847107, + "learning_rate": 6.424040066777963e-05, + "loss": 0.5135, + "step": 2039 + }, + { + "epoch": 0.0007160588751329338, + "grad_norm": 0.40582478046417236, + "learning_rate": 6.417362270450752e-05, + "loss": 0.3909, + "step": 2040 + }, + { + "epoch": 0.00071640988438545, + "grad_norm": 0.2993859052658081, + "learning_rate": 6.410684474123539e-05, + "loss": 0.5538, + "step": 2041 + }, + { + "epoch": 0.000716760893637966, + "grad_norm": 0.2939004898071289, + "learning_rate": 6.404006677796328e-05, + "loss": 0.5033, + "step": 2042 + }, + { + "epoch": 0.0007171119028904822, + "grad_norm": 0.3596908152103424, + "learning_rate": 6.397328881469116e-05, + "loss": 0.6057, + "step": 2043 + }, + { + "epoch": 0.0007174629121429984, + "grad_norm": 0.3577982187271118, + "learning_rate": 6.390651085141904e-05, + "loss": 0.4969, + "step": 2044 + }, + { + "epoch": 0.0007178139213955146, + "grad_norm": 0.3823784589767456, + "learning_rate": 6.383973288814692e-05, + "loss": 0.4979, + "step": 2045 + }, + { + "epoch": 0.0007181649306480306, + "grad_norm": 0.31179672479629517, + "learning_rate": 6.37729549248748e-05, + "loss": 0.6643, + "step": 2046 + }, + { + "epoch": 0.0007185159399005468, + "grad_norm": 0.3055694103240967, + "learning_rate": 6.370617696160267e-05, + "loss": 0.5222, + "step": 2047 + }, + { + "epoch": 0.000718866949153063, + "grad_norm": 0.3177119493484497, + "learning_rate": 6.363939899833056e-05, + "loss": 0.4974, + "step": 2048 + }, + { + "epoch": 0.000719217958405579, + "grad_norm": 0.30025020241737366, + "learning_rate": 6.357262103505843e-05, + "loss": 0.4653, + "step": 2049 + }, + { + "epoch": 0.0007195689676580952, + "grad_norm": 0.3193509876728058, + "learning_rate": 6.35058430717863e-05, + "loss": 0.4433, + "step": 2050 + }, + { + "epoch": 0.0007199199769106114, + "grad_norm": 0.3532063961029053, + "learning_rate": 6.343906510851419e-05, + "loss": 0.5048, + "step": 2051 + }, + { + "epoch": 0.0007202709861631275, + "grad_norm": 0.3306260406970978, + "learning_rate": 6.337228714524208e-05, + "loss": 0.4217, + "step": 2052 + }, + { + "epoch": 0.0007206219954156436, + "grad_norm": 0.35060104727745056, + "learning_rate": 6.330550918196996e-05, + "loss": 0.6056, + "step": 2053 + }, + { + "epoch": 0.0007209730046681598, + "grad_norm": 0.3271511495113373, + "learning_rate": 6.323873121869784e-05, + "loss": 0.6413, + "step": 2054 + }, + { + "epoch": 0.000721324013920676, + "grad_norm": 0.3258192539215088, + "learning_rate": 6.317195325542571e-05, + "loss": 0.5181, + "step": 2055 + }, + { + "epoch": 0.0007216750231731921, + "grad_norm": 0.29275408387184143, + "learning_rate": 6.31051752921536e-05, + "loss": 0.427, + "step": 2056 + }, + { + "epoch": 0.0007220260324257082, + "grad_norm": 0.3610098659992218, + "learning_rate": 6.303839732888147e-05, + "loss": 0.4876, + "step": 2057 + }, + { + "epoch": 0.0007223770416782244, + "grad_norm": 0.2921926975250244, + "learning_rate": 6.297161936560934e-05, + "loss": 0.4277, + "step": 2058 + }, + { + "epoch": 0.0007227280509307405, + "grad_norm": 0.28178003430366516, + "learning_rate": 6.290484140233723e-05, + "loss": 0.4427, + "step": 2059 + }, + { + "epoch": 0.0007230790601832567, + "grad_norm": 0.2888009548187256, + "learning_rate": 6.283806343906511e-05, + "loss": 0.4472, + "step": 2060 + }, + { + "epoch": 0.0007234300694357728, + "grad_norm": 0.2815188467502594, + "learning_rate": 6.2771285475793e-05, + "loss": 0.4961, + "step": 2061 + }, + { + "epoch": 0.0007237810786882889, + "grad_norm": 0.3248046636581421, + "learning_rate": 6.270450751252087e-05, + "loss": 0.5689, + "step": 2062 + }, + { + "epoch": 0.0007241320879408051, + "grad_norm": 0.33134186267852783, + "learning_rate": 6.263772954924875e-05, + "loss": 0.5556, + "step": 2063 + }, + { + "epoch": 0.0007244830971933213, + "grad_norm": 0.3108437955379486, + "learning_rate": 6.257095158597663e-05, + "loss": 0.4706, + "step": 2064 + }, + { + "epoch": 0.0007248341064458374, + "grad_norm": 0.29280775785446167, + "learning_rate": 6.250417362270451e-05, + "loss": 0.5024, + "step": 2065 + }, + { + "epoch": 0.0007251851156983535, + "grad_norm": 0.3041885495185852, + "learning_rate": 6.243739565943238e-05, + "loss": 0.5804, + "step": 2066 + }, + { + "epoch": 0.0007255361249508697, + "grad_norm": 0.46564334630966187, + "learning_rate": 6.237061769616027e-05, + "loss": 0.557, + "step": 2067 + }, + { + "epoch": 0.0007258871342033859, + "grad_norm": 0.3866247832775116, + "learning_rate": 6.230383973288815e-05, + "loss": 0.6007, + "step": 2068 + }, + { + "epoch": 0.0007262381434559019, + "grad_norm": 0.3250521421432495, + "learning_rate": 6.223706176961604e-05, + "loss": 0.5898, + "step": 2069 + }, + { + "epoch": 0.0007265891527084181, + "grad_norm": 0.26282358169555664, + "learning_rate": 6.217028380634391e-05, + "loss": 0.4367, + "step": 2070 + }, + { + "epoch": 0.0007269401619609343, + "grad_norm": 0.4470483958721161, + "learning_rate": 6.210350584307179e-05, + "loss": 0.5953, + "step": 2071 + }, + { + "epoch": 0.0007272911712134503, + "grad_norm": 0.3086302578449249, + "learning_rate": 6.203672787979967e-05, + "loss": 0.4693, + "step": 2072 + }, + { + "epoch": 0.0007276421804659665, + "grad_norm": 0.26347777247428894, + "learning_rate": 6.196994991652755e-05, + "loss": 0.3897, + "step": 2073 + }, + { + "epoch": 0.0007279931897184827, + "grad_norm": 0.2902274429798126, + "learning_rate": 6.190317195325542e-05, + "loss": 0.491, + "step": 2074 + }, + { + "epoch": 0.0007283441989709989, + "grad_norm": 0.2900046706199646, + "learning_rate": 6.183639398998331e-05, + "loss": 0.43, + "step": 2075 + }, + { + "epoch": 0.0007286952082235149, + "grad_norm": 0.3595135509967804, + "learning_rate": 6.176961602671118e-05, + "loss": 0.4684, + "step": 2076 + }, + { + "epoch": 0.0007290462174760311, + "grad_norm": 0.2925381064414978, + "learning_rate": 6.170283806343907e-05, + "loss": 0.3986, + "step": 2077 + }, + { + "epoch": 0.0007293972267285473, + "grad_norm": 0.28008222579956055, + "learning_rate": 6.163606010016695e-05, + "loss": 0.4379, + "step": 2078 + }, + { + "epoch": 0.0007297482359810634, + "grad_norm": 0.2913059890270233, + "learning_rate": 6.156928213689483e-05, + "loss": 0.4155, + "step": 2079 + }, + { + "epoch": 0.0007300992452335795, + "grad_norm": 0.3182995021343231, + "learning_rate": 6.150250417362271e-05, + "loss": 0.445, + "step": 2080 + }, + { + "epoch": 0.0007304502544860957, + "grad_norm": 0.26757434010505676, + "learning_rate": 6.143572621035059e-05, + "loss": 0.4718, + "step": 2081 + }, + { + "epoch": 0.0007308012637386118, + "grad_norm": 0.29628437757492065, + "learning_rate": 6.136894824707846e-05, + "loss": 0.5108, + "step": 2082 + }, + { + "epoch": 0.000731152272991128, + "grad_norm": 0.3662126958370209, + "learning_rate": 6.130217028380635e-05, + "loss": 0.5035, + "step": 2083 + }, + { + "epoch": 0.0007315032822436441, + "grad_norm": 0.2901041805744171, + "learning_rate": 6.123539232053422e-05, + "loss": 0.5561, + "step": 2084 + }, + { + "epoch": 0.0007318542914961603, + "grad_norm": 0.3100784420967102, + "learning_rate": 6.11686143572621e-05, + "loss": 0.5088, + "step": 2085 + }, + { + "epoch": 0.0007322053007486764, + "grad_norm": 0.334096223115921, + "learning_rate": 6.110183639398999e-05, + "loss": 0.5374, + "step": 2086 + }, + { + "epoch": 0.0007325563100011926, + "grad_norm": 0.3160945773124695, + "learning_rate": 6.103505843071786e-05, + "loss": 0.5216, + "step": 2087 + }, + { + "epoch": 0.0007329073192537087, + "grad_norm": 0.2668875753879547, + "learning_rate": 6.0968280467445746e-05, + "loss": 0.5266, + "step": 2088 + }, + { + "epoch": 0.0007332583285062248, + "grad_norm": 0.3053551912307739, + "learning_rate": 6.0901502504173626e-05, + "loss": 0.4526, + "step": 2089 + }, + { + "epoch": 0.000733609337758741, + "grad_norm": 0.37118956446647644, + "learning_rate": 6.083472454090151e-05, + "loss": 0.3821, + "step": 2090 + }, + { + "epoch": 0.0007339603470112572, + "grad_norm": 0.3345617651939392, + "learning_rate": 6.0767946577629386e-05, + "loss": 0.5467, + "step": 2091 + }, + { + "epoch": 0.0007343113562637732, + "grad_norm": 0.3057086169719696, + "learning_rate": 6.070116861435726e-05, + "loss": 0.5843, + "step": 2092 + }, + { + "epoch": 0.0007346623655162894, + "grad_norm": 0.322822242975235, + "learning_rate": 6.0634390651085146e-05, + "loss": 0.4741, + "step": 2093 + }, + { + "epoch": 0.0007350133747688056, + "grad_norm": 0.3189939558506012, + "learning_rate": 6.0567612687813026e-05, + "loss": 0.6046, + "step": 2094 + }, + { + "epoch": 0.0007353643840213218, + "grad_norm": 0.2926713526248932, + "learning_rate": 6.05008347245409e-05, + "loss": 0.5153, + "step": 2095 + }, + { + "epoch": 0.0007357153932738378, + "grad_norm": 0.3285471200942993, + "learning_rate": 6.0434056761268785e-05, + "loss": 0.533, + "step": 2096 + }, + { + "epoch": 0.000736066402526354, + "grad_norm": 0.29923808574676514, + "learning_rate": 6.0367278797996665e-05, + "loss": 0.3605, + "step": 2097 + }, + { + "epoch": 0.0007364174117788702, + "grad_norm": 0.3213334381580353, + "learning_rate": 6.030050083472455e-05, + "loss": 0.5723, + "step": 2098 + }, + { + "epoch": 0.0007367684210313862, + "grad_norm": 0.30922314524650574, + "learning_rate": 6.0233722871452425e-05, + "loss": 0.4662, + "step": 2099 + }, + { + "epoch": 0.0007371194302839024, + "grad_norm": 0.2766854166984558, + "learning_rate": 6.01669449081803e-05, + "loss": 0.5024, + "step": 2100 + }, + { + "epoch": 0.0007374704395364186, + "grad_norm": 0.31957265734672546, + "learning_rate": 6.0100166944908185e-05, + "loss": 0.4026, + "step": 2101 + }, + { + "epoch": 0.0007378214487889347, + "grad_norm": 0.3103827238082886, + "learning_rate": 6.0033388981636065e-05, + "loss": 0.4981, + "step": 2102 + }, + { + "epoch": 0.0007381724580414508, + "grad_norm": 0.3047814667224884, + "learning_rate": 5.996661101836394e-05, + "loss": 0.5679, + "step": 2103 + }, + { + "epoch": 0.000738523467293967, + "grad_norm": 0.3266103267669678, + "learning_rate": 5.9899833055091825e-05, + "loss": 0.5143, + "step": 2104 + }, + { + "epoch": 0.0007388744765464832, + "grad_norm": 0.2987171709537506, + "learning_rate": 5.98330550918197e-05, + "loss": 0.3717, + "step": 2105 + }, + { + "epoch": 0.0007392254857989993, + "grad_norm": 0.2716870605945587, + "learning_rate": 5.9766277128547585e-05, + "loss": 0.4654, + "step": 2106 + }, + { + "epoch": 0.0007395764950515154, + "grad_norm": 0.2746477425098419, + "learning_rate": 5.9699499165275465e-05, + "loss": 0.5674, + "step": 2107 + }, + { + "epoch": 0.0007399275043040316, + "grad_norm": 0.30133751034736633, + "learning_rate": 5.963272120200334e-05, + "loss": 0.4119, + "step": 2108 + }, + { + "epoch": 0.0007402785135565477, + "grad_norm": 0.2945064902305603, + "learning_rate": 5.9565943238731224e-05, + "loss": 0.4342, + "step": 2109 + }, + { + "epoch": 0.0007406295228090639, + "grad_norm": 0.32607197761535645, + "learning_rate": 5.9499165275459104e-05, + "loss": 0.5316, + "step": 2110 + }, + { + "epoch": 0.00074098053206158, + "grad_norm": 0.29035452008247375, + "learning_rate": 5.943238731218698e-05, + "loss": 0.5378, + "step": 2111 + }, + { + "epoch": 0.0007413315413140961, + "grad_norm": 0.311575710773468, + "learning_rate": 5.9365609348914864e-05, + "loss": 0.4837, + "step": 2112 + }, + { + "epoch": 0.0007416825505666123, + "grad_norm": 0.29044589400291443, + "learning_rate": 5.929883138564274e-05, + "loss": 0.4939, + "step": 2113 + }, + { + "epoch": 0.0007420335598191285, + "grad_norm": 0.36396634578704834, + "learning_rate": 5.9232053422370624e-05, + "loss": 0.4681, + "step": 2114 + }, + { + "epoch": 0.0007423845690716446, + "grad_norm": 0.400389701128006, + "learning_rate": 5.9165275459098504e-05, + "loss": 0.6224, + "step": 2115 + }, + { + "epoch": 0.0007427355783241607, + "grad_norm": 0.32242146134376526, + "learning_rate": 5.909849749582638e-05, + "loss": 0.4635, + "step": 2116 + }, + { + "epoch": 0.0007430865875766769, + "grad_norm": 0.3917767107486725, + "learning_rate": 5.9031719532554264e-05, + "loss": 0.5719, + "step": 2117 + }, + { + "epoch": 0.0007434375968291931, + "grad_norm": 0.3087753653526306, + "learning_rate": 5.896494156928214e-05, + "loss": 0.4465, + "step": 2118 + }, + { + "epoch": 0.0007437886060817091, + "grad_norm": 0.3342611789703369, + "learning_rate": 5.889816360601002e-05, + "loss": 0.6269, + "step": 2119 + }, + { + "epoch": 0.0007441396153342253, + "grad_norm": 0.3343375623226166, + "learning_rate": 5.8831385642737904e-05, + "loss": 0.4707, + "step": 2120 + }, + { + "epoch": 0.0007444906245867415, + "grad_norm": 0.2936212718486786, + "learning_rate": 5.876460767946578e-05, + "loss": 0.5382, + "step": 2121 + }, + { + "epoch": 0.0007448416338392575, + "grad_norm": 0.29692140221595764, + "learning_rate": 5.8697829716193664e-05, + "loss": 0.4286, + "step": 2122 + }, + { + "epoch": 0.0007451926430917737, + "grad_norm": 0.3181445598602295, + "learning_rate": 5.863105175292154e-05, + "loss": 0.5988, + "step": 2123 + }, + { + "epoch": 0.0007455436523442899, + "grad_norm": 0.27625536918640137, + "learning_rate": 5.856427378964942e-05, + "loss": 0.5016, + "step": 2124 + }, + { + "epoch": 0.0007458946615968061, + "grad_norm": 0.28393319249153137, + "learning_rate": 5.84974958263773e-05, + "loss": 0.4275, + "step": 2125 + }, + { + "epoch": 0.0007462456708493221, + "grad_norm": 0.2632163465023041, + "learning_rate": 5.8430717863105176e-05, + "loss": 0.4724, + "step": 2126 + }, + { + "epoch": 0.0007465966801018383, + "grad_norm": 0.27599701285362244, + "learning_rate": 5.8363939899833056e-05, + "loss": 0.4605, + "step": 2127 + }, + { + "epoch": 0.0007469476893543545, + "grad_norm": 0.30116626620292664, + "learning_rate": 5.829716193656094e-05, + "loss": 0.4501, + "step": 2128 + }, + { + "epoch": 0.0007472986986068706, + "grad_norm": 0.3393256664276123, + "learning_rate": 5.8230383973288816e-05, + "loss": 0.4258, + "step": 2129 + }, + { + "epoch": 0.0007476497078593867, + "grad_norm": 0.33133190870285034, + "learning_rate": 5.81636060100167e-05, + "loss": 0.5216, + "step": 2130 + }, + { + "epoch": 0.0007480007171119029, + "grad_norm": 0.33170127868652344, + "learning_rate": 5.8096828046744576e-05, + "loss": 0.5108, + "step": 2131 + }, + { + "epoch": 0.000748351726364419, + "grad_norm": 0.2727866768836975, + "learning_rate": 5.8030050083472456e-05, + "loss": 0.4282, + "step": 2132 + }, + { + "epoch": 0.0007487027356169352, + "grad_norm": 0.3469356894493103, + "learning_rate": 5.796327212020034e-05, + "loss": 0.6135, + "step": 2133 + }, + { + "epoch": 0.0007490537448694513, + "grad_norm": 0.35446175932884216, + "learning_rate": 5.7896494156928216e-05, + "loss": 0.6232, + "step": 2134 + }, + { + "epoch": 0.0007494047541219675, + "grad_norm": 0.2925025522708893, + "learning_rate": 5.782971619365609e-05, + "loss": 0.5422, + "step": 2135 + }, + { + "epoch": 0.0007497557633744836, + "grad_norm": 0.3256363868713379, + "learning_rate": 5.7762938230383976e-05, + "loss": 0.5542, + "step": 2136 + }, + { + "epoch": 0.0007501067726269998, + "grad_norm": 0.30574938654899597, + "learning_rate": 5.7696160267111856e-05, + "loss": 0.4611, + "step": 2137 + }, + { + "epoch": 0.0007504577818795159, + "grad_norm": 0.31476491689682007, + "learning_rate": 5.762938230383974e-05, + "loss": 0.5159, + "step": 2138 + }, + { + "epoch": 0.000750808791132032, + "grad_norm": 0.3236735165119171, + "learning_rate": 5.7562604340567616e-05, + "loss": 0.41, + "step": 2139 + }, + { + "epoch": 0.0007511598003845482, + "grad_norm": 0.3269157409667969, + "learning_rate": 5.749582637729549e-05, + "loss": 0.5634, + "step": 2140 + }, + { + "epoch": 0.0007515108096370644, + "grad_norm": 0.2963981032371521, + "learning_rate": 5.7429048414023375e-05, + "loss": 0.5234, + "step": 2141 + }, + { + "epoch": 0.0007518618188895804, + "grad_norm": 0.3150303363800049, + "learning_rate": 5.7362270450751255e-05, + "loss": 0.5364, + "step": 2142 + }, + { + "epoch": 0.0007522128281420966, + "grad_norm": 0.31830161809921265, + "learning_rate": 5.729549248747913e-05, + "loss": 0.4401, + "step": 2143 + }, + { + "epoch": 0.0007525638373946128, + "grad_norm": 0.31755512952804565, + "learning_rate": 5.7228714524207015e-05, + "loss": 0.5842, + "step": 2144 + }, + { + "epoch": 0.000752914846647129, + "grad_norm": 0.3024124503135681, + "learning_rate": 5.7161936560934895e-05, + "loss": 0.4321, + "step": 2145 + }, + { + "epoch": 0.000753265855899645, + "grad_norm": 0.28298643231391907, + "learning_rate": 5.709515859766278e-05, + "loss": 0.3981, + "step": 2146 + }, + { + "epoch": 0.0007536168651521612, + "grad_norm": 0.35037901997566223, + "learning_rate": 5.7028380634390655e-05, + "loss": 0.5613, + "step": 2147 + }, + { + "epoch": 0.0007539678744046774, + "grad_norm": 0.34800994396209717, + "learning_rate": 5.696160267111853e-05, + "loss": 0.5463, + "step": 2148 + }, + { + "epoch": 0.0007543188836571934, + "grad_norm": 0.2794566750526428, + "learning_rate": 5.6894824707846415e-05, + "loss": 0.6228, + "step": 2149 + }, + { + "epoch": 0.0007546698929097096, + "grad_norm": 0.3680720031261444, + "learning_rate": 5.6828046744574295e-05, + "loss": 0.5515, + "step": 2150 + }, + { + "epoch": 0.0007550209021622258, + "grad_norm": 0.2983403503894806, + "learning_rate": 5.676126878130217e-05, + "loss": 0.4201, + "step": 2151 + }, + { + "epoch": 0.0007553719114147419, + "grad_norm": 0.3301478326320648, + "learning_rate": 5.6694490818030055e-05, + "loss": 0.5004, + "step": 2152 + }, + { + "epoch": 0.000755722920667258, + "grad_norm": 0.32439282536506653, + "learning_rate": 5.662771285475793e-05, + "loss": 0.5586, + "step": 2153 + }, + { + "epoch": 0.0007560739299197742, + "grad_norm": 0.2889827787876129, + "learning_rate": 5.6560934891485815e-05, + "loss": 0.3882, + "step": 2154 + }, + { + "epoch": 0.0007564249391722904, + "grad_norm": 0.3911135792732239, + "learning_rate": 5.6494156928213694e-05, + "loss": 0.542, + "step": 2155 + }, + { + "epoch": 0.0007567759484248065, + "grad_norm": 0.2955808937549591, + "learning_rate": 5.642737896494157e-05, + "loss": 0.5586, + "step": 2156 + }, + { + "epoch": 0.0007571269576773226, + "grad_norm": 0.5502769351005554, + "learning_rate": 5.6360601001669454e-05, + "loss": 0.5416, + "step": 2157 + }, + { + "epoch": 0.0007574779669298388, + "grad_norm": 0.3201058506965637, + "learning_rate": 5.629382303839733e-05, + "loss": 0.4864, + "step": 2158 + }, + { + "epoch": 0.0007578289761823549, + "grad_norm": 0.39620837569236755, + "learning_rate": 5.622704507512521e-05, + "loss": 0.5328, + "step": 2159 + }, + { + "epoch": 0.0007581799854348711, + "grad_norm": 0.3239331543445587, + "learning_rate": 5.6160267111853094e-05, + "loss": 0.5055, + "step": 2160 + }, + { + "epoch": 0.0007585309946873872, + "grad_norm": 0.3722355365753174, + "learning_rate": 5.609348914858097e-05, + "loss": 0.4805, + "step": 2161 + }, + { + "epoch": 0.0007588820039399033, + "grad_norm": 0.3486960232257843, + "learning_rate": 5.6026711185308854e-05, + "loss": 0.4956, + "step": 2162 + }, + { + "epoch": 0.0007592330131924195, + "grad_norm": 0.2911629378795624, + "learning_rate": 5.5959933222036734e-05, + "loss": 0.5888, + "step": 2163 + }, + { + "epoch": 0.0007595840224449357, + "grad_norm": 0.3276377022266388, + "learning_rate": 5.589315525876461e-05, + "loss": 0.4302, + "step": 2164 + }, + { + "epoch": 0.0007599350316974518, + "grad_norm": 0.3614025413990021, + "learning_rate": 5.5826377295492494e-05, + "loss": 0.5716, + "step": 2165 + }, + { + "epoch": 0.0007602860409499679, + "grad_norm": 0.2928791642189026, + "learning_rate": 5.575959933222037e-05, + "loss": 0.5744, + "step": 2166 + }, + { + "epoch": 0.0007606370502024841, + "grad_norm": 0.33232712745666504, + "learning_rate": 5.569282136894825e-05, + "loss": 0.5868, + "step": 2167 + }, + { + "epoch": 0.0007609880594550003, + "grad_norm": 0.28385528922080994, + "learning_rate": 5.5626043405676134e-05, + "loss": 0.4737, + "step": 2168 + }, + { + "epoch": 0.0007613390687075163, + "grad_norm": 0.3103507161140442, + "learning_rate": 5.555926544240401e-05, + "loss": 0.5225, + "step": 2169 + }, + { + "epoch": 0.0007616900779600325, + "grad_norm": 0.32343319058418274, + "learning_rate": 5.5492487479131893e-05, + "loss": 0.5051, + "step": 2170 + }, + { + "epoch": 0.0007620410872125487, + "grad_norm": 0.32034847140312195, + "learning_rate": 5.5425709515859767e-05, + "loss": 0.4835, + "step": 2171 + }, + { + "epoch": 0.0007623920964650648, + "grad_norm": 0.3377881646156311, + "learning_rate": 5.5358931552587646e-05, + "loss": 0.5747, + "step": 2172 + }, + { + "epoch": 0.0007627431057175809, + "grad_norm": 0.3083537518978119, + "learning_rate": 5.529215358931553e-05, + "loss": 0.4212, + "step": 2173 + }, + { + "epoch": 0.0007630941149700971, + "grad_norm": 0.29118117690086365, + "learning_rate": 5.5225375626043406e-05, + "loss": 0.4961, + "step": 2174 + }, + { + "epoch": 0.0007634451242226132, + "grad_norm": 0.28406381607055664, + "learning_rate": 5.5158597662771286e-05, + "loss": 0.4457, + "step": 2175 + }, + { + "epoch": 0.0007637961334751293, + "grad_norm": 0.30152761936187744, + "learning_rate": 5.509181969949917e-05, + "loss": 0.5338, + "step": 2176 + }, + { + "epoch": 0.0007641471427276455, + "grad_norm": 0.2911263108253479, + "learning_rate": 5.5025041736227046e-05, + "loss": 0.5034, + "step": 2177 + }, + { + "epoch": 0.0007644981519801617, + "grad_norm": 0.3346504271030426, + "learning_rate": 5.495826377295493e-05, + "loss": 0.477, + "step": 2178 + }, + { + "epoch": 0.0007648491612326778, + "grad_norm": 0.2966042459011078, + "learning_rate": 5.4891485809682806e-05, + "loss": 0.5751, + "step": 2179 + }, + { + "epoch": 0.000765200170485194, + "grad_norm": 0.3047448694705963, + "learning_rate": 5.4824707846410686e-05, + "loss": 0.5254, + "step": 2180 + }, + { + "epoch": 0.0007655511797377101, + "grad_norm": 0.2856384813785553, + "learning_rate": 5.475792988313857e-05, + "loss": 0.4936, + "step": 2181 + }, + { + "epoch": 0.0007659021889902262, + "grad_norm": 0.3124605119228363, + "learning_rate": 5.4691151919866446e-05, + "loss": 0.4669, + "step": 2182 + }, + { + "epoch": 0.0007662531982427424, + "grad_norm": 0.3174493908882141, + "learning_rate": 5.462437395659432e-05, + "loss": 0.4818, + "step": 2183 + }, + { + "epoch": 0.0007666042074952585, + "grad_norm": 0.2865968942642212, + "learning_rate": 5.4557595993322206e-05, + "loss": 0.5599, + "step": 2184 + }, + { + "epoch": 0.0007669552167477746, + "grad_norm": 0.29531776905059814, + "learning_rate": 5.4490818030050086e-05, + "loss": 0.5399, + "step": 2185 + }, + { + "epoch": 0.0007673062260002908, + "grad_norm": 0.2822519540786743, + "learning_rate": 5.442404006677797e-05, + "loss": 0.4714, + "step": 2186 + }, + { + "epoch": 0.000767657235252807, + "grad_norm": 0.3201335668563843, + "learning_rate": 5.4357262103505845e-05, + "loss": 0.4805, + "step": 2187 + }, + { + "epoch": 0.0007680082445053231, + "grad_norm": 0.3130393326282501, + "learning_rate": 5.429048414023372e-05, + "loss": 0.5615, + "step": 2188 + }, + { + "epoch": 0.0007683592537578392, + "grad_norm": 0.3346283435821533, + "learning_rate": 5.4223706176961605e-05, + "loss": 0.4135, + "step": 2189 + }, + { + "epoch": 0.0007687102630103554, + "grad_norm": 0.3538265526294708, + "learning_rate": 5.4156928213689485e-05, + "loss": 0.6236, + "step": 2190 + }, + { + "epoch": 0.0007690612722628716, + "grad_norm": 0.3180318772792816, + "learning_rate": 5.409015025041736e-05, + "loss": 0.5531, + "step": 2191 + }, + { + "epoch": 0.0007694122815153876, + "grad_norm": 0.29365554451942444, + "learning_rate": 5.4023372287145245e-05, + "loss": 0.4976, + "step": 2192 + }, + { + "epoch": 0.0007697632907679038, + "grad_norm": 0.3063375949859619, + "learning_rate": 5.3956594323873125e-05, + "loss": 0.4142, + "step": 2193 + }, + { + "epoch": 0.00077011430002042, + "grad_norm": 0.3582829535007477, + "learning_rate": 5.388981636060101e-05, + "loss": 0.5482, + "step": 2194 + }, + { + "epoch": 0.000770465309272936, + "grad_norm": 0.33901599049568176, + "learning_rate": 5.3823038397328885e-05, + "loss": 0.4793, + "step": 2195 + }, + { + "epoch": 0.0007708163185254522, + "grad_norm": 0.36831775307655334, + "learning_rate": 5.375626043405676e-05, + "loss": 0.5113, + "step": 2196 + }, + { + "epoch": 0.0007711673277779684, + "grad_norm": 0.28705963492393494, + "learning_rate": 5.3689482470784645e-05, + "loss": 0.4417, + "step": 2197 + }, + { + "epoch": 0.0007715183370304846, + "grad_norm": 0.3740500509738922, + "learning_rate": 5.3622704507512525e-05, + "loss": 0.4813, + "step": 2198 + }, + { + "epoch": 0.0007718693462830007, + "grad_norm": 0.3366188108921051, + "learning_rate": 5.35559265442404e-05, + "loss": 0.6308, + "step": 2199 + }, + { + "epoch": 0.0007722203555355168, + "grad_norm": 0.31155267357826233, + "learning_rate": 5.3489148580968285e-05, + "loss": 0.4941, + "step": 2200 + }, + { + "epoch": 0.000772571364788033, + "grad_norm": 0.30353885889053345, + "learning_rate": 5.342237061769616e-05, + "loss": 0.5345, + "step": 2201 + }, + { + "epoch": 0.0007729223740405491, + "grad_norm": 0.37722593545913696, + "learning_rate": 5.3355592654424044e-05, + "loss": 0.603, + "step": 2202 + }, + { + "epoch": 0.0007732733832930652, + "grad_norm": 0.31102925539016724, + "learning_rate": 5.3288814691151924e-05, + "loss": 0.5088, + "step": 2203 + }, + { + "epoch": 0.0007736243925455814, + "grad_norm": 0.285318523645401, + "learning_rate": 5.32220367278798e-05, + "loss": 0.4774, + "step": 2204 + }, + { + "epoch": 0.0007739754017980975, + "grad_norm": 0.26891449093818665, + "learning_rate": 5.3155258764607684e-05, + "loss": 0.4678, + "step": 2205 + }, + { + "epoch": 0.0007743264110506137, + "grad_norm": 0.40345075726509094, + "learning_rate": 5.308848080133556e-05, + "loss": 0.5511, + "step": 2206 + }, + { + "epoch": 0.0007746774203031298, + "grad_norm": 0.26102039217948914, + "learning_rate": 5.302170283806344e-05, + "loss": 0.5149, + "step": 2207 + }, + { + "epoch": 0.000775028429555646, + "grad_norm": 0.2700537443161011, + "learning_rate": 5.2954924874791324e-05, + "loss": 0.5239, + "step": 2208 + }, + { + "epoch": 0.0007753794388081621, + "grad_norm": 0.3144576847553253, + "learning_rate": 5.28881469115192e-05, + "loss": 0.4237, + "step": 2209 + }, + { + "epoch": 0.0007757304480606783, + "grad_norm": 0.26037758588790894, + "learning_rate": 5.2821368948247084e-05, + "loss": 0.4597, + "step": 2210 + }, + { + "epoch": 0.0007760814573131944, + "grad_norm": 0.2766638398170471, + "learning_rate": 5.2754590984974964e-05, + "loss": 0.4957, + "step": 2211 + }, + { + "epoch": 0.0007764324665657105, + "grad_norm": 0.30142873525619507, + "learning_rate": 5.268781302170284e-05, + "loss": 0.3903, + "step": 2212 + }, + { + "epoch": 0.0007767834758182267, + "grad_norm": 0.4231036305427551, + "learning_rate": 5.2621035058430724e-05, + "loss": 0.4719, + "step": 2213 + }, + { + "epoch": 0.0007771344850707429, + "grad_norm": 0.31130513548851013, + "learning_rate": 5.25542570951586e-05, + "loss": 0.53, + "step": 2214 + }, + { + "epoch": 0.0007774854943232589, + "grad_norm": 0.35050421953201294, + "learning_rate": 5.248747913188648e-05, + "loss": 0.5429, + "step": 2215 + }, + { + "epoch": 0.0007778365035757751, + "grad_norm": 0.3292376399040222, + "learning_rate": 5.2420701168614363e-05, + "loss": 0.4373, + "step": 2216 + }, + { + "epoch": 0.0007781875128282913, + "grad_norm": 0.319871187210083, + "learning_rate": 5.2353923205342237e-05, + "loss": 0.6004, + "step": 2217 + }, + { + "epoch": 0.0007785385220808075, + "grad_norm": 0.35365426540374756, + "learning_rate": 5.228714524207012e-05, + "loss": 0.613, + "step": 2218 + }, + { + "epoch": 0.0007788895313333235, + "grad_norm": 0.3369859457015991, + "learning_rate": 5.2220367278797996e-05, + "loss": 0.5368, + "step": 2219 + }, + { + "epoch": 0.0007792405405858397, + "grad_norm": 0.31861892342567444, + "learning_rate": 5.2153589315525876e-05, + "loss": 0.4325, + "step": 2220 + }, + { + "epoch": 0.0007795915498383559, + "grad_norm": 0.3197747766971588, + "learning_rate": 5.208681135225376e-05, + "loss": 0.5159, + "step": 2221 + }, + { + "epoch": 0.000779942559090872, + "grad_norm": 0.377331018447876, + "learning_rate": 5.2020033388981636e-05, + "loss": 0.4483, + "step": 2222 + }, + { + "epoch": 0.0007802935683433881, + "grad_norm": 0.2843930721282959, + "learning_rate": 5.195325542570952e-05, + "loss": 0.4812, + "step": 2223 + }, + { + "epoch": 0.0007806445775959043, + "grad_norm": 0.28777164220809937, + "learning_rate": 5.18864774624374e-05, + "loss": 0.4099, + "step": 2224 + }, + { + "epoch": 0.0007809955868484204, + "grad_norm": 0.3638690710067749, + "learning_rate": 5.1819699499165276e-05, + "loss": 0.5787, + "step": 2225 + }, + { + "epoch": 0.0007813465961009366, + "grad_norm": 0.36113011837005615, + "learning_rate": 5.175292153589316e-05, + "loss": 0.6242, + "step": 2226 + }, + { + "epoch": 0.0007816976053534527, + "grad_norm": 0.27899301052093506, + "learning_rate": 5.1686143572621036e-05, + "loss": 0.602, + "step": 2227 + }, + { + "epoch": 0.0007820486146059689, + "grad_norm": 0.32224345207214355, + "learning_rate": 5.1619365609348916e-05, + "loss": 0.4784, + "step": 2228 + }, + { + "epoch": 0.000782399623858485, + "grad_norm": 0.3012712597846985, + "learning_rate": 5.15525876460768e-05, + "loss": 0.5268, + "step": 2229 + }, + { + "epoch": 0.0007827506331110012, + "grad_norm": 0.3213576674461365, + "learning_rate": 5.1485809682804676e-05, + "loss": 0.4646, + "step": 2230 + }, + { + "epoch": 0.0007831016423635173, + "grad_norm": 0.3685286045074463, + "learning_rate": 5.141903171953256e-05, + "loss": 0.5744, + "step": 2231 + }, + { + "epoch": 0.0007834526516160334, + "grad_norm": 0.3282943665981293, + "learning_rate": 5.1352253756260436e-05, + "loss": 0.4577, + "step": 2232 + }, + { + "epoch": 0.0007838036608685496, + "grad_norm": 0.3141206204891205, + "learning_rate": 5.1285475792988315e-05, + "loss": 0.4351, + "step": 2233 + }, + { + "epoch": 0.0007841546701210657, + "grad_norm": 0.3435308337211609, + "learning_rate": 5.12186978297162e-05, + "loss": 0.5928, + "step": 2234 + }, + { + "epoch": 0.0007845056793735818, + "grad_norm": 0.37721729278564453, + "learning_rate": 5.1151919866444075e-05, + "loss": 0.4618, + "step": 2235 + }, + { + "epoch": 0.000784856688626098, + "grad_norm": 0.676645815372467, + "learning_rate": 5.108514190317195e-05, + "loss": 0.6159, + "step": 2236 + }, + { + "epoch": 0.0007852076978786142, + "grad_norm": 0.3856793940067291, + "learning_rate": 5.1018363939899835e-05, + "loss": 0.5401, + "step": 2237 + }, + { + "epoch": 0.0007855587071311303, + "grad_norm": 0.30672600865364075, + "learning_rate": 5.0951585976627715e-05, + "loss": 0.5427, + "step": 2238 + }, + { + "epoch": 0.0007859097163836464, + "grad_norm": 0.30035004019737244, + "learning_rate": 5.08848080133556e-05, + "loss": 0.5563, + "step": 2239 + }, + { + "epoch": 0.0007862607256361626, + "grad_norm": 0.29214805364608765, + "learning_rate": 5.0818030050083475e-05, + "loss": 0.4528, + "step": 2240 + }, + { + "epoch": 0.0007866117348886788, + "grad_norm": 0.2923140823841095, + "learning_rate": 5.0751252086811355e-05, + "loss": 0.4968, + "step": 2241 + }, + { + "epoch": 0.0007869627441411948, + "grad_norm": 0.2867215573787689, + "learning_rate": 5.068447412353924e-05, + "loss": 0.3843, + "step": 2242 + }, + { + "epoch": 0.000787313753393711, + "grad_norm": 0.35113075375556946, + "learning_rate": 5.0617696160267115e-05, + "loss": 0.5576, + "step": 2243 + }, + { + "epoch": 0.0007876647626462272, + "grad_norm": 0.3268751800060272, + "learning_rate": 5.055091819699499e-05, + "loss": 0.4707, + "step": 2244 + }, + { + "epoch": 0.0007880157718987433, + "grad_norm": 0.3053974211215973, + "learning_rate": 5.0484140233722875e-05, + "loss": 0.4569, + "step": 2245 + }, + { + "epoch": 0.0007883667811512594, + "grad_norm": 0.29972633719444275, + "learning_rate": 5.0417362270450755e-05, + "loss": 0.5683, + "step": 2246 + }, + { + "epoch": 0.0007887177904037756, + "grad_norm": 0.3231423795223236, + "learning_rate": 5.035058430717864e-05, + "loss": 0.6094, + "step": 2247 + }, + { + "epoch": 0.0007890687996562918, + "grad_norm": 0.3402612805366516, + "learning_rate": 5.0283806343906514e-05, + "loss": 0.4556, + "step": 2248 + }, + { + "epoch": 0.0007894198089088079, + "grad_norm": 0.3409544825553894, + "learning_rate": 5.021702838063439e-05, + "loss": 0.4996, + "step": 2249 + }, + { + "epoch": 0.000789770818161324, + "grad_norm": 0.36050474643707275, + "learning_rate": 5.0150250417362274e-05, + "loss": 0.5084, + "step": 2250 + }, + { + "epoch": 0.0007901218274138402, + "grad_norm": 0.26200827956199646, + "learning_rate": 5.0083472454090154e-05, + "loss": 0.4541, + "step": 2251 + }, + { + "epoch": 0.0007904728366663563, + "grad_norm": 0.2840903401374817, + "learning_rate": 5.001669449081803e-05, + "loss": 0.4942, + "step": 2252 + }, + { + "epoch": 0.0007908238459188725, + "grad_norm": 0.2910694181919098, + "learning_rate": 4.9949916527545914e-05, + "loss": 0.4837, + "step": 2253 + }, + { + "epoch": 0.0007911748551713886, + "grad_norm": 0.3161328136920929, + "learning_rate": 4.988313856427379e-05, + "loss": 0.55, + "step": 2254 + }, + { + "epoch": 0.0007915258644239047, + "grad_norm": 0.2806893587112427, + "learning_rate": 4.9816360601001674e-05, + "loss": 0.5187, + "step": 2255 + }, + { + "epoch": 0.0007918768736764209, + "grad_norm": 0.27077895402908325, + "learning_rate": 4.9749582637729554e-05, + "loss": 0.4551, + "step": 2256 + }, + { + "epoch": 0.000792227882928937, + "grad_norm": 0.29795902967453003, + "learning_rate": 4.9682804674457434e-05, + "loss": 0.4059, + "step": 2257 + }, + { + "epoch": 0.0007925788921814532, + "grad_norm": 0.2706364691257477, + "learning_rate": 4.961602671118531e-05, + "loss": 0.5352, + "step": 2258 + }, + { + "epoch": 0.0007929299014339693, + "grad_norm": 0.2788883149623871, + "learning_rate": 4.9549248747913194e-05, + "loss": 0.4768, + "step": 2259 + }, + { + "epoch": 0.0007932809106864855, + "grad_norm": 0.2956430912017822, + "learning_rate": 4.9482470784641074e-05, + "loss": 0.5953, + "step": 2260 + }, + { + "epoch": 0.0007936319199390016, + "grad_norm": 0.30534154176712036, + "learning_rate": 4.9415692821368953e-05, + "loss": 0.5104, + "step": 2261 + }, + { + "epoch": 0.0007939829291915177, + "grad_norm": 0.35847917199134827, + "learning_rate": 4.934891485809683e-05, + "loss": 0.5711, + "step": 2262 + }, + { + "epoch": 0.0007943339384440339, + "grad_norm": 0.2559013068675995, + "learning_rate": 4.9282136894824707e-05, + "loss": 0.3132, + "step": 2263 + }, + { + "epoch": 0.0007946849476965501, + "grad_norm": 0.33414438366889954, + "learning_rate": 4.921535893155259e-05, + "loss": 0.5808, + "step": 2264 + }, + { + "epoch": 0.0007950359569490661, + "grad_norm": 0.3346371650695801, + "learning_rate": 4.914858096828047e-05, + "loss": 0.5011, + "step": 2265 + }, + { + "epoch": 0.0007953869662015823, + "grad_norm": 0.3767020106315613, + "learning_rate": 4.9081803005008346e-05, + "loss": 0.5518, + "step": 2266 + }, + { + "epoch": 0.0007957379754540985, + "grad_norm": 0.34961530566215515, + "learning_rate": 4.9015025041736226e-05, + "loss": 0.5566, + "step": 2267 + }, + { + "epoch": 0.0007960889847066147, + "grad_norm": 0.42262473702430725, + "learning_rate": 4.894824707846411e-05, + "loss": 0.4769, + "step": 2268 + }, + { + "epoch": 0.0007964399939591307, + "grad_norm": 0.28671953082084656, + "learning_rate": 4.888146911519199e-05, + "loss": 0.4531, + "step": 2269 + }, + { + "epoch": 0.0007967910032116469, + "grad_norm": 0.2979021370410919, + "learning_rate": 4.8814691151919866e-05, + "loss": 0.46, + "step": 2270 + }, + { + "epoch": 0.0007971420124641631, + "grad_norm": 0.310390830039978, + "learning_rate": 4.8747913188647746e-05, + "loss": 0.4786, + "step": 2271 + }, + { + "epoch": 0.0007974930217166792, + "grad_norm": 0.2858920693397522, + "learning_rate": 4.8681135225375626e-05, + "loss": 0.5615, + "step": 2272 + }, + { + "epoch": 0.0007978440309691953, + "grad_norm": 0.30646857619285583, + "learning_rate": 4.861435726210351e-05, + "loss": 0.4111, + "step": 2273 + }, + { + "epoch": 0.0007981950402217115, + "grad_norm": 0.2704682946205139, + "learning_rate": 4.8547579298831386e-05, + "loss": 0.4601, + "step": 2274 + }, + { + "epoch": 0.0007985460494742276, + "grad_norm": 0.2745610475540161, + "learning_rate": 4.8480801335559266e-05, + "loss": 0.444, + "step": 2275 + }, + { + "epoch": 0.0007988970587267438, + "grad_norm": 0.318915992975235, + "learning_rate": 4.8414023372287146e-05, + "loss": 0.461, + "step": 2276 + }, + { + "epoch": 0.0007992480679792599, + "grad_norm": 0.38466915488243103, + "learning_rate": 4.834724540901503e-05, + "loss": 0.5273, + "step": 2277 + }, + { + "epoch": 0.0007995990772317761, + "grad_norm": 0.343703955411911, + "learning_rate": 4.8280467445742906e-05, + "loss": 0.4381, + "step": 2278 + }, + { + "epoch": 0.0007999500864842922, + "grad_norm": 0.30002158880233765, + "learning_rate": 4.8213689482470785e-05, + "loss": 0.3609, + "step": 2279 + }, + { + "epoch": 0.0008003010957368084, + "grad_norm": 0.2969815135002136, + "learning_rate": 4.8146911519198665e-05, + "loss": 0.4335, + "step": 2280 + }, + { + "epoch": 0.0008006521049893245, + "grad_norm": 0.24625307321548462, + "learning_rate": 4.8080133555926545e-05, + "loss": 0.3024, + "step": 2281 + }, + { + "epoch": 0.0008010031142418406, + "grad_norm": 0.3032619059085846, + "learning_rate": 4.8013355592654425e-05, + "loss": 0.5862, + "step": 2282 + }, + { + "epoch": 0.0008013541234943568, + "grad_norm": 0.3563072085380554, + "learning_rate": 4.7946577629382305e-05, + "loss": 0.5831, + "step": 2283 + }, + { + "epoch": 0.000801705132746873, + "grad_norm": 0.27989256381988525, + "learning_rate": 4.7879799666110185e-05, + "loss": 0.407, + "step": 2284 + }, + { + "epoch": 0.000802056141999389, + "grad_norm": 0.3893837034702301, + "learning_rate": 4.7813021702838065e-05, + "loss": 0.489, + "step": 2285 + }, + { + "epoch": 0.0008024071512519052, + "grad_norm": 0.2796432673931122, + "learning_rate": 4.7746243739565945e-05, + "loss": 0.5497, + "step": 2286 + }, + { + "epoch": 0.0008027581605044214, + "grad_norm": 0.30520594120025635, + "learning_rate": 4.7679465776293825e-05, + "loss": 0.5586, + "step": 2287 + }, + { + "epoch": 0.0008031091697569375, + "grad_norm": 0.31399065256118774, + "learning_rate": 4.7612687813021705e-05, + "loss": 0.5585, + "step": 2288 + }, + { + "epoch": 0.0008034601790094536, + "grad_norm": 0.29442235827445984, + "learning_rate": 4.7545909849749585e-05, + "loss": 0.487, + "step": 2289 + }, + { + "epoch": 0.0008038111882619698, + "grad_norm": 0.33235105872154236, + "learning_rate": 4.7479131886477465e-05, + "loss": 0.5476, + "step": 2290 + }, + { + "epoch": 0.000804162197514486, + "grad_norm": 0.31871527433395386, + "learning_rate": 4.7412353923205345e-05, + "loss": 0.5141, + "step": 2291 + }, + { + "epoch": 0.000804513206767002, + "grad_norm": 0.3413945138454437, + "learning_rate": 4.7345575959933225e-05, + "loss": 0.5544, + "step": 2292 + }, + { + "epoch": 0.0008048642160195182, + "grad_norm": 0.3110330402851105, + "learning_rate": 4.7278797996661104e-05, + "loss": 0.5044, + "step": 2293 + }, + { + "epoch": 0.0008052152252720344, + "grad_norm": 0.3235619068145752, + "learning_rate": 4.7212020033388984e-05, + "loss": 0.5005, + "step": 2294 + }, + { + "epoch": 0.0008055662345245505, + "grad_norm": 0.2979834973812103, + "learning_rate": 4.7145242070116864e-05, + "loss": 0.5182, + "step": 2295 + }, + { + "epoch": 0.0008059172437770666, + "grad_norm": 0.3092743456363678, + "learning_rate": 4.7078464106844744e-05, + "loss": 0.5234, + "step": 2296 + }, + { + "epoch": 0.0008062682530295828, + "grad_norm": 0.2838219702243805, + "learning_rate": 4.7011686143572624e-05, + "loss": 0.4375, + "step": 2297 + }, + { + "epoch": 0.000806619262282099, + "grad_norm": 0.2947825491428375, + "learning_rate": 4.6944908180300504e-05, + "loss": 0.5849, + "step": 2298 + }, + { + "epoch": 0.0008069702715346151, + "grad_norm": 0.32933109998703003, + "learning_rate": 4.6878130217028384e-05, + "loss": 0.5916, + "step": 2299 + }, + { + "epoch": 0.0008073212807871312, + "grad_norm": 0.28970029950141907, + "learning_rate": 4.6811352253756264e-05, + "loss": 0.5157, + "step": 2300 + }, + { + "epoch": 0.0008076722900396474, + "grad_norm": 0.30502164363861084, + "learning_rate": 4.6744574290484144e-05, + "loss": 0.475, + "step": 2301 + }, + { + "epoch": 0.0008080232992921635, + "grad_norm": 0.3376252353191376, + "learning_rate": 4.667779632721202e-05, + "loss": 0.5223, + "step": 2302 + }, + { + "epoch": 0.0008083743085446797, + "grad_norm": 0.3515482246875763, + "learning_rate": 4.6611018363939904e-05, + "loss": 0.4526, + "step": 2303 + }, + { + "epoch": 0.0008087253177971958, + "grad_norm": 0.27139726281166077, + "learning_rate": 4.6544240400667784e-05, + "loss": 0.3947, + "step": 2304 + }, + { + "epoch": 0.0008090763270497119, + "grad_norm": 0.329605370759964, + "learning_rate": 4.6477462437395664e-05, + "loss": 0.599, + "step": 2305 + }, + { + "epoch": 0.0008094273363022281, + "grad_norm": 0.2759001553058624, + "learning_rate": 4.641068447412354e-05, + "loss": 0.4998, + "step": 2306 + }, + { + "epoch": 0.0008097783455547443, + "grad_norm": 0.312492311000824, + "learning_rate": 4.6343906510851423e-05, + "loss": 0.4926, + "step": 2307 + }, + { + "epoch": 0.0008101293548072604, + "grad_norm": 0.29779669642448425, + "learning_rate": 4.6277128547579303e-05, + "loss": 0.4746, + "step": 2308 + }, + { + "epoch": 0.0008104803640597765, + "grad_norm": 0.3351886570453644, + "learning_rate": 4.621035058430718e-05, + "loss": 0.4445, + "step": 2309 + }, + { + "epoch": 0.0008108313733122927, + "grad_norm": 0.8489035367965698, + "learning_rate": 4.6143572621035056e-05, + "loss": 0.5599, + "step": 2310 + }, + { + "epoch": 0.0008111823825648089, + "grad_norm": 0.31646668910980225, + "learning_rate": 4.6076794657762936e-05, + "loss": 0.4444, + "step": 2311 + }, + { + "epoch": 0.0008115333918173249, + "grad_norm": 0.294809490442276, + "learning_rate": 4.601001669449082e-05, + "loss": 0.4919, + "step": 2312 + }, + { + "epoch": 0.0008118844010698411, + "grad_norm": 0.3671543598175049, + "learning_rate": 4.59432387312187e-05, + "loss": 0.4802, + "step": 2313 + }, + { + "epoch": 0.0008122354103223573, + "grad_norm": 0.2710740268230438, + "learning_rate": 4.5876460767946576e-05, + "loss": 0.3985, + "step": 2314 + }, + { + "epoch": 0.0008125864195748733, + "grad_norm": 0.32188868522644043, + "learning_rate": 4.5809682804674456e-05, + "loss": 0.4829, + "step": 2315 + }, + { + "epoch": 0.0008129374288273895, + "grad_norm": 0.3944168984889984, + "learning_rate": 4.574290484140234e-05, + "loss": 0.4558, + "step": 2316 + }, + { + "epoch": 0.0008132884380799057, + "grad_norm": 0.3056366741657257, + "learning_rate": 4.567612687813022e-05, + "loss": 0.5274, + "step": 2317 + }, + { + "epoch": 0.0008136394473324219, + "grad_norm": 0.9373723864555359, + "learning_rate": 4.5609348914858096e-05, + "loss": 0.5398, + "step": 2318 + }, + { + "epoch": 0.0008139904565849379, + "grad_norm": 0.26745036244392395, + "learning_rate": 4.5542570951585976e-05, + "loss": 0.4388, + "step": 2319 + }, + { + "epoch": 0.0008143414658374541, + "grad_norm": 0.30698806047439575, + "learning_rate": 4.5475792988313856e-05, + "loss": 0.4167, + "step": 2320 + }, + { + "epoch": 0.0008146924750899703, + "grad_norm": 0.36348575353622437, + "learning_rate": 4.540901502504174e-05, + "loss": 0.421, + "step": 2321 + }, + { + "epoch": 0.0008150434843424864, + "grad_norm": 0.49959614872932434, + "learning_rate": 4.5342237061769616e-05, + "loss": 0.3835, + "step": 2322 + }, + { + "epoch": 0.0008153944935950025, + "grad_norm": 0.3920055329799652, + "learning_rate": 4.5275459098497496e-05, + "loss": 0.4392, + "step": 2323 + }, + { + "epoch": 0.0008157455028475187, + "grad_norm": 0.3473761975765228, + "learning_rate": 4.5208681135225376e-05, + "loss": 0.5014, + "step": 2324 + }, + { + "epoch": 0.0008160965121000348, + "grad_norm": 0.29744240641593933, + "learning_rate": 4.514190317195326e-05, + "loss": 0.5175, + "step": 2325 + }, + { + "epoch": 0.000816447521352551, + "grad_norm": 0.35290253162384033, + "learning_rate": 4.5075125208681135e-05, + "loss": 0.5991, + "step": 2326 + }, + { + "epoch": 0.0008167985306050671, + "grad_norm": 0.3837706446647644, + "learning_rate": 4.5008347245409015e-05, + "loss": 0.5866, + "step": 2327 + }, + { + "epoch": 0.0008171495398575833, + "grad_norm": 0.321729451417923, + "learning_rate": 4.4941569282136895e-05, + "loss": 0.5387, + "step": 2328 + }, + { + "epoch": 0.0008175005491100994, + "grad_norm": 0.6311901211738586, + "learning_rate": 4.4874791318864775e-05, + "loss": 0.5239, + "step": 2329 + }, + { + "epoch": 0.0008178515583626156, + "grad_norm": 0.6958840489387512, + "learning_rate": 4.4808013355592655e-05, + "loss": 0.568, + "step": 2330 + }, + { + "epoch": 0.0008182025676151317, + "grad_norm": 0.5229877829551697, + "learning_rate": 4.4741235392320535e-05, + "loss": 0.6472, + "step": 2331 + }, + { + "epoch": 0.0008185535768676478, + "grad_norm": 0.3351100981235504, + "learning_rate": 4.4674457429048415e-05, + "loss": 0.5663, + "step": 2332 + }, + { + "epoch": 0.000818904586120164, + "grad_norm": 0.3409821689128876, + "learning_rate": 4.4607679465776295e-05, + "loss": 0.4835, + "step": 2333 + }, + { + "epoch": 0.0008192555953726802, + "grad_norm": 0.3333572745323181, + "learning_rate": 4.4540901502504175e-05, + "loss": 0.5457, + "step": 2334 + }, + { + "epoch": 0.0008196066046251962, + "grad_norm": 0.45605313777923584, + "learning_rate": 4.4474123539232055e-05, + "loss": 0.5375, + "step": 2335 + }, + { + "epoch": 0.0008199576138777124, + "grad_norm": 0.2985444664955139, + "learning_rate": 4.4407345575959935e-05, + "loss": 0.5018, + "step": 2336 + }, + { + "epoch": 0.0008203086231302286, + "grad_norm": 0.480658620595932, + "learning_rate": 4.4340567612687815e-05, + "loss": 0.5736, + "step": 2337 + }, + { + "epoch": 0.0008206596323827448, + "grad_norm": 0.38944509625434875, + "learning_rate": 4.4273789649415695e-05, + "loss": 0.449, + "step": 2338 + }, + { + "epoch": 0.0008210106416352608, + "grad_norm": 0.3390035629272461, + "learning_rate": 4.4207011686143574e-05, + "loss": 0.595, + "step": 2339 + }, + { + "epoch": 0.000821361650887777, + "grad_norm": 0.3503229022026062, + "learning_rate": 4.4140233722871454e-05, + "loss": 0.5451, + "step": 2340 + }, + { + "epoch": 0.0008217126601402932, + "grad_norm": 0.29299256205558777, + "learning_rate": 4.4073455759599334e-05, + "loss": 0.5404, + "step": 2341 + }, + { + "epoch": 0.0008220636693928092, + "grad_norm": 0.35951006412506104, + "learning_rate": 4.4006677796327214e-05, + "loss": 0.6232, + "step": 2342 + }, + { + "epoch": 0.0008224146786453254, + "grad_norm": 0.3211289346218109, + "learning_rate": 4.3939899833055094e-05, + "loss": 0.5601, + "step": 2343 + }, + { + "epoch": 0.0008227656878978416, + "grad_norm": 0.3218986392021179, + "learning_rate": 4.3873121869782974e-05, + "loss": 0.5392, + "step": 2344 + }, + { + "epoch": 0.0008231166971503577, + "grad_norm": 0.29046937823295593, + "learning_rate": 4.3806343906510854e-05, + "loss": 0.3494, + "step": 2345 + }, + { + "epoch": 0.0008234677064028738, + "grad_norm": 0.33025527000427246, + "learning_rate": 4.373956594323873e-05, + "loss": 0.4713, + "step": 2346 + }, + { + "epoch": 0.00082381871565539, + "grad_norm": 0.3046811521053314, + "learning_rate": 4.3672787979966614e-05, + "loss": 0.5873, + "step": 2347 + }, + { + "epoch": 0.0008241697249079062, + "grad_norm": 0.330526202917099, + "learning_rate": 4.3606010016694494e-05, + "loss": 0.5937, + "step": 2348 + }, + { + "epoch": 0.0008245207341604223, + "grad_norm": 0.309096097946167, + "learning_rate": 4.3539232053422374e-05, + "loss": 0.5905, + "step": 2349 + }, + { + "epoch": 0.0008248717434129384, + "grad_norm": 0.2798556685447693, + "learning_rate": 4.3472454090150254e-05, + "loss": 0.4126, + "step": 2350 + }, + { + "epoch": 0.0008252227526654546, + "grad_norm": 0.3218364417552948, + "learning_rate": 4.3405676126878134e-05, + "loss": 0.447, + "step": 2351 + }, + { + "epoch": 0.0008255737619179707, + "grad_norm": 0.32477137446403503, + "learning_rate": 4.3338898163606014e-05, + "loss": 0.5111, + "step": 2352 + }, + { + "epoch": 0.0008259247711704869, + "grad_norm": 0.32486987113952637, + "learning_rate": 4.3272120200333893e-05, + "loss": 0.4991, + "step": 2353 + }, + { + "epoch": 0.000826275780423003, + "grad_norm": 0.26125961542129517, + "learning_rate": 4.3205342237061773e-05, + "loss": 0.507, + "step": 2354 + }, + { + "epoch": 0.0008266267896755191, + "grad_norm": 0.29981791973114014, + "learning_rate": 4.313856427378965e-05, + "loss": 0.5704, + "step": 2355 + }, + { + "epoch": 0.0008269777989280353, + "grad_norm": 0.4315311014652252, + "learning_rate": 4.307178631051753e-05, + "loss": 0.4945, + "step": 2356 + }, + { + "epoch": 0.0008273288081805515, + "grad_norm": 0.2862604260444641, + "learning_rate": 4.300500834724541e-05, + "loss": 0.5129, + "step": 2357 + }, + { + "epoch": 0.0008276798174330676, + "grad_norm": 0.3008829951286316, + "learning_rate": 4.293823038397329e-05, + "loss": 0.4608, + "step": 2358 + }, + { + "epoch": 0.0008280308266855837, + "grad_norm": 0.3753371834754944, + "learning_rate": 4.2871452420701166e-05, + "loss": 0.3947, + "step": 2359 + }, + { + "epoch": 0.0008283818359380999, + "grad_norm": 0.310059130191803, + "learning_rate": 4.280467445742905e-05, + "loss": 0.487, + "step": 2360 + }, + { + "epoch": 0.0008287328451906161, + "grad_norm": 0.29558148980140686, + "learning_rate": 4.273789649415693e-05, + "loss": 0.4524, + "step": 2361 + }, + { + "epoch": 0.0008290838544431321, + "grad_norm": 0.3092529773712158, + "learning_rate": 4.267111853088481e-05, + "loss": 0.558, + "step": 2362 + }, + { + "epoch": 0.0008294348636956483, + "grad_norm": 0.3629109263420105, + "learning_rate": 4.2604340567612686e-05, + "loss": 0.5566, + "step": 2363 + }, + { + "epoch": 0.0008297858729481645, + "grad_norm": 0.3263145983219147, + "learning_rate": 4.253756260434057e-05, + "loss": 0.5271, + "step": 2364 + }, + { + "epoch": 0.0008301368822006805, + "grad_norm": 0.32853761315345764, + "learning_rate": 4.247078464106845e-05, + "loss": 0.5364, + "step": 2365 + }, + { + "epoch": 0.0008304878914531967, + "grad_norm": 0.29384636878967285, + "learning_rate": 4.240400667779633e-05, + "loss": 0.4424, + "step": 2366 + }, + { + "epoch": 0.0008308389007057129, + "grad_norm": 0.30362242460250854, + "learning_rate": 4.2337228714524206e-05, + "loss": 0.5283, + "step": 2367 + }, + { + "epoch": 0.0008311899099582291, + "grad_norm": 0.2768915295600891, + "learning_rate": 4.2270450751252086e-05, + "loss": 0.4866, + "step": 2368 + }, + { + "epoch": 0.0008315409192107451, + "grad_norm": 0.30676960945129395, + "learning_rate": 4.220367278797997e-05, + "loss": 0.4834, + "step": 2369 + }, + { + "epoch": 0.0008318919284632613, + "grad_norm": 0.34929925203323364, + "learning_rate": 4.213689482470785e-05, + "loss": 0.5445, + "step": 2370 + }, + { + "epoch": 0.0008322429377157775, + "grad_norm": 0.2859930396080017, + "learning_rate": 4.2070116861435725e-05, + "loss": 0.4663, + "step": 2371 + }, + { + "epoch": 0.0008325939469682936, + "grad_norm": 0.3314751088619232, + "learning_rate": 4.2003338898163605e-05, + "loss": 0.6377, + "step": 2372 + }, + { + "epoch": 0.0008329449562208097, + "grad_norm": 0.2735826373100281, + "learning_rate": 4.193656093489149e-05, + "loss": 0.3718, + "step": 2373 + }, + { + "epoch": 0.0008332959654733259, + "grad_norm": 0.3017156422138214, + "learning_rate": 4.186978297161937e-05, + "loss": 0.5561, + "step": 2374 + }, + { + "epoch": 0.000833646974725842, + "grad_norm": 0.28279563784599304, + "learning_rate": 4.1803005008347245e-05, + "loss": 0.4622, + "step": 2375 + }, + { + "epoch": 0.0008339979839783582, + "grad_norm": 0.3143702745437622, + "learning_rate": 4.1736227045075125e-05, + "loss": 0.538, + "step": 2376 + }, + { + "epoch": 0.0008343489932308743, + "grad_norm": 0.33771878480911255, + "learning_rate": 4.1669449081803005e-05, + "loss": 0.5423, + "step": 2377 + }, + { + "epoch": 0.0008347000024833905, + "grad_norm": 0.32004043459892273, + "learning_rate": 4.160267111853089e-05, + "loss": 0.5785, + "step": 2378 + }, + { + "epoch": 0.0008350510117359066, + "grad_norm": 0.3358834981918335, + "learning_rate": 4.1535893155258765e-05, + "loss": 0.6388, + "step": 2379 + }, + { + "epoch": 0.0008354020209884228, + "grad_norm": 0.3659215271472931, + "learning_rate": 4.1469115191986645e-05, + "loss": 0.5694, + "step": 2380 + }, + { + "epoch": 0.0008357530302409389, + "grad_norm": 0.29528388381004333, + "learning_rate": 4.1402337228714525e-05, + "loss": 0.4486, + "step": 2381 + }, + { + "epoch": 0.000836104039493455, + "grad_norm": 0.299845814704895, + "learning_rate": 4.133555926544241e-05, + "loss": 0.4146, + "step": 2382 + }, + { + "epoch": 0.0008364550487459712, + "grad_norm": 0.28873342275619507, + "learning_rate": 4.1268781302170285e-05, + "loss": 0.4332, + "step": 2383 + }, + { + "epoch": 0.0008368060579984874, + "grad_norm": 0.3562033176422119, + "learning_rate": 4.1202003338898165e-05, + "loss": 0.4753, + "step": 2384 + }, + { + "epoch": 0.0008371570672510034, + "grad_norm": 0.28127390146255493, + "learning_rate": 4.1135225375626044e-05, + "loss": 0.4853, + "step": 2385 + }, + { + "epoch": 0.0008375080765035196, + "grad_norm": 0.33200159668922424, + "learning_rate": 4.1068447412353924e-05, + "loss": 0.4871, + "step": 2386 + }, + { + "epoch": 0.0008378590857560358, + "grad_norm": 0.3686981499195099, + "learning_rate": 4.1001669449081804e-05, + "loss": 0.5417, + "step": 2387 + }, + { + "epoch": 0.000838210095008552, + "grad_norm": 0.3078843057155609, + "learning_rate": 4.0934891485809684e-05, + "loss": 0.5379, + "step": 2388 + }, + { + "epoch": 0.000838561104261068, + "grad_norm": 0.29550114274024963, + "learning_rate": 4.0868113522537564e-05, + "loss": 0.5334, + "step": 2389 + }, + { + "epoch": 0.0008389121135135842, + "grad_norm": 0.31512629985809326, + "learning_rate": 4.0801335559265444e-05, + "loss": 0.4291, + "step": 2390 + }, + { + "epoch": 0.0008392631227661004, + "grad_norm": 0.30229613184928894, + "learning_rate": 4.0734557595993324e-05, + "loss": 0.5583, + "step": 2391 + }, + { + "epoch": 0.0008396141320186164, + "grad_norm": 0.34097641706466675, + "learning_rate": 4.0667779632721204e-05, + "loss": 0.4972, + "step": 2392 + }, + { + "epoch": 0.0008399651412711326, + "grad_norm": 0.3585929274559021, + "learning_rate": 4.0601001669449084e-05, + "loss": 0.4456, + "step": 2393 + }, + { + "epoch": 0.0008403161505236488, + "grad_norm": 0.30176597833633423, + "learning_rate": 4.0534223706176964e-05, + "loss": 0.462, + "step": 2394 + }, + { + "epoch": 0.0008406671597761649, + "grad_norm": 0.31252893805503845, + "learning_rate": 4.0467445742904844e-05, + "loss": 0.4703, + "step": 2395 + }, + { + "epoch": 0.000841018169028681, + "grad_norm": 0.3262486159801483, + "learning_rate": 4.0400667779632724e-05, + "loss": 0.5536, + "step": 2396 + }, + { + "epoch": 0.0008413691782811972, + "grad_norm": 0.2762390077114105, + "learning_rate": 4.0333889816360604e-05, + "loss": 0.4422, + "step": 2397 + }, + { + "epoch": 0.0008417201875337133, + "grad_norm": 0.3413786292076111, + "learning_rate": 4.0267111853088484e-05, + "loss": 0.4447, + "step": 2398 + }, + { + "epoch": 0.0008420711967862295, + "grad_norm": 0.31144657731056213, + "learning_rate": 4.0200333889816363e-05, + "loss": 0.4937, + "step": 2399 + }, + { + "epoch": 0.0008424222060387456, + "grad_norm": 0.3274284899234772, + "learning_rate": 4.0133555926544243e-05, + "loss": 0.5868, + "step": 2400 + }, + { + "epoch": 0.0008427732152912618, + "grad_norm": 0.3613366186618805, + "learning_rate": 4.006677796327212e-05, + "loss": 0.4489, + "step": 2401 + }, + { + "epoch": 0.0008431242245437779, + "grad_norm": 0.38178175687789917, + "learning_rate": 4e-05, + "loss": 0.576, + "step": 2402 + }, + { + "epoch": 0.0008434752337962941, + "grad_norm": 0.35652783513069153, + "learning_rate": 3.9933222036727876e-05, + "loss": 0.6034, + "step": 2403 + }, + { + "epoch": 0.0008438262430488102, + "grad_norm": 0.3658648431301117, + "learning_rate": 3.986644407345576e-05, + "loss": 0.5243, + "step": 2404 + }, + { + "epoch": 0.0008441772523013263, + "grad_norm": 0.30486276745796204, + "learning_rate": 3.979966611018364e-05, + "loss": 0.4327, + "step": 2405 + }, + { + "epoch": 0.0008445282615538425, + "grad_norm": 0.2804754376411438, + "learning_rate": 3.973288814691152e-05, + "loss": 0.4825, + "step": 2406 + }, + { + "epoch": 0.0008448792708063587, + "grad_norm": 0.28429001569747925, + "learning_rate": 3.9666110183639396e-05, + "loss": 0.4161, + "step": 2407 + }, + { + "epoch": 0.0008452302800588747, + "grad_norm": 0.30368781089782715, + "learning_rate": 3.959933222036728e-05, + "loss": 0.5696, + "step": 2408 + }, + { + "epoch": 0.0008455812893113909, + "grad_norm": 0.33198389410972595, + "learning_rate": 3.953255425709516e-05, + "loss": 0.5458, + "step": 2409 + }, + { + "epoch": 0.0008459322985639071, + "grad_norm": 0.2976115942001343, + "learning_rate": 3.946577629382304e-05, + "loss": 0.5013, + "step": 2410 + }, + { + "epoch": 0.0008462833078164233, + "grad_norm": 0.34938329458236694, + "learning_rate": 3.9398998330550916e-05, + "loss": 0.6064, + "step": 2411 + }, + { + "epoch": 0.0008466343170689393, + "grad_norm": 0.30314376950263977, + "learning_rate": 3.93322203672788e-05, + "loss": 0.5154, + "step": 2412 + }, + { + "epoch": 0.0008469853263214555, + "grad_norm": 0.30583375692367554, + "learning_rate": 3.926544240400668e-05, + "loss": 0.5362, + "step": 2413 + }, + { + "epoch": 0.0008473363355739717, + "grad_norm": 0.3435641825199127, + "learning_rate": 3.919866444073456e-05, + "loss": 0.4087, + "step": 2414 + }, + { + "epoch": 0.0008476873448264877, + "grad_norm": 0.3141246736049652, + "learning_rate": 3.9131886477462436e-05, + "loss": 0.5155, + "step": 2415 + }, + { + "epoch": 0.0008480383540790039, + "grad_norm": 0.301431804895401, + "learning_rate": 3.9065108514190316e-05, + "loss": 0.4345, + "step": 2416 + }, + { + "epoch": 0.0008483893633315201, + "grad_norm": 0.2610575556755066, + "learning_rate": 3.89983305509182e-05, + "loss": 0.5468, + "step": 2417 + }, + { + "epoch": 0.0008487403725840362, + "grad_norm": 0.30231544375419617, + "learning_rate": 3.893155258764608e-05, + "loss": 0.4617, + "step": 2418 + }, + { + "epoch": 0.0008490913818365523, + "grad_norm": 0.3302491307258606, + "learning_rate": 3.8864774624373955e-05, + "loss": 0.5269, + "step": 2419 + }, + { + "epoch": 0.0008494423910890685, + "grad_norm": 0.31854262948036194, + "learning_rate": 3.8797996661101835e-05, + "loss": 0.5081, + "step": 2420 + }, + { + "epoch": 0.0008497934003415847, + "grad_norm": 0.356121689081192, + "learning_rate": 3.873121869782972e-05, + "loss": 0.5109, + "step": 2421 + }, + { + "epoch": 0.0008501444095941008, + "grad_norm": 0.3252284526824951, + "learning_rate": 3.86644407345576e-05, + "loss": 0.5355, + "step": 2422 + }, + { + "epoch": 0.0008504954188466169, + "grad_norm": 0.3570926785469055, + "learning_rate": 3.8597662771285475e-05, + "loss": 0.3933, + "step": 2423 + }, + { + "epoch": 0.0008508464280991331, + "grad_norm": 0.41406819224357605, + "learning_rate": 3.8530884808013355e-05, + "loss": 0.5708, + "step": 2424 + }, + { + "epoch": 0.0008511974373516492, + "grad_norm": 0.26306653022766113, + "learning_rate": 3.8464106844741235e-05, + "loss": 0.4166, + "step": 2425 + }, + { + "epoch": 0.0008515484466041654, + "grad_norm": 0.32971739768981934, + "learning_rate": 3.839732888146912e-05, + "loss": 0.4723, + "step": 2426 + }, + { + "epoch": 0.0008518994558566815, + "grad_norm": 0.3209386467933655, + "learning_rate": 3.8330550918196995e-05, + "loss": 0.4646, + "step": 2427 + }, + { + "epoch": 0.0008522504651091976, + "grad_norm": 0.40913888812065125, + "learning_rate": 3.8263772954924875e-05, + "loss": 0.4713, + "step": 2428 + }, + { + "epoch": 0.0008526014743617138, + "grad_norm": 0.3265860974788666, + "learning_rate": 3.8196994991652755e-05, + "loss": 0.4628, + "step": 2429 + }, + { + "epoch": 0.00085295248361423, + "grad_norm": 0.3348692059516907, + "learning_rate": 3.813021702838064e-05, + "loss": 0.5985, + "step": 2430 + }, + { + "epoch": 0.0008533034928667461, + "grad_norm": 0.31986677646636963, + "learning_rate": 3.8063439065108514e-05, + "loss": 0.6139, + "step": 2431 + }, + { + "epoch": 0.0008536545021192622, + "grad_norm": 0.35525721311569214, + "learning_rate": 3.7996661101836394e-05, + "loss": 0.5216, + "step": 2432 + }, + { + "epoch": 0.0008540055113717784, + "grad_norm": 0.3543768525123596, + "learning_rate": 3.7929883138564274e-05, + "loss": 0.52, + "step": 2433 + }, + { + "epoch": 0.0008543565206242946, + "grad_norm": 0.31203389167785645, + "learning_rate": 3.7863105175292154e-05, + "loss": 0.4891, + "step": 2434 + }, + { + "epoch": 0.0008547075298768106, + "grad_norm": 0.30776453018188477, + "learning_rate": 3.7796327212020034e-05, + "loss": 0.4326, + "step": 2435 + }, + { + "epoch": 0.0008550585391293268, + "grad_norm": 0.29725879430770874, + "learning_rate": 3.7729549248747914e-05, + "loss": 0.3621, + "step": 2436 + }, + { + "epoch": 0.000855409548381843, + "grad_norm": 0.3332844376564026, + "learning_rate": 3.7662771285475794e-05, + "loss": 0.5518, + "step": 2437 + }, + { + "epoch": 0.000855760557634359, + "grad_norm": 0.34597867727279663, + "learning_rate": 3.7595993322203674e-05, + "loss": 0.5084, + "step": 2438 + }, + { + "epoch": 0.0008561115668868752, + "grad_norm": 0.3425275981426239, + "learning_rate": 3.7529215358931554e-05, + "loss": 0.5683, + "step": 2439 + }, + { + "epoch": 0.0008564625761393914, + "grad_norm": 0.35414308309555054, + "learning_rate": 3.7462437395659434e-05, + "loss": 0.5962, + "step": 2440 + }, + { + "epoch": 0.0008568135853919076, + "grad_norm": 0.31397873163223267, + "learning_rate": 3.7395659432387314e-05, + "loss": 0.5067, + "step": 2441 + }, + { + "epoch": 0.0008571645946444236, + "grad_norm": 0.3142837584018707, + "learning_rate": 3.7328881469115194e-05, + "loss": 0.5437, + "step": 2442 + }, + { + "epoch": 0.0008575156038969398, + "grad_norm": 0.3198903501033783, + "learning_rate": 3.7262103505843074e-05, + "loss": 0.4864, + "step": 2443 + }, + { + "epoch": 0.000857866613149456, + "grad_norm": 0.37642693519592285, + "learning_rate": 3.7195325542570954e-05, + "loss": 0.4489, + "step": 2444 + }, + { + "epoch": 0.0008582176224019721, + "grad_norm": 0.31032124161720276, + "learning_rate": 3.7128547579298833e-05, + "loss": 0.3956, + "step": 2445 + }, + { + "epoch": 0.0008585686316544882, + "grad_norm": 0.2642196714878082, + "learning_rate": 3.7061769616026713e-05, + "loss": 0.3515, + "step": 2446 + }, + { + "epoch": 0.0008589196409070044, + "grad_norm": 0.2694128751754761, + "learning_rate": 3.699499165275459e-05, + "loss": 0.4628, + "step": 2447 + }, + { + "epoch": 0.0008592706501595205, + "grad_norm": 0.4253450632095337, + "learning_rate": 3.692821368948247e-05, + "loss": 0.5667, + "step": 2448 + }, + { + "epoch": 0.0008596216594120367, + "grad_norm": 0.32464760541915894, + "learning_rate": 3.686143572621035e-05, + "loss": 0.5857, + "step": 2449 + }, + { + "epoch": 0.0008599726686645528, + "grad_norm": 0.298491507768631, + "learning_rate": 3.679465776293823e-05, + "loss": 0.4721, + "step": 2450 + }, + { + "epoch": 0.000860323677917069, + "grad_norm": 0.36551931500434875, + "learning_rate": 3.6727879799666106e-05, + "loss": 0.491, + "step": 2451 + }, + { + "epoch": 0.0008606746871695851, + "grad_norm": 0.3350832164287567, + "learning_rate": 3.666110183639399e-05, + "loss": 0.5654, + "step": 2452 + }, + { + "epoch": 0.0008610256964221013, + "grad_norm": 0.34928208589553833, + "learning_rate": 3.659432387312187e-05, + "loss": 0.5588, + "step": 2453 + }, + { + "epoch": 0.0008613767056746174, + "grad_norm": 0.32251986861228943, + "learning_rate": 3.652754590984975e-05, + "loss": 0.4846, + "step": 2454 + }, + { + "epoch": 0.0008617277149271335, + "grad_norm": 0.3968466520309448, + "learning_rate": 3.6460767946577626e-05, + "loss": 0.5886, + "step": 2455 + }, + { + "epoch": 0.0008620787241796497, + "grad_norm": 0.3277405798435211, + "learning_rate": 3.639398998330551e-05, + "loss": 0.4697, + "step": 2456 + }, + { + "epoch": 0.0008624297334321659, + "grad_norm": 0.3197111487388611, + "learning_rate": 3.632721202003339e-05, + "loss": 0.4728, + "step": 2457 + }, + { + "epoch": 0.0008627807426846819, + "grad_norm": 0.30383023619651794, + "learning_rate": 3.626043405676127e-05, + "loss": 0.3914, + "step": 2458 + }, + { + "epoch": 0.0008631317519371981, + "grad_norm": 0.3400476276874542, + "learning_rate": 3.6193656093489146e-05, + "loss": 0.5023, + "step": 2459 + }, + { + "epoch": 0.0008634827611897143, + "grad_norm": 0.5489293932914734, + "learning_rate": 3.6126878130217026e-05, + "loss": 0.5215, + "step": 2460 + }, + { + "epoch": 0.0008638337704422305, + "grad_norm": 0.29822349548339844, + "learning_rate": 3.606010016694491e-05, + "loss": 0.5606, + "step": 2461 + }, + { + "epoch": 0.0008641847796947465, + "grad_norm": 0.35215723514556885, + "learning_rate": 3.599332220367279e-05, + "loss": 0.5509, + "step": 2462 + }, + { + "epoch": 0.0008645357889472627, + "grad_norm": 0.307216614484787, + "learning_rate": 3.5926544240400665e-05, + "loss": 0.4325, + "step": 2463 + }, + { + "epoch": 0.0008648867981997789, + "grad_norm": 0.31825220584869385, + "learning_rate": 3.5859766277128545e-05, + "loss": 0.4299, + "step": 2464 + }, + { + "epoch": 0.000865237807452295, + "grad_norm": 0.3078344166278839, + "learning_rate": 3.579298831385643e-05, + "loss": 0.5945, + "step": 2465 + }, + { + "epoch": 0.0008655888167048111, + "grad_norm": 0.29364824295043945, + "learning_rate": 3.572621035058431e-05, + "loss": 0.5109, + "step": 2466 + }, + { + "epoch": 0.0008659398259573273, + "grad_norm": 0.364878386259079, + "learning_rate": 3.5659432387312185e-05, + "loss": 0.5408, + "step": 2467 + }, + { + "epoch": 0.0008662908352098434, + "grad_norm": 0.32669126987457275, + "learning_rate": 3.5592654424040065e-05, + "loss": 0.5167, + "step": 2468 + }, + { + "epoch": 0.0008666418444623595, + "grad_norm": 0.3356972634792328, + "learning_rate": 3.5525876460767945e-05, + "loss": 0.6229, + "step": 2469 + }, + { + "epoch": 0.0008669928537148757, + "grad_norm": 0.3334660232067108, + "learning_rate": 3.545909849749583e-05, + "loss": 0.4762, + "step": 2470 + }, + { + "epoch": 0.0008673438629673919, + "grad_norm": 0.33314821124076843, + "learning_rate": 3.5392320534223705e-05, + "loss": 0.5889, + "step": 2471 + }, + { + "epoch": 0.000867694872219908, + "grad_norm": 0.2715354263782501, + "learning_rate": 3.5325542570951585e-05, + "loss": 0.4532, + "step": 2472 + }, + { + "epoch": 0.0008680458814724241, + "grad_norm": 0.3389108180999756, + "learning_rate": 3.5258764607679465e-05, + "loss": 0.5293, + "step": 2473 + }, + { + "epoch": 0.0008683968907249403, + "grad_norm": 0.28182253241539, + "learning_rate": 3.519198664440735e-05, + "loss": 0.4768, + "step": 2474 + }, + { + "epoch": 0.0008687478999774564, + "grad_norm": 0.3153379261493683, + "learning_rate": 3.5125208681135225e-05, + "loss": 0.3779, + "step": 2475 + }, + { + "epoch": 0.0008690989092299726, + "grad_norm": 0.3339671492576599, + "learning_rate": 3.5058430717863105e-05, + "loss": 0.4738, + "step": 2476 + }, + { + "epoch": 0.0008694499184824887, + "grad_norm": 0.3346128463745117, + "learning_rate": 3.4991652754590984e-05, + "loss": 0.4353, + "step": 2477 + }, + { + "epoch": 0.0008698009277350048, + "grad_norm": 0.33985427021980286, + "learning_rate": 3.492487479131887e-05, + "loss": 0.5535, + "step": 2478 + }, + { + "epoch": 0.000870151936987521, + "grad_norm": 0.36896049976348877, + "learning_rate": 3.4858096828046744e-05, + "loss": 0.4593, + "step": 2479 + }, + { + "epoch": 0.0008705029462400372, + "grad_norm": 0.3066719174385071, + "learning_rate": 3.4791318864774624e-05, + "loss": 0.5199, + "step": 2480 + }, + { + "epoch": 0.0008708539554925533, + "grad_norm": 0.28390833735466003, + "learning_rate": 3.4724540901502504e-05, + "loss": 0.4747, + "step": 2481 + }, + { + "epoch": 0.0008712049647450694, + "grad_norm": 0.3579369783401489, + "learning_rate": 3.4657762938230384e-05, + "loss": 0.342, + "step": 2482 + }, + { + "epoch": 0.0008715559739975856, + "grad_norm": 0.2909548282623291, + "learning_rate": 3.459098497495827e-05, + "loss": 0.525, + "step": 2483 + }, + { + "epoch": 0.0008719069832501018, + "grad_norm": 0.31367257237434387, + "learning_rate": 3.4524207011686144e-05, + "loss": 0.5744, + "step": 2484 + }, + { + "epoch": 0.0008722579925026178, + "grad_norm": 0.3309953510761261, + "learning_rate": 3.4457429048414024e-05, + "loss": 0.3773, + "step": 2485 + }, + { + "epoch": 0.000872609001755134, + "grad_norm": 0.32469210028648376, + "learning_rate": 3.4390651085141904e-05, + "loss": 0.5586, + "step": 2486 + }, + { + "epoch": 0.0008729600110076502, + "grad_norm": 0.3475576341152191, + "learning_rate": 3.432387312186979e-05, + "loss": 0.5238, + "step": 2487 + }, + { + "epoch": 0.0008733110202601663, + "grad_norm": 0.2654307782649994, + "learning_rate": 3.4257095158597664e-05, + "loss": 0.4142, + "step": 2488 + }, + { + "epoch": 0.0008736620295126824, + "grad_norm": 0.3001498579978943, + "learning_rate": 3.4190317195325544e-05, + "loss": 0.5085, + "step": 2489 + }, + { + "epoch": 0.0008740130387651986, + "grad_norm": 0.36860695481300354, + "learning_rate": 3.4123539232053424e-05, + "loss": 0.5357, + "step": 2490 + }, + { + "epoch": 0.0008743640480177148, + "grad_norm": 0.3456466794013977, + "learning_rate": 3.4056761268781303e-05, + "loss": 0.5146, + "step": 2491 + }, + { + "epoch": 0.0008747150572702309, + "grad_norm": 0.33204081654548645, + "learning_rate": 3.3989983305509183e-05, + "loss": 0.5977, + "step": 2492 + }, + { + "epoch": 0.000875066066522747, + "grad_norm": 0.3318590819835663, + "learning_rate": 3.392320534223706e-05, + "loss": 0.3919, + "step": 2493 + }, + { + "epoch": 0.0008754170757752632, + "grad_norm": 0.3074159324169159, + "learning_rate": 3.385642737896494e-05, + "loss": 0.4548, + "step": 2494 + }, + { + "epoch": 0.0008757680850277793, + "grad_norm": 0.33519870042800903, + "learning_rate": 3.378964941569282e-05, + "loss": 0.5339, + "step": 2495 + }, + { + "epoch": 0.0008761190942802954, + "grad_norm": 0.2852168679237366, + "learning_rate": 3.37228714524207e-05, + "loss": 0.4363, + "step": 2496 + }, + { + "epoch": 0.0008764701035328116, + "grad_norm": 0.3491702973842621, + "learning_rate": 3.365609348914858e-05, + "loss": 0.5674, + "step": 2497 + }, + { + "epoch": 0.0008768211127853277, + "grad_norm": 0.350176066160202, + "learning_rate": 3.358931552587646e-05, + "loss": 0.4088, + "step": 2498 + }, + { + "epoch": 0.0008771721220378439, + "grad_norm": 0.37386786937713623, + "learning_rate": 3.352253756260434e-05, + "loss": 0.5911, + "step": 2499 + }, + { + "epoch": 0.00087752313129036, + "grad_norm": 0.33551308512687683, + "learning_rate": 3.345575959933222e-05, + "loss": 0.5333, + "step": 2500 + } + ], + "logging_steps": 1, + "max_steps": 3000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.622066151733658e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/marques/outputs/checkpoint-2500/training_args.bin b/marques/outputs/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd0ba520c124bb1ece608079704fa15e0236be45 --- /dev/null +++ b/marques/outputs/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09362706a3d58d219e41be1682b770b8f5069fcd630f7dbcadb71e4d4ce8859b +size 6289 diff --git a/marques/outputs/checkpoint-3000/README.md b/marques/outputs/checkpoint-3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d90a96dfe2e51221657a6e936d376789e21081f9 --- /dev/null +++ b/marques/outputs/checkpoint-3000/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/marques/outputs/checkpoint-3000/adapter_config.json b/marques/outputs/checkpoint-3000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9930a191a30254256c9550b1bdffa58b8d7aee8 --- /dev/null +++ b/marques/outputs/checkpoint-3000/adapter_config.json @@ -0,0 +1,50 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "LlamaForCausalLM", + "parent_library": "transformers.models.llama.modeling_llama", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "gate_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/marques/outputs/checkpoint-3000/adapter_model.safetensors b/marques/outputs/checkpoint-3000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..829b523e4a97e45695205a08d77853addddc5df8 --- /dev/null +++ b/marques/outputs/checkpoint-3000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f53ebd8a0f292eccf703aa21410678322775b96e30cdbfd4e46af6ac33d8ae9 +size 167832240 diff --git a/marques/outputs/checkpoint-3000/optimizer.pt b/marques/outputs/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cc23601538fbe366176449009d4b19de648b5cf --- /dev/null +++ b/marques/outputs/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4671515a579e8922ad7afdf4c24126dc8292aff0a767e427b659c96d11de304f +size 85724133 diff --git a/marques/outputs/checkpoint-3000/rng_state.pth b/marques/outputs/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0 --- /dev/null +++ b/marques/outputs/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33 +size 14645 diff --git a/marques/outputs/checkpoint-3000/scheduler.pt b/marques/outputs/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca8920de4d83bf1d67d08eb97b407544d02221c3 --- /dev/null +++ b/marques/outputs/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d85389e5248d1dd8616d04a50b2a701574ad30f831b0cac5e498b6f7baf8635d +size 1465 diff --git a/marques/outputs/checkpoint-3000/special_tokens_map.json b/marques/outputs/checkpoint-3000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68b10c7f0a479eae0c358eac6a14959b3f9acdf1 --- /dev/null +++ b/marques/outputs/checkpoint-3000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/marques/outputs/checkpoint-3000/tokenizer.json b/marques/outputs/checkpoint-3000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/marques/outputs/checkpoint-3000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/marques/outputs/checkpoint-3000/tokenizer_config.json b/marques/outputs/checkpoint-3000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..92b1d94e894e5474ebea1d171e14751be79ca3e5 --- /dev/null +++ b/marques/outputs/checkpoint-3000/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/marques/outputs/checkpoint-3000/trainer_state.json b/marques/outputs/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ced289c81ba0d3d0cbc70508a748fa794400f355 --- /dev/null +++ b/marques/outputs/checkpoint-3000/trainer_state.json @@ -0,0 +1,21034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.001053027757548432, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.5100925251614403e-07, + "grad_norm": 0.53782719373703, + "learning_rate": 0.0, + "loss": 0.5835, + "step": 1 + }, + { + "epoch": 7.020185050322881e-07, + "grad_norm": 0.6201626062393188, + "learning_rate": 4e-05, + "loss": 0.5242, + "step": 2 + }, + { + "epoch": 1.053027757548432e-06, + "grad_norm": 0.7571901082992554, + "learning_rate": 8e-05, + "loss": 0.5642, + "step": 3 + }, + { + "epoch": 1.4040370100645761e-06, + "grad_norm": 0.5588695406913757, + "learning_rate": 0.00012, + "loss": 0.4859, + "step": 4 + }, + { + "epoch": 1.75504626258072e-06, + "grad_norm": 0.7208331227302551, + "learning_rate": 0.00016, + "loss": 0.4645, + "step": 5 + }, + { + "epoch": 2.106055515096864e-06, + "grad_norm": 0.8169743418693542, + "learning_rate": 0.0002, + "loss": 0.3702, + "step": 6 + }, + { + "epoch": 2.4570647676130083e-06, + "grad_norm": 2.051530599594116, + "learning_rate": 0.00019993322203672788, + "loss": 0.4856, + "step": 7 + }, + { + "epoch": 2.8080740201291522e-06, + "grad_norm": 1.2310550212860107, + "learning_rate": 0.00019986644407345576, + "loss": 0.5192, + "step": 8 + }, + { + "epoch": 3.1590832726452962e-06, + "grad_norm": 1.612046241760254, + "learning_rate": 0.00019979966611018366, + "loss": 0.4719, + "step": 9 + }, + { + "epoch": 3.51009252516144e-06, + "grad_norm": 1.4484680891036987, + "learning_rate": 0.00019973288814691153, + "loss": 0.4416, + "step": 10 + }, + { + "epoch": 3.861101777677584e-06, + "grad_norm": 1.4529719352722168, + "learning_rate": 0.0001996661101836394, + "loss": 0.6275, + "step": 11 + }, + { + "epoch": 4.212111030193728e-06, + "grad_norm": 1.3963671922683716, + "learning_rate": 0.00019959933222036728, + "loss": 0.5874, + "step": 12 + }, + { + "epoch": 4.563120282709872e-06, + "grad_norm": 1.4744153022766113, + "learning_rate": 0.00019953255425709515, + "loss": 0.6422, + "step": 13 + }, + { + "epoch": 4.9141295352260165e-06, + "grad_norm": 0.8640050888061523, + "learning_rate": 0.00019946577629382305, + "loss": 0.5064, + "step": 14 + }, + { + "epoch": 5.26513878774216e-06, + "grad_norm": 0.7137419581413269, + "learning_rate": 0.00019939899833055092, + "loss": 0.5218, + "step": 15 + }, + { + "epoch": 5.6161480402583045e-06, + "grad_norm": 0.7769026756286621, + "learning_rate": 0.00019933222036727882, + "loss": 0.5377, + "step": 16 + }, + { + "epoch": 5.967157292774448e-06, + "grad_norm": 0.7558479905128479, + "learning_rate": 0.0001992654424040067, + "loss": 0.5054, + "step": 17 + }, + { + "epoch": 6.3181665452905924e-06, + "grad_norm": 0.8237054347991943, + "learning_rate": 0.00019919866444073457, + "loss": 0.5094, + "step": 18 + }, + { + "epoch": 6.669175797806736e-06, + "grad_norm": 1.0375059843063354, + "learning_rate": 0.00019913188647746244, + "loss": 0.5751, + "step": 19 + }, + { + "epoch": 7.02018505032288e-06, + "grad_norm": 1.075869083404541, + "learning_rate": 0.00019906510851419034, + "loss": 0.594, + "step": 20 + }, + { + "epoch": 7.371194302839024e-06, + "grad_norm": 0.8041358590126038, + "learning_rate": 0.00019899833055091822, + "loss": 0.553, + "step": 21 + }, + { + "epoch": 7.722203555355168e-06, + "grad_norm": 0.9264736771583557, + "learning_rate": 0.0001989315525876461, + "loss": 0.5555, + "step": 22 + }, + { + "epoch": 8.073212807871313e-06, + "grad_norm": 1.0074031352996826, + "learning_rate": 0.00019886477462437396, + "loss": 0.5353, + "step": 23 + }, + { + "epoch": 8.424222060387455e-06, + "grad_norm": 0.8725020885467529, + "learning_rate": 0.00019879799666110183, + "loss": 0.5557, + "step": 24 + }, + { + "epoch": 8.7752313129036e-06, + "grad_norm": 0.8867582678794861, + "learning_rate": 0.00019873121869782974, + "loss": 0.5992, + "step": 25 + }, + { + "epoch": 9.126240565419744e-06, + "grad_norm": 0.9235608577728271, + "learning_rate": 0.0001986644407345576, + "loss": 0.516, + "step": 26 + }, + { + "epoch": 9.477249817935889e-06, + "grad_norm": 0.8653218150138855, + "learning_rate": 0.00019859766277128548, + "loss": 0.5249, + "step": 27 + }, + { + "epoch": 9.828259070452033e-06, + "grad_norm": 0.7479026913642883, + "learning_rate": 0.00019853088480801335, + "loss": 0.5037, + "step": 28 + }, + { + "epoch": 1.0179268322968176e-05, + "grad_norm": 0.9531452655792236, + "learning_rate": 0.00019846410684474123, + "loss": 0.5896, + "step": 29 + }, + { + "epoch": 1.053027757548432e-05, + "grad_norm": 1.1012492179870605, + "learning_rate": 0.00019839732888146913, + "loss": 0.5139, + "step": 30 + }, + { + "epoch": 1.0881286828000465e-05, + "grad_norm": 1.0198887586593628, + "learning_rate": 0.000198330550918197, + "loss": 0.5587, + "step": 31 + }, + { + "epoch": 1.1232296080516609e-05, + "grad_norm": 0.8081266283988953, + "learning_rate": 0.00019826377295492487, + "loss": 0.4762, + "step": 32 + }, + { + "epoch": 1.1583305333032752e-05, + "grad_norm": 1.1965891122817993, + "learning_rate": 0.00019819699499165277, + "loss": 0.5719, + "step": 33 + }, + { + "epoch": 1.1934314585548896e-05, + "grad_norm": 1.214903473854065, + "learning_rate": 0.00019813021702838065, + "loss": 0.5756, + "step": 34 + }, + { + "epoch": 1.228532383806504e-05, + "grad_norm": 0.8360006213188171, + "learning_rate": 0.00019806343906510852, + "loss": 0.5688, + "step": 35 + }, + { + "epoch": 1.2636333090581185e-05, + "grad_norm": 0.8328489065170288, + "learning_rate": 0.00019799666110183642, + "loss": 0.6418, + "step": 36 + }, + { + "epoch": 1.298734234309733e-05, + "grad_norm": 1.1427714824676514, + "learning_rate": 0.0001979298831385643, + "loss": 0.6531, + "step": 37 + }, + { + "epoch": 1.3338351595613472e-05, + "grad_norm": 1.0145376920700073, + "learning_rate": 0.00019786310517529217, + "loss": 0.6473, + "step": 38 + }, + { + "epoch": 1.3689360848129616e-05, + "grad_norm": 0.8427861928939819, + "learning_rate": 0.00019779632721202004, + "loss": 0.5882, + "step": 39 + }, + { + "epoch": 1.404037010064576e-05, + "grad_norm": 0.8792659044265747, + "learning_rate": 0.00019772954924874791, + "loss": 0.608, + "step": 40 + }, + { + "epoch": 1.4391379353161905e-05, + "grad_norm": 0.9338463544845581, + "learning_rate": 0.00019766277128547581, + "loss": 0.7118, + "step": 41 + }, + { + "epoch": 1.4742388605678048e-05, + "grad_norm": 0.7554420232772827, + "learning_rate": 0.0001975959933222037, + "loss": 0.5898, + "step": 42 + }, + { + "epoch": 1.5093397858194192e-05, + "grad_norm": 0.7700084447860718, + "learning_rate": 0.00019752921535893156, + "loss": 0.6466, + "step": 43 + }, + { + "epoch": 1.5444407110710337e-05, + "grad_norm": 0.8639333248138428, + "learning_rate": 0.00019746243739565943, + "loss": 0.7253, + "step": 44 + }, + { + "epoch": 1.579541636322648e-05, + "grad_norm": 0.7760612964630127, + "learning_rate": 0.0001973956594323873, + "loss": 0.7099, + "step": 45 + }, + { + "epoch": 1.6146425615742626e-05, + "grad_norm": 0.7319066524505615, + "learning_rate": 0.0001973288814691152, + "loss": 0.6664, + "step": 46 + }, + { + "epoch": 1.6497434868258768e-05, + "grad_norm": 0.7557100057601929, + "learning_rate": 0.00019726210350584308, + "loss": 0.6318, + "step": 47 + }, + { + "epoch": 1.684844412077491e-05, + "grad_norm": 0.6420389413833618, + "learning_rate": 0.00019719532554257095, + "loss": 0.6688, + "step": 48 + }, + { + "epoch": 1.7199453373291057e-05, + "grad_norm": 0.660383939743042, + "learning_rate": 0.00019712854757929883, + "loss": 0.6204, + "step": 49 + }, + { + "epoch": 1.75504626258072e-05, + "grad_norm": 0.5614909529685974, + "learning_rate": 0.00019706176961602673, + "loss": 0.664, + "step": 50 + }, + { + "epoch": 1.7901471878323346e-05, + "grad_norm": 0.502738356590271, + "learning_rate": 0.0001969949916527546, + "loss": 0.6918, + "step": 51 + }, + { + "epoch": 1.825248113083949e-05, + "grad_norm": 0.47578102350234985, + "learning_rate": 0.0001969282136894825, + "loss": 0.6747, + "step": 52 + }, + { + "epoch": 1.860349038335563e-05, + "grad_norm": 0.5528931617736816, + "learning_rate": 0.00019686143572621037, + "loss": 0.765, + "step": 53 + }, + { + "epoch": 1.8954499635871777e-05, + "grad_norm": 0.6176997423171997, + "learning_rate": 0.00019679465776293825, + "loss": 0.5959, + "step": 54 + }, + { + "epoch": 1.930550888838792e-05, + "grad_norm": 0.43425047397613525, + "learning_rate": 0.00019672787979966612, + "loss": 0.6437, + "step": 55 + }, + { + "epoch": 1.9656518140904066e-05, + "grad_norm": 0.5135884881019592, + "learning_rate": 0.000196661101836394, + "loss": 0.7019, + "step": 56 + }, + { + "epoch": 2.000752739342021e-05, + "grad_norm": 0.4628916084766388, + "learning_rate": 0.0001965943238731219, + "loss": 0.5722, + "step": 57 + }, + { + "epoch": 2.035853664593635e-05, + "grad_norm": 0.48201897740364075, + "learning_rate": 0.00019652754590984977, + "loss": 0.6288, + "step": 58 + }, + { + "epoch": 2.0709545898452498e-05, + "grad_norm": 0.5772811770439148, + "learning_rate": 0.00019646076794657764, + "loss": 0.6067, + "step": 59 + }, + { + "epoch": 2.106055515096864e-05, + "grad_norm": 0.4976802170276642, + "learning_rate": 0.0001963939899833055, + "loss": 0.4722, + "step": 60 + }, + { + "epoch": 2.1411564403484786e-05, + "grad_norm": 0.4842129051685333, + "learning_rate": 0.00019632721202003339, + "loss": 0.5876, + "step": 61 + }, + { + "epoch": 2.176257365600093e-05, + "grad_norm": 0.46149536967277527, + "learning_rate": 0.00019626043405676129, + "loss": 0.6373, + "step": 62 + }, + { + "epoch": 2.2113582908517072e-05, + "grad_norm": 0.47199445962905884, + "learning_rate": 0.00019619365609348916, + "loss": 0.5546, + "step": 63 + }, + { + "epoch": 2.2464592161033218e-05, + "grad_norm": 0.6109340190887451, + "learning_rate": 0.00019612687813021703, + "loss": 0.6069, + "step": 64 + }, + { + "epoch": 2.281560141354936e-05, + "grad_norm": 0.5529135465621948, + "learning_rate": 0.0001960601001669449, + "loss": 0.553, + "step": 65 + }, + { + "epoch": 2.3166610666065503e-05, + "grad_norm": 0.500245213508606, + "learning_rate": 0.00019599332220367278, + "loss": 0.6149, + "step": 66 + }, + { + "epoch": 2.351761991858165e-05, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00019592654424040068, + "loss": 0.6509, + "step": 67 + }, + { + "epoch": 2.3868629171097792e-05, + "grad_norm": 0.5308504104614258, + "learning_rate": 0.00019585976627712855, + "loss": 0.7017, + "step": 68 + }, + { + "epoch": 2.4219638423613938e-05, + "grad_norm": 0.5157874822616577, + "learning_rate": 0.00019579298831385645, + "loss": 0.7125, + "step": 69 + }, + { + "epoch": 2.457064767613008e-05, + "grad_norm": 0.47787800431251526, + "learning_rate": 0.00019572621035058433, + "loss": 0.5792, + "step": 70 + }, + { + "epoch": 2.4921656928646224e-05, + "grad_norm": 0.46792763471603394, + "learning_rate": 0.0001956594323873122, + "loss": 0.7, + "step": 71 + }, + { + "epoch": 2.527266618116237e-05, + "grad_norm": 0.5394675135612488, + "learning_rate": 0.00019559265442404007, + "loss": 0.5549, + "step": 72 + }, + { + "epoch": 2.5623675433678512e-05, + "grad_norm": 0.45065200328826904, + "learning_rate": 0.00019552587646076797, + "loss": 0.6663, + "step": 73 + }, + { + "epoch": 2.597468468619466e-05, + "grad_norm": 0.4026688039302826, + "learning_rate": 0.00019545909849749584, + "loss": 0.6315, + "step": 74 + }, + { + "epoch": 2.63256939387108e-05, + "grad_norm": 0.42353659868240356, + "learning_rate": 0.00019539232053422372, + "loss": 0.5419, + "step": 75 + }, + { + "epoch": 2.6676703191226944e-05, + "grad_norm": 0.45561954379081726, + "learning_rate": 0.0001953255425709516, + "loss": 0.6624, + "step": 76 + }, + { + "epoch": 2.702771244374309e-05, + "grad_norm": 0.3954075574874878, + "learning_rate": 0.00019525876460767946, + "loss": 0.5479, + "step": 77 + }, + { + "epoch": 2.7378721696259233e-05, + "grad_norm": 0.4994329512119293, + "learning_rate": 0.00019519198664440736, + "loss": 0.7224, + "step": 78 + }, + { + "epoch": 2.7729730948775375e-05, + "grad_norm": 0.41149672865867615, + "learning_rate": 0.00019512520868113524, + "loss": 0.5621, + "step": 79 + }, + { + "epoch": 2.808074020129152e-05, + "grad_norm": 0.4199008345603943, + "learning_rate": 0.0001950584307178631, + "loss": 0.7038, + "step": 80 + }, + { + "epoch": 2.8431749453807664e-05, + "grad_norm": 0.4378969371318817, + "learning_rate": 0.00019499165275459098, + "loss": 0.6654, + "step": 81 + }, + { + "epoch": 2.878275870632381e-05, + "grad_norm": 0.4653928279876709, + "learning_rate": 0.00019492487479131886, + "loss": 0.6241, + "step": 82 + }, + { + "epoch": 2.9133767958839953e-05, + "grad_norm": 0.5166454911231995, + "learning_rate": 0.00019485809682804673, + "loss": 0.5366, + "step": 83 + }, + { + "epoch": 2.9484777211356096e-05, + "grad_norm": 0.43180733919143677, + "learning_rate": 0.00019479131886477463, + "loss": 0.6178, + "step": 84 + }, + { + "epoch": 2.9835786463872242e-05, + "grad_norm": 0.44828200340270996, + "learning_rate": 0.0001947245409015025, + "loss": 0.6706, + "step": 85 + }, + { + "epoch": 3.0186795716388385e-05, + "grad_norm": 0.384175181388855, + "learning_rate": 0.0001946577629382304, + "loss": 0.5551, + "step": 86 + }, + { + "epoch": 3.053780496890453e-05, + "grad_norm": 0.4359772503376007, + "learning_rate": 0.00019459098497495828, + "loss": 0.5626, + "step": 87 + }, + { + "epoch": 3.0888814221420673e-05, + "grad_norm": 0.4177016615867615, + "learning_rate": 0.00019452420701168615, + "loss": 0.6023, + "step": 88 + }, + { + "epoch": 3.1239823473936816e-05, + "grad_norm": 0.43592438101768494, + "learning_rate": 0.00019445742904841405, + "loss": 0.682, + "step": 89 + }, + { + "epoch": 3.159083272645296e-05, + "grad_norm": 0.48027974367141724, + "learning_rate": 0.00019439065108514192, + "loss": 0.7596, + "step": 90 + }, + { + "epoch": 3.194184197896911e-05, + "grad_norm": 0.35989537835121155, + "learning_rate": 0.0001943238731218698, + "loss": 0.6018, + "step": 91 + }, + { + "epoch": 3.229285123148525e-05, + "grad_norm": 0.48477092385292053, + "learning_rate": 0.00019425709515859767, + "loss": 0.512, + "step": 92 + }, + { + "epoch": 3.2643860484001394e-05, + "grad_norm": 0.38858646154403687, + "learning_rate": 0.00019419031719532554, + "loss": 0.6371, + "step": 93 + }, + { + "epoch": 3.2994869736517536e-05, + "grad_norm": 0.5323147177696228, + "learning_rate": 0.00019412353923205344, + "loss": 0.5221, + "step": 94 + }, + { + "epoch": 3.334587898903368e-05, + "grad_norm": 0.3784274160861969, + "learning_rate": 0.00019405676126878132, + "loss": 0.6158, + "step": 95 + }, + { + "epoch": 3.369688824154982e-05, + "grad_norm": 0.4076334834098816, + "learning_rate": 0.0001939899833055092, + "loss": 0.5535, + "step": 96 + }, + { + "epoch": 3.404789749406597e-05, + "grad_norm": 0.43930479884147644, + "learning_rate": 0.00019392320534223706, + "loss": 0.6482, + "step": 97 + }, + { + "epoch": 3.4398906746582114e-05, + "grad_norm": 0.4266909658908844, + "learning_rate": 0.00019385642737896494, + "loss": 0.6, + "step": 98 + }, + { + "epoch": 3.474991599909826e-05, + "grad_norm": 0.45353513956069946, + "learning_rate": 0.0001937896494156928, + "loss": 0.6596, + "step": 99 + }, + { + "epoch": 3.51009252516144e-05, + "grad_norm": 0.3424838185310364, + "learning_rate": 0.0001937228714524207, + "loss": 0.555, + "step": 100 + }, + { + "epoch": 3.545193450413054e-05, + "grad_norm": 0.40126165747642517, + "learning_rate": 0.00019365609348914858, + "loss": 0.6921, + "step": 101 + }, + { + "epoch": 3.580294375664669e-05, + "grad_norm": 0.36572012305259705, + "learning_rate": 0.00019358931552587646, + "loss": 0.5485, + "step": 102 + }, + { + "epoch": 3.6153953009162834e-05, + "grad_norm": 0.3972407281398773, + "learning_rate": 0.00019352253756260436, + "loss": 0.5884, + "step": 103 + }, + { + "epoch": 3.650496226167898e-05, + "grad_norm": 0.3900579512119293, + "learning_rate": 0.00019345575959933223, + "loss": 0.6664, + "step": 104 + }, + { + "epoch": 3.685597151419512e-05, + "grad_norm": 0.31666621565818787, + "learning_rate": 0.00019338898163606013, + "loss": 0.5009, + "step": 105 + }, + { + "epoch": 3.720698076671126e-05, + "grad_norm": 0.5269597172737122, + "learning_rate": 0.000193322203672788, + "loss": 0.6292, + "step": 106 + }, + { + "epoch": 3.755799001922741e-05, + "grad_norm": 0.4645126163959503, + "learning_rate": 0.00019325542570951588, + "loss": 0.636, + "step": 107 + }, + { + "epoch": 3.7908999271743555e-05, + "grad_norm": 0.3900754153728485, + "learning_rate": 0.00019318864774624375, + "loss": 0.5367, + "step": 108 + }, + { + "epoch": 3.82600085242597e-05, + "grad_norm": 0.42533883452415466, + "learning_rate": 0.00019312186978297162, + "loss": 0.6862, + "step": 109 + }, + { + "epoch": 3.861101777677584e-05, + "grad_norm": 0.6809422969818115, + "learning_rate": 0.00019305509181969952, + "loss": 0.6434, + "step": 110 + }, + { + "epoch": 3.896202702929198e-05, + "grad_norm": 0.5127860307693481, + "learning_rate": 0.0001929883138564274, + "loss": 0.6266, + "step": 111 + }, + { + "epoch": 3.931303628180813e-05, + "grad_norm": 0.5254234671592712, + "learning_rate": 0.00019292153589315527, + "loss": 0.6982, + "step": 112 + }, + { + "epoch": 3.9664045534324275e-05, + "grad_norm": 0.3699031472206116, + "learning_rate": 0.00019285475792988314, + "loss": 0.6037, + "step": 113 + }, + { + "epoch": 4.001505478684042e-05, + "grad_norm": 0.3807130455970764, + "learning_rate": 0.00019278797996661101, + "loss": 0.5861, + "step": 114 + }, + { + "epoch": 4.036606403935656e-05, + "grad_norm": 0.4455645978450775, + "learning_rate": 0.0001927212020033389, + "loss": 0.5658, + "step": 115 + }, + { + "epoch": 4.07170732918727e-05, + "grad_norm": 0.3830210864543915, + "learning_rate": 0.0001926544240400668, + "loss": 0.606, + "step": 116 + }, + { + "epoch": 4.106808254438885e-05, + "grad_norm": 0.41419631242752075, + "learning_rate": 0.00019258764607679466, + "loss": 0.6095, + "step": 117 + }, + { + "epoch": 4.1419091796904995e-05, + "grad_norm": 0.3929574489593506, + "learning_rate": 0.00019252086811352253, + "loss": 0.6464, + "step": 118 + }, + { + "epoch": 4.177010104942114e-05, + "grad_norm": 0.35958629846572876, + "learning_rate": 0.0001924540901502504, + "loss": 0.5185, + "step": 119 + }, + { + "epoch": 4.212111030193728e-05, + "grad_norm": 0.3790556490421295, + "learning_rate": 0.0001923873121869783, + "loss": 0.5156, + "step": 120 + }, + { + "epoch": 4.2472119554453423e-05, + "grad_norm": 0.37452438473701477, + "learning_rate": 0.00019232053422370618, + "loss": 0.5711, + "step": 121 + }, + { + "epoch": 4.282312880696957e-05, + "grad_norm": 0.38976770639419556, + "learning_rate": 0.00019225375626043408, + "loss": 0.6075, + "step": 122 + }, + { + "epoch": 4.3174138059485716e-05, + "grad_norm": 0.4098513424396515, + "learning_rate": 0.00019218697829716195, + "loss": 0.5312, + "step": 123 + }, + { + "epoch": 4.352514731200186e-05, + "grad_norm": 0.33890047669410706, + "learning_rate": 0.00019212020033388983, + "loss": 0.4984, + "step": 124 + }, + { + "epoch": 4.3876156564518e-05, + "grad_norm": 0.49077001214027405, + "learning_rate": 0.0001920534223706177, + "loss": 0.7159, + "step": 125 + }, + { + "epoch": 4.4227165817034144e-05, + "grad_norm": 0.41653814911842346, + "learning_rate": 0.0001919866444073456, + "loss": 0.5642, + "step": 126 + }, + { + "epoch": 4.4578175069550286e-05, + "grad_norm": 0.45710283517837524, + "learning_rate": 0.00019191986644407347, + "loss": 0.6936, + "step": 127 + }, + { + "epoch": 4.4929184322066436e-05, + "grad_norm": 0.36976873874664307, + "learning_rate": 0.00019185308848080135, + "loss": 0.5407, + "step": 128 + }, + { + "epoch": 4.528019357458258e-05, + "grad_norm": 0.42852675914764404, + "learning_rate": 0.00019178631051752922, + "loss": 0.6731, + "step": 129 + }, + { + "epoch": 4.563120282709872e-05, + "grad_norm": 0.5426310300827026, + "learning_rate": 0.0001917195325542571, + "loss": 0.5775, + "step": 130 + }, + { + "epoch": 4.5982212079614864e-05, + "grad_norm": 0.38442543148994446, + "learning_rate": 0.00019165275459098497, + "loss": 0.5994, + "step": 131 + }, + { + "epoch": 4.633322133213101e-05, + "grad_norm": 0.4298035502433777, + "learning_rate": 0.00019158597662771287, + "loss": 0.5563, + "step": 132 + }, + { + "epoch": 4.6684230584647156e-05, + "grad_norm": 0.40397605299949646, + "learning_rate": 0.00019151919866444074, + "loss": 0.6924, + "step": 133 + }, + { + "epoch": 4.70352398371633e-05, + "grad_norm": 0.4338497519493103, + "learning_rate": 0.0001914524207011686, + "loss": 0.5739, + "step": 134 + }, + { + "epoch": 4.738624908967944e-05, + "grad_norm": 0.39713653922080994, + "learning_rate": 0.0001913856427378965, + "loss": 0.4529, + "step": 135 + }, + { + "epoch": 4.7737258342195584e-05, + "grad_norm": 0.31409478187561035, + "learning_rate": 0.0001913188647746244, + "loss": 0.562, + "step": 136 + }, + { + "epoch": 4.808826759471173e-05, + "grad_norm": 0.371624618768692, + "learning_rate": 0.00019125208681135226, + "loss": 0.5288, + "step": 137 + }, + { + "epoch": 4.8439276847227877e-05, + "grad_norm": 0.4600190818309784, + "learning_rate": 0.00019118530884808016, + "loss": 0.6215, + "step": 138 + }, + { + "epoch": 4.879028609974402e-05, + "grad_norm": 0.45351359248161316, + "learning_rate": 0.00019111853088480803, + "loss": 0.686, + "step": 139 + }, + { + "epoch": 4.914129535226016e-05, + "grad_norm": 0.42282962799072266, + "learning_rate": 0.0001910517529215359, + "loss": 0.5966, + "step": 140 + }, + { + "epoch": 4.9492304604776305e-05, + "grad_norm": 0.41479986906051636, + "learning_rate": 0.00019098497495826378, + "loss": 0.5948, + "step": 141 + }, + { + "epoch": 4.984331385729245e-05, + "grad_norm": 0.40453553199768066, + "learning_rate": 0.00019091819699499168, + "loss": 0.6411, + "step": 142 + }, + { + "epoch": 5.01943231098086e-05, + "grad_norm": 0.3939369320869446, + "learning_rate": 0.00019085141903171955, + "loss": 0.5513, + "step": 143 + }, + { + "epoch": 5.054533236232474e-05, + "grad_norm": 0.3700481653213501, + "learning_rate": 0.00019078464106844743, + "loss": 0.5459, + "step": 144 + }, + { + "epoch": 5.089634161484088e-05, + "grad_norm": 0.4377487897872925, + "learning_rate": 0.0001907178631051753, + "loss": 0.6076, + "step": 145 + }, + { + "epoch": 5.1247350867357025e-05, + "grad_norm": 0.37919673323631287, + "learning_rate": 0.00019065108514190317, + "loss": 0.5207, + "step": 146 + }, + { + "epoch": 5.159836011987317e-05, + "grad_norm": 0.3841630816459656, + "learning_rate": 0.00019058430717863107, + "loss": 0.614, + "step": 147 + }, + { + "epoch": 5.194936937238932e-05, + "grad_norm": 0.43541714549064636, + "learning_rate": 0.00019051752921535895, + "loss": 0.6283, + "step": 148 + }, + { + "epoch": 5.230037862490546e-05, + "grad_norm": 0.4853285253047943, + "learning_rate": 0.00019045075125208682, + "loss": 0.5807, + "step": 149 + }, + { + "epoch": 5.26513878774216e-05, + "grad_norm": 0.3572970926761627, + "learning_rate": 0.0001903839732888147, + "loss": 0.6866, + "step": 150 + }, + { + "epoch": 5.3002397129937745e-05, + "grad_norm": 0.3674347698688507, + "learning_rate": 0.00019031719532554257, + "loss": 0.5552, + "step": 151 + }, + { + "epoch": 5.335340638245389e-05, + "grad_norm": 0.37748461961746216, + "learning_rate": 0.00019025041736227044, + "loss": 0.6278, + "step": 152 + }, + { + "epoch": 5.370441563497003e-05, + "grad_norm": 0.3788503408432007, + "learning_rate": 0.00019018363939899834, + "loss": 0.622, + "step": 153 + }, + { + "epoch": 5.405542488748618e-05, + "grad_norm": 0.3736303150653839, + "learning_rate": 0.0001901168614357262, + "loss": 0.5822, + "step": 154 + }, + { + "epoch": 5.440643414000232e-05, + "grad_norm": 0.32680070400238037, + "learning_rate": 0.0001900500834724541, + "loss": 0.5715, + "step": 155 + }, + { + "epoch": 5.4757443392518466e-05, + "grad_norm": 0.34495192766189575, + "learning_rate": 0.00018998330550918199, + "loss": 0.6497, + "step": 156 + }, + { + "epoch": 5.510845264503461e-05, + "grad_norm": 0.4244193136692047, + "learning_rate": 0.00018991652754590986, + "loss": 0.5519, + "step": 157 + }, + { + "epoch": 5.545946189755075e-05, + "grad_norm": 0.4024031162261963, + "learning_rate": 0.00018984974958263776, + "loss": 0.5339, + "step": 158 + }, + { + "epoch": 5.58104711500669e-05, + "grad_norm": 0.46051299571990967, + "learning_rate": 0.00018978297161936563, + "loss": 0.5979, + "step": 159 + }, + { + "epoch": 5.616148040258304e-05, + "grad_norm": 0.49051615595817566, + "learning_rate": 0.0001897161936560935, + "loss": 0.5563, + "step": 160 + }, + { + "epoch": 5.6512489655099186e-05, + "grad_norm": 0.43045854568481445, + "learning_rate": 0.00018964941569282138, + "loss": 0.5984, + "step": 161 + }, + { + "epoch": 5.686349890761533e-05, + "grad_norm": 0.37778228521347046, + "learning_rate": 0.00018958263772954925, + "loss": 0.5955, + "step": 162 + }, + { + "epoch": 5.721450816013147e-05, + "grad_norm": 0.3736341893672943, + "learning_rate": 0.00018951585976627715, + "loss": 0.6438, + "step": 163 + }, + { + "epoch": 5.756551741264762e-05, + "grad_norm": 0.3940117061138153, + "learning_rate": 0.00018944908180300502, + "loss": 0.503, + "step": 164 + }, + { + "epoch": 5.7916526665163763e-05, + "grad_norm": 0.4193519055843353, + "learning_rate": 0.0001893823038397329, + "loss": 0.6324, + "step": 165 + }, + { + "epoch": 5.8267535917679906e-05, + "grad_norm": 0.34481996297836304, + "learning_rate": 0.00018931552587646077, + "loss": 0.5745, + "step": 166 + }, + { + "epoch": 5.861854517019605e-05, + "grad_norm": 0.38285771012306213, + "learning_rate": 0.00018924874791318864, + "loss": 0.639, + "step": 167 + }, + { + "epoch": 5.896955442271219e-05, + "grad_norm": 0.36933982372283936, + "learning_rate": 0.00018918196994991652, + "loss": 0.6681, + "step": 168 + }, + { + "epoch": 5.932056367522834e-05, + "grad_norm": 0.36970776319503784, + "learning_rate": 0.00018911519198664442, + "loss": 0.5626, + "step": 169 + }, + { + "epoch": 5.9671572927744484e-05, + "grad_norm": 0.38494783639907837, + "learning_rate": 0.0001890484140233723, + "loss": 0.6066, + "step": 170 + }, + { + "epoch": 6.0022582180260627e-05, + "grad_norm": 0.3446069061756134, + "learning_rate": 0.00018898163606010016, + "loss": 0.6354, + "step": 171 + }, + { + "epoch": 6.037359143277677e-05, + "grad_norm": 0.4466759264469147, + "learning_rate": 0.00018891485809682806, + "loss": 0.4737, + "step": 172 + }, + { + "epoch": 6.072460068529291e-05, + "grad_norm": 0.43630918860435486, + "learning_rate": 0.00018884808013355594, + "loss": 0.6839, + "step": 173 + }, + { + "epoch": 6.107560993780906e-05, + "grad_norm": 0.37083202600479126, + "learning_rate": 0.00018878130217028384, + "loss": 0.5372, + "step": 174 + }, + { + "epoch": 6.14266191903252e-05, + "grad_norm": 0.37066200375556946, + "learning_rate": 0.0001887145242070117, + "loss": 0.6653, + "step": 175 + }, + { + "epoch": 6.177762844284135e-05, + "grad_norm": 0.5191747546195984, + "learning_rate": 0.00018864774624373958, + "loss": 0.6677, + "step": 176 + }, + { + "epoch": 6.21286376953575e-05, + "grad_norm": 0.4235158860683441, + "learning_rate": 0.00018858096828046746, + "loss": 0.5971, + "step": 177 + }, + { + "epoch": 6.247964694787363e-05, + "grad_norm": 0.405074805021286, + "learning_rate": 0.00018851419031719533, + "loss": 0.5717, + "step": 178 + }, + { + "epoch": 6.283065620038978e-05, + "grad_norm": 0.45817336440086365, + "learning_rate": 0.00018844741235392323, + "loss": 0.5878, + "step": 179 + }, + { + "epoch": 6.318166545290592e-05, + "grad_norm": 0.6313037276268005, + "learning_rate": 0.0001883806343906511, + "loss": 0.62, + "step": 180 + }, + { + "epoch": 6.353267470542207e-05, + "grad_norm": 0.41896742582321167, + "learning_rate": 0.00018831385642737898, + "loss": 0.5565, + "step": 181 + }, + { + "epoch": 6.388368395793822e-05, + "grad_norm": 0.4143432676792145, + "learning_rate": 0.00018824707846410685, + "loss": 0.5552, + "step": 182 + }, + { + "epoch": 6.423469321045435e-05, + "grad_norm": 0.38745641708374023, + "learning_rate": 0.00018818030050083472, + "loss": 0.5949, + "step": 183 + }, + { + "epoch": 6.45857024629705e-05, + "grad_norm": 0.7472612261772156, + "learning_rate": 0.0001881135225375626, + "loss": 0.6708, + "step": 184 + }, + { + "epoch": 6.493671171548664e-05, + "grad_norm": 0.4416198432445526, + "learning_rate": 0.0001880467445742905, + "loss": 0.6069, + "step": 185 + }, + { + "epoch": 6.528772096800279e-05, + "grad_norm": 0.4312993884086609, + "learning_rate": 0.00018797996661101837, + "loss": 0.5778, + "step": 186 + }, + { + "epoch": 6.563873022051894e-05, + "grad_norm": 0.4524860978126526, + "learning_rate": 0.00018791318864774624, + "loss": 0.5091, + "step": 187 + }, + { + "epoch": 6.598973947303507e-05, + "grad_norm": 0.4320828914642334, + "learning_rate": 0.00018784641068447412, + "loss": 0.6557, + "step": 188 + }, + { + "epoch": 6.634074872555122e-05, + "grad_norm": 0.6967452168464661, + "learning_rate": 0.00018777963272120202, + "loss": 0.612, + "step": 189 + }, + { + "epoch": 6.669175797806736e-05, + "grad_norm": 0.4389924705028534, + "learning_rate": 0.0001877128547579299, + "loss": 0.6271, + "step": 190 + }, + { + "epoch": 6.704276723058351e-05, + "grad_norm": 0.3693922162055969, + "learning_rate": 0.0001876460767946578, + "loss": 0.6715, + "step": 191 + }, + { + "epoch": 6.739377648309964e-05, + "grad_norm": 0.32230404019355774, + "learning_rate": 0.00018757929883138566, + "loss": 0.6344, + "step": 192 + }, + { + "epoch": 6.774478573561579e-05, + "grad_norm": 0.4440002143383026, + "learning_rate": 0.00018751252086811354, + "loss": 0.6671, + "step": 193 + }, + { + "epoch": 6.809579498813194e-05, + "grad_norm": 0.5676587820053101, + "learning_rate": 0.0001874457429048414, + "loss": 0.6818, + "step": 194 + }, + { + "epoch": 6.844680424064808e-05, + "grad_norm": 0.36207348108291626, + "learning_rate": 0.0001873789649415693, + "loss": 0.5029, + "step": 195 + }, + { + "epoch": 6.879781349316423e-05, + "grad_norm": 0.35714131593704224, + "learning_rate": 0.00018731218697829718, + "loss": 0.6127, + "step": 196 + }, + { + "epoch": 6.914882274568036e-05, + "grad_norm": 0.4285273551940918, + "learning_rate": 0.00018724540901502506, + "loss": 0.6355, + "step": 197 + }, + { + "epoch": 6.949983199819651e-05, + "grad_norm": 0.42585939168930054, + "learning_rate": 0.00018717863105175293, + "loss": 0.6302, + "step": 198 + }, + { + "epoch": 6.985084125071266e-05, + "grad_norm": 0.524303138256073, + "learning_rate": 0.0001871118530884808, + "loss": 0.6683, + "step": 199 + }, + { + "epoch": 7.02018505032288e-05, + "grad_norm": 0.39635923504829407, + "learning_rate": 0.00018704507512520868, + "loss": 0.6694, + "step": 200 + }, + { + "epoch": 7.055285975574495e-05, + "grad_norm": 0.39712437987327576, + "learning_rate": 0.00018697829716193658, + "loss": 0.5794, + "step": 201 + }, + { + "epoch": 7.090386900826108e-05, + "grad_norm": 0.4115397334098816, + "learning_rate": 0.00018691151919866445, + "loss": 0.5579, + "step": 202 + }, + { + "epoch": 7.125487826077723e-05, + "grad_norm": 0.4776385724544525, + "learning_rate": 0.00018684474123539232, + "loss": 0.5589, + "step": 203 + }, + { + "epoch": 7.160588751329338e-05, + "grad_norm": 0.35574638843536377, + "learning_rate": 0.0001867779632721202, + "loss": 0.5311, + "step": 204 + }, + { + "epoch": 7.195689676580952e-05, + "grad_norm": 0.44872432947158813, + "learning_rate": 0.00018671118530884807, + "loss": 0.635, + "step": 205 + }, + { + "epoch": 7.230790601832567e-05, + "grad_norm": 0.3511079251766205, + "learning_rate": 0.00018664440734557597, + "loss": 0.5317, + "step": 206 + }, + { + "epoch": 7.26589152708418e-05, + "grad_norm": 0.39862194657325745, + "learning_rate": 0.00018657762938230384, + "loss": 0.6653, + "step": 207 + }, + { + "epoch": 7.300992452335795e-05, + "grad_norm": 0.4046575725078583, + "learning_rate": 0.00018651085141903174, + "loss": 0.6065, + "step": 208 + }, + { + "epoch": 7.33609337758741e-05, + "grad_norm": 0.4231868088245392, + "learning_rate": 0.00018644407345575962, + "loss": 0.7078, + "step": 209 + }, + { + "epoch": 7.371194302839024e-05, + "grad_norm": 0.364700049161911, + "learning_rate": 0.0001863772954924875, + "loss": 0.6309, + "step": 210 + }, + { + "epoch": 7.406295228090639e-05, + "grad_norm": 0.5385531187057495, + "learning_rate": 0.0001863105175292154, + "loss": 0.4233, + "step": 211 + }, + { + "epoch": 7.441396153342252e-05, + "grad_norm": 0.39415115118026733, + "learning_rate": 0.00018624373956594326, + "loss": 0.5928, + "step": 212 + }, + { + "epoch": 7.476497078593867e-05, + "grad_norm": 0.6021363735198975, + "learning_rate": 0.00018617696160267113, + "loss": 0.6611, + "step": 213 + }, + { + "epoch": 7.511598003845482e-05, + "grad_norm": 0.3709903061389923, + "learning_rate": 0.000186110183639399, + "loss": 0.6136, + "step": 214 + }, + { + "epoch": 7.546698929097096e-05, + "grad_norm": 0.36710435152053833, + "learning_rate": 0.00018604340567612688, + "loss": 0.5267, + "step": 215 + }, + { + "epoch": 7.581799854348711e-05, + "grad_norm": 0.4379352033138275, + "learning_rate": 0.00018597662771285475, + "loss": 0.6429, + "step": 216 + }, + { + "epoch": 7.616900779600325e-05, + "grad_norm": 0.3408482074737549, + "learning_rate": 0.00018590984974958265, + "loss": 0.5379, + "step": 217 + }, + { + "epoch": 7.65200170485194e-05, + "grad_norm": 0.4487043023109436, + "learning_rate": 0.00018584307178631053, + "loss": 0.6582, + "step": 218 + }, + { + "epoch": 7.687102630103554e-05, + "grad_norm": 0.42003679275512695, + "learning_rate": 0.0001857762938230384, + "loss": 0.5712, + "step": 219 + }, + { + "epoch": 7.722203555355168e-05, + "grad_norm": 0.4698665738105774, + "learning_rate": 0.00018570951585976627, + "loss": 0.5715, + "step": 220 + }, + { + "epoch": 7.757304480606783e-05, + "grad_norm": 0.3777780830860138, + "learning_rate": 0.00018564273789649415, + "loss": 0.4667, + "step": 221 + }, + { + "epoch": 7.792405405858397e-05, + "grad_norm": 0.36794212460517883, + "learning_rate": 0.00018557595993322205, + "loss": 0.5382, + "step": 222 + }, + { + "epoch": 7.827506331110012e-05, + "grad_norm": 0.4582989513874054, + "learning_rate": 0.00018550918196994992, + "loss": 0.6437, + "step": 223 + }, + { + "epoch": 7.862607256361626e-05, + "grad_norm": 0.4065852761268616, + "learning_rate": 0.0001854424040066778, + "loss": 0.6928, + "step": 224 + }, + { + "epoch": 7.89770818161324e-05, + "grad_norm": 0.3857649564743042, + "learning_rate": 0.0001853756260434057, + "loss": 0.5405, + "step": 225 + }, + { + "epoch": 7.932809106864855e-05, + "grad_norm": 0.40056589245796204, + "learning_rate": 0.00018530884808013357, + "loss": 0.6425, + "step": 226 + }, + { + "epoch": 7.967910032116469e-05, + "grad_norm": 0.43137016892433167, + "learning_rate": 0.00018524207011686147, + "loss": 0.5001, + "step": 227 + }, + { + "epoch": 8.003010957368084e-05, + "grad_norm": 0.3723987340927124, + "learning_rate": 0.00018517529215358934, + "loss": 0.5118, + "step": 228 + }, + { + "epoch": 8.038111882619698e-05, + "grad_norm": 0.34196361899375916, + "learning_rate": 0.00018510851419031721, + "loss": 0.5468, + "step": 229 + }, + { + "epoch": 8.073212807871312e-05, + "grad_norm": 0.4319117069244385, + "learning_rate": 0.0001850417362270451, + "loss": 0.5703, + "step": 230 + }, + { + "epoch": 8.108313733122927e-05, + "grad_norm": 0.4467247724533081, + "learning_rate": 0.00018497495826377296, + "loss": 0.6536, + "step": 231 + }, + { + "epoch": 8.14341465837454e-05, + "grad_norm": 0.3569909632205963, + "learning_rate": 0.00018490818030050083, + "loss": 0.5335, + "step": 232 + }, + { + "epoch": 8.178515583626156e-05, + "grad_norm": 0.33486437797546387, + "learning_rate": 0.00018484140233722873, + "loss": 0.6803, + "step": 233 + }, + { + "epoch": 8.21361650887777e-05, + "grad_norm": 0.3783140480518341, + "learning_rate": 0.0001847746243739566, + "loss": 0.6361, + "step": 234 + }, + { + "epoch": 8.248717434129384e-05, + "grad_norm": 0.4844662547111511, + "learning_rate": 0.00018470784641068448, + "loss": 0.5322, + "step": 235 + }, + { + "epoch": 8.283818359380999e-05, + "grad_norm": 0.508406400680542, + "learning_rate": 0.00018464106844741235, + "loss": 0.6676, + "step": 236 + }, + { + "epoch": 8.318919284632613e-05, + "grad_norm": 0.3710225820541382, + "learning_rate": 0.00018457429048414023, + "loss": 0.6656, + "step": 237 + }, + { + "epoch": 8.354020209884228e-05, + "grad_norm": 0.3757292628288269, + "learning_rate": 0.00018450751252086813, + "loss": 0.6095, + "step": 238 + }, + { + "epoch": 8.389121135135843e-05, + "grad_norm": 0.40651261806488037, + "learning_rate": 0.000184440734557596, + "loss": 0.6626, + "step": 239 + }, + { + "epoch": 8.424222060387456e-05, + "grad_norm": 0.40700778365135193, + "learning_rate": 0.00018437395659432387, + "loss": 0.5328, + "step": 240 + }, + { + "epoch": 8.459322985639071e-05, + "grad_norm": 0.5067440867424011, + "learning_rate": 0.00018430717863105175, + "loss": 0.4811, + "step": 241 + }, + { + "epoch": 8.494423910890685e-05, + "grad_norm": 0.3934602737426758, + "learning_rate": 0.00018424040066777965, + "loss": 0.5691, + "step": 242 + }, + { + "epoch": 8.5295248361423e-05, + "grad_norm": 0.3360019624233246, + "learning_rate": 0.00018417362270450752, + "loss": 0.5542, + "step": 243 + }, + { + "epoch": 8.564625761393915e-05, + "grad_norm": 0.4023631513118744, + "learning_rate": 0.00018410684474123542, + "loss": 0.5192, + "step": 244 + }, + { + "epoch": 8.599726686645528e-05, + "grad_norm": 0.41704171895980835, + "learning_rate": 0.0001840400667779633, + "loss": 0.5018, + "step": 245 + }, + { + "epoch": 8.634827611897143e-05, + "grad_norm": 0.361977756023407, + "learning_rate": 0.00018397328881469117, + "loss": 0.6193, + "step": 246 + }, + { + "epoch": 8.669928537148757e-05, + "grad_norm": 0.37774717807769775, + "learning_rate": 0.00018390651085141904, + "loss": 0.5552, + "step": 247 + }, + { + "epoch": 8.705029462400372e-05, + "grad_norm": 0.3408471941947937, + "learning_rate": 0.0001838397328881469, + "loss": 0.5876, + "step": 248 + }, + { + "epoch": 8.740130387651985e-05, + "grad_norm": 0.3892226815223694, + "learning_rate": 0.0001837729549248748, + "loss": 0.4227, + "step": 249 + }, + { + "epoch": 8.7752313129036e-05, + "grad_norm": 0.5315036177635193, + "learning_rate": 0.00018370617696160269, + "loss": 0.5826, + "step": 250 + }, + { + "epoch": 8.810332238155215e-05, + "grad_norm": 0.35433024168014526, + "learning_rate": 0.00018363939899833056, + "loss": 0.5992, + "step": 251 + }, + { + "epoch": 8.845433163406829e-05, + "grad_norm": 0.34777382016181946, + "learning_rate": 0.00018357262103505843, + "loss": 0.4973, + "step": 252 + }, + { + "epoch": 8.880534088658444e-05, + "grad_norm": 0.3936387002468109, + "learning_rate": 0.0001835058430717863, + "loss": 0.6254, + "step": 253 + }, + { + "epoch": 8.915635013910057e-05, + "grad_norm": 0.4009217917919159, + "learning_rate": 0.0001834390651085142, + "loss": 0.4843, + "step": 254 + }, + { + "epoch": 8.950735939161672e-05, + "grad_norm": 0.4863683879375458, + "learning_rate": 0.00018337228714524208, + "loss": 0.5204, + "step": 255 + }, + { + "epoch": 8.985836864413287e-05, + "grad_norm": 0.6100988984107971, + "learning_rate": 0.00018330550918196995, + "loss": 0.7296, + "step": 256 + }, + { + "epoch": 9.020937789664901e-05, + "grad_norm": 0.40949374437332153, + "learning_rate": 0.00018323873121869782, + "loss": 0.5707, + "step": 257 + }, + { + "epoch": 9.056038714916516e-05, + "grad_norm": 0.47316402196884155, + "learning_rate": 0.0001831719532554257, + "loss": 0.6655, + "step": 258 + }, + { + "epoch": 9.091139640168129e-05, + "grad_norm": 0.4053696393966675, + "learning_rate": 0.0001831051752921536, + "loss": 0.5822, + "step": 259 + }, + { + "epoch": 9.126240565419744e-05, + "grad_norm": 0.4582972228527069, + "learning_rate": 0.00018303839732888147, + "loss": 0.5475, + "step": 260 + }, + { + "epoch": 9.161341490671359e-05, + "grad_norm": 0.38666802644729614, + "learning_rate": 0.00018297161936560937, + "loss": 0.4744, + "step": 261 + }, + { + "epoch": 9.196442415922973e-05, + "grad_norm": 0.31954991817474365, + "learning_rate": 0.00018290484140233724, + "loss": 0.6337, + "step": 262 + }, + { + "epoch": 9.231543341174588e-05, + "grad_norm": 0.3590424358844757, + "learning_rate": 0.00018283806343906512, + "loss": 0.5683, + "step": 263 + }, + { + "epoch": 9.266644266426201e-05, + "grad_norm": 0.4042195975780487, + "learning_rate": 0.000182771285475793, + "loss": 0.6142, + "step": 264 + }, + { + "epoch": 9.301745191677816e-05, + "grad_norm": 0.3474234342575073, + "learning_rate": 0.0001827045075125209, + "loss": 0.6035, + "step": 265 + }, + { + "epoch": 9.336846116929431e-05, + "grad_norm": 0.337091326713562, + "learning_rate": 0.00018263772954924876, + "loss": 0.6107, + "step": 266 + }, + { + "epoch": 9.371947042181045e-05, + "grad_norm": 0.3313732445240021, + "learning_rate": 0.00018257095158597664, + "loss": 0.6491, + "step": 267 + }, + { + "epoch": 9.40704796743266e-05, + "grad_norm": 0.3931679129600525, + "learning_rate": 0.0001825041736227045, + "loss": 0.5492, + "step": 268 + }, + { + "epoch": 9.442148892684273e-05, + "grad_norm": 0.5848420262336731, + "learning_rate": 0.00018243739565943238, + "loss": 0.7091, + "step": 269 + }, + { + "epoch": 9.477249817935888e-05, + "grad_norm": 0.4851846992969513, + "learning_rate": 0.00018237061769616028, + "loss": 0.5856, + "step": 270 + }, + { + "epoch": 9.512350743187503e-05, + "grad_norm": 0.3434993326663971, + "learning_rate": 0.00018230383973288816, + "loss": 0.5085, + "step": 271 + }, + { + "epoch": 9.547451668439117e-05, + "grad_norm": 0.2978988587856293, + "learning_rate": 0.00018223706176961603, + "loss": 0.481, + "step": 272 + }, + { + "epoch": 9.582552593690732e-05, + "grad_norm": 0.34215858578681946, + "learning_rate": 0.0001821702838063439, + "loss": 0.5723, + "step": 273 + }, + { + "epoch": 9.617653518942345e-05, + "grad_norm": 0.43445509672164917, + "learning_rate": 0.00018210350584307178, + "loss": 0.5691, + "step": 274 + }, + { + "epoch": 9.65275444419396e-05, + "grad_norm": 0.36094945669174194, + "learning_rate": 0.00018203672787979968, + "loss": 0.5543, + "step": 275 + }, + { + "epoch": 9.687855369445575e-05, + "grad_norm": 0.386106014251709, + "learning_rate": 0.00018196994991652755, + "loss": 0.5561, + "step": 276 + }, + { + "epoch": 9.722956294697189e-05, + "grad_norm": 0.36676689982414246, + "learning_rate": 0.00018190317195325542, + "loss": 0.5479, + "step": 277 + }, + { + "epoch": 9.758057219948804e-05, + "grad_norm": 0.37988394498825073, + "learning_rate": 0.00018183639398998332, + "loss": 0.5772, + "step": 278 + }, + { + "epoch": 9.793158145200417e-05, + "grad_norm": 0.4024789035320282, + "learning_rate": 0.0001817696160267112, + "loss": 0.6065, + "step": 279 + }, + { + "epoch": 9.828259070452032e-05, + "grad_norm": 0.3697255551815033, + "learning_rate": 0.0001817028380634391, + "loss": 0.5021, + "step": 280 + }, + { + "epoch": 9.863359995703647e-05, + "grad_norm": 0.43579426407814026, + "learning_rate": 0.00018163606010016697, + "loss": 0.555, + "step": 281 + }, + { + "epoch": 9.898460920955261e-05, + "grad_norm": 0.4760832190513611, + "learning_rate": 0.00018156928213689484, + "loss": 0.6438, + "step": 282 + }, + { + "epoch": 9.933561846206876e-05, + "grad_norm": 0.45258408784866333, + "learning_rate": 0.00018150250417362272, + "loss": 0.4717, + "step": 283 + }, + { + "epoch": 9.96866277145849e-05, + "grad_norm": 0.428108274936676, + "learning_rate": 0.0001814357262103506, + "loss": 0.6029, + "step": 284 + }, + { + "epoch": 0.00010003763696710104, + "grad_norm": 0.3999852240085602, + "learning_rate": 0.00018136894824707846, + "loss": 0.4524, + "step": 285 + }, + { + "epoch": 0.0001003886462196172, + "grad_norm": 0.44319403171539307, + "learning_rate": 0.00018130217028380636, + "loss": 0.6619, + "step": 286 + }, + { + "epoch": 0.00010073965547213333, + "grad_norm": 0.43008357286453247, + "learning_rate": 0.00018123539232053424, + "loss": 0.6105, + "step": 287 + }, + { + "epoch": 0.00010109066472464948, + "grad_norm": 0.38037821650505066, + "learning_rate": 0.0001811686143572621, + "loss": 0.6649, + "step": 288 + }, + { + "epoch": 0.00010144167397716562, + "grad_norm": 0.3713517487049103, + "learning_rate": 0.00018110183639398998, + "loss": 0.6381, + "step": 289 + }, + { + "epoch": 0.00010179268322968176, + "grad_norm": 0.3437170386314392, + "learning_rate": 0.00018103505843071786, + "loss": 0.4563, + "step": 290 + }, + { + "epoch": 0.00010214369248219791, + "grad_norm": 0.3661468029022217, + "learning_rate": 0.00018096828046744576, + "loss": 0.606, + "step": 291 + }, + { + "epoch": 0.00010249470173471405, + "grad_norm": 0.36346200108528137, + "learning_rate": 0.00018090150250417363, + "loss": 0.5895, + "step": 292 + }, + { + "epoch": 0.0001028457109872302, + "grad_norm": 0.31052225828170776, + "learning_rate": 0.0001808347245409015, + "loss": 0.4409, + "step": 293 + }, + { + "epoch": 0.00010319672023974634, + "grad_norm": 0.37012970447540283, + "learning_rate": 0.00018076794657762938, + "loss": 0.505, + "step": 294 + }, + { + "epoch": 0.00010354772949226248, + "grad_norm": 0.3958667814731598, + "learning_rate": 0.00018070116861435728, + "loss": 0.5371, + "step": 295 + }, + { + "epoch": 0.00010389873874477863, + "grad_norm": 0.4892179071903229, + "learning_rate": 0.00018063439065108515, + "loss": 0.6737, + "step": 296 + }, + { + "epoch": 0.00010424974799729477, + "grad_norm": 0.41874751448631287, + "learning_rate": 0.00018056761268781305, + "loss": 0.651, + "step": 297 + }, + { + "epoch": 0.00010460075724981092, + "grad_norm": 0.4167911410331726, + "learning_rate": 0.00018050083472454092, + "loss": 0.5531, + "step": 298 + }, + { + "epoch": 0.00010495176650232706, + "grad_norm": 0.3758225440979004, + "learning_rate": 0.0001804340567612688, + "loss": 0.6285, + "step": 299 + }, + { + "epoch": 0.0001053027757548432, + "grad_norm": 0.3688598573207855, + "learning_rate": 0.00018036727879799667, + "loss": 0.5219, + "step": 300 + }, + { + "epoch": 0.00010565378500735934, + "grad_norm": 0.3501751124858856, + "learning_rate": 0.00018030050083472454, + "loss": 0.6351, + "step": 301 + }, + { + "epoch": 0.00010600479425987549, + "grad_norm": 0.42876511812210083, + "learning_rate": 0.00018023372287145244, + "loss": 0.544, + "step": 302 + }, + { + "epoch": 0.00010635580351239164, + "grad_norm": 0.47046172618865967, + "learning_rate": 0.00018016694490818031, + "loss": 0.6304, + "step": 303 + }, + { + "epoch": 0.00010670681276490778, + "grad_norm": 0.402271032333374, + "learning_rate": 0.0001801001669449082, + "loss": 0.5039, + "step": 304 + }, + { + "epoch": 0.00010705782201742393, + "grad_norm": 0.41232413053512573, + "learning_rate": 0.00018003338898163606, + "loss": 0.5892, + "step": 305 + }, + { + "epoch": 0.00010740883126994006, + "grad_norm": 0.3628154993057251, + "learning_rate": 0.00017996661101836393, + "loss": 0.5737, + "step": 306 + }, + { + "epoch": 0.00010775984052245621, + "grad_norm": 0.4291020631790161, + "learning_rate": 0.00017989983305509183, + "loss": 0.6597, + "step": 307 + }, + { + "epoch": 0.00010811084977497236, + "grad_norm": 0.33218181133270264, + "learning_rate": 0.0001798330550918197, + "loss": 0.5726, + "step": 308 + }, + { + "epoch": 0.0001084618590274885, + "grad_norm": 0.3439387381076813, + "learning_rate": 0.00017976627712854758, + "loss": 0.5615, + "step": 309 + }, + { + "epoch": 0.00010881286828000465, + "grad_norm": 0.3523644208908081, + "learning_rate": 0.00017969949916527545, + "loss": 0.4968, + "step": 310 + }, + { + "epoch": 0.00010916387753252078, + "grad_norm": 0.4045630991458893, + "learning_rate": 0.00017963272120200333, + "loss": 0.6425, + "step": 311 + }, + { + "epoch": 0.00010951488678503693, + "grad_norm": 0.3726767599582672, + "learning_rate": 0.00017956594323873123, + "loss": 0.6575, + "step": 312 + }, + { + "epoch": 0.00010986589603755308, + "grad_norm": 0.32131972908973694, + "learning_rate": 0.0001794991652754591, + "loss": 0.5146, + "step": 313 + }, + { + "epoch": 0.00011021690529006922, + "grad_norm": 0.5013764500617981, + "learning_rate": 0.000179432387312187, + "loss": 0.53, + "step": 314 + }, + { + "epoch": 0.00011056791454258537, + "grad_norm": 0.36830246448516846, + "learning_rate": 0.00017936560934891487, + "loss": 0.6291, + "step": 315 + }, + { + "epoch": 0.0001109189237951015, + "grad_norm": 0.3587378263473511, + "learning_rate": 0.00017929883138564275, + "loss": 0.4954, + "step": 316 + }, + { + "epoch": 0.00011126993304761765, + "grad_norm": 0.3480195105075836, + "learning_rate": 0.00017923205342237062, + "loss": 0.606, + "step": 317 + }, + { + "epoch": 0.0001116209423001338, + "grad_norm": 0.38415858149528503, + "learning_rate": 0.00017916527545909852, + "loss": 0.7281, + "step": 318 + }, + { + "epoch": 0.00011197195155264994, + "grad_norm": 0.35853826999664307, + "learning_rate": 0.0001790984974958264, + "loss": 0.5851, + "step": 319 + }, + { + "epoch": 0.00011232296080516609, + "grad_norm": 0.42092210054397583, + "learning_rate": 0.00017903171953255427, + "loss": 0.5324, + "step": 320 + }, + { + "epoch": 0.00011267397005768222, + "grad_norm": 0.34538987278938293, + "learning_rate": 0.00017896494156928214, + "loss": 0.6387, + "step": 321 + }, + { + "epoch": 0.00011302497931019837, + "grad_norm": 0.38299745321273804, + "learning_rate": 0.00017889816360601, + "loss": 0.6013, + "step": 322 + }, + { + "epoch": 0.00011337598856271452, + "grad_norm": 0.32100436091423035, + "learning_rate": 0.0001788313856427379, + "loss": 0.4627, + "step": 323 + }, + { + "epoch": 0.00011372699781523066, + "grad_norm": 0.3458426594734192, + "learning_rate": 0.0001787646076794658, + "loss": 0.5865, + "step": 324 + }, + { + "epoch": 0.0001140780070677468, + "grad_norm": 0.33228665590286255, + "learning_rate": 0.00017869782971619366, + "loss": 0.4611, + "step": 325 + }, + { + "epoch": 0.00011442901632026294, + "grad_norm": 0.38747021555900574, + "learning_rate": 0.00017863105175292153, + "loss": 0.5777, + "step": 326 + }, + { + "epoch": 0.00011478002557277909, + "grad_norm": 0.3888608515262604, + "learning_rate": 0.0001785642737896494, + "loss": 0.5664, + "step": 327 + }, + { + "epoch": 0.00011513103482529524, + "grad_norm": 0.4084737002849579, + "learning_rate": 0.0001784974958263773, + "loss": 0.5939, + "step": 328 + }, + { + "epoch": 0.00011548204407781138, + "grad_norm": 0.4964492917060852, + "learning_rate": 0.00017843071786310518, + "loss": 0.6256, + "step": 329 + }, + { + "epoch": 0.00011583305333032753, + "grad_norm": 0.37329745292663574, + "learning_rate": 0.00017836393989983305, + "loss": 0.5388, + "step": 330 + }, + { + "epoch": 0.00011618406258284366, + "grad_norm": 0.37680140137672424, + "learning_rate": 0.00017829716193656095, + "loss": 0.6203, + "step": 331 + }, + { + "epoch": 0.00011653507183535981, + "grad_norm": 0.4162957966327667, + "learning_rate": 0.00017823038397328883, + "loss": 0.6478, + "step": 332 + }, + { + "epoch": 0.00011688608108787596, + "grad_norm": 0.3473896086215973, + "learning_rate": 0.0001781636060100167, + "loss": 0.589, + "step": 333 + }, + { + "epoch": 0.0001172370903403921, + "grad_norm": 0.4039511978626251, + "learning_rate": 0.0001780968280467446, + "loss": 0.5681, + "step": 334 + }, + { + "epoch": 0.00011758809959290825, + "grad_norm": 0.3135715425014496, + "learning_rate": 0.00017803005008347247, + "loss": 0.5069, + "step": 335 + }, + { + "epoch": 0.00011793910884542438, + "grad_norm": 0.4296559989452362, + "learning_rate": 0.00017796327212020035, + "loss": 0.5413, + "step": 336 + }, + { + "epoch": 0.00011829011809794053, + "grad_norm": 0.4197536110877991, + "learning_rate": 0.00017789649415692822, + "loss": 0.694, + "step": 337 + }, + { + "epoch": 0.00011864112735045668, + "grad_norm": 0.3633468449115753, + "learning_rate": 0.0001778297161936561, + "loss": 0.5475, + "step": 338 + }, + { + "epoch": 0.00011899213660297282, + "grad_norm": 0.2867147922515869, + "learning_rate": 0.000177762938230384, + "loss": 0.485, + "step": 339 + }, + { + "epoch": 0.00011934314585548897, + "grad_norm": 0.3445490300655365, + "learning_rate": 0.00017769616026711187, + "loss": 0.6304, + "step": 340 + }, + { + "epoch": 0.0001196941551080051, + "grad_norm": 0.31692221760749817, + "learning_rate": 0.00017762938230383974, + "loss": 0.5804, + "step": 341 + }, + { + "epoch": 0.00012004516436052125, + "grad_norm": 0.31391167640686035, + "learning_rate": 0.0001775626043405676, + "loss": 0.5945, + "step": 342 + }, + { + "epoch": 0.0001203961736130374, + "grad_norm": 0.3484472632408142, + "learning_rate": 0.00017749582637729548, + "loss": 0.6577, + "step": 343 + }, + { + "epoch": 0.00012074718286555354, + "grad_norm": 0.37430596351623535, + "learning_rate": 0.00017742904841402339, + "loss": 0.6854, + "step": 344 + }, + { + "epoch": 0.00012109819211806969, + "grad_norm": 0.34305211901664734, + "learning_rate": 0.00017736227045075126, + "loss": 0.5123, + "step": 345 + }, + { + "epoch": 0.00012144920137058582, + "grad_norm": 0.3398534059524536, + "learning_rate": 0.00017729549248747913, + "loss": 0.5602, + "step": 346 + }, + { + "epoch": 0.00012180021062310197, + "grad_norm": 0.4278014600276947, + "learning_rate": 0.000177228714524207, + "loss": 0.5152, + "step": 347 + }, + { + "epoch": 0.00012215121987561812, + "grad_norm": 0.4011085629463196, + "learning_rate": 0.0001771619365609349, + "loss": 0.6217, + "step": 348 + }, + { + "epoch": 0.00012250222912813427, + "grad_norm": 0.3425695598125458, + "learning_rate": 0.00017709515859766278, + "loss": 0.5037, + "step": 349 + }, + { + "epoch": 0.0001228532383806504, + "grad_norm": 0.34036242961883545, + "learning_rate": 0.00017702838063439068, + "loss": 0.649, + "step": 350 + }, + { + "epoch": 0.00012320424763316654, + "grad_norm": 0.5631874203681946, + "learning_rate": 0.00017696160267111855, + "loss": 0.5656, + "step": 351 + }, + { + "epoch": 0.0001235552568856827, + "grad_norm": 0.4195176661014557, + "learning_rate": 0.00017689482470784642, + "loss": 0.6899, + "step": 352 + }, + { + "epoch": 0.00012390626613819884, + "grad_norm": 0.41814154386520386, + "learning_rate": 0.0001768280467445743, + "loss": 0.551, + "step": 353 + }, + { + "epoch": 0.000124257275390715, + "grad_norm": 0.3374340534210205, + "learning_rate": 0.00017676126878130217, + "loss": 0.7022, + "step": 354 + }, + { + "epoch": 0.00012460828464323112, + "grad_norm": 0.41464921832084656, + "learning_rate": 0.00017669449081803007, + "loss": 0.5301, + "step": 355 + }, + { + "epoch": 0.00012495929389574726, + "grad_norm": 0.4443178176879883, + "learning_rate": 0.00017662771285475794, + "loss": 0.5487, + "step": 356 + }, + { + "epoch": 0.00012531030314826341, + "grad_norm": 0.3389272093772888, + "learning_rate": 0.00017656093489148582, + "loss": 0.581, + "step": 357 + }, + { + "epoch": 0.00012566131240077956, + "grad_norm": 0.29650986194610596, + "learning_rate": 0.0001764941569282137, + "loss": 0.5801, + "step": 358 + }, + { + "epoch": 0.0001260123216532957, + "grad_norm": 0.40271905064582825, + "learning_rate": 0.00017642737896494156, + "loss": 0.6738, + "step": 359 + }, + { + "epoch": 0.00012636333090581184, + "grad_norm": 0.352225661277771, + "learning_rate": 0.00017636060100166946, + "loss": 0.5727, + "step": 360 + }, + { + "epoch": 0.00012671434015832798, + "grad_norm": 0.3469563126564026, + "learning_rate": 0.00017629382303839734, + "loss": 0.5188, + "step": 361 + }, + { + "epoch": 0.00012706534941084413, + "grad_norm": 0.30644670128822327, + "learning_rate": 0.0001762270450751252, + "loss": 0.497, + "step": 362 + }, + { + "epoch": 0.00012741635866336028, + "grad_norm": 0.3472917377948761, + "learning_rate": 0.00017616026711185308, + "loss": 0.6363, + "step": 363 + }, + { + "epoch": 0.00012776736791587643, + "grad_norm": 0.37184756994247437, + "learning_rate": 0.00017609348914858096, + "loss": 0.5223, + "step": 364 + }, + { + "epoch": 0.00012811837716839256, + "grad_norm": 0.3247138559818268, + "learning_rate": 0.00017602671118530886, + "loss": 0.5457, + "step": 365 + }, + { + "epoch": 0.0001284693864209087, + "grad_norm": 0.5236158967018127, + "learning_rate": 0.00017595993322203673, + "loss": 0.615, + "step": 366 + }, + { + "epoch": 0.00012882039567342485, + "grad_norm": 0.33708465099334717, + "learning_rate": 0.00017589315525876463, + "loss": 0.6163, + "step": 367 + }, + { + "epoch": 0.000129171404925941, + "grad_norm": 0.33848705887794495, + "learning_rate": 0.0001758263772954925, + "loss": 0.4229, + "step": 368 + }, + { + "epoch": 0.00012952241417845715, + "grad_norm": 0.5827682018280029, + "learning_rate": 0.00017575959933222038, + "loss": 0.5668, + "step": 369 + }, + { + "epoch": 0.00012987342343097328, + "grad_norm": 0.36217448115348816, + "learning_rate": 0.00017569282136894825, + "loss": 0.4983, + "step": 370 + }, + { + "epoch": 0.00013022443268348943, + "grad_norm": 0.329414963722229, + "learning_rate": 0.00017562604340567615, + "loss": 0.4281, + "step": 371 + }, + { + "epoch": 0.00013057544193600557, + "grad_norm": 0.36746612191200256, + "learning_rate": 0.00017555926544240402, + "loss": 0.6629, + "step": 372 + }, + { + "epoch": 0.00013092645118852172, + "grad_norm": 0.3954717516899109, + "learning_rate": 0.0001754924874791319, + "loss": 0.5784, + "step": 373 + }, + { + "epoch": 0.00013127746044103787, + "grad_norm": 0.41279932856559753, + "learning_rate": 0.00017542570951585977, + "loss": 0.5994, + "step": 374 + }, + { + "epoch": 0.000131628469693554, + "grad_norm": 0.3019951581954956, + "learning_rate": 0.00017535893155258764, + "loss": 0.5584, + "step": 375 + }, + { + "epoch": 0.00013197947894607015, + "grad_norm": 0.3079768121242523, + "learning_rate": 0.00017529215358931554, + "loss": 0.5904, + "step": 376 + }, + { + "epoch": 0.0001323304881985863, + "grad_norm": 0.5678027272224426, + "learning_rate": 0.00017522537562604342, + "loss": 0.6441, + "step": 377 + }, + { + "epoch": 0.00013268149745110244, + "grad_norm": 0.38624581694602966, + "learning_rate": 0.0001751585976627713, + "loss": 0.5582, + "step": 378 + }, + { + "epoch": 0.0001330325067036186, + "grad_norm": 0.4368002712726593, + "learning_rate": 0.00017509181969949916, + "loss": 0.686, + "step": 379 + }, + { + "epoch": 0.00013338351595613472, + "grad_norm": 0.3409269154071808, + "learning_rate": 0.00017502504173622704, + "loss": 0.582, + "step": 380 + }, + { + "epoch": 0.00013373452520865087, + "grad_norm": 0.3772698938846588, + "learning_rate": 0.0001749582637729549, + "loss": 0.5314, + "step": 381 + }, + { + "epoch": 0.00013408553446116702, + "grad_norm": 0.3791707158088684, + "learning_rate": 0.0001748914858096828, + "loss": 0.6143, + "step": 382 + }, + { + "epoch": 0.00013443654371368317, + "grad_norm": 0.4441101551055908, + "learning_rate": 0.0001748247078464107, + "loss": 0.5726, + "step": 383 + }, + { + "epoch": 0.0001347875529661993, + "grad_norm": 0.4160211980342865, + "learning_rate": 0.00017475792988313858, + "loss": 0.6003, + "step": 384 + }, + { + "epoch": 0.00013513856221871544, + "grad_norm": 0.41698628664016724, + "learning_rate": 0.00017469115191986646, + "loss": 0.4539, + "step": 385 + }, + { + "epoch": 0.00013548957147123159, + "grad_norm": 0.337007999420166, + "learning_rate": 0.00017462437395659433, + "loss": 0.5176, + "step": 386 + }, + { + "epoch": 0.00013584058072374774, + "grad_norm": 0.30926409363746643, + "learning_rate": 0.00017455759599332223, + "loss": 0.6072, + "step": 387 + }, + { + "epoch": 0.00013619158997626389, + "grad_norm": 0.3663052022457123, + "learning_rate": 0.0001744908180300501, + "loss": 0.538, + "step": 388 + }, + { + "epoch": 0.00013654259922878, + "grad_norm": 0.3410074710845947, + "learning_rate": 0.00017442404006677798, + "loss": 0.5687, + "step": 389 + }, + { + "epoch": 0.00013689360848129616, + "grad_norm": 0.5266095399856567, + "learning_rate": 0.00017435726210350585, + "loss": 0.6685, + "step": 390 + }, + { + "epoch": 0.0001372446177338123, + "grad_norm": 0.4020686149597168, + "learning_rate": 0.00017429048414023372, + "loss": 0.586, + "step": 391 + }, + { + "epoch": 0.00013759562698632846, + "grad_norm": 0.39995548129081726, + "learning_rate": 0.00017422370617696162, + "loss": 0.6958, + "step": 392 + }, + { + "epoch": 0.0001379466362388446, + "grad_norm": 0.4024721682071686, + "learning_rate": 0.0001741569282136895, + "loss": 0.6411, + "step": 393 + }, + { + "epoch": 0.00013829764549136073, + "grad_norm": 0.38193392753601074, + "learning_rate": 0.00017409015025041737, + "loss": 0.5857, + "step": 394 + }, + { + "epoch": 0.00013864865474387688, + "grad_norm": 0.39786526560783386, + "learning_rate": 0.00017402337228714524, + "loss": 0.5215, + "step": 395 + }, + { + "epoch": 0.00013899966399639303, + "grad_norm": 0.49223974347114563, + "learning_rate": 0.00017395659432387311, + "loss": 0.5881, + "step": 396 + }, + { + "epoch": 0.00013935067324890918, + "grad_norm": 0.3398894667625427, + "learning_rate": 0.00017388981636060101, + "loss": 0.5466, + "step": 397 + }, + { + "epoch": 0.00013970168250142533, + "grad_norm": 0.34891223907470703, + "learning_rate": 0.0001738230383973289, + "loss": 0.5901, + "step": 398 + }, + { + "epoch": 0.00014005269175394145, + "grad_norm": 0.47644108533859253, + "learning_rate": 0.00017375626043405676, + "loss": 0.5075, + "step": 399 + }, + { + "epoch": 0.0001404037010064576, + "grad_norm": 0.42530229687690735, + "learning_rate": 0.00017368948247078466, + "loss": 0.663, + "step": 400 + }, + { + "epoch": 0.00014075471025897375, + "grad_norm": 0.30858534574508667, + "learning_rate": 0.00017362270450751253, + "loss": 0.4724, + "step": 401 + }, + { + "epoch": 0.0001411057195114899, + "grad_norm": 0.42453449964523315, + "learning_rate": 0.0001735559265442404, + "loss": 0.6074, + "step": 402 + }, + { + "epoch": 0.00014145672876400605, + "grad_norm": 0.3964505195617676, + "learning_rate": 0.0001734891485809683, + "loss": 0.4913, + "step": 403 + }, + { + "epoch": 0.00014180773801652217, + "grad_norm": 0.3317703902721405, + "learning_rate": 0.00017342237061769618, + "loss": 0.5504, + "step": 404 + }, + { + "epoch": 0.00014215874726903832, + "grad_norm": 0.3912264108657837, + "learning_rate": 0.00017335559265442405, + "loss": 0.6301, + "step": 405 + }, + { + "epoch": 0.00014250975652155447, + "grad_norm": 0.3582877218723297, + "learning_rate": 0.00017328881469115193, + "loss": 0.6205, + "step": 406 + }, + { + "epoch": 0.00014286076577407062, + "grad_norm": 0.3691099286079407, + "learning_rate": 0.0001732220367278798, + "loss": 0.5348, + "step": 407 + }, + { + "epoch": 0.00014321177502658677, + "grad_norm": 0.35860803723335266, + "learning_rate": 0.0001731552587646077, + "loss": 0.6029, + "step": 408 + }, + { + "epoch": 0.0001435627842791029, + "grad_norm": 0.3640693426132202, + "learning_rate": 0.00017308848080133557, + "loss": 0.6673, + "step": 409 + }, + { + "epoch": 0.00014391379353161904, + "grad_norm": 0.3550623953342438, + "learning_rate": 0.00017302170283806345, + "loss": 0.4659, + "step": 410 + }, + { + "epoch": 0.0001442648027841352, + "grad_norm": 0.45885637402534485, + "learning_rate": 0.00017295492487479132, + "loss": 0.4781, + "step": 411 + }, + { + "epoch": 0.00014461581203665134, + "grad_norm": 0.3703556954860687, + "learning_rate": 0.0001728881469115192, + "loss": 0.4829, + "step": 412 + }, + { + "epoch": 0.0001449668212891675, + "grad_norm": 0.5436837077140808, + "learning_rate": 0.0001728213689482471, + "loss": 0.6056, + "step": 413 + }, + { + "epoch": 0.0001453178305416836, + "grad_norm": 0.3953244686126709, + "learning_rate": 0.00017275459098497497, + "loss": 0.4884, + "step": 414 + }, + { + "epoch": 0.00014566883979419976, + "grad_norm": 0.34003904461860657, + "learning_rate": 0.00017268781302170284, + "loss": 0.6014, + "step": 415 + }, + { + "epoch": 0.0001460198490467159, + "grad_norm": 0.3463648557662964, + "learning_rate": 0.0001726210350584307, + "loss": 0.603, + "step": 416 + }, + { + "epoch": 0.00014637085829923206, + "grad_norm": 0.4293590784072876, + "learning_rate": 0.0001725542570951586, + "loss": 0.6686, + "step": 417 + }, + { + "epoch": 0.0001467218675517482, + "grad_norm": 0.4243469834327698, + "learning_rate": 0.0001724874791318865, + "loss": 0.6422, + "step": 418 + }, + { + "epoch": 0.00014707287680426433, + "grad_norm": 0.38327839970588684, + "learning_rate": 0.0001724207011686144, + "loss": 0.5595, + "step": 419 + }, + { + "epoch": 0.00014742388605678048, + "grad_norm": 0.31334301829338074, + "learning_rate": 0.00017235392320534226, + "loss": 0.474, + "step": 420 + }, + { + "epoch": 0.00014777489530929663, + "grad_norm": 0.3335350453853607, + "learning_rate": 0.00017228714524207013, + "loss": 0.6172, + "step": 421 + }, + { + "epoch": 0.00014812590456181278, + "grad_norm": 0.373696506023407, + "learning_rate": 0.000172220367278798, + "loss": 0.6183, + "step": 422 + }, + { + "epoch": 0.00014847691381432893, + "grad_norm": 0.45814886689186096, + "learning_rate": 0.00017215358931552588, + "loss": 0.5059, + "step": 423 + }, + { + "epoch": 0.00014882792306684505, + "grad_norm": 0.3578277826309204, + "learning_rate": 0.00017208681135225378, + "loss": 0.5771, + "step": 424 + }, + { + "epoch": 0.0001491789323193612, + "grad_norm": 0.42081883549690247, + "learning_rate": 0.00017202003338898165, + "loss": 0.5604, + "step": 425 + }, + { + "epoch": 0.00014952994157187735, + "grad_norm": 0.3173503875732422, + "learning_rate": 0.00017195325542570953, + "loss": 0.5738, + "step": 426 + }, + { + "epoch": 0.0001498809508243935, + "grad_norm": 0.38292011618614197, + "learning_rate": 0.0001718864774624374, + "loss": 0.6067, + "step": 427 + }, + { + "epoch": 0.00015023196007690965, + "grad_norm": 0.3518977463245392, + "learning_rate": 0.00017181969949916527, + "loss": 0.5073, + "step": 428 + }, + { + "epoch": 0.00015058296932942577, + "grad_norm": 0.5157706141471863, + "learning_rate": 0.00017175292153589317, + "loss": 0.5496, + "step": 429 + }, + { + "epoch": 0.00015093397858194192, + "grad_norm": 0.32064110040664673, + "learning_rate": 0.00017168614357262105, + "loss": 0.4766, + "step": 430 + }, + { + "epoch": 0.00015128498783445807, + "grad_norm": 0.42229798436164856, + "learning_rate": 0.00017161936560934892, + "loss": 0.5953, + "step": 431 + }, + { + "epoch": 0.00015163599708697422, + "grad_norm": 0.4723895192146301, + "learning_rate": 0.0001715525876460768, + "loss": 0.4783, + "step": 432 + }, + { + "epoch": 0.00015198700633949037, + "grad_norm": 0.3841445744037628, + "learning_rate": 0.00017148580968280467, + "loss": 0.5003, + "step": 433 + }, + { + "epoch": 0.0001523380155920065, + "grad_norm": 0.38026461005210876, + "learning_rate": 0.00017141903171953257, + "loss": 0.5093, + "step": 434 + }, + { + "epoch": 0.00015268902484452264, + "grad_norm": 0.37034904956817627, + "learning_rate": 0.00017135225375626044, + "loss": 0.6158, + "step": 435 + }, + { + "epoch": 0.0001530400340970388, + "grad_norm": 0.3876091241836548, + "learning_rate": 0.00017128547579298834, + "loss": 0.5287, + "step": 436 + }, + { + "epoch": 0.00015339104334955494, + "grad_norm": 0.30055519938468933, + "learning_rate": 0.0001712186978297162, + "loss": 0.5018, + "step": 437 + }, + { + "epoch": 0.0001537420526020711, + "grad_norm": 0.36094966530799866, + "learning_rate": 0.00017115191986644409, + "loss": 0.4961, + "step": 438 + }, + { + "epoch": 0.0001540930618545872, + "grad_norm": 0.3300524055957794, + "learning_rate": 0.00017108514190317196, + "loss": 0.5246, + "step": 439 + }, + { + "epoch": 0.00015444407110710336, + "grad_norm": 0.40980783104896545, + "learning_rate": 0.00017101836393989986, + "loss": 0.5705, + "step": 440 + }, + { + "epoch": 0.0001547950803596195, + "grad_norm": 0.3442326784133911, + "learning_rate": 0.00017095158597662773, + "loss": 0.5595, + "step": 441 + }, + { + "epoch": 0.00015514608961213566, + "grad_norm": 0.48015034198760986, + "learning_rate": 0.0001708848080133556, + "loss": 0.5642, + "step": 442 + }, + { + "epoch": 0.0001554970988646518, + "grad_norm": 0.5570142269134521, + "learning_rate": 0.00017081803005008348, + "loss": 0.6111, + "step": 443 + }, + { + "epoch": 0.00015584810811716793, + "grad_norm": 0.30470094084739685, + "learning_rate": 0.00017075125208681135, + "loss": 0.5151, + "step": 444 + }, + { + "epoch": 0.00015619911736968408, + "grad_norm": 0.31946614384651184, + "learning_rate": 0.00017068447412353925, + "loss": 0.5265, + "step": 445 + }, + { + "epoch": 0.00015655012662220023, + "grad_norm": 0.38980719447135925, + "learning_rate": 0.00017061769616026712, + "loss": 0.575, + "step": 446 + }, + { + "epoch": 0.00015690113587471638, + "grad_norm": 0.4077732264995575, + "learning_rate": 0.000170550918196995, + "loss": 0.5729, + "step": 447 + }, + { + "epoch": 0.00015725214512723253, + "grad_norm": 0.38632732629776, + "learning_rate": 0.00017048414023372287, + "loss": 0.594, + "step": 448 + }, + { + "epoch": 0.00015760315437974865, + "grad_norm": 0.37193921208381653, + "learning_rate": 0.00017041736227045074, + "loss": 0.6062, + "step": 449 + }, + { + "epoch": 0.0001579541636322648, + "grad_norm": 0.399029016494751, + "learning_rate": 0.00017035058430717862, + "loss": 0.4538, + "step": 450 + }, + { + "epoch": 0.00015830517288478095, + "grad_norm": 0.37710487842559814, + "learning_rate": 0.00017028380634390652, + "loss": 0.5615, + "step": 451 + }, + { + "epoch": 0.0001586561821372971, + "grad_norm": 0.38591668009757996, + "learning_rate": 0.0001702170283806344, + "loss": 0.5316, + "step": 452 + }, + { + "epoch": 0.00015900719138981325, + "grad_norm": 0.3453538417816162, + "learning_rate": 0.0001701502504173623, + "loss": 0.4645, + "step": 453 + }, + { + "epoch": 0.00015935820064232937, + "grad_norm": 0.34171512722969055, + "learning_rate": 0.00017008347245409016, + "loss": 0.5856, + "step": 454 + }, + { + "epoch": 0.00015970920989484552, + "grad_norm": 0.39591720700263977, + "learning_rate": 0.00017001669449081804, + "loss": 0.573, + "step": 455 + }, + { + "epoch": 0.00016006021914736167, + "grad_norm": 0.4127822816371918, + "learning_rate": 0.00016994991652754594, + "loss": 0.5183, + "step": 456 + }, + { + "epoch": 0.00016041122839987782, + "grad_norm": 0.37893375754356384, + "learning_rate": 0.0001698831385642738, + "loss": 0.566, + "step": 457 + }, + { + "epoch": 0.00016076223765239397, + "grad_norm": 0.33429333567619324, + "learning_rate": 0.00016981636060100168, + "loss": 0.449, + "step": 458 + }, + { + "epoch": 0.0001611132469049101, + "grad_norm": 0.3333180546760559, + "learning_rate": 0.00016974958263772956, + "loss": 0.4441, + "step": 459 + }, + { + "epoch": 0.00016146425615742624, + "grad_norm": 0.3591359257698059, + "learning_rate": 0.00016968280467445743, + "loss": 0.55, + "step": 460 + }, + { + "epoch": 0.0001618152654099424, + "grad_norm": 0.35390427708625793, + "learning_rate": 0.00016961602671118533, + "loss": 0.6445, + "step": 461 + }, + { + "epoch": 0.00016216627466245854, + "grad_norm": 0.42036697268486023, + "learning_rate": 0.0001695492487479132, + "loss": 0.5411, + "step": 462 + }, + { + "epoch": 0.0001625172839149747, + "grad_norm": 0.42147770524024963, + "learning_rate": 0.00016948247078464108, + "loss": 0.6218, + "step": 463 + }, + { + "epoch": 0.0001628682931674908, + "grad_norm": 0.3960399329662323, + "learning_rate": 0.00016941569282136895, + "loss": 0.6608, + "step": 464 + }, + { + "epoch": 0.00016321930242000696, + "grad_norm": 0.39676985144615173, + "learning_rate": 0.00016934891485809682, + "loss": 0.5838, + "step": 465 + }, + { + "epoch": 0.0001635703116725231, + "grad_norm": 0.2839520573616028, + "learning_rate": 0.0001692821368948247, + "loss": 0.5334, + "step": 466 + }, + { + "epoch": 0.00016392132092503926, + "grad_norm": 0.3654347062110901, + "learning_rate": 0.0001692153589315526, + "loss": 0.6065, + "step": 467 + }, + { + "epoch": 0.0001642723301775554, + "grad_norm": 0.3709166646003723, + "learning_rate": 0.00016914858096828047, + "loss": 0.509, + "step": 468 + }, + { + "epoch": 0.00016462333943007153, + "grad_norm": 0.29224780201911926, + "learning_rate": 0.00016908180300500834, + "loss": 0.5372, + "step": 469 + }, + { + "epoch": 0.00016497434868258768, + "grad_norm": 0.34979283809661865, + "learning_rate": 0.00016901502504173624, + "loss": 0.3968, + "step": 470 + }, + { + "epoch": 0.00016532535793510383, + "grad_norm": 0.34580183029174805, + "learning_rate": 0.00016894824707846412, + "loss": 0.6032, + "step": 471 + }, + { + "epoch": 0.00016567636718761998, + "grad_norm": 0.39046213030815125, + "learning_rate": 0.00016888146911519202, + "loss": 0.5628, + "step": 472 + }, + { + "epoch": 0.00016602737644013613, + "grad_norm": 0.35301411151885986, + "learning_rate": 0.0001688146911519199, + "loss": 0.607, + "step": 473 + }, + { + "epoch": 0.00016637838569265225, + "grad_norm": 0.4572748839855194, + "learning_rate": 0.00016874791318864776, + "loss": 0.5018, + "step": 474 + }, + { + "epoch": 0.0001667293949451684, + "grad_norm": 0.38230374455451965, + "learning_rate": 0.00016868113522537564, + "loss": 0.5026, + "step": 475 + }, + { + "epoch": 0.00016708040419768455, + "grad_norm": 0.37066343426704407, + "learning_rate": 0.0001686143572621035, + "loss": 0.5819, + "step": 476 + }, + { + "epoch": 0.0001674314134502007, + "grad_norm": 0.3658660054206848, + "learning_rate": 0.0001685475792988314, + "loss": 0.6825, + "step": 477 + }, + { + "epoch": 0.00016778242270271685, + "grad_norm": 0.42174890637397766, + "learning_rate": 0.00016848080133555928, + "loss": 0.6065, + "step": 478 + }, + { + "epoch": 0.00016813343195523297, + "grad_norm": 0.3462882936000824, + "learning_rate": 0.00016841402337228716, + "loss": 0.5888, + "step": 479 + }, + { + "epoch": 0.00016848444120774912, + "grad_norm": 0.44681960344314575, + "learning_rate": 0.00016834724540901503, + "loss": 0.4987, + "step": 480 + }, + { + "epoch": 0.00016883545046026527, + "grad_norm": 0.3535650372505188, + "learning_rate": 0.0001682804674457429, + "loss": 0.6478, + "step": 481 + }, + { + "epoch": 0.00016918645971278142, + "grad_norm": 0.3357018232345581, + "learning_rate": 0.00016821368948247077, + "loss": 0.4949, + "step": 482 + }, + { + "epoch": 0.00016953746896529757, + "grad_norm": 0.42756739258766174, + "learning_rate": 0.00016814691151919868, + "loss": 0.6475, + "step": 483 + }, + { + "epoch": 0.0001698884782178137, + "grad_norm": 0.36174866557121277, + "learning_rate": 0.00016808013355592655, + "loss": 0.598, + "step": 484 + }, + { + "epoch": 0.00017023948747032984, + "grad_norm": 0.37115278840065, + "learning_rate": 0.00016801335559265442, + "loss": 0.6215, + "step": 485 + }, + { + "epoch": 0.000170590496722846, + "grad_norm": 0.340249627828598, + "learning_rate": 0.0001679465776293823, + "loss": 0.5702, + "step": 486 + }, + { + "epoch": 0.00017094150597536214, + "grad_norm": 0.31226348876953125, + "learning_rate": 0.0001678797996661102, + "loss": 0.6531, + "step": 487 + }, + { + "epoch": 0.0001712925152278783, + "grad_norm": 0.35571998357772827, + "learning_rate": 0.00016781302170283807, + "loss": 0.6406, + "step": 488 + }, + { + "epoch": 0.00017164352448039441, + "grad_norm": 0.4167378842830658, + "learning_rate": 0.00016774624373956597, + "loss": 0.5111, + "step": 489 + }, + { + "epoch": 0.00017199453373291056, + "grad_norm": 0.292304128408432, + "learning_rate": 0.00016767946577629384, + "loss": 0.6643, + "step": 490 + }, + { + "epoch": 0.0001723455429854267, + "grad_norm": 0.38789069652557373, + "learning_rate": 0.00016761268781302171, + "loss": 0.4542, + "step": 491 + }, + { + "epoch": 0.00017269655223794286, + "grad_norm": 0.33764714002609253, + "learning_rate": 0.0001675459098497496, + "loss": 0.4158, + "step": 492 + }, + { + "epoch": 0.00017304756149045898, + "grad_norm": 0.34849148988723755, + "learning_rate": 0.0001674791318864775, + "loss": 0.4737, + "step": 493 + }, + { + "epoch": 0.00017339857074297513, + "grad_norm": 0.2921352684497833, + "learning_rate": 0.00016741235392320536, + "loss": 0.679, + "step": 494 + }, + { + "epoch": 0.00017374957999549128, + "grad_norm": 0.33746641874313354, + "learning_rate": 0.00016734557595993323, + "loss": 0.4957, + "step": 495 + }, + { + "epoch": 0.00017410058924800743, + "grad_norm": 0.4029395878314972, + "learning_rate": 0.0001672787979966611, + "loss": 0.6708, + "step": 496 + }, + { + "epoch": 0.00017445159850052358, + "grad_norm": 0.440033882856369, + "learning_rate": 0.00016721202003338898, + "loss": 0.5889, + "step": 497 + }, + { + "epoch": 0.0001748026077530397, + "grad_norm": 0.330692857503891, + "learning_rate": 0.00016714524207011685, + "loss": 0.5942, + "step": 498 + }, + { + "epoch": 0.00017515361700555585, + "grad_norm": 0.3111809492111206, + "learning_rate": 0.00016707846410684475, + "loss": 0.5506, + "step": 499 + }, + { + "epoch": 0.000175504626258072, + "grad_norm": 0.38885676860809326, + "learning_rate": 0.00016701168614357263, + "loss": 0.4713, + "step": 500 + }, + { + "epoch": 0.00017585563551058815, + "grad_norm": 0.3697550296783447, + "learning_rate": 0.0001669449081803005, + "loss": 0.5955, + "step": 501 + }, + { + "epoch": 0.0001762066447631043, + "grad_norm": 0.35807061195373535, + "learning_rate": 0.00016687813021702837, + "loss": 0.555, + "step": 502 + }, + { + "epoch": 0.00017655765401562043, + "grad_norm": 0.44033464789390564, + "learning_rate": 0.00016681135225375625, + "loss": 0.5668, + "step": 503 + }, + { + "epoch": 0.00017690866326813657, + "grad_norm": 0.3363400399684906, + "learning_rate": 0.00016674457429048415, + "loss": 0.6176, + "step": 504 + }, + { + "epoch": 0.00017725967252065272, + "grad_norm": 0.31457507610321045, + "learning_rate": 0.00016667779632721202, + "loss": 0.6524, + "step": 505 + }, + { + "epoch": 0.00017761068177316887, + "grad_norm": 0.38115641474723816, + "learning_rate": 0.00016661101836393992, + "loss": 0.5848, + "step": 506 + }, + { + "epoch": 0.00017796169102568502, + "grad_norm": 0.3387603759765625, + "learning_rate": 0.0001665442404006678, + "loss": 0.6992, + "step": 507 + }, + { + "epoch": 0.00017831270027820115, + "grad_norm": 0.31671345233917236, + "learning_rate": 0.00016647746243739567, + "loss": 0.5744, + "step": 508 + }, + { + "epoch": 0.0001786637095307173, + "grad_norm": 0.3776471018791199, + "learning_rate": 0.00016641068447412357, + "loss": 0.622, + "step": 509 + }, + { + "epoch": 0.00017901471878323344, + "grad_norm": 0.37572941184043884, + "learning_rate": 0.00016634390651085144, + "loss": 0.5259, + "step": 510 + }, + { + "epoch": 0.0001793657280357496, + "grad_norm": 0.3335510194301605, + "learning_rate": 0.0001662771285475793, + "loss": 0.547, + "step": 511 + }, + { + "epoch": 0.00017971673728826574, + "grad_norm": 0.33241015672683716, + "learning_rate": 0.00016621035058430719, + "loss": 0.5827, + "step": 512 + }, + { + "epoch": 0.00018006774654078187, + "grad_norm": 0.3761122524738312, + "learning_rate": 0.00016614357262103506, + "loss": 0.6962, + "step": 513 + }, + { + "epoch": 0.00018041875579329802, + "grad_norm": 0.4172234833240509, + "learning_rate": 0.00016607679465776293, + "loss": 0.4922, + "step": 514 + }, + { + "epoch": 0.00018076976504581416, + "grad_norm": 0.45372599363327026, + "learning_rate": 0.00016601001669449083, + "loss": 0.5804, + "step": 515 + }, + { + "epoch": 0.00018112077429833031, + "grad_norm": 0.3854759931564331, + "learning_rate": 0.0001659432387312187, + "loss": 0.6026, + "step": 516 + }, + { + "epoch": 0.00018147178355084646, + "grad_norm": 0.3399171829223633, + "learning_rate": 0.00016587646076794658, + "loss": 0.4773, + "step": 517 + }, + { + "epoch": 0.00018182279280336259, + "grad_norm": 0.36649778485298157, + "learning_rate": 0.00016580968280467445, + "loss": 0.59, + "step": 518 + }, + { + "epoch": 0.00018217380205587874, + "grad_norm": 0.39988765120506287, + "learning_rate": 0.00016574290484140233, + "loss": 0.6094, + "step": 519 + }, + { + "epoch": 0.00018252481130839489, + "grad_norm": 0.34659436345100403, + "learning_rate": 0.00016567612687813023, + "loss": 0.4832, + "step": 520 + }, + { + "epoch": 0.00018287582056091103, + "grad_norm": 0.3742654025554657, + "learning_rate": 0.0001656093489148581, + "loss": 0.413, + "step": 521 + }, + { + "epoch": 0.00018322682981342718, + "grad_norm": 0.43068456649780273, + "learning_rate": 0.00016554257095158597, + "loss": 0.6576, + "step": 522 + }, + { + "epoch": 0.0001835778390659433, + "grad_norm": 0.42455193400382996, + "learning_rate": 0.00016547579298831387, + "loss": 0.5897, + "step": 523 + }, + { + "epoch": 0.00018392884831845946, + "grad_norm": 0.3290526568889618, + "learning_rate": 0.00016540901502504175, + "loss": 0.4022, + "step": 524 + }, + { + "epoch": 0.0001842798575709756, + "grad_norm": 0.3744141161441803, + "learning_rate": 0.00016534223706176965, + "loss": 0.5577, + "step": 525 + }, + { + "epoch": 0.00018463086682349176, + "grad_norm": 0.3516618609428406, + "learning_rate": 0.00016527545909849752, + "loss": 0.5481, + "step": 526 + }, + { + "epoch": 0.0001849818760760079, + "grad_norm": 0.3591526448726654, + "learning_rate": 0.0001652086811352254, + "loss": 0.6339, + "step": 527 + }, + { + "epoch": 0.00018533288532852403, + "grad_norm": 0.4024425745010376, + "learning_rate": 0.00016514190317195327, + "loss": 0.5268, + "step": 528 + }, + { + "epoch": 0.00018568389458104018, + "grad_norm": 0.3502136766910553, + "learning_rate": 0.00016507512520868114, + "loss": 0.5112, + "step": 529 + }, + { + "epoch": 0.00018603490383355633, + "grad_norm": 0.3338727056980133, + "learning_rate": 0.00016500834724540904, + "loss": 0.5623, + "step": 530 + }, + { + "epoch": 0.00018638591308607248, + "grad_norm": 0.43554845452308655, + "learning_rate": 0.0001649415692821369, + "loss": 0.5853, + "step": 531 + }, + { + "epoch": 0.00018673692233858862, + "grad_norm": 0.34424322843551636, + "learning_rate": 0.00016487479131886478, + "loss": 0.4951, + "step": 532 + }, + { + "epoch": 0.00018708793159110475, + "grad_norm": 0.4424237012863159, + "learning_rate": 0.00016480801335559266, + "loss": 0.4576, + "step": 533 + }, + { + "epoch": 0.0001874389408436209, + "grad_norm": 0.4616681933403015, + "learning_rate": 0.00016474123539232053, + "loss": 0.4974, + "step": 534 + }, + { + "epoch": 0.00018778995009613705, + "grad_norm": 0.3599206507205963, + "learning_rate": 0.0001646744574290484, + "loss": 0.5987, + "step": 535 + }, + { + "epoch": 0.0001881409593486532, + "grad_norm": 0.40468478202819824, + "learning_rate": 0.0001646076794657763, + "loss": 0.5914, + "step": 536 + }, + { + "epoch": 0.00018849196860116935, + "grad_norm": 0.5389227271080017, + "learning_rate": 0.00016454090150250418, + "loss": 0.6459, + "step": 537 + }, + { + "epoch": 0.00018884297785368547, + "grad_norm": 0.3493568003177643, + "learning_rate": 0.00016447412353923205, + "loss": 0.5191, + "step": 538 + }, + { + "epoch": 0.00018919398710620162, + "grad_norm": 0.31237804889678955, + "learning_rate": 0.00016440734557595992, + "loss": 0.4819, + "step": 539 + }, + { + "epoch": 0.00018954499635871777, + "grad_norm": 0.31142041087150574, + "learning_rate": 0.00016434056761268782, + "loss": 0.5659, + "step": 540 + }, + { + "epoch": 0.00018989600561123392, + "grad_norm": 0.3323245644569397, + "learning_rate": 0.0001642737896494157, + "loss": 0.5779, + "step": 541 + }, + { + "epoch": 0.00019024701486375007, + "grad_norm": 0.3679036498069763, + "learning_rate": 0.0001642070116861436, + "loss": 0.6919, + "step": 542 + }, + { + "epoch": 0.0001905980241162662, + "grad_norm": 0.3094903528690338, + "learning_rate": 0.00016414023372287147, + "loss": 0.4773, + "step": 543 + }, + { + "epoch": 0.00019094903336878234, + "grad_norm": 0.37995582818984985, + "learning_rate": 0.00016407345575959934, + "loss": 0.539, + "step": 544 + }, + { + "epoch": 0.0001913000426212985, + "grad_norm": 0.46415746212005615, + "learning_rate": 0.00016400667779632722, + "loss": 0.6708, + "step": 545 + }, + { + "epoch": 0.00019165105187381464, + "grad_norm": 0.3479398190975189, + "learning_rate": 0.00016393989983305512, + "loss": 0.5496, + "step": 546 + }, + { + "epoch": 0.00019200206112633079, + "grad_norm": 0.3740891218185425, + "learning_rate": 0.000163873121869783, + "loss": 0.6256, + "step": 547 + }, + { + "epoch": 0.0001923530703788469, + "grad_norm": 0.4934074878692627, + "learning_rate": 0.00016380634390651086, + "loss": 0.6788, + "step": 548 + }, + { + "epoch": 0.00019270407963136306, + "grad_norm": 0.42659157514572144, + "learning_rate": 0.00016373956594323874, + "loss": 0.5981, + "step": 549 + }, + { + "epoch": 0.0001930550888838792, + "grad_norm": 0.35727575421333313, + "learning_rate": 0.0001636727879799666, + "loss": 0.4095, + "step": 550 + }, + { + "epoch": 0.00019340609813639536, + "grad_norm": 0.4294300377368927, + "learning_rate": 0.00016360601001669448, + "loss": 0.5386, + "step": 551 + }, + { + "epoch": 0.0001937571073889115, + "grad_norm": 0.33482253551483154, + "learning_rate": 0.00016353923205342238, + "loss": 0.4901, + "step": 552 + }, + { + "epoch": 0.00019410811664142763, + "grad_norm": 0.3379746079444885, + "learning_rate": 0.00016347245409015026, + "loss": 0.5454, + "step": 553 + }, + { + "epoch": 0.00019445912589394378, + "grad_norm": 0.42393919825553894, + "learning_rate": 0.00016340567612687813, + "loss": 0.5959, + "step": 554 + }, + { + "epoch": 0.00019481013514645993, + "grad_norm": 0.31975501775741577, + "learning_rate": 0.000163338898163606, + "loss": 0.6048, + "step": 555 + }, + { + "epoch": 0.00019516114439897608, + "grad_norm": 0.43404972553253174, + "learning_rate": 0.00016327212020033388, + "loss": 0.6252, + "step": 556 + }, + { + "epoch": 0.00019551215365149223, + "grad_norm": 0.3559292256832123, + "learning_rate": 0.00016320534223706178, + "loss": 0.6036, + "step": 557 + }, + { + "epoch": 0.00019586316290400835, + "grad_norm": 0.3134891092777252, + "learning_rate": 0.00016313856427378965, + "loss": 0.5656, + "step": 558 + }, + { + "epoch": 0.0001962141721565245, + "grad_norm": 0.32056671380996704, + "learning_rate": 0.00016307178631051755, + "loss": 0.6509, + "step": 559 + }, + { + "epoch": 0.00019656518140904065, + "grad_norm": 0.46249130368232727, + "learning_rate": 0.00016300500834724542, + "loss": 0.6379, + "step": 560 + }, + { + "epoch": 0.0001969161906615568, + "grad_norm": 0.36366966366767883, + "learning_rate": 0.0001629382303839733, + "loss": 0.5334, + "step": 561 + }, + { + "epoch": 0.00019726719991407295, + "grad_norm": 0.4234124422073364, + "learning_rate": 0.0001628714524207012, + "loss": 0.4864, + "step": 562 + }, + { + "epoch": 0.00019761820916658907, + "grad_norm": 0.3687801659107208, + "learning_rate": 0.00016280467445742907, + "loss": 0.4855, + "step": 563 + }, + { + "epoch": 0.00019796921841910522, + "grad_norm": 0.37247028946876526, + "learning_rate": 0.00016273789649415694, + "loss": 0.6215, + "step": 564 + }, + { + "epoch": 0.00019832022767162137, + "grad_norm": 0.30445635318756104, + "learning_rate": 0.00016267111853088482, + "loss": 0.5741, + "step": 565 + }, + { + "epoch": 0.00019867123692413752, + "grad_norm": 0.3349187970161438, + "learning_rate": 0.0001626043405676127, + "loss": 0.4524, + "step": 566 + }, + { + "epoch": 0.00019902224617665367, + "grad_norm": 0.36938101053237915, + "learning_rate": 0.00016253756260434056, + "loss": 0.5046, + "step": 567 + }, + { + "epoch": 0.0001993732554291698, + "grad_norm": 0.37673529982566833, + "learning_rate": 0.00016247078464106846, + "loss": 0.5001, + "step": 568 + }, + { + "epoch": 0.00019972426468168594, + "grad_norm": 0.3571556508541107, + "learning_rate": 0.00016240400667779634, + "loss": 0.6419, + "step": 569 + }, + { + "epoch": 0.0002000752739342021, + "grad_norm": 0.35543423891067505, + "learning_rate": 0.0001623372287145242, + "loss": 0.6191, + "step": 570 + }, + { + "epoch": 0.00020042628318671824, + "grad_norm": 0.3096729516983032, + "learning_rate": 0.00016227045075125208, + "loss": 0.5373, + "step": 571 + }, + { + "epoch": 0.0002007772924392344, + "grad_norm": 0.30310383439064026, + "learning_rate": 0.00016220367278797996, + "loss": 0.558, + "step": 572 + }, + { + "epoch": 0.0002011283016917505, + "grad_norm": 0.3616211712360382, + "learning_rate": 0.00016213689482470786, + "loss": 0.6504, + "step": 573 + }, + { + "epoch": 0.00020147931094426666, + "grad_norm": 0.34818220138549805, + "learning_rate": 0.00016207011686143573, + "loss": 0.6136, + "step": 574 + }, + { + "epoch": 0.0002018303201967828, + "grad_norm": 0.36225444078445435, + "learning_rate": 0.0001620033388981636, + "loss": 0.4905, + "step": 575 + }, + { + "epoch": 0.00020218132944929896, + "grad_norm": 0.40039536356925964, + "learning_rate": 0.0001619365609348915, + "loss": 0.5997, + "step": 576 + }, + { + "epoch": 0.0002025323387018151, + "grad_norm": 0.33715930581092834, + "learning_rate": 0.00016186978297161938, + "loss": 0.5284, + "step": 577 + }, + { + "epoch": 0.00020288334795433123, + "grad_norm": 0.4137067198753357, + "learning_rate": 0.00016180300500834728, + "loss": 0.6873, + "step": 578 + }, + { + "epoch": 0.00020323435720684738, + "grad_norm": 0.41598305106163025, + "learning_rate": 0.00016173622704507515, + "loss": 0.491, + "step": 579 + }, + { + "epoch": 0.00020358536645936353, + "grad_norm": 0.5466423034667969, + "learning_rate": 0.00016166944908180302, + "loss": 0.6188, + "step": 580 + }, + { + "epoch": 0.00020393637571187968, + "grad_norm": 0.3718060851097107, + "learning_rate": 0.0001616026711185309, + "loss": 0.5573, + "step": 581 + }, + { + "epoch": 0.00020428738496439583, + "grad_norm": 0.33747225999832153, + "learning_rate": 0.00016153589315525877, + "loss": 0.4887, + "step": 582 + }, + { + "epoch": 0.00020463839421691195, + "grad_norm": 0.36478081345558167, + "learning_rate": 0.00016146911519198664, + "loss": 0.553, + "step": 583 + }, + { + "epoch": 0.0002049894034694281, + "grad_norm": 0.38441962003707886, + "learning_rate": 0.00016140233722871454, + "loss": 0.4833, + "step": 584 + }, + { + "epoch": 0.00020534041272194425, + "grad_norm": 0.45594358444213867, + "learning_rate": 0.00016133555926544241, + "loss": 0.5877, + "step": 585 + }, + { + "epoch": 0.0002056914219744604, + "grad_norm": 0.356517493724823, + "learning_rate": 0.0001612687813021703, + "loss": 0.5614, + "step": 586 + }, + { + "epoch": 0.00020604243122697655, + "grad_norm": 0.4051963686943054, + "learning_rate": 0.00016120200333889816, + "loss": 0.5208, + "step": 587 + }, + { + "epoch": 0.00020639344047949267, + "grad_norm": 0.36947959661483765, + "learning_rate": 0.00016113522537562603, + "loss": 0.4385, + "step": 588 + }, + { + "epoch": 0.00020674444973200882, + "grad_norm": 0.45947200059890747, + "learning_rate": 0.00016106844741235393, + "loss": 0.4972, + "step": 589 + }, + { + "epoch": 0.00020709545898452497, + "grad_norm": 0.40610602498054504, + "learning_rate": 0.0001610016694490818, + "loss": 0.4022, + "step": 590 + }, + { + "epoch": 0.00020744646823704112, + "grad_norm": 0.3529384732246399, + "learning_rate": 0.00016093489148580968, + "loss": 0.5222, + "step": 591 + }, + { + "epoch": 0.00020779747748955727, + "grad_norm": 0.35114821791648865, + "learning_rate": 0.00016086811352253755, + "loss": 0.6224, + "step": 592 + }, + { + "epoch": 0.0002081484867420734, + "grad_norm": 0.3596336841583252, + "learning_rate": 0.00016080133555926545, + "loss": 0.5081, + "step": 593 + }, + { + "epoch": 0.00020849949599458954, + "grad_norm": 0.4214174747467041, + "learning_rate": 0.00016073455759599333, + "loss": 0.5189, + "step": 594 + }, + { + "epoch": 0.0002088505052471057, + "grad_norm": 0.39635175466537476, + "learning_rate": 0.00016066777963272123, + "loss": 0.582, + "step": 595 + }, + { + "epoch": 0.00020920151449962184, + "grad_norm": 0.36160576343536377, + "learning_rate": 0.0001606010016694491, + "loss": 0.568, + "step": 596 + }, + { + "epoch": 0.000209552523752138, + "grad_norm": 0.4242927134037018, + "learning_rate": 0.00016053422370617697, + "loss": 0.6235, + "step": 597 + }, + { + "epoch": 0.0002099035330046541, + "grad_norm": 0.4257853925228119, + "learning_rate": 0.00016046744574290485, + "loss": 0.5294, + "step": 598 + }, + { + "epoch": 0.00021025454225717026, + "grad_norm": 0.3890500068664551, + "learning_rate": 0.00016040066777963272, + "loss": 0.6224, + "step": 599 + }, + { + "epoch": 0.0002106055515096864, + "grad_norm": 0.2971879541873932, + "learning_rate": 0.00016033388981636062, + "loss": 0.5951, + "step": 600 + }, + { + "epoch": 0.00021095656076220256, + "grad_norm": 0.29551970958709717, + "learning_rate": 0.0001602671118530885, + "loss": 0.6713, + "step": 601 + }, + { + "epoch": 0.00021130757001471868, + "grad_norm": 0.31588122248649597, + "learning_rate": 0.00016020033388981637, + "loss": 0.6384, + "step": 602 + }, + { + "epoch": 0.00021165857926723483, + "grad_norm": 0.3138657510280609, + "learning_rate": 0.00016013355592654424, + "loss": 0.5846, + "step": 603 + }, + { + "epoch": 0.00021200958851975098, + "grad_norm": 0.31286585330963135, + "learning_rate": 0.0001600667779632721, + "loss": 0.6236, + "step": 604 + }, + { + "epoch": 0.00021236059777226713, + "grad_norm": 0.32098105549812317, + "learning_rate": 0.00016, + "loss": 0.4926, + "step": 605 + }, + { + "epoch": 0.00021271160702478328, + "grad_norm": 0.371427446603775, + "learning_rate": 0.00015993322203672789, + "loss": 0.6205, + "step": 606 + }, + { + "epoch": 0.0002130626162772994, + "grad_norm": 0.28764042258262634, + "learning_rate": 0.00015986644407345576, + "loss": 0.449, + "step": 607 + }, + { + "epoch": 0.00021341362552981555, + "grad_norm": 0.35086238384246826, + "learning_rate": 0.00015979966611018363, + "loss": 0.549, + "step": 608 + }, + { + "epoch": 0.0002137646347823317, + "grad_norm": 0.3118048906326294, + "learning_rate": 0.0001597328881469115, + "loss": 0.6037, + "step": 609 + }, + { + "epoch": 0.00021411564403484785, + "grad_norm": 0.3894517123699188, + "learning_rate": 0.0001596661101836394, + "loss": 0.5989, + "step": 610 + }, + { + "epoch": 0.000214466653287364, + "grad_norm": 0.39642322063446045, + "learning_rate": 0.00015959933222036728, + "loss": 0.566, + "step": 611 + }, + { + "epoch": 0.00021481766253988012, + "grad_norm": 0.35333508253097534, + "learning_rate": 0.00015953255425709518, + "loss": 0.5055, + "step": 612 + }, + { + "epoch": 0.00021516867179239627, + "grad_norm": 0.39200490713119507, + "learning_rate": 0.00015946577629382305, + "loss": 0.5951, + "step": 613 + }, + { + "epoch": 0.00021551968104491242, + "grad_norm": 0.38436442613601685, + "learning_rate": 0.00015939899833055093, + "loss": 0.4876, + "step": 614 + }, + { + "epoch": 0.00021587069029742857, + "grad_norm": 0.3397504389286041, + "learning_rate": 0.0001593322203672788, + "loss": 0.6287, + "step": 615 + }, + { + "epoch": 0.00021622169954994472, + "grad_norm": 0.35870012640953064, + "learning_rate": 0.0001592654424040067, + "loss": 0.5857, + "step": 616 + }, + { + "epoch": 0.00021657270880246084, + "grad_norm": 0.31163597106933594, + "learning_rate": 0.00015919866444073457, + "loss": 0.4831, + "step": 617 + }, + { + "epoch": 0.000216923718054977, + "grad_norm": 0.35106539726257324, + "learning_rate": 0.00015913188647746245, + "loss": 0.5776, + "step": 618 + }, + { + "epoch": 0.00021727472730749314, + "grad_norm": 0.3639923334121704, + "learning_rate": 0.00015906510851419032, + "loss": 0.5039, + "step": 619 + }, + { + "epoch": 0.0002176257365600093, + "grad_norm": 0.3622918128967285, + "learning_rate": 0.0001589983305509182, + "loss": 0.6293, + "step": 620 + }, + { + "epoch": 0.00021797674581252544, + "grad_norm": 0.3899349868297577, + "learning_rate": 0.0001589315525876461, + "loss": 0.567, + "step": 621 + }, + { + "epoch": 0.00021832775506504156, + "grad_norm": 0.3834361732006073, + "learning_rate": 0.00015886477462437397, + "loss": 0.5106, + "step": 622 + }, + { + "epoch": 0.0002186787643175577, + "grad_norm": 0.34996962547302246, + "learning_rate": 0.00015879799666110184, + "loss": 0.5155, + "step": 623 + }, + { + "epoch": 0.00021902977357007386, + "grad_norm": 0.47908079624176025, + "learning_rate": 0.0001587312186978297, + "loss": 0.4529, + "step": 624 + }, + { + "epoch": 0.00021938078282259, + "grad_norm": 0.3167901635169983, + "learning_rate": 0.00015866444073455758, + "loss": 0.6075, + "step": 625 + }, + { + "epoch": 0.00021973179207510616, + "grad_norm": 0.4254927337169647, + "learning_rate": 0.00015859766277128548, + "loss": 0.6404, + "step": 626 + }, + { + "epoch": 0.00022008280132762228, + "grad_norm": 0.4317469000816345, + "learning_rate": 0.00015853088480801336, + "loss": 0.5881, + "step": 627 + }, + { + "epoch": 0.00022043381058013843, + "grad_norm": 0.4441644251346588, + "learning_rate": 0.00015846410684474123, + "loss": 0.5864, + "step": 628 + }, + { + "epoch": 0.00022078481983265458, + "grad_norm": 0.37883102893829346, + "learning_rate": 0.00015839732888146913, + "loss": 0.5664, + "step": 629 + }, + { + "epoch": 0.00022113582908517073, + "grad_norm": 0.35548868775367737, + "learning_rate": 0.000158330550918197, + "loss": 0.5712, + "step": 630 + }, + { + "epoch": 0.00022148683833768688, + "grad_norm": 0.31588616967201233, + "learning_rate": 0.00015826377295492488, + "loss": 0.4856, + "step": 631 + }, + { + "epoch": 0.000221837847590203, + "grad_norm": 0.3186424672603607, + "learning_rate": 0.00015819699499165278, + "loss": 0.542, + "step": 632 + }, + { + "epoch": 0.00022218885684271915, + "grad_norm": 0.41098466515541077, + "learning_rate": 0.00015813021702838065, + "loss": 0.6311, + "step": 633 + }, + { + "epoch": 0.0002225398660952353, + "grad_norm": 0.413401335477829, + "learning_rate": 0.00015806343906510852, + "loss": 0.5036, + "step": 634 + }, + { + "epoch": 0.00022289087534775145, + "grad_norm": 0.34203773736953735, + "learning_rate": 0.0001579966611018364, + "loss": 0.5508, + "step": 635 + }, + { + "epoch": 0.0002232418846002676, + "grad_norm": 0.34416648745536804, + "learning_rate": 0.00015792988313856427, + "loss": 0.5442, + "step": 636 + }, + { + "epoch": 0.00022359289385278372, + "grad_norm": 0.3439941704273224, + "learning_rate": 0.00015786310517529217, + "loss": 0.4969, + "step": 637 + }, + { + "epoch": 0.00022394390310529987, + "grad_norm": 0.3547762930393219, + "learning_rate": 0.00015779632721202004, + "loss": 0.5564, + "step": 638 + }, + { + "epoch": 0.00022429491235781602, + "grad_norm": 0.35666894912719727, + "learning_rate": 0.00015772954924874792, + "loss": 0.4759, + "step": 639 + }, + { + "epoch": 0.00022464592161033217, + "grad_norm": 0.3175058364868164, + "learning_rate": 0.0001576627712854758, + "loss": 0.5708, + "step": 640 + }, + { + "epoch": 0.00022499693086284832, + "grad_norm": 0.4329943358898163, + "learning_rate": 0.00015759599332220366, + "loss": 0.5293, + "step": 641 + }, + { + "epoch": 0.00022534794011536444, + "grad_norm": 0.5703821778297424, + "learning_rate": 0.00015752921535893156, + "loss": 0.6187, + "step": 642 + }, + { + "epoch": 0.0002256989493678806, + "grad_norm": 0.32244032621383667, + "learning_rate": 0.00015746243739565944, + "loss": 0.4847, + "step": 643 + }, + { + "epoch": 0.00022604995862039674, + "grad_norm": 0.36224085092544556, + "learning_rate": 0.0001573956594323873, + "loss": 0.6804, + "step": 644 + }, + { + "epoch": 0.0002264009678729129, + "grad_norm": 0.3316931426525116, + "learning_rate": 0.0001573288814691152, + "loss": 0.6413, + "step": 645 + }, + { + "epoch": 0.00022675197712542904, + "grad_norm": 0.38156425952911377, + "learning_rate": 0.00015726210350584308, + "loss": 0.5659, + "step": 646 + }, + { + "epoch": 0.00022710298637794516, + "grad_norm": 0.48353493213653564, + "learning_rate": 0.00015719532554257096, + "loss": 0.5788, + "step": 647 + }, + { + "epoch": 0.00022745399563046131, + "grad_norm": 0.3913673758506775, + "learning_rate": 0.00015712854757929886, + "loss": 0.6899, + "step": 648 + }, + { + "epoch": 0.00022780500488297746, + "grad_norm": 0.46836981177330017, + "learning_rate": 0.00015706176961602673, + "loss": 0.5712, + "step": 649 + }, + { + "epoch": 0.0002281560141354936, + "grad_norm": 0.34713172912597656, + "learning_rate": 0.0001569949916527546, + "loss": 0.381, + "step": 650 + }, + { + "epoch": 0.00022850702338800976, + "grad_norm": 0.3837398886680603, + "learning_rate": 0.00015692821368948248, + "loss": 0.5236, + "step": 651 + }, + { + "epoch": 0.00022885803264052589, + "grad_norm": 0.5181556940078735, + "learning_rate": 0.00015686143572621035, + "loss": 0.5889, + "step": 652 + }, + { + "epoch": 0.00022920904189304203, + "grad_norm": 0.42713961005210876, + "learning_rate": 0.00015679465776293825, + "loss": 0.5346, + "step": 653 + }, + { + "epoch": 0.00022956005114555818, + "grad_norm": 0.2868479788303375, + "learning_rate": 0.00015672787979966612, + "loss": 0.5546, + "step": 654 + }, + { + "epoch": 0.00022991106039807433, + "grad_norm": 0.31901800632476807, + "learning_rate": 0.000156661101836394, + "loss": 0.5014, + "step": 655 + }, + { + "epoch": 0.00023026206965059048, + "grad_norm": 0.41681963205337524, + "learning_rate": 0.00015659432387312187, + "loss": 0.5709, + "step": 656 + }, + { + "epoch": 0.0002306130789031066, + "grad_norm": 0.5942090749740601, + "learning_rate": 0.00015652754590984974, + "loss": 0.6022, + "step": 657 + }, + { + "epoch": 0.00023096408815562276, + "grad_norm": 0.405391126871109, + "learning_rate": 0.00015646076794657764, + "loss": 0.5363, + "step": 658 + }, + { + "epoch": 0.0002313150974081389, + "grad_norm": 0.3201390206813812, + "learning_rate": 0.00015639398998330552, + "loss": 0.6045, + "step": 659 + }, + { + "epoch": 0.00023166610666065505, + "grad_norm": 0.2989407479763031, + "learning_rate": 0.0001563272120200334, + "loss": 0.5604, + "step": 660 + }, + { + "epoch": 0.0002320171159131712, + "grad_norm": 0.3919268548488617, + "learning_rate": 0.00015626043405676126, + "loss": 0.5413, + "step": 661 + }, + { + "epoch": 0.00023236812516568733, + "grad_norm": 0.4080122709274292, + "learning_rate": 0.00015619365609348916, + "loss": 0.498, + "step": 662 + }, + { + "epoch": 0.00023271913441820348, + "grad_norm": 0.38974156975746155, + "learning_rate": 0.00015612687813021704, + "loss": 0.6149, + "step": 663 + }, + { + "epoch": 0.00023307014367071962, + "grad_norm": 0.3145015835762024, + "learning_rate": 0.00015606010016694494, + "loss": 0.4886, + "step": 664 + }, + { + "epoch": 0.00023342115292323577, + "grad_norm": 0.3009328246116638, + "learning_rate": 0.0001559933222036728, + "loss": 0.5534, + "step": 665 + }, + { + "epoch": 0.00023377216217575192, + "grad_norm": 0.4774717092514038, + "learning_rate": 0.00015592654424040068, + "loss": 0.6006, + "step": 666 + }, + { + "epoch": 0.00023412317142826805, + "grad_norm": 0.32965418696403503, + "learning_rate": 0.00015585976627712856, + "loss": 0.5463, + "step": 667 + }, + { + "epoch": 0.0002344741806807842, + "grad_norm": 0.3066554665565491, + "learning_rate": 0.00015579298831385643, + "loss": 0.5675, + "step": 668 + }, + { + "epoch": 0.00023482518993330035, + "grad_norm": 0.3879207372665405, + "learning_rate": 0.00015572621035058433, + "loss": 0.5825, + "step": 669 + }, + { + "epoch": 0.0002351761991858165, + "grad_norm": 0.3171943128108978, + "learning_rate": 0.0001556594323873122, + "loss": 0.5677, + "step": 670 + }, + { + "epoch": 0.00023552720843833264, + "grad_norm": 0.36982622742652893, + "learning_rate": 0.00015559265442404007, + "loss": 0.5885, + "step": 671 + }, + { + "epoch": 0.00023587821769084877, + "grad_norm": 0.30437183380126953, + "learning_rate": 0.00015552587646076795, + "loss": 0.6288, + "step": 672 + }, + { + "epoch": 0.00023622922694336492, + "grad_norm": 0.30654504895210266, + "learning_rate": 0.00015545909849749582, + "loss": 0.5924, + "step": 673 + }, + { + "epoch": 0.00023658023619588107, + "grad_norm": 0.3771214783191681, + "learning_rate": 0.00015539232053422372, + "loss": 0.4901, + "step": 674 + }, + { + "epoch": 0.00023693124544839721, + "grad_norm": 0.3018699884414673, + "learning_rate": 0.0001553255425709516, + "loss": 0.6159, + "step": 675 + }, + { + "epoch": 0.00023728225470091336, + "grad_norm": 0.32899734377861023, + "learning_rate": 0.00015525876460767947, + "loss": 0.6197, + "step": 676 + }, + { + "epoch": 0.0002376332639534295, + "grad_norm": 0.31837883591651917, + "learning_rate": 0.00015519198664440734, + "loss": 0.5449, + "step": 677 + }, + { + "epoch": 0.00023798427320594564, + "grad_norm": 0.35326528549194336, + "learning_rate": 0.00015512520868113521, + "loss": 0.6315, + "step": 678 + }, + { + "epoch": 0.00023833528245846179, + "grad_norm": 0.3714829385280609, + "learning_rate": 0.00015505843071786311, + "loss": 0.6352, + "step": 679 + }, + { + "epoch": 0.00023868629171097794, + "grad_norm": 0.4002094864845276, + "learning_rate": 0.000154991652754591, + "loss": 0.4235, + "step": 680 + }, + { + "epoch": 0.00023903730096349408, + "grad_norm": 0.3382783532142639, + "learning_rate": 0.0001549248747913189, + "loss": 0.5476, + "step": 681 + }, + { + "epoch": 0.0002393883102160102, + "grad_norm": 0.2985747158527374, + "learning_rate": 0.00015485809682804676, + "loss": 0.5684, + "step": 682 + }, + { + "epoch": 0.00023973931946852636, + "grad_norm": 0.3288929760456085, + "learning_rate": 0.00015479131886477463, + "loss": 0.5657, + "step": 683 + }, + { + "epoch": 0.0002400903287210425, + "grad_norm": 0.39641210436820984, + "learning_rate": 0.0001547245409015025, + "loss": 0.6283, + "step": 684 + }, + { + "epoch": 0.00024044133797355866, + "grad_norm": 0.37413230538368225, + "learning_rate": 0.0001546577629382304, + "loss": 0.5778, + "step": 685 + }, + { + "epoch": 0.0002407923472260748, + "grad_norm": 0.28837504982948303, + "learning_rate": 0.00015459098497495828, + "loss": 0.5079, + "step": 686 + }, + { + "epoch": 0.00024114335647859093, + "grad_norm": 0.32851526141166687, + "learning_rate": 0.00015452420701168615, + "loss": 0.649, + "step": 687 + }, + { + "epoch": 0.00024149436573110708, + "grad_norm": 0.3848758637905121, + "learning_rate": 0.00015445742904841403, + "loss": 0.6099, + "step": 688 + }, + { + "epoch": 0.00024184537498362323, + "grad_norm": 0.35494935512542725, + "learning_rate": 0.0001543906510851419, + "loss": 0.6498, + "step": 689 + }, + { + "epoch": 0.00024219638423613938, + "grad_norm": 0.3431280553340912, + "learning_rate": 0.0001543238731218698, + "loss": 0.4934, + "step": 690 + }, + { + "epoch": 0.00024254739348865553, + "grad_norm": 0.33980974555015564, + "learning_rate": 0.00015425709515859767, + "loss": 0.5556, + "step": 691 + }, + { + "epoch": 0.00024289840274117165, + "grad_norm": 0.3086068034172058, + "learning_rate": 0.00015419031719532555, + "loss": 0.5955, + "step": 692 + }, + { + "epoch": 0.0002432494119936878, + "grad_norm": 0.33093178272247314, + "learning_rate": 0.00015412353923205342, + "loss": 0.5926, + "step": 693 + }, + { + "epoch": 0.00024360042124620395, + "grad_norm": 0.3660534620285034, + "learning_rate": 0.0001540567612687813, + "loss": 0.5494, + "step": 694 + }, + { + "epoch": 0.0002439514304987201, + "grad_norm": 0.29803964495658875, + "learning_rate": 0.0001539899833055092, + "loss": 0.6074, + "step": 695 + }, + { + "epoch": 0.00024430243975123625, + "grad_norm": 0.36542224884033203, + "learning_rate": 0.00015392320534223707, + "loss": 0.59, + "step": 696 + }, + { + "epoch": 0.00024465344900375237, + "grad_norm": 0.34015166759490967, + "learning_rate": 0.00015385642737896494, + "loss": 0.6029, + "step": 697 + }, + { + "epoch": 0.00024500445825626854, + "grad_norm": 0.3211725950241089, + "learning_rate": 0.00015378964941569284, + "loss": 0.535, + "step": 698 + }, + { + "epoch": 0.00024535546750878467, + "grad_norm": 0.37027183175086975, + "learning_rate": 0.0001537228714524207, + "loss": 0.6265, + "step": 699 + }, + { + "epoch": 0.0002457064767613008, + "grad_norm": 0.3447396159172058, + "learning_rate": 0.00015365609348914859, + "loss": 0.6061, + "step": 700 + }, + { + "epoch": 0.00024605748601381697, + "grad_norm": 0.3344075679779053, + "learning_rate": 0.00015358931552587649, + "loss": 0.5412, + "step": 701 + }, + { + "epoch": 0.0002464084952663331, + "grad_norm": 0.29049620032310486, + "learning_rate": 0.00015352253756260436, + "loss": 0.5137, + "step": 702 + }, + { + "epoch": 0.00024675950451884926, + "grad_norm": 0.37048932909965515, + "learning_rate": 0.00015345575959933223, + "loss": 0.6118, + "step": 703 + }, + { + "epoch": 0.0002471105137713654, + "grad_norm": 0.38212522864341736, + "learning_rate": 0.0001533889816360601, + "loss": 0.466, + "step": 704 + }, + { + "epoch": 0.0002474615230238815, + "grad_norm": 0.3576483428478241, + "learning_rate": 0.00015332220367278798, + "loss": 0.561, + "step": 705 + }, + { + "epoch": 0.0002478125322763977, + "grad_norm": 0.3550293743610382, + "learning_rate": 0.00015325542570951588, + "loss": 0.5634, + "step": 706 + }, + { + "epoch": 0.0002481635415289138, + "grad_norm": 0.362474650144577, + "learning_rate": 0.00015318864774624375, + "loss": 0.5608, + "step": 707 + }, + { + "epoch": 0.00024851455078143, + "grad_norm": 0.39463603496551514, + "learning_rate": 0.00015312186978297163, + "loss": 0.64, + "step": 708 + }, + { + "epoch": 0.0002488655600339461, + "grad_norm": 0.3456307649612427, + "learning_rate": 0.0001530550918196995, + "loss": 0.4631, + "step": 709 + }, + { + "epoch": 0.00024921656928646223, + "grad_norm": 0.3300929367542267, + "learning_rate": 0.00015298831385642737, + "loss": 0.3984, + "step": 710 + }, + { + "epoch": 0.0002495675785389784, + "grad_norm": 0.35923343896865845, + "learning_rate": 0.00015292153589315527, + "loss": 0.6003, + "step": 711 + }, + { + "epoch": 0.00024991858779149453, + "grad_norm": 0.4047611653804779, + "learning_rate": 0.00015285475792988315, + "loss": 0.5715, + "step": 712 + }, + { + "epoch": 0.0002502695970440107, + "grad_norm": 0.43539851903915405, + "learning_rate": 0.00015278797996661102, + "loss": 0.571, + "step": 713 + }, + { + "epoch": 0.00025062060629652683, + "grad_norm": 0.34745046496391296, + "learning_rate": 0.0001527212020033389, + "loss": 0.622, + "step": 714 + }, + { + "epoch": 0.00025097161554904295, + "grad_norm": 0.3130028247833252, + "learning_rate": 0.0001526544240400668, + "loss": 0.507, + "step": 715 + }, + { + "epoch": 0.0002513226248015591, + "grad_norm": 0.3093617558479309, + "learning_rate": 0.00015258764607679466, + "loss": 0.4951, + "step": 716 + }, + { + "epoch": 0.00025167363405407525, + "grad_norm": 0.34299540519714355, + "learning_rate": 0.00015252086811352257, + "loss": 0.539, + "step": 717 + }, + { + "epoch": 0.0002520246433065914, + "grad_norm": 0.32698413729667664, + "learning_rate": 0.00015245409015025044, + "loss": 0.4588, + "step": 718 + }, + { + "epoch": 0.00025237565255910755, + "grad_norm": 0.37853989005088806, + "learning_rate": 0.0001523873121869783, + "loss": 0.6227, + "step": 719 + }, + { + "epoch": 0.00025272666181162367, + "grad_norm": 0.32887300848960876, + "learning_rate": 0.00015232053422370618, + "loss": 0.5893, + "step": 720 + }, + { + "epoch": 0.00025307767106413985, + "grad_norm": 0.43352028727531433, + "learning_rate": 0.00015225375626043406, + "loss": 0.5811, + "step": 721 + }, + { + "epoch": 0.00025342868031665597, + "grad_norm": 0.42844903469085693, + "learning_rate": 0.00015218697829716196, + "loss": 0.6196, + "step": 722 + }, + { + "epoch": 0.00025377968956917215, + "grad_norm": 0.39929670095443726, + "learning_rate": 0.00015212020033388983, + "loss": 0.6722, + "step": 723 + }, + { + "epoch": 0.00025413069882168827, + "grad_norm": 0.5063486695289612, + "learning_rate": 0.0001520534223706177, + "loss": 0.6086, + "step": 724 + }, + { + "epoch": 0.0002544817080742044, + "grad_norm": 0.3625267446041107, + "learning_rate": 0.00015198664440734558, + "loss": 0.6331, + "step": 725 + }, + { + "epoch": 0.00025483271732672057, + "grad_norm": 0.3452700078487396, + "learning_rate": 0.00015191986644407345, + "loss": 0.5812, + "step": 726 + }, + { + "epoch": 0.0002551837265792367, + "grad_norm": 0.31915003061294556, + "learning_rate": 0.00015185308848080135, + "loss": 0.5653, + "step": 727 + }, + { + "epoch": 0.00025553473583175287, + "grad_norm": 0.3085877299308777, + "learning_rate": 0.00015178631051752922, + "loss": 0.4702, + "step": 728 + }, + { + "epoch": 0.000255885745084269, + "grad_norm": 0.31519320607185364, + "learning_rate": 0.0001517195325542571, + "loss": 0.5096, + "step": 729 + }, + { + "epoch": 0.0002562367543367851, + "grad_norm": 0.3637699782848358, + "learning_rate": 0.00015165275459098497, + "loss": 0.6001, + "step": 730 + }, + { + "epoch": 0.0002565877635893013, + "grad_norm": 0.34056970477104187, + "learning_rate": 0.00015158597662771284, + "loss": 0.5546, + "step": 731 + }, + { + "epoch": 0.0002569387728418174, + "grad_norm": 0.37110257148742676, + "learning_rate": 0.00015151919866444074, + "loss": 0.5612, + "step": 732 + }, + { + "epoch": 0.0002572897820943336, + "grad_norm": 0.35854101181030273, + "learning_rate": 0.00015145242070116862, + "loss": 0.6364, + "step": 733 + }, + { + "epoch": 0.0002576407913468497, + "grad_norm": 0.4340030252933502, + "learning_rate": 0.00015138564273789652, + "loss": 0.5772, + "step": 734 + }, + { + "epoch": 0.00025799180059936583, + "grad_norm": 0.3807721436023712, + "learning_rate": 0.0001513188647746244, + "loss": 0.4986, + "step": 735 + }, + { + "epoch": 0.000258342809851882, + "grad_norm": 0.3522527813911438, + "learning_rate": 0.00015125208681135226, + "loss": 0.5982, + "step": 736 + }, + { + "epoch": 0.00025869381910439813, + "grad_norm": 0.31251296401023865, + "learning_rate": 0.00015118530884808014, + "loss": 0.5239, + "step": 737 + }, + { + "epoch": 0.0002590448283569143, + "grad_norm": 0.3460885286331177, + "learning_rate": 0.00015111853088480804, + "loss": 0.5881, + "step": 738 + }, + { + "epoch": 0.00025939583760943043, + "grad_norm": 0.33298879861831665, + "learning_rate": 0.0001510517529215359, + "loss": 0.5272, + "step": 739 + }, + { + "epoch": 0.00025974684686194655, + "grad_norm": 0.351468950510025, + "learning_rate": 0.00015098497495826378, + "loss": 0.6049, + "step": 740 + }, + { + "epoch": 0.00026009785611446273, + "grad_norm": 0.3449242413043976, + "learning_rate": 0.00015091819699499166, + "loss": 0.5983, + "step": 741 + }, + { + "epoch": 0.00026044886536697885, + "grad_norm": 0.34724265336990356, + "learning_rate": 0.00015085141903171953, + "loss": 0.5292, + "step": 742 + }, + { + "epoch": 0.00026079987461949503, + "grad_norm": 0.3525671660900116, + "learning_rate": 0.00015078464106844743, + "loss": 0.5391, + "step": 743 + }, + { + "epoch": 0.00026115088387201115, + "grad_norm": 0.33959653973579407, + "learning_rate": 0.0001507178631051753, + "loss": 0.5898, + "step": 744 + }, + { + "epoch": 0.00026150189312452727, + "grad_norm": 0.5051225423812866, + "learning_rate": 0.00015065108514190318, + "loss": 0.5408, + "step": 745 + }, + { + "epoch": 0.00026185290237704345, + "grad_norm": 0.3298085629940033, + "learning_rate": 0.00015058430717863105, + "loss": 0.557, + "step": 746 + }, + { + "epoch": 0.00026220391162955957, + "grad_norm": 0.3375703990459442, + "learning_rate": 0.00015051752921535892, + "loss": 0.5541, + "step": 747 + }, + { + "epoch": 0.00026255492088207575, + "grad_norm": 0.27896445989608765, + "learning_rate": 0.0001504507512520868, + "loss": 0.5273, + "step": 748 + }, + { + "epoch": 0.00026290593013459187, + "grad_norm": 0.30591917037963867, + "learning_rate": 0.0001503839732888147, + "loss": 0.5988, + "step": 749 + }, + { + "epoch": 0.000263256939387108, + "grad_norm": 0.41014084219932556, + "learning_rate": 0.00015031719532554257, + "loss": 0.555, + "step": 750 + }, + { + "epoch": 0.00026360794863962417, + "grad_norm": 0.2935464084148407, + "learning_rate": 0.00015025041736227047, + "loss": 0.625, + "step": 751 + }, + { + "epoch": 0.0002639589578921403, + "grad_norm": 0.46361032128334045, + "learning_rate": 0.00015018363939899834, + "loss": 0.4753, + "step": 752 + }, + { + "epoch": 0.00026430996714465647, + "grad_norm": 0.35808300971984863, + "learning_rate": 0.00015011686143572622, + "loss": 0.5531, + "step": 753 + }, + { + "epoch": 0.0002646609763971726, + "grad_norm": 0.3411274254322052, + "learning_rate": 0.00015005008347245412, + "loss": 0.5577, + "step": 754 + }, + { + "epoch": 0.0002650119856496887, + "grad_norm": 0.34169328212738037, + "learning_rate": 0.000149983305509182, + "loss": 0.4856, + "step": 755 + }, + { + "epoch": 0.0002653629949022049, + "grad_norm": 0.38024139404296875, + "learning_rate": 0.00014991652754590986, + "loss": 0.5203, + "step": 756 + }, + { + "epoch": 0.000265714004154721, + "grad_norm": 0.35004425048828125, + "learning_rate": 0.00014984974958263774, + "loss": 0.4999, + "step": 757 + }, + { + "epoch": 0.0002660650134072372, + "grad_norm": 0.47526153922080994, + "learning_rate": 0.0001497829716193656, + "loss": 0.5503, + "step": 758 + }, + { + "epoch": 0.0002664160226597533, + "grad_norm": 0.35096925497055054, + "learning_rate": 0.0001497161936560935, + "loss": 0.5812, + "step": 759 + }, + { + "epoch": 0.00026676703191226943, + "grad_norm": 0.4505446255207062, + "learning_rate": 0.00014964941569282138, + "loss": 0.6069, + "step": 760 + }, + { + "epoch": 0.0002671180411647856, + "grad_norm": 0.3261663019657135, + "learning_rate": 0.00014958263772954926, + "loss": 0.5601, + "step": 761 + }, + { + "epoch": 0.00026746905041730173, + "grad_norm": 0.3397548794746399, + "learning_rate": 0.00014951585976627713, + "loss": 0.5572, + "step": 762 + }, + { + "epoch": 0.00026782005966981785, + "grad_norm": 0.35547688603401184, + "learning_rate": 0.000149449081803005, + "loss": 0.5983, + "step": 763 + }, + { + "epoch": 0.00026817106892233403, + "grad_norm": 0.41515079140663147, + "learning_rate": 0.00014938230383973287, + "loss": 0.6106, + "step": 764 + }, + { + "epoch": 0.00026852207817485015, + "grad_norm": 0.3840051591396332, + "learning_rate": 0.00014931552587646077, + "loss": 0.5328, + "step": 765 + }, + { + "epoch": 0.00026887308742736633, + "grad_norm": 0.3401285707950592, + "learning_rate": 0.00014924874791318865, + "loss": 0.4666, + "step": 766 + }, + { + "epoch": 0.00026922409667988245, + "grad_norm": 0.32983794808387756, + "learning_rate": 0.00014918196994991652, + "loss": 0.5214, + "step": 767 + }, + { + "epoch": 0.0002695751059323986, + "grad_norm": 0.30202198028564453, + "learning_rate": 0.00014911519198664442, + "loss": 0.4969, + "step": 768 + }, + { + "epoch": 0.00026992611518491475, + "grad_norm": 0.3222092092037201, + "learning_rate": 0.0001490484140233723, + "loss": 0.5093, + "step": 769 + }, + { + "epoch": 0.0002702771244374309, + "grad_norm": 0.4211997091770172, + "learning_rate": 0.0001489816360601002, + "loss": 0.6295, + "step": 770 + }, + { + "epoch": 0.00027062813368994705, + "grad_norm": 0.32112184166908264, + "learning_rate": 0.00014891485809682807, + "loss": 0.5611, + "step": 771 + }, + { + "epoch": 0.00027097914294246317, + "grad_norm": 0.3272956609725952, + "learning_rate": 0.00014884808013355594, + "loss": 0.6438, + "step": 772 + }, + { + "epoch": 0.0002713301521949793, + "grad_norm": 0.39423295855522156, + "learning_rate": 0.00014878130217028381, + "loss": 0.6029, + "step": 773 + }, + { + "epoch": 0.00027168116144749547, + "grad_norm": 0.3053528070449829, + "learning_rate": 0.0001487145242070117, + "loss": 0.4978, + "step": 774 + }, + { + "epoch": 0.0002720321707000116, + "grad_norm": 0.312774658203125, + "learning_rate": 0.0001486477462437396, + "loss": 0.5753, + "step": 775 + }, + { + "epoch": 0.00027238317995252777, + "grad_norm": 0.343964546918869, + "learning_rate": 0.00014858096828046746, + "loss": 0.5173, + "step": 776 + }, + { + "epoch": 0.0002727341892050439, + "grad_norm": 0.39104631543159485, + "learning_rate": 0.00014851419031719533, + "loss": 0.6381, + "step": 777 + }, + { + "epoch": 0.00027308519845756, + "grad_norm": 0.3958207070827484, + "learning_rate": 0.0001484474123539232, + "loss": 0.6046, + "step": 778 + }, + { + "epoch": 0.0002734362077100762, + "grad_norm": 0.36198097467422485, + "learning_rate": 0.00014838063439065108, + "loss": 0.6066, + "step": 779 + }, + { + "epoch": 0.0002737872169625923, + "grad_norm": 0.29619571566581726, + "learning_rate": 0.00014831385642737895, + "loss": 0.5131, + "step": 780 + }, + { + "epoch": 0.0002741382262151085, + "grad_norm": 0.344784677028656, + "learning_rate": 0.00014824707846410685, + "loss": 0.5626, + "step": 781 + }, + { + "epoch": 0.0002744892354676246, + "grad_norm": 0.35641250014305115, + "learning_rate": 0.00014818030050083473, + "loss": 0.5451, + "step": 782 + }, + { + "epoch": 0.00027484024472014074, + "grad_norm": 0.3496847152709961, + "learning_rate": 0.0001481135225375626, + "loss": 0.4814, + "step": 783 + }, + { + "epoch": 0.0002751912539726569, + "grad_norm": 0.3726658821105957, + "learning_rate": 0.00014804674457429047, + "loss": 0.6244, + "step": 784 + }, + { + "epoch": 0.00027554226322517303, + "grad_norm": 0.3317565619945526, + "learning_rate": 0.00014797996661101837, + "loss": 0.562, + "step": 785 + }, + { + "epoch": 0.0002758932724776892, + "grad_norm": 0.3478979468345642, + "learning_rate": 0.00014791318864774625, + "loss": 0.613, + "step": 786 + }, + { + "epoch": 0.00027624428173020533, + "grad_norm": 0.3572550415992737, + "learning_rate": 0.00014784641068447415, + "loss": 0.4841, + "step": 787 + }, + { + "epoch": 0.00027659529098272146, + "grad_norm": 0.34030210971832275, + "learning_rate": 0.00014777963272120202, + "loss": 0.4879, + "step": 788 + }, + { + "epoch": 0.00027694630023523763, + "grad_norm": 0.378203421831131, + "learning_rate": 0.0001477128547579299, + "loss": 0.6086, + "step": 789 + }, + { + "epoch": 0.00027729730948775375, + "grad_norm": 0.3390562832355499, + "learning_rate": 0.00014764607679465777, + "loss": 0.586, + "step": 790 + }, + { + "epoch": 0.00027764831874026993, + "grad_norm": 0.4986645579338074, + "learning_rate": 0.00014757929883138567, + "loss": 0.5592, + "step": 791 + }, + { + "epoch": 0.00027799932799278605, + "grad_norm": 0.3361869156360626, + "learning_rate": 0.00014751252086811354, + "loss": 0.4632, + "step": 792 + }, + { + "epoch": 0.0002783503372453022, + "grad_norm": 0.3726123571395874, + "learning_rate": 0.0001474457429048414, + "loss": 0.4915, + "step": 793 + }, + { + "epoch": 0.00027870134649781835, + "grad_norm": 0.3358845114707947, + "learning_rate": 0.00014737896494156929, + "loss": 0.5593, + "step": 794 + }, + { + "epoch": 0.0002790523557503345, + "grad_norm": 0.30473607778549194, + "learning_rate": 0.00014731218697829716, + "loss": 0.3672, + "step": 795 + }, + { + "epoch": 0.00027940336500285065, + "grad_norm": 0.33929023146629333, + "learning_rate": 0.00014724540901502506, + "loss": 0.5404, + "step": 796 + }, + { + "epoch": 0.0002797543742553668, + "grad_norm": 0.30778205394744873, + "learning_rate": 0.00014717863105175293, + "loss": 0.4379, + "step": 797 + }, + { + "epoch": 0.0002801053835078829, + "grad_norm": 0.286443829536438, + "learning_rate": 0.0001471118530884808, + "loss": 0.5579, + "step": 798 + }, + { + "epoch": 0.0002804563927603991, + "grad_norm": 0.4246799051761627, + "learning_rate": 0.00014704507512520868, + "loss": 0.536, + "step": 799 + }, + { + "epoch": 0.0002808074020129152, + "grad_norm": 0.4085538983345032, + "learning_rate": 0.00014697829716193655, + "loss": 0.5309, + "step": 800 + }, + { + "epoch": 0.00028115841126543137, + "grad_norm": 0.35396453738212585, + "learning_rate": 0.00014691151919866443, + "loss": 0.5307, + "step": 801 + }, + { + "epoch": 0.0002815094205179475, + "grad_norm": 0.45588648319244385, + "learning_rate": 0.00014684474123539233, + "loss": 0.5905, + "step": 802 + }, + { + "epoch": 0.0002818604297704636, + "grad_norm": 0.3353815972805023, + "learning_rate": 0.0001467779632721202, + "loss": 0.612, + "step": 803 + }, + { + "epoch": 0.0002822114390229798, + "grad_norm": 0.4152653217315674, + "learning_rate": 0.0001467111853088481, + "loss": 0.592, + "step": 804 + }, + { + "epoch": 0.0002825624482754959, + "grad_norm": 0.3651511073112488, + "learning_rate": 0.00014664440734557597, + "loss": 0.5909, + "step": 805 + }, + { + "epoch": 0.0002829134575280121, + "grad_norm": 0.3518235385417938, + "learning_rate": 0.00014657762938230385, + "loss": 0.5684, + "step": 806 + }, + { + "epoch": 0.0002832644667805282, + "grad_norm": 0.33562156558036804, + "learning_rate": 0.00014651085141903175, + "loss": 0.5165, + "step": 807 + }, + { + "epoch": 0.00028361547603304434, + "grad_norm": 0.3648052513599396, + "learning_rate": 0.00014644407345575962, + "loss": 0.5451, + "step": 808 + }, + { + "epoch": 0.0002839664852855605, + "grad_norm": 0.44342300295829773, + "learning_rate": 0.0001463772954924875, + "loss": 0.5907, + "step": 809 + }, + { + "epoch": 0.00028431749453807664, + "grad_norm": 0.33331966400146484, + "learning_rate": 0.00014631051752921536, + "loss": 0.4254, + "step": 810 + }, + { + "epoch": 0.0002846685037905928, + "grad_norm": 0.3444873094558716, + "learning_rate": 0.00014624373956594324, + "loss": 0.5201, + "step": 811 + }, + { + "epoch": 0.00028501951304310894, + "grad_norm": 0.4239615201950073, + "learning_rate": 0.00014617696160267114, + "loss": 0.5098, + "step": 812 + }, + { + "epoch": 0.00028537052229562506, + "grad_norm": 0.47895997762680054, + "learning_rate": 0.000146110183639399, + "loss": 0.6243, + "step": 813 + }, + { + "epoch": 0.00028572153154814123, + "grad_norm": 0.47322046756744385, + "learning_rate": 0.00014604340567612688, + "loss": 0.6841, + "step": 814 + }, + { + "epoch": 0.00028607254080065736, + "grad_norm": 0.35017871856689453, + "learning_rate": 0.00014597662771285476, + "loss": 0.5313, + "step": 815 + }, + { + "epoch": 0.00028642355005317353, + "grad_norm": 0.4342300295829773, + "learning_rate": 0.00014590984974958263, + "loss": 0.4363, + "step": 816 + }, + { + "epoch": 0.00028677455930568966, + "grad_norm": 0.2966228723526001, + "learning_rate": 0.0001458430717863105, + "loss": 0.6428, + "step": 817 + }, + { + "epoch": 0.0002871255685582058, + "grad_norm": 0.3320361375808716, + "learning_rate": 0.0001457762938230384, + "loss": 0.5266, + "step": 818 + }, + { + "epoch": 0.00028747657781072195, + "grad_norm": 0.3318590223789215, + "learning_rate": 0.00014570951585976628, + "loss": 0.5676, + "step": 819 + }, + { + "epoch": 0.0002878275870632381, + "grad_norm": 0.38573157787323, + "learning_rate": 0.00014564273789649415, + "loss": 0.7083, + "step": 820 + }, + { + "epoch": 0.00028817859631575425, + "grad_norm": 0.3731164038181305, + "learning_rate": 0.00014557595993322205, + "loss": 0.578, + "step": 821 + }, + { + "epoch": 0.0002885296055682704, + "grad_norm": 0.33610039949417114, + "learning_rate": 0.00014550918196994992, + "loss": 0.5923, + "step": 822 + }, + { + "epoch": 0.0002888806148207865, + "grad_norm": 0.3393179476261139, + "learning_rate": 0.00014544240400667782, + "loss": 0.5162, + "step": 823 + }, + { + "epoch": 0.0002892316240733027, + "grad_norm": 0.35552918910980225, + "learning_rate": 0.0001453756260434057, + "loss": 0.556, + "step": 824 + }, + { + "epoch": 0.0002895826333258188, + "grad_norm": 0.32425832748413086, + "learning_rate": 0.00014530884808013357, + "loss": 0.5157, + "step": 825 + }, + { + "epoch": 0.000289933642578335, + "grad_norm": 0.3353455662727356, + "learning_rate": 0.00014524207011686144, + "loss": 0.483, + "step": 826 + }, + { + "epoch": 0.0002902846518308511, + "grad_norm": 0.46254628896713257, + "learning_rate": 0.00014517529215358932, + "loss": 0.633, + "step": 827 + }, + { + "epoch": 0.0002906356610833672, + "grad_norm": 0.3275732100009918, + "learning_rate": 0.00014510851419031722, + "loss": 0.5502, + "step": 828 + }, + { + "epoch": 0.0002909866703358834, + "grad_norm": 0.3495190441608429, + "learning_rate": 0.0001450417362270451, + "loss": 0.368, + "step": 829 + }, + { + "epoch": 0.0002913376795883995, + "grad_norm": 0.35350501537323, + "learning_rate": 0.00014497495826377296, + "loss": 0.5819, + "step": 830 + }, + { + "epoch": 0.0002916886888409157, + "grad_norm": 0.37886378169059753, + "learning_rate": 0.00014490818030050084, + "loss": 0.5418, + "step": 831 + }, + { + "epoch": 0.0002920396980934318, + "grad_norm": 0.4279928505420685, + "learning_rate": 0.0001448414023372287, + "loss": 0.5199, + "step": 832 + }, + { + "epoch": 0.00029239070734594794, + "grad_norm": 0.33105382323265076, + "learning_rate": 0.00014477462437395658, + "loss": 0.5952, + "step": 833 + }, + { + "epoch": 0.0002927417165984641, + "grad_norm": 0.40114086866378784, + "learning_rate": 0.00014470784641068448, + "loss": 0.4611, + "step": 834 + }, + { + "epoch": 0.00029309272585098024, + "grad_norm": 0.3294037878513336, + "learning_rate": 0.00014464106844741236, + "loss": 0.5562, + "step": 835 + }, + { + "epoch": 0.0002934437351034964, + "grad_norm": 0.3391546607017517, + "learning_rate": 0.00014457429048414023, + "loss": 0.5748, + "step": 836 + }, + { + "epoch": 0.00029379474435601254, + "grad_norm": 0.4093922972679138, + "learning_rate": 0.0001445075125208681, + "loss": 0.4607, + "step": 837 + }, + { + "epoch": 0.00029414575360852866, + "grad_norm": 0.3331819176673889, + "learning_rate": 0.000144440734557596, + "loss": 0.5874, + "step": 838 + }, + { + "epoch": 0.00029449676286104484, + "grad_norm": 0.43205946683883667, + "learning_rate": 0.00014437395659432388, + "loss": 0.6152, + "step": 839 + }, + { + "epoch": 0.00029484777211356096, + "grad_norm": 0.36046868562698364, + "learning_rate": 0.00014430717863105178, + "loss": 0.4781, + "step": 840 + }, + { + "epoch": 0.00029519878136607713, + "grad_norm": 0.35514524579048157, + "learning_rate": 0.00014424040066777965, + "loss": 0.568, + "step": 841 + }, + { + "epoch": 0.00029554979061859326, + "grad_norm": 0.40260326862335205, + "learning_rate": 0.00014417362270450752, + "loss": 0.6075, + "step": 842 + }, + { + "epoch": 0.0002959007998711094, + "grad_norm": 0.3102671205997467, + "learning_rate": 0.0001441068447412354, + "loss": 0.4927, + "step": 843 + }, + { + "epoch": 0.00029625180912362556, + "grad_norm": 0.30940982699394226, + "learning_rate": 0.0001440400667779633, + "loss": 0.5549, + "step": 844 + }, + { + "epoch": 0.0002966028183761417, + "grad_norm": 0.3652762174606323, + "learning_rate": 0.00014397328881469117, + "loss": 0.6085, + "step": 845 + }, + { + "epoch": 0.00029695382762865786, + "grad_norm": 0.43056777119636536, + "learning_rate": 0.00014390651085141904, + "loss": 0.494, + "step": 846 + }, + { + "epoch": 0.000297304836881174, + "grad_norm": 0.3112967014312744, + "learning_rate": 0.00014383973288814692, + "loss": 0.5141, + "step": 847 + }, + { + "epoch": 0.0002976558461336901, + "grad_norm": 0.36729326844215393, + "learning_rate": 0.0001437729549248748, + "loss": 0.5435, + "step": 848 + }, + { + "epoch": 0.0002980068553862063, + "grad_norm": 0.3128114938735962, + "learning_rate": 0.00014370617696160266, + "loss": 0.5419, + "step": 849 + }, + { + "epoch": 0.0002983578646387224, + "grad_norm": 0.4030589163303375, + "learning_rate": 0.00014363939899833056, + "loss": 0.5959, + "step": 850 + }, + { + "epoch": 0.0002987088738912386, + "grad_norm": 0.39571288228034973, + "learning_rate": 0.00014357262103505844, + "loss": 0.6798, + "step": 851 + }, + { + "epoch": 0.0002990598831437547, + "grad_norm": 0.3388408422470093, + "learning_rate": 0.0001435058430717863, + "loss": 0.4887, + "step": 852 + }, + { + "epoch": 0.0002994108923962708, + "grad_norm": 0.39615562558174133, + "learning_rate": 0.00014343906510851418, + "loss": 0.5654, + "step": 853 + }, + { + "epoch": 0.000299761901648787, + "grad_norm": 0.3967401683330536, + "learning_rate": 0.00014337228714524205, + "loss": 0.6192, + "step": 854 + }, + { + "epoch": 0.0003001129109013031, + "grad_norm": 0.5597772002220154, + "learning_rate": 0.00014330550918196995, + "loss": 0.5808, + "step": 855 + }, + { + "epoch": 0.0003004639201538193, + "grad_norm": 0.36231061816215515, + "learning_rate": 0.00014323873121869783, + "loss": 0.4936, + "step": 856 + }, + { + "epoch": 0.0003008149294063354, + "grad_norm": 0.3775942027568817, + "learning_rate": 0.00014317195325542573, + "loss": 0.5706, + "step": 857 + }, + { + "epoch": 0.00030116593865885154, + "grad_norm": 0.4139408767223358, + "learning_rate": 0.0001431051752921536, + "loss": 0.5784, + "step": 858 + }, + { + "epoch": 0.0003015169479113677, + "grad_norm": 0.4101429879665375, + "learning_rate": 0.00014303839732888147, + "loss": 0.5937, + "step": 859 + }, + { + "epoch": 0.00030186795716388384, + "grad_norm": 0.5272162556648254, + "learning_rate": 0.00014297161936560937, + "loss": 0.5244, + "step": 860 + }, + { + "epoch": 0.0003022189664164, + "grad_norm": 0.3587292730808258, + "learning_rate": 0.00014290484140233725, + "loss": 0.6333, + "step": 861 + }, + { + "epoch": 0.00030256997566891614, + "grad_norm": 0.3284890353679657, + "learning_rate": 0.00014283806343906512, + "loss": 0.5414, + "step": 862 + }, + { + "epoch": 0.00030292098492143226, + "grad_norm": 0.414974182844162, + "learning_rate": 0.000142771285475793, + "loss": 0.6116, + "step": 863 + }, + { + "epoch": 0.00030327199417394844, + "grad_norm": 0.33619245886802673, + "learning_rate": 0.00014270450751252087, + "loss": 0.5506, + "step": 864 + }, + { + "epoch": 0.00030362300342646456, + "grad_norm": 0.45475640892982483, + "learning_rate": 0.00014263772954924874, + "loss": 0.6347, + "step": 865 + }, + { + "epoch": 0.00030397401267898074, + "grad_norm": 0.2695920765399933, + "learning_rate": 0.00014257095158597664, + "loss": 0.4529, + "step": 866 + }, + { + "epoch": 0.00030432502193149686, + "grad_norm": 0.3314480781555176, + "learning_rate": 0.00014250417362270451, + "loss": 0.5812, + "step": 867 + }, + { + "epoch": 0.000304676031184013, + "grad_norm": 0.31949582695961, + "learning_rate": 0.0001424373956594324, + "loss": 0.5213, + "step": 868 + }, + { + "epoch": 0.00030502704043652916, + "grad_norm": 0.34049752354621887, + "learning_rate": 0.00014237061769616026, + "loss": 0.4645, + "step": 869 + }, + { + "epoch": 0.0003053780496890453, + "grad_norm": 0.4304719567298889, + "learning_rate": 0.00014230383973288813, + "loss": 0.5065, + "step": 870 + }, + { + "epoch": 0.00030572905894156146, + "grad_norm": 0.32379043102264404, + "learning_rate": 0.00014223706176961603, + "loss": 0.553, + "step": 871 + }, + { + "epoch": 0.0003060800681940776, + "grad_norm": 0.33285439014434814, + "learning_rate": 0.0001421702838063439, + "loss": 0.5092, + "step": 872 + }, + { + "epoch": 0.0003064310774465937, + "grad_norm": 0.336795449256897, + "learning_rate": 0.00014210350584307178, + "loss": 0.4967, + "step": 873 + }, + { + "epoch": 0.0003067820866991099, + "grad_norm": 0.34653040766716003, + "learning_rate": 0.00014203672787979968, + "loss": 0.5353, + "step": 874 + }, + { + "epoch": 0.000307133095951626, + "grad_norm": 0.3352467715740204, + "learning_rate": 0.00014196994991652755, + "loss": 0.5594, + "step": 875 + }, + { + "epoch": 0.0003074841052041422, + "grad_norm": 0.38723453879356384, + "learning_rate": 0.00014190317195325545, + "loss": 0.5897, + "step": 876 + }, + { + "epoch": 0.0003078351144566583, + "grad_norm": 0.3987238109111786, + "learning_rate": 0.00014183639398998333, + "loss": 0.4647, + "step": 877 + }, + { + "epoch": 0.0003081861237091744, + "grad_norm": 0.3452693223953247, + "learning_rate": 0.0001417696160267112, + "loss": 0.5687, + "step": 878 + }, + { + "epoch": 0.0003085371329616906, + "grad_norm": 0.3561328649520874, + "learning_rate": 0.00014170283806343907, + "loss": 0.5845, + "step": 879 + }, + { + "epoch": 0.0003088881422142067, + "grad_norm": 0.29658418893814087, + "learning_rate": 0.00014163606010016695, + "loss": 0.5202, + "step": 880 + }, + { + "epoch": 0.0003092391514667229, + "grad_norm": 0.3908213973045349, + "learning_rate": 0.00014156928213689482, + "loss": 0.4439, + "step": 881 + }, + { + "epoch": 0.000309590160719239, + "grad_norm": 0.35816919803619385, + "learning_rate": 0.00014150250417362272, + "loss": 0.5384, + "step": 882 + }, + { + "epoch": 0.00030994116997175514, + "grad_norm": 0.3681255877017975, + "learning_rate": 0.0001414357262103506, + "loss": 0.5999, + "step": 883 + }, + { + "epoch": 0.0003102921792242713, + "grad_norm": 0.31137388944625854, + "learning_rate": 0.00014136894824707847, + "loss": 0.4495, + "step": 884 + }, + { + "epoch": 0.00031064318847678744, + "grad_norm": 0.2831423878669739, + "learning_rate": 0.00014130217028380634, + "loss": 0.4576, + "step": 885 + }, + { + "epoch": 0.0003109941977293036, + "grad_norm": 0.25953516364097595, + "learning_rate": 0.0001412353923205342, + "loss": 0.5606, + "step": 886 + }, + { + "epoch": 0.00031134520698181974, + "grad_norm": 0.31105297803878784, + "learning_rate": 0.0001411686143572621, + "loss": 0.5986, + "step": 887 + }, + { + "epoch": 0.00031169621623433586, + "grad_norm": 0.35177484154701233, + "learning_rate": 0.00014110183639398999, + "loss": 0.3394, + "step": 888 + }, + { + "epoch": 0.00031204722548685204, + "grad_norm": 0.373470276594162, + "learning_rate": 0.00014103505843071786, + "loss": 0.5862, + "step": 889 + }, + { + "epoch": 0.00031239823473936816, + "grad_norm": 0.37227189540863037, + "learning_rate": 0.00014096828046744576, + "loss": 0.4677, + "step": 890 + }, + { + "epoch": 0.00031274924399188434, + "grad_norm": 0.3799666464328766, + "learning_rate": 0.00014090150250417363, + "loss": 0.5255, + "step": 891 + }, + { + "epoch": 0.00031310025324440046, + "grad_norm": 0.3630129098892212, + "learning_rate": 0.00014083472454090153, + "loss": 0.5111, + "step": 892 + }, + { + "epoch": 0.0003134512624969166, + "grad_norm": 0.5131457448005676, + "learning_rate": 0.0001407679465776294, + "loss": 0.5207, + "step": 893 + }, + { + "epoch": 0.00031380227174943276, + "grad_norm": 0.3759867548942566, + "learning_rate": 0.00014070116861435728, + "loss": 0.6678, + "step": 894 + }, + { + "epoch": 0.0003141532810019489, + "grad_norm": 0.5577414631843567, + "learning_rate": 0.00014063439065108515, + "loss": 0.62, + "step": 895 + }, + { + "epoch": 0.00031450429025446506, + "grad_norm": 0.2789120376110077, + "learning_rate": 0.00014056761268781303, + "loss": 0.4204, + "step": 896 + }, + { + "epoch": 0.0003148552995069812, + "grad_norm": 0.2897239327430725, + "learning_rate": 0.0001405008347245409, + "loss": 0.432, + "step": 897 + }, + { + "epoch": 0.0003152063087594973, + "grad_norm": 0.3552323579788208, + "learning_rate": 0.0001404340567612688, + "loss": 0.5512, + "step": 898 + }, + { + "epoch": 0.0003155573180120135, + "grad_norm": 0.49963894486427307, + "learning_rate": 0.00014036727879799667, + "loss": 0.5868, + "step": 899 + }, + { + "epoch": 0.0003159083272645296, + "grad_norm": 0.37479934096336365, + "learning_rate": 0.00014030050083472454, + "loss": 0.6682, + "step": 900 + }, + { + "epoch": 0.0003162593365170458, + "grad_norm": 0.3415648639202118, + "learning_rate": 0.00014023372287145242, + "loss": 0.5301, + "step": 901 + }, + { + "epoch": 0.0003166103457695619, + "grad_norm": 0.37530943751335144, + "learning_rate": 0.0001401669449081803, + "loss": 0.5409, + "step": 902 + }, + { + "epoch": 0.000316961355022078, + "grad_norm": 0.37487658858299255, + "learning_rate": 0.0001401001669449082, + "loss": 0.5976, + "step": 903 + }, + { + "epoch": 0.0003173123642745942, + "grad_norm": 0.37174728512763977, + "learning_rate": 0.00014003338898163606, + "loss": 0.5933, + "step": 904 + }, + { + "epoch": 0.0003176633735271103, + "grad_norm": 0.491584450006485, + "learning_rate": 0.00013996661101836394, + "loss": 0.5112, + "step": 905 + }, + { + "epoch": 0.0003180143827796265, + "grad_norm": 0.38381487131118774, + "learning_rate": 0.0001398998330550918, + "loss": 0.6486, + "step": 906 + }, + { + "epoch": 0.0003183653920321426, + "grad_norm": 0.2867659330368042, + "learning_rate": 0.0001398330550918197, + "loss": 0.5033, + "step": 907 + }, + { + "epoch": 0.00031871640128465874, + "grad_norm": 0.3146355450153351, + "learning_rate": 0.00013976627712854758, + "loss": 0.5878, + "step": 908 + }, + { + "epoch": 0.0003190674105371749, + "grad_norm": 0.3454856276512146, + "learning_rate": 0.00013969949916527548, + "loss": 0.4751, + "step": 909 + }, + { + "epoch": 0.00031941841978969104, + "grad_norm": 0.32241204380989075, + "learning_rate": 0.00013963272120200336, + "loss": 0.6378, + "step": 910 + }, + { + "epoch": 0.0003197694290422072, + "grad_norm": 0.33703315258026123, + "learning_rate": 0.00013956594323873123, + "loss": 0.4634, + "step": 911 + }, + { + "epoch": 0.00032012043829472334, + "grad_norm": 0.3781648576259613, + "learning_rate": 0.0001394991652754591, + "loss": 0.5218, + "step": 912 + }, + { + "epoch": 0.00032047144754723946, + "grad_norm": 0.4124391973018646, + "learning_rate": 0.00013943238731218698, + "loss": 0.4958, + "step": 913 + }, + { + "epoch": 0.00032082245679975564, + "grad_norm": 0.3970220685005188, + "learning_rate": 0.00013936560934891488, + "loss": 0.5624, + "step": 914 + }, + { + "epoch": 0.00032117346605227176, + "grad_norm": 0.43682703375816345, + "learning_rate": 0.00013929883138564275, + "loss": 0.544, + "step": 915 + }, + { + "epoch": 0.00032152447530478794, + "grad_norm": 0.3476586639881134, + "learning_rate": 0.00013923205342237062, + "loss": 0.4418, + "step": 916 + }, + { + "epoch": 0.00032187548455730406, + "grad_norm": 0.36963552236557007, + "learning_rate": 0.0001391652754590985, + "loss": 0.5946, + "step": 917 + }, + { + "epoch": 0.0003222264938098202, + "grad_norm": 0.3445582985877991, + "learning_rate": 0.00013909849749582637, + "loss": 0.5879, + "step": 918 + }, + { + "epoch": 0.00032257750306233636, + "grad_norm": 0.39813530445098877, + "learning_rate": 0.00013903171953255427, + "loss": 0.5759, + "step": 919 + }, + { + "epoch": 0.0003229285123148525, + "grad_norm": 0.3314265012741089, + "learning_rate": 0.00013896494156928214, + "loss": 0.6165, + "step": 920 + }, + { + "epoch": 0.00032327952156736866, + "grad_norm": 0.4094330072402954, + "learning_rate": 0.00013889816360601002, + "loss": 0.5787, + "step": 921 + }, + { + "epoch": 0.0003236305308198848, + "grad_norm": 0.36821484565734863, + "learning_rate": 0.0001388313856427379, + "loss": 0.5303, + "step": 922 + }, + { + "epoch": 0.0003239815400724009, + "grad_norm": 0.3517453968524933, + "learning_rate": 0.00013876460767946576, + "loss": 0.4586, + "step": 923 + }, + { + "epoch": 0.0003243325493249171, + "grad_norm": 0.2959018647670746, + "learning_rate": 0.00013869782971619366, + "loss": 0.5225, + "step": 924 + }, + { + "epoch": 0.0003246835585774332, + "grad_norm": 0.3286895751953125, + "learning_rate": 0.00013863105175292154, + "loss": 0.5353, + "step": 925 + }, + { + "epoch": 0.0003250345678299494, + "grad_norm": 0.3328275680541992, + "learning_rate": 0.00013856427378964944, + "loss": 0.5915, + "step": 926 + }, + { + "epoch": 0.0003253855770824655, + "grad_norm": 0.3400813937187195, + "learning_rate": 0.0001384974958263773, + "loss": 0.4598, + "step": 927 + }, + { + "epoch": 0.0003257365863349816, + "grad_norm": 0.2876541018486023, + "learning_rate": 0.00013843071786310518, + "loss": 0.4835, + "step": 928 + }, + { + "epoch": 0.0003260875955874978, + "grad_norm": 0.3401765525341034, + "learning_rate": 0.00013836393989983308, + "loss": 0.56, + "step": 929 + }, + { + "epoch": 0.0003264386048400139, + "grad_norm": 0.34506598114967346, + "learning_rate": 0.00013829716193656096, + "loss": 0.6234, + "step": 930 + }, + { + "epoch": 0.0003267896140925301, + "grad_norm": 0.33732855319976807, + "learning_rate": 0.00013823038397328883, + "loss": 0.5686, + "step": 931 + }, + { + "epoch": 0.0003271406233450462, + "grad_norm": 0.34300100803375244, + "learning_rate": 0.0001381636060100167, + "loss": 0.6091, + "step": 932 + }, + { + "epoch": 0.00032749163259756235, + "grad_norm": 0.30349200963974, + "learning_rate": 0.00013809682804674458, + "loss": 0.4836, + "step": 933 + }, + { + "epoch": 0.0003278426418500785, + "grad_norm": 0.35742175579071045, + "learning_rate": 0.00013803005008347245, + "loss": 0.6443, + "step": 934 + }, + { + "epoch": 0.00032819365110259464, + "grad_norm": 0.33582496643066406, + "learning_rate": 0.00013796327212020035, + "loss": 0.6361, + "step": 935 + }, + { + "epoch": 0.0003285446603551108, + "grad_norm": 0.33403804898262024, + "learning_rate": 0.00013789649415692822, + "loss": 0.5911, + "step": 936 + }, + { + "epoch": 0.00032889566960762694, + "grad_norm": 0.4263191521167755, + "learning_rate": 0.0001378297161936561, + "loss": 0.5243, + "step": 937 + }, + { + "epoch": 0.00032924667886014307, + "grad_norm": 0.31543296575546265, + "learning_rate": 0.00013776293823038397, + "loss": 0.554, + "step": 938 + }, + { + "epoch": 0.00032959768811265924, + "grad_norm": 0.38975203037261963, + "learning_rate": 0.00013769616026711184, + "loss": 0.5358, + "step": 939 + }, + { + "epoch": 0.00032994869736517536, + "grad_norm": 0.3175157904624939, + "learning_rate": 0.00013762938230383974, + "loss": 0.5385, + "step": 940 + }, + { + "epoch": 0.00033029970661769154, + "grad_norm": 0.32753151655197144, + "learning_rate": 0.00013756260434056762, + "loss": 0.5191, + "step": 941 + }, + { + "epoch": 0.00033065071587020766, + "grad_norm": 0.2516227066516876, + "learning_rate": 0.0001374958263772955, + "loss": 0.3496, + "step": 942 + }, + { + "epoch": 0.0003310017251227238, + "grad_norm": 0.275806188583374, + "learning_rate": 0.0001374290484140234, + "loss": 0.4197, + "step": 943 + }, + { + "epoch": 0.00033135273437523996, + "grad_norm": 0.30234864354133606, + "learning_rate": 0.00013736227045075126, + "loss": 0.4909, + "step": 944 + }, + { + "epoch": 0.0003317037436277561, + "grad_norm": 0.32561683654785156, + "learning_rate": 0.00013729549248747916, + "loss": 0.5865, + "step": 945 + }, + { + "epoch": 0.00033205475288027226, + "grad_norm": 0.32075145840644836, + "learning_rate": 0.00013722871452420704, + "loss": 0.5957, + "step": 946 + }, + { + "epoch": 0.0003324057621327884, + "grad_norm": 0.3077705204486847, + "learning_rate": 0.0001371619365609349, + "loss": 0.6026, + "step": 947 + }, + { + "epoch": 0.0003327567713853045, + "grad_norm": 0.3092177212238312, + "learning_rate": 0.00013709515859766278, + "loss": 0.553, + "step": 948 + }, + { + "epoch": 0.0003331077806378207, + "grad_norm": 0.3611501157283783, + "learning_rate": 0.00013702838063439065, + "loss": 0.5707, + "step": 949 + }, + { + "epoch": 0.0003334587898903368, + "grad_norm": 0.3343827724456787, + "learning_rate": 0.00013696160267111853, + "loss": 0.5626, + "step": 950 + }, + { + "epoch": 0.000333809799142853, + "grad_norm": 0.3330281376838684, + "learning_rate": 0.00013689482470784643, + "loss": 0.6353, + "step": 951 + }, + { + "epoch": 0.0003341608083953691, + "grad_norm": 0.4045816957950592, + "learning_rate": 0.0001368280467445743, + "loss": 0.5781, + "step": 952 + }, + { + "epoch": 0.0003345118176478852, + "grad_norm": 0.3618166446685791, + "learning_rate": 0.00013676126878130217, + "loss": 0.6702, + "step": 953 + }, + { + "epoch": 0.0003348628269004014, + "grad_norm": 0.2836553752422333, + "learning_rate": 0.00013669449081803005, + "loss": 0.4371, + "step": 954 + }, + { + "epoch": 0.0003352138361529175, + "grad_norm": 0.3100498914718628, + "learning_rate": 0.00013662771285475792, + "loss": 0.5184, + "step": 955 + }, + { + "epoch": 0.0003355648454054337, + "grad_norm": 0.34877723455429077, + "learning_rate": 0.00013656093489148582, + "loss": 0.4778, + "step": 956 + }, + { + "epoch": 0.0003359158546579498, + "grad_norm": 0.27756938338279724, + "learning_rate": 0.0001364941569282137, + "loss": 0.4314, + "step": 957 + }, + { + "epoch": 0.00033626686391046595, + "grad_norm": 0.36129051446914673, + "learning_rate": 0.00013642737896494157, + "loss": 0.5837, + "step": 958 + }, + { + "epoch": 0.0003366178731629821, + "grad_norm": 0.35625776648521423, + "learning_rate": 0.00013636060100166944, + "loss": 0.5579, + "step": 959 + }, + { + "epoch": 0.00033696888241549825, + "grad_norm": 0.3735104501247406, + "learning_rate": 0.00013629382303839734, + "loss": 0.5283, + "step": 960 + }, + { + "epoch": 0.0003373198916680144, + "grad_norm": 0.34185606241226196, + "learning_rate": 0.00013622704507512521, + "loss": 0.5669, + "step": 961 + }, + { + "epoch": 0.00033767090092053054, + "grad_norm": 0.29324260354042053, + "learning_rate": 0.00013616026711185311, + "loss": 0.4468, + "step": 962 + }, + { + "epoch": 0.00033802191017304667, + "grad_norm": 0.3439052700996399, + "learning_rate": 0.000136093489148581, + "loss": 0.5196, + "step": 963 + }, + { + "epoch": 0.00033837291942556284, + "grad_norm": 0.3536570370197296, + "learning_rate": 0.00013602671118530886, + "loss": 0.5251, + "step": 964 + }, + { + "epoch": 0.00033872392867807897, + "grad_norm": 0.4759911298751831, + "learning_rate": 0.00013595993322203673, + "loss": 0.7017, + "step": 965 + }, + { + "epoch": 0.00033907493793059514, + "grad_norm": 0.2958674728870392, + "learning_rate": 0.0001358931552587646, + "loss": 0.4936, + "step": 966 + }, + { + "epoch": 0.00033942594718311126, + "grad_norm": 0.32770562171936035, + "learning_rate": 0.0001358263772954925, + "loss": 0.5741, + "step": 967 + }, + { + "epoch": 0.0003397769564356274, + "grad_norm": 0.35697153210639954, + "learning_rate": 0.00013575959933222038, + "loss": 0.428, + "step": 968 + }, + { + "epoch": 0.00034012796568814356, + "grad_norm": 0.3409043252468109, + "learning_rate": 0.00013569282136894825, + "loss": 0.6142, + "step": 969 + }, + { + "epoch": 0.0003404789749406597, + "grad_norm": 0.47055551409721375, + "learning_rate": 0.00013562604340567613, + "loss": 0.463, + "step": 970 + }, + { + "epoch": 0.00034082998419317586, + "grad_norm": 0.38270413875579834, + "learning_rate": 0.000135559265442404, + "loss": 0.462, + "step": 971 + }, + { + "epoch": 0.000341180993445692, + "grad_norm": 0.26209867000579834, + "learning_rate": 0.0001354924874791319, + "loss": 0.5341, + "step": 972 + }, + { + "epoch": 0.0003415320026982081, + "grad_norm": 0.37498748302459717, + "learning_rate": 0.00013542570951585977, + "loss": 0.5196, + "step": 973 + }, + { + "epoch": 0.0003418830119507243, + "grad_norm": 0.36789608001708984, + "learning_rate": 0.00013535893155258765, + "loss": 0.4723, + "step": 974 + }, + { + "epoch": 0.0003422340212032404, + "grad_norm": 0.33915975689888, + "learning_rate": 0.00013529215358931552, + "loss": 0.5511, + "step": 975 + }, + { + "epoch": 0.0003425850304557566, + "grad_norm": 0.43045058846473694, + "learning_rate": 0.0001352253756260434, + "loss": 0.5667, + "step": 976 + }, + { + "epoch": 0.0003429360397082727, + "grad_norm": 0.2948949933052063, + "learning_rate": 0.0001351585976627713, + "loss": 0.4804, + "step": 977 + }, + { + "epoch": 0.00034328704896078883, + "grad_norm": 0.3249470889568329, + "learning_rate": 0.00013509181969949917, + "loss": 0.6041, + "step": 978 + }, + { + "epoch": 0.000343638058213305, + "grad_norm": 0.2865908741950989, + "learning_rate": 0.00013502504173622707, + "loss": 0.5617, + "step": 979 + }, + { + "epoch": 0.0003439890674658211, + "grad_norm": 0.3190818428993225, + "learning_rate": 0.00013495826377295494, + "loss": 0.4902, + "step": 980 + }, + { + "epoch": 0.00034434007671833725, + "grad_norm": 0.3111664950847626, + "learning_rate": 0.0001348914858096828, + "loss": 0.5504, + "step": 981 + }, + { + "epoch": 0.0003446910859708534, + "grad_norm": 0.3255857229232788, + "learning_rate": 0.00013482470784641069, + "loss": 0.5592, + "step": 982 + }, + { + "epoch": 0.00034504209522336955, + "grad_norm": 0.30806589126586914, + "learning_rate": 0.00013475792988313859, + "loss": 0.5567, + "step": 983 + }, + { + "epoch": 0.0003453931044758857, + "grad_norm": 0.33785945177078247, + "learning_rate": 0.00013469115191986646, + "loss": 0.5881, + "step": 984 + }, + { + "epoch": 0.00034574411372840185, + "grad_norm": 0.34626781940460205, + "learning_rate": 0.00013462437395659433, + "loss": 0.578, + "step": 985 + }, + { + "epoch": 0.00034609512298091797, + "grad_norm": 0.367034912109375, + "learning_rate": 0.0001345575959933222, + "loss": 0.5893, + "step": 986 + }, + { + "epoch": 0.00034644613223343415, + "grad_norm": 0.37824952602386475, + "learning_rate": 0.00013449081803005008, + "loss": 0.5681, + "step": 987 + }, + { + "epoch": 0.00034679714148595027, + "grad_norm": 0.4054035544395447, + "learning_rate": 0.00013442404006677798, + "loss": 0.6108, + "step": 988 + }, + { + "epoch": 0.00034714815073846645, + "grad_norm": 0.4374067485332489, + "learning_rate": 0.00013435726210350585, + "loss": 0.6002, + "step": 989 + }, + { + "epoch": 0.00034749915999098257, + "grad_norm": 0.3554278016090393, + "learning_rate": 0.00013429048414023373, + "loss": 0.6444, + "step": 990 + }, + { + "epoch": 0.0003478501692434987, + "grad_norm": 0.3428646922111511, + "learning_rate": 0.0001342237061769616, + "loss": 0.6527, + "step": 991 + }, + { + "epoch": 0.00034820117849601487, + "grad_norm": 0.25603657960891724, + "learning_rate": 0.00013415692821368947, + "loss": 0.5244, + "step": 992 + }, + { + "epoch": 0.000348552187748531, + "grad_norm": 0.35237595438957214, + "learning_rate": 0.00013409015025041737, + "loss": 0.557, + "step": 993 + }, + { + "epoch": 0.00034890319700104717, + "grad_norm": 0.33666110038757324, + "learning_rate": 0.00013402337228714524, + "loss": 0.5674, + "step": 994 + }, + { + "epoch": 0.0003492542062535633, + "grad_norm": 0.30283182859420776, + "learning_rate": 0.00013395659432387312, + "loss": 0.6081, + "step": 995 + }, + { + "epoch": 0.0003496052155060794, + "grad_norm": 0.30893146991729736, + "learning_rate": 0.00013388981636060102, + "loss": 0.6089, + "step": 996 + }, + { + "epoch": 0.0003499562247585956, + "grad_norm": 0.2617473304271698, + "learning_rate": 0.0001338230383973289, + "loss": 0.6104, + "step": 997 + }, + { + "epoch": 0.0003503072340111117, + "grad_norm": 0.29493093490600586, + "learning_rate": 0.00013375626043405676, + "loss": 0.5047, + "step": 998 + }, + { + "epoch": 0.0003506582432636279, + "grad_norm": 0.3991663157939911, + "learning_rate": 0.00013368948247078466, + "loss": 0.5137, + "step": 999 + }, + { + "epoch": 0.000351009252516144, + "grad_norm": 0.31760329008102417, + "learning_rate": 0.00013362270450751254, + "loss": 0.4371, + "step": 1000 + }, + { + "epoch": 0.00035136026176866013, + "grad_norm": 0.35144907236099243, + "learning_rate": 0.0001335559265442404, + "loss": 0.5085, + "step": 1001 + }, + { + "epoch": 0.0003517112710211763, + "grad_norm": 0.3597724735736847, + "learning_rate": 0.00013348914858096828, + "loss": 0.593, + "step": 1002 + }, + { + "epoch": 0.00035206228027369243, + "grad_norm": 0.33647072315216064, + "learning_rate": 0.00013342237061769616, + "loss": 0.6011, + "step": 1003 + }, + { + "epoch": 0.0003524132895262086, + "grad_norm": 0.3377489745616913, + "learning_rate": 0.00013335559265442406, + "loss": 0.6285, + "step": 1004 + }, + { + "epoch": 0.00035276429877872473, + "grad_norm": 0.3210775852203369, + "learning_rate": 0.00013328881469115193, + "loss": 0.5214, + "step": 1005 + }, + { + "epoch": 0.00035311530803124085, + "grad_norm": 0.33832573890686035, + "learning_rate": 0.0001332220367278798, + "loss": 0.5788, + "step": 1006 + }, + { + "epoch": 0.00035346631728375703, + "grad_norm": 0.3025464117527008, + "learning_rate": 0.00013315525876460768, + "loss": 0.3762, + "step": 1007 + }, + { + "epoch": 0.00035381732653627315, + "grad_norm": 0.33917921781539917, + "learning_rate": 0.00013308848080133555, + "loss": 0.5816, + "step": 1008 + }, + { + "epoch": 0.0003541683357887893, + "grad_norm": 0.3070494830608368, + "learning_rate": 0.00013302170283806345, + "loss": 0.522, + "step": 1009 + }, + { + "epoch": 0.00035451934504130545, + "grad_norm": 0.31389573216438293, + "learning_rate": 0.00013295492487479132, + "loss": 0.5966, + "step": 1010 + }, + { + "epoch": 0.00035487035429382157, + "grad_norm": 0.33663564920425415, + "learning_rate": 0.0001328881469115192, + "loss": 0.5857, + "step": 1011 + }, + { + "epoch": 0.00035522136354633775, + "grad_norm": 0.3280203640460968, + "learning_rate": 0.00013282136894824707, + "loss": 0.562, + "step": 1012 + }, + { + "epoch": 0.00035557237279885387, + "grad_norm": 0.3307760953903198, + "learning_rate": 0.00013275459098497497, + "loss": 0.6258, + "step": 1013 + }, + { + "epoch": 0.00035592338205137005, + "grad_norm": 0.34378358721733093, + "learning_rate": 0.00013268781302170284, + "loss": 0.5026, + "step": 1014 + }, + { + "epoch": 0.00035627439130388617, + "grad_norm": 0.32818603515625, + "learning_rate": 0.00013262103505843074, + "loss": 0.513, + "step": 1015 + }, + { + "epoch": 0.0003566254005564023, + "grad_norm": 0.3015523850917816, + "learning_rate": 0.00013255425709515862, + "loss": 0.5448, + "step": 1016 + }, + { + "epoch": 0.00035697640980891847, + "grad_norm": 0.2927173674106598, + "learning_rate": 0.0001324874791318865, + "loss": 0.6565, + "step": 1017 + }, + { + "epoch": 0.0003573274190614346, + "grad_norm": 0.3502102196216583, + "learning_rate": 0.00013242070116861436, + "loss": 0.6235, + "step": 1018 + }, + { + "epoch": 0.00035767842831395077, + "grad_norm": 0.32151371240615845, + "learning_rate": 0.00013235392320534224, + "loss": 0.5613, + "step": 1019 + }, + { + "epoch": 0.0003580294375664669, + "grad_norm": 0.31253233551979065, + "learning_rate": 0.00013228714524207014, + "loss": 0.4744, + "step": 1020 + }, + { + "epoch": 0.000358380446818983, + "grad_norm": 0.2831304669380188, + "learning_rate": 0.000132220367278798, + "loss": 0.5385, + "step": 1021 + }, + { + "epoch": 0.0003587314560714992, + "grad_norm": 0.32526761293411255, + "learning_rate": 0.00013215358931552588, + "loss": 0.6316, + "step": 1022 + }, + { + "epoch": 0.0003590824653240153, + "grad_norm": 0.3305005729198456, + "learning_rate": 0.00013208681135225376, + "loss": 0.5287, + "step": 1023 + }, + { + "epoch": 0.0003594334745765315, + "grad_norm": 0.29515331983566284, + "learning_rate": 0.00013202003338898163, + "loss": 0.5478, + "step": 1024 + }, + { + "epoch": 0.0003597844838290476, + "grad_norm": 0.32527396082878113, + "learning_rate": 0.00013195325542570953, + "loss": 0.6309, + "step": 1025 + }, + { + "epoch": 0.00036013549308156373, + "grad_norm": 0.3407800793647766, + "learning_rate": 0.0001318864774624374, + "loss": 0.5958, + "step": 1026 + }, + { + "epoch": 0.0003604865023340799, + "grad_norm": 0.40766170620918274, + "learning_rate": 0.00013181969949916528, + "loss": 0.5281, + "step": 1027 + }, + { + "epoch": 0.00036083751158659603, + "grad_norm": 0.3853365480899811, + "learning_rate": 0.00013175292153589315, + "loss": 0.6349, + "step": 1028 + }, + { + "epoch": 0.0003611885208391122, + "grad_norm": 0.2854768633842468, + "learning_rate": 0.00013168614357262102, + "loss": 0.4515, + "step": 1029 + }, + { + "epoch": 0.00036153953009162833, + "grad_norm": 0.3713400065898895, + "learning_rate": 0.00013161936560934892, + "loss": 0.5256, + "step": 1030 + }, + { + "epoch": 0.00036189053934414445, + "grad_norm": 0.3738803565502167, + "learning_rate": 0.0001315525876460768, + "loss": 0.647, + "step": 1031 + }, + { + "epoch": 0.00036224154859666063, + "grad_norm": 0.3904534578323364, + "learning_rate": 0.0001314858096828047, + "loss": 0.6047, + "step": 1032 + }, + { + "epoch": 0.00036259255784917675, + "grad_norm": 0.3647315204143524, + "learning_rate": 0.00013141903171953257, + "loss": 0.5027, + "step": 1033 + }, + { + "epoch": 0.00036294356710169293, + "grad_norm": 0.3410654366016388, + "learning_rate": 0.00013135225375626044, + "loss": 0.6187, + "step": 1034 + }, + { + "epoch": 0.00036329457635420905, + "grad_norm": 0.3227837383747101, + "learning_rate": 0.00013128547579298832, + "loss": 0.4749, + "step": 1035 + }, + { + "epoch": 0.00036364558560672517, + "grad_norm": 0.2792038917541504, + "learning_rate": 0.00013121869782971622, + "loss": 0.4981, + "step": 1036 + }, + { + "epoch": 0.00036399659485924135, + "grad_norm": 0.339101642370224, + "learning_rate": 0.0001311519198664441, + "loss": 0.5875, + "step": 1037 + }, + { + "epoch": 0.00036434760411175747, + "grad_norm": 0.369004487991333, + "learning_rate": 0.00013108514190317196, + "loss": 0.4854, + "step": 1038 + }, + { + "epoch": 0.00036469861336427365, + "grad_norm": 0.39061155915260315, + "learning_rate": 0.00013101836393989983, + "loss": 0.5887, + "step": 1039 + }, + { + "epoch": 0.00036504962261678977, + "grad_norm": 0.3913773000240326, + "learning_rate": 0.0001309515859766277, + "loss": 0.5388, + "step": 1040 + }, + { + "epoch": 0.0003654006318693059, + "grad_norm": 0.27972474694252014, + "learning_rate": 0.0001308848080133556, + "loss": 0.3841, + "step": 1041 + }, + { + "epoch": 0.00036575164112182207, + "grad_norm": 0.3185168504714966, + "learning_rate": 0.00013081803005008348, + "loss": 0.4955, + "step": 1042 + }, + { + "epoch": 0.0003661026503743382, + "grad_norm": 0.6088166236877441, + "learning_rate": 0.00013075125208681135, + "loss": 0.5242, + "step": 1043 + }, + { + "epoch": 0.00036645365962685437, + "grad_norm": 0.4608970582485199, + "learning_rate": 0.00013068447412353923, + "loss": 0.5375, + "step": 1044 + }, + { + "epoch": 0.0003668046688793705, + "grad_norm": 0.38970229029655457, + "learning_rate": 0.0001306176961602671, + "loss": 0.5227, + "step": 1045 + }, + { + "epoch": 0.0003671556781318866, + "grad_norm": 0.3537042438983917, + "learning_rate": 0.00013055091819699497, + "loss": 0.5022, + "step": 1046 + }, + { + "epoch": 0.0003675066873844028, + "grad_norm": 0.3243977725505829, + "learning_rate": 0.00013048414023372287, + "loss": 0.4638, + "step": 1047 + }, + { + "epoch": 0.0003678576966369189, + "grad_norm": 0.5033393502235413, + "learning_rate": 0.00013041736227045075, + "loss": 0.6124, + "step": 1048 + }, + { + "epoch": 0.0003682087058894351, + "grad_norm": 0.3304978907108307, + "learning_rate": 0.00013035058430717865, + "loss": 0.5645, + "step": 1049 + }, + { + "epoch": 0.0003685597151419512, + "grad_norm": 0.36042529344558716, + "learning_rate": 0.00013028380634390652, + "loss": 0.4484, + "step": 1050 + }, + { + "epoch": 0.00036891072439446733, + "grad_norm": 0.4284050166606903, + "learning_rate": 0.0001302170283806344, + "loss": 0.6074, + "step": 1051 + }, + { + "epoch": 0.0003692617336469835, + "grad_norm": 0.28319039940834045, + "learning_rate": 0.0001301502504173623, + "loss": 0.563, + "step": 1052 + }, + { + "epoch": 0.00036961274289949963, + "grad_norm": 0.35593390464782715, + "learning_rate": 0.00013008347245409017, + "loss": 0.5548, + "step": 1053 + }, + { + "epoch": 0.0003699637521520158, + "grad_norm": 0.3092995285987854, + "learning_rate": 0.00013001669449081804, + "loss": 0.5512, + "step": 1054 + }, + { + "epoch": 0.00037031476140453193, + "grad_norm": 0.39928558468818665, + "learning_rate": 0.00012994991652754591, + "loss": 0.5828, + "step": 1055 + }, + { + "epoch": 0.00037066577065704805, + "grad_norm": 0.3541167974472046, + "learning_rate": 0.0001298831385642738, + "loss": 0.5943, + "step": 1056 + }, + { + "epoch": 0.00037101677990956423, + "grad_norm": 0.3520177900791168, + "learning_rate": 0.0001298163606010017, + "loss": 0.5629, + "step": 1057 + }, + { + "epoch": 0.00037136778916208035, + "grad_norm": 0.26769620180130005, + "learning_rate": 0.00012974958263772956, + "loss": 0.4686, + "step": 1058 + }, + { + "epoch": 0.00037171879841459653, + "grad_norm": 0.4143349528312683, + "learning_rate": 0.00012968280467445743, + "loss": 0.5898, + "step": 1059 + }, + { + "epoch": 0.00037206980766711265, + "grad_norm": 0.29856693744659424, + "learning_rate": 0.0001296160267111853, + "loss": 0.5795, + "step": 1060 + }, + { + "epoch": 0.0003724208169196288, + "grad_norm": 0.3835422396659851, + "learning_rate": 0.00012954924874791318, + "loss": 0.657, + "step": 1061 + }, + { + "epoch": 0.00037277182617214495, + "grad_norm": 0.3311139941215515, + "learning_rate": 0.00012948247078464108, + "loss": 0.5206, + "step": 1062 + }, + { + "epoch": 0.0003731228354246611, + "grad_norm": 0.38118553161621094, + "learning_rate": 0.00012941569282136895, + "loss": 0.6101, + "step": 1063 + }, + { + "epoch": 0.00037347384467717725, + "grad_norm": 0.3357555568218231, + "learning_rate": 0.00012934891485809683, + "loss": 0.4583, + "step": 1064 + }, + { + "epoch": 0.00037382485392969337, + "grad_norm": 0.3239798843860626, + "learning_rate": 0.0001292821368948247, + "loss": 0.5717, + "step": 1065 + }, + { + "epoch": 0.0003741758631822095, + "grad_norm": 0.31502071022987366, + "learning_rate": 0.0001292153589315526, + "loss": 0.5528, + "step": 1066 + }, + { + "epoch": 0.00037452687243472567, + "grad_norm": 0.35177144408226013, + "learning_rate": 0.00012914858096828047, + "loss": 0.5404, + "step": 1067 + }, + { + "epoch": 0.0003748778816872418, + "grad_norm": 0.3457860052585602, + "learning_rate": 0.00012908180300500837, + "loss": 0.5311, + "step": 1068 + }, + { + "epoch": 0.00037522889093975797, + "grad_norm": 0.31016480922698975, + "learning_rate": 0.00012901502504173625, + "loss": 0.521, + "step": 1069 + }, + { + "epoch": 0.0003755799001922741, + "grad_norm": 0.2800024151802063, + "learning_rate": 0.00012894824707846412, + "loss": 0.4831, + "step": 1070 + }, + { + "epoch": 0.0003759309094447902, + "grad_norm": 0.3560345470905304, + "learning_rate": 0.000128881469115192, + "loss": 0.4771, + "step": 1071 + }, + { + "epoch": 0.0003762819186973064, + "grad_norm": 0.28846535086631775, + "learning_rate": 0.00012881469115191987, + "loss": 0.4444, + "step": 1072 + }, + { + "epoch": 0.0003766329279498225, + "grad_norm": 0.29720595479011536, + "learning_rate": 0.00012874791318864777, + "loss": 0.5048, + "step": 1073 + }, + { + "epoch": 0.0003769839372023387, + "grad_norm": 0.40147536993026733, + "learning_rate": 0.00012868113522537564, + "loss": 0.5521, + "step": 1074 + }, + { + "epoch": 0.0003773349464548548, + "grad_norm": 0.36368894577026367, + "learning_rate": 0.0001286143572621035, + "loss": 0.5211, + "step": 1075 + }, + { + "epoch": 0.00037768595570737094, + "grad_norm": 0.34239786863327026, + "learning_rate": 0.00012854757929883139, + "loss": 0.4327, + "step": 1076 + }, + { + "epoch": 0.0003780369649598871, + "grad_norm": 0.3420031666755676, + "learning_rate": 0.00012848080133555926, + "loss": 0.5377, + "step": 1077 + }, + { + "epoch": 0.00037838797421240323, + "grad_norm": 0.32050299644470215, + "learning_rate": 0.00012841402337228716, + "loss": 0.6428, + "step": 1078 + }, + { + "epoch": 0.0003787389834649194, + "grad_norm": 0.31478747725486755, + "learning_rate": 0.00012834724540901503, + "loss": 0.4042, + "step": 1079 + }, + { + "epoch": 0.00037908999271743553, + "grad_norm": 0.4019688367843628, + "learning_rate": 0.0001282804674457429, + "loss": 0.5806, + "step": 1080 + }, + { + "epoch": 0.00037944100196995166, + "grad_norm": 0.3169090151786804, + "learning_rate": 0.00012821368948247078, + "loss": 0.6143, + "step": 1081 + }, + { + "epoch": 0.00037979201122246783, + "grad_norm": 0.3160766363143921, + "learning_rate": 0.00012814691151919865, + "loss": 0.4358, + "step": 1082 + }, + { + "epoch": 0.00038014302047498395, + "grad_norm": 0.30607977509498596, + "learning_rate": 0.00012808013355592655, + "loss": 0.611, + "step": 1083 + }, + { + "epoch": 0.00038049402972750013, + "grad_norm": 0.3392901122570038, + "learning_rate": 0.00012801335559265442, + "loss": 0.4677, + "step": 1084 + }, + { + "epoch": 0.00038084503898001625, + "grad_norm": 0.3608296513557434, + "learning_rate": 0.00012794657762938233, + "loss": 0.4681, + "step": 1085 + }, + { + "epoch": 0.0003811960482325324, + "grad_norm": 0.35469377040863037, + "learning_rate": 0.0001278797996661102, + "loss": 0.5122, + "step": 1086 + }, + { + "epoch": 0.00038154705748504855, + "grad_norm": 0.42851918935775757, + "learning_rate": 0.00012781302170283807, + "loss": 0.511, + "step": 1087 + }, + { + "epoch": 0.0003818980667375647, + "grad_norm": 0.31718799471855164, + "learning_rate": 0.00012774624373956594, + "loss": 0.5504, + "step": 1088 + }, + { + "epoch": 0.00038224907599008085, + "grad_norm": 0.31201183795928955, + "learning_rate": 0.00012767946577629384, + "loss": 0.5846, + "step": 1089 + }, + { + "epoch": 0.000382600085242597, + "grad_norm": 0.44880107045173645, + "learning_rate": 0.00012761268781302172, + "loss": 0.6351, + "step": 1090 + }, + { + "epoch": 0.0003829510944951131, + "grad_norm": 0.3685932755470276, + "learning_rate": 0.0001275459098497496, + "loss": 0.4946, + "step": 1091 + }, + { + "epoch": 0.00038330210374762927, + "grad_norm": 0.38342320919036865, + "learning_rate": 0.00012747913188647746, + "loss": 0.4357, + "step": 1092 + }, + { + "epoch": 0.0003836531130001454, + "grad_norm": 0.2710161805152893, + "learning_rate": 0.00012741235392320534, + "loss": 0.4635, + "step": 1093 + }, + { + "epoch": 0.00038400412225266157, + "grad_norm": 0.3405950963497162, + "learning_rate": 0.00012734557595993324, + "loss": 0.4272, + "step": 1094 + }, + { + "epoch": 0.0003843551315051777, + "grad_norm": 0.3414493203163147, + "learning_rate": 0.0001272787979966611, + "loss": 0.5387, + "step": 1095 + }, + { + "epoch": 0.0003847061407576938, + "grad_norm": 0.30659371614456177, + "learning_rate": 0.00012721202003338898, + "loss": 0.451, + "step": 1096 + }, + { + "epoch": 0.00038505715001021, + "grad_norm": 0.33229631185531616, + "learning_rate": 0.00012714524207011686, + "loss": 0.6062, + "step": 1097 + }, + { + "epoch": 0.0003854081592627261, + "grad_norm": 0.29991772770881653, + "learning_rate": 0.00012707846410684473, + "loss": 0.5812, + "step": 1098 + }, + { + "epoch": 0.0003857591685152423, + "grad_norm": 0.2937552332878113, + "learning_rate": 0.0001270116861435726, + "loss": 0.4762, + "step": 1099 + }, + { + "epoch": 0.0003861101777677584, + "grad_norm": 0.3993151783943176, + "learning_rate": 0.0001269449081803005, + "loss": 0.5288, + "step": 1100 + }, + { + "epoch": 0.00038646118702027454, + "grad_norm": 0.34012341499328613, + "learning_rate": 0.00012687813021702838, + "loss": 0.5858, + "step": 1101 + }, + { + "epoch": 0.0003868121962727907, + "grad_norm": 0.31721460819244385, + "learning_rate": 0.00012681135225375628, + "loss": 0.4543, + "step": 1102 + }, + { + "epoch": 0.00038716320552530684, + "grad_norm": 0.404480904340744, + "learning_rate": 0.00012674457429048415, + "loss": 0.6425, + "step": 1103 + }, + { + "epoch": 0.000387514214777823, + "grad_norm": 0.2888083755970001, + "learning_rate": 0.00012667779632721202, + "loss": 0.5737, + "step": 1104 + }, + { + "epoch": 0.00038786522403033913, + "grad_norm": 0.316724568605423, + "learning_rate": 0.00012661101836393992, + "loss": 0.4774, + "step": 1105 + }, + { + "epoch": 0.00038821623328285526, + "grad_norm": 0.34277236461639404, + "learning_rate": 0.0001265442404006678, + "loss": 0.5722, + "step": 1106 + }, + { + "epoch": 0.00038856724253537143, + "grad_norm": 0.3688976764678955, + "learning_rate": 0.00012647746243739567, + "loss": 0.478, + "step": 1107 + }, + { + "epoch": 0.00038891825178788756, + "grad_norm": 0.30905240774154663, + "learning_rate": 0.00012641068447412354, + "loss": 0.5578, + "step": 1108 + }, + { + "epoch": 0.00038926926104040373, + "grad_norm": 0.31679004430770874, + "learning_rate": 0.00012634390651085142, + "loss": 0.5564, + "step": 1109 + }, + { + "epoch": 0.00038962027029291985, + "grad_norm": 0.31234732270240784, + "learning_rate": 0.00012627712854757932, + "loss": 0.5403, + "step": 1110 + }, + { + "epoch": 0.000389971279545436, + "grad_norm": 0.2693454921245575, + "learning_rate": 0.0001262103505843072, + "loss": 0.577, + "step": 1111 + }, + { + "epoch": 0.00039032228879795215, + "grad_norm": 0.36127611994743347, + "learning_rate": 0.00012614357262103506, + "loss": 0.5558, + "step": 1112 + }, + { + "epoch": 0.0003906732980504683, + "grad_norm": 0.3124391436576843, + "learning_rate": 0.00012607679465776294, + "loss": 0.5198, + "step": 1113 + }, + { + "epoch": 0.00039102430730298445, + "grad_norm": 0.339495986700058, + "learning_rate": 0.0001260100166944908, + "loss": 0.4415, + "step": 1114 + }, + { + "epoch": 0.0003913753165555006, + "grad_norm": 0.3561634421348572, + "learning_rate": 0.00012594323873121868, + "loss": 0.5413, + "step": 1115 + }, + { + "epoch": 0.0003917263258080167, + "grad_norm": 0.30160975456237793, + "learning_rate": 0.00012587646076794658, + "loss": 0.5754, + "step": 1116 + }, + { + "epoch": 0.0003920773350605329, + "grad_norm": 0.583508312702179, + "learning_rate": 0.00012580968280467446, + "loss": 0.5645, + "step": 1117 + }, + { + "epoch": 0.000392428344313049, + "grad_norm": 0.3197818100452423, + "learning_rate": 0.00012574290484140233, + "loss": 0.5326, + "step": 1118 + }, + { + "epoch": 0.0003927793535655652, + "grad_norm": 0.3258291482925415, + "learning_rate": 0.00012567612687813023, + "loss": 0.5504, + "step": 1119 + }, + { + "epoch": 0.0003931303628180813, + "grad_norm": 0.2790183424949646, + "learning_rate": 0.0001256093489148581, + "loss": 0.4691, + "step": 1120 + }, + { + "epoch": 0.0003934813720705974, + "grad_norm": 0.4802376627922058, + "learning_rate": 0.000125542570951586, + "loss": 0.5689, + "step": 1121 + }, + { + "epoch": 0.0003938323813231136, + "grad_norm": 0.42296934127807617, + "learning_rate": 0.00012547579298831388, + "loss": 0.5082, + "step": 1122 + }, + { + "epoch": 0.0003941833905756297, + "grad_norm": 0.4018993377685547, + "learning_rate": 0.00012540901502504175, + "loss": 0.5967, + "step": 1123 + }, + { + "epoch": 0.0003945343998281459, + "grad_norm": 0.2756693661212921, + "learning_rate": 0.00012534223706176962, + "loss": 0.5071, + "step": 1124 + }, + { + "epoch": 0.000394885409080662, + "grad_norm": 0.28827816247940063, + "learning_rate": 0.0001252754590984975, + "loss": 0.446, + "step": 1125 + }, + { + "epoch": 0.00039523641833317814, + "grad_norm": 0.33188387751579285, + "learning_rate": 0.0001252086811352254, + "loss": 0.59, + "step": 1126 + }, + { + "epoch": 0.0003955874275856943, + "grad_norm": 0.3057992458343506, + "learning_rate": 0.00012514190317195327, + "loss": 0.4665, + "step": 1127 + }, + { + "epoch": 0.00039593843683821044, + "grad_norm": 0.423970103263855, + "learning_rate": 0.00012507512520868114, + "loss": 0.5603, + "step": 1128 + }, + { + "epoch": 0.0003962894460907266, + "grad_norm": 0.4346948266029358, + "learning_rate": 0.00012500834724540902, + "loss": 0.7188, + "step": 1129 + }, + { + "epoch": 0.00039664045534324274, + "grad_norm": 0.3196350932121277, + "learning_rate": 0.0001249415692821369, + "loss": 0.499, + "step": 1130 + }, + { + "epoch": 0.00039699146459575886, + "grad_norm": 0.32787612080574036, + "learning_rate": 0.00012487479131886476, + "loss": 0.562, + "step": 1131 + }, + { + "epoch": 0.00039734247384827504, + "grad_norm": 0.3701760768890381, + "learning_rate": 0.00012480801335559266, + "loss": 0.5906, + "step": 1132 + }, + { + "epoch": 0.00039769348310079116, + "grad_norm": 0.2836174964904785, + "learning_rate": 0.00012474123539232053, + "loss": 0.5241, + "step": 1133 + }, + { + "epoch": 0.00039804449235330733, + "grad_norm": 0.3123319745063782, + "learning_rate": 0.0001246744574290484, + "loss": 0.5591, + "step": 1134 + }, + { + "epoch": 0.00039839550160582346, + "grad_norm": 0.2965394854545593, + "learning_rate": 0.0001246076794657763, + "loss": 0.5522, + "step": 1135 + }, + { + "epoch": 0.0003987465108583396, + "grad_norm": 0.3452530801296234, + "learning_rate": 0.00012454090150250418, + "loss": 0.5572, + "step": 1136 + }, + { + "epoch": 0.00039909752011085576, + "grad_norm": 0.3368155062198639, + "learning_rate": 0.00012447412353923208, + "loss": 0.4947, + "step": 1137 + }, + { + "epoch": 0.0003994485293633719, + "grad_norm": 0.31308281421661377, + "learning_rate": 0.00012440734557595995, + "loss": 0.5395, + "step": 1138 + }, + { + "epoch": 0.00039979953861588805, + "grad_norm": 0.36880385875701904, + "learning_rate": 0.00012434056761268783, + "loss": 0.5449, + "step": 1139 + }, + { + "epoch": 0.0004001505478684042, + "grad_norm": 0.3276751935482025, + "learning_rate": 0.0001242737896494157, + "loss": 0.5714, + "step": 1140 + }, + { + "epoch": 0.0004005015571209203, + "grad_norm": 0.34474796056747437, + "learning_rate": 0.00012420701168614357, + "loss": 0.5579, + "step": 1141 + }, + { + "epoch": 0.0004008525663734365, + "grad_norm": 0.3203624188899994, + "learning_rate": 0.00012414023372287147, + "loss": 0.5848, + "step": 1142 + }, + { + "epoch": 0.0004012035756259526, + "grad_norm": 0.33093470335006714, + "learning_rate": 0.00012407345575959935, + "loss": 0.5515, + "step": 1143 + }, + { + "epoch": 0.0004015545848784688, + "grad_norm": 0.2994841933250427, + "learning_rate": 0.00012400667779632722, + "loss": 0.4696, + "step": 1144 + }, + { + "epoch": 0.0004019055941309849, + "grad_norm": 0.43979793787002563, + "learning_rate": 0.0001239398998330551, + "loss": 0.5531, + "step": 1145 + }, + { + "epoch": 0.000402256603383501, + "grad_norm": 0.33747658133506775, + "learning_rate": 0.00012387312186978297, + "loss": 0.5442, + "step": 1146 + }, + { + "epoch": 0.0004026076126360172, + "grad_norm": 0.3129333257675171, + "learning_rate": 0.00012380634390651084, + "loss": 0.5812, + "step": 1147 + }, + { + "epoch": 0.0004029586218885333, + "grad_norm": 0.27842286229133606, + "learning_rate": 0.00012373956594323874, + "loss": 0.5571, + "step": 1148 + }, + { + "epoch": 0.0004033096311410495, + "grad_norm": 0.30332496762275696, + "learning_rate": 0.00012367278797996661, + "loss": 0.5264, + "step": 1149 + }, + { + "epoch": 0.0004036606403935656, + "grad_norm": 0.41959401965141296, + "learning_rate": 0.0001236060100166945, + "loss": 0.6208, + "step": 1150 + }, + { + "epoch": 0.00040401164964608174, + "grad_norm": 0.2994483411312103, + "learning_rate": 0.00012353923205342236, + "loss": 0.5311, + "step": 1151 + }, + { + "epoch": 0.0004043626588985979, + "grad_norm": 0.28562021255493164, + "learning_rate": 0.00012347245409015026, + "loss": 0.4664, + "step": 1152 + }, + { + "epoch": 0.00040471366815111404, + "grad_norm": 0.3773499131202698, + "learning_rate": 0.00012340567612687813, + "loss": 0.6372, + "step": 1153 + }, + { + "epoch": 0.0004050646774036302, + "grad_norm": 0.3149654269218445, + "learning_rate": 0.00012333889816360603, + "loss": 0.5295, + "step": 1154 + }, + { + "epoch": 0.00040541568665614634, + "grad_norm": 0.345595121383667, + "learning_rate": 0.0001232721202003339, + "loss": 0.5568, + "step": 1155 + }, + { + "epoch": 0.00040576669590866246, + "grad_norm": 0.2795856297016144, + "learning_rate": 0.00012320534223706178, + "loss": 0.4909, + "step": 1156 + }, + { + "epoch": 0.00040611770516117864, + "grad_norm": 0.37467122077941895, + "learning_rate": 0.00012313856427378965, + "loss": 0.5733, + "step": 1157 + }, + { + "epoch": 0.00040646871441369476, + "grad_norm": 0.33086350560188293, + "learning_rate": 0.00012307178631051755, + "loss": 0.5371, + "step": 1158 + }, + { + "epoch": 0.00040681972366621094, + "grad_norm": 0.3587074279785156, + "learning_rate": 0.00012300500834724543, + "loss": 0.5555, + "step": 1159 + }, + { + "epoch": 0.00040717073291872706, + "grad_norm": 0.35360291600227356, + "learning_rate": 0.0001229382303839733, + "loss": 0.5686, + "step": 1160 + }, + { + "epoch": 0.0004075217421712432, + "grad_norm": 0.32877933979034424, + "learning_rate": 0.00012287145242070117, + "loss": 0.6232, + "step": 1161 + }, + { + "epoch": 0.00040787275142375936, + "grad_norm": 0.3402215540409088, + "learning_rate": 0.00012280467445742905, + "loss": 0.5923, + "step": 1162 + }, + { + "epoch": 0.0004082237606762755, + "grad_norm": 0.3712671399116516, + "learning_rate": 0.00012273789649415692, + "loss": 0.4405, + "step": 1163 + }, + { + "epoch": 0.00040857476992879166, + "grad_norm": 0.34966424107551575, + "learning_rate": 0.00012267111853088482, + "loss": 0.5987, + "step": 1164 + }, + { + "epoch": 0.0004089257791813078, + "grad_norm": 0.8779903650283813, + "learning_rate": 0.0001226043405676127, + "loss": 0.5677, + "step": 1165 + }, + { + "epoch": 0.0004092767884338239, + "grad_norm": 0.30721041560173035, + "learning_rate": 0.00012253756260434057, + "loss": 0.4803, + "step": 1166 + }, + { + "epoch": 0.0004096277976863401, + "grad_norm": 0.3509838879108429, + "learning_rate": 0.00012247078464106844, + "loss": 0.4216, + "step": 1167 + }, + { + "epoch": 0.0004099788069388562, + "grad_norm": 0.2961578071117401, + "learning_rate": 0.0001224040066777963, + "loss": 0.5599, + "step": 1168 + }, + { + "epoch": 0.0004103298161913724, + "grad_norm": 0.28842684626579285, + "learning_rate": 0.0001223372287145242, + "loss": 0.5023, + "step": 1169 + }, + { + "epoch": 0.0004106808254438885, + "grad_norm": 0.3395219147205353, + "learning_rate": 0.00012227045075125209, + "loss": 0.6371, + "step": 1170 + }, + { + "epoch": 0.0004110318346964046, + "grad_norm": 0.2860247492790222, + "learning_rate": 0.00012220367278797999, + "loss": 0.3881, + "step": 1171 + }, + { + "epoch": 0.0004113828439489208, + "grad_norm": 0.5463435053825378, + "learning_rate": 0.00012213689482470786, + "loss": 0.5751, + "step": 1172 + }, + { + "epoch": 0.0004117338532014369, + "grad_norm": 0.30383020639419556, + "learning_rate": 0.00012207011686143572, + "loss": 0.4892, + "step": 1173 + }, + { + "epoch": 0.0004120848624539531, + "grad_norm": 0.6111129522323608, + "learning_rate": 0.00012200333889816362, + "loss": 0.6786, + "step": 1174 + }, + { + "epoch": 0.0004124358717064692, + "grad_norm": 0.32131698727607727, + "learning_rate": 0.00012193656093489149, + "loss": 0.6301, + "step": 1175 + }, + { + "epoch": 0.00041278688095898534, + "grad_norm": 0.3574715256690979, + "learning_rate": 0.00012186978297161938, + "loss": 0.5705, + "step": 1176 + }, + { + "epoch": 0.0004131378902115015, + "grad_norm": 0.46258190274238586, + "learning_rate": 0.00012180300500834725, + "loss": 0.54, + "step": 1177 + }, + { + "epoch": 0.00041348889946401764, + "grad_norm": 0.385326623916626, + "learning_rate": 0.00012173622704507512, + "loss": 0.5792, + "step": 1178 + }, + { + "epoch": 0.0004138399087165338, + "grad_norm": 0.3880153000354767, + "learning_rate": 0.00012166944908180303, + "loss": 0.5396, + "step": 1179 + }, + { + "epoch": 0.00041419091796904994, + "grad_norm": 0.32916024327278137, + "learning_rate": 0.0001216026711185309, + "loss": 0.5632, + "step": 1180 + }, + { + "epoch": 0.00041454192722156606, + "grad_norm": 0.30234548449516296, + "learning_rate": 0.00012153589315525877, + "loss": 0.5162, + "step": 1181 + }, + { + "epoch": 0.00041489293647408224, + "grad_norm": 0.3654727339744568, + "learning_rate": 0.00012146911519198664, + "loss": 0.6333, + "step": 1182 + }, + { + "epoch": 0.00041524394572659836, + "grad_norm": 0.3166685700416565, + "learning_rate": 0.00012140233722871452, + "loss": 0.5276, + "step": 1183 + }, + { + "epoch": 0.00041559495497911454, + "grad_norm": 0.3722357153892517, + "learning_rate": 0.0001213355592654424, + "loss": 0.5771, + "step": 1184 + }, + { + "epoch": 0.00041594596423163066, + "grad_norm": 0.3407818377017975, + "learning_rate": 0.00012126878130217029, + "loss": 0.5998, + "step": 1185 + }, + { + "epoch": 0.0004162969734841468, + "grad_norm": 0.28665193915367126, + "learning_rate": 0.00012120200333889818, + "loss": 0.5457, + "step": 1186 + }, + { + "epoch": 0.00041664798273666296, + "grad_norm": 0.3052026629447937, + "learning_rate": 0.00012113522537562605, + "loss": 0.5204, + "step": 1187 + }, + { + "epoch": 0.0004169989919891791, + "grad_norm": 0.286080002784729, + "learning_rate": 0.00012106844741235392, + "loss": 0.4346, + "step": 1188 + }, + { + "epoch": 0.00041735000124169526, + "grad_norm": 0.306473970413208, + "learning_rate": 0.0001210016694490818, + "loss": 0.5544, + "step": 1189 + }, + { + "epoch": 0.0004177010104942114, + "grad_norm": 0.3347833454608917, + "learning_rate": 0.0001209348914858097, + "loss": 0.4619, + "step": 1190 + }, + { + "epoch": 0.0004180520197467275, + "grad_norm": 0.28040143847465515, + "learning_rate": 0.00012086811352253757, + "loss": 0.5492, + "step": 1191 + }, + { + "epoch": 0.0004184030289992437, + "grad_norm": 0.2940806448459625, + "learning_rate": 0.00012080133555926544, + "loss": 0.5653, + "step": 1192 + }, + { + "epoch": 0.0004187540382517598, + "grad_norm": 0.37384578585624695, + "learning_rate": 0.00012073455759599333, + "loss": 0.4931, + "step": 1193 + }, + { + "epoch": 0.000419105047504276, + "grad_norm": 0.28816068172454834, + "learning_rate": 0.0001206677796327212, + "loss": 0.5292, + "step": 1194 + }, + { + "epoch": 0.0004194560567567921, + "grad_norm": 0.31325826048851013, + "learning_rate": 0.0001206010016694491, + "loss": 0.5288, + "step": 1195 + }, + { + "epoch": 0.0004198070660093082, + "grad_norm": 0.30658552050590515, + "learning_rate": 0.00012053422370617698, + "loss": 0.5854, + "step": 1196 + }, + { + "epoch": 0.0004201580752618244, + "grad_norm": 0.341240257024765, + "learning_rate": 0.00012046744574290485, + "loss": 0.5358, + "step": 1197 + }, + { + "epoch": 0.0004205090845143405, + "grad_norm": 0.3595687747001648, + "learning_rate": 0.00012040066777963272, + "loss": 0.5944, + "step": 1198 + }, + { + "epoch": 0.00042086009376685664, + "grad_norm": 0.3249213397502899, + "learning_rate": 0.0001203338898163606, + "loss": 0.4873, + "step": 1199 + }, + { + "epoch": 0.0004212111030193728, + "grad_norm": 0.37282127141952515, + "learning_rate": 0.00012026711185308848, + "loss": 0.5173, + "step": 1200 + }, + { + "epoch": 0.00042156211227188894, + "grad_norm": 0.325110524892807, + "learning_rate": 0.00012020033388981637, + "loss": 0.4819, + "step": 1201 + }, + { + "epoch": 0.0004219131215244051, + "grad_norm": 0.313388466835022, + "learning_rate": 0.00012013355592654426, + "loss": 0.5613, + "step": 1202 + }, + { + "epoch": 0.00042226413077692124, + "grad_norm": 0.38384371995925903, + "learning_rate": 0.00012006677796327213, + "loss": 0.5711, + "step": 1203 + }, + { + "epoch": 0.00042261514002943736, + "grad_norm": 0.3431423008441925, + "learning_rate": 0.00012, + "loss": 0.5593, + "step": 1204 + }, + { + "epoch": 0.00042296614928195354, + "grad_norm": 0.3032066822052002, + "learning_rate": 0.00011993322203672788, + "loss": 0.559, + "step": 1205 + }, + { + "epoch": 0.00042331715853446966, + "grad_norm": 0.30639907717704773, + "learning_rate": 0.00011986644407345578, + "loss": 0.5727, + "step": 1206 + }, + { + "epoch": 0.00042366816778698584, + "grad_norm": 0.2970695197582245, + "learning_rate": 0.00011979966611018365, + "loss": 0.5933, + "step": 1207 + }, + { + "epoch": 0.00042401917703950196, + "grad_norm": 0.3868466317653656, + "learning_rate": 0.00011973288814691152, + "loss": 0.5779, + "step": 1208 + }, + { + "epoch": 0.0004243701862920181, + "grad_norm": 0.29085230827331543, + "learning_rate": 0.0001196661101836394, + "loss": 0.6558, + "step": 1209 + }, + { + "epoch": 0.00042472119554453426, + "grad_norm": 0.33766743540763855, + "learning_rate": 0.00011959933222036728, + "loss": 0.5809, + "step": 1210 + }, + { + "epoch": 0.0004250722047970504, + "grad_norm": 0.6739090085029602, + "learning_rate": 0.00011953255425709517, + "loss": 0.6085, + "step": 1211 + }, + { + "epoch": 0.00042542321404956656, + "grad_norm": 0.35693222284317017, + "learning_rate": 0.00011946577629382306, + "loss": 0.5855, + "step": 1212 + }, + { + "epoch": 0.0004257742233020827, + "grad_norm": 0.3087833523750305, + "learning_rate": 0.00011939899833055093, + "loss": 0.6379, + "step": 1213 + }, + { + "epoch": 0.0004261252325545988, + "grad_norm": 0.3548837900161743, + "learning_rate": 0.0001193322203672788, + "loss": 0.5303, + "step": 1214 + }, + { + "epoch": 0.000426476241807115, + "grad_norm": 0.46040648221969604, + "learning_rate": 0.00011926544240400668, + "loss": 0.5171, + "step": 1215 + }, + { + "epoch": 0.0004268272510596311, + "grad_norm": 0.5730584859848022, + "learning_rate": 0.00011919866444073455, + "loss": 0.615, + "step": 1216 + }, + { + "epoch": 0.0004271782603121473, + "grad_norm": 0.34618711471557617, + "learning_rate": 0.00011913188647746245, + "loss": 0.5605, + "step": 1217 + }, + { + "epoch": 0.0004275292695646634, + "grad_norm": 0.3499528169631958, + "learning_rate": 0.00011906510851419032, + "loss": 0.5184, + "step": 1218 + }, + { + "epoch": 0.0004278802788171795, + "grad_norm": 0.33638936281204224, + "learning_rate": 0.00011899833055091821, + "loss": 0.6276, + "step": 1219 + }, + { + "epoch": 0.0004282312880696957, + "grad_norm": 0.34646880626678467, + "learning_rate": 0.00011893155258764608, + "loss": 0.5737, + "step": 1220 + }, + { + "epoch": 0.0004285822973222118, + "grad_norm": 0.2783110439777374, + "learning_rate": 0.00011886477462437396, + "loss": 0.4424, + "step": 1221 + }, + { + "epoch": 0.000428933306574728, + "grad_norm": 0.33892807364463806, + "learning_rate": 0.00011879799666110186, + "loss": 0.5656, + "step": 1222 + }, + { + "epoch": 0.0004292843158272441, + "grad_norm": 0.2782565653324127, + "learning_rate": 0.00011873121869782973, + "loss": 0.5504, + "step": 1223 + }, + { + "epoch": 0.00042963532507976025, + "grad_norm": 0.3684981167316437, + "learning_rate": 0.0001186644407345576, + "loss": 0.5532, + "step": 1224 + }, + { + "epoch": 0.0004299863343322764, + "grad_norm": 0.4034316837787628, + "learning_rate": 0.00011859766277128547, + "loss": 0.5417, + "step": 1225 + }, + { + "epoch": 0.00043033734358479254, + "grad_norm": 0.5182071924209595, + "learning_rate": 0.00011853088480801335, + "loss": 0.6118, + "step": 1226 + }, + { + "epoch": 0.0004306883528373087, + "grad_norm": 0.3137674033641815, + "learning_rate": 0.00011846410684474125, + "loss": 0.6485, + "step": 1227 + }, + { + "epoch": 0.00043103936208982484, + "grad_norm": 0.4069771468639374, + "learning_rate": 0.00011839732888146912, + "loss": 0.5452, + "step": 1228 + }, + { + "epoch": 0.00043139037134234097, + "grad_norm": 0.5212397575378418, + "learning_rate": 0.00011833055091819701, + "loss": 0.5212, + "step": 1229 + }, + { + "epoch": 0.00043174138059485714, + "grad_norm": 0.3622184693813324, + "learning_rate": 0.00011826377295492488, + "loss": 0.4333, + "step": 1230 + }, + { + "epoch": 0.00043209238984737326, + "grad_norm": 0.335044801235199, + "learning_rate": 0.00011819699499165275, + "loss": 0.5606, + "step": 1231 + }, + { + "epoch": 0.00043244339909988944, + "grad_norm": 0.31680893898010254, + "learning_rate": 0.00011813021702838063, + "loss": 0.4988, + "step": 1232 + }, + { + "epoch": 0.00043279440835240556, + "grad_norm": 0.5272301435470581, + "learning_rate": 0.00011806343906510853, + "loss": 0.6024, + "step": 1233 + }, + { + "epoch": 0.0004331454176049217, + "grad_norm": 0.3663223385810852, + "learning_rate": 0.0001179966611018364, + "loss": 0.5964, + "step": 1234 + }, + { + "epoch": 0.00043349642685743786, + "grad_norm": 0.35138314962387085, + "learning_rate": 0.00011792988313856427, + "loss": 0.5908, + "step": 1235 + }, + { + "epoch": 0.000433847436109954, + "grad_norm": 0.3744595944881439, + "learning_rate": 0.00011786310517529216, + "loss": 0.551, + "step": 1236 + }, + { + "epoch": 0.00043419844536247016, + "grad_norm": 0.31489259004592896, + "learning_rate": 0.00011779632721202003, + "loss": 0.6431, + "step": 1237 + }, + { + "epoch": 0.0004345494546149863, + "grad_norm": 0.3356812298297882, + "learning_rate": 0.00011772954924874793, + "loss": 0.4507, + "step": 1238 + }, + { + "epoch": 0.0004349004638675024, + "grad_norm": 0.3018808364868164, + "learning_rate": 0.00011766277128547581, + "loss": 0.4796, + "step": 1239 + }, + { + "epoch": 0.0004352514731200186, + "grad_norm": 0.3201460540294647, + "learning_rate": 0.00011759599332220368, + "loss": 0.4768, + "step": 1240 + }, + { + "epoch": 0.0004356024823725347, + "grad_norm": 0.3269093334674835, + "learning_rate": 0.00011752921535893155, + "loss": 0.5419, + "step": 1241 + }, + { + "epoch": 0.0004359534916250509, + "grad_norm": 0.28690990805625916, + "learning_rate": 0.00011746243739565943, + "loss": 0.5088, + "step": 1242 + }, + { + "epoch": 0.000436304500877567, + "grad_norm": 0.32765012979507446, + "learning_rate": 0.00011739565943238733, + "loss": 0.4953, + "step": 1243 + }, + { + "epoch": 0.0004366555101300831, + "grad_norm": 0.28830674290657043, + "learning_rate": 0.0001173288814691152, + "loss": 0.5179, + "step": 1244 + }, + { + "epoch": 0.0004370065193825993, + "grad_norm": 0.37793827056884766, + "learning_rate": 0.00011726210350584307, + "loss": 0.5951, + "step": 1245 + }, + { + "epoch": 0.0004373575286351154, + "grad_norm": 0.37173348665237427, + "learning_rate": 0.00011719532554257096, + "loss": 0.6059, + "step": 1246 + }, + { + "epoch": 0.0004377085378876316, + "grad_norm": 0.5363826155662537, + "learning_rate": 0.00011712854757929883, + "loss": 0.5183, + "step": 1247 + }, + { + "epoch": 0.0004380595471401477, + "grad_norm": 0.31671205163002014, + "learning_rate": 0.0001170617696160267, + "loss": 0.5711, + "step": 1248 + }, + { + "epoch": 0.00043841055639266385, + "grad_norm": 0.3112623989582062, + "learning_rate": 0.0001169949916527546, + "loss": 0.5647, + "step": 1249 + }, + { + "epoch": 0.00043876156564518, + "grad_norm": 0.3153972923755646, + "learning_rate": 0.00011692821368948248, + "loss": 0.4939, + "step": 1250 + }, + { + "epoch": 0.00043911257489769615, + "grad_norm": 0.29940372705459595, + "learning_rate": 0.00011686143572621035, + "loss": 0.5509, + "step": 1251 + }, + { + "epoch": 0.0004394635841502123, + "grad_norm": 0.42540279030799866, + "learning_rate": 0.00011679465776293823, + "loss": 0.4104, + "step": 1252 + }, + { + "epoch": 0.00043981459340272844, + "grad_norm": 0.3222522437572479, + "learning_rate": 0.00011672787979966611, + "loss": 0.6237, + "step": 1253 + }, + { + "epoch": 0.00044016560265524457, + "grad_norm": 0.34896525740623474, + "learning_rate": 0.000116661101836394, + "loss": 0.5162, + "step": 1254 + }, + { + "epoch": 0.00044051661190776074, + "grad_norm": 0.29780149459838867, + "learning_rate": 0.00011659432387312189, + "loss": 0.5805, + "step": 1255 + }, + { + "epoch": 0.00044086762116027687, + "grad_norm": 0.3533996343612671, + "learning_rate": 0.00011652754590984976, + "loss": 0.5749, + "step": 1256 + }, + { + "epoch": 0.00044121863041279304, + "grad_norm": 0.30867093801498413, + "learning_rate": 0.00011646076794657763, + "loss": 0.479, + "step": 1257 + }, + { + "epoch": 0.00044156963966530917, + "grad_norm": 0.31176280975341797, + "learning_rate": 0.0001163939899833055, + "loss": 0.5007, + "step": 1258 + }, + { + "epoch": 0.0004419206489178253, + "grad_norm": 0.3480489253997803, + "learning_rate": 0.0001163272120200334, + "loss": 0.5595, + "step": 1259 + }, + { + "epoch": 0.00044227165817034146, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.00011626043405676128, + "loss": 0.5042, + "step": 1260 + }, + { + "epoch": 0.0004426226674228576, + "grad_norm": 0.3167501986026764, + "learning_rate": 0.00011619365609348915, + "loss": 0.5335, + "step": 1261 + }, + { + "epoch": 0.00044297367667537376, + "grad_norm": 0.31276339292526245, + "learning_rate": 0.00011612687813021703, + "loss": 0.5594, + "step": 1262 + }, + { + "epoch": 0.0004433246859278899, + "grad_norm": 0.42910438776016235, + "learning_rate": 0.00011606010016694491, + "loss": 0.4659, + "step": 1263 + }, + { + "epoch": 0.000443675695180406, + "grad_norm": 0.3169635534286499, + "learning_rate": 0.00011599332220367279, + "loss": 0.5463, + "step": 1264 + }, + { + "epoch": 0.0004440267044329222, + "grad_norm": 0.3419555425643921, + "learning_rate": 0.00011592654424040069, + "loss": 0.5091, + "step": 1265 + }, + { + "epoch": 0.0004443777136854383, + "grad_norm": 0.31462714076042175, + "learning_rate": 0.00011585976627712856, + "loss": 0.6233, + "step": 1266 + }, + { + "epoch": 0.0004447287229379545, + "grad_norm": 0.36186134815216064, + "learning_rate": 0.00011579298831385643, + "loss": 0.5634, + "step": 1267 + }, + { + "epoch": 0.0004450797321904706, + "grad_norm": 0.385903000831604, + "learning_rate": 0.0001157262103505843, + "loss": 0.5892, + "step": 1268 + }, + { + "epoch": 0.00044543074144298673, + "grad_norm": 0.28669610619544983, + "learning_rate": 0.00011565943238731218, + "loss": 0.4746, + "step": 1269 + }, + { + "epoch": 0.0004457817506955029, + "grad_norm": 0.37557515501976013, + "learning_rate": 0.00011559265442404008, + "loss": 0.5946, + "step": 1270 + }, + { + "epoch": 0.00044613275994801903, + "grad_norm": 0.30455920100212097, + "learning_rate": 0.00011552587646076795, + "loss": 0.4064, + "step": 1271 + }, + { + "epoch": 0.0004464837692005352, + "grad_norm": 0.36547228693962097, + "learning_rate": 0.00011545909849749584, + "loss": 0.4354, + "step": 1272 + }, + { + "epoch": 0.0004468347784530513, + "grad_norm": 0.3912973999977112, + "learning_rate": 0.00011539232053422371, + "loss": 0.544, + "step": 1273 + }, + { + "epoch": 0.00044718578770556745, + "grad_norm": 0.2993258237838745, + "learning_rate": 0.00011532554257095158, + "loss": 0.4623, + "step": 1274 + }, + { + "epoch": 0.0004475367969580836, + "grad_norm": 0.39676982164382935, + "learning_rate": 0.00011525876460767948, + "loss": 0.4735, + "step": 1275 + }, + { + "epoch": 0.00044788780621059975, + "grad_norm": 0.43738967180252075, + "learning_rate": 0.00011519198664440736, + "loss": 0.5639, + "step": 1276 + }, + { + "epoch": 0.0004482388154631159, + "grad_norm": 0.4572802186012268, + "learning_rate": 0.00011512520868113523, + "loss": 0.5043, + "step": 1277 + }, + { + "epoch": 0.00044858982471563205, + "grad_norm": 0.301929771900177, + "learning_rate": 0.0001150584307178631, + "loss": 0.3962, + "step": 1278 + }, + { + "epoch": 0.00044894083396814817, + "grad_norm": 0.42450666427612305, + "learning_rate": 0.00011499165275459098, + "loss": 0.5885, + "step": 1279 + }, + { + "epoch": 0.00044929184322066435, + "grad_norm": 0.3520278036594391, + "learning_rate": 0.00011492487479131886, + "loss": 0.5557, + "step": 1280 + }, + { + "epoch": 0.00044964285247318047, + "grad_norm": 0.32748425006866455, + "learning_rate": 0.00011485809682804675, + "loss": 0.5788, + "step": 1281 + }, + { + "epoch": 0.00044999386172569664, + "grad_norm": 0.3404058516025543, + "learning_rate": 0.00011479131886477464, + "loss": 0.431, + "step": 1282 + }, + { + "epoch": 0.00045034487097821277, + "grad_norm": 0.30703750252723694, + "learning_rate": 0.00011472454090150251, + "loss": 0.5603, + "step": 1283 + }, + { + "epoch": 0.0004506958802307289, + "grad_norm": 0.3476982116699219, + "learning_rate": 0.00011465776293823038, + "loss": 0.4984, + "step": 1284 + }, + { + "epoch": 0.00045104688948324507, + "grad_norm": 0.361433207988739, + "learning_rate": 0.00011459098497495826, + "loss": 0.4012, + "step": 1285 + }, + { + "epoch": 0.0004513978987357612, + "grad_norm": 0.31583985686302185, + "learning_rate": 0.00011452420701168616, + "loss": 0.5115, + "step": 1286 + }, + { + "epoch": 0.00045174890798827736, + "grad_norm": 0.3581843376159668, + "learning_rate": 0.00011445742904841403, + "loss": 0.5795, + "step": 1287 + }, + { + "epoch": 0.0004520999172407935, + "grad_norm": 0.30088526010513306, + "learning_rate": 0.0001143906510851419, + "loss": 0.4995, + "step": 1288 + }, + { + "epoch": 0.0004524509264933096, + "grad_norm": 0.34739211201667786, + "learning_rate": 0.00011432387312186979, + "loss": 0.5513, + "step": 1289 + }, + { + "epoch": 0.0004528019357458258, + "grad_norm": 0.3440413177013397, + "learning_rate": 0.00011425709515859766, + "loss": 0.626, + "step": 1290 + }, + { + "epoch": 0.0004531529449983419, + "grad_norm": 0.34715211391448975, + "learning_rate": 0.00011419031719532556, + "loss": 0.5567, + "step": 1291 + }, + { + "epoch": 0.0004535039542508581, + "grad_norm": 0.3141072690486908, + "learning_rate": 0.00011412353923205344, + "loss": 0.515, + "step": 1292 + }, + { + "epoch": 0.0004538549635033742, + "grad_norm": 0.3693056106567383, + "learning_rate": 0.00011405676126878131, + "loss": 0.6039, + "step": 1293 + }, + { + "epoch": 0.00045420597275589033, + "grad_norm": 0.2877582609653473, + "learning_rate": 0.00011398998330550918, + "loss": 0.627, + "step": 1294 + }, + { + "epoch": 0.0004545569820084065, + "grad_norm": 0.30727502703666687, + "learning_rate": 0.00011392320534223706, + "loss": 0.4439, + "step": 1295 + }, + { + "epoch": 0.00045490799126092263, + "grad_norm": 0.340834379196167, + "learning_rate": 0.00011385642737896493, + "loss": 0.6043, + "step": 1296 + }, + { + "epoch": 0.0004552590005134388, + "grad_norm": 0.37094762921333313, + "learning_rate": 0.00011378964941569283, + "loss": 0.5279, + "step": 1297 + }, + { + "epoch": 0.00045561000976595493, + "grad_norm": 0.352252721786499, + "learning_rate": 0.0001137228714524207, + "loss": 0.4534, + "step": 1298 + }, + { + "epoch": 0.00045596101901847105, + "grad_norm": 0.3592413663864136, + "learning_rate": 0.00011365609348914859, + "loss": 0.6009, + "step": 1299 + }, + { + "epoch": 0.0004563120282709872, + "grad_norm": 0.3028002679347992, + "learning_rate": 0.00011358931552587646, + "loss": 0.5451, + "step": 1300 + }, + { + "epoch": 0.00045666303752350335, + "grad_norm": 0.3545093238353729, + "learning_rate": 0.00011352253756260434, + "loss": 0.6022, + "step": 1301 + }, + { + "epoch": 0.0004570140467760195, + "grad_norm": 0.31239053606987, + "learning_rate": 0.00011345575959933224, + "loss": 0.5893, + "step": 1302 + }, + { + "epoch": 0.00045736505602853565, + "grad_norm": 0.2930079996585846, + "learning_rate": 0.00011338898163606011, + "loss": 0.6469, + "step": 1303 + }, + { + "epoch": 0.00045771606528105177, + "grad_norm": 0.3328670263290405, + "learning_rate": 0.00011332220367278798, + "loss": 0.551, + "step": 1304 + }, + { + "epoch": 0.00045806707453356795, + "grad_norm": 0.2958623766899109, + "learning_rate": 0.00011325542570951586, + "loss": 0.4699, + "step": 1305 + }, + { + "epoch": 0.00045841808378608407, + "grad_norm": 0.26540592312812805, + "learning_rate": 0.00011318864774624374, + "loss": 0.5651, + "step": 1306 + }, + { + "epoch": 0.00045876909303860025, + "grad_norm": 0.30372926592826843, + "learning_rate": 0.00011312186978297163, + "loss": 0.4466, + "step": 1307 + }, + { + "epoch": 0.00045912010229111637, + "grad_norm": 0.32394206523895264, + "learning_rate": 0.00011305509181969952, + "loss": 0.4651, + "step": 1308 + }, + { + "epoch": 0.0004594711115436325, + "grad_norm": 0.2792419493198395, + "learning_rate": 0.00011298831385642739, + "loss": 0.4761, + "step": 1309 + }, + { + "epoch": 0.00045982212079614867, + "grad_norm": 0.26445260643959045, + "learning_rate": 0.00011292153589315526, + "loss": 0.4564, + "step": 1310 + }, + { + "epoch": 0.0004601731300486648, + "grad_norm": 0.3601842224597931, + "learning_rate": 0.00011285475792988314, + "loss": 0.5397, + "step": 1311 + }, + { + "epoch": 0.00046052413930118097, + "grad_norm": 0.3574691712856293, + "learning_rate": 0.00011278797996661104, + "loss": 0.5961, + "step": 1312 + }, + { + "epoch": 0.0004608751485536971, + "grad_norm": 0.3000461161136627, + "learning_rate": 0.00011272120200333891, + "loss": 0.4527, + "step": 1313 + }, + { + "epoch": 0.0004612261578062132, + "grad_norm": 0.34302622079849243, + "learning_rate": 0.00011265442404006678, + "loss": 0.6379, + "step": 1314 + }, + { + "epoch": 0.0004615771670587294, + "grad_norm": 0.3945535123348236, + "learning_rate": 0.00011258764607679465, + "loss": 0.5631, + "step": 1315 + }, + { + "epoch": 0.0004619281763112455, + "grad_norm": 0.4170839786529541, + "learning_rate": 0.00011252086811352254, + "loss": 0.6339, + "step": 1316 + }, + { + "epoch": 0.0004622791855637617, + "grad_norm": 0.36513859033584595, + "learning_rate": 0.00011245409015025041, + "loss": 0.5528, + "step": 1317 + }, + { + "epoch": 0.0004626301948162778, + "grad_norm": 0.45692166686058044, + "learning_rate": 0.00011238731218697832, + "loss": 0.6315, + "step": 1318 + }, + { + "epoch": 0.00046298120406879393, + "grad_norm": 0.3772307336330414, + "learning_rate": 0.00011232053422370619, + "loss": 0.5349, + "step": 1319 + }, + { + "epoch": 0.0004633322133213101, + "grad_norm": 0.3114742636680603, + "learning_rate": 0.00011225375626043406, + "loss": 0.4121, + "step": 1320 + }, + { + "epoch": 0.00046368322257382623, + "grad_norm": 0.3508698344230652, + "learning_rate": 0.00011218697829716193, + "loss": 0.638, + "step": 1321 + }, + { + "epoch": 0.0004640342318263424, + "grad_norm": 0.34588712453842163, + "learning_rate": 0.00011212020033388981, + "loss": 0.4898, + "step": 1322 + }, + { + "epoch": 0.00046438524107885853, + "grad_norm": 0.2846747934818268, + "learning_rate": 0.00011205342237061771, + "loss": 0.5521, + "step": 1323 + }, + { + "epoch": 0.00046473625033137465, + "grad_norm": 0.31673532724380493, + "learning_rate": 0.00011198664440734558, + "loss": 0.4676, + "step": 1324 + }, + { + "epoch": 0.00046508725958389083, + "grad_norm": 0.3159814774990082, + "learning_rate": 0.00011191986644407347, + "loss": 0.508, + "step": 1325 + }, + { + "epoch": 0.00046543826883640695, + "grad_norm": 0.3438906967639923, + "learning_rate": 0.00011185308848080134, + "loss": 0.6521, + "step": 1326 + }, + { + "epoch": 0.00046578927808892313, + "grad_norm": 0.28350135684013367, + "learning_rate": 0.00011178631051752921, + "loss": 0.517, + "step": 1327 + }, + { + "epoch": 0.00046614028734143925, + "grad_norm": 0.3244381844997406, + "learning_rate": 0.00011171953255425711, + "loss": 0.4975, + "step": 1328 + }, + { + "epoch": 0.00046649129659395537, + "grad_norm": 0.32338446378707886, + "learning_rate": 0.00011165275459098499, + "loss": 0.5581, + "step": 1329 + }, + { + "epoch": 0.00046684230584647155, + "grad_norm": 0.3385190963745117, + "learning_rate": 0.00011158597662771286, + "loss": 0.5287, + "step": 1330 + }, + { + "epoch": 0.00046719331509898767, + "grad_norm": 0.30869290232658386, + "learning_rate": 0.00011151919866444073, + "loss": 0.5694, + "step": 1331 + }, + { + "epoch": 0.00046754432435150385, + "grad_norm": 0.39800670742988586, + "learning_rate": 0.00011145242070116862, + "loss": 0.6783, + "step": 1332 + }, + { + "epoch": 0.00046789533360401997, + "grad_norm": 0.3691728413105011, + "learning_rate": 0.0001113856427378965, + "loss": 0.5814, + "step": 1333 + }, + { + "epoch": 0.0004682463428565361, + "grad_norm": 0.34991732239723206, + "learning_rate": 0.0001113188647746244, + "loss": 0.414, + "step": 1334 + }, + { + "epoch": 0.00046859735210905227, + "grad_norm": 0.3095676302909851, + "learning_rate": 0.00011125208681135227, + "loss": 0.5982, + "step": 1335 + }, + { + "epoch": 0.0004689483613615684, + "grad_norm": 0.3367360830307007, + "learning_rate": 0.00011118530884808014, + "loss": 0.5794, + "step": 1336 + }, + { + "epoch": 0.00046929937061408457, + "grad_norm": 0.3058132529258728, + "learning_rate": 0.00011111853088480801, + "loss": 0.5001, + "step": 1337 + }, + { + "epoch": 0.0004696503798666007, + "grad_norm": 0.32190924882888794, + "learning_rate": 0.00011105175292153589, + "loss": 0.6184, + "step": 1338 + }, + { + "epoch": 0.0004700013891191168, + "grad_norm": 0.2544103264808655, + "learning_rate": 0.00011098497495826379, + "loss": 0.5338, + "step": 1339 + }, + { + "epoch": 0.000470352398371633, + "grad_norm": 0.3533720374107361, + "learning_rate": 0.00011091819699499166, + "loss": 0.5817, + "step": 1340 + }, + { + "epoch": 0.0004707034076241491, + "grad_norm": 0.29889243841171265, + "learning_rate": 0.00011085141903171953, + "loss": 0.4836, + "step": 1341 + }, + { + "epoch": 0.0004710544168766653, + "grad_norm": 0.3215756118297577, + "learning_rate": 0.00011078464106844742, + "loss": 0.5438, + "step": 1342 + }, + { + "epoch": 0.0004714054261291814, + "grad_norm": 0.3005795478820801, + "learning_rate": 0.00011071786310517529, + "loss": 0.5341, + "step": 1343 + }, + { + "epoch": 0.00047175643538169753, + "grad_norm": 0.31172803044319153, + "learning_rate": 0.0001106510851419032, + "loss": 0.5517, + "step": 1344 + }, + { + "epoch": 0.0004721074446342137, + "grad_norm": 0.3667462468147278, + "learning_rate": 0.00011058430717863107, + "loss": 0.5487, + "step": 1345 + }, + { + "epoch": 0.00047245845388672983, + "grad_norm": 0.3609708249568939, + "learning_rate": 0.00011051752921535894, + "loss": 0.5514, + "step": 1346 + }, + { + "epoch": 0.000472809463139246, + "grad_norm": 0.36390745639801025, + "learning_rate": 0.00011045075125208681, + "loss": 0.609, + "step": 1347 + }, + { + "epoch": 0.00047316047239176213, + "grad_norm": 0.3918192982673645, + "learning_rate": 0.00011038397328881469, + "loss": 0.5841, + "step": 1348 + }, + { + "epoch": 0.00047351148164427825, + "grad_norm": 0.3789425194263458, + "learning_rate": 0.00011031719532554257, + "loss": 0.5551, + "step": 1349 + }, + { + "epoch": 0.00047386249089679443, + "grad_norm": 0.31591498851776123, + "learning_rate": 0.00011025041736227046, + "loss": 0.5445, + "step": 1350 + }, + { + "epoch": 0.00047421350014931055, + "grad_norm": 0.3711070120334625, + "learning_rate": 0.00011018363939899835, + "loss": 0.6124, + "step": 1351 + }, + { + "epoch": 0.00047456450940182673, + "grad_norm": 0.3442644476890564, + "learning_rate": 0.00011011686143572622, + "loss": 0.5793, + "step": 1352 + }, + { + "epoch": 0.00047491551865434285, + "grad_norm": 0.2866378426551819, + "learning_rate": 0.00011005008347245409, + "loss": 0.5144, + "step": 1353 + }, + { + "epoch": 0.000475266527906859, + "grad_norm": 0.3127586841583252, + "learning_rate": 0.00010998330550918197, + "loss": 0.6036, + "step": 1354 + }, + { + "epoch": 0.00047561753715937515, + "grad_norm": 0.32305601239204407, + "learning_rate": 0.00010991652754590987, + "loss": 0.5215, + "step": 1355 + }, + { + "epoch": 0.00047596854641189127, + "grad_norm": 0.30483660101890564, + "learning_rate": 0.00010984974958263774, + "loss": 0.6094, + "step": 1356 + }, + { + "epoch": 0.00047631955566440745, + "grad_norm": 0.33019503951072693, + "learning_rate": 0.00010978297161936561, + "loss": 0.5646, + "step": 1357 + }, + { + "epoch": 0.00047667056491692357, + "grad_norm": 0.3414929509162903, + "learning_rate": 0.00010971619365609349, + "loss": 0.5262, + "step": 1358 + }, + { + "epoch": 0.0004770215741694397, + "grad_norm": 0.3471517860889435, + "learning_rate": 0.00010964941569282137, + "loss": 0.492, + "step": 1359 + }, + { + "epoch": 0.00047737258342195587, + "grad_norm": 0.3226645588874817, + "learning_rate": 0.00010958263772954926, + "loss": 0.6318, + "step": 1360 + }, + { + "epoch": 0.000477723592674472, + "grad_norm": 0.3425777852535248, + "learning_rate": 0.00010951585976627715, + "loss": 0.5878, + "step": 1361 + }, + { + "epoch": 0.00047807460192698817, + "grad_norm": 0.307462215423584, + "learning_rate": 0.00010944908180300502, + "loss": 0.4948, + "step": 1362 + }, + { + "epoch": 0.0004784256111795043, + "grad_norm": 0.34796106815338135, + "learning_rate": 0.00010938230383973289, + "loss": 0.5525, + "step": 1363 + }, + { + "epoch": 0.0004787766204320204, + "grad_norm": 0.2861281633377075, + "learning_rate": 0.00010931552587646076, + "loss": 0.4578, + "step": 1364 + }, + { + "epoch": 0.0004791276296845366, + "grad_norm": 0.2861836552619934, + "learning_rate": 0.00010924874791318864, + "loss": 0.5761, + "step": 1365 + }, + { + "epoch": 0.0004794786389370527, + "grad_norm": 0.3063654601573944, + "learning_rate": 0.00010918196994991654, + "loss": 0.5338, + "step": 1366 + }, + { + "epoch": 0.0004798296481895689, + "grad_norm": 0.3108372390270233, + "learning_rate": 0.00010911519198664441, + "loss": 0.4896, + "step": 1367 + }, + { + "epoch": 0.000480180657442085, + "grad_norm": 0.3263947069644928, + "learning_rate": 0.0001090484140233723, + "loss": 0.6142, + "step": 1368 + }, + { + "epoch": 0.00048053166669460113, + "grad_norm": 0.27663156390190125, + "learning_rate": 0.00010898163606010017, + "loss": 0.3852, + "step": 1369 + }, + { + "epoch": 0.0004808826759471173, + "grad_norm": 0.2791202962398529, + "learning_rate": 0.00010891485809682804, + "loss": 0.6032, + "step": 1370 + }, + { + "epoch": 0.00048123368519963343, + "grad_norm": 0.2715228199958801, + "learning_rate": 0.00010884808013355594, + "loss": 0.4717, + "step": 1371 + }, + { + "epoch": 0.0004815846944521496, + "grad_norm": 0.3232786953449249, + "learning_rate": 0.00010878130217028382, + "loss": 0.5511, + "step": 1372 + }, + { + "epoch": 0.00048193570370466573, + "grad_norm": 0.42948031425476074, + "learning_rate": 0.00010871452420701169, + "loss": 0.5223, + "step": 1373 + }, + { + "epoch": 0.00048228671295718185, + "grad_norm": 0.31973496079444885, + "learning_rate": 0.00010864774624373956, + "loss": 0.4532, + "step": 1374 + }, + { + "epoch": 0.00048263772220969803, + "grad_norm": 0.3149821162223816, + "learning_rate": 0.00010858096828046744, + "loss": 0.4894, + "step": 1375 + }, + { + "epoch": 0.00048298873146221415, + "grad_norm": 0.30229589343070984, + "learning_rate": 0.00010851419031719534, + "loss": 0.5039, + "step": 1376 + }, + { + "epoch": 0.00048333974071473033, + "grad_norm": 0.36127185821533203, + "learning_rate": 0.00010844741235392321, + "loss": 0.4379, + "step": 1377 + }, + { + "epoch": 0.00048369074996724645, + "grad_norm": 0.3135043978691101, + "learning_rate": 0.0001083806343906511, + "loss": 0.5172, + "step": 1378 + }, + { + "epoch": 0.0004840417592197626, + "grad_norm": 0.33123600482940674, + "learning_rate": 0.00010831385642737897, + "loss": 0.4959, + "step": 1379 + }, + { + "epoch": 0.00048439276847227875, + "grad_norm": 0.32165780663490295, + "learning_rate": 0.00010824707846410684, + "loss": 0.5152, + "step": 1380 + }, + { + "epoch": 0.0004847437777247949, + "grad_norm": 0.28580865263938904, + "learning_rate": 0.00010818030050083472, + "loss": 0.4879, + "step": 1381 + }, + { + "epoch": 0.00048509478697731105, + "grad_norm": 0.4019862711429596, + "learning_rate": 0.00010811352253756262, + "loss": 0.5475, + "step": 1382 + }, + { + "epoch": 0.0004854457962298272, + "grad_norm": 0.34479352831840515, + "learning_rate": 0.00010804674457429049, + "loss": 0.4279, + "step": 1383 + }, + { + "epoch": 0.0004857968054823433, + "grad_norm": 0.3664172887802124, + "learning_rate": 0.00010797996661101836, + "loss": 0.5815, + "step": 1384 + }, + { + "epoch": 0.00048614781473485947, + "grad_norm": 0.34667205810546875, + "learning_rate": 0.00010791318864774625, + "loss": 0.5453, + "step": 1385 + }, + { + "epoch": 0.0004864988239873756, + "grad_norm": 0.36878061294555664, + "learning_rate": 0.00010784641068447412, + "loss": 0.5464, + "step": 1386 + }, + { + "epoch": 0.00048684983323989177, + "grad_norm": 0.3552783727645874, + "learning_rate": 0.00010777963272120202, + "loss": 0.5668, + "step": 1387 + }, + { + "epoch": 0.0004872008424924079, + "grad_norm": 0.35390666127204895, + "learning_rate": 0.0001077128547579299, + "loss": 0.4799, + "step": 1388 + }, + { + "epoch": 0.000487551851744924, + "grad_norm": 0.3539852797985077, + "learning_rate": 0.00010764607679465777, + "loss": 0.6264, + "step": 1389 + }, + { + "epoch": 0.0004879028609974402, + "grad_norm": 0.3104274868965149, + "learning_rate": 0.00010757929883138564, + "loss": 0.4881, + "step": 1390 + }, + { + "epoch": 0.0004882538702499563, + "grad_norm": 0.29643991589546204, + "learning_rate": 0.00010751252086811352, + "loss": 0.5277, + "step": 1391 + }, + { + "epoch": 0.0004886048795024725, + "grad_norm": 0.3498566448688507, + "learning_rate": 0.00010744574290484142, + "loss": 0.4394, + "step": 1392 + }, + { + "epoch": 0.0004889558887549886, + "grad_norm": 0.31261810660362244, + "learning_rate": 0.00010737896494156929, + "loss": 0.4557, + "step": 1393 + }, + { + "epoch": 0.0004893068980075047, + "grad_norm": 0.301792711019516, + "learning_rate": 0.00010731218697829716, + "loss": 0.471, + "step": 1394 + }, + { + "epoch": 0.0004896579072600209, + "grad_norm": 0.34246626496315, + "learning_rate": 0.00010724540901502505, + "loss": 0.5917, + "step": 1395 + }, + { + "epoch": 0.0004900089165125371, + "grad_norm": 0.2901524305343628, + "learning_rate": 0.00010717863105175292, + "loss": 0.441, + "step": 1396 + }, + { + "epoch": 0.0004903599257650532, + "grad_norm": 0.3026966452598572, + "learning_rate": 0.0001071118530884808, + "loss": 0.5373, + "step": 1397 + }, + { + "epoch": 0.0004907109350175693, + "grad_norm": 0.29963356256484985, + "learning_rate": 0.0001070450751252087, + "loss": 0.4464, + "step": 1398 + }, + { + "epoch": 0.0004910619442700855, + "grad_norm": 0.26481980085372925, + "learning_rate": 0.00010697829716193657, + "loss": 0.5372, + "step": 1399 + }, + { + "epoch": 0.0004914129535226016, + "grad_norm": 0.26084020733833313, + "learning_rate": 0.00010691151919866444, + "loss": 0.5523, + "step": 1400 + }, + { + "epoch": 0.0004917639627751178, + "grad_norm": 0.34062638878822327, + "learning_rate": 0.00010684474123539232, + "loss": 0.5466, + "step": 1401 + }, + { + "epoch": 0.0004921149720276339, + "grad_norm": 0.3231668472290039, + "learning_rate": 0.0001067779632721202, + "loss": 0.5019, + "step": 1402 + }, + { + "epoch": 0.00049246598128015, + "grad_norm": 0.3362787961959839, + "learning_rate": 0.00010671118530884809, + "loss": 0.5251, + "step": 1403 + }, + { + "epoch": 0.0004928169905326662, + "grad_norm": 0.28928473591804504, + "learning_rate": 0.00010664440734557598, + "loss": 0.5346, + "step": 1404 + }, + { + "epoch": 0.0004931679997851824, + "grad_norm": 0.32969072461128235, + "learning_rate": 0.00010657762938230385, + "loss": 0.6131, + "step": 1405 + }, + { + "epoch": 0.0004935190090376985, + "grad_norm": 0.29733914136886597, + "learning_rate": 0.00010651085141903172, + "loss": 0.4406, + "step": 1406 + }, + { + "epoch": 0.0004938700182902146, + "grad_norm": 0.36437737941741943, + "learning_rate": 0.0001064440734557596, + "loss": 0.551, + "step": 1407 + }, + { + "epoch": 0.0004942210275427308, + "grad_norm": 0.33889076113700867, + "learning_rate": 0.0001063772954924875, + "loss": 0.5904, + "step": 1408 + }, + { + "epoch": 0.000494572036795247, + "grad_norm": 0.3446680009365082, + "learning_rate": 0.00010631051752921537, + "loss": 0.394, + "step": 1409 + }, + { + "epoch": 0.000494923046047763, + "grad_norm": 0.33298397064208984, + "learning_rate": 0.00010624373956594324, + "loss": 0.5048, + "step": 1410 + }, + { + "epoch": 0.0004952740553002792, + "grad_norm": 0.3153474032878876, + "learning_rate": 0.00010617696160267111, + "loss": 0.5314, + "step": 1411 + }, + { + "epoch": 0.0004956250645527954, + "grad_norm": 0.27105385065078735, + "learning_rate": 0.000106110183639399, + "loss": 0.5098, + "step": 1412 + }, + { + "epoch": 0.0004959760738053114, + "grad_norm": 0.3450585901737213, + "learning_rate": 0.00010604340567612687, + "loss": 0.5249, + "step": 1413 + }, + { + "epoch": 0.0004963270830578276, + "grad_norm": 0.35962969064712524, + "learning_rate": 0.00010597662771285477, + "loss": 0.4714, + "step": 1414 + }, + { + "epoch": 0.0004966780923103438, + "grad_norm": 0.33413732051849365, + "learning_rate": 0.00010590984974958265, + "loss": 0.5618, + "step": 1415 + }, + { + "epoch": 0.00049702910156286, + "grad_norm": 0.37907567620277405, + "learning_rate": 0.00010584307178631052, + "loss": 0.5751, + "step": 1416 + }, + { + "epoch": 0.000497380110815376, + "grad_norm": 0.3324087858200073, + "learning_rate": 0.0001057762938230384, + "loss": 0.5032, + "step": 1417 + }, + { + "epoch": 0.0004977311200678922, + "grad_norm": 0.2794540822505951, + "learning_rate": 0.00010570951585976627, + "loss": 0.4823, + "step": 1418 + }, + { + "epoch": 0.0004980821293204084, + "grad_norm": 0.31896448135375977, + "learning_rate": 0.00010564273789649417, + "loss": 0.5293, + "step": 1419 + }, + { + "epoch": 0.0004984331385729245, + "grad_norm": 0.39455580711364746, + "learning_rate": 0.00010557595993322204, + "loss": 0.6312, + "step": 1420 + }, + { + "epoch": 0.0004987841478254406, + "grad_norm": 0.3108445107936859, + "learning_rate": 0.00010550918196994993, + "loss": 0.4614, + "step": 1421 + }, + { + "epoch": 0.0004991351570779568, + "grad_norm": 0.2984072268009186, + "learning_rate": 0.0001054424040066778, + "loss": 0.5516, + "step": 1422 + }, + { + "epoch": 0.0004994861663304729, + "grad_norm": 0.3056257665157318, + "learning_rate": 0.00010537562604340567, + "loss": 0.5906, + "step": 1423 + }, + { + "epoch": 0.0004998371755829891, + "grad_norm": 0.29374566674232483, + "learning_rate": 0.00010530884808013357, + "loss": 0.599, + "step": 1424 + }, + { + "epoch": 0.0005001881848355052, + "grad_norm": 0.3665946424007416, + "learning_rate": 0.00010524207011686145, + "loss": 0.5599, + "step": 1425 + }, + { + "epoch": 0.0005005391940880214, + "grad_norm": 0.31262800097465515, + "learning_rate": 0.00010517529215358932, + "loss": 0.5566, + "step": 1426 + }, + { + "epoch": 0.0005008902033405375, + "grad_norm": 0.3117959797382355, + "learning_rate": 0.0001051085141903172, + "loss": 0.4372, + "step": 1427 + }, + { + "epoch": 0.0005012412125930537, + "grad_norm": 0.3499256670475006, + "learning_rate": 0.00010504173622704507, + "loss": 0.543, + "step": 1428 + }, + { + "epoch": 0.0005015922218455698, + "grad_norm": 0.3630000948905945, + "learning_rate": 0.00010497495826377295, + "loss": 0.5099, + "step": 1429 + }, + { + "epoch": 0.0005019432310980859, + "grad_norm": 0.3609743118286133, + "learning_rate": 0.00010490818030050084, + "loss": 0.5304, + "step": 1430 + }, + { + "epoch": 0.0005022942403506021, + "grad_norm": 0.3600139617919922, + "learning_rate": 0.00010484140233722873, + "loss": 0.4811, + "step": 1431 + }, + { + "epoch": 0.0005026452496031183, + "grad_norm": 0.30108320713043213, + "learning_rate": 0.0001047746243739566, + "loss": 0.6055, + "step": 1432 + }, + { + "epoch": 0.0005029962588556343, + "grad_norm": 0.34729886054992676, + "learning_rate": 0.00010470784641068447, + "loss": 0.5011, + "step": 1433 + }, + { + "epoch": 0.0005033472681081505, + "grad_norm": 0.33984988927841187, + "learning_rate": 0.00010464106844741235, + "loss": 0.5905, + "step": 1434 + }, + { + "epoch": 0.0005036982773606667, + "grad_norm": 0.3109802007675171, + "learning_rate": 0.00010457429048414025, + "loss": 0.5228, + "step": 1435 + }, + { + "epoch": 0.0005040492866131829, + "grad_norm": 0.37691593170166016, + "learning_rate": 0.00010450751252086812, + "loss": 0.5839, + "step": 1436 + }, + { + "epoch": 0.0005044002958656989, + "grad_norm": 0.3665965497493744, + "learning_rate": 0.00010444073455759599, + "loss": 0.5381, + "step": 1437 + }, + { + "epoch": 0.0005047513051182151, + "grad_norm": 0.29414570331573486, + "learning_rate": 0.00010437395659432388, + "loss": 0.6072, + "step": 1438 + }, + { + "epoch": 0.0005051023143707313, + "grad_norm": 0.3206839859485626, + "learning_rate": 0.00010430717863105175, + "loss": 0.5285, + "step": 1439 + }, + { + "epoch": 0.0005054533236232473, + "grad_norm": 0.3003496527671814, + "learning_rate": 0.00010424040066777965, + "loss": 0.4037, + "step": 1440 + }, + { + "epoch": 0.0005058043328757635, + "grad_norm": 0.2955014109611511, + "learning_rate": 0.00010417362270450753, + "loss": 0.4646, + "step": 1441 + }, + { + "epoch": 0.0005061553421282797, + "grad_norm": 0.3399007022380829, + "learning_rate": 0.0001041068447412354, + "loss": 0.5649, + "step": 1442 + }, + { + "epoch": 0.0005065063513807958, + "grad_norm": 0.3394736349582672, + "learning_rate": 0.00010404006677796327, + "loss": 0.5512, + "step": 1443 + }, + { + "epoch": 0.0005068573606333119, + "grad_norm": 0.31650441884994507, + "learning_rate": 0.00010397328881469115, + "loss": 0.4669, + "step": 1444 + }, + { + "epoch": 0.0005072083698858281, + "grad_norm": 0.3380611538887024, + "learning_rate": 0.00010390651085141905, + "loss": 0.6714, + "step": 1445 + }, + { + "epoch": 0.0005075593791383443, + "grad_norm": 0.29049673676490784, + "learning_rate": 0.00010383973288814692, + "loss": 0.5652, + "step": 1446 + }, + { + "epoch": 0.0005079103883908604, + "grad_norm": 0.37694746255874634, + "learning_rate": 0.0001037729549248748, + "loss": 0.4355, + "step": 1447 + }, + { + "epoch": 0.0005082613976433765, + "grad_norm": 0.36622750759124756, + "learning_rate": 0.00010370617696160268, + "loss": 0.4758, + "step": 1448 + }, + { + "epoch": 0.0005086124068958927, + "grad_norm": 0.3366115093231201, + "learning_rate": 0.00010363939899833055, + "loss": 0.5498, + "step": 1449 + }, + { + "epoch": 0.0005089634161484088, + "grad_norm": 0.2836514711380005, + "learning_rate": 0.00010357262103505843, + "loss": 0.5405, + "step": 1450 + }, + { + "epoch": 0.000509314425400925, + "grad_norm": 0.357666015625, + "learning_rate": 0.00010350584307178633, + "loss": 0.4738, + "step": 1451 + }, + { + "epoch": 0.0005096654346534411, + "grad_norm": 0.37991905212402344, + "learning_rate": 0.0001034390651085142, + "loss": 0.4932, + "step": 1452 + }, + { + "epoch": 0.0005100164439059572, + "grad_norm": 0.2862101197242737, + "learning_rate": 0.00010337228714524207, + "loss": 0.5387, + "step": 1453 + }, + { + "epoch": 0.0005103674531584734, + "grad_norm": 0.3000154197216034, + "learning_rate": 0.00010330550918196994, + "loss": 0.509, + "step": 1454 + }, + { + "epoch": 0.0005107184624109896, + "grad_norm": 0.29454153776168823, + "learning_rate": 0.00010323873121869783, + "loss": 0.3872, + "step": 1455 + }, + { + "epoch": 0.0005110694716635057, + "grad_norm": 0.305803507566452, + "learning_rate": 0.00010317195325542572, + "loss": 0.5, + "step": 1456 + }, + { + "epoch": 0.0005114204809160218, + "grad_norm": 0.3164152204990387, + "learning_rate": 0.0001031051752921536, + "loss": 0.5426, + "step": 1457 + }, + { + "epoch": 0.000511771490168538, + "grad_norm": 0.3026213049888611, + "learning_rate": 0.00010303839732888148, + "loss": 0.5783, + "step": 1458 + }, + { + "epoch": 0.0005121224994210542, + "grad_norm": 0.3170768618583679, + "learning_rate": 0.00010297161936560935, + "loss": 0.5701, + "step": 1459 + }, + { + "epoch": 0.0005124735086735702, + "grad_norm": 0.3275301456451416, + "learning_rate": 0.00010290484140233722, + "loss": 0.4884, + "step": 1460 + }, + { + "epoch": 0.0005128245179260864, + "grad_norm": 0.3446187973022461, + "learning_rate": 0.00010283806343906512, + "loss": 0.4516, + "step": 1461 + }, + { + "epoch": 0.0005131755271786026, + "grad_norm": 0.3188260495662689, + "learning_rate": 0.000102771285475793, + "loss": 0.561, + "step": 1462 + }, + { + "epoch": 0.0005135265364311186, + "grad_norm": 0.3547864258289337, + "learning_rate": 0.00010270450751252087, + "loss": 0.5768, + "step": 1463 + }, + { + "epoch": 0.0005138775456836348, + "grad_norm": 0.3740866482257843, + "learning_rate": 0.00010263772954924876, + "loss": 0.4197, + "step": 1464 + }, + { + "epoch": 0.000514228554936151, + "grad_norm": 0.38915491104125977, + "learning_rate": 0.00010257095158597663, + "loss": 0.553, + "step": 1465 + }, + { + "epoch": 0.0005145795641886672, + "grad_norm": 0.38494518399238586, + "learning_rate": 0.0001025041736227045, + "loss": 0.6247, + "step": 1466 + }, + { + "epoch": 0.0005149305734411832, + "grad_norm": 0.2716946303844452, + "learning_rate": 0.0001024373956594324, + "loss": 0.4426, + "step": 1467 + }, + { + "epoch": 0.0005152815826936994, + "grad_norm": 0.33764415979385376, + "learning_rate": 0.00010237061769616028, + "loss": 0.5939, + "step": 1468 + }, + { + "epoch": 0.0005156325919462156, + "grad_norm": 0.34384095668792725, + "learning_rate": 0.00010230383973288815, + "loss": 0.604, + "step": 1469 + }, + { + "epoch": 0.0005159836011987317, + "grad_norm": 0.3203445076942444, + "learning_rate": 0.00010223706176961602, + "loss": 0.5255, + "step": 1470 + }, + { + "epoch": 0.0005163346104512478, + "grad_norm": 0.2592601180076599, + "learning_rate": 0.0001021702838063439, + "loss": 0.4509, + "step": 1471 + }, + { + "epoch": 0.000516685619703764, + "grad_norm": 0.3425324261188507, + "learning_rate": 0.0001021035058430718, + "loss": 0.5498, + "step": 1472 + }, + { + "epoch": 0.0005170366289562801, + "grad_norm": 0.3077262341976166, + "learning_rate": 0.00010203672787979967, + "loss": 0.5364, + "step": 1473 + }, + { + "epoch": 0.0005173876382087963, + "grad_norm": 0.2831708788871765, + "learning_rate": 0.00010196994991652756, + "loss": 0.434, + "step": 1474 + }, + { + "epoch": 0.0005177386474613124, + "grad_norm": 0.29104581475257874, + "learning_rate": 0.00010190317195325543, + "loss": 0.5875, + "step": 1475 + }, + { + "epoch": 0.0005180896567138286, + "grad_norm": 0.29584741592407227, + "learning_rate": 0.0001018363939899833, + "loss": 0.4574, + "step": 1476 + }, + { + "epoch": 0.0005184406659663447, + "grad_norm": 0.41971537470817566, + "learning_rate": 0.0001017696160267112, + "loss": 0.5616, + "step": 1477 + }, + { + "epoch": 0.0005187916752188609, + "grad_norm": 0.3439647853374481, + "learning_rate": 0.00010170283806343908, + "loss": 0.4288, + "step": 1478 + }, + { + "epoch": 0.000519142684471377, + "grad_norm": 0.35867923498153687, + "learning_rate": 0.00010163606010016695, + "loss": 0.4415, + "step": 1479 + }, + { + "epoch": 0.0005194936937238931, + "grad_norm": 0.368987500667572, + "learning_rate": 0.00010156928213689482, + "loss": 0.5474, + "step": 1480 + }, + { + "epoch": 0.0005198447029764093, + "grad_norm": 0.30241629481315613, + "learning_rate": 0.00010150250417362271, + "loss": 0.4113, + "step": 1481 + }, + { + "epoch": 0.0005201957122289255, + "grad_norm": 0.31089895963668823, + "learning_rate": 0.00010143572621035058, + "loss": 0.4726, + "step": 1482 + }, + { + "epoch": 0.0005205467214814415, + "grad_norm": 0.2900741994380951, + "learning_rate": 0.00010136894824707848, + "loss": 0.4591, + "step": 1483 + }, + { + "epoch": 0.0005208977307339577, + "grad_norm": 0.2920607030391693, + "learning_rate": 0.00010130217028380636, + "loss": 0.508, + "step": 1484 + }, + { + "epoch": 0.0005212487399864739, + "grad_norm": 0.5145193338394165, + "learning_rate": 0.00010123539232053423, + "loss": 0.6125, + "step": 1485 + }, + { + "epoch": 0.0005215997492389901, + "grad_norm": 0.3466121554374695, + "learning_rate": 0.0001011686143572621, + "loss": 0.5236, + "step": 1486 + }, + { + "epoch": 0.0005219507584915061, + "grad_norm": 0.2820659577846527, + "learning_rate": 0.00010110183639398998, + "loss": 0.4886, + "step": 1487 + }, + { + "epoch": 0.0005223017677440223, + "grad_norm": 0.31797733902931213, + "learning_rate": 0.00010103505843071788, + "loss": 0.4605, + "step": 1488 + }, + { + "epoch": 0.0005226527769965385, + "grad_norm": 0.3547564148902893, + "learning_rate": 0.00010096828046744575, + "loss": 0.5559, + "step": 1489 + }, + { + "epoch": 0.0005230037862490545, + "grad_norm": 0.3584667146205902, + "learning_rate": 0.00010090150250417362, + "loss": 0.4402, + "step": 1490 + }, + { + "epoch": 0.0005233547955015707, + "grad_norm": 0.3230780065059662, + "learning_rate": 0.00010083472454090151, + "loss": 0.5187, + "step": 1491 + }, + { + "epoch": 0.0005237058047540869, + "grad_norm": 0.3932897448539734, + "learning_rate": 0.00010076794657762938, + "loss": 0.5758, + "step": 1492 + }, + { + "epoch": 0.000524056814006603, + "grad_norm": 0.39378783106803894, + "learning_rate": 0.00010070116861435728, + "loss": 0.5199, + "step": 1493 + }, + { + "epoch": 0.0005244078232591191, + "grad_norm": 0.33147481083869934, + "learning_rate": 0.00010063439065108516, + "loss": 0.4489, + "step": 1494 + }, + { + "epoch": 0.0005247588325116353, + "grad_norm": 0.3706863522529602, + "learning_rate": 0.00010056761268781303, + "loss": 0.4601, + "step": 1495 + }, + { + "epoch": 0.0005251098417641515, + "grad_norm": 0.45806849002838135, + "learning_rate": 0.0001005008347245409, + "loss": 0.4522, + "step": 1496 + }, + { + "epoch": 0.0005254608510166676, + "grad_norm": 0.2931700050830841, + "learning_rate": 0.00010043405676126878, + "loss": 0.3673, + "step": 1497 + }, + { + "epoch": 0.0005258118602691837, + "grad_norm": 0.31791719794273376, + "learning_rate": 0.00010036727879799666, + "loss": 0.497, + "step": 1498 + }, + { + "epoch": 0.0005261628695216999, + "grad_norm": 0.51285719871521, + "learning_rate": 0.00010030050083472455, + "loss": 0.4736, + "step": 1499 + }, + { + "epoch": 0.000526513878774216, + "grad_norm": 0.37526455521583557, + "learning_rate": 0.00010023372287145244, + "loss": 0.5242, + "step": 1500 + }, + { + "epoch": 0.0005268648880267322, + "grad_norm": 0.3161305785179138, + "learning_rate": 0.00010016694490818031, + "loss": 0.5977, + "step": 1501 + }, + { + "epoch": 0.0005272158972792483, + "grad_norm": 0.37831833958625793, + "learning_rate": 0.00010010016694490818, + "loss": 0.4086, + "step": 1502 + }, + { + "epoch": 0.0005275669065317644, + "grad_norm": 0.32192960381507874, + "learning_rate": 0.00010003338898163605, + "loss": 0.5963, + "step": 1503 + }, + { + "epoch": 0.0005279179157842806, + "grad_norm": 0.2514601945877075, + "learning_rate": 9.996661101836394e-05, + "loss": 0.4623, + "step": 1504 + }, + { + "epoch": 0.0005282689250367968, + "grad_norm": 0.2768949270248413, + "learning_rate": 9.989983305509183e-05, + "loss": 0.4104, + "step": 1505 + }, + { + "epoch": 0.0005286199342893129, + "grad_norm": 0.405597448348999, + "learning_rate": 9.98330550918197e-05, + "loss": 0.6432, + "step": 1506 + }, + { + "epoch": 0.000528970943541829, + "grad_norm": 0.36578214168548584, + "learning_rate": 9.976627712854757e-05, + "loss": 0.5242, + "step": 1507 + }, + { + "epoch": 0.0005293219527943452, + "grad_norm": 0.3062324821949005, + "learning_rate": 9.969949916527546e-05, + "loss": 0.5465, + "step": 1508 + }, + { + "epoch": 0.0005296729620468614, + "grad_norm": 0.35657453536987305, + "learning_rate": 9.963272120200335e-05, + "loss": 0.521, + "step": 1509 + }, + { + "epoch": 0.0005300239712993774, + "grad_norm": 0.4276399612426758, + "learning_rate": 9.956594323873122e-05, + "loss": 0.6213, + "step": 1510 + }, + { + "epoch": 0.0005303749805518936, + "grad_norm": 0.2819078862667084, + "learning_rate": 9.949916527545911e-05, + "loss": 0.5481, + "step": 1511 + }, + { + "epoch": 0.0005307259898044098, + "grad_norm": 0.31928518414497375, + "learning_rate": 9.943238731218698e-05, + "loss": 0.6391, + "step": 1512 + }, + { + "epoch": 0.0005310769990569258, + "grad_norm": 0.30502405762672424, + "learning_rate": 9.936560934891487e-05, + "loss": 0.6408, + "step": 1513 + }, + { + "epoch": 0.000531428008309442, + "grad_norm": 0.33620330691337585, + "learning_rate": 9.929883138564274e-05, + "loss": 0.5161, + "step": 1514 + }, + { + "epoch": 0.0005317790175619582, + "grad_norm": 0.34412580728530884, + "learning_rate": 9.923205342237061e-05, + "loss": 0.5942, + "step": 1515 + }, + { + "epoch": 0.0005321300268144744, + "grad_norm": 0.4236386716365814, + "learning_rate": 9.91652754590985e-05, + "loss": 0.6118, + "step": 1516 + }, + { + "epoch": 0.0005324810360669904, + "grad_norm": 0.3482692539691925, + "learning_rate": 9.909849749582639e-05, + "loss": 0.4935, + "step": 1517 + }, + { + "epoch": 0.0005328320453195066, + "grad_norm": 0.36736801266670227, + "learning_rate": 9.903171953255426e-05, + "loss": 0.5077, + "step": 1518 + }, + { + "epoch": 0.0005331830545720228, + "grad_norm": 0.3174130916595459, + "learning_rate": 9.896494156928215e-05, + "loss": 0.5046, + "step": 1519 + }, + { + "epoch": 0.0005335340638245389, + "grad_norm": 0.35202938318252563, + "learning_rate": 9.889816360601002e-05, + "loss": 0.5843, + "step": 1520 + }, + { + "epoch": 0.000533885073077055, + "grad_norm": 0.3530493974685669, + "learning_rate": 9.883138564273791e-05, + "loss": 0.4544, + "step": 1521 + }, + { + "epoch": 0.0005342360823295712, + "grad_norm": 0.36287322640419006, + "learning_rate": 9.876460767946578e-05, + "loss": 0.3369, + "step": 1522 + }, + { + "epoch": 0.0005345870915820873, + "grad_norm": 0.32286468148231506, + "learning_rate": 9.869782971619365e-05, + "loss": 0.4056, + "step": 1523 + }, + { + "epoch": 0.0005349381008346035, + "grad_norm": 0.34090831875801086, + "learning_rate": 9.863105175292154e-05, + "loss": 0.4746, + "step": 1524 + }, + { + "epoch": 0.0005352891100871196, + "grad_norm": 0.35454946756362915, + "learning_rate": 9.856427378964941e-05, + "loss": 0.5432, + "step": 1525 + }, + { + "epoch": 0.0005356401193396357, + "grad_norm": 0.3738378584384918, + "learning_rate": 9.84974958263773e-05, + "loss": 0.5179, + "step": 1526 + }, + { + "epoch": 0.0005359911285921519, + "grad_norm": 0.309709370136261, + "learning_rate": 9.843071786310519e-05, + "loss": 0.5872, + "step": 1527 + }, + { + "epoch": 0.0005363421378446681, + "grad_norm": 0.2821864187717438, + "learning_rate": 9.836393989983306e-05, + "loss": 0.5162, + "step": 1528 + }, + { + "epoch": 0.0005366931470971842, + "grad_norm": 0.46964001655578613, + "learning_rate": 9.829716193656095e-05, + "loss": 0.4713, + "step": 1529 + }, + { + "epoch": 0.0005370441563497003, + "grad_norm": 0.3433643877506256, + "learning_rate": 9.823038397328882e-05, + "loss": 0.516, + "step": 1530 + }, + { + "epoch": 0.0005373951656022165, + "grad_norm": 0.347112774848938, + "learning_rate": 9.816360601001669e-05, + "loss": 0.3772, + "step": 1531 + }, + { + "epoch": 0.0005377461748547327, + "grad_norm": 0.2924909293651581, + "learning_rate": 9.809682804674458e-05, + "loss": 0.5616, + "step": 1532 + }, + { + "epoch": 0.0005380971841072487, + "grad_norm": 0.36090362071990967, + "learning_rate": 9.803005008347245e-05, + "loss": 0.5715, + "step": 1533 + }, + { + "epoch": 0.0005384481933597649, + "grad_norm": 0.31504470109939575, + "learning_rate": 9.796327212020034e-05, + "loss": 0.5311, + "step": 1534 + }, + { + "epoch": 0.0005387992026122811, + "grad_norm": 0.34885862469673157, + "learning_rate": 9.789649415692823e-05, + "loss": 0.4626, + "step": 1535 + }, + { + "epoch": 0.0005391502118647971, + "grad_norm": 0.34042325615882874, + "learning_rate": 9.78297161936561e-05, + "loss": 0.4934, + "step": 1536 + }, + { + "epoch": 0.0005395012211173133, + "grad_norm": 0.39018404483795166, + "learning_rate": 9.776293823038399e-05, + "loss": 0.5814, + "step": 1537 + }, + { + "epoch": 0.0005398522303698295, + "grad_norm": 0.2676241397857666, + "learning_rate": 9.769616026711186e-05, + "loss": 0.438, + "step": 1538 + }, + { + "epoch": 0.0005402032396223457, + "grad_norm": 0.32380932569503784, + "learning_rate": 9.762938230383973e-05, + "loss": 0.6087, + "step": 1539 + }, + { + "epoch": 0.0005405542488748617, + "grad_norm": 0.35949036478996277, + "learning_rate": 9.756260434056762e-05, + "loss": 0.6076, + "step": 1540 + }, + { + "epoch": 0.0005409052581273779, + "grad_norm": 0.29408982396125793, + "learning_rate": 9.749582637729549e-05, + "loss": 0.5134, + "step": 1541 + }, + { + "epoch": 0.0005412562673798941, + "grad_norm": 0.30686628818511963, + "learning_rate": 9.742904841402337e-05, + "loss": 0.5617, + "step": 1542 + }, + { + "epoch": 0.0005416072766324102, + "grad_norm": 0.37297409772872925, + "learning_rate": 9.736227045075125e-05, + "loss": 0.4563, + "step": 1543 + }, + { + "epoch": 0.0005419582858849263, + "grad_norm": 0.3103518486022949, + "learning_rate": 9.729549248747914e-05, + "loss": 0.5235, + "step": 1544 + }, + { + "epoch": 0.0005423092951374425, + "grad_norm": 0.3941648602485657, + "learning_rate": 9.722871452420703e-05, + "loss": 0.6541, + "step": 1545 + }, + { + "epoch": 0.0005426603043899586, + "grad_norm": 0.30755361914634705, + "learning_rate": 9.71619365609349e-05, + "loss": 0.5612, + "step": 1546 + }, + { + "epoch": 0.0005430113136424748, + "grad_norm": 0.35478439927101135, + "learning_rate": 9.709515859766277e-05, + "loss": 0.6078, + "step": 1547 + }, + { + "epoch": 0.0005433623228949909, + "grad_norm": 0.30011776089668274, + "learning_rate": 9.702838063439066e-05, + "loss": 0.3989, + "step": 1548 + }, + { + "epoch": 0.0005437133321475071, + "grad_norm": 0.3524412214756012, + "learning_rate": 9.696160267111853e-05, + "loss": 0.554, + "step": 1549 + }, + { + "epoch": 0.0005440643414000232, + "grad_norm": 0.33379805088043213, + "learning_rate": 9.68948247078464e-05, + "loss": 0.4773, + "step": 1550 + }, + { + "epoch": 0.0005444153506525394, + "grad_norm": 0.3144623339176178, + "learning_rate": 9.682804674457429e-05, + "loss": 0.5397, + "step": 1551 + }, + { + "epoch": 0.0005447663599050555, + "grad_norm": 0.3189099431037903, + "learning_rate": 9.676126878130218e-05, + "loss": 0.5577, + "step": 1552 + }, + { + "epoch": 0.0005451173691575716, + "grad_norm": 0.2930092215538025, + "learning_rate": 9.669449081803006e-05, + "loss": 0.436, + "step": 1553 + }, + { + "epoch": 0.0005454683784100878, + "grad_norm": 0.30305665731430054, + "learning_rate": 9.662771285475794e-05, + "loss": 0.6663, + "step": 1554 + }, + { + "epoch": 0.000545819387662604, + "grad_norm": 0.31724509596824646, + "learning_rate": 9.656093489148581e-05, + "loss": 0.5232, + "step": 1555 + }, + { + "epoch": 0.00054617039691512, + "grad_norm": 0.3048739731311798, + "learning_rate": 9.64941569282137e-05, + "loss": 0.5975, + "step": 1556 + }, + { + "epoch": 0.0005465214061676362, + "grad_norm": 0.313481867313385, + "learning_rate": 9.642737896494157e-05, + "loss": 0.5658, + "step": 1557 + }, + { + "epoch": 0.0005468724154201524, + "grad_norm": 0.3365669548511505, + "learning_rate": 9.636060100166944e-05, + "loss": 0.5356, + "step": 1558 + }, + { + "epoch": 0.0005472234246726686, + "grad_norm": 0.29624179005622864, + "learning_rate": 9.629382303839733e-05, + "loss": 0.5596, + "step": 1559 + }, + { + "epoch": 0.0005475744339251846, + "grad_norm": 0.32584840059280396, + "learning_rate": 9.62270450751252e-05, + "loss": 0.6195, + "step": 1560 + }, + { + "epoch": 0.0005479254431777008, + "grad_norm": 0.3141777217388153, + "learning_rate": 9.616026711185309e-05, + "loss": 0.5428, + "step": 1561 + }, + { + "epoch": 0.000548276452430217, + "grad_norm": 0.49182063341140747, + "learning_rate": 9.609348914858098e-05, + "loss": 0.4425, + "step": 1562 + }, + { + "epoch": 0.000548627461682733, + "grad_norm": 0.3521610200405121, + "learning_rate": 9.602671118530885e-05, + "loss": 0.4566, + "step": 1563 + }, + { + "epoch": 0.0005489784709352492, + "grad_norm": 0.32009604573249817, + "learning_rate": 9.595993322203674e-05, + "loss": 0.4673, + "step": 1564 + }, + { + "epoch": 0.0005493294801877654, + "grad_norm": 0.4251219630241394, + "learning_rate": 9.589315525876461e-05, + "loss": 0.4582, + "step": 1565 + }, + { + "epoch": 0.0005496804894402815, + "grad_norm": 0.4044347107410431, + "learning_rate": 9.582637729549248e-05, + "loss": 0.6094, + "step": 1566 + }, + { + "epoch": 0.0005500314986927976, + "grad_norm": 0.37995630502700806, + "learning_rate": 9.575959933222037e-05, + "loss": 0.5798, + "step": 1567 + }, + { + "epoch": 0.0005503825079453138, + "grad_norm": 0.36014696955680847, + "learning_rate": 9.569282136894824e-05, + "loss": 0.5578, + "step": 1568 + }, + { + "epoch": 0.00055073351719783, + "grad_norm": 0.36085575819015503, + "learning_rate": 9.562604340567613e-05, + "loss": 0.5723, + "step": 1569 + }, + { + "epoch": 0.0005510845264503461, + "grad_norm": 0.34479430317878723, + "learning_rate": 9.555926544240402e-05, + "loss": 0.3701, + "step": 1570 + }, + { + "epoch": 0.0005514355357028622, + "grad_norm": 0.29680463671684265, + "learning_rate": 9.549248747913189e-05, + "loss": 0.443, + "step": 1571 + }, + { + "epoch": 0.0005517865449553784, + "grad_norm": 0.282615065574646, + "learning_rate": 9.542570951585978e-05, + "loss": 0.4408, + "step": 1572 + }, + { + "epoch": 0.0005521375542078945, + "grad_norm": 0.30851373076438904, + "learning_rate": 9.535893155258765e-05, + "loss": 0.5537, + "step": 1573 + }, + { + "epoch": 0.0005524885634604107, + "grad_norm": 0.41260892152786255, + "learning_rate": 9.529215358931554e-05, + "loss": 0.5401, + "step": 1574 + }, + { + "epoch": 0.0005528395727129268, + "grad_norm": 0.31149372458457947, + "learning_rate": 9.522537562604341e-05, + "loss": 0.4033, + "step": 1575 + }, + { + "epoch": 0.0005531905819654429, + "grad_norm": 0.33126652240753174, + "learning_rate": 9.515859766277128e-05, + "loss": 0.5386, + "step": 1576 + }, + { + "epoch": 0.0005535415912179591, + "grad_norm": 0.2965177297592163, + "learning_rate": 9.509181969949917e-05, + "loss": 0.4869, + "step": 1577 + }, + { + "epoch": 0.0005538926004704753, + "grad_norm": 0.28436359763145447, + "learning_rate": 9.502504173622706e-05, + "loss": 0.5632, + "step": 1578 + }, + { + "epoch": 0.0005542436097229914, + "grad_norm": 0.3518412113189697, + "learning_rate": 9.495826377295493e-05, + "loss": 0.4086, + "step": 1579 + }, + { + "epoch": 0.0005545946189755075, + "grad_norm": 0.3295888900756836, + "learning_rate": 9.489148580968282e-05, + "loss": 0.5742, + "step": 1580 + }, + { + "epoch": 0.0005549456282280237, + "grad_norm": 0.3147815763950348, + "learning_rate": 9.482470784641069e-05, + "loss": 0.5332, + "step": 1581 + }, + { + "epoch": 0.0005552966374805399, + "grad_norm": 0.30593639612197876, + "learning_rate": 9.475792988313858e-05, + "loss": 0.5496, + "step": 1582 + }, + { + "epoch": 0.0005556476467330559, + "grad_norm": 0.3162075877189636, + "learning_rate": 9.469115191986645e-05, + "loss": 0.5912, + "step": 1583 + }, + { + "epoch": 0.0005559986559855721, + "grad_norm": 0.32497403025627136, + "learning_rate": 9.462437395659432e-05, + "loss": 0.5494, + "step": 1584 + }, + { + "epoch": 0.0005563496652380883, + "grad_norm": 0.31055036187171936, + "learning_rate": 9.455759599332221e-05, + "loss": 0.6336, + "step": 1585 + }, + { + "epoch": 0.0005567006744906044, + "grad_norm": 0.33537331223487854, + "learning_rate": 9.449081803005008e-05, + "loss": 0.4221, + "step": 1586 + }, + { + "epoch": 0.0005570516837431205, + "grad_norm": 0.3572893440723419, + "learning_rate": 9.442404006677797e-05, + "loss": 0.5553, + "step": 1587 + }, + { + "epoch": 0.0005574026929956367, + "grad_norm": 0.3298802375793457, + "learning_rate": 9.435726210350586e-05, + "loss": 0.5121, + "step": 1588 + }, + { + "epoch": 0.0005577537022481529, + "grad_norm": 0.3529982268810272, + "learning_rate": 9.429048414023373e-05, + "loss": 0.3964, + "step": 1589 + }, + { + "epoch": 0.000558104711500669, + "grad_norm": 0.294223427772522, + "learning_rate": 9.422370617696162e-05, + "loss": 0.4495, + "step": 1590 + }, + { + "epoch": 0.0005584557207531851, + "grad_norm": 0.2953149676322937, + "learning_rate": 9.415692821368949e-05, + "loss": 0.4241, + "step": 1591 + }, + { + "epoch": 0.0005588067300057013, + "grad_norm": 0.31237637996673584, + "learning_rate": 9.409015025041736e-05, + "loss": 0.5025, + "step": 1592 + }, + { + "epoch": 0.0005591577392582174, + "grad_norm": 0.31202566623687744, + "learning_rate": 9.402337228714525e-05, + "loss": 0.5442, + "step": 1593 + }, + { + "epoch": 0.0005595087485107335, + "grad_norm": 0.34976473450660706, + "learning_rate": 9.395659432387312e-05, + "loss": 0.5314, + "step": 1594 + }, + { + "epoch": 0.0005598597577632497, + "grad_norm": 0.3305265009403229, + "learning_rate": 9.388981636060101e-05, + "loss": 0.4842, + "step": 1595 + }, + { + "epoch": 0.0005602107670157658, + "grad_norm": 0.30773475766181946, + "learning_rate": 9.38230383973289e-05, + "loss": 0.4621, + "step": 1596 + }, + { + "epoch": 0.000560561776268282, + "grad_norm": 0.35445886850357056, + "learning_rate": 9.375626043405677e-05, + "loss": 0.509, + "step": 1597 + }, + { + "epoch": 0.0005609127855207981, + "grad_norm": 0.46057018637657166, + "learning_rate": 9.368948247078465e-05, + "loss": 0.5471, + "step": 1598 + }, + { + "epoch": 0.0005612637947733143, + "grad_norm": 0.3413529396057129, + "learning_rate": 9.362270450751253e-05, + "loss": 0.5558, + "step": 1599 + }, + { + "epoch": 0.0005616148040258304, + "grad_norm": 0.36943134665489197, + "learning_rate": 9.35559265442404e-05, + "loss": 0.4718, + "step": 1600 + }, + { + "epoch": 0.0005619658132783466, + "grad_norm": 0.3529636263847351, + "learning_rate": 9.348914858096829e-05, + "loss": 0.4591, + "step": 1601 + }, + { + "epoch": 0.0005623168225308627, + "grad_norm": 0.3375125229358673, + "learning_rate": 9.342237061769616e-05, + "loss": 0.4971, + "step": 1602 + }, + { + "epoch": 0.0005626678317833788, + "grad_norm": 0.3923933804035187, + "learning_rate": 9.335559265442403e-05, + "loss": 0.545, + "step": 1603 + }, + { + "epoch": 0.000563018841035895, + "grad_norm": 0.3128841519355774, + "learning_rate": 9.328881469115192e-05, + "loss": 0.4374, + "step": 1604 + }, + { + "epoch": 0.0005633698502884112, + "grad_norm": 0.3729458153247833, + "learning_rate": 9.322203672787981e-05, + "loss": 0.581, + "step": 1605 + }, + { + "epoch": 0.0005637208595409272, + "grad_norm": 0.3644692003726959, + "learning_rate": 9.31552587646077e-05, + "loss": 0.5223, + "step": 1606 + }, + { + "epoch": 0.0005640718687934434, + "grad_norm": 0.365633100271225, + "learning_rate": 9.308848080133557e-05, + "loss": 0.3695, + "step": 1607 + }, + { + "epoch": 0.0005644228780459596, + "grad_norm": 0.3256838917732239, + "learning_rate": 9.302170283806344e-05, + "loss": 0.5484, + "step": 1608 + }, + { + "epoch": 0.0005647738872984758, + "grad_norm": 0.26042798161506653, + "learning_rate": 9.295492487479133e-05, + "loss": 0.529, + "step": 1609 + }, + { + "epoch": 0.0005651248965509918, + "grad_norm": 0.27954763174057007, + "learning_rate": 9.28881469115192e-05, + "loss": 0.5216, + "step": 1610 + }, + { + "epoch": 0.000565475905803508, + "grad_norm": 0.3117378354072571, + "learning_rate": 9.282136894824707e-05, + "loss": 0.4835, + "step": 1611 + }, + { + "epoch": 0.0005658269150560242, + "grad_norm": 0.3219063878059387, + "learning_rate": 9.275459098497496e-05, + "loss": 0.6403, + "step": 1612 + }, + { + "epoch": 0.0005661779243085403, + "grad_norm": 0.32121285796165466, + "learning_rate": 9.268781302170285e-05, + "loss": 0.5992, + "step": 1613 + }, + { + "epoch": 0.0005665289335610564, + "grad_norm": 0.2896992564201355, + "learning_rate": 9.262103505843073e-05, + "loss": 0.4995, + "step": 1614 + }, + { + "epoch": 0.0005668799428135726, + "grad_norm": 0.311301589012146, + "learning_rate": 9.255425709515861e-05, + "loss": 0.5201, + "step": 1615 + }, + { + "epoch": 0.0005672309520660887, + "grad_norm": 0.2977074682712555, + "learning_rate": 9.248747913188648e-05, + "loss": 0.5557, + "step": 1616 + }, + { + "epoch": 0.0005675819613186049, + "grad_norm": 0.315746009349823, + "learning_rate": 9.242070116861437e-05, + "loss": 0.5746, + "step": 1617 + }, + { + "epoch": 0.000567932970571121, + "grad_norm": 0.323231965303421, + "learning_rate": 9.235392320534224e-05, + "loss": 0.5714, + "step": 1618 + }, + { + "epoch": 0.0005682839798236372, + "grad_norm": 0.30381882190704346, + "learning_rate": 9.228714524207011e-05, + "loss": 0.5279, + "step": 1619 + }, + { + "epoch": 0.0005686349890761533, + "grad_norm": 0.3350276052951813, + "learning_rate": 9.2220367278798e-05, + "loss": 0.4504, + "step": 1620 + }, + { + "epoch": 0.0005689859983286694, + "grad_norm": 0.3821620047092438, + "learning_rate": 9.215358931552587e-05, + "loss": 0.4713, + "step": 1621 + }, + { + "epoch": 0.0005693370075811856, + "grad_norm": 0.299938827753067, + "learning_rate": 9.208681135225376e-05, + "loss": 0.5426, + "step": 1622 + }, + { + "epoch": 0.0005696880168337017, + "grad_norm": 0.3533617854118347, + "learning_rate": 9.202003338898165e-05, + "loss": 0.5947, + "step": 1623 + }, + { + "epoch": 0.0005700390260862179, + "grad_norm": 0.5132538080215454, + "learning_rate": 9.195325542570952e-05, + "loss": 0.4809, + "step": 1624 + }, + { + "epoch": 0.000570390035338734, + "grad_norm": 0.28735020756721497, + "learning_rate": 9.18864774624374e-05, + "loss": 0.5597, + "step": 1625 + }, + { + "epoch": 0.0005707410445912501, + "grad_norm": 0.3230040669441223, + "learning_rate": 9.181969949916528e-05, + "loss": 0.5099, + "step": 1626 + }, + { + "epoch": 0.0005710920538437663, + "grad_norm": 0.3185240924358368, + "learning_rate": 9.175292153589315e-05, + "loss": 0.5443, + "step": 1627 + }, + { + "epoch": 0.0005714430630962825, + "grad_norm": 0.3230789005756378, + "learning_rate": 9.168614357262104e-05, + "loss": 0.4757, + "step": 1628 + }, + { + "epoch": 0.0005717940723487986, + "grad_norm": 0.3181735873222351, + "learning_rate": 9.161936560934891e-05, + "loss": 0.4645, + "step": 1629 + }, + { + "epoch": 0.0005721450816013147, + "grad_norm": 0.31638282537460327, + "learning_rate": 9.15525876460768e-05, + "loss": 0.6041, + "step": 1630 + }, + { + "epoch": 0.0005724960908538309, + "grad_norm": 0.31525102257728577, + "learning_rate": 9.148580968280469e-05, + "loss": 0.5238, + "step": 1631 + }, + { + "epoch": 0.0005728471001063471, + "grad_norm": 0.27146804332733154, + "learning_rate": 9.141903171953256e-05, + "loss": 0.5115, + "step": 1632 + }, + { + "epoch": 0.0005731981093588631, + "grad_norm": 0.28801295161247253, + "learning_rate": 9.135225375626045e-05, + "loss": 0.4111, + "step": 1633 + }, + { + "epoch": 0.0005735491186113793, + "grad_norm": 0.3048948645591736, + "learning_rate": 9.128547579298832e-05, + "loss": 0.5293, + "step": 1634 + }, + { + "epoch": 0.0005739001278638955, + "grad_norm": 0.31797000765800476, + "learning_rate": 9.121869782971619e-05, + "loss": 0.5365, + "step": 1635 + }, + { + "epoch": 0.0005742511371164116, + "grad_norm": 0.3156517446041107, + "learning_rate": 9.115191986644408e-05, + "loss": 0.6072, + "step": 1636 + }, + { + "epoch": 0.0005746021463689277, + "grad_norm": 0.28218841552734375, + "learning_rate": 9.108514190317195e-05, + "loss": 0.5127, + "step": 1637 + }, + { + "epoch": 0.0005749531556214439, + "grad_norm": 0.34264588356018066, + "learning_rate": 9.101836393989984e-05, + "loss": 0.5442, + "step": 1638 + }, + { + "epoch": 0.0005753041648739601, + "grad_norm": 0.31075727939605713, + "learning_rate": 9.095158597662771e-05, + "loss": 0.4853, + "step": 1639 + }, + { + "epoch": 0.0005756551741264762, + "grad_norm": 0.34270209074020386, + "learning_rate": 9.08848080133556e-05, + "loss": 0.5188, + "step": 1640 + }, + { + "epoch": 0.0005760061833789923, + "grad_norm": 0.3420792520046234, + "learning_rate": 9.081803005008348e-05, + "loss": 0.552, + "step": 1641 + }, + { + "epoch": 0.0005763571926315085, + "grad_norm": 0.24184514582157135, + "learning_rate": 9.075125208681136e-05, + "loss": 0.4318, + "step": 1642 + }, + { + "epoch": 0.0005767082018840246, + "grad_norm": 0.27248474955558777, + "learning_rate": 9.068447412353923e-05, + "loss": 0.4984, + "step": 1643 + }, + { + "epoch": 0.0005770592111365408, + "grad_norm": 0.2861645817756653, + "learning_rate": 9.061769616026712e-05, + "loss": 0.4954, + "step": 1644 + }, + { + "epoch": 0.0005774102203890569, + "grad_norm": 0.3070414662361145, + "learning_rate": 9.055091819699499e-05, + "loss": 0.5734, + "step": 1645 + }, + { + "epoch": 0.000577761229641573, + "grad_norm": 0.32180657982826233, + "learning_rate": 9.048414023372288e-05, + "loss": 0.595, + "step": 1646 + }, + { + "epoch": 0.0005781122388940892, + "grad_norm": 0.29433441162109375, + "learning_rate": 9.041736227045075e-05, + "loss": 0.4721, + "step": 1647 + }, + { + "epoch": 0.0005784632481466053, + "grad_norm": 0.28735247254371643, + "learning_rate": 9.035058430717864e-05, + "loss": 0.5441, + "step": 1648 + }, + { + "epoch": 0.0005788142573991215, + "grad_norm": 0.38344794511795044, + "learning_rate": 9.028380634390652e-05, + "loss": 0.6197, + "step": 1649 + }, + { + "epoch": 0.0005791652666516376, + "grad_norm": 0.32271769642829895, + "learning_rate": 9.02170283806344e-05, + "loss": 0.5229, + "step": 1650 + }, + { + "epoch": 0.0005795162759041538, + "grad_norm": 0.27504557371139526, + "learning_rate": 9.015025041736227e-05, + "loss": 0.432, + "step": 1651 + }, + { + "epoch": 0.00057986728515667, + "grad_norm": 0.3397347033023834, + "learning_rate": 9.008347245409016e-05, + "loss": 0.5546, + "step": 1652 + }, + { + "epoch": 0.000580218294409186, + "grad_norm": 0.3478119671344757, + "learning_rate": 9.001669449081803e-05, + "loss": 0.5094, + "step": 1653 + }, + { + "epoch": 0.0005805693036617022, + "grad_norm": 0.3200027644634247, + "learning_rate": 8.994991652754592e-05, + "loss": 0.4964, + "step": 1654 + }, + { + "epoch": 0.0005809203129142184, + "grad_norm": 0.3458947539329529, + "learning_rate": 8.988313856427379e-05, + "loss": 0.5945, + "step": 1655 + }, + { + "epoch": 0.0005812713221667344, + "grad_norm": 0.30390462279319763, + "learning_rate": 8.981636060100166e-05, + "loss": 0.5664, + "step": 1656 + }, + { + "epoch": 0.0005816223314192506, + "grad_norm": 0.32214075326919556, + "learning_rate": 8.974958263772955e-05, + "loss": 0.464, + "step": 1657 + }, + { + "epoch": 0.0005819733406717668, + "grad_norm": 0.3261844217777252, + "learning_rate": 8.968280467445744e-05, + "loss": 0.6139, + "step": 1658 + }, + { + "epoch": 0.000582324349924283, + "grad_norm": 0.30164632201194763, + "learning_rate": 8.961602671118531e-05, + "loss": 0.4767, + "step": 1659 + }, + { + "epoch": 0.000582675359176799, + "grad_norm": 0.27412328124046326, + "learning_rate": 8.95492487479132e-05, + "loss": 0.4773, + "step": 1660 + }, + { + "epoch": 0.0005830263684293152, + "grad_norm": 0.3026188313961029, + "learning_rate": 8.948247078464107e-05, + "loss": 0.5091, + "step": 1661 + }, + { + "epoch": 0.0005833773776818314, + "grad_norm": 0.4182475507259369, + "learning_rate": 8.941569282136896e-05, + "loss": 0.4763, + "step": 1662 + }, + { + "epoch": 0.0005837283869343475, + "grad_norm": 0.32345879077911377, + "learning_rate": 8.934891485809683e-05, + "loss": 0.4365, + "step": 1663 + }, + { + "epoch": 0.0005840793961868636, + "grad_norm": 0.27278438210487366, + "learning_rate": 8.92821368948247e-05, + "loss": 0.4126, + "step": 1664 + }, + { + "epoch": 0.0005844304054393798, + "grad_norm": 0.2701342701911926, + "learning_rate": 8.921535893155259e-05, + "loss": 0.44, + "step": 1665 + }, + { + "epoch": 0.0005847814146918959, + "grad_norm": 0.33415308594703674, + "learning_rate": 8.914858096828048e-05, + "loss": 0.4873, + "step": 1666 + }, + { + "epoch": 0.0005851324239444121, + "grad_norm": 0.25953027606010437, + "learning_rate": 8.908180300500835e-05, + "loss": 0.5047, + "step": 1667 + }, + { + "epoch": 0.0005854834331969282, + "grad_norm": 0.2938767373561859, + "learning_rate": 8.901502504173624e-05, + "loss": 0.5472, + "step": 1668 + }, + { + "epoch": 0.0005858344424494444, + "grad_norm": 0.34639960527420044, + "learning_rate": 8.894824707846411e-05, + "loss": 0.4647, + "step": 1669 + }, + { + "epoch": 0.0005861854517019605, + "grad_norm": 0.30084356665611267, + "learning_rate": 8.8881469115192e-05, + "loss": 0.5751, + "step": 1670 + }, + { + "epoch": 0.0005865364609544767, + "grad_norm": 0.3419461250305176, + "learning_rate": 8.881469115191987e-05, + "loss": 0.5945, + "step": 1671 + }, + { + "epoch": 0.0005868874702069928, + "grad_norm": 0.30969375371932983, + "learning_rate": 8.874791318864774e-05, + "loss": 0.476, + "step": 1672 + }, + { + "epoch": 0.0005872384794595089, + "grad_norm": 0.2766319513320923, + "learning_rate": 8.868113522537563e-05, + "loss": 0.471, + "step": 1673 + }, + { + "epoch": 0.0005875894887120251, + "grad_norm": 0.2892490327358246, + "learning_rate": 8.86143572621035e-05, + "loss": 0.5525, + "step": 1674 + }, + { + "epoch": 0.0005879404979645413, + "grad_norm": 0.2913951575756073, + "learning_rate": 8.854757929883139e-05, + "loss": 0.5969, + "step": 1675 + }, + { + "epoch": 0.0005882915072170573, + "grad_norm": 0.3010789155960083, + "learning_rate": 8.848080133555928e-05, + "loss": 0.4817, + "step": 1676 + }, + { + "epoch": 0.0005886425164695735, + "grad_norm": 0.29977700114250183, + "learning_rate": 8.841402337228715e-05, + "loss": 0.4793, + "step": 1677 + }, + { + "epoch": 0.0005889935257220897, + "grad_norm": 0.3283400535583496, + "learning_rate": 8.834724540901504e-05, + "loss": 0.5056, + "step": 1678 + }, + { + "epoch": 0.0005893445349746058, + "grad_norm": 0.30444255471229553, + "learning_rate": 8.828046744574291e-05, + "loss": 0.4955, + "step": 1679 + }, + { + "epoch": 0.0005896955442271219, + "grad_norm": 0.3443448543548584, + "learning_rate": 8.821368948247078e-05, + "loss": 0.5143, + "step": 1680 + }, + { + "epoch": 0.0005900465534796381, + "grad_norm": 0.29445815086364746, + "learning_rate": 8.814691151919867e-05, + "loss": 0.5487, + "step": 1681 + }, + { + "epoch": 0.0005903975627321543, + "grad_norm": 0.2663688659667969, + "learning_rate": 8.808013355592654e-05, + "loss": 0.4625, + "step": 1682 + }, + { + "epoch": 0.0005907485719846703, + "grad_norm": 0.3313208222389221, + "learning_rate": 8.801335559265443e-05, + "loss": 0.5043, + "step": 1683 + }, + { + "epoch": 0.0005910995812371865, + "grad_norm": 0.33829203248023987, + "learning_rate": 8.794657762938232e-05, + "loss": 0.5575, + "step": 1684 + }, + { + "epoch": 0.0005914505904897027, + "grad_norm": 0.2788808047771454, + "learning_rate": 8.787979966611019e-05, + "loss": 0.3439, + "step": 1685 + }, + { + "epoch": 0.0005918015997422188, + "grad_norm": 0.2924749255180359, + "learning_rate": 8.781302170283808e-05, + "loss": 0.5249, + "step": 1686 + }, + { + "epoch": 0.0005921526089947349, + "grad_norm": 0.3375588357448578, + "learning_rate": 8.774624373956595e-05, + "loss": 0.5204, + "step": 1687 + }, + { + "epoch": 0.0005925036182472511, + "grad_norm": 0.31543827056884766, + "learning_rate": 8.767946577629382e-05, + "loss": 0.547, + "step": 1688 + }, + { + "epoch": 0.0005928546274997673, + "grad_norm": 0.29130932688713074, + "learning_rate": 8.761268781302171e-05, + "loss": 0.4064, + "step": 1689 + }, + { + "epoch": 0.0005932056367522834, + "grad_norm": 0.28948086500167847, + "learning_rate": 8.754590984974958e-05, + "loss": 0.4538, + "step": 1690 + }, + { + "epoch": 0.0005935566460047995, + "grad_norm": 0.3201799690723419, + "learning_rate": 8.747913188647745e-05, + "loss": 0.5578, + "step": 1691 + }, + { + "epoch": 0.0005939076552573157, + "grad_norm": 0.3169330954551697, + "learning_rate": 8.741235392320535e-05, + "loss": 0.5626, + "step": 1692 + }, + { + "epoch": 0.0005942586645098318, + "grad_norm": 0.34727850556373596, + "learning_rate": 8.734557595993323e-05, + "loss": 0.4209, + "step": 1693 + }, + { + "epoch": 0.000594609673762348, + "grad_norm": 0.3186934292316437, + "learning_rate": 8.727879799666111e-05, + "loss": 0.5401, + "step": 1694 + }, + { + "epoch": 0.0005949606830148641, + "grad_norm": 0.34129294753074646, + "learning_rate": 8.721202003338899e-05, + "loss": 0.4617, + "step": 1695 + }, + { + "epoch": 0.0005953116922673802, + "grad_norm": 0.3374929130077362, + "learning_rate": 8.714524207011686e-05, + "loss": 0.5558, + "step": 1696 + }, + { + "epoch": 0.0005956627015198964, + "grad_norm": 0.30274853110313416, + "learning_rate": 8.707846410684475e-05, + "loss": 0.4544, + "step": 1697 + }, + { + "epoch": 0.0005960137107724126, + "grad_norm": 0.3348468244075775, + "learning_rate": 8.701168614357262e-05, + "loss": 0.641, + "step": 1698 + }, + { + "epoch": 0.0005963647200249287, + "grad_norm": 0.2674828767776489, + "learning_rate": 8.694490818030051e-05, + "loss": 0.5649, + "step": 1699 + }, + { + "epoch": 0.0005967157292774448, + "grad_norm": 0.3447219729423523, + "learning_rate": 8.687813021702838e-05, + "loss": 0.4618, + "step": 1700 + }, + { + "epoch": 0.000597066738529961, + "grad_norm": 0.3155357241630554, + "learning_rate": 8.681135225375627e-05, + "loss": 0.4557, + "step": 1701 + }, + { + "epoch": 0.0005974177477824772, + "grad_norm": 0.2937457263469696, + "learning_rate": 8.674457429048415e-05, + "loss": 0.65, + "step": 1702 + }, + { + "epoch": 0.0005977687570349932, + "grad_norm": 0.287835031747818, + "learning_rate": 8.667779632721203e-05, + "loss": 0.4525, + "step": 1703 + }, + { + "epoch": 0.0005981197662875094, + "grad_norm": 0.3285943865776062, + "learning_rate": 8.66110183639399e-05, + "loss": 0.4686, + "step": 1704 + }, + { + "epoch": 0.0005984707755400256, + "grad_norm": 0.3463473618030548, + "learning_rate": 8.654424040066779e-05, + "loss": 0.5349, + "step": 1705 + }, + { + "epoch": 0.0005988217847925416, + "grad_norm": 0.3047028183937073, + "learning_rate": 8.647746243739566e-05, + "loss": 0.4551, + "step": 1706 + }, + { + "epoch": 0.0005991727940450578, + "grad_norm": 0.2832798361778259, + "learning_rate": 8.641068447412355e-05, + "loss": 0.4721, + "step": 1707 + }, + { + "epoch": 0.000599523803297574, + "grad_norm": 0.3024655878543854, + "learning_rate": 8.634390651085142e-05, + "loss": 0.4971, + "step": 1708 + }, + { + "epoch": 0.0005998748125500902, + "grad_norm": 0.2802872657775879, + "learning_rate": 8.62771285475793e-05, + "loss": 0.4598, + "step": 1709 + }, + { + "epoch": 0.0006002258218026062, + "grad_norm": 0.2773732841014862, + "learning_rate": 8.62103505843072e-05, + "loss": 0.4215, + "step": 1710 + }, + { + "epoch": 0.0006005768310551224, + "grad_norm": 0.3328293263912201, + "learning_rate": 8.614357262103507e-05, + "loss": 0.4502, + "step": 1711 + }, + { + "epoch": 0.0006009278403076386, + "grad_norm": 0.3046766519546509, + "learning_rate": 8.607679465776294e-05, + "loss": 0.4578, + "step": 1712 + }, + { + "epoch": 0.0006012788495601547, + "grad_norm": 0.33364781737327576, + "learning_rate": 8.601001669449083e-05, + "loss": 0.5184, + "step": 1713 + }, + { + "epoch": 0.0006016298588126708, + "grad_norm": 0.3627041280269623, + "learning_rate": 8.59432387312187e-05, + "loss": 0.5041, + "step": 1714 + }, + { + "epoch": 0.000601980868065187, + "grad_norm": 0.3411107361316681, + "learning_rate": 8.587646076794659e-05, + "loss": 0.4983, + "step": 1715 + }, + { + "epoch": 0.0006023318773177031, + "grad_norm": 0.3014586865901947, + "learning_rate": 8.580968280467446e-05, + "loss": 0.6105, + "step": 1716 + }, + { + "epoch": 0.0006026828865702193, + "grad_norm": 0.29484355449676514, + "learning_rate": 8.574290484140233e-05, + "loss": 0.4038, + "step": 1717 + }, + { + "epoch": 0.0006030338958227354, + "grad_norm": 0.37084364891052246, + "learning_rate": 8.567612687813022e-05, + "loss": 0.4537, + "step": 1718 + }, + { + "epoch": 0.0006033849050752516, + "grad_norm": 0.29114142060279846, + "learning_rate": 8.56093489148581e-05, + "loss": 0.5568, + "step": 1719 + }, + { + "epoch": 0.0006037359143277677, + "grad_norm": 0.3706299662590027, + "learning_rate": 8.554257095158598e-05, + "loss": 0.5678, + "step": 1720 + }, + { + "epoch": 0.0006040869235802839, + "grad_norm": 0.3251887857913971, + "learning_rate": 8.547579298831387e-05, + "loss": 0.573, + "step": 1721 + }, + { + "epoch": 0.0006044379328328, + "grad_norm": 0.28198716044425964, + "learning_rate": 8.540901502504174e-05, + "loss": 0.4209, + "step": 1722 + }, + { + "epoch": 0.0006047889420853161, + "grad_norm": 0.2896440029144287, + "learning_rate": 8.534223706176963e-05, + "loss": 0.4579, + "step": 1723 + }, + { + "epoch": 0.0006051399513378323, + "grad_norm": 0.3755309283733368, + "learning_rate": 8.52754590984975e-05, + "loss": 0.5178, + "step": 1724 + }, + { + "epoch": 0.0006054909605903485, + "grad_norm": 0.37272268533706665, + "learning_rate": 8.520868113522537e-05, + "loss": 0.5911, + "step": 1725 + }, + { + "epoch": 0.0006058419698428645, + "grad_norm": 0.29033470153808594, + "learning_rate": 8.514190317195326e-05, + "loss": 0.4492, + "step": 1726 + }, + { + "epoch": 0.0006061929790953807, + "grad_norm": 0.2940375804901123, + "learning_rate": 8.507512520868115e-05, + "loss": 0.4917, + "step": 1727 + }, + { + "epoch": 0.0006065439883478969, + "grad_norm": 0.3448154926300049, + "learning_rate": 8.500834724540902e-05, + "loss": 0.5053, + "step": 1728 + }, + { + "epoch": 0.000606894997600413, + "grad_norm": 0.30485787987709045, + "learning_rate": 8.49415692821369e-05, + "loss": 0.4761, + "step": 1729 + }, + { + "epoch": 0.0006072460068529291, + "grad_norm": 0.33083775639533997, + "learning_rate": 8.487479131886478e-05, + "loss": 0.5504, + "step": 1730 + }, + { + "epoch": 0.0006075970161054453, + "grad_norm": 0.2886825203895569, + "learning_rate": 8.480801335559267e-05, + "loss": 0.5908, + "step": 1731 + }, + { + "epoch": 0.0006079480253579615, + "grad_norm": 0.3262576758861542, + "learning_rate": 8.474123539232054e-05, + "loss": 0.4562, + "step": 1732 + }, + { + "epoch": 0.0006082990346104775, + "grad_norm": 0.31888243556022644, + "learning_rate": 8.467445742904841e-05, + "loss": 0.6006, + "step": 1733 + }, + { + "epoch": 0.0006086500438629937, + "grad_norm": 0.33102548122406006, + "learning_rate": 8.46076794657763e-05, + "loss": 0.5731, + "step": 1734 + }, + { + "epoch": 0.0006090010531155099, + "grad_norm": 0.31176602840423584, + "learning_rate": 8.454090150250417e-05, + "loss": 0.4979, + "step": 1735 + }, + { + "epoch": 0.000609352062368026, + "grad_norm": 0.30639031529426575, + "learning_rate": 8.447412353923206e-05, + "loss": 0.5953, + "step": 1736 + }, + { + "epoch": 0.0006097030716205421, + "grad_norm": 0.3576785922050476, + "learning_rate": 8.440734557595994e-05, + "loss": 0.612, + "step": 1737 + }, + { + "epoch": 0.0006100540808730583, + "grad_norm": 0.3325173854827881, + "learning_rate": 8.434056761268782e-05, + "loss": 0.5577, + "step": 1738 + }, + { + "epoch": 0.0006104050901255745, + "grad_norm": 0.3713616728782654, + "learning_rate": 8.42737896494157e-05, + "loss": 0.5457, + "step": 1739 + }, + { + "epoch": 0.0006107560993780906, + "grad_norm": 0.37327736616134644, + "learning_rate": 8.420701168614358e-05, + "loss": 0.4726, + "step": 1740 + }, + { + "epoch": 0.0006111071086306067, + "grad_norm": 0.3603207468986511, + "learning_rate": 8.414023372287145e-05, + "loss": 0.5489, + "step": 1741 + }, + { + "epoch": 0.0006114581178831229, + "grad_norm": 0.30581197142601013, + "learning_rate": 8.407345575959934e-05, + "loss": 0.4219, + "step": 1742 + }, + { + "epoch": 0.000611809127135639, + "grad_norm": 0.3137530982494354, + "learning_rate": 8.400667779632721e-05, + "loss": 0.4862, + "step": 1743 + }, + { + "epoch": 0.0006121601363881552, + "grad_norm": 0.28663527965545654, + "learning_rate": 8.39398998330551e-05, + "loss": 0.4886, + "step": 1744 + }, + { + "epoch": 0.0006125111456406713, + "grad_norm": 0.28816184401512146, + "learning_rate": 8.387312186978298e-05, + "loss": 0.4536, + "step": 1745 + }, + { + "epoch": 0.0006128621548931874, + "grad_norm": 0.36478331685066223, + "learning_rate": 8.380634390651086e-05, + "loss": 0.4393, + "step": 1746 + }, + { + "epoch": 0.0006132131641457036, + "grad_norm": 0.34497642517089844, + "learning_rate": 8.373956594323874e-05, + "loss": 0.4783, + "step": 1747 + }, + { + "epoch": 0.0006135641733982198, + "grad_norm": 0.34038984775543213, + "learning_rate": 8.367278797996662e-05, + "loss": 0.4024, + "step": 1748 + }, + { + "epoch": 0.0006139151826507358, + "grad_norm": 0.42788851261138916, + "learning_rate": 8.360601001669449e-05, + "loss": 0.4738, + "step": 1749 + }, + { + "epoch": 0.000614266191903252, + "grad_norm": 0.3174630105495453, + "learning_rate": 8.353923205342238e-05, + "loss": 0.571, + "step": 1750 + }, + { + "epoch": 0.0006146172011557682, + "grad_norm": 0.43922609090805054, + "learning_rate": 8.347245409015025e-05, + "loss": 0.6078, + "step": 1751 + }, + { + "epoch": 0.0006149682104082844, + "grad_norm": 0.3589128255844116, + "learning_rate": 8.340567612687812e-05, + "loss": 0.6748, + "step": 1752 + }, + { + "epoch": 0.0006153192196608004, + "grad_norm": 0.36477571725845337, + "learning_rate": 8.333889816360601e-05, + "loss": 0.4796, + "step": 1753 + }, + { + "epoch": 0.0006156702289133166, + "grad_norm": 0.3312797546386719, + "learning_rate": 8.32721202003339e-05, + "loss": 0.5847, + "step": 1754 + }, + { + "epoch": 0.0006160212381658328, + "grad_norm": 0.3113849461078644, + "learning_rate": 8.320534223706178e-05, + "loss": 0.5345, + "step": 1755 + }, + { + "epoch": 0.0006163722474183488, + "grad_norm": 0.3181850016117096, + "learning_rate": 8.313856427378966e-05, + "loss": 0.5949, + "step": 1756 + }, + { + "epoch": 0.000616723256670865, + "grad_norm": 0.44424140453338623, + "learning_rate": 8.307178631051753e-05, + "loss": 0.5702, + "step": 1757 + }, + { + "epoch": 0.0006170742659233812, + "grad_norm": 0.3985821604728699, + "learning_rate": 8.300500834724542e-05, + "loss": 0.5699, + "step": 1758 + }, + { + "epoch": 0.0006174252751758973, + "grad_norm": 0.3222169280052185, + "learning_rate": 8.293823038397329e-05, + "loss": 0.5349, + "step": 1759 + }, + { + "epoch": 0.0006177762844284134, + "grad_norm": 0.4233343303203583, + "learning_rate": 8.287145242070116e-05, + "loss": 0.5031, + "step": 1760 + }, + { + "epoch": 0.0006181272936809296, + "grad_norm": 0.3432156443595886, + "learning_rate": 8.280467445742905e-05, + "loss": 0.5084, + "step": 1761 + }, + { + "epoch": 0.0006184783029334458, + "grad_norm": 0.33886751532554626, + "learning_rate": 8.273789649415694e-05, + "loss": 0.4592, + "step": 1762 + }, + { + "epoch": 0.0006188293121859619, + "grad_norm": 0.3379828929901123, + "learning_rate": 8.267111853088482e-05, + "loss": 0.4691, + "step": 1763 + }, + { + "epoch": 0.000619180321438478, + "grad_norm": 0.2838027775287628, + "learning_rate": 8.26043405676127e-05, + "loss": 0.5345, + "step": 1764 + }, + { + "epoch": 0.0006195313306909942, + "grad_norm": 0.3198727071285248, + "learning_rate": 8.253756260434057e-05, + "loss": 0.6029, + "step": 1765 + }, + { + "epoch": 0.0006198823399435103, + "grad_norm": 0.37079837918281555, + "learning_rate": 8.247078464106846e-05, + "loss": 0.6643, + "step": 1766 + }, + { + "epoch": 0.0006202333491960265, + "grad_norm": 0.3130449652671814, + "learning_rate": 8.240400667779633e-05, + "loss": 0.5585, + "step": 1767 + }, + { + "epoch": 0.0006205843584485426, + "grad_norm": 0.29854029417037964, + "learning_rate": 8.23372287145242e-05, + "loss": 0.5202, + "step": 1768 + }, + { + "epoch": 0.0006209353677010587, + "grad_norm": 0.3536113202571869, + "learning_rate": 8.227045075125209e-05, + "loss": 0.5882, + "step": 1769 + }, + { + "epoch": 0.0006212863769535749, + "grad_norm": 0.2841801941394806, + "learning_rate": 8.220367278797996e-05, + "loss": 0.4227, + "step": 1770 + }, + { + "epoch": 0.0006216373862060911, + "grad_norm": 0.32225102186203003, + "learning_rate": 8.213689482470785e-05, + "loss": 0.5545, + "step": 1771 + }, + { + "epoch": 0.0006219883954586072, + "grad_norm": 0.3385821282863617, + "learning_rate": 8.207011686143574e-05, + "loss": 0.5307, + "step": 1772 + }, + { + "epoch": 0.0006223394047111233, + "grad_norm": 0.3400219976902008, + "learning_rate": 8.200333889816361e-05, + "loss": 0.5664, + "step": 1773 + }, + { + "epoch": 0.0006226904139636395, + "grad_norm": 0.4283548593521118, + "learning_rate": 8.19365609348915e-05, + "loss": 0.4957, + "step": 1774 + }, + { + "epoch": 0.0006230414232161557, + "grad_norm": 0.3625548779964447, + "learning_rate": 8.186978297161937e-05, + "loss": 0.4819, + "step": 1775 + }, + { + "epoch": 0.0006233924324686717, + "grad_norm": 0.34131062030792236, + "learning_rate": 8.180300500834724e-05, + "loss": 0.5277, + "step": 1776 + }, + { + "epoch": 0.0006237434417211879, + "grad_norm": 0.3383775055408478, + "learning_rate": 8.173622704507513e-05, + "loss": 0.5539, + "step": 1777 + }, + { + "epoch": 0.0006240944509737041, + "grad_norm": 0.2844056785106659, + "learning_rate": 8.1669449081803e-05, + "loss": 0.4959, + "step": 1778 + }, + { + "epoch": 0.0006244454602262201, + "grad_norm": 0.3345259428024292, + "learning_rate": 8.160267111853089e-05, + "loss": 0.5136, + "step": 1779 + }, + { + "epoch": 0.0006247964694787363, + "grad_norm": 0.32142356038093567, + "learning_rate": 8.153589315525877e-05, + "loss": 0.5348, + "step": 1780 + }, + { + "epoch": 0.0006251474787312525, + "grad_norm": 0.30291274189949036, + "learning_rate": 8.146911519198665e-05, + "loss": 0.5296, + "step": 1781 + }, + { + "epoch": 0.0006254984879837687, + "grad_norm": 0.36180031299591064, + "learning_rate": 8.140233722871453e-05, + "loss": 0.5498, + "step": 1782 + }, + { + "epoch": 0.0006258494972362847, + "grad_norm": 0.2952847182750702, + "learning_rate": 8.133555926544241e-05, + "loss": 0.5233, + "step": 1783 + }, + { + "epoch": 0.0006262005064888009, + "grad_norm": 0.2964370846748352, + "learning_rate": 8.126878130217028e-05, + "loss": 0.5787, + "step": 1784 + }, + { + "epoch": 0.0006265515157413171, + "grad_norm": 0.3017970323562622, + "learning_rate": 8.120200333889817e-05, + "loss": 0.5927, + "step": 1785 + }, + { + "epoch": 0.0006269025249938332, + "grad_norm": 0.32457467913627625, + "learning_rate": 8.113522537562604e-05, + "loss": 0.6207, + "step": 1786 + }, + { + "epoch": 0.0006272535342463493, + "grad_norm": 0.3024297058582306, + "learning_rate": 8.106844741235393e-05, + "loss": 0.5379, + "step": 1787 + }, + { + "epoch": 0.0006276045434988655, + "grad_norm": 0.2766537368297577, + "learning_rate": 8.10016694490818e-05, + "loss": 0.432, + "step": 1788 + }, + { + "epoch": 0.0006279555527513816, + "grad_norm": 0.3326070308685303, + "learning_rate": 8.093489148580969e-05, + "loss": 0.6633, + "step": 1789 + }, + { + "epoch": 0.0006283065620038978, + "grad_norm": 0.2948818802833557, + "learning_rate": 8.086811352253757e-05, + "loss": 0.4987, + "step": 1790 + }, + { + "epoch": 0.0006286575712564139, + "grad_norm": 0.28426218032836914, + "learning_rate": 8.080133555926545e-05, + "loss": 0.442, + "step": 1791 + }, + { + "epoch": 0.0006290085805089301, + "grad_norm": 0.30030035972595215, + "learning_rate": 8.073455759599332e-05, + "loss": 0.6064, + "step": 1792 + }, + { + "epoch": 0.0006293595897614462, + "grad_norm": 0.30664128065109253, + "learning_rate": 8.066777963272121e-05, + "loss": 0.4789, + "step": 1793 + }, + { + "epoch": 0.0006297105990139624, + "grad_norm": 0.30878594517707825, + "learning_rate": 8.060100166944908e-05, + "loss": 0.5365, + "step": 1794 + }, + { + "epoch": 0.0006300616082664785, + "grad_norm": 0.31132617592811584, + "learning_rate": 8.053422370617697e-05, + "loss": 0.5432, + "step": 1795 + }, + { + "epoch": 0.0006304126175189946, + "grad_norm": 0.3347366154193878, + "learning_rate": 8.046744574290484e-05, + "loss": 0.4208, + "step": 1796 + }, + { + "epoch": 0.0006307636267715108, + "grad_norm": 0.3419090509414673, + "learning_rate": 8.040066777963273e-05, + "loss": 0.4985, + "step": 1797 + }, + { + "epoch": 0.000631114636024027, + "grad_norm": 0.3174959719181061, + "learning_rate": 8.033388981636061e-05, + "loss": 0.4255, + "step": 1798 + }, + { + "epoch": 0.000631465645276543, + "grad_norm": 0.32764488458633423, + "learning_rate": 8.026711185308849e-05, + "loss": 0.6213, + "step": 1799 + }, + { + "epoch": 0.0006318166545290592, + "grad_norm": 0.3342370390892029, + "learning_rate": 8.020033388981636e-05, + "loss": 0.4789, + "step": 1800 + }, + { + "epoch": 0.0006321676637815754, + "grad_norm": 0.301438570022583, + "learning_rate": 8.013355592654425e-05, + "loss": 0.5937, + "step": 1801 + }, + { + "epoch": 0.0006325186730340916, + "grad_norm": 0.31911852955818176, + "learning_rate": 8.006677796327212e-05, + "loss": 0.5831, + "step": 1802 + }, + { + "epoch": 0.0006328696822866076, + "grad_norm": 0.2970680296421051, + "learning_rate": 8e-05, + "loss": 0.5223, + "step": 1803 + }, + { + "epoch": 0.0006332206915391238, + "grad_norm": 0.29310017824172974, + "learning_rate": 7.993322203672788e-05, + "loss": 0.5266, + "step": 1804 + }, + { + "epoch": 0.00063357170079164, + "grad_norm": 0.34701675176620483, + "learning_rate": 7.986644407345575e-05, + "loss": 0.4887, + "step": 1805 + }, + { + "epoch": 0.000633922710044156, + "grad_norm": 0.24955204129219055, + "learning_rate": 7.979966611018364e-05, + "loss": 0.437, + "step": 1806 + }, + { + "epoch": 0.0006342737192966722, + "grad_norm": 0.33152899146080017, + "learning_rate": 7.973288814691153e-05, + "loss": 0.5932, + "step": 1807 + }, + { + "epoch": 0.0006346247285491884, + "grad_norm": 0.2790103852748871, + "learning_rate": 7.96661101836394e-05, + "loss": 0.4585, + "step": 1808 + }, + { + "epoch": 0.0006349757378017045, + "grad_norm": 0.30877217650413513, + "learning_rate": 7.959933222036729e-05, + "loss": 0.5174, + "step": 1809 + }, + { + "epoch": 0.0006353267470542206, + "grad_norm": 0.38331231474876404, + "learning_rate": 7.953255425709516e-05, + "loss": 0.5696, + "step": 1810 + }, + { + "epoch": 0.0006356777563067368, + "grad_norm": 0.35821542143821716, + "learning_rate": 7.946577629382305e-05, + "loss": 0.4815, + "step": 1811 + }, + { + "epoch": 0.000636028765559253, + "grad_norm": 0.3109416365623474, + "learning_rate": 7.939899833055092e-05, + "loss": 0.5783, + "step": 1812 + }, + { + "epoch": 0.0006363797748117691, + "grad_norm": 0.3217208683490753, + "learning_rate": 7.933222036727879e-05, + "loss": 0.5606, + "step": 1813 + }, + { + "epoch": 0.0006367307840642852, + "grad_norm": 0.3818305432796478, + "learning_rate": 7.926544240400668e-05, + "loss": 0.5592, + "step": 1814 + }, + { + "epoch": 0.0006370817933168014, + "grad_norm": 0.29824909567832947, + "learning_rate": 7.919866444073457e-05, + "loss": 0.5157, + "step": 1815 + }, + { + "epoch": 0.0006374328025693175, + "grad_norm": 0.31353560090065, + "learning_rate": 7.913188647746244e-05, + "loss": 0.5991, + "step": 1816 + }, + { + "epoch": 0.0006377838118218337, + "grad_norm": 0.33129647374153137, + "learning_rate": 7.906510851419033e-05, + "loss": 0.54, + "step": 1817 + }, + { + "epoch": 0.0006381348210743498, + "grad_norm": 0.3199217915534973, + "learning_rate": 7.89983305509182e-05, + "loss": 0.4823, + "step": 1818 + }, + { + "epoch": 0.0006384858303268659, + "grad_norm": 0.2801882028579712, + "learning_rate": 7.893155258764609e-05, + "loss": 0.5379, + "step": 1819 + }, + { + "epoch": 0.0006388368395793821, + "grad_norm": 0.29676681756973267, + "learning_rate": 7.886477462437396e-05, + "loss": 0.5142, + "step": 1820 + }, + { + "epoch": 0.0006391878488318983, + "grad_norm": 0.3249494433403015, + "learning_rate": 7.879799666110183e-05, + "loss": 0.4743, + "step": 1821 + }, + { + "epoch": 0.0006395388580844144, + "grad_norm": 0.47364258766174316, + "learning_rate": 7.873121869782972e-05, + "loss": 0.5575, + "step": 1822 + }, + { + "epoch": 0.0006398898673369305, + "grad_norm": 0.310779869556427, + "learning_rate": 7.86644407345576e-05, + "loss": 0.5115, + "step": 1823 + }, + { + "epoch": 0.0006402408765894467, + "grad_norm": 0.26023536920547485, + "learning_rate": 7.859766277128548e-05, + "loss": 0.5084, + "step": 1824 + }, + { + "epoch": 0.0006405918858419629, + "grad_norm": 0.31088247895240784, + "learning_rate": 7.853088480801337e-05, + "loss": 0.513, + "step": 1825 + }, + { + "epoch": 0.0006409428950944789, + "grad_norm": 0.2561517357826233, + "learning_rate": 7.846410684474124e-05, + "loss": 0.4056, + "step": 1826 + }, + { + "epoch": 0.0006412939043469951, + "grad_norm": 0.28456807136535645, + "learning_rate": 7.839732888146912e-05, + "loss": 0.4895, + "step": 1827 + }, + { + "epoch": 0.0006416449135995113, + "grad_norm": 0.30845314264297485, + "learning_rate": 7.8330550918197e-05, + "loss": 0.5941, + "step": 1828 + }, + { + "epoch": 0.0006419959228520273, + "grad_norm": 0.30980512499809265, + "learning_rate": 7.826377295492487e-05, + "loss": 0.5307, + "step": 1829 + }, + { + "epoch": 0.0006423469321045435, + "grad_norm": 0.2923174500465393, + "learning_rate": 7.819699499165276e-05, + "loss": 0.4737, + "step": 1830 + }, + { + "epoch": 0.0006426979413570597, + "grad_norm": 0.3474715054035187, + "learning_rate": 7.813021702838063e-05, + "loss": 0.6606, + "step": 1831 + }, + { + "epoch": 0.0006430489506095759, + "grad_norm": 0.29576122760772705, + "learning_rate": 7.806343906510852e-05, + "loss": 0.4151, + "step": 1832 + }, + { + "epoch": 0.000643399959862092, + "grad_norm": 0.3127489686012268, + "learning_rate": 7.79966611018364e-05, + "loss": 0.5683, + "step": 1833 + }, + { + "epoch": 0.0006437509691146081, + "grad_norm": 0.32313060760498047, + "learning_rate": 7.792988313856428e-05, + "loss": 0.3911, + "step": 1834 + }, + { + "epoch": 0.0006441019783671243, + "grad_norm": 0.38172590732574463, + "learning_rate": 7.786310517529216e-05, + "loss": 0.4852, + "step": 1835 + }, + { + "epoch": 0.0006444529876196404, + "grad_norm": 0.38548141717910767, + "learning_rate": 7.779632721202004e-05, + "loss": 0.5238, + "step": 1836 + }, + { + "epoch": 0.0006448039968721565, + "grad_norm": 0.3326992392539978, + "learning_rate": 7.772954924874791e-05, + "loss": 0.5435, + "step": 1837 + }, + { + "epoch": 0.0006451550061246727, + "grad_norm": 0.2704392969608307, + "learning_rate": 7.76627712854758e-05, + "loss": 0.5049, + "step": 1838 + }, + { + "epoch": 0.0006455060153771888, + "grad_norm": 0.3688966929912567, + "learning_rate": 7.759599332220367e-05, + "loss": 0.5507, + "step": 1839 + }, + { + "epoch": 0.000645857024629705, + "grad_norm": 0.33513352274894714, + "learning_rate": 7.752921535893156e-05, + "loss": 0.59, + "step": 1840 + }, + { + "epoch": 0.0006462080338822211, + "grad_norm": 0.26873478293418884, + "learning_rate": 7.746243739565944e-05, + "loss": 0.4088, + "step": 1841 + }, + { + "epoch": 0.0006465590431347373, + "grad_norm": 0.41162189841270447, + "learning_rate": 7.739565943238732e-05, + "loss": 0.4159, + "step": 1842 + }, + { + "epoch": 0.0006469100523872534, + "grad_norm": 0.3542315661907196, + "learning_rate": 7.73288814691152e-05, + "loss": 0.6067, + "step": 1843 + }, + { + "epoch": 0.0006472610616397696, + "grad_norm": 0.39147111773490906, + "learning_rate": 7.726210350584308e-05, + "loss": 0.4139, + "step": 1844 + }, + { + "epoch": 0.0006476120708922857, + "grad_norm": 0.3200126588344574, + "learning_rate": 7.719532554257095e-05, + "loss": 0.4112, + "step": 1845 + }, + { + "epoch": 0.0006479630801448018, + "grad_norm": 0.34853747487068176, + "learning_rate": 7.712854757929884e-05, + "loss": 0.4983, + "step": 1846 + }, + { + "epoch": 0.000648314089397318, + "grad_norm": 0.2987789511680603, + "learning_rate": 7.706176961602671e-05, + "loss": 0.5186, + "step": 1847 + }, + { + "epoch": 0.0006486650986498342, + "grad_norm": 0.3692026436328888, + "learning_rate": 7.69949916527546e-05, + "loss": 0.4028, + "step": 1848 + }, + { + "epoch": 0.0006490161079023502, + "grad_norm": 0.26036712527275085, + "learning_rate": 7.692821368948247e-05, + "loss": 0.4971, + "step": 1849 + }, + { + "epoch": 0.0006493671171548664, + "grad_norm": 0.2928013801574707, + "learning_rate": 7.686143572621036e-05, + "loss": 0.549, + "step": 1850 + }, + { + "epoch": 0.0006497181264073826, + "grad_norm": 0.2794664204120636, + "learning_rate": 7.679465776293824e-05, + "loss": 0.4184, + "step": 1851 + }, + { + "epoch": 0.0006500691356598988, + "grad_norm": 0.282713919878006, + "learning_rate": 7.672787979966612e-05, + "loss": 0.4637, + "step": 1852 + }, + { + "epoch": 0.0006504201449124148, + "grad_norm": 0.3084028959274292, + "learning_rate": 7.666110183639399e-05, + "loss": 0.4423, + "step": 1853 + }, + { + "epoch": 0.000650771154164931, + "grad_norm": 0.35329973697662354, + "learning_rate": 7.659432387312188e-05, + "loss": 0.4868, + "step": 1854 + }, + { + "epoch": 0.0006511221634174472, + "grad_norm": 0.38975444436073303, + "learning_rate": 7.652754590984975e-05, + "loss": 0.3701, + "step": 1855 + }, + { + "epoch": 0.0006514731726699632, + "grad_norm": 0.2983016073703766, + "learning_rate": 7.646076794657764e-05, + "loss": 0.5407, + "step": 1856 + }, + { + "epoch": 0.0006518241819224794, + "grad_norm": 0.32849010825157166, + "learning_rate": 7.639398998330551e-05, + "loss": 0.548, + "step": 1857 + }, + { + "epoch": 0.0006521751911749956, + "grad_norm": 0.32322797179222107, + "learning_rate": 7.63272120200334e-05, + "loss": 0.4231, + "step": 1858 + }, + { + "epoch": 0.0006525262004275117, + "grad_norm": 0.2949173152446747, + "learning_rate": 7.626043405676128e-05, + "loss": 0.5777, + "step": 1859 + }, + { + "epoch": 0.0006528772096800278, + "grad_norm": 0.3120216727256775, + "learning_rate": 7.619365609348916e-05, + "loss": 0.4483, + "step": 1860 + }, + { + "epoch": 0.000653228218932544, + "grad_norm": 0.32363617420196533, + "learning_rate": 7.612687813021703e-05, + "loss": 0.5748, + "step": 1861 + }, + { + "epoch": 0.0006535792281850602, + "grad_norm": 0.3077629506587982, + "learning_rate": 7.606010016694492e-05, + "loss": 0.5135, + "step": 1862 + }, + { + "epoch": 0.0006539302374375763, + "grad_norm": 0.3201192319393158, + "learning_rate": 7.599332220367279e-05, + "loss": 0.6412, + "step": 1863 + }, + { + "epoch": 0.0006542812466900924, + "grad_norm": 0.3008538484573364, + "learning_rate": 7.592654424040068e-05, + "loss": 0.4858, + "step": 1864 + }, + { + "epoch": 0.0006546322559426086, + "grad_norm": 0.35019761323928833, + "learning_rate": 7.585976627712855e-05, + "loss": 0.4819, + "step": 1865 + }, + { + "epoch": 0.0006549832651951247, + "grad_norm": 0.39763036370277405, + "learning_rate": 7.579298831385642e-05, + "loss": 0.5775, + "step": 1866 + }, + { + "epoch": 0.0006553342744476409, + "grad_norm": 0.29005396366119385, + "learning_rate": 7.572621035058431e-05, + "loss": 0.4828, + "step": 1867 + }, + { + "epoch": 0.000655685283700157, + "grad_norm": 0.30613401532173157, + "learning_rate": 7.56594323873122e-05, + "loss": 0.4375, + "step": 1868 + }, + { + "epoch": 0.0006560362929526731, + "grad_norm": 0.3596465289592743, + "learning_rate": 7.559265442404007e-05, + "loss": 0.4468, + "step": 1869 + }, + { + "epoch": 0.0006563873022051893, + "grad_norm": 0.28737086057662964, + "learning_rate": 7.552587646076796e-05, + "loss": 0.5726, + "step": 1870 + }, + { + "epoch": 0.0006567383114577055, + "grad_norm": 0.38036370277404785, + "learning_rate": 7.545909849749583e-05, + "loss": 0.5747, + "step": 1871 + }, + { + "epoch": 0.0006570893207102216, + "grad_norm": 0.3192722499370575, + "learning_rate": 7.539232053422371e-05, + "loss": 0.5859, + "step": 1872 + }, + { + "epoch": 0.0006574403299627377, + "grad_norm": 0.2886595129966736, + "learning_rate": 7.532554257095159e-05, + "loss": 0.5099, + "step": 1873 + }, + { + "epoch": 0.0006577913392152539, + "grad_norm": 0.3017093241214752, + "learning_rate": 7.525876460767946e-05, + "loss": 0.4442, + "step": 1874 + }, + { + "epoch": 0.0006581423484677701, + "grad_norm": 0.3073802590370178, + "learning_rate": 7.519198664440735e-05, + "loss": 0.5022, + "step": 1875 + }, + { + "epoch": 0.0006584933577202861, + "grad_norm": 0.34113094210624695, + "learning_rate": 7.512520868113523e-05, + "loss": 0.5146, + "step": 1876 + }, + { + "epoch": 0.0006588443669728023, + "grad_norm": 0.32277509570121765, + "learning_rate": 7.505843071786311e-05, + "loss": 0.5743, + "step": 1877 + }, + { + "epoch": 0.0006591953762253185, + "grad_norm": 0.3168696463108063, + "learning_rate": 7.4991652754591e-05, + "loss": 0.417, + "step": 1878 + }, + { + "epoch": 0.0006595463854778346, + "grad_norm": 0.35164040327072144, + "learning_rate": 7.492487479131887e-05, + "loss": 0.5078, + "step": 1879 + }, + { + "epoch": 0.0006598973947303507, + "grad_norm": 0.3132971227169037, + "learning_rate": 7.485809682804675e-05, + "loss": 0.4293, + "step": 1880 + }, + { + "epoch": 0.0006602484039828669, + "grad_norm": 0.3158970773220062, + "learning_rate": 7.479131886477463e-05, + "loss": 0.5559, + "step": 1881 + }, + { + "epoch": 0.0006605994132353831, + "grad_norm": 0.3228873610496521, + "learning_rate": 7.47245409015025e-05, + "loss": 0.4935, + "step": 1882 + }, + { + "epoch": 0.0006609504224878992, + "grad_norm": 0.4734925925731659, + "learning_rate": 7.465776293823039e-05, + "loss": 0.3587, + "step": 1883 + }, + { + "epoch": 0.0006613014317404153, + "grad_norm": 0.33582058548927307, + "learning_rate": 7.459098497495826e-05, + "loss": 0.4987, + "step": 1884 + }, + { + "epoch": 0.0006616524409929315, + "grad_norm": 0.38209983706474304, + "learning_rate": 7.452420701168615e-05, + "loss": 0.4443, + "step": 1885 + }, + { + "epoch": 0.0006620034502454476, + "grad_norm": 0.3218359649181366, + "learning_rate": 7.445742904841403e-05, + "loss": 0.5087, + "step": 1886 + }, + { + "epoch": 0.0006623544594979637, + "grad_norm": 0.33005908131599426, + "learning_rate": 7.439065108514191e-05, + "loss": 0.5362, + "step": 1887 + }, + { + "epoch": 0.0006627054687504799, + "grad_norm": 0.4753172993659973, + "learning_rate": 7.43238731218698e-05, + "loss": 0.4474, + "step": 1888 + }, + { + "epoch": 0.000663056478002996, + "grad_norm": 0.3765251636505127, + "learning_rate": 7.425709515859767e-05, + "loss": 0.5993, + "step": 1889 + }, + { + "epoch": 0.0006634074872555122, + "grad_norm": 0.3113894462585449, + "learning_rate": 7.419031719532554e-05, + "loss": 0.4636, + "step": 1890 + }, + { + "epoch": 0.0006637584965080283, + "grad_norm": 0.30841702222824097, + "learning_rate": 7.412353923205343e-05, + "loss": 0.5326, + "step": 1891 + }, + { + "epoch": 0.0006641095057605445, + "grad_norm": 0.29381653666496277, + "learning_rate": 7.40567612687813e-05, + "loss": 0.325, + "step": 1892 + }, + { + "epoch": 0.0006644605150130606, + "grad_norm": 0.3482291102409363, + "learning_rate": 7.398998330550919e-05, + "loss": 0.4646, + "step": 1893 + }, + { + "epoch": 0.0006648115242655768, + "grad_norm": 0.2865064740180969, + "learning_rate": 7.392320534223707e-05, + "loss": 0.4789, + "step": 1894 + }, + { + "epoch": 0.0006651625335180929, + "grad_norm": 0.29580044746398926, + "learning_rate": 7.385642737896495e-05, + "loss": 0.5047, + "step": 1895 + }, + { + "epoch": 0.000665513542770609, + "grad_norm": 0.3370521068572998, + "learning_rate": 7.378964941569283e-05, + "loss": 0.5915, + "step": 1896 + }, + { + "epoch": 0.0006658645520231252, + "grad_norm": 0.2680570185184479, + "learning_rate": 7.37228714524207e-05, + "loss": 0.4602, + "step": 1897 + }, + { + "epoch": 0.0006662155612756414, + "grad_norm": 0.2855984568595886, + "learning_rate": 7.365609348914858e-05, + "loss": 0.5439, + "step": 1898 + }, + { + "epoch": 0.0006665665705281574, + "grad_norm": 0.28999075293540955, + "learning_rate": 7.358931552587647e-05, + "loss": 0.4828, + "step": 1899 + }, + { + "epoch": 0.0006669175797806736, + "grad_norm": 0.3230993151664734, + "learning_rate": 7.352253756260434e-05, + "loss": 0.5974, + "step": 1900 + }, + { + "epoch": 0.0006672685890331898, + "grad_norm": 0.28700417280197144, + "learning_rate": 7.345575959933221e-05, + "loss": 0.5179, + "step": 1901 + }, + { + "epoch": 0.000667619598285706, + "grad_norm": 0.2921486794948578, + "learning_rate": 7.33889816360601e-05, + "loss": 0.4727, + "step": 1902 + }, + { + "epoch": 0.000667970607538222, + "grad_norm": 0.3887636959552765, + "learning_rate": 7.332220367278799e-05, + "loss": 0.5334, + "step": 1903 + }, + { + "epoch": 0.0006683216167907382, + "grad_norm": 0.3640362322330475, + "learning_rate": 7.325542570951587e-05, + "loss": 0.5576, + "step": 1904 + }, + { + "epoch": 0.0006686726260432544, + "grad_norm": 0.2985169589519501, + "learning_rate": 7.318864774624375e-05, + "loss": 0.5544, + "step": 1905 + }, + { + "epoch": 0.0006690236352957705, + "grad_norm": 0.30294784903526306, + "learning_rate": 7.312186978297162e-05, + "loss": 0.5005, + "step": 1906 + }, + { + "epoch": 0.0006693746445482866, + "grad_norm": 0.2947355806827545, + "learning_rate": 7.30550918196995e-05, + "loss": 0.4879, + "step": 1907 + }, + { + "epoch": 0.0006697256538008028, + "grad_norm": 0.2764705419540405, + "learning_rate": 7.298831385642738e-05, + "loss": 0.4531, + "step": 1908 + }, + { + "epoch": 0.0006700766630533189, + "grad_norm": 0.4107155501842499, + "learning_rate": 7.292153589315525e-05, + "loss": 0.4532, + "step": 1909 + }, + { + "epoch": 0.000670427672305835, + "grad_norm": 0.28341203927993774, + "learning_rate": 7.285475792988314e-05, + "loss": 0.5424, + "step": 1910 + }, + { + "epoch": 0.0006707786815583512, + "grad_norm": 0.36663204431533813, + "learning_rate": 7.278797996661103e-05, + "loss": 0.599, + "step": 1911 + }, + { + "epoch": 0.0006711296908108674, + "grad_norm": 0.30708596110343933, + "learning_rate": 7.272120200333891e-05, + "loss": 0.5971, + "step": 1912 + }, + { + "epoch": 0.0006714807000633835, + "grad_norm": 0.3823882043361664, + "learning_rate": 7.265442404006679e-05, + "loss": 0.5367, + "step": 1913 + }, + { + "epoch": 0.0006718317093158996, + "grad_norm": 0.3780754804611206, + "learning_rate": 7.258764607679466e-05, + "loss": 0.5756, + "step": 1914 + }, + { + "epoch": 0.0006721827185684158, + "grad_norm": 0.31058263778686523, + "learning_rate": 7.252086811352255e-05, + "loss": 0.5966, + "step": 1915 + }, + { + "epoch": 0.0006725337278209319, + "grad_norm": 0.29191386699676514, + "learning_rate": 7.245409015025042e-05, + "loss": 0.6099, + "step": 1916 + }, + { + "epoch": 0.0006728847370734481, + "grad_norm": 0.3607024550437927, + "learning_rate": 7.238731218697829e-05, + "loss": 0.5779, + "step": 1917 + }, + { + "epoch": 0.0006732357463259642, + "grad_norm": 0.2735411524772644, + "learning_rate": 7.232053422370618e-05, + "loss": 0.5511, + "step": 1918 + }, + { + "epoch": 0.0006735867555784803, + "grad_norm": 0.37066903710365295, + "learning_rate": 7.225375626043405e-05, + "loss": 0.5984, + "step": 1919 + }, + { + "epoch": 0.0006739377648309965, + "grad_norm": 0.3535907566547394, + "learning_rate": 7.218697829716194e-05, + "loss": 0.5074, + "step": 1920 + }, + { + "epoch": 0.0006742887740835127, + "grad_norm": 0.2900503873825073, + "learning_rate": 7.212020033388982e-05, + "loss": 0.3989, + "step": 1921 + }, + { + "epoch": 0.0006746397833360288, + "grad_norm": 0.2970031201839447, + "learning_rate": 7.20534223706177e-05, + "loss": 0.5514, + "step": 1922 + }, + { + "epoch": 0.0006749907925885449, + "grad_norm": 0.30902254581451416, + "learning_rate": 7.198664440734558e-05, + "loss": 0.3982, + "step": 1923 + }, + { + "epoch": 0.0006753418018410611, + "grad_norm": 0.2622113823890686, + "learning_rate": 7.191986644407346e-05, + "loss": 0.4587, + "step": 1924 + }, + { + "epoch": 0.0006756928110935773, + "grad_norm": 0.30972495675086975, + "learning_rate": 7.185308848080133e-05, + "loss": 0.5435, + "step": 1925 + }, + { + "epoch": 0.0006760438203460933, + "grad_norm": 0.3070833384990692, + "learning_rate": 7.178631051752922e-05, + "loss": 0.5074, + "step": 1926 + }, + { + "epoch": 0.0006763948295986095, + "grad_norm": 0.3055395781993866, + "learning_rate": 7.171953255425709e-05, + "loss": 0.5999, + "step": 1927 + }, + { + "epoch": 0.0006767458388511257, + "grad_norm": 0.3127722144126892, + "learning_rate": 7.165275459098498e-05, + "loss": 0.5511, + "step": 1928 + }, + { + "epoch": 0.0006770968481036418, + "grad_norm": 0.3363809585571289, + "learning_rate": 7.158597662771286e-05, + "loss": 0.5415, + "step": 1929 + }, + { + "epoch": 0.0006774478573561579, + "grad_norm": 0.3258194625377655, + "learning_rate": 7.151919866444074e-05, + "loss": 0.5976, + "step": 1930 + }, + { + "epoch": 0.0006777988666086741, + "grad_norm": 0.3083065152168274, + "learning_rate": 7.145242070116862e-05, + "loss": 0.6067, + "step": 1931 + }, + { + "epoch": 0.0006781498758611903, + "grad_norm": 0.3474681079387665, + "learning_rate": 7.13856427378965e-05, + "loss": 0.5749, + "step": 1932 + }, + { + "epoch": 0.0006785008851137064, + "grad_norm": 0.3168641924858093, + "learning_rate": 7.131886477462437e-05, + "loss": 0.4242, + "step": 1933 + }, + { + "epoch": 0.0006788518943662225, + "grad_norm": 0.30177485942840576, + "learning_rate": 7.125208681135226e-05, + "loss": 0.4978, + "step": 1934 + }, + { + "epoch": 0.0006792029036187387, + "grad_norm": 0.3365834653377533, + "learning_rate": 7.118530884808013e-05, + "loss": 0.5994, + "step": 1935 + }, + { + "epoch": 0.0006795539128712548, + "grad_norm": 0.3282754123210907, + "learning_rate": 7.111853088480802e-05, + "loss": 0.615, + "step": 1936 + }, + { + "epoch": 0.000679904922123771, + "grad_norm": 0.24498236179351807, + "learning_rate": 7.105175292153589e-05, + "loss": 0.4254, + "step": 1937 + }, + { + "epoch": 0.0006802559313762871, + "grad_norm": 0.3450114130973816, + "learning_rate": 7.098497495826378e-05, + "loss": 0.5362, + "step": 1938 + }, + { + "epoch": 0.0006806069406288032, + "grad_norm": 0.28795021772384644, + "learning_rate": 7.091819699499166e-05, + "loss": 0.4984, + "step": 1939 + }, + { + "epoch": 0.0006809579498813194, + "grad_norm": 0.32352307438850403, + "learning_rate": 7.085141903171954e-05, + "loss": 0.4549, + "step": 1940 + }, + { + "epoch": 0.0006813089591338355, + "grad_norm": 0.34447386860847473, + "learning_rate": 7.078464106844741e-05, + "loss": 0.5349, + "step": 1941 + }, + { + "epoch": 0.0006816599683863517, + "grad_norm": 0.31918805837631226, + "learning_rate": 7.07178631051753e-05, + "loss": 0.5468, + "step": 1942 + }, + { + "epoch": 0.0006820109776388678, + "grad_norm": 0.3190132975578308, + "learning_rate": 7.065108514190317e-05, + "loss": 0.5348, + "step": 1943 + }, + { + "epoch": 0.000682361986891384, + "grad_norm": 0.32868409156799316, + "learning_rate": 7.058430717863106e-05, + "loss": 0.6209, + "step": 1944 + }, + { + "epoch": 0.0006827129961439001, + "grad_norm": 0.2713989317417145, + "learning_rate": 7.051752921535893e-05, + "loss": 0.4681, + "step": 1945 + }, + { + "epoch": 0.0006830640053964162, + "grad_norm": 0.35190147161483765, + "learning_rate": 7.045075125208682e-05, + "loss": 0.5415, + "step": 1946 + }, + { + "epoch": 0.0006834150146489324, + "grad_norm": 0.322889119386673, + "learning_rate": 7.03839732888147e-05, + "loss": 0.5586, + "step": 1947 + }, + { + "epoch": 0.0006837660239014486, + "grad_norm": 0.33939826488494873, + "learning_rate": 7.031719532554258e-05, + "loss": 0.5586, + "step": 1948 + }, + { + "epoch": 0.0006841170331539646, + "grad_norm": 0.3554326891899109, + "learning_rate": 7.025041736227045e-05, + "loss": 0.5386, + "step": 1949 + }, + { + "epoch": 0.0006844680424064808, + "grad_norm": 0.3021222949028015, + "learning_rate": 7.018363939899834e-05, + "loss": 0.5569, + "step": 1950 + }, + { + "epoch": 0.000684819051658997, + "grad_norm": 0.3286188244819641, + "learning_rate": 7.011686143572621e-05, + "loss": 0.5466, + "step": 1951 + }, + { + "epoch": 0.0006851700609115132, + "grad_norm": 0.302117258310318, + "learning_rate": 7.00500834724541e-05, + "loss": 0.4038, + "step": 1952 + }, + { + "epoch": 0.0006855210701640292, + "grad_norm": 0.3204907178878784, + "learning_rate": 6.998330550918197e-05, + "loss": 0.4429, + "step": 1953 + }, + { + "epoch": 0.0006858720794165454, + "grad_norm": 0.2782181203365326, + "learning_rate": 6.991652754590986e-05, + "loss": 0.4102, + "step": 1954 + }, + { + "epoch": 0.0006862230886690616, + "grad_norm": 0.31240731477737427, + "learning_rate": 6.984974958263774e-05, + "loss": 0.5353, + "step": 1955 + }, + { + "epoch": 0.0006865740979215777, + "grad_norm": 0.32677972316741943, + "learning_rate": 6.978297161936562e-05, + "loss": 0.4403, + "step": 1956 + }, + { + "epoch": 0.0006869251071740938, + "grad_norm": 0.33199426531791687, + "learning_rate": 6.971619365609349e-05, + "loss": 0.4433, + "step": 1957 + }, + { + "epoch": 0.00068727611642661, + "grad_norm": 0.2825728952884674, + "learning_rate": 6.964941569282138e-05, + "loss": 0.5624, + "step": 1958 + }, + { + "epoch": 0.0006876271256791261, + "grad_norm": 0.30743977427482605, + "learning_rate": 6.958263772954925e-05, + "loss": 0.565, + "step": 1959 + }, + { + "epoch": 0.0006879781349316423, + "grad_norm": 0.32357290387153625, + "learning_rate": 6.951585976627714e-05, + "loss": 0.596, + "step": 1960 + }, + { + "epoch": 0.0006883291441841584, + "grad_norm": 0.31747472286224365, + "learning_rate": 6.944908180300501e-05, + "loss": 0.5811, + "step": 1961 + }, + { + "epoch": 0.0006886801534366745, + "grad_norm": 0.3278048038482666, + "learning_rate": 6.938230383973288e-05, + "loss": 0.4468, + "step": 1962 + }, + { + "epoch": 0.0006890311626891907, + "grad_norm": 0.3308374285697937, + "learning_rate": 6.931552587646077e-05, + "loss": 0.6508, + "step": 1963 + }, + { + "epoch": 0.0006893821719417069, + "grad_norm": 0.3360099792480469, + "learning_rate": 6.924874791318865e-05, + "loss": 0.534, + "step": 1964 + }, + { + "epoch": 0.000689733181194223, + "grad_norm": 0.3039510250091553, + "learning_rate": 6.918196994991654e-05, + "loss": 0.5789, + "step": 1965 + }, + { + "epoch": 0.0006900841904467391, + "grad_norm": 0.3015453517436981, + "learning_rate": 6.911519198664441e-05, + "loss": 0.3639, + "step": 1966 + }, + { + "epoch": 0.0006904351996992553, + "grad_norm": 0.3157881498336792, + "learning_rate": 6.904841402337229e-05, + "loss": 0.5002, + "step": 1967 + }, + { + "epoch": 0.0006907862089517714, + "grad_norm": 0.28026652336120605, + "learning_rate": 6.898163606010017e-05, + "loss": 0.5289, + "step": 1968 + }, + { + "epoch": 0.0006911372182042875, + "grad_norm": 0.3170677125453949, + "learning_rate": 6.891485809682805e-05, + "loss": 0.6144, + "step": 1969 + }, + { + "epoch": 0.0006914882274568037, + "grad_norm": 0.3244359791278839, + "learning_rate": 6.884808013355592e-05, + "loss": 0.6176, + "step": 1970 + }, + { + "epoch": 0.0006918392367093199, + "grad_norm": 0.3142417371273041, + "learning_rate": 6.878130217028381e-05, + "loss": 0.6137, + "step": 1971 + }, + { + "epoch": 0.0006921902459618359, + "grad_norm": 0.3678075969219208, + "learning_rate": 6.87145242070117e-05, + "loss": 0.5228, + "step": 1972 + }, + { + "epoch": 0.0006925412552143521, + "grad_norm": 0.35631263256073, + "learning_rate": 6.864774624373958e-05, + "loss": 0.4831, + "step": 1973 + }, + { + "epoch": 0.0006928922644668683, + "grad_norm": 0.30589306354522705, + "learning_rate": 6.858096828046745e-05, + "loss": 0.47, + "step": 1974 + }, + { + "epoch": 0.0006932432737193845, + "grad_norm": 0.3037767708301544, + "learning_rate": 6.851419031719533e-05, + "loss": 0.5334, + "step": 1975 + }, + { + "epoch": 0.0006935942829719005, + "grad_norm": 0.3331162631511688, + "learning_rate": 6.844741235392321e-05, + "loss": 0.6051, + "step": 1976 + }, + { + "epoch": 0.0006939452922244167, + "grad_norm": 0.3342154622077942, + "learning_rate": 6.838063439065109e-05, + "loss": 0.5466, + "step": 1977 + }, + { + "epoch": 0.0006942963014769329, + "grad_norm": 0.3748263418674469, + "learning_rate": 6.831385642737896e-05, + "loss": 0.5265, + "step": 1978 + }, + { + "epoch": 0.000694647310729449, + "grad_norm": 0.33476313948631287, + "learning_rate": 6.824707846410685e-05, + "loss": 0.5298, + "step": 1979 + }, + { + "epoch": 0.0006949983199819651, + "grad_norm": 0.37101680040359497, + "learning_rate": 6.818030050083472e-05, + "loss": 0.5745, + "step": 1980 + }, + { + "epoch": 0.0006953493292344813, + "grad_norm": 0.3126341998577118, + "learning_rate": 6.811352253756261e-05, + "loss": 0.4874, + "step": 1981 + }, + { + "epoch": 0.0006957003384869974, + "grad_norm": 0.305896133184433, + "learning_rate": 6.80467445742905e-05, + "loss": 0.5187, + "step": 1982 + }, + { + "epoch": 0.0006960513477395136, + "grad_norm": 0.3486585319042206, + "learning_rate": 6.797996661101837e-05, + "loss": 0.5567, + "step": 1983 + }, + { + "epoch": 0.0006964023569920297, + "grad_norm": 0.33587202429771423, + "learning_rate": 6.791318864774625e-05, + "loss": 0.505, + "step": 1984 + }, + { + "epoch": 0.0006967533662445459, + "grad_norm": 0.32981690764427185, + "learning_rate": 6.784641068447413e-05, + "loss": 0.4372, + "step": 1985 + }, + { + "epoch": 0.000697104375497062, + "grad_norm": 0.30636945366859436, + "learning_rate": 6.7779632721202e-05, + "loss": 0.4731, + "step": 1986 + }, + { + "epoch": 0.0006974553847495782, + "grad_norm": 0.3573989272117615, + "learning_rate": 6.771285475792989e-05, + "loss": 0.6193, + "step": 1987 + }, + { + "epoch": 0.0006978063940020943, + "grad_norm": 0.3697716295719147, + "learning_rate": 6.764607679465776e-05, + "loss": 0.4243, + "step": 1988 + }, + { + "epoch": 0.0006981574032546104, + "grad_norm": 0.3072642385959625, + "learning_rate": 6.757929883138565e-05, + "loss": 0.5506, + "step": 1989 + }, + { + "epoch": 0.0006985084125071266, + "grad_norm": 0.3706247806549072, + "learning_rate": 6.751252086811353e-05, + "loss": 0.4897, + "step": 1990 + }, + { + "epoch": 0.0006988594217596428, + "grad_norm": 0.3179176449775696, + "learning_rate": 6.74457429048414e-05, + "loss": 0.582, + "step": 1991 + }, + { + "epoch": 0.0006992104310121588, + "grad_norm": 0.3597802519798279, + "learning_rate": 6.737896494156929e-05, + "loss": 0.5297, + "step": 1992 + }, + { + "epoch": 0.000699561440264675, + "grad_norm": 0.3542323410511017, + "learning_rate": 6.731218697829717e-05, + "loss": 0.5995, + "step": 1993 + }, + { + "epoch": 0.0006999124495171912, + "grad_norm": 0.3902435302734375, + "learning_rate": 6.724540901502504e-05, + "loss": 0.55, + "step": 1994 + }, + { + "epoch": 0.0007002634587697074, + "grad_norm": 0.433971107006073, + "learning_rate": 6.717863105175293e-05, + "loss": 0.5859, + "step": 1995 + }, + { + "epoch": 0.0007006144680222234, + "grad_norm": 0.30398884415626526, + "learning_rate": 6.71118530884808e-05, + "loss": 0.5749, + "step": 1996 + }, + { + "epoch": 0.0007009654772747396, + "grad_norm": 0.2854095995426178, + "learning_rate": 6.704507512520869e-05, + "loss": 0.5932, + "step": 1997 + }, + { + "epoch": 0.0007013164865272558, + "grad_norm": 0.3235953450202942, + "learning_rate": 6.697829716193656e-05, + "loss": 0.5921, + "step": 1998 + }, + { + "epoch": 0.0007016674957797718, + "grad_norm": 0.364388108253479, + "learning_rate": 6.691151919866445e-05, + "loss": 0.5932, + "step": 1999 + }, + { + "epoch": 0.000702018505032288, + "grad_norm": 0.2984377145767212, + "learning_rate": 6.684474123539233e-05, + "loss": 0.5099, + "step": 2000 + }, + { + "epoch": 0.0007023695142848042, + "grad_norm": 0.6035982370376587, + "learning_rate": 6.67779632721202e-05, + "loss": 0.5385, + "step": 2001 + }, + { + "epoch": 0.0007027205235373203, + "grad_norm": 0.3442158102989197, + "learning_rate": 6.671118530884808e-05, + "loss": 0.6262, + "step": 2002 + }, + { + "epoch": 0.0007030715327898364, + "grad_norm": 0.32627010345458984, + "learning_rate": 6.664440734557597e-05, + "loss": 0.5052, + "step": 2003 + }, + { + "epoch": 0.0007034225420423526, + "grad_norm": 0.2829074561595917, + "learning_rate": 6.657762938230384e-05, + "loss": 0.5151, + "step": 2004 + }, + { + "epoch": 0.0007037735512948688, + "grad_norm": 0.29303300380706787, + "learning_rate": 6.651085141903173e-05, + "loss": 0.4857, + "step": 2005 + }, + { + "epoch": 0.0007041245605473849, + "grad_norm": 0.2904541492462158, + "learning_rate": 6.64440734557596e-05, + "loss": 0.5264, + "step": 2006 + }, + { + "epoch": 0.000704475569799901, + "grad_norm": 0.36056920886039734, + "learning_rate": 6.637729549248749e-05, + "loss": 0.5662, + "step": 2007 + }, + { + "epoch": 0.0007048265790524172, + "grad_norm": 0.4564721882343292, + "learning_rate": 6.631051752921537e-05, + "loss": 0.523, + "step": 2008 + }, + { + "epoch": 0.0007051775883049333, + "grad_norm": 0.31811806559562683, + "learning_rate": 6.624373956594325e-05, + "loss": 0.5568, + "step": 2009 + }, + { + "epoch": 0.0007055285975574495, + "grad_norm": 0.30878302454948425, + "learning_rate": 6.617696160267112e-05, + "loss": 0.5579, + "step": 2010 + }, + { + "epoch": 0.0007058796068099656, + "grad_norm": 0.28648054599761963, + "learning_rate": 6.6110183639399e-05, + "loss": 0.4876, + "step": 2011 + }, + { + "epoch": 0.0007062306160624817, + "grad_norm": 0.30796393752098083, + "learning_rate": 6.604340567612688e-05, + "loss": 0.5471, + "step": 2012 + }, + { + "epoch": 0.0007065816253149979, + "grad_norm": 0.31019923090934753, + "learning_rate": 6.597662771285476e-05, + "loss": 0.5254, + "step": 2013 + }, + { + "epoch": 0.0007069326345675141, + "grad_norm": 0.2949763834476471, + "learning_rate": 6.590984974958264e-05, + "loss": 0.517, + "step": 2014 + }, + { + "epoch": 0.0007072836438200302, + "grad_norm": 0.28508061170578003, + "learning_rate": 6.584307178631051e-05, + "loss": 0.4345, + "step": 2015 + }, + { + "epoch": 0.0007076346530725463, + "grad_norm": 0.2776021957397461, + "learning_rate": 6.57762938230384e-05, + "loss": 0.4588, + "step": 2016 + }, + { + "epoch": 0.0007079856623250625, + "grad_norm": 0.31046923995018005, + "learning_rate": 6.570951585976628e-05, + "loss": 0.544, + "step": 2017 + }, + { + "epoch": 0.0007083366715775787, + "grad_norm": 0.25672435760498047, + "learning_rate": 6.564273789649416e-05, + "loss": 0.4267, + "step": 2018 + }, + { + "epoch": 0.0007086876808300947, + "grad_norm": 0.3223148286342621, + "learning_rate": 6.557595993322204e-05, + "loss": 0.4973, + "step": 2019 + }, + { + "epoch": 0.0007090386900826109, + "grad_norm": 0.3279174864292145, + "learning_rate": 6.550918196994992e-05, + "loss": 0.581, + "step": 2020 + }, + { + "epoch": 0.0007093896993351271, + "grad_norm": 0.29168081283569336, + "learning_rate": 6.54424040066778e-05, + "loss": 0.4743, + "step": 2021 + }, + { + "epoch": 0.0007097407085876431, + "grad_norm": 0.3340432345867157, + "learning_rate": 6.537562604340568e-05, + "loss": 0.5041, + "step": 2022 + }, + { + "epoch": 0.0007100917178401593, + "grad_norm": 0.32505378127098083, + "learning_rate": 6.530884808013355e-05, + "loss": 0.4696, + "step": 2023 + }, + { + "epoch": 0.0007104427270926755, + "grad_norm": 0.3208444118499756, + "learning_rate": 6.524207011686144e-05, + "loss": 0.4521, + "step": 2024 + }, + { + "epoch": 0.0007107937363451917, + "grad_norm": 0.3270561397075653, + "learning_rate": 6.517529215358932e-05, + "loss": 0.5442, + "step": 2025 + }, + { + "epoch": 0.0007111447455977077, + "grad_norm": 0.29020506143569946, + "learning_rate": 6.51085141903172e-05, + "loss": 0.5404, + "step": 2026 + }, + { + "epoch": 0.0007114957548502239, + "grad_norm": 0.364835262298584, + "learning_rate": 6.504173622704508e-05, + "loss": 0.499, + "step": 2027 + }, + { + "epoch": 0.0007118467641027401, + "grad_norm": 0.3265811502933502, + "learning_rate": 6.497495826377296e-05, + "loss": 0.4242, + "step": 2028 + }, + { + "epoch": 0.0007121977733552562, + "grad_norm": 0.2921433448791504, + "learning_rate": 6.490818030050084e-05, + "loss": 0.3823, + "step": 2029 + }, + { + "epoch": 0.0007125487826077723, + "grad_norm": 0.35920029878616333, + "learning_rate": 6.484140233722872e-05, + "loss": 0.4331, + "step": 2030 + }, + { + "epoch": 0.0007128997918602885, + "grad_norm": 0.3468065559864044, + "learning_rate": 6.477462437395659e-05, + "loss": 0.6026, + "step": 2031 + }, + { + "epoch": 0.0007132508011128046, + "grad_norm": 0.3000637888908386, + "learning_rate": 6.470784641068448e-05, + "loss": 0.3499, + "step": 2032 + }, + { + "epoch": 0.0007136018103653208, + "grad_norm": 0.34014737606048584, + "learning_rate": 6.464106844741235e-05, + "loss": 0.513, + "step": 2033 + }, + { + "epoch": 0.0007139528196178369, + "grad_norm": 0.32227322459220886, + "learning_rate": 6.457429048414024e-05, + "loss": 0.6163, + "step": 2034 + }, + { + "epoch": 0.0007143038288703531, + "grad_norm": 0.2845328450202942, + "learning_rate": 6.450751252086812e-05, + "loss": 0.4621, + "step": 2035 + }, + { + "epoch": 0.0007146548381228692, + "grad_norm": 0.3255940079689026, + "learning_rate": 6.4440734557596e-05, + "loss": 0.5099, + "step": 2036 + }, + { + "epoch": 0.0007150058473753854, + "grad_norm": 0.34483012557029724, + "learning_rate": 6.437395659432388e-05, + "loss": 0.6406, + "step": 2037 + }, + { + "epoch": 0.0007153568566279015, + "grad_norm": 0.32495322823524475, + "learning_rate": 6.430717863105176e-05, + "loss": 0.5417, + "step": 2038 + }, + { + "epoch": 0.0007157078658804176, + "grad_norm": 0.35258588194847107, + "learning_rate": 6.424040066777963e-05, + "loss": 0.5135, + "step": 2039 + }, + { + "epoch": 0.0007160588751329338, + "grad_norm": 0.40582478046417236, + "learning_rate": 6.417362270450752e-05, + "loss": 0.3909, + "step": 2040 + }, + { + "epoch": 0.00071640988438545, + "grad_norm": 0.2993859052658081, + "learning_rate": 6.410684474123539e-05, + "loss": 0.5538, + "step": 2041 + }, + { + "epoch": 0.000716760893637966, + "grad_norm": 0.2939004898071289, + "learning_rate": 6.404006677796328e-05, + "loss": 0.5033, + "step": 2042 + }, + { + "epoch": 0.0007171119028904822, + "grad_norm": 0.3596908152103424, + "learning_rate": 6.397328881469116e-05, + "loss": 0.6057, + "step": 2043 + }, + { + "epoch": 0.0007174629121429984, + "grad_norm": 0.3577982187271118, + "learning_rate": 6.390651085141904e-05, + "loss": 0.4969, + "step": 2044 + }, + { + "epoch": 0.0007178139213955146, + "grad_norm": 0.3823784589767456, + "learning_rate": 6.383973288814692e-05, + "loss": 0.4979, + "step": 2045 + }, + { + "epoch": 0.0007181649306480306, + "grad_norm": 0.31179672479629517, + "learning_rate": 6.37729549248748e-05, + "loss": 0.6643, + "step": 2046 + }, + { + "epoch": 0.0007185159399005468, + "grad_norm": 0.3055694103240967, + "learning_rate": 6.370617696160267e-05, + "loss": 0.5222, + "step": 2047 + }, + { + "epoch": 0.000718866949153063, + "grad_norm": 0.3177119493484497, + "learning_rate": 6.363939899833056e-05, + "loss": 0.4974, + "step": 2048 + }, + { + "epoch": 0.000719217958405579, + "grad_norm": 0.30025020241737366, + "learning_rate": 6.357262103505843e-05, + "loss": 0.4653, + "step": 2049 + }, + { + "epoch": 0.0007195689676580952, + "grad_norm": 0.3193509876728058, + "learning_rate": 6.35058430717863e-05, + "loss": 0.4433, + "step": 2050 + }, + { + "epoch": 0.0007199199769106114, + "grad_norm": 0.3532063961029053, + "learning_rate": 6.343906510851419e-05, + "loss": 0.5048, + "step": 2051 + }, + { + "epoch": 0.0007202709861631275, + "grad_norm": 0.3306260406970978, + "learning_rate": 6.337228714524208e-05, + "loss": 0.4217, + "step": 2052 + }, + { + "epoch": 0.0007206219954156436, + "grad_norm": 0.35060104727745056, + "learning_rate": 6.330550918196996e-05, + "loss": 0.6056, + "step": 2053 + }, + { + "epoch": 0.0007209730046681598, + "grad_norm": 0.3271511495113373, + "learning_rate": 6.323873121869784e-05, + "loss": 0.6413, + "step": 2054 + }, + { + "epoch": 0.000721324013920676, + "grad_norm": 0.3258192539215088, + "learning_rate": 6.317195325542571e-05, + "loss": 0.5181, + "step": 2055 + }, + { + "epoch": 0.0007216750231731921, + "grad_norm": 0.29275408387184143, + "learning_rate": 6.31051752921536e-05, + "loss": 0.427, + "step": 2056 + }, + { + "epoch": 0.0007220260324257082, + "grad_norm": 0.3610098659992218, + "learning_rate": 6.303839732888147e-05, + "loss": 0.4876, + "step": 2057 + }, + { + "epoch": 0.0007223770416782244, + "grad_norm": 0.2921926975250244, + "learning_rate": 6.297161936560934e-05, + "loss": 0.4277, + "step": 2058 + }, + { + "epoch": 0.0007227280509307405, + "grad_norm": 0.28178003430366516, + "learning_rate": 6.290484140233723e-05, + "loss": 0.4427, + "step": 2059 + }, + { + "epoch": 0.0007230790601832567, + "grad_norm": 0.2888009548187256, + "learning_rate": 6.283806343906511e-05, + "loss": 0.4472, + "step": 2060 + }, + { + "epoch": 0.0007234300694357728, + "grad_norm": 0.2815188467502594, + "learning_rate": 6.2771285475793e-05, + "loss": 0.4961, + "step": 2061 + }, + { + "epoch": 0.0007237810786882889, + "grad_norm": 0.3248046636581421, + "learning_rate": 6.270450751252087e-05, + "loss": 0.5689, + "step": 2062 + }, + { + "epoch": 0.0007241320879408051, + "grad_norm": 0.33134186267852783, + "learning_rate": 6.263772954924875e-05, + "loss": 0.5556, + "step": 2063 + }, + { + "epoch": 0.0007244830971933213, + "grad_norm": 0.3108437955379486, + "learning_rate": 6.257095158597663e-05, + "loss": 0.4706, + "step": 2064 + }, + { + "epoch": 0.0007248341064458374, + "grad_norm": 0.29280775785446167, + "learning_rate": 6.250417362270451e-05, + "loss": 0.5024, + "step": 2065 + }, + { + "epoch": 0.0007251851156983535, + "grad_norm": 0.3041885495185852, + "learning_rate": 6.243739565943238e-05, + "loss": 0.5804, + "step": 2066 + }, + { + "epoch": 0.0007255361249508697, + "grad_norm": 0.46564334630966187, + "learning_rate": 6.237061769616027e-05, + "loss": 0.557, + "step": 2067 + }, + { + "epoch": 0.0007258871342033859, + "grad_norm": 0.3866247832775116, + "learning_rate": 6.230383973288815e-05, + "loss": 0.6007, + "step": 2068 + }, + { + "epoch": 0.0007262381434559019, + "grad_norm": 0.3250521421432495, + "learning_rate": 6.223706176961604e-05, + "loss": 0.5898, + "step": 2069 + }, + { + "epoch": 0.0007265891527084181, + "grad_norm": 0.26282358169555664, + "learning_rate": 6.217028380634391e-05, + "loss": 0.4367, + "step": 2070 + }, + { + "epoch": 0.0007269401619609343, + "grad_norm": 0.4470483958721161, + "learning_rate": 6.210350584307179e-05, + "loss": 0.5953, + "step": 2071 + }, + { + "epoch": 0.0007272911712134503, + "grad_norm": 0.3086302578449249, + "learning_rate": 6.203672787979967e-05, + "loss": 0.4693, + "step": 2072 + }, + { + "epoch": 0.0007276421804659665, + "grad_norm": 0.26347777247428894, + "learning_rate": 6.196994991652755e-05, + "loss": 0.3897, + "step": 2073 + }, + { + "epoch": 0.0007279931897184827, + "grad_norm": 0.2902274429798126, + "learning_rate": 6.190317195325542e-05, + "loss": 0.491, + "step": 2074 + }, + { + "epoch": 0.0007283441989709989, + "grad_norm": 0.2900046706199646, + "learning_rate": 6.183639398998331e-05, + "loss": 0.43, + "step": 2075 + }, + { + "epoch": 0.0007286952082235149, + "grad_norm": 0.3595135509967804, + "learning_rate": 6.176961602671118e-05, + "loss": 0.4684, + "step": 2076 + }, + { + "epoch": 0.0007290462174760311, + "grad_norm": 0.2925381064414978, + "learning_rate": 6.170283806343907e-05, + "loss": 0.3986, + "step": 2077 + }, + { + "epoch": 0.0007293972267285473, + "grad_norm": 0.28008222579956055, + "learning_rate": 6.163606010016695e-05, + "loss": 0.4379, + "step": 2078 + }, + { + "epoch": 0.0007297482359810634, + "grad_norm": 0.2913059890270233, + "learning_rate": 6.156928213689483e-05, + "loss": 0.4155, + "step": 2079 + }, + { + "epoch": 0.0007300992452335795, + "grad_norm": 0.3182995021343231, + "learning_rate": 6.150250417362271e-05, + "loss": 0.445, + "step": 2080 + }, + { + "epoch": 0.0007304502544860957, + "grad_norm": 0.26757434010505676, + "learning_rate": 6.143572621035059e-05, + "loss": 0.4718, + "step": 2081 + }, + { + "epoch": 0.0007308012637386118, + "grad_norm": 0.29628437757492065, + "learning_rate": 6.136894824707846e-05, + "loss": 0.5108, + "step": 2082 + }, + { + "epoch": 0.000731152272991128, + "grad_norm": 0.3662126958370209, + "learning_rate": 6.130217028380635e-05, + "loss": 0.5035, + "step": 2083 + }, + { + "epoch": 0.0007315032822436441, + "grad_norm": 0.2901041805744171, + "learning_rate": 6.123539232053422e-05, + "loss": 0.5561, + "step": 2084 + }, + { + "epoch": 0.0007318542914961603, + "grad_norm": 0.3100784420967102, + "learning_rate": 6.11686143572621e-05, + "loss": 0.5088, + "step": 2085 + }, + { + "epoch": 0.0007322053007486764, + "grad_norm": 0.334096223115921, + "learning_rate": 6.110183639398999e-05, + "loss": 0.5374, + "step": 2086 + }, + { + "epoch": 0.0007325563100011926, + "grad_norm": 0.3160945773124695, + "learning_rate": 6.103505843071786e-05, + "loss": 0.5216, + "step": 2087 + }, + { + "epoch": 0.0007329073192537087, + "grad_norm": 0.2668875753879547, + "learning_rate": 6.0968280467445746e-05, + "loss": 0.5266, + "step": 2088 + }, + { + "epoch": 0.0007332583285062248, + "grad_norm": 0.3053551912307739, + "learning_rate": 6.0901502504173626e-05, + "loss": 0.4526, + "step": 2089 + }, + { + "epoch": 0.000733609337758741, + "grad_norm": 0.37118956446647644, + "learning_rate": 6.083472454090151e-05, + "loss": 0.3821, + "step": 2090 + }, + { + "epoch": 0.0007339603470112572, + "grad_norm": 0.3345617651939392, + "learning_rate": 6.0767946577629386e-05, + "loss": 0.5467, + "step": 2091 + }, + { + "epoch": 0.0007343113562637732, + "grad_norm": 0.3057086169719696, + "learning_rate": 6.070116861435726e-05, + "loss": 0.5843, + "step": 2092 + }, + { + "epoch": 0.0007346623655162894, + "grad_norm": 0.322822242975235, + "learning_rate": 6.0634390651085146e-05, + "loss": 0.4741, + "step": 2093 + }, + { + "epoch": 0.0007350133747688056, + "grad_norm": 0.3189939558506012, + "learning_rate": 6.0567612687813026e-05, + "loss": 0.6046, + "step": 2094 + }, + { + "epoch": 0.0007353643840213218, + "grad_norm": 0.2926713526248932, + "learning_rate": 6.05008347245409e-05, + "loss": 0.5153, + "step": 2095 + }, + { + "epoch": 0.0007357153932738378, + "grad_norm": 0.3285471200942993, + "learning_rate": 6.0434056761268785e-05, + "loss": 0.533, + "step": 2096 + }, + { + "epoch": 0.000736066402526354, + "grad_norm": 0.29923808574676514, + "learning_rate": 6.0367278797996665e-05, + "loss": 0.3605, + "step": 2097 + }, + { + "epoch": 0.0007364174117788702, + "grad_norm": 0.3213334381580353, + "learning_rate": 6.030050083472455e-05, + "loss": 0.5723, + "step": 2098 + }, + { + "epoch": 0.0007367684210313862, + "grad_norm": 0.30922314524650574, + "learning_rate": 6.0233722871452425e-05, + "loss": 0.4662, + "step": 2099 + }, + { + "epoch": 0.0007371194302839024, + "grad_norm": 0.2766854166984558, + "learning_rate": 6.01669449081803e-05, + "loss": 0.5024, + "step": 2100 + }, + { + "epoch": 0.0007374704395364186, + "grad_norm": 0.31957265734672546, + "learning_rate": 6.0100166944908185e-05, + "loss": 0.4026, + "step": 2101 + }, + { + "epoch": 0.0007378214487889347, + "grad_norm": 0.3103827238082886, + "learning_rate": 6.0033388981636065e-05, + "loss": 0.4981, + "step": 2102 + }, + { + "epoch": 0.0007381724580414508, + "grad_norm": 0.3047814667224884, + "learning_rate": 5.996661101836394e-05, + "loss": 0.5679, + "step": 2103 + }, + { + "epoch": 0.000738523467293967, + "grad_norm": 0.3266103267669678, + "learning_rate": 5.9899833055091825e-05, + "loss": 0.5143, + "step": 2104 + }, + { + "epoch": 0.0007388744765464832, + "grad_norm": 0.2987171709537506, + "learning_rate": 5.98330550918197e-05, + "loss": 0.3717, + "step": 2105 + }, + { + "epoch": 0.0007392254857989993, + "grad_norm": 0.2716870605945587, + "learning_rate": 5.9766277128547585e-05, + "loss": 0.4654, + "step": 2106 + }, + { + "epoch": 0.0007395764950515154, + "grad_norm": 0.2746477425098419, + "learning_rate": 5.9699499165275465e-05, + "loss": 0.5674, + "step": 2107 + }, + { + "epoch": 0.0007399275043040316, + "grad_norm": 0.30133751034736633, + "learning_rate": 5.963272120200334e-05, + "loss": 0.4119, + "step": 2108 + }, + { + "epoch": 0.0007402785135565477, + "grad_norm": 0.2945064902305603, + "learning_rate": 5.9565943238731224e-05, + "loss": 0.4342, + "step": 2109 + }, + { + "epoch": 0.0007406295228090639, + "grad_norm": 0.32607197761535645, + "learning_rate": 5.9499165275459104e-05, + "loss": 0.5316, + "step": 2110 + }, + { + "epoch": 0.00074098053206158, + "grad_norm": 0.29035452008247375, + "learning_rate": 5.943238731218698e-05, + "loss": 0.5378, + "step": 2111 + }, + { + "epoch": 0.0007413315413140961, + "grad_norm": 0.311575710773468, + "learning_rate": 5.9365609348914864e-05, + "loss": 0.4837, + "step": 2112 + }, + { + "epoch": 0.0007416825505666123, + "grad_norm": 0.29044589400291443, + "learning_rate": 5.929883138564274e-05, + "loss": 0.4939, + "step": 2113 + }, + { + "epoch": 0.0007420335598191285, + "grad_norm": 0.36396634578704834, + "learning_rate": 5.9232053422370624e-05, + "loss": 0.4681, + "step": 2114 + }, + { + "epoch": 0.0007423845690716446, + "grad_norm": 0.400389701128006, + "learning_rate": 5.9165275459098504e-05, + "loss": 0.6224, + "step": 2115 + }, + { + "epoch": 0.0007427355783241607, + "grad_norm": 0.32242146134376526, + "learning_rate": 5.909849749582638e-05, + "loss": 0.4635, + "step": 2116 + }, + { + "epoch": 0.0007430865875766769, + "grad_norm": 0.3917767107486725, + "learning_rate": 5.9031719532554264e-05, + "loss": 0.5719, + "step": 2117 + }, + { + "epoch": 0.0007434375968291931, + "grad_norm": 0.3087753653526306, + "learning_rate": 5.896494156928214e-05, + "loss": 0.4465, + "step": 2118 + }, + { + "epoch": 0.0007437886060817091, + "grad_norm": 0.3342611789703369, + "learning_rate": 5.889816360601002e-05, + "loss": 0.6269, + "step": 2119 + }, + { + "epoch": 0.0007441396153342253, + "grad_norm": 0.3343375623226166, + "learning_rate": 5.8831385642737904e-05, + "loss": 0.4707, + "step": 2120 + }, + { + "epoch": 0.0007444906245867415, + "grad_norm": 0.2936212718486786, + "learning_rate": 5.876460767946578e-05, + "loss": 0.5382, + "step": 2121 + }, + { + "epoch": 0.0007448416338392575, + "grad_norm": 0.29692140221595764, + "learning_rate": 5.8697829716193664e-05, + "loss": 0.4286, + "step": 2122 + }, + { + "epoch": 0.0007451926430917737, + "grad_norm": 0.3181445598602295, + "learning_rate": 5.863105175292154e-05, + "loss": 0.5988, + "step": 2123 + }, + { + "epoch": 0.0007455436523442899, + "grad_norm": 0.27625536918640137, + "learning_rate": 5.856427378964942e-05, + "loss": 0.5016, + "step": 2124 + }, + { + "epoch": 0.0007458946615968061, + "grad_norm": 0.28393319249153137, + "learning_rate": 5.84974958263773e-05, + "loss": 0.4275, + "step": 2125 + }, + { + "epoch": 0.0007462456708493221, + "grad_norm": 0.2632163465023041, + "learning_rate": 5.8430717863105176e-05, + "loss": 0.4724, + "step": 2126 + }, + { + "epoch": 0.0007465966801018383, + "grad_norm": 0.27599701285362244, + "learning_rate": 5.8363939899833056e-05, + "loss": 0.4605, + "step": 2127 + }, + { + "epoch": 0.0007469476893543545, + "grad_norm": 0.30116626620292664, + "learning_rate": 5.829716193656094e-05, + "loss": 0.4501, + "step": 2128 + }, + { + "epoch": 0.0007472986986068706, + "grad_norm": 0.3393256664276123, + "learning_rate": 5.8230383973288816e-05, + "loss": 0.4258, + "step": 2129 + }, + { + "epoch": 0.0007476497078593867, + "grad_norm": 0.33133190870285034, + "learning_rate": 5.81636060100167e-05, + "loss": 0.5216, + "step": 2130 + }, + { + "epoch": 0.0007480007171119029, + "grad_norm": 0.33170127868652344, + "learning_rate": 5.8096828046744576e-05, + "loss": 0.5108, + "step": 2131 + }, + { + "epoch": 0.000748351726364419, + "grad_norm": 0.2727866768836975, + "learning_rate": 5.8030050083472456e-05, + "loss": 0.4282, + "step": 2132 + }, + { + "epoch": 0.0007487027356169352, + "grad_norm": 0.3469356894493103, + "learning_rate": 5.796327212020034e-05, + "loss": 0.6135, + "step": 2133 + }, + { + "epoch": 0.0007490537448694513, + "grad_norm": 0.35446175932884216, + "learning_rate": 5.7896494156928216e-05, + "loss": 0.6232, + "step": 2134 + }, + { + "epoch": 0.0007494047541219675, + "grad_norm": 0.2925025522708893, + "learning_rate": 5.782971619365609e-05, + "loss": 0.5422, + "step": 2135 + }, + { + "epoch": 0.0007497557633744836, + "grad_norm": 0.3256363868713379, + "learning_rate": 5.7762938230383976e-05, + "loss": 0.5542, + "step": 2136 + }, + { + "epoch": 0.0007501067726269998, + "grad_norm": 0.30574938654899597, + "learning_rate": 5.7696160267111856e-05, + "loss": 0.4611, + "step": 2137 + }, + { + "epoch": 0.0007504577818795159, + "grad_norm": 0.31476491689682007, + "learning_rate": 5.762938230383974e-05, + "loss": 0.5159, + "step": 2138 + }, + { + "epoch": 0.000750808791132032, + "grad_norm": 0.3236735165119171, + "learning_rate": 5.7562604340567616e-05, + "loss": 0.41, + "step": 2139 + }, + { + "epoch": 0.0007511598003845482, + "grad_norm": 0.3269157409667969, + "learning_rate": 5.749582637729549e-05, + "loss": 0.5634, + "step": 2140 + }, + { + "epoch": 0.0007515108096370644, + "grad_norm": 0.2963981032371521, + "learning_rate": 5.7429048414023375e-05, + "loss": 0.5234, + "step": 2141 + }, + { + "epoch": 0.0007518618188895804, + "grad_norm": 0.3150303363800049, + "learning_rate": 5.7362270450751255e-05, + "loss": 0.5364, + "step": 2142 + }, + { + "epoch": 0.0007522128281420966, + "grad_norm": 0.31830161809921265, + "learning_rate": 5.729549248747913e-05, + "loss": 0.4401, + "step": 2143 + }, + { + "epoch": 0.0007525638373946128, + "grad_norm": 0.31755512952804565, + "learning_rate": 5.7228714524207015e-05, + "loss": 0.5842, + "step": 2144 + }, + { + "epoch": 0.000752914846647129, + "grad_norm": 0.3024124503135681, + "learning_rate": 5.7161936560934895e-05, + "loss": 0.4321, + "step": 2145 + }, + { + "epoch": 0.000753265855899645, + "grad_norm": 0.28298643231391907, + "learning_rate": 5.709515859766278e-05, + "loss": 0.3981, + "step": 2146 + }, + { + "epoch": 0.0007536168651521612, + "grad_norm": 0.35037901997566223, + "learning_rate": 5.7028380634390655e-05, + "loss": 0.5613, + "step": 2147 + }, + { + "epoch": 0.0007539678744046774, + "grad_norm": 0.34800994396209717, + "learning_rate": 5.696160267111853e-05, + "loss": 0.5463, + "step": 2148 + }, + { + "epoch": 0.0007543188836571934, + "grad_norm": 0.2794566750526428, + "learning_rate": 5.6894824707846415e-05, + "loss": 0.6228, + "step": 2149 + }, + { + "epoch": 0.0007546698929097096, + "grad_norm": 0.3680720031261444, + "learning_rate": 5.6828046744574295e-05, + "loss": 0.5515, + "step": 2150 + }, + { + "epoch": 0.0007550209021622258, + "grad_norm": 0.2983403503894806, + "learning_rate": 5.676126878130217e-05, + "loss": 0.4201, + "step": 2151 + }, + { + "epoch": 0.0007553719114147419, + "grad_norm": 0.3301478326320648, + "learning_rate": 5.6694490818030055e-05, + "loss": 0.5004, + "step": 2152 + }, + { + "epoch": 0.000755722920667258, + "grad_norm": 0.32439282536506653, + "learning_rate": 5.662771285475793e-05, + "loss": 0.5586, + "step": 2153 + }, + { + "epoch": 0.0007560739299197742, + "grad_norm": 0.2889827787876129, + "learning_rate": 5.6560934891485815e-05, + "loss": 0.3882, + "step": 2154 + }, + { + "epoch": 0.0007564249391722904, + "grad_norm": 0.3911135792732239, + "learning_rate": 5.6494156928213694e-05, + "loss": 0.542, + "step": 2155 + }, + { + "epoch": 0.0007567759484248065, + "grad_norm": 0.2955808937549591, + "learning_rate": 5.642737896494157e-05, + "loss": 0.5586, + "step": 2156 + }, + { + "epoch": 0.0007571269576773226, + "grad_norm": 0.5502769351005554, + "learning_rate": 5.6360601001669454e-05, + "loss": 0.5416, + "step": 2157 + }, + { + "epoch": 0.0007574779669298388, + "grad_norm": 0.3201058506965637, + "learning_rate": 5.629382303839733e-05, + "loss": 0.4864, + "step": 2158 + }, + { + "epoch": 0.0007578289761823549, + "grad_norm": 0.39620837569236755, + "learning_rate": 5.622704507512521e-05, + "loss": 0.5328, + "step": 2159 + }, + { + "epoch": 0.0007581799854348711, + "grad_norm": 0.3239331543445587, + "learning_rate": 5.6160267111853094e-05, + "loss": 0.5055, + "step": 2160 + }, + { + "epoch": 0.0007585309946873872, + "grad_norm": 0.3722355365753174, + "learning_rate": 5.609348914858097e-05, + "loss": 0.4805, + "step": 2161 + }, + { + "epoch": 0.0007588820039399033, + "grad_norm": 0.3486960232257843, + "learning_rate": 5.6026711185308854e-05, + "loss": 0.4956, + "step": 2162 + }, + { + "epoch": 0.0007592330131924195, + "grad_norm": 0.2911629378795624, + "learning_rate": 5.5959933222036734e-05, + "loss": 0.5888, + "step": 2163 + }, + { + "epoch": 0.0007595840224449357, + "grad_norm": 0.3276377022266388, + "learning_rate": 5.589315525876461e-05, + "loss": 0.4302, + "step": 2164 + }, + { + "epoch": 0.0007599350316974518, + "grad_norm": 0.3614025413990021, + "learning_rate": 5.5826377295492494e-05, + "loss": 0.5716, + "step": 2165 + }, + { + "epoch": 0.0007602860409499679, + "grad_norm": 0.2928791642189026, + "learning_rate": 5.575959933222037e-05, + "loss": 0.5744, + "step": 2166 + }, + { + "epoch": 0.0007606370502024841, + "grad_norm": 0.33232712745666504, + "learning_rate": 5.569282136894825e-05, + "loss": 0.5868, + "step": 2167 + }, + { + "epoch": 0.0007609880594550003, + "grad_norm": 0.28385528922080994, + "learning_rate": 5.5626043405676134e-05, + "loss": 0.4737, + "step": 2168 + }, + { + "epoch": 0.0007613390687075163, + "grad_norm": 0.3103507161140442, + "learning_rate": 5.555926544240401e-05, + "loss": 0.5225, + "step": 2169 + }, + { + "epoch": 0.0007616900779600325, + "grad_norm": 0.32343319058418274, + "learning_rate": 5.5492487479131893e-05, + "loss": 0.5051, + "step": 2170 + }, + { + "epoch": 0.0007620410872125487, + "grad_norm": 0.32034847140312195, + "learning_rate": 5.5425709515859767e-05, + "loss": 0.4835, + "step": 2171 + }, + { + "epoch": 0.0007623920964650648, + "grad_norm": 0.3377881646156311, + "learning_rate": 5.5358931552587646e-05, + "loss": 0.5747, + "step": 2172 + }, + { + "epoch": 0.0007627431057175809, + "grad_norm": 0.3083537518978119, + "learning_rate": 5.529215358931553e-05, + "loss": 0.4212, + "step": 2173 + }, + { + "epoch": 0.0007630941149700971, + "grad_norm": 0.29118117690086365, + "learning_rate": 5.5225375626043406e-05, + "loss": 0.4961, + "step": 2174 + }, + { + "epoch": 0.0007634451242226132, + "grad_norm": 0.28406381607055664, + "learning_rate": 5.5158597662771286e-05, + "loss": 0.4457, + "step": 2175 + }, + { + "epoch": 0.0007637961334751293, + "grad_norm": 0.30152761936187744, + "learning_rate": 5.509181969949917e-05, + "loss": 0.5338, + "step": 2176 + }, + { + "epoch": 0.0007641471427276455, + "grad_norm": 0.2911263108253479, + "learning_rate": 5.5025041736227046e-05, + "loss": 0.5034, + "step": 2177 + }, + { + "epoch": 0.0007644981519801617, + "grad_norm": 0.3346504271030426, + "learning_rate": 5.495826377295493e-05, + "loss": 0.477, + "step": 2178 + }, + { + "epoch": 0.0007648491612326778, + "grad_norm": 0.2966042459011078, + "learning_rate": 5.4891485809682806e-05, + "loss": 0.5751, + "step": 2179 + }, + { + "epoch": 0.000765200170485194, + "grad_norm": 0.3047448694705963, + "learning_rate": 5.4824707846410686e-05, + "loss": 0.5254, + "step": 2180 + }, + { + "epoch": 0.0007655511797377101, + "grad_norm": 0.2856384813785553, + "learning_rate": 5.475792988313857e-05, + "loss": 0.4936, + "step": 2181 + }, + { + "epoch": 0.0007659021889902262, + "grad_norm": 0.3124605119228363, + "learning_rate": 5.4691151919866446e-05, + "loss": 0.4669, + "step": 2182 + }, + { + "epoch": 0.0007662531982427424, + "grad_norm": 0.3174493908882141, + "learning_rate": 5.462437395659432e-05, + "loss": 0.4818, + "step": 2183 + }, + { + "epoch": 0.0007666042074952585, + "grad_norm": 0.2865968942642212, + "learning_rate": 5.4557595993322206e-05, + "loss": 0.5599, + "step": 2184 + }, + { + "epoch": 0.0007669552167477746, + "grad_norm": 0.29531776905059814, + "learning_rate": 5.4490818030050086e-05, + "loss": 0.5399, + "step": 2185 + }, + { + "epoch": 0.0007673062260002908, + "grad_norm": 0.2822519540786743, + "learning_rate": 5.442404006677797e-05, + "loss": 0.4714, + "step": 2186 + }, + { + "epoch": 0.000767657235252807, + "grad_norm": 0.3201335668563843, + "learning_rate": 5.4357262103505845e-05, + "loss": 0.4805, + "step": 2187 + }, + { + "epoch": 0.0007680082445053231, + "grad_norm": 0.3130393326282501, + "learning_rate": 5.429048414023372e-05, + "loss": 0.5615, + "step": 2188 + }, + { + "epoch": 0.0007683592537578392, + "grad_norm": 0.3346283435821533, + "learning_rate": 5.4223706176961605e-05, + "loss": 0.4135, + "step": 2189 + }, + { + "epoch": 0.0007687102630103554, + "grad_norm": 0.3538265526294708, + "learning_rate": 5.4156928213689485e-05, + "loss": 0.6236, + "step": 2190 + }, + { + "epoch": 0.0007690612722628716, + "grad_norm": 0.3180318772792816, + "learning_rate": 5.409015025041736e-05, + "loss": 0.5531, + "step": 2191 + }, + { + "epoch": 0.0007694122815153876, + "grad_norm": 0.29365554451942444, + "learning_rate": 5.4023372287145245e-05, + "loss": 0.4976, + "step": 2192 + }, + { + "epoch": 0.0007697632907679038, + "grad_norm": 0.3063375949859619, + "learning_rate": 5.3956594323873125e-05, + "loss": 0.4142, + "step": 2193 + }, + { + "epoch": 0.00077011430002042, + "grad_norm": 0.3582829535007477, + "learning_rate": 5.388981636060101e-05, + "loss": 0.5482, + "step": 2194 + }, + { + "epoch": 0.000770465309272936, + "grad_norm": 0.33901599049568176, + "learning_rate": 5.3823038397328885e-05, + "loss": 0.4793, + "step": 2195 + }, + { + "epoch": 0.0007708163185254522, + "grad_norm": 0.36831775307655334, + "learning_rate": 5.375626043405676e-05, + "loss": 0.5113, + "step": 2196 + }, + { + "epoch": 0.0007711673277779684, + "grad_norm": 0.28705963492393494, + "learning_rate": 5.3689482470784645e-05, + "loss": 0.4417, + "step": 2197 + }, + { + "epoch": 0.0007715183370304846, + "grad_norm": 0.3740500509738922, + "learning_rate": 5.3622704507512525e-05, + "loss": 0.4813, + "step": 2198 + }, + { + "epoch": 0.0007718693462830007, + "grad_norm": 0.3366188108921051, + "learning_rate": 5.35559265442404e-05, + "loss": 0.6308, + "step": 2199 + }, + { + "epoch": 0.0007722203555355168, + "grad_norm": 0.31155267357826233, + "learning_rate": 5.3489148580968285e-05, + "loss": 0.4941, + "step": 2200 + }, + { + "epoch": 0.000772571364788033, + "grad_norm": 0.30353885889053345, + "learning_rate": 5.342237061769616e-05, + "loss": 0.5345, + "step": 2201 + }, + { + "epoch": 0.0007729223740405491, + "grad_norm": 0.37722593545913696, + "learning_rate": 5.3355592654424044e-05, + "loss": 0.603, + "step": 2202 + }, + { + "epoch": 0.0007732733832930652, + "grad_norm": 0.31102925539016724, + "learning_rate": 5.3288814691151924e-05, + "loss": 0.5088, + "step": 2203 + }, + { + "epoch": 0.0007736243925455814, + "grad_norm": 0.285318523645401, + "learning_rate": 5.32220367278798e-05, + "loss": 0.4774, + "step": 2204 + }, + { + "epoch": 0.0007739754017980975, + "grad_norm": 0.26891449093818665, + "learning_rate": 5.3155258764607684e-05, + "loss": 0.4678, + "step": 2205 + }, + { + "epoch": 0.0007743264110506137, + "grad_norm": 0.40345075726509094, + "learning_rate": 5.308848080133556e-05, + "loss": 0.5511, + "step": 2206 + }, + { + "epoch": 0.0007746774203031298, + "grad_norm": 0.26102039217948914, + "learning_rate": 5.302170283806344e-05, + "loss": 0.5149, + "step": 2207 + }, + { + "epoch": 0.000775028429555646, + "grad_norm": 0.2700537443161011, + "learning_rate": 5.2954924874791324e-05, + "loss": 0.5239, + "step": 2208 + }, + { + "epoch": 0.0007753794388081621, + "grad_norm": 0.3144576847553253, + "learning_rate": 5.28881469115192e-05, + "loss": 0.4237, + "step": 2209 + }, + { + "epoch": 0.0007757304480606783, + "grad_norm": 0.26037758588790894, + "learning_rate": 5.2821368948247084e-05, + "loss": 0.4597, + "step": 2210 + }, + { + "epoch": 0.0007760814573131944, + "grad_norm": 0.2766638398170471, + "learning_rate": 5.2754590984974964e-05, + "loss": 0.4957, + "step": 2211 + }, + { + "epoch": 0.0007764324665657105, + "grad_norm": 0.30142873525619507, + "learning_rate": 5.268781302170284e-05, + "loss": 0.3903, + "step": 2212 + }, + { + "epoch": 0.0007767834758182267, + "grad_norm": 0.4231036305427551, + "learning_rate": 5.2621035058430724e-05, + "loss": 0.4719, + "step": 2213 + }, + { + "epoch": 0.0007771344850707429, + "grad_norm": 0.31130513548851013, + "learning_rate": 5.25542570951586e-05, + "loss": 0.53, + "step": 2214 + }, + { + "epoch": 0.0007774854943232589, + "grad_norm": 0.35050421953201294, + "learning_rate": 5.248747913188648e-05, + "loss": 0.5429, + "step": 2215 + }, + { + "epoch": 0.0007778365035757751, + "grad_norm": 0.3292376399040222, + "learning_rate": 5.2420701168614363e-05, + "loss": 0.4373, + "step": 2216 + }, + { + "epoch": 0.0007781875128282913, + "grad_norm": 0.319871187210083, + "learning_rate": 5.2353923205342237e-05, + "loss": 0.6004, + "step": 2217 + }, + { + "epoch": 0.0007785385220808075, + "grad_norm": 0.35365426540374756, + "learning_rate": 5.228714524207012e-05, + "loss": 0.613, + "step": 2218 + }, + { + "epoch": 0.0007788895313333235, + "grad_norm": 0.3369859457015991, + "learning_rate": 5.2220367278797996e-05, + "loss": 0.5368, + "step": 2219 + }, + { + "epoch": 0.0007792405405858397, + "grad_norm": 0.31861892342567444, + "learning_rate": 5.2153589315525876e-05, + "loss": 0.4325, + "step": 2220 + }, + { + "epoch": 0.0007795915498383559, + "grad_norm": 0.3197747766971588, + "learning_rate": 5.208681135225376e-05, + "loss": 0.5159, + "step": 2221 + }, + { + "epoch": 0.000779942559090872, + "grad_norm": 0.377331018447876, + "learning_rate": 5.2020033388981636e-05, + "loss": 0.4483, + "step": 2222 + }, + { + "epoch": 0.0007802935683433881, + "grad_norm": 0.2843930721282959, + "learning_rate": 5.195325542570952e-05, + "loss": 0.4812, + "step": 2223 + }, + { + "epoch": 0.0007806445775959043, + "grad_norm": 0.28777164220809937, + "learning_rate": 5.18864774624374e-05, + "loss": 0.4099, + "step": 2224 + }, + { + "epoch": 0.0007809955868484204, + "grad_norm": 0.3638690710067749, + "learning_rate": 5.1819699499165276e-05, + "loss": 0.5787, + "step": 2225 + }, + { + "epoch": 0.0007813465961009366, + "grad_norm": 0.36113011837005615, + "learning_rate": 5.175292153589316e-05, + "loss": 0.6242, + "step": 2226 + }, + { + "epoch": 0.0007816976053534527, + "grad_norm": 0.27899301052093506, + "learning_rate": 5.1686143572621036e-05, + "loss": 0.602, + "step": 2227 + }, + { + "epoch": 0.0007820486146059689, + "grad_norm": 0.32224345207214355, + "learning_rate": 5.1619365609348916e-05, + "loss": 0.4784, + "step": 2228 + }, + { + "epoch": 0.000782399623858485, + "grad_norm": 0.3012712597846985, + "learning_rate": 5.15525876460768e-05, + "loss": 0.5268, + "step": 2229 + }, + { + "epoch": 0.0007827506331110012, + "grad_norm": 0.3213576674461365, + "learning_rate": 5.1485809682804676e-05, + "loss": 0.4646, + "step": 2230 + }, + { + "epoch": 0.0007831016423635173, + "grad_norm": 0.3685286045074463, + "learning_rate": 5.141903171953256e-05, + "loss": 0.5744, + "step": 2231 + }, + { + "epoch": 0.0007834526516160334, + "grad_norm": 0.3282943665981293, + "learning_rate": 5.1352253756260436e-05, + "loss": 0.4577, + "step": 2232 + }, + { + "epoch": 0.0007838036608685496, + "grad_norm": 0.3141206204891205, + "learning_rate": 5.1285475792988315e-05, + "loss": 0.4351, + "step": 2233 + }, + { + "epoch": 0.0007841546701210657, + "grad_norm": 0.3435308337211609, + "learning_rate": 5.12186978297162e-05, + "loss": 0.5928, + "step": 2234 + }, + { + "epoch": 0.0007845056793735818, + "grad_norm": 0.37721729278564453, + "learning_rate": 5.1151919866444075e-05, + "loss": 0.4618, + "step": 2235 + }, + { + "epoch": 0.000784856688626098, + "grad_norm": 0.676645815372467, + "learning_rate": 5.108514190317195e-05, + "loss": 0.6159, + "step": 2236 + }, + { + "epoch": 0.0007852076978786142, + "grad_norm": 0.3856793940067291, + "learning_rate": 5.1018363939899835e-05, + "loss": 0.5401, + "step": 2237 + }, + { + "epoch": 0.0007855587071311303, + "grad_norm": 0.30672600865364075, + "learning_rate": 5.0951585976627715e-05, + "loss": 0.5427, + "step": 2238 + }, + { + "epoch": 0.0007859097163836464, + "grad_norm": 0.30035004019737244, + "learning_rate": 5.08848080133556e-05, + "loss": 0.5563, + "step": 2239 + }, + { + "epoch": 0.0007862607256361626, + "grad_norm": 0.29214805364608765, + "learning_rate": 5.0818030050083475e-05, + "loss": 0.4528, + "step": 2240 + }, + { + "epoch": 0.0007866117348886788, + "grad_norm": 0.2923140823841095, + "learning_rate": 5.0751252086811355e-05, + "loss": 0.4968, + "step": 2241 + }, + { + "epoch": 0.0007869627441411948, + "grad_norm": 0.2867215573787689, + "learning_rate": 5.068447412353924e-05, + "loss": 0.3843, + "step": 2242 + }, + { + "epoch": 0.000787313753393711, + "grad_norm": 0.35113075375556946, + "learning_rate": 5.0617696160267115e-05, + "loss": 0.5576, + "step": 2243 + }, + { + "epoch": 0.0007876647626462272, + "grad_norm": 0.3268751800060272, + "learning_rate": 5.055091819699499e-05, + "loss": 0.4707, + "step": 2244 + }, + { + "epoch": 0.0007880157718987433, + "grad_norm": 0.3053974211215973, + "learning_rate": 5.0484140233722875e-05, + "loss": 0.4569, + "step": 2245 + }, + { + "epoch": 0.0007883667811512594, + "grad_norm": 0.29972633719444275, + "learning_rate": 5.0417362270450755e-05, + "loss": 0.5683, + "step": 2246 + }, + { + "epoch": 0.0007887177904037756, + "grad_norm": 0.3231423795223236, + "learning_rate": 5.035058430717864e-05, + "loss": 0.6094, + "step": 2247 + }, + { + "epoch": 0.0007890687996562918, + "grad_norm": 0.3402612805366516, + "learning_rate": 5.0283806343906514e-05, + "loss": 0.4556, + "step": 2248 + }, + { + "epoch": 0.0007894198089088079, + "grad_norm": 0.3409544825553894, + "learning_rate": 5.021702838063439e-05, + "loss": 0.4996, + "step": 2249 + }, + { + "epoch": 0.000789770818161324, + "grad_norm": 0.36050474643707275, + "learning_rate": 5.0150250417362274e-05, + "loss": 0.5084, + "step": 2250 + }, + { + "epoch": 0.0007901218274138402, + "grad_norm": 0.26200827956199646, + "learning_rate": 5.0083472454090154e-05, + "loss": 0.4541, + "step": 2251 + }, + { + "epoch": 0.0007904728366663563, + "grad_norm": 0.2840903401374817, + "learning_rate": 5.001669449081803e-05, + "loss": 0.4942, + "step": 2252 + }, + { + "epoch": 0.0007908238459188725, + "grad_norm": 0.2910694181919098, + "learning_rate": 4.9949916527545914e-05, + "loss": 0.4837, + "step": 2253 + }, + { + "epoch": 0.0007911748551713886, + "grad_norm": 0.3161328136920929, + "learning_rate": 4.988313856427379e-05, + "loss": 0.55, + "step": 2254 + }, + { + "epoch": 0.0007915258644239047, + "grad_norm": 0.2806893587112427, + "learning_rate": 4.9816360601001674e-05, + "loss": 0.5187, + "step": 2255 + }, + { + "epoch": 0.0007918768736764209, + "grad_norm": 0.27077895402908325, + "learning_rate": 4.9749582637729554e-05, + "loss": 0.4551, + "step": 2256 + }, + { + "epoch": 0.000792227882928937, + "grad_norm": 0.29795902967453003, + "learning_rate": 4.9682804674457434e-05, + "loss": 0.4059, + "step": 2257 + }, + { + "epoch": 0.0007925788921814532, + "grad_norm": 0.2706364691257477, + "learning_rate": 4.961602671118531e-05, + "loss": 0.5352, + "step": 2258 + }, + { + "epoch": 0.0007929299014339693, + "grad_norm": 0.2788883149623871, + "learning_rate": 4.9549248747913194e-05, + "loss": 0.4768, + "step": 2259 + }, + { + "epoch": 0.0007932809106864855, + "grad_norm": 0.2956430912017822, + "learning_rate": 4.9482470784641074e-05, + "loss": 0.5953, + "step": 2260 + }, + { + "epoch": 0.0007936319199390016, + "grad_norm": 0.30534154176712036, + "learning_rate": 4.9415692821368953e-05, + "loss": 0.5104, + "step": 2261 + }, + { + "epoch": 0.0007939829291915177, + "grad_norm": 0.35847917199134827, + "learning_rate": 4.934891485809683e-05, + "loss": 0.5711, + "step": 2262 + }, + { + "epoch": 0.0007943339384440339, + "grad_norm": 0.2559013068675995, + "learning_rate": 4.9282136894824707e-05, + "loss": 0.3132, + "step": 2263 + }, + { + "epoch": 0.0007946849476965501, + "grad_norm": 0.33414438366889954, + "learning_rate": 4.921535893155259e-05, + "loss": 0.5808, + "step": 2264 + }, + { + "epoch": 0.0007950359569490661, + "grad_norm": 0.3346371650695801, + "learning_rate": 4.914858096828047e-05, + "loss": 0.5011, + "step": 2265 + }, + { + "epoch": 0.0007953869662015823, + "grad_norm": 0.3767020106315613, + "learning_rate": 4.9081803005008346e-05, + "loss": 0.5518, + "step": 2266 + }, + { + "epoch": 0.0007957379754540985, + "grad_norm": 0.34961530566215515, + "learning_rate": 4.9015025041736226e-05, + "loss": 0.5566, + "step": 2267 + }, + { + "epoch": 0.0007960889847066147, + "grad_norm": 0.42262473702430725, + "learning_rate": 4.894824707846411e-05, + "loss": 0.4769, + "step": 2268 + }, + { + "epoch": 0.0007964399939591307, + "grad_norm": 0.28671953082084656, + "learning_rate": 4.888146911519199e-05, + "loss": 0.4531, + "step": 2269 + }, + { + "epoch": 0.0007967910032116469, + "grad_norm": 0.2979021370410919, + "learning_rate": 4.8814691151919866e-05, + "loss": 0.46, + "step": 2270 + }, + { + "epoch": 0.0007971420124641631, + "grad_norm": 0.310390830039978, + "learning_rate": 4.8747913188647746e-05, + "loss": 0.4786, + "step": 2271 + }, + { + "epoch": 0.0007974930217166792, + "grad_norm": 0.2858920693397522, + "learning_rate": 4.8681135225375626e-05, + "loss": 0.5615, + "step": 2272 + }, + { + "epoch": 0.0007978440309691953, + "grad_norm": 0.30646857619285583, + "learning_rate": 4.861435726210351e-05, + "loss": 0.4111, + "step": 2273 + }, + { + "epoch": 0.0007981950402217115, + "grad_norm": 0.2704682946205139, + "learning_rate": 4.8547579298831386e-05, + "loss": 0.4601, + "step": 2274 + }, + { + "epoch": 0.0007985460494742276, + "grad_norm": 0.2745610475540161, + "learning_rate": 4.8480801335559266e-05, + "loss": 0.444, + "step": 2275 + }, + { + "epoch": 0.0007988970587267438, + "grad_norm": 0.318915992975235, + "learning_rate": 4.8414023372287146e-05, + "loss": 0.461, + "step": 2276 + }, + { + "epoch": 0.0007992480679792599, + "grad_norm": 0.38466915488243103, + "learning_rate": 4.834724540901503e-05, + "loss": 0.5273, + "step": 2277 + }, + { + "epoch": 0.0007995990772317761, + "grad_norm": 0.343703955411911, + "learning_rate": 4.8280467445742906e-05, + "loss": 0.4381, + "step": 2278 + }, + { + "epoch": 0.0007999500864842922, + "grad_norm": 0.30002158880233765, + "learning_rate": 4.8213689482470785e-05, + "loss": 0.3609, + "step": 2279 + }, + { + "epoch": 0.0008003010957368084, + "grad_norm": 0.2969815135002136, + "learning_rate": 4.8146911519198665e-05, + "loss": 0.4335, + "step": 2280 + }, + { + "epoch": 0.0008006521049893245, + "grad_norm": 0.24625307321548462, + "learning_rate": 4.8080133555926545e-05, + "loss": 0.3024, + "step": 2281 + }, + { + "epoch": 0.0008010031142418406, + "grad_norm": 0.3032619059085846, + "learning_rate": 4.8013355592654425e-05, + "loss": 0.5862, + "step": 2282 + }, + { + "epoch": 0.0008013541234943568, + "grad_norm": 0.3563072085380554, + "learning_rate": 4.7946577629382305e-05, + "loss": 0.5831, + "step": 2283 + }, + { + "epoch": 0.000801705132746873, + "grad_norm": 0.27989256381988525, + "learning_rate": 4.7879799666110185e-05, + "loss": 0.407, + "step": 2284 + }, + { + "epoch": 0.000802056141999389, + "grad_norm": 0.3893837034702301, + "learning_rate": 4.7813021702838065e-05, + "loss": 0.489, + "step": 2285 + }, + { + "epoch": 0.0008024071512519052, + "grad_norm": 0.2796432673931122, + "learning_rate": 4.7746243739565945e-05, + "loss": 0.5497, + "step": 2286 + }, + { + "epoch": 0.0008027581605044214, + "grad_norm": 0.30520594120025635, + "learning_rate": 4.7679465776293825e-05, + "loss": 0.5586, + "step": 2287 + }, + { + "epoch": 0.0008031091697569375, + "grad_norm": 0.31399065256118774, + "learning_rate": 4.7612687813021705e-05, + "loss": 0.5585, + "step": 2288 + }, + { + "epoch": 0.0008034601790094536, + "grad_norm": 0.29442235827445984, + "learning_rate": 4.7545909849749585e-05, + "loss": 0.487, + "step": 2289 + }, + { + "epoch": 0.0008038111882619698, + "grad_norm": 0.33235105872154236, + "learning_rate": 4.7479131886477465e-05, + "loss": 0.5476, + "step": 2290 + }, + { + "epoch": 0.000804162197514486, + "grad_norm": 0.31871527433395386, + "learning_rate": 4.7412353923205345e-05, + "loss": 0.5141, + "step": 2291 + }, + { + "epoch": 0.000804513206767002, + "grad_norm": 0.3413945138454437, + "learning_rate": 4.7345575959933225e-05, + "loss": 0.5544, + "step": 2292 + }, + { + "epoch": 0.0008048642160195182, + "grad_norm": 0.3110330402851105, + "learning_rate": 4.7278797996661104e-05, + "loss": 0.5044, + "step": 2293 + }, + { + "epoch": 0.0008052152252720344, + "grad_norm": 0.3235619068145752, + "learning_rate": 4.7212020033388984e-05, + "loss": 0.5005, + "step": 2294 + }, + { + "epoch": 0.0008055662345245505, + "grad_norm": 0.2979834973812103, + "learning_rate": 4.7145242070116864e-05, + "loss": 0.5182, + "step": 2295 + }, + { + "epoch": 0.0008059172437770666, + "grad_norm": 0.3092743456363678, + "learning_rate": 4.7078464106844744e-05, + "loss": 0.5234, + "step": 2296 + }, + { + "epoch": 0.0008062682530295828, + "grad_norm": 0.2838219702243805, + "learning_rate": 4.7011686143572624e-05, + "loss": 0.4375, + "step": 2297 + }, + { + "epoch": 0.000806619262282099, + "grad_norm": 0.2947825491428375, + "learning_rate": 4.6944908180300504e-05, + "loss": 0.5849, + "step": 2298 + }, + { + "epoch": 0.0008069702715346151, + "grad_norm": 0.32933109998703003, + "learning_rate": 4.6878130217028384e-05, + "loss": 0.5916, + "step": 2299 + }, + { + "epoch": 0.0008073212807871312, + "grad_norm": 0.28970029950141907, + "learning_rate": 4.6811352253756264e-05, + "loss": 0.5157, + "step": 2300 + }, + { + "epoch": 0.0008076722900396474, + "grad_norm": 0.30502164363861084, + "learning_rate": 4.6744574290484144e-05, + "loss": 0.475, + "step": 2301 + }, + { + "epoch": 0.0008080232992921635, + "grad_norm": 0.3376252353191376, + "learning_rate": 4.667779632721202e-05, + "loss": 0.5223, + "step": 2302 + }, + { + "epoch": 0.0008083743085446797, + "grad_norm": 0.3515482246875763, + "learning_rate": 4.6611018363939904e-05, + "loss": 0.4526, + "step": 2303 + }, + { + "epoch": 0.0008087253177971958, + "grad_norm": 0.27139726281166077, + "learning_rate": 4.6544240400667784e-05, + "loss": 0.3947, + "step": 2304 + }, + { + "epoch": 0.0008090763270497119, + "grad_norm": 0.329605370759964, + "learning_rate": 4.6477462437395664e-05, + "loss": 0.599, + "step": 2305 + }, + { + "epoch": 0.0008094273363022281, + "grad_norm": 0.2759001553058624, + "learning_rate": 4.641068447412354e-05, + "loss": 0.4998, + "step": 2306 + }, + { + "epoch": 0.0008097783455547443, + "grad_norm": 0.312492311000824, + "learning_rate": 4.6343906510851423e-05, + "loss": 0.4926, + "step": 2307 + }, + { + "epoch": 0.0008101293548072604, + "grad_norm": 0.29779669642448425, + "learning_rate": 4.6277128547579303e-05, + "loss": 0.4746, + "step": 2308 + }, + { + "epoch": 0.0008104803640597765, + "grad_norm": 0.3351886570453644, + "learning_rate": 4.621035058430718e-05, + "loss": 0.4445, + "step": 2309 + }, + { + "epoch": 0.0008108313733122927, + "grad_norm": 0.8489035367965698, + "learning_rate": 4.6143572621035056e-05, + "loss": 0.5599, + "step": 2310 + }, + { + "epoch": 0.0008111823825648089, + "grad_norm": 0.31646668910980225, + "learning_rate": 4.6076794657762936e-05, + "loss": 0.4444, + "step": 2311 + }, + { + "epoch": 0.0008115333918173249, + "grad_norm": 0.294809490442276, + "learning_rate": 4.601001669449082e-05, + "loss": 0.4919, + "step": 2312 + }, + { + "epoch": 0.0008118844010698411, + "grad_norm": 0.3671543598175049, + "learning_rate": 4.59432387312187e-05, + "loss": 0.4802, + "step": 2313 + }, + { + "epoch": 0.0008122354103223573, + "grad_norm": 0.2710740268230438, + "learning_rate": 4.5876460767946576e-05, + "loss": 0.3985, + "step": 2314 + }, + { + "epoch": 0.0008125864195748733, + "grad_norm": 0.32188868522644043, + "learning_rate": 4.5809682804674456e-05, + "loss": 0.4829, + "step": 2315 + }, + { + "epoch": 0.0008129374288273895, + "grad_norm": 0.3944168984889984, + "learning_rate": 4.574290484140234e-05, + "loss": 0.4558, + "step": 2316 + }, + { + "epoch": 0.0008132884380799057, + "grad_norm": 0.3056366741657257, + "learning_rate": 4.567612687813022e-05, + "loss": 0.5274, + "step": 2317 + }, + { + "epoch": 0.0008136394473324219, + "grad_norm": 0.9373723864555359, + "learning_rate": 4.5609348914858096e-05, + "loss": 0.5398, + "step": 2318 + }, + { + "epoch": 0.0008139904565849379, + "grad_norm": 0.26745036244392395, + "learning_rate": 4.5542570951585976e-05, + "loss": 0.4388, + "step": 2319 + }, + { + "epoch": 0.0008143414658374541, + "grad_norm": 0.30698806047439575, + "learning_rate": 4.5475792988313856e-05, + "loss": 0.4167, + "step": 2320 + }, + { + "epoch": 0.0008146924750899703, + "grad_norm": 0.36348575353622437, + "learning_rate": 4.540901502504174e-05, + "loss": 0.421, + "step": 2321 + }, + { + "epoch": 0.0008150434843424864, + "grad_norm": 0.49959614872932434, + "learning_rate": 4.5342237061769616e-05, + "loss": 0.3835, + "step": 2322 + }, + { + "epoch": 0.0008153944935950025, + "grad_norm": 0.3920055329799652, + "learning_rate": 4.5275459098497496e-05, + "loss": 0.4392, + "step": 2323 + }, + { + "epoch": 0.0008157455028475187, + "grad_norm": 0.3473761975765228, + "learning_rate": 4.5208681135225376e-05, + "loss": 0.5014, + "step": 2324 + }, + { + "epoch": 0.0008160965121000348, + "grad_norm": 0.29744240641593933, + "learning_rate": 4.514190317195326e-05, + "loss": 0.5175, + "step": 2325 + }, + { + "epoch": 0.000816447521352551, + "grad_norm": 0.35290253162384033, + "learning_rate": 4.5075125208681135e-05, + "loss": 0.5991, + "step": 2326 + }, + { + "epoch": 0.0008167985306050671, + "grad_norm": 0.3837706446647644, + "learning_rate": 4.5008347245409015e-05, + "loss": 0.5866, + "step": 2327 + }, + { + "epoch": 0.0008171495398575833, + "grad_norm": 0.321729451417923, + "learning_rate": 4.4941569282136895e-05, + "loss": 0.5387, + "step": 2328 + }, + { + "epoch": 0.0008175005491100994, + "grad_norm": 0.6311901211738586, + "learning_rate": 4.4874791318864775e-05, + "loss": 0.5239, + "step": 2329 + }, + { + "epoch": 0.0008178515583626156, + "grad_norm": 0.6958840489387512, + "learning_rate": 4.4808013355592655e-05, + "loss": 0.568, + "step": 2330 + }, + { + "epoch": 0.0008182025676151317, + "grad_norm": 0.5229877829551697, + "learning_rate": 4.4741235392320535e-05, + "loss": 0.6472, + "step": 2331 + }, + { + "epoch": 0.0008185535768676478, + "grad_norm": 0.3351100981235504, + "learning_rate": 4.4674457429048415e-05, + "loss": 0.5663, + "step": 2332 + }, + { + "epoch": 0.000818904586120164, + "grad_norm": 0.3409821689128876, + "learning_rate": 4.4607679465776295e-05, + "loss": 0.4835, + "step": 2333 + }, + { + "epoch": 0.0008192555953726802, + "grad_norm": 0.3333572745323181, + "learning_rate": 4.4540901502504175e-05, + "loss": 0.5457, + "step": 2334 + }, + { + "epoch": 0.0008196066046251962, + "grad_norm": 0.45605313777923584, + "learning_rate": 4.4474123539232055e-05, + "loss": 0.5375, + "step": 2335 + }, + { + "epoch": 0.0008199576138777124, + "grad_norm": 0.2985444664955139, + "learning_rate": 4.4407345575959935e-05, + "loss": 0.5018, + "step": 2336 + }, + { + "epoch": 0.0008203086231302286, + "grad_norm": 0.480658620595932, + "learning_rate": 4.4340567612687815e-05, + "loss": 0.5736, + "step": 2337 + }, + { + "epoch": 0.0008206596323827448, + "grad_norm": 0.38944509625434875, + "learning_rate": 4.4273789649415695e-05, + "loss": 0.449, + "step": 2338 + }, + { + "epoch": 0.0008210106416352608, + "grad_norm": 0.3390035629272461, + "learning_rate": 4.4207011686143574e-05, + "loss": 0.595, + "step": 2339 + }, + { + "epoch": 0.000821361650887777, + "grad_norm": 0.3503229022026062, + "learning_rate": 4.4140233722871454e-05, + "loss": 0.5451, + "step": 2340 + }, + { + "epoch": 0.0008217126601402932, + "grad_norm": 0.29299256205558777, + "learning_rate": 4.4073455759599334e-05, + "loss": 0.5404, + "step": 2341 + }, + { + "epoch": 0.0008220636693928092, + "grad_norm": 0.35951006412506104, + "learning_rate": 4.4006677796327214e-05, + "loss": 0.6232, + "step": 2342 + }, + { + "epoch": 0.0008224146786453254, + "grad_norm": 0.3211289346218109, + "learning_rate": 4.3939899833055094e-05, + "loss": 0.5601, + "step": 2343 + }, + { + "epoch": 0.0008227656878978416, + "grad_norm": 0.3218986392021179, + "learning_rate": 4.3873121869782974e-05, + "loss": 0.5392, + "step": 2344 + }, + { + "epoch": 0.0008231166971503577, + "grad_norm": 0.29046937823295593, + "learning_rate": 4.3806343906510854e-05, + "loss": 0.3494, + "step": 2345 + }, + { + "epoch": 0.0008234677064028738, + "grad_norm": 0.33025527000427246, + "learning_rate": 4.373956594323873e-05, + "loss": 0.4713, + "step": 2346 + }, + { + "epoch": 0.00082381871565539, + "grad_norm": 0.3046811521053314, + "learning_rate": 4.3672787979966614e-05, + "loss": 0.5873, + "step": 2347 + }, + { + "epoch": 0.0008241697249079062, + "grad_norm": 0.330526202917099, + "learning_rate": 4.3606010016694494e-05, + "loss": 0.5937, + "step": 2348 + }, + { + "epoch": 0.0008245207341604223, + "grad_norm": 0.309096097946167, + "learning_rate": 4.3539232053422374e-05, + "loss": 0.5905, + "step": 2349 + }, + { + "epoch": 0.0008248717434129384, + "grad_norm": 0.2798556685447693, + "learning_rate": 4.3472454090150254e-05, + "loss": 0.4126, + "step": 2350 + }, + { + "epoch": 0.0008252227526654546, + "grad_norm": 0.3218364417552948, + "learning_rate": 4.3405676126878134e-05, + "loss": 0.447, + "step": 2351 + }, + { + "epoch": 0.0008255737619179707, + "grad_norm": 0.32477137446403503, + "learning_rate": 4.3338898163606014e-05, + "loss": 0.5111, + "step": 2352 + }, + { + "epoch": 0.0008259247711704869, + "grad_norm": 0.32486987113952637, + "learning_rate": 4.3272120200333893e-05, + "loss": 0.4991, + "step": 2353 + }, + { + "epoch": 0.000826275780423003, + "grad_norm": 0.26125961542129517, + "learning_rate": 4.3205342237061773e-05, + "loss": 0.507, + "step": 2354 + }, + { + "epoch": 0.0008266267896755191, + "grad_norm": 0.29981791973114014, + "learning_rate": 4.313856427378965e-05, + "loss": 0.5704, + "step": 2355 + }, + { + "epoch": 0.0008269777989280353, + "grad_norm": 0.4315311014652252, + "learning_rate": 4.307178631051753e-05, + "loss": 0.4945, + "step": 2356 + }, + { + "epoch": 0.0008273288081805515, + "grad_norm": 0.2862604260444641, + "learning_rate": 4.300500834724541e-05, + "loss": 0.5129, + "step": 2357 + }, + { + "epoch": 0.0008276798174330676, + "grad_norm": 0.3008829951286316, + "learning_rate": 4.293823038397329e-05, + "loss": 0.4608, + "step": 2358 + }, + { + "epoch": 0.0008280308266855837, + "grad_norm": 0.3753371834754944, + "learning_rate": 4.2871452420701166e-05, + "loss": 0.3947, + "step": 2359 + }, + { + "epoch": 0.0008283818359380999, + "grad_norm": 0.310059130191803, + "learning_rate": 4.280467445742905e-05, + "loss": 0.487, + "step": 2360 + }, + { + "epoch": 0.0008287328451906161, + "grad_norm": 0.29558148980140686, + "learning_rate": 4.273789649415693e-05, + "loss": 0.4524, + "step": 2361 + }, + { + "epoch": 0.0008290838544431321, + "grad_norm": 0.3092529773712158, + "learning_rate": 4.267111853088481e-05, + "loss": 0.558, + "step": 2362 + }, + { + "epoch": 0.0008294348636956483, + "grad_norm": 0.3629109263420105, + "learning_rate": 4.2604340567612686e-05, + "loss": 0.5566, + "step": 2363 + }, + { + "epoch": 0.0008297858729481645, + "grad_norm": 0.3263145983219147, + "learning_rate": 4.253756260434057e-05, + "loss": 0.5271, + "step": 2364 + }, + { + "epoch": 0.0008301368822006805, + "grad_norm": 0.32853761315345764, + "learning_rate": 4.247078464106845e-05, + "loss": 0.5364, + "step": 2365 + }, + { + "epoch": 0.0008304878914531967, + "grad_norm": 0.29384636878967285, + "learning_rate": 4.240400667779633e-05, + "loss": 0.4424, + "step": 2366 + }, + { + "epoch": 0.0008308389007057129, + "grad_norm": 0.30362242460250854, + "learning_rate": 4.2337228714524206e-05, + "loss": 0.5283, + "step": 2367 + }, + { + "epoch": 0.0008311899099582291, + "grad_norm": 0.2768915295600891, + "learning_rate": 4.2270450751252086e-05, + "loss": 0.4866, + "step": 2368 + }, + { + "epoch": 0.0008315409192107451, + "grad_norm": 0.30676960945129395, + "learning_rate": 4.220367278797997e-05, + "loss": 0.4834, + "step": 2369 + }, + { + "epoch": 0.0008318919284632613, + "grad_norm": 0.34929925203323364, + "learning_rate": 4.213689482470785e-05, + "loss": 0.5445, + "step": 2370 + }, + { + "epoch": 0.0008322429377157775, + "grad_norm": 0.2859930396080017, + "learning_rate": 4.2070116861435725e-05, + "loss": 0.4663, + "step": 2371 + }, + { + "epoch": 0.0008325939469682936, + "grad_norm": 0.3314751088619232, + "learning_rate": 4.2003338898163605e-05, + "loss": 0.6377, + "step": 2372 + }, + { + "epoch": 0.0008329449562208097, + "grad_norm": 0.2735826373100281, + "learning_rate": 4.193656093489149e-05, + "loss": 0.3718, + "step": 2373 + }, + { + "epoch": 0.0008332959654733259, + "grad_norm": 0.3017156422138214, + "learning_rate": 4.186978297161937e-05, + "loss": 0.5561, + "step": 2374 + }, + { + "epoch": 0.000833646974725842, + "grad_norm": 0.28279563784599304, + "learning_rate": 4.1803005008347245e-05, + "loss": 0.4622, + "step": 2375 + }, + { + "epoch": 0.0008339979839783582, + "grad_norm": 0.3143702745437622, + "learning_rate": 4.1736227045075125e-05, + "loss": 0.538, + "step": 2376 + }, + { + "epoch": 0.0008343489932308743, + "grad_norm": 0.33771878480911255, + "learning_rate": 4.1669449081803005e-05, + "loss": 0.5423, + "step": 2377 + }, + { + "epoch": 0.0008347000024833905, + "grad_norm": 0.32004043459892273, + "learning_rate": 4.160267111853089e-05, + "loss": 0.5785, + "step": 2378 + }, + { + "epoch": 0.0008350510117359066, + "grad_norm": 0.3358834981918335, + "learning_rate": 4.1535893155258765e-05, + "loss": 0.6388, + "step": 2379 + }, + { + "epoch": 0.0008354020209884228, + "grad_norm": 0.3659215271472931, + "learning_rate": 4.1469115191986645e-05, + "loss": 0.5694, + "step": 2380 + }, + { + "epoch": 0.0008357530302409389, + "grad_norm": 0.29528388381004333, + "learning_rate": 4.1402337228714525e-05, + "loss": 0.4486, + "step": 2381 + }, + { + "epoch": 0.000836104039493455, + "grad_norm": 0.299845814704895, + "learning_rate": 4.133555926544241e-05, + "loss": 0.4146, + "step": 2382 + }, + { + "epoch": 0.0008364550487459712, + "grad_norm": 0.28873342275619507, + "learning_rate": 4.1268781302170285e-05, + "loss": 0.4332, + "step": 2383 + }, + { + "epoch": 0.0008368060579984874, + "grad_norm": 0.3562033176422119, + "learning_rate": 4.1202003338898165e-05, + "loss": 0.4753, + "step": 2384 + }, + { + "epoch": 0.0008371570672510034, + "grad_norm": 0.28127390146255493, + "learning_rate": 4.1135225375626044e-05, + "loss": 0.4853, + "step": 2385 + }, + { + "epoch": 0.0008375080765035196, + "grad_norm": 0.33200159668922424, + "learning_rate": 4.1068447412353924e-05, + "loss": 0.4871, + "step": 2386 + }, + { + "epoch": 0.0008378590857560358, + "grad_norm": 0.3686981499195099, + "learning_rate": 4.1001669449081804e-05, + "loss": 0.5417, + "step": 2387 + }, + { + "epoch": 0.000838210095008552, + "grad_norm": 0.3078843057155609, + "learning_rate": 4.0934891485809684e-05, + "loss": 0.5379, + "step": 2388 + }, + { + "epoch": 0.000838561104261068, + "grad_norm": 0.29550114274024963, + "learning_rate": 4.0868113522537564e-05, + "loss": 0.5334, + "step": 2389 + }, + { + "epoch": 0.0008389121135135842, + "grad_norm": 0.31512629985809326, + "learning_rate": 4.0801335559265444e-05, + "loss": 0.4291, + "step": 2390 + }, + { + "epoch": 0.0008392631227661004, + "grad_norm": 0.30229613184928894, + "learning_rate": 4.0734557595993324e-05, + "loss": 0.5583, + "step": 2391 + }, + { + "epoch": 0.0008396141320186164, + "grad_norm": 0.34097641706466675, + "learning_rate": 4.0667779632721204e-05, + "loss": 0.4972, + "step": 2392 + }, + { + "epoch": 0.0008399651412711326, + "grad_norm": 0.3585929274559021, + "learning_rate": 4.0601001669449084e-05, + "loss": 0.4456, + "step": 2393 + }, + { + "epoch": 0.0008403161505236488, + "grad_norm": 0.30176597833633423, + "learning_rate": 4.0534223706176964e-05, + "loss": 0.462, + "step": 2394 + }, + { + "epoch": 0.0008406671597761649, + "grad_norm": 0.31252893805503845, + "learning_rate": 4.0467445742904844e-05, + "loss": 0.4703, + "step": 2395 + }, + { + "epoch": 0.000841018169028681, + "grad_norm": 0.3262486159801483, + "learning_rate": 4.0400667779632724e-05, + "loss": 0.5536, + "step": 2396 + }, + { + "epoch": 0.0008413691782811972, + "grad_norm": 0.2762390077114105, + "learning_rate": 4.0333889816360604e-05, + "loss": 0.4422, + "step": 2397 + }, + { + "epoch": 0.0008417201875337133, + "grad_norm": 0.3413786292076111, + "learning_rate": 4.0267111853088484e-05, + "loss": 0.4447, + "step": 2398 + }, + { + "epoch": 0.0008420711967862295, + "grad_norm": 0.31144657731056213, + "learning_rate": 4.0200333889816363e-05, + "loss": 0.4937, + "step": 2399 + }, + { + "epoch": 0.0008424222060387456, + "grad_norm": 0.3274284899234772, + "learning_rate": 4.0133555926544243e-05, + "loss": 0.5868, + "step": 2400 + }, + { + "epoch": 0.0008427732152912618, + "grad_norm": 0.3613366186618805, + "learning_rate": 4.006677796327212e-05, + "loss": 0.4489, + "step": 2401 + }, + { + "epoch": 0.0008431242245437779, + "grad_norm": 0.38178175687789917, + "learning_rate": 4e-05, + "loss": 0.576, + "step": 2402 + }, + { + "epoch": 0.0008434752337962941, + "grad_norm": 0.35652783513069153, + "learning_rate": 3.9933222036727876e-05, + "loss": 0.6034, + "step": 2403 + }, + { + "epoch": 0.0008438262430488102, + "grad_norm": 0.3658648431301117, + "learning_rate": 3.986644407345576e-05, + "loss": 0.5243, + "step": 2404 + }, + { + "epoch": 0.0008441772523013263, + "grad_norm": 0.30486276745796204, + "learning_rate": 3.979966611018364e-05, + "loss": 0.4327, + "step": 2405 + }, + { + "epoch": 0.0008445282615538425, + "grad_norm": 0.2804754376411438, + "learning_rate": 3.973288814691152e-05, + "loss": 0.4825, + "step": 2406 + }, + { + "epoch": 0.0008448792708063587, + "grad_norm": 0.28429001569747925, + "learning_rate": 3.9666110183639396e-05, + "loss": 0.4161, + "step": 2407 + }, + { + "epoch": 0.0008452302800588747, + "grad_norm": 0.30368781089782715, + "learning_rate": 3.959933222036728e-05, + "loss": 0.5696, + "step": 2408 + }, + { + "epoch": 0.0008455812893113909, + "grad_norm": 0.33198389410972595, + "learning_rate": 3.953255425709516e-05, + "loss": 0.5458, + "step": 2409 + }, + { + "epoch": 0.0008459322985639071, + "grad_norm": 0.2976115942001343, + "learning_rate": 3.946577629382304e-05, + "loss": 0.5013, + "step": 2410 + }, + { + "epoch": 0.0008462833078164233, + "grad_norm": 0.34938329458236694, + "learning_rate": 3.9398998330550916e-05, + "loss": 0.6064, + "step": 2411 + }, + { + "epoch": 0.0008466343170689393, + "grad_norm": 0.30314376950263977, + "learning_rate": 3.93322203672788e-05, + "loss": 0.5154, + "step": 2412 + }, + { + "epoch": 0.0008469853263214555, + "grad_norm": 0.30583375692367554, + "learning_rate": 3.926544240400668e-05, + "loss": 0.5362, + "step": 2413 + }, + { + "epoch": 0.0008473363355739717, + "grad_norm": 0.3435641825199127, + "learning_rate": 3.919866444073456e-05, + "loss": 0.4087, + "step": 2414 + }, + { + "epoch": 0.0008476873448264877, + "grad_norm": 0.3141246736049652, + "learning_rate": 3.9131886477462436e-05, + "loss": 0.5155, + "step": 2415 + }, + { + "epoch": 0.0008480383540790039, + "grad_norm": 0.301431804895401, + "learning_rate": 3.9065108514190316e-05, + "loss": 0.4345, + "step": 2416 + }, + { + "epoch": 0.0008483893633315201, + "grad_norm": 0.2610575556755066, + "learning_rate": 3.89983305509182e-05, + "loss": 0.5468, + "step": 2417 + }, + { + "epoch": 0.0008487403725840362, + "grad_norm": 0.30231544375419617, + "learning_rate": 3.893155258764608e-05, + "loss": 0.4617, + "step": 2418 + }, + { + "epoch": 0.0008490913818365523, + "grad_norm": 0.3302491307258606, + "learning_rate": 3.8864774624373955e-05, + "loss": 0.5269, + "step": 2419 + }, + { + "epoch": 0.0008494423910890685, + "grad_norm": 0.31854262948036194, + "learning_rate": 3.8797996661101835e-05, + "loss": 0.5081, + "step": 2420 + }, + { + "epoch": 0.0008497934003415847, + "grad_norm": 0.356121689081192, + "learning_rate": 3.873121869782972e-05, + "loss": 0.5109, + "step": 2421 + }, + { + "epoch": 0.0008501444095941008, + "grad_norm": 0.3252284526824951, + "learning_rate": 3.86644407345576e-05, + "loss": 0.5355, + "step": 2422 + }, + { + "epoch": 0.0008504954188466169, + "grad_norm": 0.3570926785469055, + "learning_rate": 3.8597662771285475e-05, + "loss": 0.3933, + "step": 2423 + }, + { + "epoch": 0.0008508464280991331, + "grad_norm": 0.41406819224357605, + "learning_rate": 3.8530884808013355e-05, + "loss": 0.5708, + "step": 2424 + }, + { + "epoch": 0.0008511974373516492, + "grad_norm": 0.26306653022766113, + "learning_rate": 3.8464106844741235e-05, + "loss": 0.4166, + "step": 2425 + }, + { + "epoch": 0.0008515484466041654, + "grad_norm": 0.32971739768981934, + "learning_rate": 3.839732888146912e-05, + "loss": 0.4723, + "step": 2426 + }, + { + "epoch": 0.0008518994558566815, + "grad_norm": 0.3209386467933655, + "learning_rate": 3.8330550918196995e-05, + "loss": 0.4646, + "step": 2427 + }, + { + "epoch": 0.0008522504651091976, + "grad_norm": 0.40913888812065125, + "learning_rate": 3.8263772954924875e-05, + "loss": 0.4713, + "step": 2428 + }, + { + "epoch": 0.0008526014743617138, + "grad_norm": 0.3265860974788666, + "learning_rate": 3.8196994991652755e-05, + "loss": 0.4628, + "step": 2429 + }, + { + "epoch": 0.00085295248361423, + "grad_norm": 0.3348692059516907, + "learning_rate": 3.813021702838064e-05, + "loss": 0.5985, + "step": 2430 + }, + { + "epoch": 0.0008533034928667461, + "grad_norm": 0.31986677646636963, + "learning_rate": 3.8063439065108514e-05, + "loss": 0.6139, + "step": 2431 + }, + { + "epoch": 0.0008536545021192622, + "grad_norm": 0.35525721311569214, + "learning_rate": 3.7996661101836394e-05, + "loss": 0.5216, + "step": 2432 + }, + { + "epoch": 0.0008540055113717784, + "grad_norm": 0.3543768525123596, + "learning_rate": 3.7929883138564274e-05, + "loss": 0.52, + "step": 2433 + }, + { + "epoch": 0.0008543565206242946, + "grad_norm": 0.31203389167785645, + "learning_rate": 3.7863105175292154e-05, + "loss": 0.4891, + "step": 2434 + }, + { + "epoch": 0.0008547075298768106, + "grad_norm": 0.30776453018188477, + "learning_rate": 3.7796327212020034e-05, + "loss": 0.4326, + "step": 2435 + }, + { + "epoch": 0.0008550585391293268, + "grad_norm": 0.29725879430770874, + "learning_rate": 3.7729549248747914e-05, + "loss": 0.3621, + "step": 2436 + }, + { + "epoch": 0.000855409548381843, + "grad_norm": 0.3332844376564026, + "learning_rate": 3.7662771285475794e-05, + "loss": 0.5518, + "step": 2437 + }, + { + "epoch": 0.000855760557634359, + "grad_norm": 0.34597867727279663, + "learning_rate": 3.7595993322203674e-05, + "loss": 0.5084, + "step": 2438 + }, + { + "epoch": 0.0008561115668868752, + "grad_norm": 0.3425275981426239, + "learning_rate": 3.7529215358931554e-05, + "loss": 0.5683, + "step": 2439 + }, + { + "epoch": 0.0008564625761393914, + "grad_norm": 0.35414308309555054, + "learning_rate": 3.7462437395659434e-05, + "loss": 0.5962, + "step": 2440 + }, + { + "epoch": 0.0008568135853919076, + "grad_norm": 0.31397873163223267, + "learning_rate": 3.7395659432387314e-05, + "loss": 0.5067, + "step": 2441 + }, + { + "epoch": 0.0008571645946444236, + "grad_norm": 0.3142837584018707, + "learning_rate": 3.7328881469115194e-05, + "loss": 0.5437, + "step": 2442 + }, + { + "epoch": 0.0008575156038969398, + "grad_norm": 0.3198903501033783, + "learning_rate": 3.7262103505843074e-05, + "loss": 0.4864, + "step": 2443 + }, + { + "epoch": 0.000857866613149456, + "grad_norm": 0.37642693519592285, + "learning_rate": 3.7195325542570954e-05, + "loss": 0.4489, + "step": 2444 + }, + { + "epoch": 0.0008582176224019721, + "grad_norm": 0.31032124161720276, + "learning_rate": 3.7128547579298833e-05, + "loss": 0.3956, + "step": 2445 + }, + { + "epoch": 0.0008585686316544882, + "grad_norm": 0.2642196714878082, + "learning_rate": 3.7061769616026713e-05, + "loss": 0.3515, + "step": 2446 + }, + { + "epoch": 0.0008589196409070044, + "grad_norm": 0.2694128751754761, + "learning_rate": 3.699499165275459e-05, + "loss": 0.4628, + "step": 2447 + }, + { + "epoch": 0.0008592706501595205, + "grad_norm": 0.4253450632095337, + "learning_rate": 3.692821368948247e-05, + "loss": 0.5667, + "step": 2448 + }, + { + "epoch": 0.0008596216594120367, + "grad_norm": 0.32464760541915894, + "learning_rate": 3.686143572621035e-05, + "loss": 0.5857, + "step": 2449 + }, + { + "epoch": 0.0008599726686645528, + "grad_norm": 0.298491507768631, + "learning_rate": 3.679465776293823e-05, + "loss": 0.4721, + "step": 2450 + }, + { + "epoch": 0.000860323677917069, + "grad_norm": 0.36551931500434875, + "learning_rate": 3.6727879799666106e-05, + "loss": 0.491, + "step": 2451 + }, + { + "epoch": 0.0008606746871695851, + "grad_norm": 0.3350832164287567, + "learning_rate": 3.666110183639399e-05, + "loss": 0.5654, + "step": 2452 + }, + { + "epoch": 0.0008610256964221013, + "grad_norm": 0.34928208589553833, + "learning_rate": 3.659432387312187e-05, + "loss": 0.5588, + "step": 2453 + }, + { + "epoch": 0.0008613767056746174, + "grad_norm": 0.32251986861228943, + "learning_rate": 3.652754590984975e-05, + "loss": 0.4846, + "step": 2454 + }, + { + "epoch": 0.0008617277149271335, + "grad_norm": 0.3968466520309448, + "learning_rate": 3.6460767946577626e-05, + "loss": 0.5886, + "step": 2455 + }, + { + "epoch": 0.0008620787241796497, + "grad_norm": 0.3277405798435211, + "learning_rate": 3.639398998330551e-05, + "loss": 0.4697, + "step": 2456 + }, + { + "epoch": 0.0008624297334321659, + "grad_norm": 0.3197111487388611, + "learning_rate": 3.632721202003339e-05, + "loss": 0.4728, + "step": 2457 + }, + { + "epoch": 0.0008627807426846819, + "grad_norm": 0.30383023619651794, + "learning_rate": 3.626043405676127e-05, + "loss": 0.3914, + "step": 2458 + }, + { + "epoch": 0.0008631317519371981, + "grad_norm": 0.3400476276874542, + "learning_rate": 3.6193656093489146e-05, + "loss": 0.5023, + "step": 2459 + }, + { + "epoch": 0.0008634827611897143, + "grad_norm": 0.5489293932914734, + "learning_rate": 3.6126878130217026e-05, + "loss": 0.5215, + "step": 2460 + }, + { + "epoch": 0.0008638337704422305, + "grad_norm": 0.29822349548339844, + "learning_rate": 3.606010016694491e-05, + "loss": 0.5606, + "step": 2461 + }, + { + "epoch": 0.0008641847796947465, + "grad_norm": 0.35215723514556885, + "learning_rate": 3.599332220367279e-05, + "loss": 0.5509, + "step": 2462 + }, + { + "epoch": 0.0008645357889472627, + "grad_norm": 0.307216614484787, + "learning_rate": 3.5926544240400665e-05, + "loss": 0.4325, + "step": 2463 + }, + { + "epoch": 0.0008648867981997789, + "grad_norm": 0.31825220584869385, + "learning_rate": 3.5859766277128545e-05, + "loss": 0.4299, + "step": 2464 + }, + { + "epoch": 0.000865237807452295, + "grad_norm": 0.3078344166278839, + "learning_rate": 3.579298831385643e-05, + "loss": 0.5945, + "step": 2465 + }, + { + "epoch": 0.0008655888167048111, + "grad_norm": 0.29364824295043945, + "learning_rate": 3.572621035058431e-05, + "loss": 0.5109, + "step": 2466 + }, + { + "epoch": 0.0008659398259573273, + "grad_norm": 0.364878386259079, + "learning_rate": 3.5659432387312185e-05, + "loss": 0.5408, + "step": 2467 + }, + { + "epoch": 0.0008662908352098434, + "grad_norm": 0.32669126987457275, + "learning_rate": 3.5592654424040065e-05, + "loss": 0.5167, + "step": 2468 + }, + { + "epoch": 0.0008666418444623595, + "grad_norm": 0.3356972634792328, + "learning_rate": 3.5525876460767945e-05, + "loss": 0.6229, + "step": 2469 + }, + { + "epoch": 0.0008669928537148757, + "grad_norm": 0.3334660232067108, + "learning_rate": 3.545909849749583e-05, + "loss": 0.4762, + "step": 2470 + }, + { + "epoch": 0.0008673438629673919, + "grad_norm": 0.33314821124076843, + "learning_rate": 3.5392320534223705e-05, + "loss": 0.5889, + "step": 2471 + }, + { + "epoch": 0.000867694872219908, + "grad_norm": 0.2715354263782501, + "learning_rate": 3.5325542570951585e-05, + "loss": 0.4532, + "step": 2472 + }, + { + "epoch": 0.0008680458814724241, + "grad_norm": 0.3389108180999756, + "learning_rate": 3.5258764607679465e-05, + "loss": 0.5293, + "step": 2473 + }, + { + "epoch": 0.0008683968907249403, + "grad_norm": 0.28182253241539, + "learning_rate": 3.519198664440735e-05, + "loss": 0.4768, + "step": 2474 + }, + { + "epoch": 0.0008687478999774564, + "grad_norm": 0.3153379261493683, + "learning_rate": 3.5125208681135225e-05, + "loss": 0.3779, + "step": 2475 + }, + { + "epoch": 0.0008690989092299726, + "grad_norm": 0.3339671492576599, + "learning_rate": 3.5058430717863105e-05, + "loss": 0.4738, + "step": 2476 + }, + { + "epoch": 0.0008694499184824887, + "grad_norm": 0.3346128463745117, + "learning_rate": 3.4991652754590984e-05, + "loss": 0.4353, + "step": 2477 + }, + { + "epoch": 0.0008698009277350048, + "grad_norm": 0.33985427021980286, + "learning_rate": 3.492487479131887e-05, + "loss": 0.5535, + "step": 2478 + }, + { + "epoch": 0.000870151936987521, + "grad_norm": 0.36896049976348877, + "learning_rate": 3.4858096828046744e-05, + "loss": 0.4593, + "step": 2479 + }, + { + "epoch": 0.0008705029462400372, + "grad_norm": 0.3066719174385071, + "learning_rate": 3.4791318864774624e-05, + "loss": 0.5199, + "step": 2480 + }, + { + "epoch": 0.0008708539554925533, + "grad_norm": 0.28390833735466003, + "learning_rate": 3.4724540901502504e-05, + "loss": 0.4747, + "step": 2481 + }, + { + "epoch": 0.0008712049647450694, + "grad_norm": 0.3579369783401489, + "learning_rate": 3.4657762938230384e-05, + "loss": 0.342, + "step": 2482 + }, + { + "epoch": 0.0008715559739975856, + "grad_norm": 0.2909548282623291, + "learning_rate": 3.459098497495827e-05, + "loss": 0.525, + "step": 2483 + }, + { + "epoch": 0.0008719069832501018, + "grad_norm": 0.31367257237434387, + "learning_rate": 3.4524207011686144e-05, + "loss": 0.5744, + "step": 2484 + }, + { + "epoch": 0.0008722579925026178, + "grad_norm": 0.3309953510761261, + "learning_rate": 3.4457429048414024e-05, + "loss": 0.3773, + "step": 2485 + }, + { + "epoch": 0.000872609001755134, + "grad_norm": 0.32469210028648376, + "learning_rate": 3.4390651085141904e-05, + "loss": 0.5586, + "step": 2486 + }, + { + "epoch": 0.0008729600110076502, + "grad_norm": 0.3475576341152191, + "learning_rate": 3.432387312186979e-05, + "loss": 0.5238, + "step": 2487 + }, + { + "epoch": 0.0008733110202601663, + "grad_norm": 0.2654307782649994, + "learning_rate": 3.4257095158597664e-05, + "loss": 0.4142, + "step": 2488 + }, + { + "epoch": 0.0008736620295126824, + "grad_norm": 0.3001498579978943, + "learning_rate": 3.4190317195325544e-05, + "loss": 0.5085, + "step": 2489 + }, + { + "epoch": 0.0008740130387651986, + "grad_norm": 0.36860695481300354, + "learning_rate": 3.4123539232053424e-05, + "loss": 0.5357, + "step": 2490 + }, + { + "epoch": 0.0008743640480177148, + "grad_norm": 0.3456466794013977, + "learning_rate": 3.4056761268781303e-05, + "loss": 0.5146, + "step": 2491 + }, + { + "epoch": 0.0008747150572702309, + "grad_norm": 0.33204081654548645, + "learning_rate": 3.3989983305509183e-05, + "loss": 0.5977, + "step": 2492 + }, + { + "epoch": 0.000875066066522747, + "grad_norm": 0.3318590819835663, + "learning_rate": 3.392320534223706e-05, + "loss": 0.3919, + "step": 2493 + }, + { + "epoch": 0.0008754170757752632, + "grad_norm": 0.3074159324169159, + "learning_rate": 3.385642737896494e-05, + "loss": 0.4548, + "step": 2494 + }, + { + "epoch": 0.0008757680850277793, + "grad_norm": 0.33519870042800903, + "learning_rate": 3.378964941569282e-05, + "loss": 0.5339, + "step": 2495 + }, + { + "epoch": 0.0008761190942802954, + "grad_norm": 0.2852168679237366, + "learning_rate": 3.37228714524207e-05, + "loss": 0.4363, + "step": 2496 + }, + { + "epoch": 0.0008764701035328116, + "grad_norm": 0.3491702973842621, + "learning_rate": 3.365609348914858e-05, + "loss": 0.5674, + "step": 2497 + }, + { + "epoch": 0.0008768211127853277, + "grad_norm": 0.350176066160202, + "learning_rate": 3.358931552587646e-05, + "loss": 0.4088, + "step": 2498 + }, + { + "epoch": 0.0008771721220378439, + "grad_norm": 0.37386786937713623, + "learning_rate": 3.352253756260434e-05, + "loss": 0.5911, + "step": 2499 + }, + { + "epoch": 0.00087752313129036, + "grad_norm": 0.33551308512687683, + "learning_rate": 3.345575959933222e-05, + "loss": 0.5333, + "step": 2500 + }, + { + "epoch": 0.0008778741405428762, + "grad_norm": 0.3414926826953888, + "learning_rate": 3.33889816360601e-05, + "loss": 0.5327, + "step": 2501 + }, + { + "epoch": 0.0008782251497953923, + "grad_norm": 0.3484536409378052, + "learning_rate": 3.332220367278798e-05, + "loss": 0.4738, + "step": 2502 + }, + { + "epoch": 0.0008785761590479085, + "grad_norm": 0.3408408463001251, + "learning_rate": 3.325542570951586e-05, + "loss": 0.5362, + "step": 2503 + }, + { + "epoch": 0.0008789271683004246, + "grad_norm": 0.31221267580986023, + "learning_rate": 3.318864774624374e-05, + "loss": 0.5036, + "step": 2504 + }, + { + "epoch": 0.0008792781775529407, + "grad_norm": 0.31918561458587646, + "learning_rate": 3.312186978297162e-05, + "loss": 0.5474, + "step": 2505 + }, + { + "epoch": 0.0008796291868054569, + "grad_norm": 0.29524701833724976, + "learning_rate": 3.30550918196995e-05, + "loss": 0.4776, + "step": 2506 + }, + { + "epoch": 0.0008799801960579731, + "grad_norm": 0.33439961075782776, + "learning_rate": 3.298831385642738e-05, + "loss": 0.6015, + "step": 2507 + }, + { + "epoch": 0.0008803312053104891, + "grad_norm": 0.33239129185676575, + "learning_rate": 3.2921535893155256e-05, + "loss": 0.5759, + "step": 2508 + }, + { + "epoch": 0.0008806822145630053, + "grad_norm": 0.4026282727718353, + "learning_rate": 3.285475792988314e-05, + "loss": 0.5148, + "step": 2509 + }, + { + "epoch": 0.0008810332238155215, + "grad_norm": 0.31234943866729736, + "learning_rate": 3.278797996661102e-05, + "loss": 0.4727, + "step": 2510 + }, + { + "epoch": 0.0008813842330680377, + "grad_norm": 0.28196650743484497, + "learning_rate": 3.27212020033389e-05, + "loss": 0.4748, + "step": 2511 + }, + { + "epoch": 0.0008817352423205537, + "grad_norm": 0.2474774271249771, + "learning_rate": 3.2654424040066775e-05, + "loss": 0.5209, + "step": 2512 + }, + { + "epoch": 0.0008820862515730699, + "grad_norm": 0.34046825766563416, + "learning_rate": 3.258764607679466e-05, + "loss": 0.5118, + "step": 2513 + }, + { + "epoch": 0.0008824372608255861, + "grad_norm": 0.30404943227767944, + "learning_rate": 3.252086811352254e-05, + "loss": 0.5228, + "step": 2514 + }, + { + "epoch": 0.0008827882700781022, + "grad_norm": 0.27264684438705444, + "learning_rate": 3.245409015025042e-05, + "loss": 0.5414, + "step": 2515 + }, + { + "epoch": 0.0008831392793306183, + "grad_norm": 0.3055272400379181, + "learning_rate": 3.2387312186978295e-05, + "loss": 0.5167, + "step": 2516 + }, + { + "epoch": 0.0008834902885831345, + "grad_norm": 0.32805371284484863, + "learning_rate": 3.2320534223706175e-05, + "loss": 0.5108, + "step": 2517 + }, + { + "epoch": 0.0008838412978356506, + "grad_norm": 0.3217853009700775, + "learning_rate": 3.225375626043406e-05, + "loss": 0.5826, + "step": 2518 + }, + { + "epoch": 0.0008841923070881668, + "grad_norm": 0.3441222012042999, + "learning_rate": 3.218697829716194e-05, + "loss": 0.6089, + "step": 2519 + }, + { + "epoch": 0.0008845433163406829, + "grad_norm": 0.3731807768344879, + "learning_rate": 3.2120200333889815e-05, + "loss": 0.63, + "step": 2520 + }, + { + "epoch": 0.0008848943255931991, + "grad_norm": 0.3741554319858551, + "learning_rate": 3.2053422370617695e-05, + "loss": 0.398, + "step": 2521 + }, + { + "epoch": 0.0008852453348457152, + "grad_norm": 0.288764625787735, + "learning_rate": 3.198664440734558e-05, + "loss": 0.4692, + "step": 2522 + }, + { + "epoch": 0.0008855963440982313, + "grad_norm": 0.32524964213371277, + "learning_rate": 3.191986644407346e-05, + "loss": 0.4629, + "step": 2523 + }, + { + "epoch": 0.0008859473533507475, + "grad_norm": 0.2727656364440918, + "learning_rate": 3.1853088480801334e-05, + "loss": 0.3868, + "step": 2524 + }, + { + "epoch": 0.0008862983626032636, + "grad_norm": 0.29613935947418213, + "learning_rate": 3.1786310517529214e-05, + "loss": 0.4708, + "step": 2525 + }, + { + "epoch": 0.0008866493718557798, + "grad_norm": 0.278096079826355, + "learning_rate": 3.1719532554257094e-05, + "loss": 0.5313, + "step": 2526 + }, + { + "epoch": 0.000887000381108296, + "grad_norm": 0.31100913882255554, + "learning_rate": 3.165275459098498e-05, + "loss": 0.5446, + "step": 2527 + }, + { + "epoch": 0.000887351390360812, + "grad_norm": 0.3331372141838074, + "learning_rate": 3.1585976627712854e-05, + "loss": 0.4954, + "step": 2528 + }, + { + "epoch": 0.0008877023996133282, + "grad_norm": 0.31252044439315796, + "learning_rate": 3.1519198664440734e-05, + "loss": 0.5575, + "step": 2529 + }, + { + "epoch": 0.0008880534088658444, + "grad_norm": 0.30109015107154846, + "learning_rate": 3.1452420701168614e-05, + "loss": 0.4152, + "step": 2530 + }, + { + "epoch": 0.0008884044181183605, + "grad_norm": 0.3484225869178772, + "learning_rate": 3.13856427378965e-05, + "loss": 0.5694, + "step": 2531 + }, + { + "epoch": 0.0008887554273708766, + "grad_norm": 0.2851831018924713, + "learning_rate": 3.1318864774624374e-05, + "loss": 0.4683, + "step": 2532 + }, + { + "epoch": 0.0008891064366233928, + "grad_norm": 0.31114494800567627, + "learning_rate": 3.1252086811352254e-05, + "loss": 0.5718, + "step": 2533 + }, + { + "epoch": 0.000889457445875909, + "grad_norm": 0.37384262681007385, + "learning_rate": 3.1185308848080134e-05, + "loss": 0.5041, + "step": 2534 + }, + { + "epoch": 0.000889808455128425, + "grad_norm": 0.3671551048755646, + "learning_rate": 3.111853088480802e-05, + "loss": 0.4363, + "step": 2535 + }, + { + "epoch": 0.0008901594643809412, + "grad_norm": 0.30592501163482666, + "learning_rate": 3.1051752921535894e-05, + "loss": 0.4547, + "step": 2536 + }, + { + "epoch": 0.0008905104736334574, + "grad_norm": 0.3097487688064575, + "learning_rate": 3.0984974958263773e-05, + "loss": 0.4955, + "step": 2537 + }, + { + "epoch": 0.0008908614828859735, + "grad_norm": 0.3594546616077423, + "learning_rate": 3.0918196994991653e-05, + "loss": 0.5809, + "step": 2538 + }, + { + "epoch": 0.0008912124921384896, + "grad_norm": 0.2712516486644745, + "learning_rate": 3.085141903171953e-05, + "loss": 0.5126, + "step": 2539 + }, + { + "epoch": 0.0008915635013910058, + "grad_norm": 0.32262900471687317, + "learning_rate": 3.078464106844741e-05, + "loss": 0.583, + "step": 2540 + }, + { + "epoch": 0.000891914510643522, + "grad_norm": 0.33349621295928955, + "learning_rate": 3.071786310517529e-05, + "loss": 0.6349, + "step": 2541 + }, + { + "epoch": 0.0008922655198960381, + "grad_norm": 0.30835041403770447, + "learning_rate": 3.065108514190317e-05, + "loss": 0.4804, + "step": 2542 + }, + { + "epoch": 0.0008926165291485542, + "grad_norm": 0.2974499464035034, + "learning_rate": 3.058430717863105e-05, + "loss": 0.4493, + "step": 2543 + }, + { + "epoch": 0.0008929675384010704, + "grad_norm": 0.3454904556274414, + "learning_rate": 3.051752921535893e-05, + "loss": 0.5621, + "step": 2544 + }, + { + "epoch": 0.0008933185476535865, + "grad_norm": 0.3808043301105499, + "learning_rate": 3.0450751252086813e-05, + "loss": 0.6073, + "step": 2545 + }, + { + "epoch": 0.0008936695569061027, + "grad_norm": 0.3095337748527527, + "learning_rate": 3.0383973288814693e-05, + "loss": 0.4222, + "step": 2546 + }, + { + "epoch": 0.0008940205661586188, + "grad_norm": 0.33776912093162537, + "learning_rate": 3.0317195325542573e-05, + "loss": 0.4989, + "step": 2547 + }, + { + "epoch": 0.0008943715754111349, + "grad_norm": 0.2958698570728302, + "learning_rate": 3.025041736227045e-05, + "loss": 0.4433, + "step": 2548 + }, + { + "epoch": 0.0008947225846636511, + "grad_norm": 0.34064793586730957, + "learning_rate": 3.0183639398998333e-05, + "loss": 0.6095, + "step": 2549 + }, + { + "epoch": 0.0008950735939161673, + "grad_norm": 0.29243403673171997, + "learning_rate": 3.0116861435726213e-05, + "loss": 0.5361, + "step": 2550 + }, + { + "epoch": 0.0008954246031686834, + "grad_norm": 0.4017452001571655, + "learning_rate": 3.0050083472454093e-05, + "loss": 0.4546, + "step": 2551 + }, + { + "epoch": 0.0008957756124211995, + "grad_norm": 0.33234545588493347, + "learning_rate": 2.998330550918197e-05, + "loss": 0.6337, + "step": 2552 + }, + { + "epoch": 0.0008961266216737157, + "grad_norm": 0.3377830386161804, + "learning_rate": 2.991652754590985e-05, + "loss": 0.588, + "step": 2553 + }, + { + "epoch": 0.0008964776309262318, + "grad_norm": 0.31217512488365173, + "learning_rate": 2.9849749582637732e-05, + "loss": 0.5284, + "step": 2554 + }, + { + "epoch": 0.0008968286401787479, + "grad_norm": 0.25799041986465454, + "learning_rate": 2.9782971619365612e-05, + "loss": 0.5066, + "step": 2555 + }, + { + "epoch": 0.0008971796494312641, + "grad_norm": 0.3515590727329254, + "learning_rate": 2.971619365609349e-05, + "loss": 0.5457, + "step": 2556 + }, + { + "epoch": 0.0008975306586837803, + "grad_norm": 0.3715484142303467, + "learning_rate": 2.964941569282137e-05, + "loss": 0.5072, + "step": 2557 + }, + { + "epoch": 0.0008978816679362963, + "grad_norm": 0.30447039008140564, + "learning_rate": 2.9582637729549252e-05, + "loss": 0.472, + "step": 2558 + }, + { + "epoch": 0.0008982326771888125, + "grad_norm": 0.3064466118812561, + "learning_rate": 2.9515859766277132e-05, + "loss": 0.4582, + "step": 2559 + }, + { + "epoch": 0.0008985836864413287, + "grad_norm": 0.2993955910205841, + "learning_rate": 2.944908180300501e-05, + "loss": 0.4626, + "step": 2560 + }, + { + "epoch": 0.0008989346956938449, + "grad_norm": 0.33631017804145813, + "learning_rate": 2.938230383973289e-05, + "loss": 0.5393, + "step": 2561 + }, + { + "epoch": 0.0008992857049463609, + "grad_norm": 0.30874401330947876, + "learning_rate": 2.931552587646077e-05, + "loss": 0.4288, + "step": 2562 + }, + { + "epoch": 0.0008996367141988771, + "grad_norm": 0.3230644166469574, + "learning_rate": 2.924874791318865e-05, + "loss": 0.4758, + "step": 2563 + }, + { + "epoch": 0.0008999877234513933, + "grad_norm": 0.3295224606990814, + "learning_rate": 2.9181969949916528e-05, + "loss": 0.5055, + "step": 2564 + }, + { + "epoch": 0.0009003387327039094, + "grad_norm": 0.3244490623474121, + "learning_rate": 2.9115191986644408e-05, + "loss": 0.5034, + "step": 2565 + }, + { + "epoch": 0.0009006897419564255, + "grad_norm": 0.3664167821407318, + "learning_rate": 2.9048414023372288e-05, + "loss": 0.4777, + "step": 2566 + }, + { + "epoch": 0.0009010407512089417, + "grad_norm": 0.30889442563056946, + "learning_rate": 2.898163606010017e-05, + "loss": 0.5127, + "step": 2567 + }, + { + "epoch": 0.0009013917604614578, + "grad_norm": 0.2670011520385742, + "learning_rate": 2.8914858096828045e-05, + "loss": 0.2855, + "step": 2568 + }, + { + "epoch": 0.000901742769713974, + "grad_norm": 0.30626508593559265, + "learning_rate": 2.8848080133555928e-05, + "loss": 0.5068, + "step": 2569 + }, + { + "epoch": 0.0009020937789664901, + "grad_norm": 0.3131682574748993, + "learning_rate": 2.8781302170283808e-05, + "loss": 0.4815, + "step": 2570 + }, + { + "epoch": 0.0009024447882190063, + "grad_norm": 0.36325496435165405, + "learning_rate": 2.8714524207011688e-05, + "loss": 0.4843, + "step": 2571 + }, + { + "epoch": 0.0009027957974715224, + "grad_norm": 0.3229312598705292, + "learning_rate": 2.8647746243739564e-05, + "loss": 0.5097, + "step": 2572 + }, + { + "epoch": 0.0009031468067240386, + "grad_norm": 0.3027791380882263, + "learning_rate": 2.8580968280467448e-05, + "loss": 0.5758, + "step": 2573 + }, + { + "epoch": 0.0009034978159765547, + "grad_norm": 0.35117703676223755, + "learning_rate": 2.8514190317195328e-05, + "loss": 0.4981, + "step": 2574 + }, + { + "epoch": 0.0009038488252290708, + "grad_norm": 0.3374365568161011, + "learning_rate": 2.8447412353923207e-05, + "loss": 0.5341, + "step": 2575 + }, + { + "epoch": 0.000904199834481587, + "grad_norm": 0.3418431282043457, + "learning_rate": 2.8380634390651084e-05, + "loss": 0.5654, + "step": 2576 + }, + { + "epoch": 0.0009045508437341032, + "grad_norm": 0.313973993062973, + "learning_rate": 2.8313856427378964e-05, + "loss": 0.5427, + "step": 2577 + }, + { + "epoch": 0.0009049018529866192, + "grad_norm": 0.30256158113479614, + "learning_rate": 2.8247078464106847e-05, + "loss": 0.501, + "step": 2578 + }, + { + "epoch": 0.0009052528622391354, + "grad_norm": 0.3348481059074402, + "learning_rate": 2.8180300500834727e-05, + "loss": 0.511, + "step": 2579 + }, + { + "epoch": 0.0009056038714916516, + "grad_norm": 0.3513137400150299, + "learning_rate": 2.8113522537562604e-05, + "loss": 0.4414, + "step": 2580 + }, + { + "epoch": 0.0009059548807441677, + "grad_norm": 0.31264251470565796, + "learning_rate": 2.8046744574290484e-05, + "loss": 0.4867, + "step": 2581 + }, + { + "epoch": 0.0009063058899966838, + "grad_norm": 0.24990692734718323, + "learning_rate": 2.7979966611018367e-05, + "loss": 0.4083, + "step": 2582 + }, + { + "epoch": 0.0009066568992492, + "grad_norm": 0.3431778848171234, + "learning_rate": 2.7913188647746247e-05, + "loss": 0.5362, + "step": 2583 + }, + { + "epoch": 0.0009070079085017162, + "grad_norm": 0.3317341208457947, + "learning_rate": 2.7846410684474123e-05, + "loss": 0.5234, + "step": 2584 + }, + { + "epoch": 0.0009073589177542322, + "grad_norm": 0.3075803816318512, + "learning_rate": 2.7779632721202003e-05, + "loss": 0.444, + "step": 2585 + }, + { + "epoch": 0.0009077099270067484, + "grad_norm": 0.3464568555355072, + "learning_rate": 2.7712854757929883e-05, + "loss": 0.5083, + "step": 2586 + }, + { + "epoch": 0.0009080609362592646, + "grad_norm": 0.28881844878196716, + "learning_rate": 2.7646076794657767e-05, + "loss": 0.5023, + "step": 2587 + }, + { + "epoch": 0.0009084119455117807, + "grad_norm": 0.32265371084213257, + "learning_rate": 2.7579298831385643e-05, + "loss": 0.5056, + "step": 2588 + }, + { + "epoch": 0.0009087629547642968, + "grad_norm": 0.3236354887485504, + "learning_rate": 2.7512520868113523e-05, + "loss": 0.5936, + "step": 2589 + }, + { + "epoch": 0.000909113964016813, + "grad_norm": 0.3609667718410492, + "learning_rate": 2.7445742904841403e-05, + "loss": 0.4903, + "step": 2590 + }, + { + "epoch": 0.0009094649732693292, + "grad_norm": 0.2735360860824585, + "learning_rate": 2.7378964941569286e-05, + "loss": 0.3633, + "step": 2591 + }, + { + "epoch": 0.0009098159825218453, + "grad_norm": 0.34697213768959045, + "learning_rate": 2.731218697829716e-05, + "loss": 0.5559, + "step": 2592 + }, + { + "epoch": 0.0009101669917743614, + "grad_norm": 0.3032284080982208, + "learning_rate": 2.7245409015025043e-05, + "loss": 0.4981, + "step": 2593 + }, + { + "epoch": 0.0009105180010268776, + "grad_norm": 0.3123127520084381, + "learning_rate": 2.7178631051752923e-05, + "loss": 0.5648, + "step": 2594 + }, + { + "epoch": 0.0009108690102793937, + "grad_norm": 0.36966538429260254, + "learning_rate": 2.7111853088480803e-05, + "loss": 0.513, + "step": 2595 + }, + { + "epoch": 0.0009112200195319099, + "grad_norm": 0.3247453570365906, + "learning_rate": 2.704507512520868e-05, + "loss": 0.4472, + "step": 2596 + }, + { + "epoch": 0.000911571028784426, + "grad_norm": 0.42087578773498535, + "learning_rate": 2.6978297161936563e-05, + "loss": 0.5812, + "step": 2597 + }, + { + "epoch": 0.0009119220380369421, + "grad_norm": 0.3239130973815918, + "learning_rate": 2.6911519198664442e-05, + "loss": 0.5041, + "step": 2598 + }, + { + "epoch": 0.0009122730472894583, + "grad_norm": 0.2931792140007019, + "learning_rate": 2.6844741235392322e-05, + "loss": 0.5713, + "step": 2599 + }, + { + "epoch": 0.0009126240565419745, + "grad_norm": 0.32619500160217285, + "learning_rate": 2.67779632721202e-05, + "loss": 0.5978, + "step": 2600 + }, + { + "epoch": 0.0009129750657944906, + "grad_norm": 0.31743088364601135, + "learning_rate": 2.671118530884808e-05, + "loss": 0.4682, + "step": 2601 + }, + { + "epoch": 0.0009133260750470067, + "grad_norm": 0.26357951760292053, + "learning_rate": 2.6644407345575962e-05, + "loss": 0.4503, + "step": 2602 + }, + { + "epoch": 0.0009136770842995229, + "grad_norm": 0.392783522605896, + "learning_rate": 2.6577629382303842e-05, + "loss": 0.561, + "step": 2603 + }, + { + "epoch": 0.000914028093552039, + "grad_norm": 0.3406294584274292, + "learning_rate": 2.651085141903172e-05, + "loss": 0.5681, + "step": 2604 + }, + { + "epoch": 0.0009143791028045551, + "grad_norm": 0.29727840423583984, + "learning_rate": 2.64440734557596e-05, + "loss": 0.5806, + "step": 2605 + }, + { + "epoch": 0.0009147301120570713, + "grad_norm": 0.30650097131729126, + "learning_rate": 2.6377295492487482e-05, + "loss": 0.4807, + "step": 2606 + }, + { + "epoch": 0.0009150811213095875, + "grad_norm": 0.2788676619529724, + "learning_rate": 2.6310517529215362e-05, + "loss": 0.557, + "step": 2607 + }, + { + "epoch": 0.0009154321305621035, + "grad_norm": 0.3036753535270691, + "learning_rate": 2.624373956594324e-05, + "loss": 0.4407, + "step": 2608 + }, + { + "epoch": 0.0009157831398146197, + "grad_norm": 0.344601571559906, + "learning_rate": 2.6176961602671118e-05, + "loss": 0.5437, + "step": 2609 + }, + { + "epoch": 0.0009161341490671359, + "grad_norm": 0.3159584105014801, + "learning_rate": 2.6110183639398998e-05, + "loss": 0.5221, + "step": 2610 + }, + { + "epoch": 0.000916485158319652, + "grad_norm": 0.34595775604248047, + "learning_rate": 2.604340567612688e-05, + "loss": 0.5372, + "step": 2611 + }, + { + "epoch": 0.0009168361675721681, + "grad_norm": 0.3012959659099579, + "learning_rate": 2.597662771285476e-05, + "loss": 0.4692, + "step": 2612 + }, + { + "epoch": 0.0009171871768246843, + "grad_norm": 0.3277386724948883, + "learning_rate": 2.5909849749582638e-05, + "loss": 0.5359, + "step": 2613 + }, + { + "epoch": 0.0009175381860772005, + "grad_norm": 0.3289946913719177, + "learning_rate": 2.5843071786310518e-05, + "loss": 0.5677, + "step": 2614 + }, + { + "epoch": 0.0009178891953297166, + "grad_norm": 0.28243499994277954, + "learning_rate": 2.57762938230384e-05, + "loss": 0.4759, + "step": 2615 + }, + { + "epoch": 0.0009182402045822327, + "grad_norm": 0.29266831278800964, + "learning_rate": 2.570951585976628e-05, + "loss": 0.5541, + "step": 2616 + }, + { + "epoch": 0.0009185912138347489, + "grad_norm": 0.3399595022201538, + "learning_rate": 2.5642737896494158e-05, + "loss": 0.4882, + "step": 2617 + }, + { + "epoch": 0.000918942223087265, + "grad_norm": 0.31956201791763306, + "learning_rate": 2.5575959933222038e-05, + "loss": 0.6051, + "step": 2618 + }, + { + "epoch": 0.0009192932323397812, + "grad_norm": 0.3297751545906067, + "learning_rate": 2.5509181969949918e-05, + "loss": 0.5407, + "step": 2619 + }, + { + "epoch": 0.0009196442415922973, + "grad_norm": 0.30339428782463074, + "learning_rate": 2.54424040066778e-05, + "loss": 0.5183, + "step": 2620 + }, + { + "epoch": 0.0009199952508448134, + "grad_norm": 0.2914029061794281, + "learning_rate": 2.5375626043405677e-05, + "loss": 0.5046, + "step": 2621 + }, + { + "epoch": 0.0009203462600973296, + "grad_norm": 0.28545403480529785, + "learning_rate": 2.5308848080133557e-05, + "loss": 0.5017, + "step": 2622 + }, + { + "epoch": 0.0009206972693498458, + "grad_norm": 0.2777597904205322, + "learning_rate": 2.5242070116861437e-05, + "loss": 0.4521, + "step": 2623 + }, + { + "epoch": 0.0009210482786023619, + "grad_norm": 0.27011004090309143, + "learning_rate": 2.517529215358932e-05, + "loss": 0.4098, + "step": 2624 + }, + { + "epoch": 0.000921399287854878, + "grad_norm": 0.29135480523109436, + "learning_rate": 2.5108514190317194e-05, + "loss": 0.3942, + "step": 2625 + }, + { + "epoch": 0.0009217502971073942, + "grad_norm": 0.3266141712665558, + "learning_rate": 2.5041736227045077e-05, + "loss": 0.4713, + "step": 2626 + }, + { + "epoch": 0.0009221013063599104, + "grad_norm": 0.31716689467430115, + "learning_rate": 2.4974958263772957e-05, + "loss": 0.5258, + "step": 2627 + }, + { + "epoch": 0.0009224523156124264, + "grad_norm": 0.2937525510787964, + "learning_rate": 2.4908180300500837e-05, + "loss": 0.5089, + "step": 2628 + }, + { + "epoch": 0.0009228033248649426, + "grad_norm": 0.266811341047287, + "learning_rate": 2.4841402337228717e-05, + "loss": 0.3397, + "step": 2629 + }, + { + "epoch": 0.0009231543341174588, + "grad_norm": 0.3621795177459717, + "learning_rate": 2.4774624373956597e-05, + "loss": 0.5046, + "step": 2630 + }, + { + "epoch": 0.0009235053433699748, + "grad_norm": 0.2858990430831909, + "learning_rate": 2.4707846410684477e-05, + "loss": 0.3182, + "step": 2631 + }, + { + "epoch": 0.000923856352622491, + "grad_norm": 0.3191796839237213, + "learning_rate": 2.4641068447412353e-05, + "loss": 0.4707, + "step": 2632 + }, + { + "epoch": 0.0009242073618750072, + "grad_norm": 0.2611950933933258, + "learning_rate": 2.4574290484140237e-05, + "loss": 0.4645, + "step": 2633 + }, + { + "epoch": 0.0009245583711275234, + "grad_norm": 0.3045663833618164, + "learning_rate": 2.4507512520868113e-05, + "loss": 0.4884, + "step": 2634 + }, + { + "epoch": 0.0009249093803800394, + "grad_norm": 0.3385707139968872, + "learning_rate": 2.4440734557595996e-05, + "loss": 0.46, + "step": 2635 + }, + { + "epoch": 0.0009252603896325556, + "grad_norm": 0.3311261236667633, + "learning_rate": 2.4373956594323873e-05, + "loss": 0.5286, + "step": 2636 + }, + { + "epoch": 0.0009256113988850718, + "grad_norm": 0.3543814718723297, + "learning_rate": 2.4307178631051756e-05, + "loss": 0.5462, + "step": 2637 + }, + { + "epoch": 0.0009259624081375879, + "grad_norm": 0.3148309290409088, + "learning_rate": 2.4240400667779633e-05, + "loss": 0.5243, + "step": 2638 + }, + { + "epoch": 0.000926313417390104, + "grad_norm": 0.4147723913192749, + "learning_rate": 2.4173622704507516e-05, + "loss": 0.4657, + "step": 2639 + }, + { + "epoch": 0.0009266644266426202, + "grad_norm": 0.2914230227470398, + "learning_rate": 2.4106844741235393e-05, + "loss": 0.5389, + "step": 2640 + }, + { + "epoch": 0.0009270154358951363, + "grad_norm": 0.3377765119075775, + "learning_rate": 2.4040066777963273e-05, + "loss": 0.5975, + "step": 2641 + }, + { + "epoch": 0.0009273664451476525, + "grad_norm": 0.3205912411212921, + "learning_rate": 2.3973288814691153e-05, + "loss": 0.5176, + "step": 2642 + }, + { + "epoch": 0.0009277174544001686, + "grad_norm": 0.3654428720474243, + "learning_rate": 2.3906510851419033e-05, + "loss": 0.5667, + "step": 2643 + }, + { + "epoch": 0.0009280684636526848, + "grad_norm": 0.3409925401210785, + "learning_rate": 2.3839732888146912e-05, + "loss": 0.5623, + "step": 2644 + }, + { + "epoch": 0.0009284194729052009, + "grad_norm": 0.33296191692352295, + "learning_rate": 2.3772954924874792e-05, + "loss": 0.5069, + "step": 2645 + }, + { + "epoch": 0.0009287704821577171, + "grad_norm": 0.3501218557357788, + "learning_rate": 2.3706176961602672e-05, + "loss": 0.4702, + "step": 2646 + }, + { + "epoch": 0.0009291214914102332, + "grad_norm": 0.33584311604499817, + "learning_rate": 2.3639398998330552e-05, + "loss": 0.5038, + "step": 2647 + }, + { + "epoch": 0.0009294725006627493, + "grad_norm": 0.3641437590122223, + "learning_rate": 2.3572621035058432e-05, + "loss": 0.5842, + "step": 2648 + }, + { + "epoch": 0.0009298235099152655, + "grad_norm": 0.2983480989933014, + "learning_rate": 2.3505843071786312e-05, + "loss": 0.5014, + "step": 2649 + }, + { + "epoch": 0.0009301745191677817, + "grad_norm": 0.3174578547477722, + "learning_rate": 2.3439065108514192e-05, + "loss": 0.5697, + "step": 2650 + }, + { + "epoch": 0.0009305255284202977, + "grad_norm": 0.34100961685180664, + "learning_rate": 2.3372287145242072e-05, + "loss": 0.5635, + "step": 2651 + }, + { + "epoch": 0.0009308765376728139, + "grad_norm": 0.32889890670776367, + "learning_rate": 2.3305509181969952e-05, + "loss": 0.5343, + "step": 2652 + }, + { + "epoch": 0.0009312275469253301, + "grad_norm": 0.2658712565898895, + "learning_rate": 2.3238731218697832e-05, + "loss": 0.5047, + "step": 2653 + }, + { + "epoch": 0.0009315785561778463, + "grad_norm": 0.3839574158191681, + "learning_rate": 2.3171953255425712e-05, + "loss": 0.4489, + "step": 2654 + }, + { + "epoch": 0.0009319295654303623, + "grad_norm": 0.3274018466472626, + "learning_rate": 2.310517529215359e-05, + "loss": 0.4947, + "step": 2655 + }, + { + "epoch": 0.0009322805746828785, + "grad_norm": 0.33997949957847595, + "learning_rate": 2.3038397328881468e-05, + "loss": 0.46, + "step": 2656 + }, + { + "epoch": 0.0009326315839353947, + "grad_norm": 0.2808091342449188, + "learning_rate": 2.297161936560935e-05, + "loss": 0.4977, + "step": 2657 + }, + { + "epoch": 0.0009329825931879107, + "grad_norm": 0.37973394989967346, + "learning_rate": 2.2904841402337228e-05, + "loss": 0.3797, + "step": 2658 + }, + { + "epoch": 0.0009333336024404269, + "grad_norm": 0.3961915373802185, + "learning_rate": 2.283806343906511e-05, + "loss": 0.5606, + "step": 2659 + }, + { + "epoch": 0.0009336846116929431, + "grad_norm": 0.3045227825641632, + "learning_rate": 2.2771285475792988e-05, + "loss": 0.495, + "step": 2660 + }, + { + "epoch": 0.0009340356209454592, + "grad_norm": 0.3140105903148651, + "learning_rate": 2.270450751252087e-05, + "loss": 0.6215, + "step": 2661 + }, + { + "epoch": 0.0009343866301979753, + "grad_norm": 0.2776303291320801, + "learning_rate": 2.2637729549248748e-05, + "loss": 0.4455, + "step": 2662 + }, + { + "epoch": 0.0009347376394504915, + "grad_norm": 0.3289468586444855, + "learning_rate": 2.257095158597663e-05, + "loss": 0.4462, + "step": 2663 + }, + { + "epoch": 0.0009350886487030077, + "grad_norm": 0.3406628966331482, + "learning_rate": 2.2504173622704508e-05, + "loss": 0.5125, + "step": 2664 + }, + { + "epoch": 0.0009354396579555238, + "grad_norm": 0.31571251153945923, + "learning_rate": 2.2437395659432388e-05, + "loss": 0.5335, + "step": 2665 + }, + { + "epoch": 0.0009357906672080399, + "grad_norm": 0.2963699996471405, + "learning_rate": 2.2370617696160268e-05, + "loss": 0.4852, + "step": 2666 + }, + { + "epoch": 0.0009361416764605561, + "grad_norm": 0.33006179332733154, + "learning_rate": 2.2303839732888147e-05, + "loss": 0.4807, + "step": 2667 + }, + { + "epoch": 0.0009364926857130722, + "grad_norm": 0.27546727657318115, + "learning_rate": 2.2237061769616027e-05, + "loss": 0.4776, + "step": 2668 + }, + { + "epoch": 0.0009368436949655884, + "grad_norm": 0.2972474694252014, + "learning_rate": 2.2170283806343907e-05, + "loss": 0.5076, + "step": 2669 + }, + { + "epoch": 0.0009371947042181045, + "grad_norm": 0.4304138422012329, + "learning_rate": 2.2103505843071787e-05, + "loss": 0.4869, + "step": 2670 + }, + { + "epoch": 0.0009375457134706206, + "grad_norm": 0.29312118887901306, + "learning_rate": 2.2036727879799667e-05, + "loss": 0.4924, + "step": 2671 + }, + { + "epoch": 0.0009378967227231368, + "grad_norm": 0.29951146245002747, + "learning_rate": 2.1969949916527547e-05, + "loss": 0.514, + "step": 2672 + }, + { + "epoch": 0.000938247731975653, + "grad_norm": 0.34295153617858887, + "learning_rate": 2.1903171953255427e-05, + "loss": 0.4957, + "step": 2673 + }, + { + "epoch": 0.0009385987412281691, + "grad_norm": 0.2864181101322174, + "learning_rate": 2.1836393989983307e-05, + "loss": 0.3869, + "step": 2674 + }, + { + "epoch": 0.0009389497504806852, + "grad_norm": 0.36866047978401184, + "learning_rate": 2.1769616026711187e-05, + "loss": 0.5296, + "step": 2675 + }, + { + "epoch": 0.0009393007597332014, + "grad_norm": 0.3307238817214966, + "learning_rate": 2.1702838063439067e-05, + "loss": 0.4854, + "step": 2676 + }, + { + "epoch": 0.0009396517689857176, + "grad_norm": 0.3300168514251709, + "learning_rate": 2.1636060100166947e-05, + "loss": 0.5569, + "step": 2677 + }, + { + "epoch": 0.0009400027782382336, + "grad_norm": 0.3872585892677307, + "learning_rate": 2.1569282136894827e-05, + "loss": 0.655, + "step": 2678 + }, + { + "epoch": 0.0009403537874907498, + "grad_norm": 0.3557940721511841, + "learning_rate": 2.1502504173622707e-05, + "loss": 0.5593, + "step": 2679 + }, + { + "epoch": 0.000940704796743266, + "grad_norm": 0.33360832929611206, + "learning_rate": 2.1435726210350583e-05, + "loss": 0.5457, + "step": 2680 + }, + { + "epoch": 0.000941055805995782, + "grad_norm": 0.31986358761787415, + "learning_rate": 2.1368948247078466e-05, + "loss": 0.59, + "step": 2681 + }, + { + "epoch": 0.0009414068152482982, + "grad_norm": 0.31213393807411194, + "learning_rate": 2.1302170283806343e-05, + "loss": 0.4603, + "step": 2682 + }, + { + "epoch": 0.0009417578245008144, + "grad_norm": 0.34585440158843994, + "learning_rate": 2.1235392320534226e-05, + "loss": 0.5624, + "step": 2683 + }, + { + "epoch": 0.0009421088337533306, + "grad_norm": 0.3183426558971405, + "learning_rate": 2.1168614357262103e-05, + "loss": 0.4285, + "step": 2684 + }, + { + "epoch": 0.0009424598430058466, + "grad_norm": 0.24673517048358917, + "learning_rate": 2.1101836393989986e-05, + "loss": 0.4258, + "step": 2685 + }, + { + "epoch": 0.0009428108522583628, + "grad_norm": 0.36261579394340515, + "learning_rate": 2.1035058430717863e-05, + "loss": 0.523, + "step": 2686 + }, + { + "epoch": 0.000943161861510879, + "grad_norm": 0.34637051820755005, + "learning_rate": 2.0968280467445746e-05, + "loss": 0.5631, + "step": 2687 + }, + { + "epoch": 0.0009435128707633951, + "grad_norm": 0.29863280057907104, + "learning_rate": 2.0901502504173623e-05, + "loss": 0.4891, + "step": 2688 + }, + { + "epoch": 0.0009438638800159112, + "grad_norm": 0.3096980154514313, + "learning_rate": 2.0834724540901503e-05, + "loss": 0.5271, + "step": 2689 + }, + { + "epoch": 0.0009442148892684274, + "grad_norm": 0.30533885955810547, + "learning_rate": 2.0767946577629382e-05, + "loss": 0.3937, + "step": 2690 + }, + { + "epoch": 0.0009445658985209435, + "grad_norm": 0.37430548667907715, + "learning_rate": 2.0701168614357262e-05, + "loss": 0.5425, + "step": 2691 + }, + { + "epoch": 0.0009449169077734597, + "grad_norm": 0.33952566981315613, + "learning_rate": 2.0634390651085142e-05, + "loss": 0.5383, + "step": 2692 + }, + { + "epoch": 0.0009452679170259758, + "grad_norm": 0.2745051980018616, + "learning_rate": 2.0567612687813022e-05, + "loss": 0.5137, + "step": 2693 + }, + { + "epoch": 0.000945618926278492, + "grad_norm": 0.34455645084381104, + "learning_rate": 2.0500834724540902e-05, + "loss": 0.5376, + "step": 2694 + }, + { + "epoch": 0.0009459699355310081, + "grad_norm": 0.3317749798297882, + "learning_rate": 2.0434056761268782e-05, + "loss": 0.5375, + "step": 2695 + }, + { + "epoch": 0.0009463209447835243, + "grad_norm": 0.30368366837501526, + "learning_rate": 2.0367278797996662e-05, + "loss": 0.5119, + "step": 2696 + }, + { + "epoch": 0.0009466719540360404, + "grad_norm": 0.32434359192848206, + "learning_rate": 2.0300500834724542e-05, + "loss": 0.5724, + "step": 2697 + }, + { + "epoch": 0.0009470229632885565, + "grad_norm": 0.31109288334846497, + "learning_rate": 2.0233722871452422e-05, + "loss": 0.5466, + "step": 2698 + }, + { + "epoch": 0.0009473739725410727, + "grad_norm": 0.2530185878276825, + "learning_rate": 2.0166944908180302e-05, + "loss": 0.3968, + "step": 2699 + }, + { + "epoch": 0.0009477249817935889, + "grad_norm": 0.31531384587287903, + "learning_rate": 2.0100166944908182e-05, + "loss": 0.5689, + "step": 2700 + }, + { + "epoch": 0.0009480759910461049, + "grad_norm": 0.28268522024154663, + "learning_rate": 2.003338898163606e-05, + "loss": 0.4365, + "step": 2701 + }, + { + "epoch": 0.0009484270002986211, + "grad_norm": 0.3365183472633362, + "learning_rate": 1.9966611018363938e-05, + "loss": 0.4725, + "step": 2702 + }, + { + "epoch": 0.0009487780095511373, + "grad_norm": 0.33126509189605713, + "learning_rate": 1.989983305509182e-05, + "loss": 0.5494, + "step": 2703 + }, + { + "epoch": 0.0009491290188036535, + "grad_norm": 0.34097808599472046, + "learning_rate": 1.9833055091819698e-05, + "loss": 0.446, + "step": 2704 + }, + { + "epoch": 0.0009494800280561695, + "grad_norm": 0.3191957175731659, + "learning_rate": 1.976627712854758e-05, + "loss": 0.4742, + "step": 2705 + }, + { + "epoch": 0.0009498310373086857, + "grad_norm": 0.2865714728832245, + "learning_rate": 1.9699499165275458e-05, + "loss": 0.4943, + "step": 2706 + }, + { + "epoch": 0.0009501820465612019, + "grad_norm": 0.3436678647994995, + "learning_rate": 1.963272120200334e-05, + "loss": 0.5729, + "step": 2707 + }, + { + "epoch": 0.000950533055813718, + "grad_norm": 0.3311633765697479, + "learning_rate": 1.9565943238731218e-05, + "loss": 0.5629, + "step": 2708 + }, + { + "epoch": 0.0009508840650662341, + "grad_norm": 0.29449766874313354, + "learning_rate": 1.94991652754591e-05, + "loss": 0.46, + "step": 2709 + }, + { + "epoch": 0.0009512350743187503, + "grad_norm": 0.35170599818229675, + "learning_rate": 1.9432387312186978e-05, + "loss": 0.5424, + "step": 2710 + }, + { + "epoch": 0.0009515860835712664, + "grad_norm": 0.3243255913257599, + "learning_rate": 1.936560934891486e-05, + "loss": 0.53, + "step": 2711 + }, + { + "epoch": 0.0009519370928237825, + "grad_norm": 0.3076205253601074, + "learning_rate": 1.9298831385642738e-05, + "loss": 0.5195, + "step": 2712 + }, + { + "epoch": 0.0009522881020762987, + "grad_norm": 0.36274224519729614, + "learning_rate": 1.9232053422370617e-05, + "loss": 0.5222, + "step": 2713 + }, + { + "epoch": 0.0009526391113288149, + "grad_norm": 0.318545937538147, + "learning_rate": 1.9165275459098497e-05, + "loss": 0.4345, + "step": 2714 + }, + { + "epoch": 0.000952990120581331, + "grad_norm": 0.33198022842407227, + "learning_rate": 1.9098497495826377e-05, + "loss": 0.5247, + "step": 2715 + }, + { + "epoch": 0.0009533411298338471, + "grad_norm": 0.3077337145805359, + "learning_rate": 1.9031719532554257e-05, + "loss": 0.4329, + "step": 2716 + }, + { + "epoch": 0.0009536921390863633, + "grad_norm": 0.3244706094264984, + "learning_rate": 1.8964941569282137e-05, + "loss": 0.5199, + "step": 2717 + }, + { + "epoch": 0.0009540431483388794, + "grad_norm": 0.28997060656547546, + "learning_rate": 1.8898163606010017e-05, + "loss": 0.4709, + "step": 2718 + }, + { + "epoch": 0.0009543941575913956, + "grad_norm": 0.26597076654434204, + "learning_rate": 1.8831385642737897e-05, + "loss": 0.3583, + "step": 2719 + }, + { + "epoch": 0.0009547451668439117, + "grad_norm": 0.27226507663726807, + "learning_rate": 1.8764607679465777e-05, + "loss": 0.4057, + "step": 2720 + }, + { + "epoch": 0.0009550961760964278, + "grad_norm": 0.2710771858692169, + "learning_rate": 1.8697829716193657e-05, + "loss": 0.448, + "step": 2721 + }, + { + "epoch": 0.000955447185348944, + "grad_norm": 0.29069092869758606, + "learning_rate": 1.8631051752921537e-05, + "loss": 0.4514, + "step": 2722 + }, + { + "epoch": 0.0009557981946014602, + "grad_norm": 0.26912447810173035, + "learning_rate": 1.8564273789649417e-05, + "loss": 0.4136, + "step": 2723 + }, + { + "epoch": 0.0009561492038539763, + "grad_norm": 0.31414881348609924, + "learning_rate": 1.8497495826377297e-05, + "loss": 0.4793, + "step": 2724 + }, + { + "epoch": 0.0009565002131064924, + "grad_norm": 0.29217466711997986, + "learning_rate": 1.8430717863105177e-05, + "loss": 0.5489, + "step": 2725 + }, + { + "epoch": 0.0009568512223590086, + "grad_norm": 0.3560400903224945, + "learning_rate": 1.8363939899833053e-05, + "loss": 0.5932, + "step": 2726 + }, + { + "epoch": 0.0009572022316115248, + "grad_norm": 0.35290443897247314, + "learning_rate": 1.8297161936560936e-05, + "loss": 0.5109, + "step": 2727 + }, + { + "epoch": 0.0009575532408640408, + "grad_norm": 0.2893005907535553, + "learning_rate": 1.8230383973288813e-05, + "loss": 0.4507, + "step": 2728 + }, + { + "epoch": 0.000957904250116557, + "grad_norm": 0.32811588048934937, + "learning_rate": 1.8163606010016696e-05, + "loss": 0.4277, + "step": 2729 + }, + { + "epoch": 0.0009582552593690732, + "grad_norm": 0.32973745465278625, + "learning_rate": 1.8096828046744573e-05, + "loss": 0.5197, + "step": 2730 + }, + { + "epoch": 0.0009586062686215892, + "grad_norm": 0.32170772552490234, + "learning_rate": 1.8030050083472456e-05, + "loss": 0.564, + "step": 2731 + }, + { + "epoch": 0.0009589572778741054, + "grad_norm": 0.2796086370944977, + "learning_rate": 1.7963272120200333e-05, + "loss": 0.5116, + "step": 2732 + }, + { + "epoch": 0.0009593082871266216, + "grad_norm": 0.3391268849372864, + "learning_rate": 1.7896494156928216e-05, + "loss": 0.5464, + "step": 2733 + }, + { + "epoch": 0.0009596592963791378, + "grad_norm": 0.2571183741092682, + "learning_rate": 1.7829716193656093e-05, + "loss": 0.401, + "step": 2734 + }, + { + "epoch": 0.0009600103056316538, + "grad_norm": 0.3373526334762573, + "learning_rate": 1.7762938230383973e-05, + "loss": 0.5453, + "step": 2735 + }, + { + "epoch": 0.00096036131488417, + "grad_norm": 0.3045218288898468, + "learning_rate": 1.7696160267111852e-05, + "loss": 0.544, + "step": 2736 + }, + { + "epoch": 0.0009607123241366862, + "grad_norm": 0.3363627791404724, + "learning_rate": 1.7629382303839732e-05, + "loss": 0.5671, + "step": 2737 + }, + { + "epoch": 0.0009610633333892023, + "grad_norm": 0.3323043882846832, + "learning_rate": 1.7562604340567612e-05, + "loss": 0.5558, + "step": 2738 + }, + { + "epoch": 0.0009614143426417184, + "grad_norm": 0.29811328649520874, + "learning_rate": 1.7495826377295492e-05, + "loss": 0.5462, + "step": 2739 + }, + { + "epoch": 0.0009617653518942346, + "grad_norm": 0.328827828168869, + "learning_rate": 1.7429048414023372e-05, + "loss": 0.5272, + "step": 2740 + }, + { + "epoch": 0.0009621163611467507, + "grad_norm": 0.3507555425167084, + "learning_rate": 1.7362270450751252e-05, + "loss": 0.5476, + "step": 2741 + }, + { + "epoch": 0.0009624673703992669, + "grad_norm": 0.353947252035141, + "learning_rate": 1.7295492487479135e-05, + "loss": 0.6045, + "step": 2742 + }, + { + "epoch": 0.000962818379651783, + "grad_norm": 0.307493656873703, + "learning_rate": 1.7228714524207012e-05, + "loss": 0.3979, + "step": 2743 + }, + { + "epoch": 0.0009631693889042992, + "grad_norm": 0.36082038283348083, + "learning_rate": 1.7161936560934895e-05, + "loss": 0.5333, + "step": 2744 + }, + { + "epoch": 0.0009635203981568153, + "grad_norm": 0.3073471188545227, + "learning_rate": 1.7095158597662772e-05, + "loss": 0.4791, + "step": 2745 + }, + { + "epoch": 0.0009638714074093315, + "grad_norm": 0.38517406582832336, + "learning_rate": 1.7028380634390652e-05, + "loss": 0.4508, + "step": 2746 + }, + { + "epoch": 0.0009642224166618476, + "grad_norm": 0.3524985611438751, + "learning_rate": 1.696160267111853e-05, + "loss": 0.5569, + "step": 2747 + }, + { + "epoch": 0.0009645734259143637, + "grad_norm": 0.3595435619354248, + "learning_rate": 1.689482470784641e-05, + "loss": 0.4798, + "step": 2748 + }, + { + "epoch": 0.0009649244351668799, + "grad_norm": 0.3239949941635132, + "learning_rate": 1.682804674457429e-05, + "loss": 0.5236, + "step": 2749 + }, + { + "epoch": 0.0009652754444193961, + "grad_norm": 0.3428175449371338, + "learning_rate": 1.676126878130217e-05, + "loss": 0.5356, + "step": 2750 + }, + { + "epoch": 0.0009656264536719121, + "grad_norm": 0.3060053288936615, + "learning_rate": 1.669449081803005e-05, + "loss": 0.5929, + "step": 2751 + }, + { + "epoch": 0.0009659774629244283, + "grad_norm": 0.3100334703922272, + "learning_rate": 1.662771285475793e-05, + "loss": 0.4933, + "step": 2752 + }, + { + "epoch": 0.0009663284721769445, + "grad_norm": 0.3433213233947754, + "learning_rate": 1.656093489148581e-05, + "loss": 0.4991, + "step": 2753 + }, + { + "epoch": 0.0009666794814294607, + "grad_norm": 0.3321034610271454, + "learning_rate": 1.649415692821369e-05, + "loss": 0.4662, + "step": 2754 + }, + { + "epoch": 0.0009670304906819767, + "grad_norm": 0.2773591876029968, + "learning_rate": 1.642737896494157e-05, + "loss": 0.4507, + "step": 2755 + }, + { + "epoch": 0.0009673814999344929, + "grad_norm": 0.3171742260456085, + "learning_rate": 1.636060100166945e-05, + "loss": 0.573, + "step": 2756 + }, + { + "epoch": 0.0009677325091870091, + "grad_norm": 0.34338507056236267, + "learning_rate": 1.629382303839733e-05, + "loss": 0.4591, + "step": 2757 + }, + { + "epoch": 0.0009680835184395252, + "grad_norm": 0.3032603859901428, + "learning_rate": 1.622704507512521e-05, + "loss": 0.4964, + "step": 2758 + }, + { + "epoch": 0.0009684345276920413, + "grad_norm": 0.35825902223587036, + "learning_rate": 1.6160267111853087e-05, + "loss": 0.5758, + "step": 2759 + }, + { + "epoch": 0.0009687855369445575, + "grad_norm": 0.2983849346637726, + "learning_rate": 1.609348914858097e-05, + "loss": 0.5601, + "step": 2760 + }, + { + "epoch": 0.0009691365461970736, + "grad_norm": 0.2885691225528717, + "learning_rate": 1.6026711185308847e-05, + "loss": 0.5864, + "step": 2761 + }, + { + "epoch": 0.0009694875554495897, + "grad_norm": 0.3313084542751312, + "learning_rate": 1.595993322203673e-05, + "loss": 0.4756, + "step": 2762 + }, + { + "epoch": 0.0009698385647021059, + "grad_norm": 0.31722310185432434, + "learning_rate": 1.5893155258764607e-05, + "loss": 0.5505, + "step": 2763 + }, + { + "epoch": 0.0009701895739546221, + "grad_norm": 0.36518874764442444, + "learning_rate": 1.582637729549249e-05, + "loss": 0.5713, + "step": 2764 + }, + { + "epoch": 0.0009705405832071382, + "grad_norm": 0.322587251663208, + "learning_rate": 1.5759599332220367e-05, + "loss": 0.42, + "step": 2765 + }, + { + "epoch": 0.0009708915924596543, + "grad_norm": 0.3030405640602112, + "learning_rate": 1.569282136894825e-05, + "loss": 0.4896, + "step": 2766 + }, + { + "epoch": 0.0009712426017121705, + "grad_norm": 0.3449806571006775, + "learning_rate": 1.5626043405676127e-05, + "loss": 0.3638, + "step": 2767 + }, + { + "epoch": 0.0009715936109646866, + "grad_norm": 0.3092031478881836, + "learning_rate": 1.555926544240401e-05, + "loss": 0.57, + "step": 2768 + }, + { + "epoch": 0.0009719446202172028, + "grad_norm": 0.37385180592536926, + "learning_rate": 1.5492487479131887e-05, + "loss": 0.5466, + "step": 2769 + }, + { + "epoch": 0.0009722956294697189, + "grad_norm": 0.3383021056652069, + "learning_rate": 1.5425709515859767e-05, + "loss": 0.5568, + "step": 2770 + }, + { + "epoch": 0.000972646638722235, + "grad_norm": 0.321530818939209, + "learning_rate": 1.5358931552587647e-05, + "loss": 0.5297, + "step": 2771 + }, + { + "epoch": 0.0009729976479747512, + "grad_norm": 0.331918865442276, + "learning_rate": 1.5292153589315527e-05, + "loss": 0.5976, + "step": 2772 + }, + { + "epoch": 0.0009733486572272674, + "grad_norm": 0.3457392156124115, + "learning_rate": 1.5225375626043406e-05, + "loss": 0.4211, + "step": 2773 + }, + { + "epoch": 0.0009736996664797835, + "grad_norm": 0.31903064250946045, + "learning_rate": 1.5158597662771286e-05, + "loss": 0.5656, + "step": 2774 + }, + { + "epoch": 0.0009740506757322996, + "grad_norm": 0.3803582191467285, + "learning_rate": 1.5091819699499166e-05, + "loss": 0.5571, + "step": 2775 + }, + { + "epoch": 0.0009744016849848158, + "grad_norm": 0.41490522027015686, + "learning_rate": 1.5025041736227046e-05, + "loss": 0.5614, + "step": 2776 + }, + { + "epoch": 0.000974752694237332, + "grad_norm": 0.28632932901382446, + "learning_rate": 1.4958263772954924e-05, + "loss": 0.4032, + "step": 2777 + }, + { + "epoch": 0.000975103703489848, + "grad_norm": 0.3377666175365448, + "learning_rate": 1.4891485809682806e-05, + "loss": 0.4821, + "step": 2778 + }, + { + "epoch": 0.0009754547127423642, + "grad_norm": 0.3296474814414978, + "learning_rate": 1.4824707846410684e-05, + "loss": 0.5614, + "step": 2779 + }, + { + "epoch": 0.0009758057219948804, + "grad_norm": 0.3330935835838318, + "learning_rate": 1.4757929883138566e-05, + "loss": 0.5035, + "step": 2780 + }, + { + "epoch": 0.0009761567312473965, + "grad_norm": 0.3138290345668793, + "learning_rate": 1.4691151919866444e-05, + "loss": 0.4666, + "step": 2781 + }, + { + "epoch": 0.0009765077404999126, + "grad_norm": 0.3430411219596863, + "learning_rate": 1.4624373956594326e-05, + "loss": 0.6315, + "step": 2782 + }, + { + "epoch": 0.0009768587497524287, + "grad_norm": 0.3496928811073303, + "learning_rate": 1.4557595993322204e-05, + "loss": 0.5141, + "step": 2783 + }, + { + "epoch": 0.000977209759004945, + "grad_norm": 0.3003537654876709, + "learning_rate": 1.4490818030050086e-05, + "loss": 0.3869, + "step": 2784 + }, + { + "epoch": 0.000977560768257461, + "grad_norm": 0.31144484877586365, + "learning_rate": 1.4424040066777964e-05, + "loss": 0.4458, + "step": 2785 + }, + { + "epoch": 0.0009779117775099771, + "grad_norm": 0.3853313624858856, + "learning_rate": 1.4357262103505844e-05, + "loss": 0.5008, + "step": 2786 + }, + { + "epoch": 0.0009782627867624934, + "grad_norm": 0.3438868224620819, + "learning_rate": 1.4290484140233724e-05, + "loss": 0.5757, + "step": 2787 + }, + { + "epoch": 0.0009786137960150095, + "grad_norm": 0.3259068429470062, + "learning_rate": 1.4223706176961604e-05, + "loss": 0.4808, + "step": 2788 + }, + { + "epoch": 0.0009789648052675258, + "grad_norm": 0.32424217462539673, + "learning_rate": 1.4156928213689482e-05, + "loss": 0.4918, + "step": 2789 + }, + { + "epoch": 0.0009793158145200418, + "grad_norm": 0.3156449794769287, + "learning_rate": 1.4090150250417364e-05, + "loss": 0.5047, + "step": 2790 + }, + { + "epoch": 0.000979666823772558, + "grad_norm": 0.3244236707687378, + "learning_rate": 1.4023372287145242e-05, + "loss": 0.5202, + "step": 2791 + }, + { + "epoch": 0.0009800178330250742, + "grad_norm": 0.36655086278915405, + "learning_rate": 1.3956594323873123e-05, + "loss": 0.4832, + "step": 2792 + }, + { + "epoch": 0.0009803688422775902, + "grad_norm": 0.3310711085796356, + "learning_rate": 1.3889816360601002e-05, + "loss": 0.4752, + "step": 2793 + }, + { + "epoch": 0.0009807198515301063, + "grad_norm": 0.3369671106338501, + "learning_rate": 1.3823038397328883e-05, + "loss": 0.5039, + "step": 2794 + }, + { + "epoch": 0.0009810708607826226, + "grad_norm": 0.3250376880168915, + "learning_rate": 1.3756260434056762e-05, + "loss": 0.4024, + "step": 2795 + }, + { + "epoch": 0.0009814218700351387, + "grad_norm": 0.34323790669441223, + "learning_rate": 1.3689482470784643e-05, + "loss": 0.4787, + "step": 2796 + }, + { + "epoch": 0.0009817728792876547, + "grad_norm": 0.2757573127746582, + "learning_rate": 1.3622704507512521e-05, + "loss": 0.4328, + "step": 2797 + }, + { + "epoch": 0.000982123888540171, + "grad_norm": 0.3542787432670593, + "learning_rate": 1.3555926544240401e-05, + "loss": 0.5056, + "step": 2798 + }, + { + "epoch": 0.000982474897792687, + "grad_norm": 0.3100093901157379, + "learning_rate": 1.3489148580968281e-05, + "loss": 0.43, + "step": 2799 + }, + { + "epoch": 0.0009828259070452032, + "grad_norm": 0.43093618750572205, + "learning_rate": 1.3422370617696161e-05, + "loss": 0.6221, + "step": 2800 + }, + { + "epoch": 0.0009831769162977194, + "grad_norm": 0.3257125914096832, + "learning_rate": 1.335559265442404e-05, + "loss": 0.6142, + "step": 2801 + }, + { + "epoch": 0.0009835279255502355, + "grad_norm": 0.33609306812286377, + "learning_rate": 1.3288814691151921e-05, + "loss": 0.48, + "step": 2802 + }, + { + "epoch": 0.0009838789348027516, + "grad_norm": 0.3490118682384491, + "learning_rate": 1.32220367278798e-05, + "loss": 0.3786, + "step": 2803 + }, + { + "epoch": 0.0009842299440552679, + "grad_norm": 0.29620417952537537, + "learning_rate": 1.3155258764607681e-05, + "loss": 0.4581, + "step": 2804 + }, + { + "epoch": 0.000984580953307784, + "grad_norm": 0.30852359533309937, + "learning_rate": 1.3088480801335559e-05, + "loss": 0.5131, + "step": 2805 + }, + { + "epoch": 0.0009849319625603, + "grad_norm": 0.31084272265434265, + "learning_rate": 1.302170283806344e-05, + "loss": 0.4977, + "step": 2806 + }, + { + "epoch": 0.0009852829718128163, + "grad_norm": 0.3435419201850891, + "learning_rate": 1.2954924874791319e-05, + "loss": 0.5823, + "step": 2807 + }, + { + "epoch": 0.0009856339810653324, + "grad_norm": 0.4358508884906769, + "learning_rate": 1.28881469115192e-05, + "loss": 0.5658, + "step": 2808 + }, + { + "epoch": 0.0009859849903178486, + "grad_norm": 0.391532838344574, + "learning_rate": 1.2821368948247079e-05, + "loss": 0.5459, + "step": 2809 + }, + { + "epoch": 0.0009863359995703647, + "grad_norm": 0.33119237422943115, + "learning_rate": 1.2754590984974959e-05, + "loss": 0.6244, + "step": 2810 + }, + { + "epoch": 0.0009866870088228808, + "grad_norm": 0.2695590853691101, + "learning_rate": 1.2687813021702839e-05, + "loss": 0.4516, + "step": 2811 + }, + { + "epoch": 0.000987038018075397, + "grad_norm": 0.3430154025554657, + "learning_rate": 1.2621035058430719e-05, + "loss": 0.5318, + "step": 2812 + }, + { + "epoch": 0.0009873890273279131, + "grad_norm": 0.3456191122531891, + "learning_rate": 1.2554257095158597e-05, + "loss": 0.5285, + "step": 2813 + }, + { + "epoch": 0.0009877400365804292, + "grad_norm": 0.3282442092895508, + "learning_rate": 1.2487479131886479e-05, + "loss": 0.5625, + "step": 2814 + }, + { + "epoch": 0.0009880910458329455, + "grad_norm": 0.3631054759025574, + "learning_rate": 1.2420701168614358e-05, + "loss": 0.5191, + "step": 2815 + }, + { + "epoch": 0.0009884420550854615, + "grad_norm": 0.3529251515865326, + "learning_rate": 1.2353923205342238e-05, + "loss": 0.4833, + "step": 2816 + }, + { + "epoch": 0.0009887930643379776, + "grad_norm": 0.30008605122566223, + "learning_rate": 1.2287145242070118e-05, + "loss": 0.5634, + "step": 2817 + }, + { + "epoch": 0.000989144073590494, + "grad_norm": 0.36905911564826965, + "learning_rate": 1.2220367278797998e-05, + "loss": 0.496, + "step": 2818 + }, + { + "epoch": 0.00098949508284301, + "grad_norm": 0.36410990357398987, + "learning_rate": 1.2153589315525878e-05, + "loss": 0.5299, + "step": 2819 + }, + { + "epoch": 0.000989846092095526, + "grad_norm": 0.3361034393310547, + "learning_rate": 1.2086811352253758e-05, + "loss": 0.4617, + "step": 2820 + }, + { + "epoch": 0.0009901971013480423, + "grad_norm": 0.3587837219238281, + "learning_rate": 1.2020033388981636e-05, + "loss": 0.4775, + "step": 2821 + }, + { + "epoch": 0.0009905481106005584, + "grad_norm": 0.35105395317077637, + "learning_rate": 1.1953255425709516e-05, + "loss": 0.4887, + "step": 2822 + }, + { + "epoch": 0.0009908991198530745, + "grad_norm": 0.24726948142051697, + "learning_rate": 1.1886477462437396e-05, + "loss": 0.3808, + "step": 2823 + }, + { + "epoch": 0.0009912501291055907, + "grad_norm": 0.3351190388202667, + "learning_rate": 1.1819699499165276e-05, + "loss": 0.4847, + "step": 2824 + }, + { + "epoch": 0.0009916011383581068, + "grad_norm": 0.35152459144592285, + "learning_rate": 1.1752921535893156e-05, + "loss": 0.5675, + "step": 2825 + }, + { + "epoch": 0.0009919521476106229, + "grad_norm": 0.28524544835090637, + "learning_rate": 1.1686143572621036e-05, + "loss": 0.3442, + "step": 2826 + }, + { + "epoch": 0.0009923031568631392, + "grad_norm": 0.28702452778816223, + "learning_rate": 1.1619365609348916e-05, + "loss": 0.4599, + "step": 2827 + }, + { + "epoch": 0.0009926541661156552, + "grad_norm": 0.35940173268318176, + "learning_rate": 1.1552587646076796e-05, + "loss": 0.481, + "step": 2828 + }, + { + "epoch": 0.0009930051753681713, + "grad_norm": 0.3729933798313141, + "learning_rate": 1.1485809682804676e-05, + "loss": 0.4886, + "step": 2829 + }, + { + "epoch": 0.0009933561846206876, + "grad_norm": 0.3298631012439728, + "learning_rate": 1.1419031719532556e-05, + "loss": 0.4997, + "step": 2830 + }, + { + "epoch": 0.0009937071938732037, + "grad_norm": 0.3539792597293854, + "learning_rate": 1.1352253756260436e-05, + "loss": 0.4885, + "step": 2831 + }, + { + "epoch": 0.00099405820312572, + "grad_norm": 0.3330310881137848, + "learning_rate": 1.1285475792988316e-05, + "loss": 0.4347, + "step": 2832 + }, + { + "epoch": 0.000994409212378236, + "grad_norm": 0.2996682822704315, + "learning_rate": 1.1218697829716194e-05, + "loss": 0.5527, + "step": 2833 + }, + { + "epoch": 0.000994760221630752, + "grad_norm": 0.30333733558654785, + "learning_rate": 1.1151919866444074e-05, + "loss": 0.3764, + "step": 2834 + }, + { + "epoch": 0.0009951112308832684, + "grad_norm": 0.31866851449012756, + "learning_rate": 1.1085141903171954e-05, + "loss": 0.3557, + "step": 2835 + }, + { + "epoch": 0.0009954622401357844, + "grad_norm": 0.43568530678749084, + "learning_rate": 1.1018363939899834e-05, + "loss": 0.6082, + "step": 2836 + }, + { + "epoch": 0.0009958132493883005, + "grad_norm": 0.3195807933807373, + "learning_rate": 1.0951585976627714e-05, + "loss": 0.5108, + "step": 2837 + }, + { + "epoch": 0.0009961642586408168, + "grad_norm": 0.320580393075943, + "learning_rate": 1.0884808013355593e-05, + "loss": 0.5151, + "step": 2838 + }, + { + "epoch": 0.0009965152678933329, + "grad_norm": 0.2912052273750305, + "learning_rate": 1.0818030050083473e-05, + "loss": 0.4537, + "step": 2839 + }, + { + "epoch": 0.000996866277145849, + "grad_norm": 0.33394452929496765, + "learning_rate": 1.0751252086811353e-05, + "loss": 0.4992, + "step": 2840 + }, + { + "epoch": 0.0009972172863983652, + "grad_norm": 0.33535537123680115, + "learning_rate": 1.0684474123539233e-05, + "loss": 0.4956, + "step": 2841 + }, + { + "epoch": 0.0009975682956508813, + "grad_norm": 0.3085540533065796, + "learning_rate": 1.0617696160267113e-05, + "loss": 0.4702, + "step": 2842 + }, + { + "epoch": 0.0009979193049033973, + "grad_norm": 0.31785404682159424, + "learning_rate": 1.0550918196994993e-05, + "loss": 0.5323, + "step": 2843 + }, + { + "epoch": 0.0009982703141559136, + "grad_norm": 0.45626768469810486, + "learning_rate": 1.0484140233722873e-05, + "loss": 0.5169, + "step": 2844 + }, + { + "epoch": 0.0009986213234084297, + "grad_norm": 0.32607361674308777, + "learning_rate": 1.0417362270450751e-05, + "loss": 0.4837, + "step": 2845 + }, + { + "epoch": 0.0009989723326609458, + "grad_norm": 0.29072248935699463, + "learning_rate": 1.0350584307178631e-05, + "loss": 0.479, + "step": 2846 + }, + { + "epoch": 0.000999323341913462, + "grad_norm": 0.31325575709342957, + "learning_rate": 1.0283806343906511e-05, + "loss": 0.5148, + "step": 2847 + }, + { + "epoch": 0.0009996743511659781, + "grad_norm": 0.316815048456192, + "learning_rate": 1.0217028380634391e-05, + "loss": 0.475, + "step": 2848 + }, + { + "epoch": 0.0010000253604184942, + "grad_norm": 0.3155490458011627, + "learning_rate": 1.0150250417362271e-05, + "loss": 0.5017, + "step": 2849 + }, + { + "epoch": 0.0010003763696710105, + "grad_norm": 0.29394692182540894, + "learning_rate": 1.0083472454090151e-05, + "loss": 0.4944, + "step": 2850 + }, + { + "epoch": 0.0010007273789235265, + "grad_norm": 0.3168147802352905, + "learning_rate": 1.001669449081803e-05, + "loss": 0.4903, + "step": 2851 + }, + { + "epoch": 0.0010010783881760428, + "grad_norm": 0.3451838493347168, + "learning_rate": 9.94991652754591e-06, + "loss": 0.5458, + "step": 2852 + }, + { + "epoch": 0.0010014293974285589, + "grad_norm": 0.30031347274780273, + "learning_rate": 9.88313856427379e-06, + "loss": 0.5271, + "step": 2853 + }, + { + "epoch": 0.001001780406681075, + "grad_norm": 0.2595888376235962, + "learning_rate": 9.81636060100167e-06, + "loss": 0.4184, + "step": 2854 + }, + { + "epoch": 0.0010021314159335912, + "grad_norm": 0.32942327857017517, + "learning_rate": 9.74958263772955e-06, + "loss": 0.4663, + "step": 2855 + }, + { + "epoch": 0.0010024824251861073, + "grad_norm": 0.3288143575191498, + "learning_rate": 9.68280467445743e-06, + "loss": 0.484, + "step": 2856 + }, + { + "epoch": 0.0010028334344386234, + "grad_norm": 0.3024934232234955, + "learning_rate": 9.616026711185309e-06, + "loss": 0.5248, + "step": 2857 + }, + { + "epoch": 0.0010031844436911397, + "grad_norm": 0.32177647948265076, + "learning_rate": 9.549248747913189e-06, + "loss": 0.4719, + "step": 2858 + }, + { + "epoch": 0.0010035354529436557, + "grad_norm": 0.3516426682472229, + "learning_rate": 9.482470784641069e-06, + "loss": 0.475, + "step": 2859 + }, + { + "epoch": 0.0010038864621961718, + "grad_norm": 0.3293197453022003, + "learning_rate": 9.415692821368949e-06, + "loss": 0.4472, + "step": 2860 + }, + { + "epoch": 0.001004237471448688, + "grad_norm": 0.3366454243659973, + "learning_rate": 9.348914858096828e-06, + "loss": 0.3976, + "step": 2861 + }, + { + "epoch": 0.0010045884807012042, + "grad_norm": 0.3678044378757477, + "learning_rate": 9.282136894824708e-06, + "loss": 0.5673, + "step": 2862 + }, + { + "epoch": 0.0010049394899537202, + "grad_norm": 0.31383898854255676, + "learning_rate": 9.215358931552588e-06, + "loss": 0.5399, + "step": 2863 + }, + { + "epoch": 0.0010052904992062365, + "grad_norm": 0.3226788640022278, + "learning_rate": 9.148580968280468e-06, + "loss": 0.4881, + "step": 2864 + }, + { + "epoch": 0.0010056415084587526, + "grad_norm": 0.4325566291809082, + "learning_rate": 9.081803005008348e-06, + "loss": 0.4844, + "step": 2865 + }, + { + "epoch": 0.0010059925177112686, + "grad_norm": 0.36546987295150757, + "learning_rate": 9.015025041736228e-06, + "loss": 0.6402, + "step": 2866 + }, + { + "epoch": 0.001006343526963785, + "grad_norm": 0.2988201975822449, + "learning_rate": 8.948247078464108e-06, + "loss": 0.5068, + "step": 2867 + }, + { + "epoch": 0.001006694536216301, + "grad_norm": 0.30769219994544983, + "learning_rate": 8.881469115191986e-06, + "loss": 0.5663, + "step": 2868 + }, + { + "epoch": 0.001007045545468817, + "grad_norm": 0.294279009103775, + "learning_rate": 8.814691151919866e-06, + "loss": 0.4915, + "step": 2869 + }, + { + "epoch": 0.0010073965547213334, + "grad_norm": 0.3285538852214813, + "learning_rate": 8.747913188647746e-06, + "loss": 0.5493, + "step": 2870 + }, + { + "epoch": 0.0010077475639738494, + "grad_norm": 0.29056650400161743, + "learning_rate": 8.681135225375626e-06, + "loss": 0.4991, + "step": 2871 + }, + { + "epoch": 0.0010080985732263657, + "grad_norm": 0.289212703704834, + "learning_rate": 8.614357262103506e-06, + "loss": 0.513, + "step": 2872 + }, + { + "epoch": 0.0010084495824788818, + "grad_norm": 0.32771703600883484, + "learning_rate": 8.547579298831386e-06, + "loss": 0.5085, + "step": 2873 + }, + { + "epoch": 0.0010088005917313978, + "grad_norm": 0.26033174991607666, + "learning_rate": 8.480801335559266e-06, + "loss": 0.4379, + "step": 2874 + }, + { + "epoch": 0.0010091516009839141, + "grad_norm": 0.3108559846878052, + "learning_rate": 8.414023372287146e-06, + "loss": 0.4433, + "step": 2875 + }, + { + "epoch": 0.0010095026102364302, + "grad_norm": 0.388600617647171, + "learning_rate": 8.347245409015026e-06, + "loss": 0.4849, + "step": 2876 + }, + { + "epoch": 0.0010098536194889463, + "grad_norm": 0.33916473388671875, + "learning_rate": 8.280467445742906e-06, + "loss": 0.4111, + "step": 2877 + }, + { + "epoch": 0.0010102046287414625, + "grad_norm": 0.2922442555427551, + "learning_rate": 8.213689482470786e-06, + "loss": 0.4301, + "step": 2878 + }, + { + "epoch": 0.0010105556379939786, + "grad_norm": 0.3101591169834137, + "learning_rate": 8.146911519198665e-06, + "loss": 0.545, + "step": 2879 + }, + { + "epoch": 0.0010109066472464947, + "grad_norm": 0.3512362539768219, + "learning_rate": 8.080133555926544e-06, + "loss": 0.4571, + "step": 2880 + }, + { + "epoch": 0.001011257656499011, + "grad_norm": 0.29563796520233154, + "learning_rate": 8.013355592654424e-06, + "loss": 0.4759, + "step": 2881 + }, + { + "epoch": 0.001011608665751527, + "grad_norm": 0.3032160997390747, + "learning_rate": 7.946577629382304e-06, + "loss": 0.5278, + "step": 2882 + }, + { + "epoch": 0.001011959675004043, + "grad_norm": 0.3556135892868042, + "learning_rate": 7.879799666110184e-06, + "loss": 0.5723, + "step": 2883 + }, + { + "epoch": 0.0010123106842565594, + "grad_norm": 0.30408796668052673, + "learning_rate": 7.813021702838063e-06, + "loss": 0.5304, + "step": 2884 + }, + { + "epoch": 0.0010126616935090755, + "grad_norm": 0.34963956475257874, + "learning_rate": 7.746243739565943e-06, + "loss": 0.425, + "step": 2885 + }, + { + "epoch": 0.0010130127027615915, + "grad_norm": 0.2958761751651764, + "learning_rate": 7.679465776293823e-06, + "loss": 0.5885, + "step": 2886 + }, + { + "epoch": 0.0010133637120141078, + "grad_norm": 0.3414060175418854, + "learning_rate": 7.612687813021703e-06, + "loss": 0.4816, + "step": 2887 + }, + { + "epoch": 0.0010137147212666239, + "grad_norm": 0.2990204691886902, + "learning_rate": 7.545909849749583e-06, + "loss": 0.5417, + "step": 2888 + }, + { + "epoch": 0.00101406573051914, + "grad_norm": 0.3612974286079407, + "learning_rate": 7.479131886477462e-06, + "loss": 0.34, + "step": 2889 + }, + { + "epoch": 0.0010144167397716562, + "grad_norm": 0.35951128602027893, + "learning_rate": 7.412353923205342e-06, + "loss": 0.5276, + "step": 2890 + }, + { + "epoch": 0.0010147677490241723, + "grad_norm": 0.30055317282676697, + "learning_rate": 7.345575959933222e-06, + "loss": 0.4771, + "step": 2891 + }, + { + "epoch": 0.0010151187582766886, + "grad_norm": 0.3545505702495575, + "learning_rate": 7.278797996661102e-06, + "loss": 0.5787, + "step": 2892 + }, + { + "epoch": 0.0010154697675292047, + "grad_norm": 0.3374285399913788, + "learning_rate": 7.212020033388982e-06, + "loss": 0.4499, + "step": 2893 + }, + { + "epoch": 0.0010158207767817207, + "grad_norm": 0.3583277761936188, + "learning_rate": 7.145242070116862e-06, + "loss": 0.4709, + "step": 2894 + }, + { + "epoch": 0.001016171786034237, + "grad_norm": 0.3155690133571625, + "learning_rate": 7.078464106844741e-06, + "loss": 0.3971, + "step": 2895 + }, + { + "epoch": 0.001016522795286753, + "grad_norm": 0.30925673246383667, + "learning_rate": 7.011686143572621e-06, + "loss": 0.5169, + "step": 2896 + }, + { + "epoch": 0.0010168738045392691, + "grad_norm": 0.27813151478767395, + "learning_rate": 6.944908180300501e-06, + "loss": 0.4273, + "step": 2897 + }, + { + "epoch": 0.0010172248137917854, + "grad_norm": 0.3304729461669922, + "learning_rate": 6.878130217028381e-06, + "loss": 0.4964, + "step": 2898 + }, + { + "epoch": 0.0010175758230443015, + "grad_norm": 0.3034217655658722, + "learning_rate": 6.811352253756261e-06, + "loss": 0.5771, + "step": 2899 + }, + { + "epoch": 0.0010179268322968176, + "grad_norm": 0.2880434989929199, + "learning_rate": 6.744574290484141e-06, + "loss": 0.4269, + "step": 2900 + }, + { + "epoch": 0.0010182778415493338, + "grad_norm": 0.33287328481674194, + "learning_rate": 6.67779632721202e-06, + "loss": 0.533, + "step": 2901 + }, + { + "epoch": 0.00101862885080185, + "grad_norm": 0.320425420999527, + "learning_rate": 6.6110183639399e-06, + "loss": 0.5449, + "step": 2902 + }, + { + "epoch": 0.001018979860054366, + "grad_norm": 0.3151349425315857, + "learning_rate": 6.5442404006677796e-06, + "loss": 0.4434, + "step": 2903 + }, + { + "epoch": 0.0010193308693068823, + "grad_norm": 0.35453543066978455, + "learning_rate": 6.4774624373956595e-06, + "loss": 0.5613, + "step": 2904 + }, + { + "epoch": 0.0010196818785593983, + "grad_norm": 0.31413450837135315, + "learning_rate": 6.4106844741235394e-06, + "loss": 0.5319, + "step": 2905 + }, + { + "epoch": 0.0010200328878119144, + "grad_norm": 0.3112393915653229, + "learning_rate": 6.343906510851419e-06, + "loss": 0.3744, + "step": 2906 + }, + { + "epoch": 0.0010203838970644307, + "grad_norm": 0.3204953968524933, + "learning_rate": 6.2771285475792984e-06, + "loss": 0.5078, + "step": 2907 + }, + { + "epoch": 0.0010207349063169468, + "grad_norm": 0.2905207574367523, + "learning_rate": 6.210350584307179e-06, + "loss": 0.5159, + "step": 2908 + }, + { + "epoch": 0.0010210859155694628, + "grad_norm": 0.3281486928462982, + "learning_rate": 6.143572621035059e-06, + "loss": 0.4634, + "step": 2909 + }, + { + "epoch": 0.0010214369248219791, + "grad_norm": 0.35208168625831604, + "learning_rate": 6.076794657762939e-06, + "loss": 0.5245, + "step": 2910 + }, + { + "epoch": 0.0010217879340744952, + "grad_norm": 0.3454799950122833, + "learning_rate": 6.010016694490818e-06, + "loss": 0.47, + "step": 2911 + }, + { + "epoch": 0.0010221389433270115, + "grad_norm": 0.3289708197116852, + "learning_rate": 5.943238731218698e-06, + "loss": 0.4518, + "step": 2912 + }, + { + "epoch": 0.0010224899525795275, + "grad_norm": 0.3424263894557953, + "learning_rate": 5.876460767946578e-06, + "loss": 0.5117, + "step": 2913 + }, + { + "epoch": 0.0010228409618320436, + "grad_norm": 0.30878815054893494, + "learning_rate": 5.809682804674458e-06, + "loss": 0.5874, + "step": 2914 + }, + { + "epoch": 0.0010231919710845599, + "grad_norm": 0.33106786012649536, + "learning_rate": 5.742904841402338e-06, + "loss": 0.4912, + "step": 2915 + }, + { + "epoch": 0.001023542980337076, + "grad_norm": 0.3185153007507324, + "learning_rate": 5.676126878130218e-06, + "loss": 0.5368, + "step": 2916 + }, + { + "epoch": 0.001023893989589592, + "grad_norm": 0.3170020282268524, + "learning_rate": 5.609348914858097e-06, + "loss": 0.5092, + "step": 2917 + }, + { + "epoch": 0.0010242449988421083, + "grad_norm": 0.318539559841156, + "learning_rate": 5.542570951585977e-06, + "loss": 0.5921, + "step": 2918 + }, + { + "epoch": 0.0010245960080946244, + "grad_norm": 0.35273832082748413, + "learning_rate": 5.475792988313857e-06, + "loss": 0.5403, + "step": 2919 + }, + { + "epoch": 0.0010249470173471404, + "grad_norm": 0.36796239018440247, + "learning_rate": 5.409015025041737e-06, + "loss": 0.5159, + "step": 2920 + }, + { + "epoch": 0.0010252980265996567, + "grad_norm": 0.33227989077568054, + "learning_rate": 5.342237061769617e-06, + "loss": 0.5127, + "step": 2921 + }, + { + "epoch": 0.0010256490358521728, + "grad_norm": 0.3307982385158539, + "learning_rate": 5.2754590984974965e-06, + "loss": 0.5452, + "step": 2922 + }, + { + "epoch": 0.0010260000451046889, + "grad_norm": 0.3397897779941559, + "learning_rate": 5.208681135225376e-06, + "loss": 0.4151, + "step": 2923 + }, + { + "epoch": 0.0010263510543572052, + "grad_norm": 0.3205145001411438, + "learning_rate": 5.1419031719532556e-06, + "loss": 0.4546, + "step": 2924 + }, + { + "epoch": 0.0010267020636097212, + "grad_norm": 0.3622604310512543, + "learning_rate": 5.0751252086811355e-06, + "loss": 0.5765, + "step": 2925 + }, + { + "epoch": 0.0010270530728622373, + "grad_norm": 0.33206772804260254, + "learning_rate": 5.008347245409015e-06, + "loss": 0.5126, + "step": 2926 + }, + { + "epoch": 0.0010274040821147536, + "grad_norm": 0.3374924063682556, + "learning_rate": 4.941569282136895e-06, + "loss": 0.451, + "step": 2927 + }, + { + "epoch": 0.0010277550913672696, + "grad_norm": 0.3394838571548462, + "learning_rate": 4.874791318864775e-06, + "loss": 0.5149, + "step": 2928 + }, + { + "epoch": 0.0010281061006197857, + "grad_norm": 0.31065940856933594, + "learning_rate": 4.808013355592654e-06, + "loss": 0.5641, + "step": 2929 + }, + { + "epoch": 0.001028457109872302, + "grad_norm": 0.3824044466018677, + "learning_rate": 4.741235392320534e-06, + "loss": 0.5076, + "step": 2930 + }, + { + "epoch": 0.001028808119124818, + "grad_norm": 0.3511646091938019, + "learning_rate": 4.674457429048414e-06, + "loss": 0.4584, + "step": 2931 + }, + { + "epoch": 0.0010291591283773343, + "grad_norm": 0.29822078347206116, + "learning_rate": 4.607679465776294e-06, + "loss": 0.5395, + "step": 2932 + }, + { + "epoch": 0.0010295101376298504, + "grad_norm": 0.30549919605255127, + "learning_rate": 4.540901502504174e-06, + "loss": 0.5105, + "step": 2933 + }, + { + "epoch": 0.0010298611468823665, + "grad_norm": 0.34264084696769714, + "learning_rate": 4.474123539232054e-06, + "loss": 0.4386, + "step": 2934 + }, + { + "epoch": 0.0010302121561348828, + "grad_norm": 0.33599743247032166, + "learning_rate": 4.407345575959933e-06, + "loss": 0.4366, + "step": 2935 + }, + { + "epoch": 0.0010305631653873988, + "grad_norm": 0.2982969880104065, + "learning_rate": 4.340567612687813e-06, + "loss": 0.4461, + "step": 2936 + }, + { + "epoch": 0.001030914174639915, + "grad_norm": 0.33507707715034485, + "learning_rate": 4.273789649415693e-06, + "loss": 0.4071, + "step": 2937 + }, + { + "epoch": 0.0010312651838924312, + "grad_norm": 0.27803149819374084, + "learning_rate": 4.207011686143573e-06, + "loss": 0.4466, + "step": 2938 + }, + { + "epoch": 0.0010316161931449473, + "grad_norm": 0.3152257204055786, + "learning_rate": 4.140233722871453e-06, + "loss": 0.4718, + "step": 2939 + }, + { + "epoch": 0.0010319672023974633, + "grad_norm": 0.3316212594509125, + "learning_rate": 4.073455759599333e-06, + "loss": 0.4341, + "step": 2940 + }, + { + "epoch": 0.0010323182116499796, + "grad_norm": 0.32193487882614136, + "learning_rate": 4.006677796327212e-06, + "loss": 0.4467, + "step": 2941 + }, + { + "epoch": 0.0010326692209024957, + "grad_norm": 0.3784795105457306, + "learning_rate": 3.939899833055092e-06, + "loss": 0.3846, + "step": 2942 + }, + { + "epoch": 0.0010330202301550117, + "grad_norm": 0.34764155745506287, + "learning_rate": 3.873121869782972e-06, + "loss": 0.49, + "step": 2943 + }, + { + "epoch": 0.001033371239407528, + "grad_norm": 0.2982045114040375, + "learning_rate": 3.8063439065108516e-06, + "loss": 0.4722, + "step": 2944 + }, + { + "epoch": 0.001033722248660044, + "grad_norm": 0.31696122884750366, + "learning_rate": 3.739565943238731e-06, + "loss": 0.5707, + "step": 2945 + }, + { + "epoch": 0.0010340732579125602, + "grad_norm": 0.3777923285961151, + "learning_rate": 3.672787979966611e-06, + "loss": 0.5166, + "step": 2946 + }, + { + "epoch": 0.0010344242671650765, + "grad_norm": 0.3396044373512268, + "learning_rate": 3.606010016694491e-06, + "loss": 0.4944, + "step": 2947 + }, + { + "epoch": 0.0010347752764175925, + "grad_norm": 0.3235200047492981, + "learning_rate": 3.5392320534223705e-06, + "loss": 0.5703, + "step": 2948 + }, + { + "epoch": 0.0010351262856701086, + "grad_norm": 0.40763989090919495, + "learning_rate": 3.4724540901502504e-06, + "loss": 0.5539, + "step": 2949 + }, + { + "epoch": 0.0010354772949226249, + "grad_norm": 0.26959654688835144, + "learning_rate": 3.4056761268781303e-06, + "loss": 0.463, + "step": 2950 + }, + { + "epoch": 0.001035828304175141, + "grad_norm": 0.3275678753852844, + "learning_rate": 3.33889816360601e-06, + "loss": 0.4773, + "step": 2951 + }, + { + "epoch": 0.0010361793134276572, + "grad_norm": 0.3292611241340637, + "learning_rate": 3.2721202003338898e-06, + "loss": 0.5536, + "step": 2952 + }, + { + "epoch": 0.0010365303226801733, + "grad_norm": 0.331476092338562, + "learning_rate": 3.2053422370617697e-06, + "loss": 0.5231, + "step": 2953 + }, + { + "epoch": 0.0010368813319326894, + "grad_norm": 0.35063010454177856, + "learning_rate": 3.1385642737896492e-06, + "loss": 0.4525, + "step": 2954 + }, + { + "epoch": 0.0010372323411852056, + "grad_norm": 0.33868759870529175, + "learning_rate": 3.0717863105175296e-06, + "loss": 0.4964, + "step": 2955 + }, + { + "epoch": 0.0010375833504377217, + "grad_norm": 0.3151177763938904, + "learning_rate": 3.005008347245409e-06, + "loss": 0.4795, + "step": 2956 + }, + { + "epoch": 0.0010379343596902378, + "grad_norm": 0.335718035697937, + "learning_rate": 2.938230383973289e-06, + "loss": 0.5309, + "step": 2957 + }, + { + "epoch": 0.001038285368942754, + "grad_norm": 0.33830276131629944, + "learning_rate": 2.871452420701169e-06, + "loss": 0.432, + "step": 2958 + }, + { + "epoch": 0.0010386363781952701, + "grad_norm": 0.3564944267272949, + "learning_rate": 2.8046744574290484e-06, + "loss": 0.3739, + "step": 2959 + }, + { + "epoch": 0.0010389873874477862, + "grad_norm": 0.36804574728012085, + "learning_rate": 2.7378964941569284e-06, + "loss": 0.5469, + "step": 2960 + }, + { + "epoch": 0.0010393383967003025, + "grad_norm": 0.29244372248649597, + "learning_rate": 2.6711185308848083e-06, + "loss": 0.4615, + "step": 2961 + }, + { + "epoch": 0.0010396894059528186, + "grad_norm": 0.3378223478794098, + "learning_rate": 2.604340567612688e-06, + "loss": 0.4586, + "step": 2962 + }, + { + "epoch": 0.0010400404152053346, + "grad_norm": 0.3950120508670807, + "learning_rate": 2.5375626043405677e-06, + "loss": 0.551, + "step": 2963 + }, + { + "epoch": 0.001040391424457851, + "grad_norm": 0.32888704538345337, + "learning_rate": 2.4707846410684477e-06, + "loss": 0.5065, + "step": 2964 + }, + { + "epoch": 0.001040742433710367, + "grad_norm": 0.29158586263656616, + "learning_rate": 2.404006677796327e-06, + "loss": 0.5324, + "step": 2965 + }, + { + "epoch": 0.001041093442962883, + "grad_norm": 0.2751038670539856, + "learning_rate": 2.337228714524207e-06, + "loss": 0.4314, + "step": 2966 + }, + { + "epoch": 0.0010414444522153993, + "grad_norm": 0.3558200001716614, + "learning_rate": 2.270450751252087e-06, + "loss": 0.5264, + "step": 2967 + }, + { + "epoch": 0.0010417954614679154, + "grad_norm": 0.2883598208427429, + "learning_rate": 2.2036727879799665e-06, + "loss": 0.5187, + "step": 2968 + }, + { + "epoch": 0.0010421464707204315, + "grad_norm": 0.2950475811958313, + "learning_rate": 2.1368948247078465e-06, + "loss": 0.4762, + "step": 2969 + }, + { + "epoch": 0.0010424974799729478, + "grad_norm": 0.3093850314617157, + "learning_rate": 2.0701168614357264e-06, + "loss": 0.5212, + "step": 2970 + }, + { + "epoch": 0.0010428484892254638, + "grad_norm": 0.33225756883621216, + "learning_rate": 2.003338898163606e-06, + "loss": 0.4813, + "step": 2971 + }, + { + "epoch": 0.0010431994984779801, + "grad_norm": 0.37023112177848816, + "learning_rate": 1.936560934891486e-06, + "loss": 0.5312, + "step": 2972 + }, + { + "epoch": 0.0010435505077304962, + "grad_norm": 0.31859534978866577, + "learning_rate": 1.8697829716193656e-06, + "loss": 0.4767, + "step": 2973 + }, + { + "epoch": 0.0010439015169830122, + "grad_norm": 0.3548760712146759, + "learning_rate": 1.8030050083472455e-06, + "loss": 0.5868, + "step": 2974 + }, + { + "epoch": 0.0010442525262355285, + "grad_norm": 0.35896405577659607, + "learning_rate": 1.7362270450751252e-06, + "loss": 0.4882, + "step": 2975 + }, + { + "epoch": 0.0010446035354880446, + "grad_norm": 0.3278505802154541, + "learning_rate": 1.669449081803005e-06, + "loss": 0.5192, + "step": 2976 + }, + { + "epoch": 0.0010449545447405607, + "grad_norm": 0.2957611382007599, + "learning_rate": 1.6026711185308849e-06, + "loss": 0.4579, + "step": 2977 + }, + { + "epoch": 0.001045305553993077, + "grad_norm": 0.3424510657787323, + "learning_rate": 1.5358931552587648e-06, + "loss": 0.5611, + "step": 2978 + }, + { + "epoch": 0.001045656563245593, + "grad_norm": 0.36232849955558777, + "learning_rate": 1.4691151919866445e-06, + "loss": 0.5603, + "step": 2979 + }, + { + "epoch": 0.001046007572498109, + "grad_norm": 0.3009660840034485, + "learning_rate": 1.4023372287145242e-06, + "loss": 0.6034, + "step": 2980 + }, + { + "epoch": 0.0010463585817506254, + "grad_norm": 0.36870652437210083, + "learning_rate": 1.3355592654424042e-06, + "loss": 0.5196, + "step": 2981 + }, + { + "epoch": 0.0010467095910031414, + "grad_norm": 0.273653507232666, + "learning_rate": 1.2687813021702839e-06, + "loss": 0.5133, + "step": 2982 + }, + { + "epoch": 0.0010470606002556575, + "grad_norm": 0.32439151406288147, + "learning_rate": 1.2020033388981636e-06, + "loss": 0.5681, + "step": 2983 + }, + { + "epoch": 0.0010474116095081738, + "grad_norm": 0.2973668873310089, + "learning_rate": 1.1352253756260435e-06, + "loss": 0.5235, + "step": 2984 + }, + { + "epoch": 0.0010477626187606899, + "grad_norm": 0.3241012990474701, + "learning_rate": 1.0684474123539232e-06, + "loss": 0.5043, + "step": 2985 + }, + { + "epoch": 0.001048113628013206, + "grad_norm": 0.31330305337905884, + "learning_rate": 1.001669449081803e-06, + "loss": 0.3919, + "step": 2986 + }, + { + "epoch": 0.0010484646372657222, + "grad_norm": 0.3226507306098938, + "learning_rate": 9.348914858096828e-07, + "loss": 0.5759, + "step": 2987 + }, + { + "epoch": 0.0010488156465182383, + "grad_norm": 0.37003642320632935, + "learning_rate": 8.681135225375626e-07, + "loss": 0.6293, + "step": 2988 + }, + { + "epoch": 0.0010491666557707544, + "grad_norm": 0.3701813220977783, + "learning_rate": 8.013355592654424e-07, + "loss": 0.5325, + "step": 2989 + }, + { + "epoch": 0.0010495176650232706, + "grad_norm": 0.3282783329486847, + "learning_rate": 7.345575959933223e-07, + "loss": 0.6135, + "step": 2990 + }, + { + "epoch": 0.0010498686742757867, + "grad_norm": 0.31943637132644653, + "learning_rate": 6.677796327212021e-07, + "loss": 0.3762, + "step": 2991 + }, + { + "epoch": 0.001050219683528303, + "grad_norm": 0.3415220081806183, + "learning_rate": 6.010016694490818e-07, + "loss": 0.5115, + "step": 2992 + }, + { + "epoch": 0.001050570692780819, + "grad_norm": 0.3445815443992615, + "learning_rate": 5.342237061769616e-07, + "loss": 0.4507, + "step": 2993 + }, + { + "epoch": 0.0010509217020333351, + "grad_norm": 0.34057167172431946, + "learning_rate": 4.674457429048414e-07, + "loss": 0.5771, + "step": 2994 + }, + { + "epoch": 0.0010512727112858514, + "grad_norm": 0.3435412049293518, + "learning_rate": 4.006677796327212e-07, + "loss": 0.5264, + "step": 2995 + }, + { + "epoch": 0.0010516237205383675, + "grad_norm": 0.41883835196495056, + "learning_rate": 3.3388981636060104e-07, + "loss": 0.4895, + "step": 2996 + }, + { + "epoch": 0.0010519747297908835, + "grad_norm": 0.31597742438316345, + "learning_rate": 2.671118530884808e-07, + "loss": 0.4767, + "step": 2997 + }, + { + "epoch": 0.0010523257390433998, + "grad_norm": 0.3613269329071045, + "learning_rate": 2.003338898163606e-07, + "loss": 0.5164, + "step": 2998 + }, + { + "epoch": 0.001052676748295916, + "grad_norm": 0.3645285964012146, + "learning_rate": 1.335559265442404e-07, + "loss": 0.4737, + "step": 2999 + }, + { + "epoch": 0.001053027757548432, + "grad_norm": 0.3003588318824768, + "learning_rate": 6.67779632721202e-08, + "loss": 0.3335, + "step": 3000 + } + ], + "logging_steps": 1, + "max_steps": 3000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1543546971162214e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/marques/outputs/checkpoint-3000/training_args.bin b/marques/outputs/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd0ba520c124bb1ece608079704fa15e0236be45 --- /dev/null +++ b/marques/outputs/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09362706a3d58d219e41be1682b770b8f5069fcd630f7dbcadb71e4d4ce8859b +size 6289 diff --git a/marques/outputs/checkpoint-500/README.md b/marques/outputs/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d90a96dfe2e51221657a6e936d376789e21081f9 --- /dev/null +++ b/marques/outputs/checkpoint-500/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/marques/outputs/checkpoint-500/adapter_config.json b/marques/outputs/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9930a191a30254256c9550b1bdffa58b8d7aee8 --- /dev/null +++ b/marques/outputs/checkpoint-500/adapter_config.json @@ -0,0 +1,50 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "LlamaForCausalLM", + "parent_library": "transformers.models.llama.modeling_llama", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "gate_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/marques/outputs/checkpoint-500/adapter_model.safetensors b/marques/outputs/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7eb4f8c42fcf948c1e158b4243c4aa05521f43e0 --- /dev/null +++ b/marques/outputs/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3abf895b9b57b6bdb4ee0f24308a722e43764da39a44557cfcee22a357e88dcd +size 167832240 diff --git a/marques/outputs/checkpoint-500/optimizer.pt b/marques/outputs/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..378a3288b9649c153c1e5057df91014568e8fc3e --- /dev/null +++ b/marques/outputs/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8782e4eb9a8034fbe1dfb686c0ce167e62ba6c2921372f182de7d8a0cb019623 +size 85724133 diff --git a/marques/outputs/checkpoint-500/rng_state.pth b/marques/outputs/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0 --- /dev/null +++ b/marques/outputs/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33 +size 14645 diff --git a/marques/outputs/checkpoint-500/scheduler.pt b/marques/outputs/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcb70495c472b0e8e4a1075c6dfc1f39fc89fedc --- /dev/null +++ b/marques/outputs/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cfa2dd06c29118fdd59db9dfda853dae5b1aaa493534c3ee2b09dbf4a23d9b3 +size 1465 diff --git a/marques/outputs/checkpoint-500/special_tokens_map.json b/marques/outputs/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68b10c7f0a479eae0c358eac6a14959b3f9acdf1 --- /dev/null +++ b/marques/outputs/checkpoint-500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/marques/outputs/checkpoint-500/tokenizer.json b/marques/outputs/checkpoint-500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/marques/outputs/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/marques/outputs/checkpoint-500/tokenizer_config.json b/marques/outputs/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..92b1d94e894e5474ebea1d171e14751be79ca3e5 --- /dev/null +++ b/marques/outputs/checkpoint-500/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/marques/outputs/checkpoint-500/trainer_state.json b/marques/outputs/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8016cabeea1a09486c264808c1fd261c97698630 --- /dev/null +++ b/marques/outputs/checkpoint-500/trainer_state.json @@ -0,0 +1,3534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.000175504626258072, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.5100925251614403e-07, + "grad_norm": 0.53782719373703, + "learning_rate": 0.0, + "loss": 0.5835, + "step": 1 + }, + { + "epoch": 7.020185050322881e-07, + "grad_norm": 0.6201626062393188, + "learning_rate": 4e-05, + "loss": 0.5242, + "step": 2 + }, + { + "epoch": 1.053027757548432e-06, + "grad_norm": 0.7571901082992554, + "learning_rate": 8e-05, + "loss": 0.5642, + "step": 3 + }, + { + "epoch": 1.4040370100645761e-06, + "grad_norm": 0.5588695406913757, + "learning_rate": 0.00012, + "loss": 0.4859, + "step": 4 + }, + { + "epoch": 1.75504626258072e-06, + "grad_norm": 0.7208331227302551, + "learning_rate": 0.00016, + "loss": 0.4645, + "step": 5 + }, + { + "epoch": 2.106055515096864e-06, + "grad_norm": 0.8169743418693542, + "learning_rate": 0.0002, + "loss": 0.3702, + "step": 6 + }, + { + "epoch": 2.4570647676130083e-06, + "grad_norm": 2.051530599594116, + "learning_rate": 0.00019993322203672788, + "loss": 0.4856, + "step": 7 + }, + { + "epoch": 2.8080740201291522e-06, + "grad_norm": 1.2310550212860107, + "learning_rate": 0.00019986644407345576, + "loss": 0.5192, + "step": 8 + }, + { + "epoch": 3.1590832726452962e-06, + "grad_norm": 1.612046241760254, + "learning_rate": 0.00019979966611018366, + "loss": 0.4719, + "step": 9 + }, + { + "epoch": 3.51009252516144e-06, + "grad_norm": 1.4484680891036987, + "learning_rate": 0.00019973288814691153, + "loss": 0.4416, + "step": 10 + }, + { + "epoch": 3.861101777677584e-06, + "grad_norm": 1.4529719352722168, + "learning_rate": 0.0001996661101836394, + "loss": 0.6275, + "step": 11 + }, + { + "epoch": 4.212111030193728e-06, + "grad_norm": 1.3963671922683716, + "learning_rate": 0.00019959933222036728, + "loss": 0.5874, + "step": 12 + }, + { + "epoch": 4.563120282709872e-06, + "grad_norm": 1.4744153022766113, + "learning_rate": 0.00019953255425709515, + "loss": 0.6422, + "step": 13 + }, + { + "epoch": 4.9141295352260165e-06, + "grad_norm": 0.8640050888061523, + "learning_rate": 0.00019946577629382305, + "loss": 0.5064, + "step": 14 + }, + { + "epoch": 5.26513878774216e-06, + "grad_norm": 0.7137419581413269, + "learning_rate": 0.00019939899833055092, + "loss": 0.5218, + "step": 15 + }, + { + "epoch": 5.6161480402583045e-06, + "grad_norm": 0.7769026756286621, + "learning_rate": 0.00019933222036727882, + "loss": 0.5377, + "step": 16 + }, + { + "epoch": 5.967157292774448e-06, + "grad_norm": 0.7558479905128479, + "learning_rate": 0.0001992654424040067, + "loss": 0.5054, + "step": 17 + }, + { + "epoch": 6.3181665452905924e-06, + "grad_norm": 0.8237054347991943, + "learning_rate": 0.00019919866444073457, + "loss": 0.5094, + "step": 18 + }, + { + "epoch": 6.669175797806736e-06, + "grad_norm": 1.0375059843063354, + "learning_rate": 0.00019913188647746244, + "loss": 0.5751, + "step": 19 + }, + { + "epoch": 7.02018505032288e-06, + "grad_norm": 1.075869083404541, + "learning_rate": 0.00019906510851419034, + "loss": 0.594, + "step": 20 + }, + { + "epoch": 7.371194302839024e-06, + "grad_norm": 0.8041358590126038, + "learning_rate": 0.00019899833055091822, + "loss": 0.553, + "step": 21 + }, + { + "epoch": 7.722203555355168e-06, + "grad_norm": 0.9264736771583557, + "learning_rate": 0.0001989315525876461, + "loss": 0.5555, + "step": 22 + }, + { + "epoch": 8.073212807871313e-06, + "grad_norm": 1.0074031352996826, + "learning_rate": 0.00019886477462437396, + "loss": 0.5353, + "step": 23 + }, + { + "epoch": 8.424222060387455e-06, + "grad_norm": 0.8725020885467529, + "learning_rate": 0.00019879799666110183, + "loss": 0.5557, + "step": 24 + }, + { + "epoch": 8.7752313129036e-06, + "grad_norm": 0.8867582678794861, + "learning_rate": 0.00019873121869782974, + "loss": 0.5992, + "step": 25 + }, + { + "epoch": 9.126240565419744e-06, + "grad_norm": 0.9235608577728271, + "learning_rate": 0.0001986644407345576, + "loss": 0.516, + "step": 26 + }, + { + "epoch": 9.477249817935889e-06, + "grad_norm": 0.8653218150138855, + "learning_rate": 0.00019859766277128548, + "loss": 0.5249, + "step": 27 + }, + { + "epoch": 9.828259070452033e-06, + "grad_norm": 0.7479026913642883, + "learning_rate": 0.00019853088480801335, + "loss": 0.5037, + "step": 28 + }, + { + "epoch": 1.0179268322968176e-05, + "grad_norm": 0.9531452655792236, + "learning_rate": 0.00019846410684474123, + "loss": 0.5896, + "step": 29 + }, + { + "epoch": 1.053027757548432e-05, + "grad_norm": 1.1012492179870605, + "learning_rate": 0.00019839732888146913, + "loss": 0.5139, + "step": 30 + }, + { + "epoch": 1.0881286828000465e-05, + "grad_norm": 1.0198887586593628, + "learning_rate": 0.000198330550918197, + "loss": 0.5587, + "step": 31 + }, + { + "epoch": 1.1232296080516609e-05, + "grad_norm": 0.8081266283988953, + "learning_rate": 0.00019826377295492487, + "loss": 0.4762, + "step": 32 + }, + { + "epoch": 1.1583305333032752e-05, + "grad_norm": 1.1965891122817993, + "learning_rate": 0.00019819699499165277, + "loss": 0.5719, + "step": 33 + }, + { + "epoch": 1.1934314585548896e-05, + "grad_norm": 1.214903473854065, + "learning_rate": 0.00019813021702838065, + "loss": 0.5756, + "step": 34 + }, + { + "epoch": 1.228532383806504e-05, + "grad_norm": 0.8360006213188171, + "learning_rate": 0.00019806343906510852, + "loss": 0.5688, + "step": 35 + }, + { + "epoch": 1.2636333090581185e-05, + "grad_norm": 0.8328489065170288, + "learning_rate": 0.00019799666110183642, + "loss": 0.6418, + "step": 36 + }, + { + "epoch": 1.298734234309733e-05, + "grad_norm": 1.1427714824676514, + "learning_rate": 0.0001979298831385643, + "loss": 0.6531, + "step": 37 + }, + { + "epoch": 1.3338351595613472e-05, + "grad_norm": 1.0145376920700073, + "learning_rate": 0.00019786310517529217, + "loss": 0.6473, + "step": 38 + }, + { + "epoch": 1.3689360848129616e-05, + "grad_norm": 0.8427861928939819, + "learning_rate": 0.00019779632721202004, + "loss": 0.5882, + "step": 39 + }, + { + "epoch": 1.404037010064576e-05, + "grad_norm": 0.8792659044265747, + "learning_rate": 0.00019772954924874791, + "loss": 0.608, + "step": 40 + }, + { + "epoch": 1.4391379353161905e-05, + "grad_norm": 0.9338463544845581, + "learning_rate": 0.00019766277128547581, + "loss": 0.7118, + "step": 41 + }, + { + "epoch": 1.4742388605678048e-05, + "grad_norm": 0.7554420232772827, + "learning_rate": 0.0001975959933222037, + "loss": 0.5898, + "step": 42 + }, + { + "epoch": 1.5093397858194192e-05, + "grad_norm": 0.7700084447860718, + "learning_rate": 0.00019752921535893156, + "loss": 0.6466, + "step": 43 + }, + { + "epoch": 1.5444407110710337e-05, + "grad_norm": 0.8639333248138428, + "learning_rate": 0.00019746243739565943, + "loss": 0.7253, + "step": 44 + }, + { + "epoch": 1.579541636322648e-05, + "grad_norm": 0.7760612964630127, + "learning_rate": 0.0001973956594323873, + "loss": 0.7099, + "step": 45 + }, + { + "epoch": 1.6146425615742626e-05, + "grad_norm": 0.7319066524505615, + "learning_rate": 0.0001973288814691152, + "loss": 0.6664, + "step": 46 + }, + { + "epoch": 1.6497434868258768e-05, + "grad_norm": 0.7557100057601929, + "learning_rate": 0.00019726210350584308, + "loss": 0.6318, + "step": 47 + }, + { + "epoch": 1.684844412077491e-05, + "grad_norm": 0.6420389413833618, + "learning_rate": 0.00019719532554257095, + "loss": 0.6688, + "step": 48 + }, + { + "epoch": 1.7199453373291057e-05, + "grad_norm": 0.660383939743042, + "learning_rate": 0.00019712854757929883, + "loss": 0.6204, + "step": 49 + }, + { + "epoch": 1.75504626258072e-05, + "grad_norm": 0.5614909529685974, + "learning_rate": 0.00019706176961602673, + "loss": 0.664, + "step": 50 + }, + { + "epoch": 1.7901471878323346e-05, + "grad_norm": 0.502738356590271, + "learning_rate": 0.0001969949916527546, + "loss": 0.6918, + "step": 51 + }, + { + "epoch": 1.825248113083949e-05, + "grad_norm": 0.47578102350234985, + "learning_rate": 0.0001969282136894825, + "loss": 0.6747, + "step": 52 + }, + { + "epoch": 1.860349038335563e-05, + "grad_norm": 0.5528931617736816, + "learning_rate": 0.00019686143572621037, + "loss": 0.765, + "step": 53 + }, + { + "epoch": 1.8954499635871777e-05, + "grad_norm": 0.6176997423171997, + "learning_rate": 0.00019679465776293825, + "loss": 0.5959, + "step": 54 + }, + { + "epoch": 1.930550888838792e-05, + "grad_norm": 0.43425047397613525, + "learning_rate": 0.00019672787979966612, + "loss": 0.6437, + "step": 55 + }, + { + "epoch": 1.9656518140904066e-05, + "grad_norm": 0.5135884881019592, + "learning_rate": 0.000196661101836394, + "loss": 0.7019, + "step": 56 + }, + { + "epoch": 2.000752739342021e-05, + "grad_norm": 0.4628916084766388, + "learning_rate": 0.0001965943238731219, + "loss": 0.5722, + "step": 57 + }, + { + "epoch": 2.035853664593635e-05, + "grad_norm": 0.48201897740364075, + "learning_rate": 0.00019652754590984977, + "loss": 0.6288, + "step": 58 + }, + { + "epoch": 2.0709545898452498e-05, + "grad_norm": 0.5772811770439148, + "learning_rate": 0.00019646076794657764, + "loss": 0.6067, + "step": 59 + }, + { + "epoch": 2.106055515096864e-05, + "grad_norm": 0.4976802170276642, + "learning_rate": 0.0001963939899833055, + "loss": 0.4722, + "step": 60 + }, + { + "epoch": 2.1411564403484786e-05, + "grad_norm": 0.4842129051685333, + "learning_rate": 0.00019632721202003339, + "loss": 0.5876, + "step": 61 + }, + { + "epoch": 2.176257365600093e-05, + "grad_norm": 0.46149536967277527, + "learning_rate": 0.00019626043405676129, + "loss": 0.6373, + "step": 62 + }, + { + "epoch": 2.2113582908517072e-05, + "grad_norm": 0.47199445962905884, + "learning_rate": 0.00019619365609348916, + "loss": 0.5546, + "step": 63 + }, + { + "epoch": 2.2464592161033218e-05, + "grad_norm": 0.6109340190887451, + "learning_rate": 0.00019612687813021703, + "loss": 0.6069, + "step": 64 + }, + { + "epoch": 2.281560141354936e-05, + "grad_norm": 0.5529135465621948, + "learning_rate": 0.0001960601001669449, + "loss": 0.553, + "step": 65 + }, + { + "epoch": 2.3166610666065503e-05, + "grad_norm": 0.500245213508606, + "learning_rate": 0.00019599332220367278, + "loss": 0.6149, + "step": 66 + }, + { + "epoch": 2.351761991858165e-05, + "grad_norm": 0.4841914474964142, + "learning_rate": 0.00019592654424040068, + "loss": 0.6509, + "step": 67 + }, + { + "epoch": 2.3868629171097792e-05, + "grad_norm": 0.5308504104614258, + "learning_rate": 0.00019585976627712855, + "loss": 0.7017, + "step": 68 + }, + { + "epoch": 2.4219638423613938e-05, + "grad_norm": 0.5157874822616577, + "learning_rate": 0.00019579298831385645, + "loss": 0.7125, + "step": 69 + }, + { + "epoch": 2.457064767613008e-05, + "grad_norm": 0.47787800431251526, + "learning_rate": 0.00019572621035058433, + "loss": 0.5792, + "step": 70 + }, + { + "epoch": 2.4921656928646224e-05, + "grad_norm": 0.46792763471603394, + "learning_rate": 0.0001956594323873122, + "loss": 0.7, + "step": 71 + }, + { + "epoch": 2.527266618116237e-05, + "grad_norm": 0.5394675135612488, + "learning_rate": 0.00019559265442404007, + "loss": 0.5549, + "step": 72 + }, + { + "epoch": 2.5623675433678512e-05, + "grad_norm": 0.45065200328826904, + "learning_rate": 0.00019552587646076797, + "loss": 0.6663, + "step": 73 + }, + { + "epoch": 2.597468468619466e-05, + "grad_norm": 0.4026688039302826, + "learning_rate": 0.00019545909849749584, + "loss": 0.6315, + "step": 74 + }, + { + "epoch": 2.63256939387108e-05, + "grad_norm": 0.42353659868240356, + "learning_rate": 0.00019539232053422372, + "loss": 0.5419, + "step": 75 + }, + { + "epoch": 2.6676703191226944e-05, + "grad_norm": 0.45561954379081726, + "learning_rate": 0.0001953255425709516, + "loss": 0.6624, + "step": 76 + }, + { + "epoch": 2.702771244374309e-05, + "grad_norm": 0.3954075574874878, + "learning_rate": 0.00019525876460767946, + "loss": 0.5479, + "step": 77 + }, + { + "epoch": 2.7378721696259233e-05, + "grad_norm": 0.4994329512119293, + "learning_rate": 0.00019519198664440736, + "loss": 0.7224, + "step": 78 + }, + { + "epoch": 2.7729730948775375e-05, + "grad_norm": 0.41149672865867615, + "learning_rate": 0.00019512520868113524, + "loss": 0.5621, + "step": 79 + }, + { + "epoch": 2.808074020129152e-05, + "grad_norm": 0.4199008345603943, + "learning_rate": 0.0001950584307178631, + "loss": 0.7038, + "step": 80 + }, + { + "epoch": 2.8431749453807664e-05, + "grad_norm": 0.4378969371318817, + "learning_rate": 0.00019499165275459098, + "loss": 0.6654, + "step": 81 + }, + { + "epoch": 2.878275870632381e-05, + "grad_norm": 0.4653928279876709, + "learning_rate": 0.00019492487479131886, + "loss": 0.6241, + "step": 82 + }, + { + "epoch": 2.9133767958839953e-05, + "grad_norm": 0.5166454911231995, + "learning_rate": 0.00019485809682804673, + "loss": 0.5366, + "step": 83 + }, + { + "epoch": 2.9484777211356096e-05, + "grad_norm": 0.43180733919143677, + "learning_rate": 0.00019479131886477463, + "loss": 0.6178, + "step": 84 + }, + { + "epoch": 2.9835786463872242e-05, + "grad_norm": 0.44828200340270996, + "learning_rate": 0.0001947245409015025, + "loss": 0.6706, + "step": 85 + }, + { + "epoch": 3.0186795716388385e-05, + "grad_norm": 0.384175181388855, + "learning_rate": 0.0001946577629382304, + "loss": 0.5551, + "step": 86 + }, + { + "epoch": 3.053780496890453e-05, + "grad_norm": 0.4359772503376007, + "learning_rate": 0.00019459098497495828, + "loss": 0.5626, + "step": 87 + }, + { + "epoch": 3.0888814221420673e-05, + "grad_norm": 0.4177016615867615, + "learning_rate": 0.00019452420701168615, + "loss": 0.6023, + "step": 88 + }, + { + "epoch": 3.1239823473936816e-05, + "grad_norm": 0.43592438101768494, + "learning_rate": 0.00019445742904841405, + "loss": 0.682, + "step": 89 + }, + { + "epoch": 3.159083272645296e-05, + "grad_norm": 0.48027974367141724, + "learning_rate": 0.00019439065108514192, + "loss": 0.7596, + "step": 90 + }, + { + "epoch": 3.194184197896911e-05, + "grad_norm": 0.35989537835121155, + "learning_rate": 0.0001943238731218698, + "loss": 0.6018, + "step": 91 + }, + { + "epoch": 3.229285123148525e-05, + "grad_norm": 0.48477092385292053, + "learning_rate": 0.00019425709515859767, + "loss": 0.512, + "step": 92 + }, + { + "epoch": 3.2643860484001394e-05, + "grad_norm": 0.38858646154403687, + "learning_rate": 0.00019419031719532554, + "loss": 0.6371, + "step": 93 + }, + { + "epoch": 3.2994869736517536e-05, + "grad_norm": 0.5323147177696228, + "learning_rate": 0.00019412353923205344, + "loss": 0.5221, + "step": 94 + }, + { + "epoch": 3.334587898903368e-05, + "grad_norm": 0.3784274160861969, + "learning_rate": 0.00019405676126878132, + "loss": 0.6158, + "step": 95 + }, + { + "epoch": 3.369688824154982e-05, + "grad_norm": 0.4076334834098816, + "learning_rate": 0.0001939899833055092, + "loss": 0.5535, + "step": 96 + }, + { + "epoch": 3.404789749406597e-05, + "grad_norm": 0.43930479884147644, + "learning_rate": 0.00019392320534223706, + "loss": 0.6482, + "step": 97 + }, + { + "epoch": 3.4398906746582114e-05, + "grad_norm": 0.4266909658908844, + "learning_rate": 0.00019385642737896494, + "loss": 0.6, + "step": 98 + }, + { + "epoch": 3.474991599909826e-05, + "grad_norm": 0.45353513956069946, + "learning_rate": 0.0001937896494156928, + "loss": 0.6596, + "step": 99 + }, + { + "epoch": 3.51009252516144e-05, + "grad_norm": 0.3424838185310364, + "learning_rate": 0.0001937228714524207, + "loss": 0.555, + "step": 100 + }, + { + "epoch": 3.545193450413054e-05, + "grad_norm": 0.40126165747642517, + "learning_rate": 0.00019365609348914858, + "loss": 0.6921, + "step": 101 + }, + { + "epoch": 3.580294375664669e-05, + "grad_norm": 0.36572012305259705, + "learning_rate": 0.00019358931552587646, + "loss": 0.5485, + "step": 102 + }, + { + "epoch": 3.6153953009162834e-05, + "grad_norm": 0.3972407281398773, + "learning_rate": 0.00019352253756260436, + "loss": 0.5884, + "step": 103 + }, + { + "epoch": 3.650496226167898e-05, + "grad_norm": 0.3900579512119293, + "learning_rate": 0.00019345575959933223, + "loss": 0.6664, + "step": 104 + }, + { + "epoch": 3.685597151419512e-05, + "grad_norm": 0.31666621565818787, + "learning_rate": 0.00019338898163606013, + "loss": 0.5009, + "step": 105 + }, + { + "epoch": 3.720698076671126e-05, + "grad_norm": 0.5269597172737122, + "learning_rate": 0.000193322203672788, + "loss": 0.6292, + "step": 106 + }, + { + "epoch": 3.755799001922741e-05, + "grad_norm": 0.4645126163959503, + "learning_rate": 0.00019325542570951588, + "loss": 0.636, + "step": 107 + }, + { + "epoch": 3.7908999271743555e-05, + "grad_norm": 0.3900754153728485, + "learning_rate": 0.00019318864774624375, + "loss": 0.5367, + "step": 108 + }, + { + "epoch": 3.82600085242597e-05, + "grad_norm": 0.42533883452415466, + "learning_rate": 0.00019312186978297162, + "loss": 0.6862, + "step": 109 + }, + { + "epoch": 3.861101777677584e-05, + "grad_norm": 0.6809422969818115, + "learning_rate": 0.00019305509181969952, + "loss": 0.6434, + "step": 110 + }, + { + "epoch": 3.896202702929198e-05, + "grad_norm": 0.5127860307693481, + "learning_rate": 0.0001929883138564274, + "loss": 0.6266, + "step": 111 + }, + { + "epoch": 3.931303628180813e-05, + "grad_norm": 0.5254234671592712, + "learning_rate": 0.00019292153589315527, + "loss": 0.6982, + "step": 112 + }, + { + "epoch": 3.9664045534324275e-05, + "grad_norm": 0.3699031472206116, + "learning_rate": 0.00019285475792988314, + "loss": 0.6037, + "step": 113 + }, + { + "epoch": 4.001505478684042e-05, + "grad_norm": 0.3807130455970764, + "learning_rate": 0.00019278797996661101, + "loss": 0.5861, + "step": 114 + }, + { + "epoch": 4.036606403935656e-05, + "grad_norm": 0.4455645978450775, + "learning_rate": 0.0001927212020033389, + "loss": 0.5658, + "step": 115 + }, + { + "epoch": 4.07170732918727e-05, + "grad_norm": 0.3830210864543915, + "learning_rate": 0.0001926544240400668, + "loss": 0.606, + "step": 116 + }, + { + "epoch": 4.106808254438885e-05, + "grad_norm": 0.41419631242752075, + "learning_rate": 0.00019258764607679466, + "loss": 0.6095, + "step": 117 + }, + { + "epoch": 4.1419091796904995e-05, + "grad_norm": 0.3929574489593506, + "learning_rate": 0.00019252086811352253, + "loss": 0.6464, + "step": 118 + }, + { + "epoch": 4.177010104942114e-05, + "grad_norm": 0.35958629846572876, + "learning_rate": 0.0001924540901502504, + "loss": 0.5185, + "step": 119 + }, + { + "epoch": 4.212111030193728e-05, + "grad_norm": 0.3790556490421295, + "learning_rate": 0.0001923873121869783, + "loss": 0.5156, + "step": 120 + }, + { + "epoch": 4.2472119554453423e-05, + "grad_norm": 0.37452438473701477, + "learning_rate": 0.00019232053422370618, + "loss": 0.5711, + "step": 121 + }, + { + "epoch": 4.282312880696957e-05, + "grad_norm": 0.38976770639419556, + "learning_rate": 0.00019225375626043408, + "loss": 0.6075, + "step": 122 + }, + { + "epoch": 4.3174138059485716e-05, + "grad_norm": 0.4098513424396515, + "learning_rate": 0.00019218697829716195, + "loss": 0.5312, + "step": 123 + }, + { + "epoch": 4.352514731200186e-05, + "grad_norm": 0.33890047669410706, + "learning_rate": 0.00019212020033388983, + "loss": 0.4984, + "step": 124 + }, + { + "epoch": 4.3876156564518e-05, + "grad_norm": 0.49077001214027405, + "learning_rate": 0.0001920534223706177, + "loss": 0.7159, + "step": 125 + }, + { + "epoch": 4.4227165817034144e-05, + "grad_norm": 0.41653814911842346, + "learning_rate": 0.0001919866444073456, + "loss": 0.5642, + "step": 126 + }, + { + "epoch": 4.4578175069550286e-05, + "grad_norm": 0.45710283517837524, + "learning_rate": 0.00019191986644407347, + "loss": 0.6936, + "step": 127 + }, + { + "epoch": 4.4929184322066436e-05, + "grad_norm": 0.36976873874664307, + "learning_rate": 0.00019185308848080135, + "loss": 0.5407, + "step": 128 + }, + { + "epoch": 4.528019357458258e-05, + "grad_norm": 0.42852675914764404, + "learning_rate": 0.00019178631051752922, + "loss": 0.6731, + "step": 129 + }, + { + "epoch": 4.563120282709872e-05, + "grad_norm": 0.5426310300827026, + "learning_rate": 0.0001917195325542571, + "loss": 0.5775, + "step": 130 + }, + { + "epoch": 4.5982212079614864e-05, + "grad_norm": 0.38442543148994446, + "learning_rate": 0.00019165275459098497, + "loss": 0.5994, + "step": 131 + }, + { + "epoch": 4.633322133213101e-05, + "grad_norm": 0.4298035502433777, + "learning_rate": 0.00019158597662771287, + "loss": 0.5563, + "step": 132 + }, + { + "epoch": 4.6684230584647156e-05, + "grad_norm": 0.40397605299949646, + "learning_rate": 0.00019151919866444074, + "loss": 0.6924, + "step": 133 + }, + { + "epoch": 4.70352398371633e-05, + "grad_norm": 0.4338497519493103, + "learning_rate": 0.0001914524207011686, + "loss": 0.5739, + "step": 134 + }, + { + "epoch": 4.738624908967944e-05, + "grad_norm": 0.39713653922080994, + "learning_rate": 0.0001913856427378965, + "loss": 0.4529, + "step": 135 + }, + { + "epoch": 4.7737258342195584e-05, + "grad_norm": 0.31409478187561035, + "learning_rate": 0.0001913188647746244, + "loss": 0.562, + "step": 136 + }, + { + "epoch": 4.808826759471173e-05, + "grad_norm": 0.371624618768692, + "learning_rate": 0.00019125208681135226, + "loss": 0.5288, + "step": 137 + }, + { + "epoch": 4.8439276847227877e-05, + "grad_norm": 0.4600190818309784, + "learning_rate": 0.00019118530884808016, + "loss": 0.6215, + "step": 138 + }, + { + "epoch": 4.879028609974402e-05, + "grad_norm": 0.45351359248161316, + "learning_rate": 0.00019111853088480803, + "loss": 0.686, + "step": 139 + }, + { + "epoch": 4.914129535226016e-05, + "grad_norm": 0.42282962799072266, + "learning_rate": 0.0001910517529215359, + "loss": 0.5966, + "step": 140 + }, + { + "epoch": 4.9492304604776305e-05, + "grad_norm": 0.41479986906051636, + "learning_rate": 0.00019098497495826378, + "loss": 0.5948, + "step": 141 + }, + { + "epoch": 4.984331385729245e-05, + "grad_norm": 0.40453553199768066, + "learning_rate": 0.00019091819699499168, + "loss": 0.6411, + "step": 142 + }, + { + "epoch": 5.01943231098086e-05, + "grad_norm": 0.3939369320869446, + "learning_rate": 0.00019085141903171955, + "loss": 0.5513, + "step": 143 + }, + { + "epoch": 5.054533236232474e-05, + "grad_norm": 0.3700481653213501, + "learning_rate": 0.00019078464106844743, + "loss": 0.5459, + "step": 144 + }, + { + "epoch": 5.089634161484088e-05, + "grad_norm": 0.4377487897872925, + "learning_rate": 0.0001907178631051753, + "loss": 0.6076, + "step": 145 + }, + { + "epoch": 5.1247350867357025e-05, + "grad_norm": 0.37919673323631287, + "learning_rate": 0.00019065108514190317, + "loss": 0.5207, + "step": 146 + }, + { + "epoch": 5.159836011987317e-05, + "grad_norm": 0.3841630816459656, + "learning_rate": 0.00019058430717863107, + "loss": 0.614, + "step": 147 + }, + { + "epoch": 5.194936937238932e-05, + "grad_norm": 0.43541714549064636, + "learning_rate": 0.00019051752921535895, + "loss": 0.6283, + "step": 148 + }, + { + "epoch": 5.230037862490546e-05, + "grad_norm": 0.4853285253047943, + "learning_rate": 0.00019045075125208682, + "loss": 0.5807, + "step": 149 + }, + { + "epoch": 5.26513878774216e-05, + "grad_norm": 0.3572970926761627, + "learning_rate": 0.0001903839732888147, + "loss": 0.6866, + "step": 150 + }, + { + "epoch": 5.3002397129937745e-05, + "grad_norm": 0.3674347698688507, + "learning_rate": 0.00019031719532554257, + "loss": 0.5552, + "step": 151 + }, + { + "epoch": 5.335340638245389e-05, + "grad_norm": 0.37748461961746216, + "learning_rate": 0.00019025041736227044, + "loss": 0.6278, + "step": 152 + }, + { + "epoch": 5.370441563497003e-05, + "grad_norm": 0.3788503408432007, + "learning_rate": 0.00019018363939899834, + "loss": 0.622, + "step": 153 + }, + { + "epoch": 5.405542488748618e-05, + "grad_norm": 0.3736303150653839, + "learning_rate": 0.0001901168614357262, + "loss": 0.5822, + "step": 154 + }, + { + "epoch": 5.440643414000232e-05, + "grad_norm": 0.32680070400238037, + "learning_rate": 0.0001900500834724541, + "loss": 0.5715, + "step": 155 + }, + { + "epoch": 5.4757443392518466e-05, + "grad_norm": 0.34495192766189575, + "learning_rate": 0.00018998330550918199, + "loss": 0.6497, + "step": 156 + }, + { + "epoch": 5.510845264503461e-05, + "grad_norm": 0.4244193136692047, + "learning_rate": 0.00018991652754590986, + "loss": 0.5519, + "step": 157 + }, + { + "epoch": 5.545946189755075e-05, + "grad_norm": 0.4024031162261963, + "learning_rate": 0.00018984974958263776, + "loss": 0.5339, + "step": 158 + }, + { + "epoch": 5.58104711500669e-05, + "grad_norm": 0.46051299571990967, + "learning_rate": 0.00018978297161936563, + "loss": 0.5979, + "step": 159 + }, + { + "epoch": 5.616148040258304e-05, + "grad_norm": 0.49051615595817566, + "learning_rate": 0.0001897161936560935, + "loss": 0.5563, + "step": 160 + }, + { + "epoch": 5.6512489655099186e-05, + "grad_norm": 0.43045854568481445, + "learning_rate": 0.00018964941569282138, + "loss": 0.5984, + "step": 161 + }, + { + "epoch": 5.686349890761533e-05, + "grad_norm": 0.37778228521347046, + "learning_rate": 0.00018958263772954925, + "loss": 0.5955, + "step": 162 + }, + { + "epoch": 5.721450816013147e-05, + "grad_norm": 0.3736341893672943, + "learning_rate": 0.00018951585976627715, + "loss": 0.6438, + "step": 163 + }, + { + "epoch": 5.756551741264762e-05, + "grad_norm": 0.3940117061138153, + "learning_rate": 0.00018944908180300502, + "loss": 0.503, + "step": 164 + }, + { + "epoch": 5.7916526665163763e-05, + "grad_norm": 0.4193519055843353, + "learning_rate": 0.0001893823038397329, + "loss": 0.6324, + "step": 165 + }, + { + "epoch": 5.8267535917679906e-05, + "grad_norm": 0.34481996297836304, + "learning_rate": 0.00018931552587646077, + "loss": 0.5745, + "step": 166 + }, + { + "epoch": 5.861854517019605e-05, + "grad_norm": 0.38285771012306213, + "learning_rate": 0.00018924874791318864, + "loss": 0.639, + "step": 167 + }, + { + "epoch": 5.896955442271219e-05, + "grad_norm": 0.36933982372283936, + "learning_rate": 0.00018918196994991652, + "loss": 0.6681, + "step": 168 + }, + { + "epoch": 5.932056367522834e-05, + "grad_norm": 0.36970776319503784, + "learning_rate": 0.00018911519198664442, + "loss": 0.5626, + "step": 169 + }, + { + "epoch": 5.9671572927744484e-05, + "grad_norm": 0.38494783639907837, + "learning_rate": 0.0001890484140233723, + "loss": 0.6066, + "step": 170 + }, + { + "epoch": 6.0022582180260627e-05, + "grad_norm": 0.3446069061756134, + "learning_rate": 0.00018898163606010016, + "loss": 0.6354, + "step": 171 + }, + { + "epoch": 6.037359143277677e-05, + "grad_norm": 0.4466759264469147, + "learning_rate": 0.00018891485809682806, + "loss": 0.4737, + "step": 172 + }, + { + "epoch": 6.072460068529291e-05, + "grad_norm": 0.43630918860435486, + "learning_rate": 0.00018884808013355594, + "loss": 0.6839, + "step": 173 + }, + { + "epoch": 6.107560993780906e-05, + "grad_norm": 0.37083202600479126, + "learning_rate": 0.00018878130217028384, + "loss": 0.5372, + "step": 174 + }, + { + "epoch": 6.14266191903252e-05, + "grad_norm": 0.37066200375556946, + "learning_rate": 0.0001887145242070117, + "loss": 0.6653, + "step": 175 + }, + { + "epoch": 6.177762844284135e-05, + "grad_norm": 0.5191747546195984, + "learning_rate": 0.00018864774624373958, + "loss": 0.6677, + "step": 176 + }, + { + "epoch": 6.21286376953575e-05, + "grad_norm": 0.4235158860683441, + "learning_rate": 0.00018858096828046746, + "loss": 0.5971, + "step": 177 + }, + { + "epoch": 6.247964694787363e-05, + "grad_norm": 0.405074805021286, + "learning_rate": 0.00018851419031719533, + "loss": 0.5717, + "step": 178 + }, + { + "epoch": 6.283065620038978e-05, + "grad_norm": 0.45817336440086365, + "learning_rate": 0.00018844741235392323, + "loss": 0.5878, + "step": 179 + }, + { + "epoch": 6.318166545290592e-05, + "grad_norm": 0.6313037276268005, + "learning_rate": 0.0001883806343906511, + "loss": 0.62, + "step": 180 + }, + { + "epoch": 6.353267470542207e-05, + "grad_norm": 0.41896742582321167, + "learning_rate": 0.00018831385642737898, + "loss": 0.5565, + "step": 181 + }, + { + "epoch": 6.388368395793822e-05, + "grad_norm": 0.4143432676792145, + "learning_rate": 0.00018824707846410685, + "loss": 0.5552, + "step": 182 + }, + { + "epoch": 6.423469321045435e-05, + "grad_norm": 0.38745641708374023, + "learning_rate": 0.00018818030050083472, + "loss": 0.5949, + "step": 183 + }, + { + "epoch": 6.45857024629705e-05, + "grad_norm": 0.7472612261772156, + "learning_rate": 0.0001881135225375626, + "loss": 0.6708, + "step": 184 + }, + { + "epoch": 6.493671171548664e-05, + "grad_norm": 0.4416198432445526, + "learning_rate": 0.0001880467445742905, + "loss": 0.6069, + "step": 185 + }, + { + "epoch": 6.528772096800279e-05, + "grad_norm": 0.4312993884086609, + "learning_rate": 0.00018797996661101837, + "loss": 0.5778, + "step": 186 + }, + { + "epoch": 6.563873022051894e-05, + "grad_norm": 0.4524860978126526, + "learning_rate": 0.00018791318864774624, + "loss": 0.5091, + "step": 187 + }, + { + "epoch": 6.598973947303507e-05, + "grad_norm": 0.4320828914642334, + "learning_rate": 0.00018784641068447412, + "loss": 0.6557, + "step": 188 + }, + { + "epoch": 6.634074872555122e-05, + "grad_norm": 0.6967452168464661, + "learning_rate": 0.00018777963272120202, + "loss": 0.612, + "step": 189 + }, + { + "epoch": 6.669175797806736e-05, + "grad_norm": 0.4389924705028534, + "learning_rate": 0.0001877128547579299, + "loss": 0.6271, + "step": 190 + }, + { + "epoch": 6.704276723058351e-05, + "grad_norm": 0.3693922162055969, + "learning_rate": 0.0001876460767946578, + "loss": 0.6715, + "step": 191 + }, + { + "epoch": 6.739377648309964e-05, + "grad_norm": 0.32230404019355774, + "learning_rate": 0.00018757929883138566, + "loss": 0.6344, + "step": 192 + }, + { + "epoch": 6.774478573561579e-05, + "grad_norm": 0.4440002143383026, + "learning_rate": 0.00018751252086811354, + "loss": 0.6671, + "step": 193 + }, + { + "epoch": 6.809579498813194e-05, + "grad_norm": 0.5676587820053101, + "learning_rate": 0.0001874457429048414, + "loss": 0.6818, + "step": 194 + }, + { + "epoch": 6.844680424064808e-05, + "grad_norm": 0.36207348108291626, + "learning_rate": 0.0001873789649415693, + "loss": 0.5029, + "step": 195 + }, + { + "epoch": 6.879781349316423e-05, + "grad_norm": 0.35714131593704224, + "learning_rate": 0.00018731218697829718, + "loss": 0.6127, + "step": 196 + }, + { + "epoch": 6.914882274568036e-05, + "grad_norm": 0.4285273551940918, + "learning_rate": 0.00018724540901502506, + "loss": 0.6355, + "step": 197 + }, + { + "epoch": 6.949983199819651e-05, + "grad_norm": 0.42585939168930054, + "learning_rate": 0.00018717863105175293, + "loss": 0.6302, + "step": 198 + }, + { + "epoch": 6.985084125071266e-05, + "grad_norm": 0.524303138256073, + "learning_rate": 0.0001871118530884808, + "loss": 0.6683, + "step": 199 + }, + { + "epoch": 7.02018505032288e-05, + "grad_norm": 0.39635923504829407, + "learning_rate": 0.00018704507512520868, + "loss": 0.6694, + "step": 200 + }, + { + "epoch": 7.055285975574495e-05, + "grad_norm": 0.39712437987327576, + "learning_rate": 0.00018697829716193658, + "loss": 0.5794, + "step": 201 + }, + { + "epoch": 7.090386900826108e-05, + "grad_norm": 0.4115397334098816, + "learning_rate": 0.00018691151919866445, + "loss": 0.5579, + "step": 202 + }, + { + "epoch": 7.125487826077723e-05, + "grad_norm": 0.4776385724544525, + "learning_rate": 0.00018684474123539232, + "loss": 0.5589, + "step": 203 + }, + { + "epoch": 7.160588751329338e-05, + "grad_norm": 0.35574638843536377, + "learning_rate": 0.0001867779632721202, + "loss": 0.5311, + "step": 204 + }, + { + "epoch": 7.195689676580952e-05, + "grad_norm": 0.44872432947158813, + "learning_rate": 0.00018671118530884807, + "loss": 0.635, + "step": 205 + }, + { + "epoch": 7.230790601832567e-05, + "grad_norm": 0.3511079251766205, + "learning_rate": 0.00018664440734557597, + "loss": 0.5317, + "step": 206 + }, + { + "epoch": 7.26589152708418e-05, + "grad_norm": 0.39862194657325745, + "learning_rate": 0.00018657762938230384, + "loss": 0.6653, + "step": 207 + }, + { + "epoch": 7.300992452335795e-05, + "grad_norm": 0.4046575725078583, + "learning_rate": 0.00018651085141903174, + "loss": 0.6065, + "step": 208 + }, + { + "epoch": 7.33609337758741e-05, + "grad_norm": 0.4231868088245392, + "learning_rate": 0.00018644407345575962, + "loss": 0.7078, + "step": 209 + }, + { + "epoch": 7.371194302839024e-05, + "grad_norm": 0.364700049161911, + "learning_rate": 0.0001863772954924875, + "loss": 0.6309, + "step": 210 + }, + { + "epoch": 7.406295228090639e-05, + "grad_norm": 0.5385531187057495, + "learning_rate": 0.0001863105175292154, + "loss": 0.4233, + "step": 211 + }, + { + "epoch": 7.441396153342252e-05, + "grad_norm": 0.39415115118026733, + "learning_rate": 0.00018624373956594326, + "loss": 0.5928, + "step": 212 + }, + { + "epoch": 7.476497078593867e-05, + "grad_norm": 0.6021363735198975, + "learning_rate": 0.00018617696160267113, + "loss": 0.6611, + "step": 213 + }, + { + "epoch": 7.511598003845482e-05, + "grad_norm": 0.3709903061389923, + "learning_rate": 0.000186110183639399, + "loss": 0.6136, + "step": 214 + }, + { + "epoch": 7.546698929097096e-05, + "grad_norm": 0.36710435152053833, + "learning_rate": 0.00018604340567612688, + "loss": 0.5267, + "step": 215 + }, + { + "epoch": 7.581799854348711e-05, + "grad_norm": 0.4379352033138275, + "learning_rate": 0.00018597662771285475, + "loss": 0.6429, + "step": 216 + }, + { + "epoch": 7.616900779600325e-05, + "grad_norm": 0.3408482074737549, + "learning_rate": 0.00018590984974958265, + "loss": 0.5379, + "step": 217 + }, + { + "epoch": 7.65200170485194e-05, + "grad_norm": 0.4487043023109436, + "learning_rate": 0.00018584307178631053, + "loss": 0.6582, + "step": 218 + }, + { + "epoch": 7.687102630103554e-05, + "grad_norm": 0.42003679275512695, + "learning_rate": 0.0001857762938230384, + "loss": 0.5712, + "step": 219 + }, + { + "epoch": 7.722203555355168e-05, + "grad_norm": 0.4698665738105774, + "learning_rate": 0.00018570951585976627, + "loss": 0.5715, + "step": 220 + }, + { + "epoch": 7.757304480606783e-05, + "grad_norm": 0.3777780830860138, + "learning_rate": 0.00018564273789649415, + "loss": 0.4667, + "step": 221 + }, + { + "epoch": 7.792405405858397e-05, + "grad_norm": 0.36794212460517883, + "learning_rate": 0.00018557595993322205, + "loss": 0.5382, + "step": 222 + }, + { + "epoch": 7.827506331110012e-05, + "grad_norm": 0.4582989513874054, + "learning_rate": 0.00018550918196994992, + "loss": 0.6437, + "step": 223 + }, + { + "epoch": 7.862607256361626e-05, + "grad_norm": 0.4065852761268616, + "learning_rate": 0.0001854424040066778, + "loss": 0.6928, + "step": 224 + }, + { + "epoch": 7.89770818161324e-05, + "grad_norm": 0.3857649564743042, + "learning_rate": 0.0001853756260434057, + "loss": 0.5405, + "step": 225 + }, + { + "epoch": 7.932809106864855e-05, + "grad_norm": 0.40056589245796204, + "learning_rate": 0.00018530884808013357, + "loss": 0.6425, + "step": 226 + }, + { + "epoch": 7.967910032116469e-05, + "grad_norm": 0.43137016892433167, + "learning_rate": 0.00018524207011686147, + "loss": 0.5001, + "step": 227 + }, + { + "epoch": 8.003010957368084e-05, + "grad_norm": 0.3723987340927124, + "learning_rate": 0.00018517529215358934, + "loss": 0.5118, + "step": 228 + }, + { + "epoch": 8.038111882619698e-05, + "grad_norm": 0.34196361899375916, + "learning_rate": 0.00018510851419031721, + "loss": 0.5468, + "step": 229 + }, + { + "epoch": 8.073212807871312e-05, + "grad_norm": 0.4319117069244385, + "learning_rate": 0.0001850417362270451, + "loss": 0.5703, + "step": 230 + }, + { + "epoch": 8.108313733122927e-05, + "grad_norm": 0.4467247724533081, + "learning_rate": 0.00018497495826377296, + "loss": 0.6536, + "step": 231 + }, + { + "epoch": 8.14341465837454e-05, + "grad_norm": 0.3569909632205963, + "learning_rate": 0.00018490818030050083, + "loss": 0.5335, + "step": 232 + }, + { + "epoch": 8.178515583626156e-05, + "grad_norm": 0.33486437797546387, + "learning_rate": 0.00018484140233722873, + "loss": 0.6803, + "step": 233 + }, + { + "epoch": 8.21361650887777e-05, + "grad_norm": 0.3783140480518341, + "learning_rate": 0.0001847746243739566, + "loss": 0.6361, + "step": 234 + }, + { + "epoch": 8.248717434129384e-05, + "grad_norm": 0.4844662547111511, + "learning_rate": 0.00018470784641068448, + "loss": 0.5322, + "step": 235 + }, + { + "epoch": 8.283818359380999e-05, + "grad_norm": 0.508406400680542, + "learning_rate": 0.00018464106844741235, + "loss": 0.6676, + "step": 236 + }, + { + "epoch": 8.318919284632613e-05, + "grad_norm": 0.3710225820541382, + "learning_rate": 0.00018457429048414023, + "loss": 0.6656, + "step": 237 + }, + { + "epoch": 8.354020209884228e-05, + "grad_norm": 0.3757292628288269, + "learning_rate": 0.00018450751252086813, + "loss": 0.6095, + "step": 238 + }, + { + "epoch": 8.389121135135843e-05, + "grad_norm": 0.40651261806488037, + "learning_rate": 0.000184440734557596, + "loss": 0.6626, + "step": 239 + }, + { + "epoch": 8.424222060387456e-05, + "grad_norm": 0.40700778365135193, + "learning_rate": 0.00018437395659432387, + "loss": 0.5328, + "step": 240 + }, + { + "epoch": 8.459322985639071e-05, + "grad_norm": 0.5067440867424011, + "learning_rate": 0.00018430717863105175, + "loss": 0.4811, + "step": 241 + }, + { + "epoch": 8.494423910890685e-05, + "grad_norm": 0.3934602737426758, + "learning_rate": 0.00018424040066777965, + "loss": 0.5691, + "step": 242 + }, + { + "epoch": 8.5295248361423e-05, + "grad_norm": 0.3360019624233246, + "learning_rate": 0.00018417362270450752, + "loss": 0.5542, + "step": 243 + }, + { + "epoch": 8.564625761393915e-05, + "grad_norm": 0.4023631513118744, + "learning_rate": 0.00018410684474123542, + "loss": 0.5192, + "step": 244 + }, + { + "epoch": 8.599726686645528e-05, + "grad_norm": 0.41704171895980835, + "learning_rate": 0.0001840400667779633, + "loss": 0.5018, + "step": 245 + }, + { + "epoch": 8.634827611897143e-05, + "grad_norm": 0.361977756023407, + "learning_rate": 0.00018397328881469117, + "loss": 0.6193, + "step": 246 + }, + { + "epoch": 8.669928537148757e-05, + "grad_norm": 0.37774717807769775, + "learning_rate": 0.00018390651085141904, + "loss": 0.5552, + "step": 247 + }, + { + "epoch": 8.705029462400372e-05, + "grad_norm": 0.3408471941947937, + "learning_rate": 0.0001838397328881469, + "loss": 0.5876, + "step": 248 + }, + { + "epoch": 8.740130387651985e-05, + "grad_norm": 0.3892226815223694, + "learning_rate": 0.0001837729549248748, + "loss": 0.4227, + "step": 249 + }, + { + "epoch": 8.7752313129036e-05, + "grad_norm": 0.5315036177635193, + "learning_rate": 0.00018370617696160269, + "loss": 0.5826, + "step": 250 + }, + { + "epoch": 8.810332238155215e-05, + "grad_norm": 0.35433024168014526, + "learning_rate": 0.00018363939899833056, + "loss": 0.5992, + "step": 251 + }, + { + "epoch": 8.845433163406829e-05, + "grad_norm": 0.34777382016181946, + "learning_rate": 0.00018357262103505843, + "loss": 0.4973, + "step": 252 + }, + { + "epoch": 8.880534088658444e-05, + "grad_norm": 0.3936387002468109, + "learning_rate": 0.0001835058430717863, + "loss": 0.6254, + "step": 253 + }, + { + "epoch": 8.915635013910057e-05, + "grad_norm": 0.4009217917919159, + "learning_rate": 0.0001834390651085142, + "loss": 0.4843, + "step": 254 + }, + { + "epoch": 8.950735939161672e-05, + "grad_norm": 0.4863683879375458, + "learning_rate": 0.00018337228714524208, + "loss": 0.5204, + "step": 255 + }, + { + "epoch": 8.985836864413287e-05, + "grad_norm": 0.6100988984107971, + "learning_rate": 0.00018330550918196995, + "loss": 0.7296, + "step": 256 + }, + { + "epoch": 9.020937789664901e-05, + "grad_norm": 0.40949374437332153, + "learning_rate": 0.00018323873121869782, + "loss": 0.5707, + "step": 257 + }, + { + "epoch": 9.056038714916516e-05, + "grad_norm": 0.47316402196884155, + "learning_rate": 0.0001831719532554257, + "loss": 0.6655, + "step": 258 + }, + { + "epoch": 9.091139640168129e-05, + "grad_norm": 0.4053696393966675, + "learning_rate": 0.0001831051752921536, + "loss": 0.5822, + "step": 259 + }, + { + "epoch": 9.126240565419744e-05, + "grad_norm": 0.4582972228527069, + "learning_rate": 0.00018303839732888147, + "loss": 0.5475, + "step": 260 + }, + { + "epoch": 9.161341490671359e-05, + "grad_norm": 0.38666802644729614, + "learning_rate": 0.00018297161936560937, + "loss": 0.4744, + "step": 261 + }, + { + "epoch": 9.196442415922973e-05, + "grad_norm": 0.31954991817474365, + "learning_rate": 0.00018290484140233724, + "loss": 0.6337, + "step": 262 + }, + { + "epoch": 9.231543341174588e-05, + "grad_norm": 0.3590424358844757, + "learning_rate": 0.00018283806343906512, + "loss": 0.5683, + "step": 263 + }, + { + "epoch": 9.266644266426201e-05, + "grad_norm": 0.4042195975780487, + "learning_rate": 0.000182771285475793, + "loss": 0.6142, + "step": 264 + }, + { + "epoch": 9.301745191677816e-05, + "grad_norm": 0.3474234342575073, + "learning_rate": 0.0001827045075125209, + "loss": 0.6035, + "step": 265 + }, + { + "epoch": 9.336846116929431e-05, + "grad_norm": 0.337091326713562, + "learning_rate": 0.00018263772954924876, + "loss": 0.6107, + "step": 266 + }, + { + "epoch": 9.371947042181045e-05, + "grad_norm": 0.3313732445240021, + "learning_rate": 0.00018257095158597664, + "loss": 0.6491, + "step": 267 + }, + { + "epoch": 9.40704796743266e-05, + "grad_norm": 0.3931679129600525, + "learning_rate": 0.0001825041736227045, + "loss": 0.5492, + "step": 268 + }, + { + "epoch": 9.442148892684273e-05, + "grad_norm": 0.5848420262336731, + "learning_rate": 0.00018243739565943238, + "loss": 0.7091, + "step": 269 + }, + { + "epoch": 9.477249817935888e-05, + "grad_norm": 0.4851846992969513, + "learning_rate": 0.00018237061769616028, + "loss": 0.5856, + "step": 270 + }, + { + "epoch": 9.512350743187503e-05, + "grad_norm": 0.3434993326663971, + "learning_rate": 0.00018230383973288816, + "loss": 0.5085, + "step": 271 + }, + { + "epoch": 9.547451668439117e-05, + "grad_norm": 0.2978988587856293, + "learning_rate": 0.00018223706176961603, + "loss": 0.481, + "step": 272 + }, + { + "epoch": 9.582552593690732e-05, + "grad_norm": 0.34215858578681946, + "learning_rate": 0.0001821702838063439, + "loss": 0.5723, + "step": 273 + }, + { + "epoch": 9.617653518942345e-05, + "grad_norm": 0.43445509672164917, + "learning_rate": 0.00018210350584307178, + "loss": 0.5691, + "step": 274 + }, + { + "epoch": 9.65275444419396e-05, + "grad_norm": 0.36094945669174194, + "learning_rate": 0.00018203672787979968, + "loss": 0.5543, + "step": 275 + }, + { + "epoch": 9.687855369445575e-05, + "grad_norm": 0.386106014251709, + "learning_rate": 0.00018196994991652755, + "loss": 0.5561, + "step": 276 + }, + { + "epoch": 9.722956294697189e-05, + "grad_norm": 0.36676689982414246, + "learning_rate": 0.00018190317195325542, + "loss": 0.5479, + "step": 277 + }, + { + "epoch": 9.758057219948804e-05, + "grad_norm": 0.37988394498825073, + "learning_rate": 0.00018183639398998332, + "loss": 0.5772, + "step": 278 + }, + { + "epoch": 9.793158145200417e-05, + "grad_norm": 0.4024789035320282, + "learning_rate": 0.0001817696160267112, + "loss": 0.6065, + "step": 279 + }, + { + "epoch": 9.828259070452032e-05, + "grad_norm": 0.3697255551815033, + "learning_rate": 0.0001817028380634391, + "loss": 0.5021, + "step": 280 + }, + { + "epoch": 9.863359995703647e-05, + "grad_norm": 0.43579426407814026, + "learning_rate": 0.00018163606010016697, + "loss": 0.555, + "step": 281 + }, + { + "epoch": 9.898460920955261e-05, + "grad_norm": 0.4760832190513611, + "learning_rate": 0.00018156928213689484, + "loss": 0.6438, + "step": 282 + }, + { + "epoch": 9.933561846206876e-05, + "grad_norm": 0.45258408784866333, + "learning_rate": 0.00018150250417362272, + "loss": 0.4717, + "step": 283 + }, + { + "epoch": 9.96866277145849e-05, + "grad_norm": 0.428108274936676, + "learning_rate": 0.0001814357262103506, + "loss": 0.6029, + "step": 284 + }, + { + "epoch": 0.00010003763696710104, + "grad_norm": 0.3999852240085602, + "learning_rate": 0.00018136894824707846, + "loss": 0.4524, + "step": 285 + }, + { + "epoch": 0.0001003886462196172, + "grad_norm": 0.44319403171539307, + "learning_rate": 0.00018130217028380636, + "loss": 0.6619, + "step": 286 + }, + { + "epoch": 0.00010073965547213333, + "grad_norm": 0.43008357286453247, + "learning_rate": 0.00018123539232053424, + "loss": 0.6105, + "step": 287 + }, + { + "epoch": 0.00010109066472464948, + "grad_norm": 0.38037821650505066, + "learning_rate": 0.0001811686143572621, + "loss": 0.6649, + "step": 288 + }, + { + "epoch": 0.00010144167397716562, + "grad_norm": 0.3713517487049103, + "learning_rate": 0.00018110183639398998, + "loss": 0.6381, + "step": 289 + }, + { + "epoch": 0.00010179268322968176, + "grad_norm": 0.3437170386314392, + "learning_rate": 0.00018103505843071786, + "loss": 0.4563, + "step": 290 + }, + { + "epoch": 0.00010214369248219791, + "grad_norm": 0.3661468029022217, + "learning_rate": 0.00018096828046744576, + "loss": 0.606, + "step": 291 + }, + { + "epoch": 0.00010249470173471405, + "grad_norm": 0.36346200108528137, + "learning_rate": 0.00018090150250417363, + "loss": 0.5895, + "step": 292 + }, + { + "epoch": 0.0001028457109872302, + "grad_norm": 0.31052225828170776, + "learning_rate": 0.0001808347245409015, + "loss": 0.4409, + "step": 293 + }, + { + "epoch": 0.00010319672023974634, + "grad_norm": 0.37012970447540283, + "learning_rate": 0.00018076794657762938, + "loss": 0.505, + "step": 294 + }, + { + "epoch": 0.00010354772949226248, + "grad_norm": 0.3958667814731598, + "learning_rate": 0.00018070116861435728, + "loss": 0.5371, + "step": 295 + }, + { + "epoch": 0.00010389873874477863, + "grad_norm": 0.4892179071903229, + "learning_rate": 0.00018063439065108515, + "loss": 0.6737, + "step": 296 + }, + { + "epoch": 0.00010424974799729477, + "grad_norm": 0.41874751448631287, + "learning_rate": 0.00018056761268781305, + "loss": 0.651, + "step": 297 + }, + { + "epoch": 0.00010460075724981092, + "grad_norm": 0.4167911410331726, + "learning_rate": 0.00018050083472454092, + "loss": 0.5531, + "step": 298 + }, + { + "epoch": 0.00010495176650232706, + "grad_norm": 0.3758225440979004, + "learning_rate": 0.0001804340567612688, + "loss": 0.6285, + "step": 299 + }, + { + "epoch": 0.0001053027757548432, + "grad_norm": 0.3688598573207855, + "learning_rate": 0.00018036727879799667, + "loss": 0.5219, + "step": 300 + }, + { + "epoch": 0.00010565378500735934, + "grad_norm": 0.3501751124858856, + "learning_rate": 0.00018030050083472454, + "loss": 0.6351, + "step": 301 + }, + { + "epoch": 0.00010600479425987549, + "grad_norm": 0.42876511812210083, + "learning_rate": 0.00018023372287145244, + "loss": 0.544, + "step": 302 + }, + { + "epoch": 0.00010635580351239164, + "grad_norm": 0.47046172618865967, + "learning_rate": 0.00018016694490818031, + "loss": 0.6304, + "step": 303 + }, + { + "epoch": 0.00010670681276490778, + "grad_norm": 0.402271032333374, + "learning_rate": 0.0001801001669449082, + "loss": 0.5039, + "step": 304 + }, + { + "epoch": 0.00010705782201742393, + "grad_norm": 0.41232413053512573, + "learning_rate": 0.00018003338898163606, + "loss": 0.5892, + "step": 305 + }, + { + "epoch": 0.00010740883126994006, + "grad_norm": 0.3628154993057251, + "learning_rate": 0.00017996661101836393, + "loss": 0.5737, + "step": 306 + }, + { + "epoch": 0.00010775984052245621, + "grad_norm": 0.4291020631790161, + "learning_rate": 0.00017989983305509183, + "loss": 0.6597, + "step": 307 + }, + { + "epoch": 0.00010811084977497236, + "grad_norm": 0.33218181133270264, + "learning_rate": 0.0001798330550918197, + "loss": 0.5726, + "step": 308 + }, + { + "epoch": 0.0001084618590274885, + "grad_norm": 0.3439387381076813, + "learning_rate": 0.00017976627712854758, + "loss": 0.5615, + "step": 309 + }, + { + "epoch": 0.00010881286828000465, + "grad_norm": 0.3523644208908081, + "learning_rate": 0.00017969949916527545, + "loss": 0.4968, + "step": 310 + }, + { + "epoch": 0.00010916387753252078, + "grad_norm": 0.4045630991458893, + "learning_rate": 0.00017963272120200333, + "loss": 0.6425, + "step": 311 + }, + { + "epoch": 0.00010951488678503693, + "grad_norm": 0.3726767599582672, + "learning_rate": 0.00017956594323873123, + "loss": 0.6575, + "step": 312 + }, + { + "epoch": 0.00010986589603755308, + "grad_norm": 0.32131972908973694, + "learning_rate": 0.0001794991652754591, + "loss": 0.5146, + "step": 313 + }, + { + "epoch": 0.00011021690529006922, + "grad_norm": 0.5013764500617981, + "learning_rate": 0.000179432387312187, + "loss": 0.53, + "step": 314 + }, + { + "epoch": 0.00011056791454258537, + "grad_norm": 0.36830246448516846, + "learning_rate": 0.00017936560934891487, + "loss": 0.6291, + "step": 315 + }, + { + "epoch": 0.0001109189237951015, + "grad_norm": 0.3587378263473511, + "learning_rate": 0.00017929883138564275, + "loss": 0.4954, + "step": 316 + }, + { + "epoch": 0.00011126993304761765, + "grad_norm": 0.3480195105075836, + "learning_rate": 0.00017923205342237062, + "loss": 0.606, + "step": 317 + }, + { + "epoch": 0.0001116209423001338, + "grad_norm": 0.38415858149528503, + "learning_rate": 0.00017916527545909852, + "loss": 0.7281, + "step": 318 + }, + { + "epoch": 0.00011197195155264994, + "grad_norm": 0.35853826999664307, + "learning_rate": 0.0001790984974958264, + "loss": 0.5851, + "step": 319 + }, + { + "epoch": 0.00011232296080516609, + "grad_norm": 0.42092210054397583, + "learning_rate": 0.00017903171953255427, + "loss": 0.5324, + "step": 320 + }, + { + "epoch": 0.00011267397005768222, + "grad_norm": 0.34538987278938293, + "learning_rate": 0.00017896494156928214, + "loss": 0.6387, + "step": 321 + }, + { + "epoch": 0.00011302497931019837, + "grad_norm": 0.38299745321273804, + "learning_rate": 0.00017889816360601, + "loss": 0.6013, + "step": 322 + }, + { + "epoch": 0.00011337598856271452, + "grad_norm": 0.32100436091423035, + "learning_rate": 0.0001788313856427379, + "loss": 0.4627, + "step": 323 + }, + { + "epoch": 0.00011372699781523066, + "grad_norm": 0.3458426594734192, + "learning_rate": 0.0001787646076794658, + "loss": 0.5865, + "step": 324 + }, + { + "epoch": 0.0001140780070677468, + "grad_norm": 0.33228665590286255, + "learning_rate": 0.00017869782971619366, + "loss": 0.4611, + "step": 325 + }, + { + "epoch": 0.00011442901632026294, + "grad_norm": 0.38747021555900574, + "learning_rate": 0.00017863105175292153, + "loss": 0.5777, + "step": 326 + }, + { + "epoch": 0.00011478002557277909, + "grad_norm": 0.3888608515262604, + "learning_rate": 0.0001785642737896494, + "loss": 0.5664, + "step": 327 + }, + { + "epoch": 0.00011513103482529524, + "grad_norm": 0.4084737002849579, + "learning_rate": 0.0001784974958263773, + "loss": 0.5939, + "step": 328 + }, + { + "epoch": 0.00011548204407781138, + "grad_norm": 0.4964492917060852, + "learning_rate": 0.00017843071786310518, + "loss": 0.6256, + "step": 329 + }, + { + "epoch": 0.00011583305333032753, + "grad_norm": 0.37329745292663574, + "learning_rate": 0.00017836393989983305, + "loss": 0.5388, + "step": 330 + }, + { + "epoch": 0.00011618406258284366, + "grad_norm": 0.37680140137672424, + "learning_rate": 0.00017829716193656095, + "loss": 0.6203, + "step": 331 + }, + { + "epoch": 0.00011653507183535981, + "grad_norm": 0.4162957966327667, + "learning_rate": 0.00017823038397328883, + "loss": 0.6478, + "step": 332 + }, + { + "epoch": 0.00011688608108787596, + "grad_norm": 0.3473896086215973, + "learning_rate": 0.0001781636060100167, + "loss": 0.589, + "step": 333 + }, + { + "epoch": 0.0001172370903403921, + "grad_norm": 0.4039511978626251, + "learning_rate": 0.0001780968280467446, + "loss": 0.5681, + "step": 334 + }, + { + "epoch": 0.00011758809959290825, + "grad_norm": 0.3135715425014496, + "learning_rate": 0.00017803005008347247, + "loss": 0.5069, + "step": 335 + }, + { + "epoch": 0.00011793910884542438, + "grad_norm": 0.4296559989452362, + "learning_rate": 0.00017796327212020035, + "loss": 0.5413, + "step": 336 + }, + { + "epoch": 0.00011829011809794053, + "grad_norm": 0.4197536110877991, + "learning_rate": 0.00017789649415692822, + "loss": 0.694, + "step": 337 + }, + { + "epoch": 0.00011864112735045668, + "grad_norm": 0.3633468449115753, + "learning_rate": 0.0001778297161936561, + "loss": 0.5475, + "step": 338 + }, + { + "epoch": 0.00011899213660297282, + "grad_norm": 0.2867147922515869, + "learning_rate": 0.000177762938230384, + "loss": 0.485, + "step": 339 + }, + { + "epoch": 0.00011934314585548897, + "grad_norm": 0.3445490300655365, + "learning_rate": 0.00017769616026711187, + "loss": 0.6304, + "step": 340 + }, + { + "epoch": 0.0001196941551080051, + "grad_norm": 0.31692221760749817, + "learning_rate": 0.00017762938230383974, + "loss": 0.5804, + "step": 341 + }, + { + "epoch": 0.00012004516436052125, + "grad_norm": 0.31391167640686035, + "learning_rate": 0.0001775626043405676, + "loss": 0.5945, + "step": 342 + }, + { + "epoch": 0.0001203961736130374, + "grad_norm": 0.3484472632408142, + "learning_rate": 0.00017749582637729548, + "loss": 0.6577, + "step": 343 + }, + { + "epoch": 0.00012074718286555354, + "grad_norm": 0.37430596351623535, + "learning_rate": 0.00017742904841402339, + "loss": 0.6854, + "step": 344 + }, + { + "epoch": 0.00012109819211806969, + "grad_norm": 0.34305211901664734, + "learning_rate": 0.00017736227045075126, + "loss": 0.5123, + "step": 345 + }, + { + "epoch": 0.00012144920137058582, + "grad_norm": 0.3398534059524536, + "learning_rate": 0.00017729549248747913, + "loss": 0.5602, + "step": 346 + }, + { + "epoch": 0.00012180021062310197, + "grad_norm": 0.4278014600276947, + "learning_rate": 0.000177228714524207, + "loss": 0.5152, + "step": 347 + }, + { + "epoch": 0.00012215121987561812, + "grad_norm": 0.4011085629463196, + "learning_rate": 0.0001771619365609349, + "loss": 0.6217, + "step": 348 + }, + { + "epoch": 0.00012250222912813427, + "grad_norm": 0.3425695598125458, + "learning_rate": 0.00017709515859766278, + "loss": 0.5037, + "step": 349 + }, + { + "epoch": 0.0001228532383806504, + "grad_norm": 0.34036242961883545, + "learning_rate": 0.00017702838063439068, + "loss": 0.649, + "step": 350 + }, + { + "epoch": 0.00012320424763316654, + "grad_norm": 0.5631874203681946, + "learning_rate": 0.00017696160267111855, + "loss": 0.5656, + "step": 351 + }, + { + "epoch": 0.0001235552568856827, + "grad_norm": 0.4195176661014557, + "learning_rate": 0.00017689482470784642, + "loss": 0.6899, + "step": 352 + }, + { + "epoch": 0.00012390626613819884, + "grad_norm": 0.41814154386520386, + "learning_rate": 0.0001768280467445743, + "loss": 0.551, + "step": 353 + }, + { + "epoch": 0.000124257275390715, + "grad_norm": 0.3374340534210205, + "learning_rate": 0.00017676126878130217, + "loss": 0.7022, + "step": 354 + }, + { + "epoch": 0.00012460828464323112, + "grad_norm": 0.41464921832084656, + "learning_rate": 0.00017669449081803007, + "loss": 0.5301, + "step": 355 + }, + { + "epoch": 0.00012495929389574726, + "grad_norm": 0.4443178176879883, + "learning_rate": 0.00017662771285475794, + "loss": 0.5487, + "step": 356 + }, + { + "epoch": 0.00012531030314826341, + "grad_norm": 0.3389272093772888, + "learning_rate": 0.00017656093489148582, + "loss": 0.581, + "step": 357 + }, + { + "epoch": 0.00012566131240077956, + "grad_norm": 0.29650986194610596, + "learning_rate": 0.0001764941569282137, + "loss": 0.5801, + "step": 358 + }, + { + "epoch": 0.0001260123216532957, + "grad_norm": 0.40271905064582825, + "learning_rate": 0.00017642737896494156, + "loss": 0.6738, + "step": 359 + }, + { + "epoch": 0.00012636333090581184, + "grad_norm": 0.352225661277771, + "learning_rate": 0.00017636060100166946, + "loss": 0.5727, + "step": 360 + }, + { + "epoch": 0.00012671434015832798, + "grad_norm": 0.3469563126564026, + "learning_rate": 0.00017629382303839734, + "loss": 0.5188, + "step": 361 + }, + { + "epoch": 0.00012706534941084413, + "grad_norm": 0.30644670128822327, + "learning_rate": 0.0001762270450751252, + "loss": 0.497, + "step": 362 + }, + { + "epoch": 0.00012741635866336028, + "grad_norm": 0.3472917377948761, + "learning_rate": 0.00017616026711185308, + "loss": 0.6363, + "step": 363 + }, + { + "epoch": 0.00012776736791587643, + "grad_norm": 0.37184756994247437, + "learning_rate": 0.00017609348914858096, + "loss": 0.5223, + "step": 364 + }, + { + "epoch": 0.00012811837716839256, + "grad_norm": 0.3247138559818268, + "learning_rate": 0.00017602671118530886, + "loss": 0.5457, + "step": 365 + }, + { + "epoch": 0.0001284693864209087, + "grad_norm": 0.5236158967018127, + "learning_rate": 0.00017595993322203673, + "loss": 0.615, + "step": 366 + }, + { + "epoch": 0.00012882039567342485, + "grad_norm": 0.33708465099334717, + "learning_rate": 0.00017589315525876463, + "loss": 0.6163, + "step": 367 + }, + { + "epoch": 0.000129171404925941, + "grad_norm": 0.33848705887794495, + "learning_rate": 0.0001758263772954925, + "loss": 0.4229, + "step": 368 + }, + { + "epoch": 0.00012952241417845715, + "grad_norm": 0.5827682018280029, + "learning_rate": 0.00017575959933222038, + "loss": 0.5668, + "step": 369 + }, + { + "epoch": 0.00012987342343097328, + "grad_norm": 0.36217448115348816, + "learning_rate": 0.00017569282136894825, + "loss": 0.4983, + "step": 370 + }, + { + "epoch": 0.00013022443268348943, + "grad_norm": 0.329414963722229, + "learning_rate": 0.00017562604340567615, + "loss": 0.4281, + "step": 371 + }, + { + "epoch": 0.00013057544193600557, + "grad_norm": 0.36746612191200256, + "learning_rate": 0.00017555926544240402, + "loss": 0.6629, + "step": 372 + }, + { + "epoch": 0.00013092645118852172, + "grad_norm": 0.3954717516899109, + "learning_rate": 0.0001754924874791319, + "loss": 0.5784, + "step": 373 + }, + { + "epoch": 0.00013127746044103787, + "grad_norm": 0.41279932856559753, + "learning_rate": 0.00017542570951585977, + "loss": 0.5994, + "step": 374 + }, + { + "epoch": 0.000131628469693554, + "grad_norm": 0.3019951581954956, + "learning_rate": 0.00017535893155258764, + "loss": 0.5584, + "step": 375 + }, + { + "epoch": 0.00013197947894607015, + "grad_norm": 0.3079768121242523, + "learning_rate": 0.00017529215358931554, + "loss": 0.5904, + "step": 376 + }, + { + "epoch": 0.0001323304881985863, + "grad_norm": 0.5678027272224426, + "learning_rate": 0.00017522537562604342, + "loss": 0.6441, + "step": 377 + }, + { + "epoch": 0.00013268149745110244, + "grad_norm": 0.38624581694602966, + "learning_rate": 0.0001751585976627713, + "loss": 0.5582, + "step": 378 + }, + { + "epoch": 0.0001330325067036186, + "grad_norm": 0.4368002712726593, + "learning_rate": 0.00017509181969949916, + "loss": 0.686, + "step": 379 + }, + { + "epoch": 0.00013338351595613472, + "grad_norm": 0.3409269154071808, + "learning_rate": 0.00017502504173622704, + "loss": 0.582, + "step": 380 + }, + { + "epoch": 0.00013373452520865087, + "grad_norm": 0.3772698938846588, + "learning_rate": 0.0001749582637729549, + "loss": 0.5314, + "step": 381 + }, + { + "epoch": 0.00013408553446116702, + "grad_norm": 0.3791707158088684, + "learning_rate": 0.0001748914858096828, + "loss": 0.6143, + "step": 382 + }, + { + "epoch": 0.00013443654371368317, + "grad_norm": 0.4441101551055908, + "learning_rate": 0.0001748247078464107, + "loss": 0.5726, + "step": 383 + }, + { + "epoch": 0.0001347875529661993, + "grad_norm": 0.4160211980342865, + "learning_rate": 0.00017475792988313858, + "loss": 0.6003, + "step": 384 + }, + { + "epoch": 0.00013513856221871544, + "grad_norm": 0.41698628664016724, + "learning_rate": 0.00017469115191986646, + "loss": 0.4539, + "step": 385 + }, + { + "epoch": 0.00013548957147123159, + "grad_norm": 0.337007999420166, + "learning_rate": 0.00017462437395659433, + "loss": 0.5176, + "step": 386 + }, + { + "epoch": 0.00013584058072374774, + "grad_norm": 0.30926409363746643, + "learning_rate": 0.00017455759599332223, + "loss": 0.6072, + "step": 387 + }, + { + "epoch": 0.00013619158997626389, + "grad_norm": 0.3663052022457123, + "learning_rate": 0.0001744908180300501, + "loss": 0.538, + "step": 388 + }, + { + "epoch": 0.00013654259922878, + "grad_norm": 0.3410074710845947, + "learning_rate": 0.00017442404006677798, + "loss": 0.5687, + "step": 389 + }, + { + "epoch": 0.00013689360848129616, + "grad_norm": 0.5266095399856567, + "learning_rate": 0.00017435726210350585, + "loss": 0.6685, + "step": 390 + }, + { + "epoch": 0.0001372446177338123, + "grad_norm": 0.4020686149597168, + "learning_rate": 0.00017429048414023372, + "loss": 0.586, + "step": 391 + }, + { + "epoch": 0.00013759562698632846, + "grad_norm": 0.39995548129081726, + "learning_rate": 0.00017422370617696162, + "loss": 0.6958, + "step": 392 + }, + { + "epoch": 0.0001379466362388446, + "grad_norm": 0.4024721682071686, + "learning_rate": 0.0001741569282136895, + "loss": 0.6411, + "step": 393 + }, + { + "epoch": 0.00013829764549136073, + "grad_norm": 0.38193392753601074, + "learning_rate": 0.00017409015025041737, + "loss": 0.5857, + "step": 394 + }, + { + "epoch": 0.00013864865474387688, + "grad_norm": 0.39786526560783386, + "learning_rate": 0.00017402337228714524, + "loss": 0.5215, + "step": 395 + }, + { + "epoch": 0.00013899966399639303, + "grad_norm": 0.49223974347114563, + "learning_rate": 0.00017395659432387311, + "loss": 0.5881, + "step": 396 + }, + { + "epoch": 0.00013935067324890918, + "grad_norm": 0.3398894667625427, + "learning_rate": 0.00017388981636060101, + "loss": 0.5466, + "step": 397 + }, + { + "epoch": 0.00013970168250142533, + "grad_norm": 0.34891223907470703, + "learning_rate": 0.0001738230383973289, + "loss": 0.5901, + "step": 398 + }, + { + "epoch": 0.00014005269175394145, + "grad_norm": 0.47644108533859253, + "learning_rate": 0.00017375626043405676, + "loss": 0.5075, + "step": 399 + }, + { + "epoch": 0.0001404037010064576, + "grad_norm": 0.42530229687690735, + "learning_rate": 0.00017368948247078466, + "loss": 0.663, + "step": 400 + }, + { + "epoch": 0.00014075471025897375, + "grad_norm": 0.30858534574508667, + "learning_rate": 0.00017362270450751253, + "loss": 0.4724, + "step": 401 + }, + { + "epoch": 0.0001411057195114899, + "grad_norm": 0.42453449964523315, + "learning_rate": 0.0001735559265442404, + "loss": 0.6074, + "step": 402 + }, + { + "epoch": 0.00014145672876400605, + "grad_norm": 0.3964505195617676, + "learning_rate": 0.0001734891485809683, + "loss": 0.4913, + "step": 403 + }, + { + "epoch": 0.00014180773801652217, + "grad_norm": 0.3317703902721405, + "learning_rate": 0.00017342237061769618, + "loss": 0.5504, + "step": 404 + }, + { + "epoch": 0.00014215874726903832, + "grad_norm": 0.3912264108657837, + "learning_rate": 0.00017335559265442405, + "loss": 0.6301, + "step": 405 + }, + { + "epoch": 0.00014250975652155447, + "grad_norm": 0.3582877218723297, + "learning_rate": 0.00017328881469115193, + "loss": 0.6205, + "step": 406 + }, + { + "epoch": 0.00014286076577407062, + "grad_norm": 0.3691099286079407, + "learning_rate": 0.0001732220367278798, + "loss": 0.5348, + "step": 407 + }, + { + "epoch": 0.00014321177502658677, + "grad_norm": 0.35860803723335266, + "learning_rate": 0.0001731552587646077, + "loss": 0.6029, + "step": 408 + }, + { + "epoch": 0.0001435627842791029, + "grad_norm": 0.3640693426132202, + "learning_rate": 0.00017308848080133557, + "loss": 0.6673, + "step": 409 + }, + { + "epoch": 0.00014391379353161904, + "grad_norm": 0.3550623953342438, + "learning_rate": 0.00017302170283806345, + "loss": 0.4659, + "step": 410 + }, + { + "epoch": 0.0001442648027841352, + "grad_norm": 0.45885637402534485, + "learning_rate": 0.00017295492487479132, + "loss": 0.4781, + "step": 411 + }, + { + "epoch": 0.00014461581203665134, + "grad_norm": 0.3703556954860687, + "learning_rate": 0.0001728881469115192, + "loss": 0.4829, + "step": 412 + }, + { + "epoch": 0.0001449668212891675, + "grad_norm": 0.5436837077140808, + "learning_rate": 0.0001728213689482471, + "loss": 0.6056, + "step": 413 + }, + { + "epoch": 0.0001453178305416836, + "grad_norm": 0.3953244686126709, + "learning_rate": 0.00017275459098497497, + "loss": 0.4884, + "step": 414 + }, + { + "epoch": 0.00014566883979419976, + "grad_norm": 0.34003904461860657, + "learning_rate": 0.00017268781302170284, + "loss": 0.6014, + "step": 415 + }, + { + "epoch": 0.0001460198490467159, + "grad_norm": 0.3463648557662964, + "learning_rate": 0.0001726210350584307, + "loss": 0.603, + "step": 416 + }, + { + "epoch": 0.00014637085829923206, + "grad_norm": 0.4293590784072876, + "learning_rate": 0.0001725542570951586, + "loss": 0.6686, + "step": 417 + }, + { + "epoch": 0.0001467218675517482, + "grad_norm": 0.4243469834327698, + "learning_rate": 0.0001724874791318865, + "loss": 0.6422, + "step": 418 + }, + { + "epoch": 0.00014707287680426433, + "grad_norm": 0.38327839970588684, + "learning_rate": 0.0001724207011686144, + "loss": 0.5595, + "step": 419 + }, + { + "epoch": 0.00014742388605678048, + "grad_norm": 0.31334301829338074, + "learning_rate": 0.00017235392320534226, + "loss": 0.474, + "step": 420 + }, + { + "epoch": 0.00014777489530929663, + "grad_norm": 0.3335350453853607, + "learning_rate": 0.00017228714524207013, + "loss": 0.6172, + "step": 421 + }, + { + "epoch": 0.00014812590456181278, + "grad_norm": 0.373696506023407, + "learning_rate": 0.000172220367278798, + "loss": 0.6183, + "step": 422 + }, + { + "epoch": 0.00014847691381432893, + "grad_norm": 0.45814886689186096, + "learning_rate": 0.00017215358931552588, + "loss": 0.5059, + "step": 423 + }, + { + "epoch": 0.00014882792306684505, + "grad_norm": 0.3578277826309204, + "learning_rate": 0.00017208681135225378, + "loss": 0.5771, + "step": 424 + }, + { + "epoch": 0.0001491789323193612, + "grad_norm": 0.42081883549690247, + "learning_rate": 0.00017202003338898165, + "loss": 0.5604, + "step": 425 + }, + { + "epoch": 0.00014952994157187735, + "grad_norm": 0.3173503875732422, + "learning_rate": 0.00017195325542570953, + "loss": 0.5738, + "step": 426 + }, + { + "epoch": 0.0001498809508243935, + "grad_norm": 0.38292011618614197, + "learning_rate": 0.0001718864774624374, + "loss": 0.6067, + "step": 427 + }, + { + "epoch": 0.00015023196007690965, + "grad_norm": 0.3518977463245392, + "learning_rate": 0.00017181969949916527, + "loss": 0.5073, + "step": 428 + }, + { + "epoch": 0.00015058296932942577, + "grad_norm": 0.5157706141471863, + "learning_rate": 0.00017175292153589317, + "loss": 0.5496, + "step": 429 + }, + { + "epoch": 0.00015093397858194192, + "grad_norm": 0.32064110040664673, + "learning_rate": 0.00017168614357262105, + "loss": 0.4766, + "step": 430 + }, + { + "epoch": 0.00015128498783445807, + "grad_norm": 0.42229798436164856, + "learning_rate": 0.00017161936560934892, + "loss": 0.5953, + "step": 431 + }, + { + "epoch": 0.00015163599708697422, + "grad_norm": 0.4723895192146301, + "learning_rate": 0.0001715525876460768, + "loss": 0.4783, + "step": 432 + }, + { + "epoch": 0.00015198700633949037, + "grad_norm": 0.3841445744037628, + "learning_rate": 0.00017148580968280467, + "loss": 0.5003, + "step": 433 + }, + { + "epoch": 0.0001523380155920065, + "grad_norm": 0.38026461005210876, + "learning_rate": 0.00017141903171953257, + "loss": 0.5093, + "step": 434 + }, + { + "epoch": 0.00015268902484452264, + "grad_norm": 0.37034904956817627, + "learning_rate": 0.00017135225375626044, + "loss": 0.6158, + "step": 435 + }, + { + "epoch": 0.0001530400340970388, + "grad_norm": 0.3876091241836548, + "learning_rate": 0.00017128547579298834, + "loss": 0.5287, + "step": 436 + }, + { + "epoch": 0.00015339104334955494, + "grad_norm": 0.30055519938468933, + "learning_rate": 0.0001712186978297162, + "loss": 0.5018, + "step": 437 + }, + { + "epoch": 0.0001537420526020711, + "grad_norm": 0.36094966530799866, + "learning_rate": 0.00017115191986644409, + "loss": 0.4961, + "step": 438 + }, + { + "epoch": 0.0001540930618545872, + "grad_norm": 0.3300524055957794, + "learning_rate": 0.00017108514190317196, + "loss": 0.5246, + "step": 439 + }, + { + "epoch": 0.00015444407110710336, + "grad_norm": 0.40980783104896545, + "learning_rate": 0.00017101836393989986, + "loss": 0.5705, + "step": 440 + }, + { + "epoch": 0.0001547950803596195, + "grad_norm": 0.3442326784133911, + "learning_rate": 0.00017095158597662773, + "loss": 0.5595, + "step": 441 + }, + { + "epoch": 0.00015514608961213566, + "grad_norm": 0.48015034198760986, + "learning_rate": 0.0001708848080133556, + "loss": 0.5642, + "step": 442 + }, + { + "epoch": 0.0001554970988646518, + "grad_norm": 0.5570142269134521, + "learning_rate": 0.00017081803005008348, + "loss": 0.6111, + "step": 443 + }, + { + "epoch": 0.00015584810811716793, + "grad_norm": 0.30470094084739685, + "learning_rate": 0.00017075125208681135, + "loss": 0.5151, + "step": 444 + }, + { + "epoch": 0.00015619911736968408, + "grad_norm": 0.31946614384651184, + "learning_rate": 0.00017068447412353925, + "loss": 0.5265, + "step": 445 + }, + { + "epoch": 0.00015655012662220023, + "grad_norm": 0.38980719447135925, + "learning_rate": 0.00017061769616026712, + "loss": 0.575, + "step": 446 + }, + { + "epoch": 0.00015690113587471638, + "grad_norm": 0.4077732264995575, + "learning_rate": 0.000170550918196995, + "loss": 0.5729, + "step": 447 + }, + { + "epoch": 0.00015725214512723253, + "grad_norm": 0.38632732629776, + "learning_rate": 0.00017048414023372287, + "loss": 0.594, + "step": 448 + }, + { + "epoch": 0.00015760315437974865, + "grad_norm": 0.37193921208381653, + "learning_rate": 0.00017041736227045074, + "loss": 0.6062, + "step": 449 + }, + { + "epoch": 0.0001579541636322648, + "grad_norm": 0.399029016494751, + "learning_rate": 0.00017035058430717862, + "loss": 0.4538, + "step": 450 + }, + { + "epoch": 0.00015830517288478095, + "grad_norm": 0.37710487842559814, + "learning_rate": 0.00017028380634390652, + "loss": 0.5615, + "step": 451 + }, + { + "epoch": 0.0001586561821372971, + "grad_norm": 0.38591668009757996, + "learning_rate": 0.0001702170283806344, + "loss": 0.5316, + "step": 452 + }, + { + "epoch": 0.00015900719138981325, + "grad_norm": 0.3453538417816162, + "learning_rate": 0.0001701502504173623, + "loss": 0.4645, + "step": 453 + }, + { + "epoch": 0.00015935820064232937, + "grad_norm": 0.34171512722969055, + "learning_rate": 0.00017008347245409016, + "loss": 0.5856, + "step": 454 + }, + { + "epoch": 0.00015970920989484552, + "grad_norm": 0.39591720700263977, + "learning_rate": 0.00017001669449081804, + "loss": 0.573, + "step": 455 + }, + { + "epoch": 0.00016006021914736167, + "grad_norm": 0.4127822816371918, + "learning_rate": 0.00016994991652754594, + "loss": 0.5183, + "step": 456 + }, + { + "epoch": 0.00016041122839987782, + "grad_norm": 0.37893375754356384, + "learning_rate": 0.0001698831385642738, + "loss": 0.566, + "step": 457 + }, + { + "epoch": 0.00016076223765239397, + "grad_norm": 0.33429333567619324, + "learning_rate": 0.00016981636060100168, + "loss": 0.449, + "step": 458 + }, + { + "epoch": 0.0001611132469049101, + "grad_norm": 0.3333180546760559, + "learning_rate": 0.00016974958263772956, + "loss": 0.4441, + "step": 459 + }, + { + "epoch": 0.00016146425615742624, + "grad_norm": 0.3591359257698059, + "learning_rate": 0.00016968280467445743, + "loss": 0.55, + "step": 460 + }, + { + "epoch": 0.0001618152654099424, + "grad_norm": 0.35390427708625793, + "learning_rate": 0.00016961602671118533, + "loss": 0.6445, + "step": 461 + }, + { + "epoch": 0.00016216627466245854, + "grad_norm": 0.42036697268486023, + "learning_rate": 0.0001695492487479132, + "loss": 0.5411, + "step": 462 + }, + { + "epoch": 0.0001625172839149747, + "grad_norm": 0.42147770524024963, + "learning_rate": 0.00016948247078464108, + "loss": 0.6218, + "step": 463 + }, + { + "epoch": 0.0001628682931674908, + "grad_norm": 0.3960399329662323, + "learning_rate": 0.00016941569282136895, + "loss": 0.6608, + "step": 464 + }, + { + "epoch": 0.00016321930242000696, + "grad_norm": 0.39676985144615173, + "learning_rate": 0.00016934891485809682, + "loss": 0.5838, + "step": 465 + }, + { + "epoch": 0.0001635703116725231, + "grad_norm": 0.2839520573616028, + "learning_rate": 0.0001692821368948247, + "loss": 0.5334, + "step": 466 + }, + { + "epoch": 0.00016392132092503926, + "grad_norm": 0.3654347062110901, + "learning_rate": 0.0001692153589315526, + "loss": 0.6065, + "step": 467 + }, + { + "epoch": 0.0001642723301775554, + "grad_norm": 0.3709166646003723, + "learning_rate": 0.00016914858096828047, + "loss": 0.509, + "step": 468 + }, + { + "epoch": 0.00016462333943007153, + "grad_norm": 0.29224780201911926, + "learning_rate": 0.00016908180300500834, + "loss": 0.5372, + "step": 469 + }, + { + "epoch": 0.00016497434868258768, + "grad_norm": 0.34979283809661865, + "learning_rate": 0.00016901502504173624, + "loss": 0.3968, + "step": 470 + }, + { + "epoch": 0.00016532535793510383, + "grad_norm": 0.34580183029174805, + "learning_rate": 0.00016894824707846412, + "loss": 0.6032, + "step": 471 + }, + { + "epoch": 0.00016567636718761998, + "grad_norm": 0.39046213030815125, + "learning_rate": 0.00016888146911519202, + "loss": 0.5628, + "step": 472 + }, + { + "epoch": 0.00016602737644013613, + "grad_norm": 0.35301411151885986, + "learning_rate": 0.0001688146911519199, + "loss": 0.607, + "step": 473 + }, + { + "epoch": 0.00016637838569265225, + "grad_norm": 0.4572748839855194, + "learning_rate": 0.00016874791318864776, + "loss": 0.5018, + "step": 474 + }, + { + "epoch": 0.0001667293949451684, + "grad_norm": 0.38230374455451965, + "learning_rate": 0.00016868113522537564, + "loss": 0.5026, + "step": 475 + }, + { + "epoch": 0.00016708040419768455, + "grad_norm": 0.37066343426704407, + "learning_rate": 0.0001686143572621035, + "loss": 0.5819, + "step": 476 + }, + { + "epoch": 0.0001674314134502007, + "grad_norm": 0.3658660054206848, + "learning_rate": 0.0001685475792988314, + "loss": 0.6825, + "step": 477 + }, + { + "epoch": 0.00016778242270271685, + "grad_norm": 0.42174890637397766, + "learning_rate": 0.00016848080133555928, + "loss": 0.6065, + "step": 478 + }, + { + "epoch": 0.00016813343195523297, + "grad_norm": 0.3462882936000824, + "learning_rate": 0.00016841402337228716, + "loss": 0.5888, + "step": 479 + }, + { + "epoch": 0.00016848444120774912, + "grad_norm": 0.44681960344314575, + "learning_rate": 0.00016834724540901503, + "loss": 0.4987, + "step": 480 + }, + { + "epoch": 0.00016883545046026527, + "grad_norm": 0.3535650372505188, + "learning_rate": 0.0001682804674457429, + "loss": 0.6478, + "step": 481 + }, + { + "epoch": 0.00016918645971278142, + "grad_norm": 0.3357018232345581, + "learning_rate": 0.00016821368948247077, + "loss": 0.4949, + "step": 482 + }, + { + "epoch": 0.00016953746896529757, + "grad_norm": 0.42756739258766174, + "learning_rate": 0.00016814691151919868, + "loss": 0.6475, + "step": 483 + }, + { + "epoch": 0.0001698884782178137, + "grad_norm": 0.36174866557121277, + "learning_rate": 0.00016808013355592655, + "loss": 0.598, + "step": 484 + }, + { + "epoch": 0.00017023948747032984, + "grad_norm": 0.37115278840065, + "learning_rate": 0.00016801335559265442, + "loss": 0.6215, + "step": 485 + }, + { + "epoch": 0.000170590496722846, + "grad_norm": 0.340249627828598, + "learning_rate": 0.0001679465776293823, + "loss": 0.5702, + "step": 486 + }, + { + "epoch": 0.00017094150597536214, + "grad_norm": 0.31226348876953125, + "learning_rate": 0.0001678797996661102, + "loss": 0.6531, + "step": 487 + }, + { + "epoch": 0.0001712925152278783, + "grad_norm": 0.35571998357772827, + "learning_rate": 0.00016781302170283807, + "loss": 0.6406, + "step": 488 + }, + { + "epoch": 0.00017164352448039441, + "grad_norm": 0.4167378842830658, + "learning_rate": 0.00016774624373956597, + "loss": 0.5111, + "step": 489 + }, + { + "epoch": 0.00017199453373291056, + "grad_norm": 0.292304128408432, + "learning_rate": 0.00016767946577629384, + "loss": 0.6643, + "step": 490 + }, + { + "epoch": 0.0001723455429854267, + "grad_norm": 0.38789069652557373, + "learning_rate": 0.00016761268781302171, + "loss": 0.4542, + "step": 491 + }, + { + "epoch": 0.00017269655223794286, + "grad_norm": 0.33764714002609253, + "learning_rate": 0.0001675459098497496, + "loss": 0.4158, + "step": 492 + }, + { + "epoch": 0.00017304756149045898, + "grad_norm": 0.34849148988723755, + "learning_rate": 0.0001674791318864775, + "loss": 0.4737, + "step": 493 + }, + { + "epoch": 0.00017339857074297513, + "grad_norm": 0.2921352684497833, + "learning_rate": 0.00016741235392320536, + "loss": 0.679, + "step": 494 + }, + { + "epoch": 0.00017374957999549128, + "grad_norm": 0.33746641874313354, + "learning_rate": 0.00016734557595993323, + "loss": 0.4957, + "step": 495 + }, + { + "epoch": 0.00017410058924800743, + "grad_norm": 0.4029395878314972, + "learning_rate": 0.0001672787979966611, + "loss": 0.6708, + "step": 496 + }, + { + "epoch": 0.00017445159850052358, + "grad_norm": 0.440033882856369, + "learning_rate": 0.00016721202003338898, + "loss": 0.5889, + "step": 497 + }, + { + "epoch": 0.0001748026077530397, + "grad_norm": 0.330692857503891, + "learning_rate": 0.00016714524207011685, + "loss": 0.5942, + "step": 498 + }, + { + "epoch": 0.00017515361700555585, + "grad_norm": 0.3111809492111206, + "learning_rate": 0.00016707846410684475, + "loss": 0.5506, + "step": 499 + }, + { + "epoch": 0.000175504626258072, + "grad_norm": 0.38885676860809326, + "learning_rate": 0.00016701168614357263, + "loss": 0.4713, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 3000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9226037933441024e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/marques/outputs/checkpoint-500/training_args.bin b/marques/outputs/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd0ba520c124bb1ece608079704fa15e0236be45 --- /dev/null +++ b/marques/outputs/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09362706a3d58d219e41be1682b770b8f5069fcd630f7dbcadb71e4d4ce8859b +size 6289 diff --git a/marques/outputs/checkpoint-60/README.md b/marques/outputs/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d90a96dfe2e51221657a6e936d376789e21081f9 --- /dev/null +++ b/marques/outputs/checkpoint-60/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/marques/outputs/checkpoint-60/adapter_config.json b/marques/outputs/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9930a191a30254256c9550b1bdffa58b8d7aee8 --- /dev/null +++ b/marques/outputs/checkpoint-60/adapter_config.json @@ -0,0 +1,50 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "LlamaForCausalLM", + "parent_library": "transformers.models.llama.modeling_llama", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "gate_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/marques/outputs/checkpoint-60/adapter_model.safetensors b/marques/outputs/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eb1b8a7fb811eba8eebec883becc0b4837861372 --- /dev/null +++ b/marques/outputs/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1864333a338e9c981fec69c8271b789aeab72342578618c57f3a2e6441143ea3 +size 167832240 diff --git a/marques/outputs/checkpoint-60/optimizer.pt b/marques/outputs/checkpoint-60/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c7cbfdf46a90f07c6b6f8addb3887e49205ab56 --- /dev/null +++ b/marques/outputs/checkpoint-60/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fcafc5530611b48f2235792501638d72408dbfaf3411197c1db9c111dad4359 +size 85723685 diff --git a/marques/outputs/checkpoint-60/rng_state.pth b/marques/outputs/checkpoint-60/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0 --- /dev/null +++ b/marques/outputs/checkpoint-60/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33 +size 14645 diff --git a/marques/outputs/checkpoint-60/scheduler.pt b/marques/outputs/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f479d2115380c4b61e2a754016f0c8727d035f79 --- /dev/null +++ b/marques/outputs/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22ae51817158590b7adfad82fb9a3380e5197063501e610f9eaa5c6decb93fd2 +size 1465 diff --git a/marques/outputs/checkpoint-60/special_tokens_map.json b/marques/outputs/checkpoint-60/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..68b10c7f0a479eae0c358eac6a14959b3f9acdf1 --- /dev/null +++ b/marques/outputs/checkpoint-60/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/marques/outputs/checkpoint-60/tokenizer.json b/marques/outputs/checkpoint-60/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/marques/outputs/checkpoint-60/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/marques/outputs/checkpoint-60/tokenizer_config.json b/marques/outputs/checkpoint-60/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..92b1d94e894e5474ebea1d171e14751be79ca3e5 --- /dev/null +++ b/marques/outputs/checkpoint-60/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/marques/outputs/checkpoint-60/trainer_state.json b/marques/outputs/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8be7c7e2914dd50bb4f0e3a2891f792971518fcd --- /dev/null +++ b/marques/outputs/checkpoint-60/trainer_state.json @@ -0,0 +1,454 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.106055515096864e-05, + "eval_steps": 500, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.5100925251614403e-07, + "grad_norm": 1.3439100980758667, + "learning_rate": 0.0, + "loss": 2.6895, + "step": 1 + }, + { + "epoch": 7.020185050322881e-07, + "grad_norm": 1.3290140628814697, + "learning_rate": 4e-05, + "loss": 2.6313, + "step": 2 + }, + { + "epoch": 1.053027757548432e-06, + "grad_norm": 1.4032689332962036, + "learning_rate": 8e-05, + "loss": 2.7548, + "step": 3 + }, + { + "epoch": 1.4040370100645761e-06, + "grad_norm": 1.4434138536453247, + "learning_rate": 0.00012, + "loss": 2.5591, + "step": 4 + }, + { + "epoch": 1.75504626258072e-06, + "grad_norm": 1.4139286279678345, + "learning_rate": 0.00016, + "loss": 2.3189, + "step": 5 + }, + { + "epoch": 2.106055515096864e-06, + "grad_norm": 1.614639163017273, + "learning_rate": 0.0002, + "loss": 1.9921, + "step": 6 + }, + { + "epoch": 2.4570647676130083e-06, + "grad_norm": 4.1337995529174805, + "learning_rate": 0.00019636363636363636, + "loss": 1.8724, + "step": 7 + }, + { + "epoch": 2.8080740201291522e-06, + "grad_norm": 1.8266983032226562, + "learning_rate": 0.00019272727272727274, + "loss": 1.5255, + "step": 8 + }, + { + "epoch": 3.1590832726452962e-06, + "grad_norm": 1.3771417140960693, + "learning_rate": 0.0001890909090909091, + "loss": 1.2087, + "step": 9 + }, + { + "epoch": 3.51009252516144e-06, + "grad_norm": 1.1512426137924194, + "learning_rate": 0.00018545454545454545, + "loss": 0.9243, + "step": 10 + }, + { + "epoch": 3.861101777677584e-06, + "grad_norm": 1.0865983963012695, + "learning_rate": 0.00018181818181818183, + "loss": 0.9623, + "step": 11 + }, + { + "epoch": 4.212111030193728e-06, + "grad_norm": 0.877534806728363, + "learning_rate": 0.0001781818181818182, + "loss": 0.8208, + "step": 12 + }, + { + "epoch": 4.563120282709872e-06, + "grad_norm": 2.0119693279266357, + "learning_rate": 0.00017454545454545454, + "loss": 0.8427, + "step": 13 + }, + { + "epoch": 4.9141295352260165e-06, + "grad_norm": 1.1565978527069092, + "learning_rate": 0.0001709090909090909, + "loss": 0.81, + "step": 14 + }, + { + "epoch": 5.26513878774216e-06, + "grad_norm": 0.7689681649208069, + "learning_rate": 0.00016727272727272728, + "loss": 0.7098, + "step": 15 + }, + { + "epoch": 5.6161480402583045e-06, + "grad_norm": 0.7343666553497314, + "learning_rate": 0.00016363636363636366, + "loss": 0.7439, + "step": 16 + }, + { + "epoch": 5.967157292774448e-06, + "grad_norm": 0.8168903589248657, + "learning_rate": 0.00016, + "loss": 0.6915, + "step": 17 + }, + { + "epoch": 6.3181665452905924e-06, + "grad_norm": 0.85252845287323, + "learning_rate": 0.00015636363636363637, + "loss": 0.7202, + "step": 18 + }, + { + "epoch": 6.669175797806736e-06, + "grad_norm": 0.8797001838684082, + "learning_rate": 0.00015272727272727275, + "loss": 0.7545, + "step": 19 + }, + { + "epoch": 7.02018505032288e-06, + "grad_norm": 0.8794913291931152, + "learning_rate": 0.0001490909090909091, + "loss": 0.7479, + "step": 20 + }, + { + "epoch": 7.371194302839024e-06, + "grad_norm": 1.0016080141067505, + "learning_rate": 0.00014545454545454546, + "loss": 0.6765, + "step": 21 + }, + { + "epoch": 7.722203555355168e-06, + "grad_norm": 0.9535176753997803, + "learning_rate": 0.00014181818181818184, + "loss": 0.6871, + "step": 22 + }, + { + "epoch": 8.073212807871313e-06, + "grad_norm": 0.7885817289352417, + "learning_rate": 0.0001381818181818182, + "loss": 0.661, + "step": 23 + }, + { + "epoch": 8.424222060387455e-06, + "grad_norm": 0.5434080362319946, + "learning_rate": 0.00013454545454545455, + "loss": 0.6883, + "step": 24 + }, + { + "epoch": 8.7752313129036e-06, + "grad_norm": 0.5278591513633728, + "learning_rate": 0.00013090909090909093, + "loss": 0.7006, + "step": 25 + }, + { + "epoch": 9.126240565419744e-06, + "grad_norm": 0.5194696187973022, + "learning_rate": 0.00012727272727272728, + "loss": 0.6501, + "step": 26 + }, + { + "epoch": 9.477249817935889e-06, + "grad_norm": 0.6083847284317017, + "learning_rate": 0.00012363636363636364, + "loss": 0.6588, + "step": 27 + }, + { + "epoch": 9.828259070452033e-06, + "grad_norm": 0.4585495591163635, + "learning_rate": 0.00012, + "loss": 0.605, + "step": 28 + }, + { + "epoch": 1.0179268322968176e-05, + "grad_norm": 0.47044819593429565, + "learning_rate": 0.00011636363636363636, + "loss": 0.6838, + "step": 29 + }, + { + "epoch": 1.053027757548432e-05, + "grad_norm": 0.5118638873100281, + "learning_rate": 0.00011272727272727272, + "loss": 0.6128, + "step": 30 + }, + { + "epoch": 1.0881286828000465e-05, + "grad_norm": 0.5699962377548218, + "learning_rate": 0.00010909090909090909, + "loss": 0.6426, + "step": 31 + }, + { + "epoch": 1.1232296080516609e-05, + "grad_norm": 0.6765233874320984, + "learning_rate": 0.00010545454545454545, + "loss": 0.5343, + "step": 32 + }, + { + "epoch": 1.1583305333032752e-05, + "grad_norm": 0.5130237340927124, + "learning_rate": 0.00010181818181818181, + "loss": 0.6389, + "step": 33 + }, + { + "epoch": 1.1934314585548896e-05, + "grad_norm": 0.5473573803901672, + "learning_rate": 9.818181818181818e-05, + "loss": 0.6286, + "step": 34 + }, + { + "epoch": 1.228532383806504e-05, + "grad_norm": 0.5036295652389526, + "learning_rate": 9.454545454545455e-05, + "loss": 0.6204, + "step": 35 + }, + { + "epoch": 1.2636333090581185e-05, + "grad_norm": 0.5273815989494324, + "learning_rate": 9.090909090909092e-05, + "loss": 0.7215, + "step": 36 + }, + { + "epoch": 1.298734234309733e-05, + "grad_norm": 0.5185922980308533, + "learning_rate": 8.727272727272727e-05, + "loss": 0.6942, + "step": 37 + }, + { + "epoch": 1.3338351595613472e-05, + "grad_norm": 0.3941945731639862, + "learning_rate": 8.363636363636364e-05, + "loss": 0.7038, + "step": 38 + }, + { + "epoch": 1.3689360848129616e-05, + "grad_norm": 0.5436787009239197, + "learning_rate": 8e-05, + "loss": 0.6119, + "step": 39 + }, + { + "epoch": 1.404037010064576e-05, + "grad_norm": 0.4706076383590698, + "learning_rate": 7.636363636363637e-05, + "loss": 0.5971, + "step": 40 + }, + { + "epoch": 1.4391379353161905e-05, + "grad_norm": 0.44814205169677734, + "learning_rate": 7.272727272727273e-05, + "loss": 0.7108, + "step": 41 + }, + { + "epoch": 1.4742388605678048e-05, + "grad_norm": 0.506255030632019, + "learning_rate": 6.90909090909091e-05, + "loss": 0.609, + "step": 42 + }, + { + "epoch": 1.5093397858194192e-05, + "grad_norm": 0.4711558520793915, + "learning_rate": 6.545454545454546e-05, + "loss": 0.6348, + "step": 43 + }, + { + "epoch": 1.5444407110710337e-05, + "grad_norm": 0.4260883331298828, + "learning_rate": 6.181818181818182e-05, + "loss": 0.7328, + "step": 44 + }, + { + "epoch": 1.579541636322648e-05, + "grad_norm": 0.3931660056114197, + "learning_rate": 5.818181818181818e-05, + "loss": 0.6853, + "step": 45 + }, + { + "epoch": 1.6146425615742626e-05, + "grad_norm": 0.42857930064201355, + "learning_rate": 5.4545454545454546e-05, + "loss": 0.6452, + "step": 46 + }, + { + "epoch": 1.6497434868258768e-05, + "grad_norm": 0.45734405517578125, + "learning_rate": 5.090909090909091e-05, + "loss": 0.6251, + "step": 47 + }, + { + "epoch": 1.684844412077491e-05, + "grad_norm": 0.4702906906604767, + "learning_rate": 4.7272727272727275e-05, + "loss": 0.6446, + "step": 48 + }, + { + "epoch": 1.7199453373291057e-05, + "grad_norm": 0.5064498782157898, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.5957, + "step": 49 + }, + { + "epoch": 1.75504626258072e-05, + "grad_norm": 0.8002666234970093, + "learning_rate": 4e-05, + "loss": 0.6584, + "step": 50 + }, + { + "epoch": 1.7901471878323346e-05, + "grad_norm": 0.4723510444164276, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.6886, + "step": 51 + }, + { + "epoch": 1.825248113083949e-05, + "grad_norm": 0.5034312009811401, + "learning_rate": 3.272727272727273e-05, + "loss": 0.6713, + "step": 52 + }, + { + "epoch": 1.860349038335563e-05, + "grad_norm": 0.39916467666625977, + "learning_rate": 2.909090909090909e-05, + "loss": 0.7523, + "step": 53 + }, + { + "epoch": 1.8954499635871777e-05, + "grad_norm": 0.4958766996860504, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.5825, + "step": 54 + }, + { + "epoch": 1.930550888838792e-05, + "grad_norm": 0.41744646430015564, + "learning_rate": 2.1818181818181818e-05, + "loss": 0.6487, + "step": 55 + }, + { + "epoch": 1.9656518140904066e-05, + "grad_norm": 0.5056137442588806, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.7061, + "step": 56 + }, + { + "epoch": 2.000752739342021e-05, + "grad_norm": 0.49774253368377686, + "learning_rate": 1.4545454545454545e-05, + "loss": 0.5808, + "step": 57 + }, + { + "epoch": 2.035853664593635e-05, + "grad_norm": 0.4683721959590912, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.6343, + "step": 58 + }, + { + "epoch": 2.0709545898452498e-05, + "grad_norm": 0.4637458026409149, + "learning_rate": 7.272727272727272e-06, + "loss": 0.5951, + "step": 59 + }, + { + "epoch": 2.106055515096864e-05, + "grad_norm": 0.48048844933509827, + "learning_rate": 3.636363636363636e-06, + "loss": 0.4746, + "step": 60 + } + ], + "logging_steps": 1, + "max_steps": 60, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2300013591379968.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/marques/outputs/checkpoint-60/training_args.bin b/marques/outputs/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..daf896f743447ddcc62c1edf01cea45ddf6b170a --- /dev/null +++ b/marques/outputs/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc78e0fb9d45883c88535afcb3b214f6bdeb826a5b630640f711b334c4d35d5a +size 6289