diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..668c998a994c3b3fff2f9d0d3fff3ec1afcc426f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-1086/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1629/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2172/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2715/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3258/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3801/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4344/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4887/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-543/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5430/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c4354d1e6c5874fc6ead8955cc29893a9d09fcb8 --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +model_name: seed_42 +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +licence: license +pipeline_tag: text-generation +--- + +# Model Card for seed_42 + +This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/camilab-stanford-university/subliminal_learning/runs/9746rvh5) + + + +This model was trained with SFT. + +### Framework versions + +- PEFT 0.19.1 +- TRL: 1.2.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.8.4 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c57a413b91fad886bd899d552a4aace9704085e4 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44a37f4e701d0d74b2f3087cbbb4d2cce354e5bfcb1651dbb4d48e82fd2234d7 +size 80792096 diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-1086/README.md b/checkpoint-1086/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-1086/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1086/adapter_config.json b/checkpoint-1086/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-1086/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1086/adapter_model.safetensors b/checkpoint-1086/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80d00e51d8c73891cebcae009d307dfdd97cc00e --- /dev/null +++ b/checkpoint-1086/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eace20f3ff41af75c2ea9c9643e6063d8734d60b4b3bcec95f05c8940d3430be +size 80792096 diff --git a/checkpoint-1086/chat_template.jinja b/checkpoint-1086/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-1086/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-1086/tokenizer.json b/checkpoint-1086/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-1086/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-1086/tokenizer_config.json b/checkpoint-1086/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-1086/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-1086/trainer_state.json b/checkpoint-1086/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b028f54e62aaa80a880b3caf59fa9a2987e776da --- /dev/null +++ b/checkpoint-1086/trainer_state.json @@ -0,0 +1,1136 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1086, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + }, + { + "entropy": 0.9047818422317505, + "epoch": 1.0128913443830572, + "grad_norm": 0.6781501173973083, + "learning_rate": 9.929008521021325e-05, + "loss": 0.0863916516304016, + "mean_token_accuracy": 0.9673655688762665, + "num_tokens": 11299715.0, + "step": 550 + }, + { + "entropy": 0.8856981039047241, + "epoch": 1.0313075506445673, + "grad_norm": 0.7143136858940125, + "learning_rate": 9.923803553055937e-05, + "loss": 0.08632323145866394, + "mean_token_accuracy": 0.9677783191204071, + "num_tokens": 11505059.0, + "step": 560 + }, + { + "entropy": 0.8937099635601043, + "epoch": 1.0497237569060773, + "grad_norm": 0.7751694321632385, + "learning_rate": 9.918415928038325e-05, + "loss": 0.08178263902664185, + "mean_token_accuracy": 0.9694291114807129, + "num_tokens": 11710464.0, + "step": 570 + }, + { + "entropy": 0.8858704209327698, + "epoch": 1.0681399631675874, + "grad_norm": 0.7492292523384094, + "learning_rate": 9.912845845831805e-05, + "loss": 0.08074211478233337, + "mean_token_accuracy": 0.9692470014095307, + "num_tokens": 11915959.0, + "step": 580 + }, + { + "entropy": 0.8948039829730987, + "epoch": 1.0865561694290977, + "grad_norm": 0.8116479516029358, + "learning_rate": 9.907093513068259e-05, + "loss": 0.08712012171745301, + "mean_token_accuracy": 0.9669980227947235, + "num_tokens": 12121499.0, + "step": 590 + }, + { + "entropy": 0.8846789538860321, + "epoch": 1.1049723756906078, + "grad_norm": 0.7295626997947693, + "learning_rate": 9.901159143140471e-05, + "loss": 0.08444435596466064, + "mean_token_accuracy": 0.9674544095993042, + "num_tokens": 12327061.0, + "step": 600 + }, + { + "entropy": 0.8734103918075562, + "epoch": 1.1233885819521179, + "grad_norm": 0.9585768580436707, + "learning_rate": 9.89504295619421e-05, + "loss": 0.08022565841674804, + "mean_token_accuracy": 0.969569206237793, + "num_tokens": 12532305.0, + "step": 610 + }, + { + "entropy": 0.8640486001968384, + "epoch": 1.141804788213628, + "grad_norm": 0.7891159057617188, + "learning_rate": 9.88874517912006e-05, + "loss": 0.08415375947952271, + "mean_token_accuracy": 0.9678892493247986, + "num_tokens": 12737828.0, + "step": 620 + }, + { + "entropy": 0.8599755525588989, + "epoch": 1.160220994475138, + "grad_norm": 0.5801345109939575, + "learning_rate": 9.882266045545012e-05, + "loss": 0.08100489974021911, + "mean_token_accuracy": 0.9688023269176483, + "num_tokens": 12943343.0, + "step": 630 + }, + { + "entropy": 0.86524977684021, + "epoch": 1.1786372007366483, + "grad_norm": 0.7633041143417358, + "learning_rate": 9.87560579582379e-05, + "loss": 0.07859406471252442, + "mean_token_accuracy": 0.9702189445495606, + "num_tokens": 13148473.0, + "step": 640 + }, + { + "entropy": 0.8466695249080658, + "epoch": 1.1970534069981584, + "grad_norm": 0.8672215938568115, + "learning_rate": 9.868764677029934e-05, + "loss": 0.08082623481750488, + "mean_token_accuracy": 0.9689972400665283, + "num_tokens": 13353890.0, + "step": 650 + }, + { + "entropy": 0.8596941530704498, + "epoch": 1.2154696132596685, + "grad_norm": 0.7524124383926392, + "learning_rate": 9.861742942946639e-05, + "loss": 0.0789935290813446, + "mean_token_accuracy": 0.9693858206272126, + "num_tokens": 13559475.0, + "step": 660 + }, + { + "entropy": 0.8708749234676361, + "epoch": 1.2338858195211786, + "grad_norm": 0.5777031183242798, + "learning_rate": 9.854540854057337e-05, + "loss": 0.07773642539978028, + "mean_token_accuracy": 0.970385092496872, + "num_tokens": 13765076.0, + "step": 670 + }, + { + "entropy": 0.8651713371276856, + "epoch": 1.2523020257826887, + "grad_norm": 0.7924166321754456, + "learning_rate": 9.847158677536034e-05, + "loss": 0.0766686737537384, + "mean_token_accuracy": 0.9702267110347748, + "num_tokens": 13970642.0, + "step": 680 + }, + { + "entropy": 0.8763024985790253, + "epoch": 1.270718232044199, + "grad_norm": 0.741219162940979, + "learning_rate": 9.839596687237403e-05, + "loss": 0.07189929485321045, + "mean_token_accuracy": 0.9727097094058991, + "num_tokens": 14176556.0, + "step": 690 + }, + { + "entropy": 0.8556921362876893, + "epoch": 1.289134438305709, + "grad_norm": 0.6298198103904724, + "learning_rate": 9.831855163686618e-05, + "loss": 0.07608137726783752, + "mean_token_accuracy": 0.9716399371623993, + "num_tokens": 14381686.0, + "step": 700 + }, + { + "entropy": 0.869178420305252, + "epoch": 1.3075506445672191, + "grad_norm": 0.5850273370742798, + "learning_rate": 9.823934394068952e-05, + "loss": 0.07437651753425598, + "mean_token_accuracy": 0.9709566533565521, + "num_tokens": 14586814.0, + "step": 710 + }, + { + "entropy": 0.8708595156669616, + "epoch": 1.3259668508287292, + "grad_norm": 0.6580632328987122, + "learning_rate": 9.815834672219127e-05, + "loss": 0.07518917322158813, + "mean_token_accuracy": 0.9717426657676697, + "num_tokens": 14792321.0, + "step": 720 + }, + { + "entropy": 0.8826817810535431, + "epoch": 1.3443830570902393, + "grad_norm": 0.8788532018661499, + "learning_rate": 9.807556298610404e-05, + "loss": 0.07579240798950196, + "mean_token_accuracy": 0.9706341981887817, + "num_tokens": 14997810.0, + "step": 730 + }, + { + "entropy": 0.9012470185756684, + "epoch": 1.3627992633517496, + "grad_norm": 0.7022138237953186, + "learning_rate": 9.799099580343441e-05, + "loss": 0.0775588572025299, + "mean_token_accuracy": 0.9699241399765015, + "num_tokens": 15203795.0, + "step": 740 + }, + { + "entropy": 0.886955714225769, + "epoch": 1.3812154696132597, + "grad_norm": 0.7881133556365967, + "learning_rate": 9.790464831134903e-05, + "loss": 0.07125020027160645, + "mean_token_accuracy": 0.9723815560340882, + "num_tokens": 15408974.0, + "step": 750 + }, + { + "entropy": 0.9047374844551086, + "epoch": 1.3996316758747698, + "grad_norm": 0.9082005023956299, + "learning_rate": 9.781652371305824e-05, + "loss": 0.07004334926605224, + "mean_token_accuracy": 0.9725580036640167, + "num_tokens": 15614399.0, + "step": 760 + }, + { + "entropy": 0.9039053857326508, + "epoch": 1.4180478821362799, + "grad_norm": 0.8060817122459412, + "learning_rate": 9.77266252776972e-05, + "loss": 0.07103485465049744, + "mean_token_accuracy": 0.9721468150615692, + "num_tokens": 15819895.0, + "step": 770 + }, + { + "entropy": 0.8998047232627868, + "epoch": 1.43646408839779, + "grad_norm": 1.0152642726898193, + "learning_rate": 9.763495634020467e-05, + "loss": 0.07411704063415528, + "mean_token_accuracy": 0.9711063146591187, + "num_tokens": 16025297.0, + "step": 780 + }, + { + "entropy": 0.9120213568210602, + "epoch": 1.4548802946593002, + "grad_norm": 0.6288319826126099, + "learning_rate": 9.754152030119921e-05, + "loss": 0.07223712205886841, + "mean_token_accuracy": 0.9722476422786712, + "num_tokens": 16230656.0, + "step": 790 + }, + { + "entropy": 0.9142370820045471, + "epoch": 1.4732965009208103, + "grad_norm": 0.7854700088500977, + "learning_rate": 9.744632062685311e-05, + "loss": 0.07186744809150696, + "mean_token_accuracy": 0.972247713804245, + "num_tokens": 16435943.0, + "step": 800 + }, + { + "entropy": 0.8920814216136932, + "epoch": 1.4917127071823204, + "grad_norm": 0.6227074265480042, + "learning_rate": 9.734936084876383e-05, + "loss": 0.07016961574554444, + "mean_token_accuracy": 0.9725603640079499, + "num_tokens": 16641635.0, + "step": 810 + }, + { + "entropy": 0.891328877210617, + "epoch": 1.5101289134438307, + "grad_norm": 0.7601346969604492, + "learning_rate": 9.725064456382283e-05, + "loss": 0.07137494087219239, + "mean_token_accuracy": 0.9722997546195984, + "num_tokens": 16847194.0, + "step": 820 + }, + { + "entropy": 0.8921217978000641, + "epoch": 1.5285451197053406, + "grad_norm": 0.7813850045204163, + "learning_rate": 9.715017543408233e-05, + "loss": 0.06890199184417725, + "mean_token_accuracy": 0.9735044002532959, + "num_tokens": 17052807.0, + "step": 830 + }, + { + "entropy": 0.9085914671421051, + "epoch": 1.5469613259668509, + "grad_norm": 0.6184289455413818, + "learning_rate": 9.704795718661939e-05, + "loss": 0.07043765187263488, + "mean_token_accuracy": 0.9725716531276702, + "num_tokens": 17258284.0, + "step": 840 + }, + { + "entropy": 0.9029861629009247, + "epoch": 1.565377532228361, + "grad_norm": 0.7082377076148987, + "learning_rate": 9.694399361339752e-05, + "loss": 0.07113839387893676, + "mean_token_accuracy": 0.9725669205188752, + "num_tokens": 17464326.0, + "step": 850 + }, + { + "entropy": 0.8856533527374267, + "epoch": 1.583793738489871, + "grad_norm": 0.7409216165542603, + "learning_rate": 9.683828857112627e-05, + "loss": 0.07077333331108093, + "mean_token_accuracy": 0.9731084644794464, + "num_tokens": 17669537.0, + "step": 860 + }, + { + "entropy": 0.8613030433654785, + "epoch": 1.6022099447513813, + "grad_norm": 0.6801561713218689, + "learning_rate": 9.673084598111789e-05, + "loss": 0.06885308027267456, + "mean_token_accuracy": 0.97266526222229, + "num_tokens": 17875289.0, + "step": 870 + }, + { + "entropy": 0.8692965865135193, + "epoch": 1.6206261510128912, + "grad_norm": 1.1621277332305908, + "learning_rate": 9.662166982914203e-05, + "loss": 0.07017780542373657, + "mean_token_accuracy": 0.9733059942722321, + "num_tokens": 18080404.0, + "step": 880 + }, + { + "entropy": 0.8671502113342285, + "epoch": 1.6390423572744015, + "grad_norm": 0.7518903613090515, + "learning_rate": 9.651076416527787e-05, + "loss": 0.06977018713951111, + "mean_token_accuracy": 0.9730017304420471, + "num_tokens": 18285699.0, + "step": 890 + }, + { + "entropy": 0.8662045657634735, + "epoch": 1.6574585635359116, + "grad_norm": 0.6622698903083801, + "learning_rate": 9.639813310376378e-05, + "loss": 0.06620995998382569, + "mean_token_accuracy": 0.9737491130828857, + "num_tokens": 18491097.0, + "step": 900 + }, + { + "entropy": 0.8548173069953918, + "epoch": 1.6758747697974217, + "grad_norm": 0.8941843509674072, + "learning_rate": 9.628378082284479e-05, + "loss": 0.06711119413375854, + "mean_token_accuracy": 0.9740589797496796, + "num_tokens": 18696827.0, + "step": 910 + }, + { + "entropy": 0.8763562262058258, + "epoch": 1.694290976058932, + "grad_norm": 0.7571700215339661, + "learning_rate": 9.616771156461755e-05, + "loss": 0.07263468503952027, + "mean_token_accuracy": 0.9717419981956482, + "num_tokens": 18902513.0, + "step": 920 + }, + { + "entropy": 0.8663733780384064, + "epoch": 1.7127071823204418, + "grad_norm": 0.7886489629745483, + "learning_rate": 9.604992963487298e-05, + "loss": 0.07074605226516724, + "mean_token_accuracy": 0.9724965393543243, + "num_tokens": 19107812.0, + "step": 930 + }, + { + "entropy": 0.8673004627227783, + "epoch": 1.7311233885819521, + "grad_norm": 0.8180726170539856, + "learning_rate": 9.593043940293647e-05, + "loss": 0.06831735372543335, + "mean_token_accuracy": 0.9733696818351746, + "num_tokens": 19313330.0, + "step": 940 + }, + { + "entropy": 0.8525971233844757, + "epoch": 1.7495395948434622, + "grad_norm": 0.6576228737831116, + "learning_rate": 9.580924530150595e-05, + "loss": 0.06567002534866333, + "mean_token_accuracy": 0.9745754361152649, + "num_tokens": 19518671.0, + "step": 950 + }, + { + "entropy": 0.8605451703071594, + "epoch": 1.7679558011049723, + "grad_norm": 0.7171661257743835, + "learning_rate": 9.568635182648725e-05, + "loss": 0.06872050762176514, + "mean_token_accuracy": 0.9732091546058654, + "num_tokens": 19724135.0, + "step": 960 + }, + { + "entropy": 0.8642210960388184, + "epoch": 1.7863720073664826, + "grad_norm": 0.7603147029876709, + "learning_rate": 9.556176353682746e-05, + "loss": 0.06766576766967773, + "mean_token_accuracy": 0.9728681743144989, + "num_tokens": 19928785.0, + "step": 970 + }, + { + "entropy": 0.8543185651302337, + "epoch": 1.8047882136279927, + "grad_norm": 0.7280875444412231, + "learning_rate": 9.543548505434581e-05, + "loss": 0.06851862668991089, + "mean_token_accuracy": 0.9737437188625335, + "num_tokens": 20134195.0, + "step": 980 + }, + { + "entropy": 0.8744745373725891, + "epoch": 1.8232044198895028, + "grad_norm": 0.5897248983383179, + "learning_rate": 9.530752106356209e-05, + "loss": 0.06809053421020508, + "mean_token_accuracy": 0.9733593761920929, + "num_tokens": 20339517.0, + "step": 990 + }, + { + "entropy": 0.8623859465122223, + "epoch": 1.8416206261510129, + "grad_norm": 0.7515265345573425, + "learning_rate": 9.517787631152298e-05, + "loss": 0.07257847785949707, + "mean_token_accuracy": 0.9714054942131043, + "num_tokens": 20545249.0, + "step": 1000 + }, + { + "entropy": 0.8669404804706573, + "epoch": 1.860036832412523, + "grad_norm": 0.7144560813903809, + "learning_rate": 9.504655560762596e-05, + "loss": 0.06832354068756104, + "mean_token_accuracy": 0.9735779523849487, + "num_tokens": 20750507.0, + "step": 1010 + }, + { + "entropy": 0.8493516445159912, + "epoch": 1.8784530386740332, + "grad_norm": 0.6559189558029175, + "learning_rate": 9.491356382344081e-05, + "loss": 0.0629766047000885, + "mean_token_accuracy": 0.9754977762699127, + "num_tokens": 20955956.0, + "step": 1020 + }, + { + "entropy": 0.8599376022815705, + "epoch": 1.8968692449355433, + "grad_norm": 0.6792973279953003, + "learning_rate": 9.477890589252895e-05, + "loss": 0.0666757881641388, + "mean_token_accuracy": 0.974083811044693, + "num_tokens": 21161163.0, + "step": 1030 + }, + { + "entropy": 0.8458438158035279, + "epoch": 1.9152854511970534, + "grad_norm": 0.6941778659820557, + "learning_rate": 9.464258681026042e-05, + "loss": 0.06307152509689332, + "mean_token_accuracy": 0.9757042229175568, + "num_tokens": 21366525.0, + "step": 1040 + }, + { + "entropy": 0.848515909910202, + "epoch": 1.9337016574585635, + "grad_norm": 0.7307806611061096, + "learning_rate": 9.450461163362855e-05, + "loss": 0.06307026147842407, + "mean_token_accuracy": 0.9750974595546722, + "num_tokens": 21572238.0, + "step": 1050 + }, + { + "entropy": 0.8563454031944275, + "epoch": 1.9521178637200736, + "grad_norm": 0.7222106456756592, + "learning_rate": 9.436498548106236e-05, + "loss": 0.0647726058959961, + "mean_token_accuracy": 0.974629694223404, + "num_tokens": 21777633.0, + "step": 1060 + }, + { + "entropy": 0.8656457483768463, + "epoch": 1.9705340699815839, + "grad_norm": 0.67178875207901, + "learning_rate": 9.422371353223674e-05, + "loss": 0.06573554277420043, + "mean_token_accuracy": 0.9745908617973328, + "num_tokens": 21983116.0, + "step": 1070 + }, + { + "entropy": 0.8630891263484954, + "epoch": 1.988950276243094, + "grad_norm": 0.6956593990325928, + "learning_rate": 9.408080102788016e-05, + "loss": 0.06630704402923585, + "mean_token_accuracy": 0.9741333484649658, + "num_tokens": 22188662.0, + "step": 1080 + }, + { + "epoch": 2.0, + "eval_entropy": 0.8560857042022373, + "eval_loss": 0.06494329869747162, + "eval_mean_token_accuracy": 0.9745692672936813, + "eval_num_tokens": 22311800.0, + "eval_runtime": 10.129, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 11.354, + "step": 1086 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0639691635941704e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1086/training_args.bin b/checkpoint-1086/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-1086/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/checkpoint-1629/README.md b/checkpoint-1629/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-1629/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1629/adapter_config.json b/checkpoint-1629/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-1629/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1629/adapter_model.safetensors b/checkpoint-1629/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..004ab27f8498192090bcd87ba2405f20b982200b --- /dev/null +++ b/checkpoint-1629/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebf4459fd4ce731043eb056554bc1c81afea162e43afc91a4e21906da57bbdc0 +size 80792096 diff --git a/checkpoint-1629/chat_template.jinja b/checkpoint-1629/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-1629/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-1629/tokenizer.json b/checkpoint-1629/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-1629/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-1629/tokenizer_config.json b/checkpoint-1629/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-1629/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-1629/trainer_state.json b/checkpoint-1629/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..19c86b53fe97ac71abe1a1e1bfb5b2fe70c3f1dc --- /dev/null +++ b/checkpoint-1629/trainer_state.json @@ -0,0 +1,1687 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1629, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + }, + { + "entropy": 0.9047818422317505, + "epoch": 1.0128913443830572, + "grad_norm": 0.6781501173973083, + "learning_rate": 9.929008521021325e-05, + "loss": 0.0863916516304016, + "mean_token_accuracy": 0.9673655688762665, + "num_tokens": 11299715.0, + "step": 550 + }, + { + "entropy": 0.8856981039047241, + "epoch": 1.0313075506445673, + "grad_norm": 0.7143136858940125, + "learning_rate": 9.923803553055937e-05, + "loss": 0.08632323145866394, + "mean_token_accuracy": 0.9677783191204071, + "num_tokens": 11505059.0, + "step": 560 + }, + { + "entropy": 0.8937099635601043, + "epoch": 1.0497237569060773, + "grad_norm": 0.7751694321632385, + "learning_rate": 9.918415928038325e-05, + "loss": 0.08178263902664185, + "mean_token_accuracy": 0.9694291114807129, + "num_tokens": 11710464.0, + "step": 570 + }, + { + "entropy": 0.8858704209327698, + "epoch": 1.0681399631675874, + "grad_norm": 0.7492292523384094, + "learning_rate": 9.912845845831805e-05, + "loss": 0.08074211478233337, + "mean_token_accuracy": 0.9692470014095307, + "num_tokens": 11915959.0, + "step": 580 + }, + { + "entropy": 0.8948039829730987, + "epoch": 1.0865561694290977, + "grad_norm": 0.8116479516029358, + "learning_rate": 9.907093513068259e-05, + "loss": 0.08712012171745301, + "mean_token_accuracy": 0.9669980227947235, + "num_tokens": 12121499.0, + "step": 590 + }, + { + "entropy": 0.8846789538860321, + "epoch": 1.1049723756906078, + "grad_norm": 0.7295626997947693, + "learning_rate": 9.901159143140471e-05, + "loss": 0.08444435596466064, + "mean_token_accuracy": 0.9674544095993042, + "num_tokens": 12327061.0, + "step": 600 + }, + { + "entropy": 0.8734103918075562, + "epoch": 1.1233885819521179, + "grad_norm": 0.9585768580436707, + "learning_rate": 9.89504295619421e-05, + "loss": 0.08022565841674804, + "mean_token_accuracy": 0.969569206237793, + "num_tokens": 12532305.0, + "step": 610 + }, + { + "entropy": 0.8640486001968384, + "epoch": 1.141804788213628, + "grad_norm": 0.7891159057617188, + "learning_rate": 9.88874517912006e-05, + "loss": 0.08415375947952271, + "mean_token_accuracy": 0.9678892493247986, + "num_tokens": 12737828.0, + "step": 620 + }, + { + "entropy": 0.8599755525588989, + "epoch": 1.160220994475138, + "grad_norm": 0.5801345109939575, + "learning_rate": 9.882266045545012e-05, + "loss": 0.08100489974021911, + "mean_token_accuracy": 0.9688023269176483, + "num_tokens": 12943343.0, + "step": 630 + }, + { + "entropy": 0.86524977684021, + "epoch": 1.1786372007366483, + "grad_norm": 0.7633041143417358, + "learning_rate": 9.87560579582379e-05, + "loss": 0.07859406471252442, + "mean_token_accuracy": 0.9702189445495606, + "num_tokens": 13148473.0, + "step": 640 + }, + { + "entropy": 0.8466695249080658, + "epoch": 1.1970534069981584, + "grad_norm": 0.8672215938568115, + "learning_rate": 9.868764677029934e-05, + "loss": 0.08082623481750488, + "mean_token_accuracy": 0.9689972400665283, + "num_tokens": 13353890.0, + "step": 650 + }, + { + "entropy": 0.8596941530704498, + "epoch": 1.2154696132596685, + "grad_norm": 0.7524124383926392, + "learning_rate": 9.861742942946639e-05, + "loss": 0.0789935290813446, + "mean_token_accuracy": 0.9693858206272126, + "num_tokens": 13559475.0, + "step": 660 + }, + { + "entropy": 0.8708749234676361, + "epoch": 1.2338858195211786, + "grad_norm": 0.5777031183242798, + "learning_rate": 9.854540854057337e-05, + "loss": 0.07773642539978028, + "mean_token_accuracy": 0.970385092496872, + "num_tokens": 13765076.0, + "step": 670 + }, + { + "entropy": 0.8651713371276856, + "epoch": 1.2523020257826887, + "grad_norm": 0.7924166321754456, + "learning_rate": 9.847158677536034e-05, + "loss": 0.0766686737537384, + "mean_token_accuracy": 0.9702267110347748, + "num_tokens": 13970642.0, + "step": 680 + }, + { + "entropy": 0.8763024985790253, + "epoch": 1.270718232044199, + "grad_norm": 0.741219162940979, + "learning_rate": 9.839596687237403e-05, + "loss": 0.07189929485321045, + "mean_token_accuracy": 0.9727097094058991, + "num_tokens": 14176556.0, + "step": 690 + }, + { + "entropy": 0.8556921362876893, + "epoch": 1.289134438305709, + "grad_norm": 0.6298198103904724, + "learning_rate": 9.831855163686618e-05, + "loss": 0.07608137726783752, + "mean_token_accuracy": 0.9716399371623993, + "num_tokens": 14381686.0, + "step": 700 + }, + { + "entropy": 0.869178420305252, + "epoch": 1.3075506445672191, + "grad_norm": 0.5850273370742798, + "learning_rate": 9.823934394068952e-05, + "loss": 0.07437651753425598, + "mean_token_accuracy": 0.9709566533565521, + "num_tokens": 14586814.0, + "step": 710 + }, + { + "entropy": 0.8708595156669616, + "epoch": 1.3259668508287292, + "grad_norm": 0.6580632328987122, + "learning_rate": 9.815834672219127e-05, + "loss": 0.07518917322158813, + "mean_token_accuracy": 0.9717426657676697, + "num_tokens": 14792321.0, + "step": 720 + }, + { + "entropy": 0.8826817810535431, + "epoch": 1.3443830570902393, + "grad_norm": 0.8788532018661499, + "learning_rate": 9.807556298610404e-05, + "loss": 0.07579240798950196, + "mean_token_accuracy": 0.9706341981887817, + "num_tokens": 14997810.0, + "step": 730 + }, + { + "entropy": 0.9012470185756684, + "epoch": 1.3627992633517496, + "grad_norm": 0.7022138237953186, + "learning_rate": 9.799099580343441e-05, + "loss": 0.0775588572025299, + "mean_token_accuracy": 0.9699241399765015, + "num_tokens": 15203795.0, + "step": 740 + }, + { + "entropy": 0.886955714225769, + "epoch": 1.3812154696132597, + "grad_norm": 0.7881133556365967, + "learning_rate": 9.790464831134903e-05, + "loss": 0.07125020027160645, + "mean_token_accuracy": 0.9723815560340882, + "num_tokens": 15408974.0, + "step": 750 + }, + { + "entropy": 0.9047374844551086, + "epoch": 1.3996316758747698, + "grad_norm": 0.9082005023956299, + "learning_rate": 9.781652371305824e-05, + "loss": 0.07004334926605224, + "mean_token_accuracy": 0.9725580036640167, + "num_tokens": 15614399.0, + "step": 760 + }, + { + "entropy": 0.9039053857326508, + "epoch": 1.4180478821362799, + "grad_norm": 0.8060817122459412, + "learning_rate": 9.77266252776972e-05, + "loss": 0.07103485465049744, + "mean_token_accuracy": 0.9721468150615692, + "num_tokens": 15819895.0, + "step": 770 + }, + { + "entropy": 0.8998047232627868, + "epoch": 1.43646408839779, + "grad_norm": 1.0152642726898193, + "learning_rate": 9.763495634020467e-05, + "loss": 0.07411704063415528, + "mean_token_accuracy": 0.9711063146591187, + "num_tokens": 16025297.0, + "step": 780 + }, + { + "entropy": 0.9120213568210602, + "epoch": 1.4548802946593002, + "grad_norm": 0.6288319826126099, + "learning_rate": 9.754152030119921e-05, + "loss": 0.07223712205886841, + "mean_token_accuracy": 0.9722476422786712, + "num_tokens": 16230656.0, + "step": 790 + }, + { + "entropy": 0.9142370820045471, + "epoch": 1.4732965009208103, + "grad_norm": 0.7854700088500977, + "learning_rate": 9.744632062685311e-05, + "loss": 0.07186744809150696, + "mean_token_accuracy": 0.972247713804245, + "num_tokens": 16435943.0, + "step": 800 + }, + { + "entropy": 0.8920814216136932, + "epoch": 1.4917127071823204, + "grad_norm": 0.6227074265480042, + "learning_rate": 9.734936084876383e-05, + "loss": 0.07016961574554444, + "mean_token_accuracy": 0.9725603640079499, + "num_tokens": 16641635.0, + "step": 810 + }, + { + "entropy": 0.891328877210617, + "epoch": 1.5101289134438307, + "grad_norm": 0.7601346969604492, + "learning_rate": 9.725064456382283e-05, + "loss": 0.07137494087219239, + "mean_token_accuracy": 0.9722997546195984, + "num_tokens": 16847194.0, + "step": 820 + }, + { + "entropy": 0.8921217978000641, + "epoch": 1.5285451197053406, + "grad_norm": 0.7813850045204163, + "learning_rate": 9.715017543408233e-05, + "loss": 0.06890199184417725, + "mean_token_accuracy": 0.9735044002532959, + "num_tokens": 17052807.0, + "step": 830 + }, + { + "entropy": 0.9085914671421051, + "epoch": 1.5469613259668509, + "grad_norm": 0.6184289455413818, + "learning_rate": 9.704795718661939e-05, + "loss": 0.07043765187263488, + "mean_token_accuracy": 0.9725716531276702, + "num_tokens": 17258284.0, + "step": 840 + }, + { + "entropy": 0.9029861629009247, + "epoch": 1.565377532228361, + "grad_norm": 0.7082377076148987, + "learning_rate": 9.694399361339752e-05, + "loss": 0.07113839387893676, + "mean_token_accuracy": 0.9725669205188752, + "num_tokens": 17464326.0, + "step": 850 + }, + { + "entropy": 0.8856533527374267, + "epoch": 1.583793738489871, + "grad_norm": 0.7409216165542603, + "learning_rate": 9.683828857112627e-05, + "loss": 0.07077333331108093, + "mean_token_accuracy": 0.9731084644794464, + "num_tokens": 17669537.0, + "step": 860 + }, + { + "entropy": 0.8613030433654785, + "epoch": 1.6022099447513813, + "grad_norm": 0.6801561713218689, + "learning_rate": 9.673084598111789e-05, + "loss": 0.06885308027267456, + "mean_token_accuracy": 0.97266526222229, + "num_tokens": 17875289.0, + "step": 870 + }, + { + "entropy": 0.8692965865135193, + "epoch": 1.6206261510128912, + "grad_norm": 1.1621277332305908, + "learning_rate": 9.662166982914203e-05, + "loss": 0.07017780542373657, + "mean_token_accuracy": 0.9733059942722321, + "num_tokens": 18080404.0, + "step": 880 + }, + { + "entropy": 0.8671502113342285, + "epoch": 1.6390423572744015, + "grad_norm": 0.7518903613090515, + "learning_rate": 9.651076416527787e-05, + "loss": 0.06977018713951111, + "mean_token_accuracy": 0.9730017304420471, + "num_tokens": 18285699.0, + "step": 890 + }, + { + "entropy": 0.8662045657634735, + "epoch": 1.6574585635359116, + "grad_norm": 0.6622698903083801, + "learning_rate": 9.639813310376378e-05, + "loss": 0.06620995998382569, + "mean_token_accuracy": 0.9737491130828857, + "num_tokens": 18491097.0, + "step": 900 + }, + { + "entropy": 0.8548173069953918, + "epoch": 1.6758747697974217, + "grad_norm": 0.8941843509674072, + "learning_rate": 9.628378082284479e-05, + "loss": 0.06711119413375854, + "mean_token_accuracy": 0.9740589797496796, + "num_tokens": 18696827.0, + "step": 910 + }, + { + "entropy": 0.8763562262058258, + "epoch": 1.694290976058932, + "grad_norm": 0.7571700215339661, + "learning_rate": 9.616771156461755e-05, + "loss": 0.07263468503952027, + "mean_token_accuracy": 0.9717419981956482, + "num_tokens": 18902513.0, + "step": 920 + }, + { + "entropy": 0.8663733780384064, + "epoch": 1.7127071823204418, + "grad_norm": 0.7886489629745483, + "learning_rate": 9.604992963487298e-05, + "loss": 0.07074605226516724, + "mean_token_accuracy": 0.9724965393543243, + "num_tokens": 19107812.0, + "step": 930 + }, + { + "entropy": 0.8673004627227783, + "epoch": 1.7311233885819521, + "grad_norm": 0.8180726170539856, + "learning_rate": 9.593043940293647e-05, + "loss": 0.06831735372543335, + "mean_token_accuracy": 0.9733696818351746, + "num_tokens": 19313330.0, + "step": 940 + }, + { + "entropy": 0.8525971233844757, + "epoch": 1.7495395948434622, + "grad_norm": 0.6576228737831116, + "learning_rate": 9.580924530150595e-05, + "loss": 0.06567002534866333, + "mean_token_accuracy": 0.9745754361152649, + "num_tokens": 19518671.0, + "step": 950 + }, + { + "entropy": 0.8605451703071594, + "epoch": 1.7679558011049723, + "grad_norm": 0.7171661257743835, + "learning_rate": 9.568635182648725e-05, + "loss": 0.06872050762176514, + "mean_token_accuracy": 0.9732091546058654, + "num_tokens": 19724135.0, + "step": 960 + }, + { + "entropy": 0.8642210960388184, + "epoch": 1.7863720073664826, + "grad_norm": 0.7603147029876709, + "learning_rate": 9.556176353682746e-05, + "loss": 0.06766576766967773, + "mean_token_accuracy": 0.9728681743144989, + "num_tokens": 19928785.0, + "step": 970 + }, + { + "entropy": 0.8543185651302337, + "epoch": 1.8047882136279927, + "grad_norm": 0.7280875444412231, + "learning_rate": 9.543548505434581e-05, + "loss": 0.06851862668991089, + "mean_token_accuracy": 0.9737437188625335, + "num_tokens": 20134195.0, + "step": 980 + }, + { + "entropy": 0.8744745373725891, + "epoch": 1.8232044198895028, + "grad_norm": 0.5897248983383179, + "learning_rate": 9.530752106356209e-05, + "loss": 0.06809053421020508, + "mean_token_accuracy": 0.9733593761920929, + "num_tokens": 20339517.0, + "step": 990 + }, + { + "entropy": 0.8623859465122223, + "epoch": 1.8416206261510129, + "grad_norm": 0.7515265345573425, + "learning_rate": 9.517787631152298e-05, + "loss": 0.07257847785949707, + "mean_token_accuracy": 0.9714054942131043, + "num_tokens": 20545249.0, + "step": 1000 + }, + { + "entropy": 0.8669404804706573, + "epoch": 1.860036832412523, + "grad_norm": 0.7144560813903809, + "learning_rate": 9.504655560762596e-05, + "loss": 0.06832354068756104, + "mean_token_accuracy": 0.9735779523849487, + "num_tokens": 20750507.0, + "step": 1010 + }, + { + "entropy": 0.8493516445159912, + "epoch": 1.8784530386740332, + "grad_norm": 0.6559189558029175, + "learning_rate": 9.491356382344081e-05, + "loss": 0.0629766047000885, + "mean_token_accuracy": 0.9754977762699127, + "num_tokens": 20955956.0, + "step": 1020 + }, + { + "entropy": 0.8599376022815705, + "epoch": 1.8968692449355433, + "grad_norm": 0.6792973279953003, + "learning_rate": 9.477890589252895e-05, + "loss": 0.0666757881641388, + "mean_token_accuracy": 0.974083811044693, + "num_tokens": 21161163.0, + "step": 1030 + }, + { + "entropy": 0.8458438158035279, + "epoch": 1.9152854511970534, + "grad_norm": 0.6941778659820557, + "learning_rate": 9.464258681026042e-05, + "loss": 0.06307152509689332, + "mean_token_accuracy": 0.9757042229175568, + "num_tokens": 21366525.0, + "step": 1040 + }, + { + "entropy": 0.848515909910202, + "epoch": 1.9337016574585635, + "grad_norm": 0.7307806611061096, + "learning_rate": 9.450461163362855e-05, + "loss": 0.06307026147842407, + "mean_token_accuracy": 0.9750974595546722, + "num_tokens": 21572238.0, + "step": 1050 + }, + { + "entropy": 0.8563454031944275, + "epoch": 1.9521178637200736, + "grad_norm": 0.7222106456756592, + "learning_rate": 9.436498548106236e-05, + "loss": 0.0647726058959961, + "mean_token_accuracy": 0.974629694223404, + "num_tokens": 21777633.0, + "step": 1060 + }, + { + "entropy": 0.8656457483768463, + "epoch": 1.9705340699815839, + "grad_norm": 0.67178875207901, + "learning_rate": 9.422371353223674e-05, + "loss": 0.06573554277420043, + "mean_token_accuracy": 0.9745908617973328, + "num_tokens": 21983116.0, + "step": 1070 + }, + { + "entropy": 0.8630891263484954, + "epoch": 1.988950276243094, + "grad_norm": 0.6956593990325928, + "learning_rate": 9.408080102788016e-05, + "loss": 0.06630704402923585, + "mean_token_accuracy": 0.9741333484649658, + "num_tokens": 22188662.0, + "step": 1080 + }, + { + "epoch": 2.0, + "eval_entropy": 0.8560857042022373, + "eval_loss": 0.06494329869747162, + "eval_mean_token_accuracy": 0.9745692672936813, + "eval_num_tokens": 22311800.0, + "eval_runtime": 10.129, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 11.354, + "step": 1086 + }, + { + "entropy": 0.8616272270679474, + "epoch": 2.007366482504604, + "grad_norm": 0.7778105139732361, + "learning_rate": 9.393625326958041e-05, + "loss": 0.054407155513763426, + "mean_token_accuracy": 0.9792074799537659, + "num_tokens": 22394215.0, + "step": 1090 + }, + { + "entropy": 0.8496910452842712, + "epoch": 2.0257826887661143, + "grad_norm": 0.7422528266906738, + "learning_rate": 9.379007561958792e-05, + "loss": 0.051881587505340575, + "mean_token_accuracy": 0.9799090325832367, + "num_tokens": 22599599.0, + "step": 1100 + }, + { + "entropy": 0.8531602442264556, + "epoch": 2.044198895027624, + "grad_norm": 0.9075332880020142, + "learning_rate": 9.36422735006167e-05, + "loss": 0.05190724730491638, + "mean_token_accuracy": 0.979931116104126, + "num_tokens": 22805318.0, + "step": 1110 + }, + { + "entropy": 0.8657277703285218, + "epoch": 2.0626151012891345, + "grad_norm": 0.9466913938522339, + "learning_rate": 9.349285239564325e-05, + "loss": 0.053853434324264524, + "mean_token_accuracy": 0.9796103596687317, + "num_tokens": 23010438.0, + "step": 1120 + }, + { + "entropy": 0.8578485429286957, + "epoch": 2.0810313075506444, + "grad_norm": 0.6903054714202881, + "learning_rate": 9.334181784770326e-05, + "loss": 0.05228850841522217, + "mean_token_accuracy": 0.9802409887313843, + "num_tokens": 23215795.0, + "step": 1130 + }, + { + "entropy": 0.8450767934322357, + "epoch": 2.0994475138121547, + "grad_norm": 0.6615211367607117, + "learning_rate": 9.318917545968581e-05, + "loss": 0.050570905208587646, + "mean_token_accuracy": 0.9802053451538086, + "num_tokens": 23421157.0, + "step": 1140 + }, + { + "entropy": 0.8325044393539429, + "epoch": 2.117863720073665, + "grad_norm": 0.760960578918457, + "learning_rate": 9.303493089412564e-05, + "loss": 0.051966112852096555, + "mean_token_accuracy": 0.9796205997467041, + "num_tokens": 23626584.0, + "step": 1150 + }, + { + "entropy": 0.8416404843330383, + "epoch": 2.136279926335175, + "grad_norm": 0.6947009563446045, + "learning_rate": 9.287908987299306e-05, + "loss": 0.05144861936569214, + "mean_token_accuracy": 0.9800034642219544, + "num_tokens": 23832137.0, + "step": 1160 + }, + { + "entropy": 0.8564540028572083, + "epoch": 2.154696132596685, + "grad_norm": 0.733252763748169, + "learning_rate": 9.272165817748164e-05, + "loss": 0.04944799542427063, + "mean_token_accuracy": 0.9808157980442047, + "num_tokens": 24038006.0, + "step": 1170 + }, + { + "entropy": 0.8575525343418121, + "epoch": 2.1731123388581954, + "grad_norm": 0.8911028504371643, + "learning_rate": 9.25626416477938e-05, + "loss": 0.05037952661514282, + "mean_token_accuracy": 0.980946284532547, + "num_tokens": 24243374.0, + "step": 1180 + }, + { + "entropy": 0.8599720418453216, + "epoch": 2.1915285451197053, + "grad_norm": 0.7713524103164673, + "learning_rate": 9.240204618292416e-05, + "loss": 0.050603735446929934, + "mean_token_accuracy": 0.980896121263504, + "num_tokens": 24448585.0, + "step": 1190 + }, + { + "entropy": 0.8566664934158326, + "epoch": 2.2099447513812156, + "grad_norm": 0.8439353704452515, + "learning_rate": 9.223987774044066e-05, + "loss": 0.054171699285507205, + "mean_token_accuracy": 0.9796543836593627, + "num_tokens": 24653863.0, + "step": 1200 + }, + { + "entropy": 0.846601277589798, + "epoch": 2.2283609576427255, + "grad_norm": 0.7025637030601501, + "learning_rate": 9.207614233626356e-05, + "loss": 0.048924127221107484, + "mean_token_accuracy": 0.9809681415557862, + "num_tokens": 24859801.0, + "step": 1210 + }, + { + "entropy": 0.8564423739910125, + "epoch": 2.2467771639042358, + "grad_norm": 0.7788274884223938, + "learning_rate": 9.191084604444233e-05, + "loss": 0.05260283350944519, + "mean_token_accuracy": 0.9793797850608825, + "num_tokens": 25065368.0, + "step": 1220 + }, + { + "entropy": 0.865056723356247, + "epoch": 2.265193370165746, + "grad_norm": 0.8728818297386169, + "learning_rate": 9.174399499693027e-05, + "loss": 0.05016371011734009, + "mean_token_accuracy": 0.9807134211063385, + "num_tokens": 25270945.0, + "step": 1230 + }, + { + "entropy": 0.8642262935638427, + "epoch": 2.283609576427256, + "grad_norm": 1.0582489967346191, + "learning_rate": 9.157559538335703e-05, + "loss": 0.05316779017448425, + "mean_token_accuracy": 0.9794209063053131, + "num_tokens": 25476575.0, + "step": 1240 + }, + { + "entropy": 0.8677761554718018, + "epoch": 2.3020257826887662, + "grad_norm": 0.760109543800354, + "learning_rate": 9.140565345079901e-05, + "loss": 0.05115479230880737, + "mean_token_accuracy": 0.9802310705184937, + "num_tokens": 25682814.0, + "step": 1250 + }, + { + "entropy": 0.8592945456504821, + "epoch": 2.320441988950276, + "grad_norm": 0.6537907123565674, + "learning_rate": 9.123417550354761e-05, + "loss": 0.050543540716171266, + "mean_token_accuracy": 0.9806945025920868, + "num_tokens": 25887575.0, + "step": 1260 + }, + { + "entropy": 0.8692500293254852, + "epoch": 2.3388581952117864, + "grad_norm": 0.7771905064582825, + "learning_rate": 9.106116790287541e-05, + "loss": 0.049718713760375975, + "mean_token_accuracy": 0.9805168390274048, + "num_tokens": 26092950.0, + "step": 1270 + }, + { + "entropy": 0.8841261565685272, + "epoch": 2.3572744014732967, + "grad_norm": 0.7791076898574829, + "learning_rate": 9.08866370668001e-05, + "loss": 0.0527400553226471, + "mean_token_accuracy": 0.9796754539012908, + "num_tokens": 26298182.0, + "step": 1280 + }, + { + "entropy": 0.8675022900104523, + "epoch": 2.3756906077348066, + "grad_norm": 0.8481605648994446, + "learning_rate": 9.07105894698464e-05, + "loss": 0.05320838689804077, + "mean_token_accuracy": 0.9792274832725525, + "num_tokens": 26503425.0, + "step": 1290 + }, + { + "entropy": 0.8704026222229004, + "epoch": 2.394106813996317, + "grad_norm": 0.8235505819320679, + "learning_rate": 9.053303164280602e-05, + "loss": 0.055045205354690555, + "mean_token_accuracy": 0.9788750648498535, + "num_tokens": 26708755.0, + "step": 1300 + }, + { + "entropy": 0.8525134027004242, + "epoch": 2.4125230202578267, + "grad_norm": 0.7611598968505859, + "learning_rate": 9.035397017249518e-05, + "loss": 0.05029621124267578, + "mean_token_accuracy": 0.9802757322788238, + "num_tokens": 26914704.0, + "step": 1310 + }, + { + "entropy": 0.8630305290222168, + "epoch": 2.430939226519337, + "grad_norm": 0.790408194065094, + "learning_rate": 9.017341170151041e-05, + "loss": 0.04856040775775909, + "mean_token_accuracy": 0.9809690833091735, + "num_tokens": 27120151.0, + "step": 1320 + }, + { + "entropy": 0.8579159140586853, + "epoch": 2.4493554327808473, + "grad_norm": 0.781972348690033, + "learning_rate": 8.999136292798207e-05, + "loss": 0.04869682788848877, + "mean_token_accuracy": 0.9816130697727203, + "num_tokens": 27325673.0, + "step": 1330 + }, + { + "entropy": 0.8634716987609863, + "epoch": 2.467771639042357, + "grad_norm": 0.8500784039497375, + "learning_rate": 8.980783060532588e-05, + "loss": 0.05050289034843445, + "mean_token_accuracy": 0.980079609155655, + "num_tokens": 27531270.0, + "step": 1340 + }, + { + "entropy": 0.8660618126392364, + "epoch": 2.4861878453038675, + "grad_norm": 0.719760537147522, + "learning_rate": 8.96228215419924e-05, + "loss": 0.04892141819000244, + "mean_token_accuracy": 0.9814020991325378, + "num_tokens": 27736542.0, + "step": 1350 + }, + { + "entropy": 0.8572284400463104, + "epoch": 2.5046040515653774, + "grad_norm": 1.0197229385375977, + "learning_rate": 8.943634260121442e-05, + "loss": 0.05104702711105347, + "mean_token_accuracy": 0.9798846662044525, + "num_tokens": 27941566.0, + "step": 1360 + }, + { + "entropy": 0.8702241241931915, + "epoch": 2.5230202578268877, + "grad_norm": 0.7136003375053406, + "learning_rate": 8.924840070075247e-05, + "loss": 0.04855787754058838, + "mean_token_accuracy": 0.9811685383319855, + "num_tokens": 28146943.0, + "step": 1370 + }, + { + "entropy": 0.874957013130188, + "epoch": 2.541436464088398, + "grad_norm": 0.8775497674942017, + "learning_rate": 8.905900281263804e-05, + "loss": 0.052434295415878296, + "mean_token_accuracy": 0.9795438170433044, + "num_tokens": 28352640.0, + "step": 1380 + }, + { + "entropy": 0.8776536166667939, + "epoch": 2.559852670349908, + "grad_norm": 0.8895741105079651, + "learning_rate": 8.8868155962915e-05, + "loss": 0.05282890796661377, + "mean_token_accuracy": 0.9790538609027862, + "num_tokens": 28558153.0, + "step": 1390 + }, + { + "entropy": 0.8738743245601654, + "epoch": 2.578268876611418, + "grad_norm": 0.788800060749054, + "learning_rate": 8.867586723137906e-05, + "loss": 0.048841872811317445, + "mean_token_accuracy": 0.9809149026870727, + "num_tokens": 28763613.0, + "step": 1400 + }, + { + "entropy": 0.8750253796577454, + "epoch": 2.596685082872928, + "grad_norm": 0.8738002777099609, + "learning_rate": 8.848214375131497e-05, + "loss": 0.048261132836341855, + "mean_token_accuracy": 0.980789190530777, + "num_tokens": 28969248.0, + "step": 1410 + }, + { + "entropy": 0.8624245524406433, + "epoch": 2.6151012891344383, + "grad_norm": 0.6404895186424255, + "learning_rate": 8.828699270923196e-05, + "loss": 0.04970468282699585, + "mean_token_accuracy": 0.9807762265205383, + "num_tokens": 29174779.0, + "step": 1420 + }, + { + "entropy": 0.8792938470840455, + "epoch": 2.6335174953959486, + "grad_norm": 0.7856965661048889, + "learning_rate": 8.80904213445972e-05, + "loss": 0.053334391117095946, + "mean_token_accuracy": 0.9790222108364105, + "num_tokens": 29380474.0, + "step": 1430 + }, + { + "entropy": 0.8831034600734711, + "epoch": 2.6519337016574585, + "grad_norm": 0.7739618420600891, + "learning_rate": 8.789243694956716e-05, + "loss": 0.04959054589271546, + "mean_token_accuracy": 0.9803965091705322, + "num_tokens": 29585985.0, + "step": 1440 + }, + { + "entropy": 0.8934672951698304, + "epoch": 2.6703499079189688, + "grad_norm": 0.6999697089195251, + "learning_rate": 8.769304686871719e-05, + "loss": 0.05165250301361084, + "mean_token_accuracy": 0.9798884153366089, + "num_tokens": 29791238.0, + "step": 1450 + }, + { + "entropy": 0.9053199410438537, + "epoch": 2.6887661141804786, + "grad_norm": 0.9199564456939697, + "learning_rate": 8.749225849876892e-05, + "loss": 0.04924143850803375, + "mean_token_accuracy": 0.9810785710811615, + "num_tokens": 29996589.0, + "step": 1460 + }, + { + "entropy": 0.888091403245926, + "epoch": 2.707182320441989, + "grad_norm": 0.7480106353759766, + "learning_rate": 8.729007928831597e-05, + "loss": 0.04948916733264923, + "mean_token_accuracy": 0.9809579730033875, + "num_tokens": 30201875.0, + "step": 1470 + }, + { + "entropy": 0.8723407983779907, + "epoch": 2.7255985267034992, + "grad_norm": 0.9506945013999939, + "learning_rate": 8.708651673754763e-05, + "loss": 0.048927539587020875, + "mean_token_accuracy": 0.980553150177002, + "num_tokens": 30407550.0, + "step": 1480 + }, + { + "entropy": 0.8737521529197693, + "epoch": 2.744014732965009, + "grad_norm": 0.8015706539154053, + "learning_rate": 8.688157839797062e-05, + "loss": 0.04963063597679138, + "mean_token_accuracy": 0.9809738755226135, + "num_tokens": 30612839.0, + "step": 1490 + }, + { + "entropy": 0.8800762951374054, + "epoch": 2.7624309392265194, + "grad_norm": 0.9429986476898193, + "learning_rate": 8.667527187212885e-05, + "loss": 0.0524174690246582, + "mean_token_accuracy": 0.9788767337799072, + "num_tokens": 30818578.0, + "step": 1500 + }, + { + "entropy": 0.8871055901050567, + "epoch": 2.7808471454880292, + "grad_norm": 0.5909196138381958, + "learning_rate": 8.646760481332157e-05, + "loss": 0.05166680812835693, + "mean_token_accuracy": 0.980216771364212, + "num_tokens": 31023829.0, + "step": 1510 + }, + { + "entropy": 0.8908755779266357, + "epoch": 2.7992633517495396, + "grad_norm": 0.9154611229896545, + "learning_rate": 8.625858492531931e-05, + "loss": 0.04951836466789246, + "mean_token_accuracy": 0.9801484227180481, + "num_tokens": 31229635.0, + "step": 1520 + }, + { + "entropy": 0.92480548620224, + "epoch": 2.81767955801105, + "grad_norm": 0.5989938378334045, + "learning_rate": 8.604821996207819e-05, + "loss": 0.04799881279468536, + "mean_token_accuracy": 0.9817522585391998, + "num_tokens": 31435456.0, + "step": 1530 + }, + { + "entropy": 0.9173881888389588, + "epoch": 2.8360957642725597, + "grad_norm": 0.899413526058197, + "learning_rate": 8.58365177274522e-05, + "loss": 0.0487445592880249, + "mean_token_accuracy": 0.9812625288963318, + "num_tokens": 31640904.0, + "step": 1540 + }, + { + "entropy": 0.9076135993003845, + "epoch": 2.85451197053407, + "grad_norm": 0.8494166135787964, + "learning_rate": 8.562348607490376e-05, + "loss": 0.05005228519439697, + "mean_token_accuracy": 0.9806681036949157, + "num_tokens": 31845807.0, + "step": 1550 + }, + { + "entropy": 0.9092245221138, + "epoch": 2.87292817679558, + "grad_norm": 0.8225123286247253, + "learning_rate": 8.540913290721234e-05, + "loss": 0.048654764890670776, + "mean_token_accuracy": 0.9805659353733063, + "num_tokens": 32051523.0, + "step": 1560 + }, + { + "entropy": 0.9062779664993286, + "epoch": 2.89134438305709, + "grad_norm": 0.7074014544487, + "learning_rate": 8.519346617618134e-05, + "loss": 0.049209845066070554, + "mean_token_accuracy": 0.9807434439659118, + "num_tokens": 32256895.0, + "step": 1570 + }, + { + "entropy": 0.9190246641635895, + "epoch": 2.9097605893186005, + "grad_norm": 0.8860642910003662, + "learning_rate": 8.497649388234304e-05, + "loss": 0.051211881637573245, + "mean_token_accuracy": 0.9802342295646668, + "num_tokens": 32462031.0, + "step": 1580 + }, + { + "entropy": 0.9088015079498291, + "epoch": 2.9281767955801103, + "grad_norm": 0.8062726855278015, + "learning_rate": 8.475822407466188e-05, + "loss": 0.053512704372406, + "mean_token_accuracy": 0.979486483335495, + "num_tokens": 32667533.0, + "step": 1590 + }, + { + "entropy": 0.9462027847766876, + "epoch": 2.9465930018416207, + "grad_norm": 0.7962909936904907, + "learning_rate": 8.453866485023579e-05, + "loss": 0.0501457154750824, + "mean_token_accuracy": 0.9803222417831421, + "num_tokens": 32872900.0, + "step": 1600 + }, + { + "entropy": 0.9671471297740937, + "epoch": 2.9650092081031305, + "grad_norm": 0.7641744017601013, + "learning_rate": 8.431782435399587e-05, + "loss": 0.04629061222076416, + "mean_token_accuracy": 0.9823175370693207, + "num_tokens": 33077850.0, + "step": 1610 + }, + { + "entropy": 0.955865204334259, + "epoch": 2.983425414364641, + "grad_norm": 0.6772348880767822, + "learning_rate": 8.409571077840426e-05, + "loss": 0.048368623852729796, + "mean_token_accuracy": 0.9808700799942016, + "num_tokens": 33283117.0, + "step": 1620 + }, + { + "epoch": 3.0, + "eval_entropy": 0.9563225186389426, + "eval_loss": 0.059064481407403946, + "eval_mean_token_accuracy": 0.9773589429648026, + "eval_num_tokens": 33467712.0, + "eval_runtime": 10.1471, + "eval_samples_per_second": 360.499, + "eval_steps_per_second": 11.333, + "step": 1629 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.595677368674943e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1629/training_args.bin b/checkpoint-1629/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-1629/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/checkpoint-2172/README.md b/checkpoint-2172/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-2172/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2172/adapter_config.json b/checkpoint-2172/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-2172/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2172/adapter_model.safetensors b/checkpoint-2172/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7878cb3ceb7cba2f95b9bd9db6115f077efed3ef --- /dev/null +++ b/checkpoint-2172/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c86f8f8223f27673f137496bfe71dc599b6baf7e185de17ad979b78a2ac98e6 +size 80792096 diff --git a/checkpoint-2172/chat_template.jinja b/checkpoint-2172/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-2172/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-2172/tokenizer.json b/checkpoint-2172/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-2172/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-2172/tokenizer_config.json b/checkpoint-2172/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-2172/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-2172/trainer_state.json b/checkpoint-2172/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b3ac32f90bac7db9571b734d9459ed3f9092efc4 --- /dev/null +++ b/checkpoint-2172/trainer_state.json @@ -0,0 +1,2248 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 2172, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + }, + { + "entropy": 0.9047818422317505, + "epoch": 1.0128913443830572, + "grad_norm": 0.6781501173973083, + "learning_rate": 9.929008521021325e-05, + "loss": 0.0863916516304016, + "mean_token_accuracy": 0.9673655688762665, + "num_tokens": 11299715.0, + "step": 550 + }, + { + "entropy": 0.8856981039047241, + "epoch": 1.0313075506445673, + "grad_norm": 0.7143136858940125, + "learning_rate": 9.923803553055937e-05, + "loss": 0.08632323145866394, + "mean_token_accuracy": 0.9677783191204071, + "num_tokens": 11505059.0, + "step": 560 + }, + { + "entropy": 0.8937099635601043, + "epoch": 1.0497237569060773, + "grad_norm": 0.7751694321632385, + "learning_rate": 9.918415928038325e-05, + "loss": 0.08178263902664185, + "mean_token_accuracy": 0.9694291114807129, + "num_tokens": 11710464.0, + "step": 570 + }, + { + "entropy": 0.8858704209327698, + "epoch": 1.0681399631675874, + "grad_norm": 0.7492292523384094, + "learning_rate": 9.912845845831805e-05, + "loss": 0.08074211478233337, + "mean_token_accuracy": 0.9692470014095307, + "num_tokens": 11915959.0, + "step": 580 + }, + { + "entropy": 0.8948039829730987, + "epoch": 1.0865561694290977, + "grad_norm": 0.8116479516029358, + "learning_rate": 9.907093513068259e-05, + "loss": 0.08712012171745301, + "mean_token_accuracy": 0.9669980227947235, + "num_tokens": 12121499.0, + "step": 590 + }, + { + "entropy": 0.8846789538860321, + "epoch": 1.1049723756906078, + "grad_norm": 0.7295626997947693, + "learning_rate": 9.901159143140471e-05, + "loss": 0.08444435596466064, + "mean_token_accuracy": 0.9674544095993042, + "num_tokens": 12327061.0, + "step": 600 + }, + { + "entropy": 0.8734103918075562, + "epoch": 1.1233885819521179, + "grad_norm": 0.9585768580436707, + "learning_rate": 9.89504295619421e-05, + "loss": 0.08022565841674804, + "mean_token_accuracy": 0.969569206237793, + "num_tokens": 12532305.0, + "step": 610 + }, + { + "entropy": 0.8640486001968384, + "epoch": 1.141804788213628, + "grad_norm": 0.7891159057617188, + "learning_rate": 9.88874517912006e-05, + "loss": 0.08415375947952271, + "mean_token_accuracy": 0.9678892493247986, + "num_tokens": 12737828.0, + "step": 620 + }, + { + "entropy": 0.8599755525588989, + "epoch": 1.160220994475138, + "grad_norm": 0.5801345109939575, + "learning_rate": 9.882266045545012e-05, + "loss": 0.08100489974021911, + "mean_token_accuracy": 0.9688023269176483, + "num_tokens": 12943343.0, + "step": 630 + }, + { + "entropy": 0.86524977684021, + "epoch": 1.1786372007366483, + "grad_norm": 0.7633041143417358, + "learning_rate": 9.87560579582379e-05, + "loss": 0.07859406471252442, + "mean_token_accuracy": 0.9702189445495606, + "num_tokens": 13148473.0, + "step": 640 + }, + { + "entropy": 0.8466695249080658, + "epoch": 1.1970534069981584, + "grad_norm": 0.8672215938568115, + "learning_rate": 9.868764677029934e-05, + "loss": 0.08082623481750488, + "mean_token_accuracy": 0.9689972400665283, + "num_tokens": 13353890.0, + "step": 650 + }, + { + "entropy": 0.8596941530704498, + "epoch": 1.2154696132596685, + "grad_norm": 0.7524124383926392, + "learning_rate": 9.861742942946639e-05, + "loss": 0.0789935290813446, + "mean_token_accuracy": 0.9693858206272126, + "num_tokens": 13559475.0, + "step": 660 + }, + { + "entropy": 0.8708749234676361, + "epoch": 1.2338858195211786, + "grad_norm": 0.5777031183242798, + "learning_rate": 9.854540854057337e-05, + "loss": 0.07773642539978028, + "mean_token_accuracy": 0.970385092496872, + "num_tokens": 13765076.0, + "step": 670 + }, + { + "entropy": 0.8651713371276856, + "epoch": 1.2523020257826887, + "grad_norm": 0.7924166321754456, + "learning_rate": 9.847158677536034e-05, + "loss": 0.0766686737537384, + "mean_token_accuracy": 0.9702267110347748, + "num_tokens": 13970642.0, + "step": 680 + }, + { + "entropy": 0.8763024985790253, + "epoch": 1.270718232044199, + "grad_norm": 0.741219162940979, + "learning_rate": 9.839596687237403e-05, + "loss": 0.07189929485321045, + "mean_token_accuracy": 0.9727097094058991, + "num_tokens": 14176556.0, + "step": 690 + }, + { + "entropy": 0.8556921362876893, + "epoch": 1.289134438305709, + "grad_norm": 0.6298198103904724, + "learning_rate": 9.831855163686618e-05, + "loss": 0.07608137726783752, + "mean_token_accuracy": 0.9716399371623993, + "num_tokens": 14381686.0, + "step": 700 + }, + { + "entropy": 0.869178420305252, + "epoch": 1.3075506445672191, + "grad_norm": 0.5850273370742798, + "learning_rate": 9.823934394068952e-05, + "loss": 0.07437651753425598, + "mean_token_accuracy": 0.9709566533565521, + "num_tokens": 14586814.0, + "step": 710 + }, + { + "entropy": 0.8708595156669616, + "epoch": 1.3259668508287292, + "grad_norm": 0.6580632328987122, + "learning_rate": 9.815834672219127e-05, + "loss": 0.07518917322158813, + "mean_token_accuracy": 0.9717426657676697, + "num_tokens": 14792321.0, + "step": 720 + }, + { + "entropy": 0.8826817810535431, + "epoch": 1.3443830570902393, + "grad_norm": 0.8788532018661499, + "learning_rate": 9.807556298610404e-05, + "loss": 0.07579240798950196, + "mean_token_accuracy": 0.9706341981887817, + "num_tokens": 14997810.0, + "step": 730 + }, + { + "entropy": 0.9012470185756684, + "epoch": 1.3627992633517496, + "grad_norm": 0.7022138237953186, + "learning_rate": 9.799099580343441e-05, + "loss": 0.0775588572025299, + "mean_token_accuracy": 0.9699241399765015, + "num_tokens": 15203795.0, + "step": 740 + }, + { + "entropy": 0.886955714225769, + "epoch": 1.3812154696132597, + "grad_norm": 0.7881133556365967, + "learning_rate": 9.790464831134903e-05, + "loss": 0.07125020027160645, + "mean_token_accuracy": 0.9723815560340882, + "num_tokens": 15408974.0, + "step": 750 + }, + { + "entropy": 0.9047374844551086, + "epoch": 1.3996316758747698, + "grad_norm": 0.9082005023956299, + "learning_rate": 9.781652371305824e-05, + "loss": 0.07004334926605224, + "mean_token_accuracy": 0.9725580036640167, + "num_tokens": 15614399.0, + "step": 760 + }, + { + "entropy": 0.9039053857326508, + "epoch": 1.4180478821362799, + "grad_norm": 0.8060817122459412, + "learning_rate": 9.77266252776972e-05, + "loss": 0.07103485465049744, + "mean_token_accuracy": 0.9721468150615692, + "num_tokens": 15819895.0, + "step": 770 + }, + { + "entropy": 0.8998047232627868, + "epoch": 1.43646408839779, + "grad_norm": 1.0152642726898193, + "learning_rate": 9.763495634020467e-05, + "loss": 0.07411704063415528, + "mean_token_accuracy": 0.9711063146591187, + "num_tokens": 16025297.0, + "step": 780 + }, + { + "entropy": 0.9120213568210602, + "epoch": 1.4548802946593002, + "grad_norm": 0.6288319826126099, + "learning_rate": 9.754152030119921e-05, + "loss": 0.07223712205886841, + "mean_token_accuracy": 0.9722476422786712, + "num_tokens": 16230656.0, + "step": 790 + }, + { + "entropy": 0.9142370820045471, + "epoch": 1.4732965009208103, + "grad_norm": 0.7854700088500977, + "learning_rate": 9.744632062685311e-05, + "loss": 0.07186744809150696, + "mean_token_accuracy": 0.972247713804245, + "num_tokens": 16435943.0, + "step": 800 + }, + { + "entropy": 0.8920814216136932, + "epoch": 1.4917127071823204, + "grad_norm": 0.6227074265480042, + "learning_rate": 9.734936084876383e-05, + "loss": 0.07016961574554444, + "mean_token_accuracy": 0.9725603640079499, + "num_tokens": 16641635.0, + "step": 810 + }, + { + "entropy": 0.891328877210617, + "epoch": 1.5101289134438307, + "grad_norm": 0.7601346969604492, + "learning_rate": 9.725064456382283e-05, + "loss": 0.07137494087219239, + "mean_token_accuracy": 0.9722997546195984, + "num_tokens": 16847194.0, + "step": 820 + }, + { + "entropy": 0.8921217978000641, + "epoch": 1.5285451197053406, + "grad_norm": 0.7813850045204163, + "learning_rate": 9.715017543408233e-05, + "loss": 0.06890199184417725, + "mean_token_accuracy": 0.9735044002532959, + "num_tokens": 17052807.0, + "step": 830 + }, + { + "entropy": 0.9085914671421051, + "epoch": 1.5469613259668509, + "grad_norm": 0.6184289455413818, + "learning_rate": 9.704795718661939e-05, + "loss": 0.07043765187263488, + "mean_token_accuracy": 0.9725716531276702, + "num_tokens": 17258284.0, + "step": 840 + }, + { + "entropy": 0.9029861629009247, + "epoch": 1.565377532228361, + "grad_norm": 0.7082377076148987, + "learning_rate": 9.694399361339752e-05, + "loss": 0.07113839387893676, + "mean_token_accuracy": 0.9725669205188752, + "num_tokens": 17464326.0, + "step": 850 + }, + { + "entropy": 0.8856533527374267, + "epoch": 1.583793738489871, + "grad_norm": 0.7409216165542603, + "learning_rate": 9.683828857112627e-05, + "loss": 0.07077333331108093, + "mean_token_accuracy": 0.9731084644794464, + "num_tokens": 17669537.0, + "step": 860 + }, + { + "entropy": 0.8613030433654785, + "epoch": 1.6022099447513813, + "grad_norm": 0.6801561713218689, + "learning_rate": 9.673084598111789e-05, + "loss": 0.06885308027267456, + "mean_token_accuracy": 0.97266526222229, + "num_tokens": 17875289.0, + "step": 870 + }, + { + "entropy": 0.8692965865135193, + "epoch": 1.6206261510128912, + "grad_norm": 1.1621277332305908, + "learning_rate": 9.662166982914203e-05, + "loss": 0.07017780542373657, + "mean_token_accuracy": 0.9733059942722321, + "num_tokens": 18080404.0, + "step": 880 + }, + { + "entropy": 0.8671502113342285, + "epoch": 1.6390423572744015, + "grad_norm": 0.7518903613090515, + "learning_rate": 9.651076416527787e-05, + "loss": 0.06977018713951111, + "mean_token_accuracy": 0.9730017304420471, + "num_tokens": 18285699.0, + "step": 890 + }, + { + "entropy": 0.8662045657634735, + "epoch": 1.6574585635359116, + "grad_norm": 0.6622698903083801, + "learning_rate": 9.639813310376378e-05, + "loss": 0.06620995998382569, + "mean_token_accuracy": 0.9737491130828857, + "num_tokens": 18491097.0, + "step": 900 + }, + { + "entropy": 0.8548173069953918, + "epoch": 1.6758747697974217, + "grad_norm": 0.8941843509674072, + "learning_rate": 9.628378082284479e-05, + "loss": 0.06711119413375854, + "mean_token_accuracy": 0.9740589797496796, + "num_tokens": 18696827.0, + "step": 910 + }, + { + "entropy": 0.8763562262058258, + "epoch": 1.694290976058932, + "grad_norm": 0.7571700215339661, + "learning_rate": 9.616771156461755e-05, + "loss": 0.07263468503952027, + "mean_token_accuracy": 0.9717419981956482, + "num_tokens": 18902513.0, + "step": 920 + }, + { + "entropy": 0.8663733780384064, + "epoch": 1.7127071823204418, + "grad_norm": 0.7886489629745483, + "learning_rate": 9.604992963487298e-05, + "loss": 0.07074605226516724, + "mean_token_accuracy": 0.9724965393543243, + "num_tokens": 19107812.0, + "step": 930 + }, + { + "entropy": 0.8673004627227783, + "epoch": 1.7311233885819521, + "grad_norm": 0.8180726170539856, + "learning_rate": 9.593043940293647e-05, + "loss": 0.06831735372543335, + "mean_token_accuracy": 0.9733696818351746, + "num_tokens": 19313330.0, + "step": 940 + }, + { + "entropy": 0.8525971233844757, + "epoch": 1.7495395948434622, + "grad_norm": 0.6576228737831116, + "learning_rate": 9.580924530150595e-05, + "loss": 0.06567002534866333, + "mean_token_accuracy": 0.9745754361152649, + "num_tokens": 19518671.0, + "step": 950 + }, + { + "entropy": 0.8605451703071594, + "epoch": 1.7679558011049723, + "grad_norm": 0.7171661257743835, + "learning_rate": 9.568635182648725e-05, + "loss": 0.06872050762176514, + "mean_token_accuracy": 0.9732091546058654, + "num_tokens": 19724135.0, + "step": 960 + }, + { + "entropy": 0.8642210960388184, + "epoch": 1.7863720073664826, + "grad_norm": 0.7603147029876709, + "learning_rate": 9.556176353682746e-05, + "loss": 0.06766576766967773, + "mean_token_accuracy": 0.9728681743144989, + "num_tokens": 19928785.0, + "step": 970 + }, + { + "entropy": 0.8543185651302337, + "epoch": 1.8047882136279927, + "grad_norm": 0.7280875444412231, + "learning_rate": 9.543548505434581e-05, + "loss": 0.06851862668991089, + "mean_token_accuracy": 0.9737437188625335, + "num_tokens": 20134195.0, + "step": 980 + }, + { + "entropy": 0.8744745373725891, + "epoch": 1.8232044198895028, + "grad_norm": 0.5897248983383179, + "learning_rate": 9.530752106356209e-05, + "loss": 0.06809053421020508, + "mean_token_accuracy": 0.9733593761920929, + "num_tokens": 20339517.0, + "step": 990 + }, + { + "entropy": 0.8623859465122223, + "epoch": 1.8416206261510129, + "grad_norm": 0.7515265345573425, + "learning_rate": 9.517787631152298e-05, + "loss": 0.07257847785949707, + "mean_token_accuracy": 0.9714054942131043, + "num_tokens": 20545249.0, + "step": 1000 + }, + { + "entropy": 0.8669404804706573, + "epoch": 1.860036832412523, + "grad_norm": 0.7144560813903809, + "learning_rate": 9.504655560762596e-05, + "loss": 0.06832354068756104, + "mean_token_accuracy": 0.9735779523849487, + "num_tokens": 20750507.0, + "step": 1010 + }, + { + "entropy": 0.8493516445159912, + "epoch": 1.8784530386740332, + "grad_norm": 0.6559189558029175, + "learning_rate": 9.491356382344081e-05, + "loss": 0.0629766047000885, + "mean_token_accuracy": 0.9754977762699127, + "num_tokens": 20955956.0, + "step": 1020 + }, + { + "entropy": 0.8599376022815705, + "epoch": 1.8968692449355433, + "grad_norm": 0.6792973279953003, + "learning_rate": 9.477890589252895e-05, + "loss": 0.0666757881641388, + "mean_token_accuracy": 0.974083811044693, + "num_tokens": 21161163.0, + "step": 1030 + }, + { + "entropy": 0.8458438158035279, + "epoch": 1.9152854511970534, + "grad_norm": 0.6941778659820557, + "learning_rate": 9.464258681026042e-05, + "loss": 0.06307152509689332, + "mean_token_accuracy": 0.9757042229175568, + "num_tokens": 21366525.0, + "step": 1040 + }, + { + "entropy": 0.848515909910202, + "epoch": 1.9337016574585635, + "grad_norm": 0.7307806611061096, + "learning_rate": 9.450461163362855e-05, + "loss": 0.06307026147842407, + "mean_token_accuracy": 0.9750974595546722, + "num_tokens": 21572238.0, + "step": 1050 + }, + { + "entropy": 0.8563454031944275, + "epoch": 1.9521178637200736, + "grad_norm": 0.7222106456756592, + "learning_rate": 9.436498548106236e-05, + "loss": 0.0647726058959961, + "mean_token_accuracy": 0.974629694223404, + "num_tokens": 21777633.0, + "step": 1060 + }, + { + "entropy": 0.8656457483768463, + "epoch": 1.9705340699815839, + "grad_norm": 0.67178875207901, + "learning_rate": 9.422371353223674e-05, + "loss": 0.06573554277420043, + "mean_token_accuracy": 0.9745908617973328, + "num_tokens": 21983116.0, + "step": 1070 + }, + { + "entropy": 0.8630891263484954, + "epoch": 1.988950276243094, + "grad_norm": 0.6956593990325928, + "learning_rate": 9.408080102788016e-05, + "loss": 0.06630704402923585, + "mean_token_accuracy": 0.9741333484649658, + "num_tokens": 22188662.0, + "step": 1080 + }, + { + "epoch": 2.0, + "eval_entropy": 0.8560857042022373, + "eval_loss": 0.06494329869747162, + "eval_mean_token_accuracy": 0.9745692672936813, + "eval_num_tokens": 22311800.0, + "eval_runtime": 10.129, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 11.354, + "step": 1086 + }, + { + "entropy": 0.8616272270679474, + "epoch": 2.007366482504604, + "grad_norm": 0.7778105139732361, + "learning_rate": 9.393625326958041e-05, + "loss": 0.054407155513763426, + "mean_token_accuracy": 0.9792074799537659, + "num_tokens": 22394215.0, + "step": 1090 + }, + { + "entropy": 0.8496910452842712, + "epoch": 2.0257826887661143, + "grad_norm": 0.7422528266906738, + "learning_rate": 9.379007561958792e-05, + "loss": 0.051881587505340575, + "mean_token_accuracy": 0.9799090325832367, + "num_tokens": 22599599.0, + "step": 1100 + }, + { + "entropy": 0.8531602442264556, + "epoch": 2.044198895027624, + "grad_norm": 0.9075332880020142, + "learning_rate": 9.36422735006167e-05, + "loss": 0.05190724730491638, + "mean_token_accuracy": 0.979931116104126, + "num_tokens": 22805318.0, + "step": 1110 + }, + { + "entropy": 0.8657277703285218, + "epoch": 2.0626151012891345, + "grad_norm": 0.9466913938522339, + "learning_rate": 9.349285239564325e-05, + "loss": 0.053853434324264524, + "mean_token_accuracy": 0.9796103596687317, + "num_tokens": 23010438.0, + "step": 1120 + }, + { + "entropy": 0.8578485429286957, + "epoch": 2.0810313075506444, + "grad_norm": 0.6903054714202881, + "learning_rate": 9.334181784770326e-05, + "loss": 0.05228850841522217, + "mean_token_accuracy": 0.9802409887313843, + "num_tokens": 23215795.0, + "step": 1130 + }, + { + "entropy": 0.8450767934322357, + "epoch": 2.0994475138121547, + "grad_norm": 0.6615211367607117, + "learning_rate": 9.318917545968581e-05, + "loss": 0.050570905208587646, + "mean_token_accuracy": 0.9802053451538086, + "num_tokens": 23421157.0, + "step": 1140 + }, + { + "entropy": 0.8325044393539429, + "epoch": 2.117863720073665, + "grad_norm": 0.760960578918457, + "learning_rate": 9.303493089412564e-05, + "loss": 0.051966112852096555, + "mean_token_accuracy": 0.9796205997467041, + "num_tokens": 23626584.0, + "step": 1150 + }, + { + "entropy": 0.8416404843330383, + "epoch": 2.136279926335175, + "grad_norm": 0.6947009563446045, + "learning_rate": 9.287908987299306e-05, + "loss": 0.05144861936569214, + "mean_token_accuracy": 0.9800034642219544, + "num_tokens": 23832137.0, + "step": 1160 + }, + { + "entropy": 0.8564540028572083, + "epoch": 2.154696132596685, + "grad_norm": 0.733252763748169, + "learning_rate": 9.272165817748164e-05, + "loss": 0.04944799542427063, + "mean_token_accuracy": 0.9808157980442047, + "num_tokens": 24038006.0, + "step": 1170 + }, + { + "entropy": 0.8575525343418121, + "epoch": 2.1731123388581954, + "grad_norm": 0.8911028504371643, + "learning_rate": 9.25626416477938e-05, + "loss": 0.05037952661514282, + "mean_token_accuracy": 0.980946284532547, + "num_tokens": 24243374.0, + "step": 1180 + }, + { + "entropy": 0.8599720418453216, + "epoch": 2.1915285451197053, + "grad_norm": 0.7713524103164673, + "learning_rate": 9.240204618292416e-05, + "loss": 0.050603735446929934, + "mean_token_accuracy": 0.980896121263504, + "num_tokens": 24448585.0, + "step": 1190 + }, + { + "entropy": 0.8566664934158326, + "epoch": 2.2099447513812156, + "grad_norm": 0.8439353704452515, + "learning_rate": 9.223987774044066e-05, + "loss": 0.054171699285507205, + "mean_token_accuracy": 0.9796543836593627, + "num_tokens": 24653863.0, + "step": 1200 + }, + { + "entropy": 0.846601277589798, + "epoch": 2.2283609576427255, + "grad_norm": 0.7025637030601501, + "learning_rate": 9.207614233626356e-05, + "loss": 0.048924127221107484, + "mean_token_accuracy": 0.9809681415557862, + "num_tokens": 24859801.0, + "step": 1210 + }, + { + "entropy": 0.8564423739910125, + "epoch": 2.2467771639042358, + "grad_norm": 0.7788274884223938, + "learning_rate": 9.191084604444233e-05, + "loss": 0.05260283350944519, + "mean_token_accuracy": 0.9793797850608825, + "num_tokens": 25065368.0, + "step": 1220 + }, + { + "entropy": 0.865056723356247, + "epoch": 2.265193370165746, + "grad_norm": 0.8728818297386169, + "learning_rate": 9.174399499693027e-05, + "loss": 0.05016371011734009, + "mean_token_accuracy": 0.9807134211063385, + "num_tokens": 25270945.0, + "step": 1230 + }, + { + "entropy": 0.8642262935638427, + "epoch": 2.283609576427256, + "grad_norm": 1.0582489967346191, + "learning_rate": 9.157559538335703e-05, + "loss": 0.05316779017448425, + "mean_token_accuracy": 0.9794209063053131, + "num_tokens": 25476575.0, + "step": 1240 + }, + { + "entropy": 0.8677761554718018, + "epoch": 2.3020257826887662, + "grad_norm": 0.760109543800354, + "learning_rate": 9.140565345079901e-05, + "loss": 0.05115479230880737, + "mean_token_accuracy": 0.9802310705184937, + "num_tokens": 25682814.0, + "step": 1250 + }, + { + "entropy": 0.8592945456504821, + "epoch": 2.320441988950276, + "grad_norm": 0.6537907123565674, + "learning_rate": 9.123417550354761e-05, + "loss": 0.050543540716171266, + "mean_token_accuracy": 0.9806945025920868, + "num_tokens": 25887575.0, + "step": 1260 + }, + { + "entropy": 0.8692500293254852, + "epoch": 2.3388581952117864, + "grad_norm": 0.7771905064582825, + "learning_rate": 9.106116790287541e-05, + "loss": 0.049718713760375975, + "mean_token_accuracy": 0.9805168390274048, + "num_tokens": 26092950.0, + "step": 1270 + }, + { + "entropy": 0.8841261565685272, + "epoch": 2.3572744014732967, + "grad_norm": 0.7791076898574829, + "learning_rate": 9.08866370668001e-05, + "loss": 0.0527400553226471, + "mean_token_accuracy": 0.9796754539012908, + "num_tokens": 26298182.0, + "step": 1280 + }, + { + "entropy": 0.8675022900104523, + "epoch": 2.3756906077348066, + "grad_norm": 0.8481605648994446, + "learning_rate": 9.07105894698464e-05, + "loss": 0.05320838689804077, + "mean_token_accuracy": 0.9792274832725525, + "num_tokens": 26503425.0, + "step": 1290 + }, + { + "entropy": 0.8704026222229004, + "epoch": 2.394106813996317, + "grad_norm": 0.8235505819320679, + "learning_rate": 9.053303164280602e-05, + "loss": 0.055045205354690555, + "mean_token_accuracy": 0.9788750648498535, + "num_tokens": 26708755.0, + "step": 1300 + }, + { + "entropy": 0.8525134027004242, + "epoch": 2.4125230202578267, + "grad_norm": 0.7611598968505859, + "learning_rate": 9.035397017249518e-05, + "loss": 0.05029621124267578, + "mean_token_accuracy": 0.9802757322788238, + "num_tokens": 26914704.0, + "step": 1310 + }, + { + "entropy": 0.8630305290222168, + "epoch": 2.430939226519337, + "grad_norm": 0.790408194065094, + "learning_rate": 9.017341170151041e-05, + "loss": 0.04856040775775909, + "mean_token_accuracy": 0.9809690833091735, + "num_tokens": 27120151.0, + "step": 1320 + }, + { + "entropy": 0.8579159140586853, + "epoch": 2.4493554327808473, + "grad_norm": 0.781972348690033, + "learning_rate": 8.999136292798207e-05, + "loss": 0.04869682788848877, + "mean_token_accuracy": 0.9816130697727203, + "num_tokens": 27325673.0, + "step": 1330 + }, + { + "entropy": 0.8634716987609863, + "epoch": 2.467771639042357, + "grad_norm": 0.8500784039497375, + "learning_rate": 8.980783060532588e-05, + "loss": 0.05050289034843445, + "mean_token_accuracy": 0.980079609155655, + "num_tokens": 27531270.0, + "step": 1340 + }, + { + "entropy": 0.8660618126392364, + "epoch": 2.4861878453038675, + "grad_norm": 0.719760537147522, + "learning_rate": 8.96228215419924e-05, + "loss": 0.04892141819000244, + "mean_token_accuracy": 0.9814020991325378, + "num_tokens": 27736542.0, + "step": 1350 + }, + { + "entropy": 0.8572284400463104, + "epoch": 2.5046040515653774, + "grad_norm": 1.0197229385375977, + "learning_rate": 8.943634260121442e-05, + "loss": 0.05104702711105347, + "mean_token_accuracy": 0.9798846662044525, + "num_tokens": 27941566.0, + "step": 1360 + }, + { + "entropy": 0.8702241241931915, + "epoch": 2.5230202578268877, + "grad_norm": 0.7136003375053406, + "learning_rate": 8.924840070075247e-05, + "loss": 0.04855787754058838, + "mean_token_accuracy": 0.9811685383319855, + "num_tokens": 28146943.0, + "step": 1370 + }, + { + "entropy": 0.874957013130188, + "epoch": 2.541436464088398, + "grad_norm": 0.8775497674942017, + "learning_rate": 8.905900281263804e-05, + "loss": 0.052434295415878296, + "mean_token_accuracy": 0.9795438170433044, + "num_tokens": 28352640.0, + "step": 1380 + }, + { + "entropy": 0.8776536166667939, + "epoch": 2.559852670349908, + "grad_norm": 0.8895741105079651, + "learning_rate": 8.8868155962915e-05, + "loss": 0.05282890796661377, + "mean_token_accuracy": 0.9790538609027862, + "num_tokens": 28558153.0, + "step": 1390 + }, + { + "entropy": 0.8738743245601654, + "epoch": 2.578268876611418, + "grad_norm": 0.788800060749054, + "learning_rate": 8.867586723137906e-05, + "loss": 0.048841872811317445, + "mean_token_accuracy": 0.9809149026870727, + "num_tokens": 28763613.0, + "step": 1400 + }, + { + "entropy": 0.8750253796577454, + "epoch": 2.596685082872928, + "grad_norm": 0.8738002777099609, + "learning_rate": 8.848214375131497e-05, + "loss": 0.048261132836341855, + "mean_token_accuracy": 0.980789190530777, + "num_tokens": 28969248.0, + "step": 1410 + }, + { + "entropy": 0.8624245524406433, + "epoch": 2.6151012891344383, + "grad_norm": 0.6404895186424255, + "learning_rate": 8.828699270923196e-05, + "loss": 0.04970468282699585, + "mean_token_accuracy": 0.9807762265205383, + "num_tokens": 29174779.0, + "step": 1420 + }, + { + "entropy": 0.8792938470840455, + "epoch": 2.6335174953959486, + "grad_norm": 0.7856965661048889, + "learning_rate": 8.80904213445972e-05, + "loss": 0.053334391117095946, + "mean_token_accuracy": 0.9790222108364105, + "num_tokens": 29380474.0, + "step": 1430 + }, + { + "entropy": 0.8831034600734711, + "epoch": 2.6519337016574585, + "grad_norm": 0.7739618420600891, + "learning_rate": 8.789243694956716e-05, + "loss": 0.04959054589271546, + "mean_token_accuracy": 0.9803965091705322, + "num_tokens": 29585985.0, + "step": 1440 + }, + { + "entropy": 0.8934672951698304, + "epoch": 2.6703499079189688, + "grad_norm": 0.6999697089195251, + "learning_rate": 8.769304686871719e-05, + "loss": 0.05165250301361084, + "mean_token_accuracy": 0.9798884153366089, + "num_tokens": 29791238.0, + "step": 1450 + }, + { + "entropy": 0.9053199410438537, + "epoch": 2.6887661141804786, + "grad_norm": 0.9199564456939697, + "learning_rate": 8.749225849876892e-05, + "loss": 0.04924143850803375, + "mean_token_accuracy": 0.9810785710811615, + "num_tokens": 29996589.0, + "step": 1460 + }, + { + "entropy": 0.888091403245926, + "epoch": 2.707182320441989, + "grad_norm": 0.7480106353759766, + "learning_rate": 8.729007928831597e-05, + "loss": 0.04948916733264923, + "mean_token_accuracy": 0.9809579730033875, + "num_tokens": 30201875.0, + "step": 1470 + }, + { + "entropy": 0.8723407983779907, + "epoch": 2.7255985267034992, + "grad_norm": 0.9506945013999939, + "learning_rate": 8.708651673754763e-05, + "loss": 0.048927539587020875, + "mean_token_accuracy": 0.980553150177002, + "num_tokens": 30407550.0, + "step": 1480 + }, + { + "entropy": 0.8737521529197693, + "epoch": 2.744014732965009, + "grad_norm": 0.8015706539154053, + "learning_rate": 8.688157839797062e-05, + "loss": 0.04963063597679138, + "mean_token_accuracy": 0.9809738755226135, + "num_tokens": 30612839.0, + "step": 1490 + }, + { + "entropy": 0.8800762951374054, + "epoch": 2.7624309392265194, + "grad_norm": 0.9429986476898193, + "learning_rate": 8.667527187212885e-05, + "loss": 0.0524174690246582, + "mean_token_accuracy": 0.9788767337799072, + "num_tokens": 30818578.0, + "step": 1500 + }, + { + "entropy": 0.8871055901050567, + "epoch": 2.7808471454880292, + "grad_norm": 0.5909196138381958, + "learning_rate": 8.646760481332157e-05, + "loss": 0.05166680812835693, + "mean_token_accuracy": 0.980216771364212, + "num_tokens": 31023829.0, + "step": 1510 + }, + { + "entropy": 0.8908755779266357, + "epoch": 2.7992633517495396, + "grad_norm": 0.9154611229896545, + "learning_rate": 8.625858492531931e-05, + "loss": 0.04951836466789246, + "mean_token_accuracy": 0.9801484227180481, + "num_tokens": 31229635.0, + "step": 1520 + }, + { + "entropy": 0.92480548620224, + "epoch": 2.81767955801105, + "grad_norm": 0.5989938378334045, + "learning_rate": 8.604821996207819e-05, + "loss": 0.04799881279468536, + "mean_token_accuracy": 0.9817522585391998, + "num_tokens": 31435456.0, + "step": 1530 + }, + { + "entropy": 0.9173881888389588, + "epoch": 2.8360957642725597, + "grad_norm": 0.899413526058197, + "learning_rate": 8.58365177274522e-05, + "loss": 0.0487445592880249, + "mean_token_accuracy": 0.9812625288963318, + "num_tokens": 31640904.0, + "step": 1540 + }, + { + "entropy": 0.9076135993003845, + "epoch": 2.85451197053407, + "grad_norm": 0.8494166135787964, + "learning_rate": 8.562348607490376e-05, + "loss": 0.05005228519439697, + "mean_token_accuracy": 0.9806681036949157, + "num_tokens": 31845807.0, + "step": 1550 + }, + { + "entropy": 0.9092245221138, + "epoch": 2.87292817679558, + "grad_norm": 0.8225123286247253, + "learning_rate": 8.540913290721234e-05, + "loss": 0.048654764890670776, + "mean_token_accuracy": 0.9805659353733063, + "num_tokens": 32051523.0, + "step": 1560 + }, + { + "entropy": 0.9062779664993286, + "epoch": 2.89134438305709, + "grad_norm": 0.7074014544487, + "learning_rate": 8.519346617618134e-05, + "loss": 0.049209845066070554, + "mean_token_accuracy": 0.9807434439659118, + "num_tokens": 32256895.0, + "step": 1570 + }, + { + "entropy": 0.9190246641635895, + "epoch": 2.9097605893186005, + "grad_norm": 0.8860642910003662, + "learning_rate": 8.497649388234304e-05, + "loss": 0.051211881637573245, + "mean_token_accuracy": 0.9802342295646668, + "num_tokens": 32462031.0, + "step": 1580 + }, + { + "entropy": 0.9088015079498291, + "epoch": 2.9281767955801103, + "grad_norm": 0.8062726855278015, + "learning_rate": 8.475822407466188e-05, + "loss": 0.053512704372406, + "mean_token_accuracy": 0.979486483335495, + "num_tokens": 32667533.0, + "step": 1590 + }, + { + "entropy": 0.9462027847766876, + "epoch": 2.9465930018416207, + "grad_norm": 0.7962909936904907, + "learning_rate": 8.453866485023579e-05, + "loss": 0.0501457154750824, + "mean_token_accuracy": 0.9803222417831421, + "num_tokens": 32872900.0, + "step": 1600 + }, + { + "entropy": 0.9671471297740937, + "epoch": 2.9650092081031305, + "grad_norm": 0.7641744017601013, + "learning_rate": 8.431782435399587e-05, + "loss": 0.04629061222076416, + "mean_token_accuracy": 0.9823175370693207, + "num_tokens": 33077850.0, + "step": 1610 + }, + { + "entropy": 0.955865204334259, + "epoch": 2.983425414364641, + "grad_norm": 0.6772348880767822, + "learning_rate": 8.409571077840426e-05, + "loss": 0.048368623852729796, + "mean_token_accuracy": 0.9808700799942016, + "num_tokens": 33283117.0, + "step": 1620 + }, + { + "epoch": 3.0, + "eval_entropy": 0.9563225186389426, + "eval_loss": 0.059064481407403946, + "eval_mean_token_accuracy": 0.9773589429648026, + "eval_num_tokens": 33467712.0, + "eval_runtime": 10.1471, + "eval_samples_per_second": 360.499, + "eval_steps_per_second": 11.333, + "step": 1629 + }, + { + "entropy": 0.9337226033210755, + "epoch": 3.001841620626151, + "grad_norm": 0.646203875541687, + "learning_rate": 8.387233236315016e-05, + "loss": 0.043352216482162476, + "mean_token_accuracy": 0.9830620110034942, + "num_tokens": 33488302.0, + "step": 1630 + }, + { + "entropy": 0.9734923839569092, + "epoch": 3.020257826887661, + "grad_norm": 0.7564226984977722, + "learning_rate": 8.364769739484416e-05, + "loss": 0.033932483196258544, + "mean_token_accuracy": 0.9872806966304779, + "num_tokens": 33693531.0, + "step": 1640 + }, + { + "entropy": 0.9669206500053406, + "epoch": 3.0386740331491713, + "grad_norm": 0.7126886248588562, + "learning_rate": 8.342181420671096e-05, + "loss": 0.03818287253379822, + "mean_token_accuracy": 0.9852082908153534, + "num_tokens": 33899305.0, + "step": 1650 + }, + { + "entropy": 0.9522916138172149, + "epoch": 3.0570902394106816, + "grad_norm": 1.0571653842926025, + "learning_rate": 8.319469117828007e-05, + "loss": 0.03456039130687714, + "mean_token_accuracy": 0.9867027878761292, + "num_tokens": 34104585.0, + "step": 1660 + }, + { + "entropy": 0.9568560004234314, + "epoch": 3.0755064456721914, + "grad_norm": 0.780940592288971, + "learning_rate": 8.296633673507505e-05, + "loss": 0.03551802039146423, + "mean_token_accuracy": 0.9867531359195709, + "num_tokens": 34309516.0, + "step": 1670 + }, + { + "entropy": 0.9590656876564025, + "epoch": 3.0939226519337018, + "grad_norm": 0.8330219388008118, + "learning_rate": 8.273675934830094e-05, + "loss": 0.03674865961074829, + "mean_token_accuracy": 0.9864118576049805, + "num_tokens": 34515170.0, + "step": 1680 + }, + { + "entropy": 0.975881814956665, + "epoch": 3.1123388581952116, + "grad_norm": 0.7010637521743774, + "learning_rate": 8.250596753453e-05, + "loss": 0.03550414443016052, + "mean_token_accuracy": 0.9864102602005005, + "num_tokens": 34720896.0, + "step": 1690 + }, + { + "entropy": 0.9599562883377075, + "epoch": 3.130755064456722, + "grad_norm": 0.6694278717041016, + "learning_rate": 8.227396985538578e-05, + "loss": 0.035564273595809937, + "mean_token_accuracy": 0.9867321848869324, + "num_tokens": 34925970.0, + "step": 1700 + }, + { + "entropy": 0.9582216143608093, + "epoch": 3.149171270718232, + "grad_norm": 0.9333199262619019, + "learning_rate": 8.204077491722546e-05, + "loss": 0.035575729608535764, + "mean_token_accuracy": 0.9862452208995819, + "num_tokens": 35131543.0, + "step": 1710 + }, + { + "entropy": 0.9579678058624268, + "epoch": 3.167587476979742, + "grad_norm": 0.9450218081474304, + "learning_rate": 8.180639137082066e-05, + "loss": 0.0385298490524292, + "mean_token_accuracy": 0.98538036942482, + "num_tokens": 35336790.0, + "step": 1720 + }, + { + "entropy": 0.9640831351280212, + "epoch": 3.1860036832412524, + "grad_norm": 0.8551534414291382, + "learning_rate": 8.157082791103649e-05, + "loss": 0.03702138364315033, + "mean_token_accuracy": 0.9852015495300293, + "num_tokens": 35542294.0, + "step": 1730 + }, + { + "entropy": 0.9867071211338043, + "epoch": 3.2044198895027622, + "grad_norm": 0.7138128876686096, + "learning_rate": 8.133409327650897e-05, + "loss": 0.035626694560050964, + "mean_token_accuracy": 0.986064875125885, + "num_tokens": 35747447.0, + "step": 1740 + }, + { + "entropy": 0.9639089345932007, + "epoch": 3.2228360957642725, + "grad_norm": 0.7131415009498596, + "learning_rate": 8.109619624932092e-05, + "loss": 0.035885071754455565, + "mean_token_accuracy": 0.986273056268692, + "num_tokens": 35952258.0, + "step": 1750 + }, + { + "entropy": 0.9516046345233917, + "epoch": 3.241252302025783, + "grad_norm": 0.6900200843811035, + "learning_rate": 8.085714565467611e-05, + "loss": 0.03535219430923462, + "mean_token_accuracy": 0.985836285352707, + "num_tokens": 36157938.0, + "step": 1760 + }, + { + "entropy": 0.9373646557331086, + "epoch": 3.2596685082872927, + "grad_norm": 0.6101690530776978, + "learning_rate": 8.061695036057191e-05, + "loss": 0.034940996766090394, + "mean_token_accuracy": 0.9863743901252746, + "num_tokens": 36363825.0, + "step": 1770 + }, + { + "entropy": 0.9444344758987426, + "epoch": 3.278084714548803, + "grad_norm": 0.7518529295921326, + "learning_rate": 8.03756192774703e-05, + "loss": 0.03404279053211212, + "mean_token_accuracy": 0.9866396844387054, + "num_tokens": 36568961.0, + "step": 1780 + }, + { + "entropy": 0.9550357758998871, + "epoch": 3.2965009208103133, + "grad_norm": 0.7687555551528931, + "learning_rate": 8.013316135796734e-05, + "loss": 0.038447052240371704, + "mean_token_accuracy": 0.985325163602829, + "num_tokens": 36774514.0, + "step": 1790 + }, + { + "entropy": 0.9477231681346894, + "epoch": 3.314917127071823, + "grad_norm": 0.7521633505821228, + "learning_rate": 7.988958559646102e-05, + "loss": 0.03746694028377533, + "mean_token_accuracy": 0.9853165090084076, + "num_tokens": 36979660.0, + "step": 1800 + }, + { + "entropy": 0.925805002450943, + "epoch": 3.3333333333333335, + "grad_norm": 0.9333297610282898, + "learning_rate": 7.964490102881768e-05, + "loss": 0.03700103759765625, + "mean_token_accuracy": 0.9850880861282348, + "num_tokens": 37185191.0, + "step": 1810 + }, + { + "entropy": 0.9225482225418091, + "epoch": 3.3517495395948433, + "grad_norm": 0.7928622961044312, + "learning_rate": 7.939911673203665e-05, + "loss": 0.03825801610946655, + "mean_token_accuracy": 0.9850241422653199, + "num_tokens": 37390749.0, + "step": 1820 + }, + { + "entropy": 0.9597147881984711, + "epoch": 3.3701657458563536, + "grad_norm": 0.7658583521842957, + "learning_rate": 7.915224182391375e-05, + "loss": 0.039855146408081056, + "mean_token_accuracy": 0.9845879554748536, + "num_tokens": 37596052.0, + "step": 1830 + }, + { + "entropy": 0.9485619068145752, + "epoch": 3.388581952117864, + "grad_norm": 0.8492130637168884, + "learning_rate": 7.890428546270278e-05, + "loss": 0.039359599351882935, + "mean_token_accuracy": 0.9847265422344208, + "num_tokens": 37802063.0, + "step": 1840 + }, + { + "entropy": 0.9670301914215088, + "epoch": 3.406998158379374, + "grad_norm": 0.7527599930763245, + "learning_rate": 7.865525684677608e-05, + "loss": 0.03752985596656799, + "mean_token_accuracy": 0.9855137526988983, + "num_tokens": 38007432.0, + "step": 1850 + }, + { + "entropy": 0.9681244969367981, + "epoch": 3.425414364640884, + "grad_norm": 0.7599612474441528, + "learning_rate": 7.840516521428303e-05, + "loss": 0.03653894364833832, + "mean_token_accuracy": 0.9858933389186859, + "num_tokens": 38212923.0, + "step": 1860 + }, + { + "entropy": 0.9706049561500549, + "epoch": 3.443830570902394, + "grad_norm": 0.7678127884864807, + "learning_rate": 7.815401984280748e-05, + "loss": 0.0366938978433609, + "mean_token_accuracy": 0.9854713797569274, + "num_tokens": 38418422.0, + "step": 1870 + }, + { + "entropy": 0.9637093842029572, + "epoch": 3.4622467771639043, + "grad_norm": 0.762824535369873, + "learning_rate": 7.790183004902359e-05, + "loss": 0.03516915142536163, + "mean_token_accuracy": 0.9866003453731537, + "num_tokens": 38624389.0, + "step": 1880 + }, + { + "entropy": 0.9373565018177032, + "epoch": 3.4806629834254146, + "grad_norm": 0.8221780061721802, + "learning_rate": 7.764860518835014e-05, + "loss": 0.04049026966094971, + "mean_token_accuracy": 0.984089481830597, + "num_tokens": 38829654.0, + "step": 1890 + }, + { + "entropy": 0.9356025457382202, + "epoch": 3.4990791896869244, + "grad_norm": 0.7583426237106323, + "learning_rate": 7.739435465460356e-05, + "loss": 0.03658481240272522, + "mean_token_accuracy": 0.9857318818569183, + "num_tokens": 39034638.0, + "step": 1900 + }, + { + "entropy": 0.9740163326263428, + "epoch": 3.5174953959484347, + "grad_norm": 0.7332878112792969, + "learning_rate": 7.713908787964937e-05, + "loss": 0.03508963882923126, + "mean_token_accuracy": 0.9863419532775879, + "num_tokens": 39240265.0, + "step": 1910 + }, + { + "entropy": 0.9528286933898926, + "epoch": 3.5359116022099446, + "grad_norm": 0.6515451669692993, + "learning_rate": 7.688281433305233e-05, + "loss": 0.036055779457092284, + "mean_token_accuracy": 0.9860979080200195, + "num_tokens": 39445546.0, + "step": 1920 + }, + { + "entropy": 0.9480705261230469, + "epoch": 3.554327808471455, + "grad_norm": 0.7725827097892761, + "learning_rate": 7.662554352172515e-05, + "loss": 0.037101513147354125, + "mean_token_accuracy": 0.985782790184021, + "num_tokens": 39651078.0, + "step": 1930 + }, + { + "entropy": 0.9655321061611175, + "epoch": 3.572744014732965, + "grad_norm": 0.7756506204605103, + "learning_rate": 7.636728498957581e-05, + "loss": 0.03721855878829956, + "mean_token_accuracy": 0.9857951939105988, + "num_tokens": 39856542.0, + "step": 1940 + }, + { + "entropy": 0.9772682309150695, + "epoch": 3.591160220994475, + "grad_norm": 0.9084987640380859, + "learning_rate": 7.610804831715355e-05, + "loss": 0.03570749163627625, + "mean_token_accuracy": 0.9863450109958649, + "num_tokens": 40061913.0, + "step": 1950 + }, + { + "entropy": 0.9579685389995575, + "epoch": 3.6095764272559854, + "grad_norm": 0.6358487606048584, + "learning_rate": 7.584784312129334e-05, + "loss": 0.038210684061050416, + "mean_token_accuracy": 0.9850837290287018, + "num_tokens": 40267398.0, + "step": 1960 + }, + { + "entropy": 0.9605201721191406, + "epoch": 3.6279926335174952, + "grad_norm": 0.6263149976730347, + "learning_rate": 7.558667905475927e-05, + "loss": 0.03509160876274109, + "mean_token_accuracy": 0.9868143379688263, + "num_tokens": 40472827.0, + "step": 1970 + }, + { + "entropy": 0.964026153087616, + "epoch": 3.6464088397790055, + "grad_norm": 0.90068119764328, + "learning_rate": 7.532456580588638e-05, + "loss": 0.036211782693862916, + "mean_token_accuracy": 0.9858468770980835, + "num_tokens": 40677935.0, + "step": 1980 + }, + { + "entropy": 0.9494135618209839, + "epoch": 3.664825046040516, + "grad_norm": 0.760134756565094, + "learning_rate": 7.50615130982213e-05, + "loss": 0.03786201477050781, + "mean_token_accuracy": 0.9852500438690186, + "num_tokens": 40883750.0, + "step": 1990 + }, + { + "entropy": 0.9527071297168732, + "epoch": 3.6832412523020257, + "grad_norm": 0.9812107682228088, + "learning_rate": 7.479753069016152e-05, + "loss": 0.03803159594535828, + "mean_token_accuracy": 0.9852405369281769, + "num_tokens": 41089115.0, + "step": 2000 + }, + { + "entropy": 0.9639330863952636, + "epoch": 3.701657458563536, + "grad_norm": 0.7164933681488037, + "learning_rate": 7.453262837459332e-05, + "loss": 0.03912568986415863, + "mean_token_accuracy": 0.9849458575248718, + "num_tokens": 41294694.0, + "step": 2010 + }, + { + "entropy": 0.9536987483501435, + "epoch": 3.720073664825046, + "grad_norm": 0.6804596185684204, + "learning_rate": 7.426681597852863e-05, + "loss": 0.036410006880760196, + "mean_token_accuracy": 0.985712206363678, + "num_tokens": 41499817.0, + "step": 2020 + }, + { + "entropy": 0.9478164672851562, + "epoch": 3.738489871086556, + "grad_norm": 0.8799397349357605, + "learning_rate": 7.400010336274037e-05, + "loss": 0.03801035583019256, + "mean_token_accuracy": 0.9850274682044983, + "num_tokens": 41704932.0, + "step": 2030 + }, + { + "entropy": 0.9383447647094727, + "epoch": 3.7569060773480665, + "grad_norm": 0.8386216163635254, + "learning_rate": 7.373250042139664e-05, + "loss": 0.0373637855052948, + "mean_token_accuracy": 0.9854822158813477, + "num_tokens": 41910804.0, + "step": 2040 + }, + { + "entropy": 0.925172996520996, + "epoch": 3.7753222836095763, + "grad_norm": 0.7599324584007263, + "learning_rate": 7.346401708169377e-05, + "loss": 0.03585260808467865, + "mean_token_accuracy": 0.9860672950744629, + "num_tokens": 42116706.0, + "step": 2050 + }, + { + "entropy": 0.9463765442371368, + "epoch": 3.7937384898710866, + "grad_norm": 0.9030149579048157, + "learning_rate": 7.319466330348797e-05, + "loss": 0.035877206921577455, + "mean_token_accuracy": 0.9863968968391419, + "num_tokens": 42322670.0, + "step": 2060 + }, + { + "entropy": 0.9942441761493683, + "epoch": 3.8121546961325965, + "grad_norm": 0.6400449275970459, + "learning_rate": 7.292444907892587e-05, + "loss": 0.037310433387756345, + "mean_token_accuracy": 0.9854151606559753, + "num_tokens": 42527752.0, + "step": 2070 + }, + { + "entropy": 0.9577703952789307, + "epoch": 3.830570902394107, + "grad_norm": 0.6193167567253113, + "learning_rate": 7.265338443207387e-05, + "loss": 0.03648848831653595, + "mean_token_accuracy": 0.9856530070304871, + "num_tokens": 42732981.0, + "step": 2080 + }, + { + "entropy": 0.9663952767848969, + "epoch": 3.848987108655617, + "grad_norm": 0.759611189365387, + "learning_rate": 7.238147941854625e-05, + "loss": 0.036112996935844424, + "mean_token_accuracy": 0.9862765550613404, + "num_tokens": 42938619.0, + "step": 2090 + }, + { + "entropy": 0.9484863519668579, + "epoch": 3.867403314917127, + "grad_norm": 0.7420705556869507, + "learning_rate": 7.210874412513218e-05, + "loss": 0.03703283965587616, + "mean_token_accuracy": 0.9857317566871643, + "num_tokens": 43143753.0, + "step": 2100 + }, + { + "entropy": 0.964326673746109, + "epoch": 3.8858195211786373, + "grad_norm": 0.8779639601707458, + "learning_rate": 7.183518866942147e-05, + "loss": 0.03739701807498932, + "mean_token_accuracy": 0.9852154791355133, + "num_tokens": 43349451.0, + "step": 2110 + }, + { + "entropy": 0.9729791641235351, + "epoch": 3.904235727440147, + "grad_norm": 0.7582741379737854, + "learning_rate": 7.156082319942929e-05, + "loss": 0.03894525766372681, + "mean_token_accuracy": 0.9847454309463501, + "num_tokens": 43554598.0, + "step": 2120 + }, + { + "entropy": 0.9860592544078827, + "epoch": 3.9226519337016574, + "grad_norm": 0.860698938369751, + "learning_rate": 7.128565789321969e-05, + "loss": 0.0365300178527832, + "mean_token_accuracy": 0.9859121859073638, + "num_tokens": 43760081.0, + "step": 2130 + }, + { + "entropy": 0.9916551172733307, + "epoch": 3.9410681399631677, + "grad_norm": 0.8363776206970215, + "learning_rate": 7.100970295852805e-05, + "loss": 0.036221379041671754, + "mean_token_accuracy": 0.9859034180641174, + "num_tokens": 43965432.0, + "step": 2140 + }, + { + "entropy": 0.9553558886051178, + "epoch": 3.9594843462246776, + "grad_norm": 0.9627474546432495, + "learning_rate": 7.073296863238242e-05, + "loss": 0.03684481382369995, + "mean_token_accuracy": 0.9857315957546234, + "num_tokens": 44171232.0, + "step": 2150 + }, + { + "entropy": 0.9538035809993743, + "epoch": 3.977900552486188, + "grad_norm": 0.8399474620819092, + "learning_rate": 7.045546518072366e-05, + "loss": 0.03825397789478302, + "mean_token_accuracy": 0.9846831560134888, + "num_tokens": 44376723.0, + "step": 2160 + }, + { + "entropy": 0.9476235210895538, + "epoch": 3.9963167587476978, + "grad_norm": 0.708739697933197, + "learning_rate": 7.017720289802472e-05, + "loss": 0.03618018329143524, + "mean_token_accuracy": 0.9861325800418854, + "num_tokens": 44582407.0, + "step": 2170 + }, + { + "epoch": 4.0, + "eval_entropy": 0.9569619194321011, + "eval_loss": 0.059838198125362396, + "eval_mean_token_accuracy": 0.9777795366618944, + "eval_num_tokens": 44623647.0, + "eval_runtime": 10.0379, + "eval_samples_per_second": 364.42, + "eval_steps_per_second": 11.457, + "step": 2172 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.12729708313523e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2172/training_args.bin b/checkpoint-2172/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-2172/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/checkpoint-2715/README.md b/checkpoint-2715/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-2715/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2715/adapter_config.json b/checkpoint-2715/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-2715/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2715/adapter_model.safetensors b/checkpoint-2715/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5a72e2e421dee68709606aa3c59d0a38a675ebd6 --- /dev/null +++ b/checkpoint-2715/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65a66fa5ef9ed41e342eac55fca7f83744379f75f4d29b573d2790ba504c1659 +size 80792096 diff --git a/checkpoint-2715/chat_template.jinja b/checkpoint-2715/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-2715/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-2715/tokenizer.json b/checkpoint-2715/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-2715/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-2715/tokenizer_config.json b/checkpoint-2715/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-2715/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-2715/trainer_state.json b/checkpoint-2715/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0eb3eba2e0652f0cc35262a3123c6ef135fce116 --- /dev/null +++ b/checkpoint-2715/trainer_state.json @@ -0,0 +1,2799 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 2715, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + }, + { + "entropy": 0.9047818422317505, + "epoch": 1.0128913443830572, + "grad_norm": 0.6781501173973083, + "learning_rate": 9.929008521021325e-05, + "loss": 0.0863916516304016, + "mean_token_accuracy": 0.9673655688762665, + "num_tokens": 11299715.0, + "step": 550 + }, + { + "entropy": 0.8856981039047241, + "epoch": 1.0313075506445673, + "grad_norm": 0.7143136858940125, + "learning_rate": 9.923803553055937e-05, + "loss": 0.08632323145866394, + "mean_token_accuracy": 0.9677783191204071, + "num_tokens": 11505059.0, + "step": 560 + }, + { + "entropy": 0.8937099635601043, + "epoch": 1.0497237569060773, + "grad_norm": 0.7751694321632385, + "learning_rate": 9.918415928038325e-05, + "loss": 0.08178263902664185, + "mean_token_accuracy": 0.9694291114807129, + "num_tokens": 11710464.0, + "step": 570 + }, + { + "entropy": 0.8858704209327698, + "epoch": 1.0681399631675874, + "grad_norm": 0.7492292523384094, + "learning_rate": 9.912845845831805e-05, + "loss": 0.08074211478233337, + "mean_token_accuracy": 0.9692470014095307, + "num_tokens": 11915959.0, + "step": 580 + }, + { + "entropy": 0.8948039829730987, + "epoch": 1.0865561694290977, + "grad_norm": 0.8116479516029358, + "learning_rate": 9.907093513068259e-05, + "loss": 0.08712012171745301, + "mean_token_accuracy": 0.9669980227947235, + "num_tokens": 12121499.0, + "step": 590 + }, + { + "entropy": 0.8846789538860321, + "epoch": 1.1049723756906078, + "grad_norm": 0.7295626997947693, + "learning_rate": 9.901159143140471e-05, + "loss": 0.08444435596466064, + "mean_token_accuracy": 0.9674544095993042, + "num_tokens": 12327061.0, + "step": 600 + }, + { + "entropy": 0.8734103918075562, + "epoch": 1.1233885819521179, + "grad_norm": 0.9585768580436707, + "learning_rate": 9.89504295619421e-05, + "loss": 0.08022565841674804, + "mean_token_accuracy": 0.969569206237793, + "num_tokens": 12532305.0, + "step": 610 + }, + { + "entropy": 0.8640486001968384, + "epoch": 1.141804788213628, + "grad_norm": 0.7891159057617188, + "learning_rate": 9.88874517912006e-05, + "loss": 0.08415375947952271, + "mean_token_accuracy": 0.9678892493247986, + "num_tokens": 12737828.0, + "step": 620 + }, + { + "entropy": 0.8599755525588989, + "epoch": 1.160220994475138, + "grad_norm": 0.5801345109939575, + "learning_rate": 9.882266045545012e-05, + "loss": 0.08100489974021911, + "mean_token_accuracy": 0.9688023269176483, + "num_tokens": 12943343.0, + "step": 630 + }, + { + "entropy": 0.86524977684021, + "epoch": 1.1786372007366483, + "grad_norm": 0.7633041143417358, + "learning_rate": 9.87560579582379e-05, + "loss": 0.07859406471252442, + "mean_token_accuracy": 0.9702189445495606, + "num_tokens": 13148473.0, + "step": 640 + }, + { + "entropy": 0.8466695249080658, + "epoch": 1.1970534069981584, + "grad_norm": 0.8672215938568115, + "learning_rate": 9.868764677029934e-05, + "loss": 0.08082623481750488, + "mean_token_accuracy": 0.9689972400665283, + "num_tokens": 13353890.0, + "step": 650 + }, + { + "entropy": 0.8596941530704498, + "epoch": 1.2154696132596685, + "grad_norm": 0.7524124383926392, + "learning_rate": 9.861742942946639e-05, + "loss": 0.0789935290813446, + "mean_token_accuracy": 0.9693858206272126, + "num_tokens": 13559475.0, + "step": 660 + }, + { + "entropy": 0.8708749234676361, + "epoch": 1.2338858195211786, + "grad_norm": 0.5777031183242798, + "learning_rate": 9.854540854057337e-05, + "loss": 0.07773642539978028, + "mean_token_accuracy": 0.970385092496872, + "num_tokens": 13765076.0, + "step": 670 + }, + { + "entropy": 0.8651713371276856, + "epoch": 1.2523020257826887, + "grad_norm": 0.7924166321754456, + "learning_rate": 9.847158677536034e-05, + "loss": 0.0766686737537384, + "mean_token_accuracy": 0.9702267110347748, + "num_tokens": 13970642.0, + "step": 680 + }, + { + "entropy": 0.8763024985790253, + "epoch": 1.270718232044199, + "grad_norm": 0.741219162940979, + "learning_rate": 9.839596687237403e-05, + "loss": 0.07189929485321045, + "mean_token_accuracy": 0.9727097094058991, + "num_tokens": 14176556.0, + "step": 690 + }, + { + "entropy": 0.8556921362876893, + "epoch": 1.289134438305709, + "grad_norm": 0.6298198103904724, + "learning_rate": 9.831855163686618e-05, + "loss": 0.07608137726783752, + "mean_token_accuracy": 0.9716399371623993, + "num_tokens": 14381686.0, + "step": 700 + }, + { + "entropy": 0.869178420305252, + "epoch": 1.3075506445672191, + "grad_norm": 0.5850273370742798, + "learning_rate": 9.823934394068952e-05, + "loss": 0.07437651753425598, + "mean_token_accuracy": 0.9709566533565521, + "num_tokens": 14586814.0, + "step": 710 + }, + { + "entropy": 0.8708595156669616, + "epoch": 1.3259668508287292, + "grad_norm": 0.6580632328987122, + "learning_rate": 9.815834672219127e-05, + "loss": 0.07518917322158813, + "mean_token_accuracy": 0.9717426657676697, + "num_tokens": 14792321.0, + "step": 720 + }, + { + "entropy": 0.8826817810535431, + "epoch": 1.3443830570902393, + "grad_norm": 0.8788532018661499, + "learning_rate": 9.807556298610404e-05, + "loss": 0.07579240798950196, + "mean_token_accuracy": 0.9706341981887817, + "num_tokens": 14997810.0, + "step": 730 + }, + { + "entropy": 0.9012470185756684, + "epoch": 1.3627992633517496, + "grad_norm": 0.7022138237953186, + "learning_rate": 9.799099580343441e-05, + "loss": 0.0775588572025299, + "mean_token_accuracy": 0.9699241399765015, + "num_tokens": 15203795.0, + "step": 740 + }, + { + "entropy": 0.886955714225769, + "epoch": 1.3812154696132597, + "grad_norm": 0.7881133556365967, + "learning_rate": 9.790464831134903e-05, + "loss": 0.07125020027160645, + "mean_token_accuracy": 0.9723815560340882, + "num_tokens": 15408974.0, + "step": 750 + }, + { + "entropy": 0.9047374844551086, + "epoch": 1.3996316758747698, + "grad_norm": 0.9082005023956299, + "learning_rate": 9.781652371305824e-05, + "loss": 0.07004334926605224, + "mean_token_accuracy": 0.9725580036640167, + "num_tokens": 15614399.0, + "step": 760 + }, + { + "entropy": 0.9039053857326508, + "epoch": 1.4180478821362799, + "grad_norm": 0.8060817122459412, + "learning_rate": 9.77266252776972e-05, + "loss": 0.07103485465049744, + "mean_token_accuracy": 0.9721468150615692, + "num_tokens": 15819895.0, + "step": 770 + }, + { + "entropy": 0.8998047232627868, + "epoch": 1.43646408839779, + "grad_norm": 1.0152642726898193, + "learning_rate": 9.763495634020467e-05, + "loss": 0.07411704063415528, + "mean_token_accuracy": 0.9711063146591187, + "num_tokens": 16025297.0, + "step": 780 + }, + { + "entropy": 0.9120213568210602, + "epoch": 1.4548802946593002, + "grad_norm": 0.6288319826126099, + "learning_rate": 9.754152030119921e-05, + "loss": 0.07223712205886841, + "mean_token_accuracy": 0.9722476422786712, + "num_tokens": 16230656.0, + "step": 790 + }, + { + "entropy": 0.9142370820045471, + "epoch": 1.4732965009208103, + "grad_norm": 0.7854700088500977, + "learning_rate": 9.744632062685311e-05, + "loss": 0.07186744809150696, + "mean_token_accuracy": 0.972247713804245, + "num_tokens": 16435943.0, + "step": 800 + }, + { + "entropy": 0.8920814216136932, + "epoch": 1.4917127071823204, + "grad_norm": 0.6227074265480042, + "learning_rate": 9.734936084876383e-05, + "loss": 0.07016961574554444, + "mean_token_accuracy": 0.9725603640079499, + "num_tokens": 16641635.0, + "step": 810 + }, + { + "entropy": 0.891328877210617, + "epoch": 1.5101289134438307, + "grad_norm": 0.7601346969604492, + "learning_rate": 9.725064456382283e-05, + "loss": 0.07137494087219239, + "mean_token_accuracy": 0.9722997546195984, + "num_tokens": 16847194.0, + "step": 820 + }, + { + "entropy": 0.8921217978000641, + "epoch": 1.5285451197053406, + "grad_norm": 0.7813850045204163, + "learning_rate": 9.715017543408233e-05, + "loss": 0.06890199184417725, + "mean_token_accuracy": 0.9735044002532959, + "num_tokens": 17052807.0, + "step": 830 + }, + { + "entropy": 0.9085914671421051, + "epoch": 1.5469613259668509, + "grad_norm": 0.6184289455413818, + "learning_rate": 9.704795718661939e-05, + "loss": 0.07043765187263488, + "mean_token_accuracy": 0.9725716531276702, + "num_tokens": 17258284.0, + "step": 840 + }, + { + "entropy": 0.9029861629009247, + "epoch": 1.565377532228361, + "grad_norm": 0.7082377076148987, + "learning_rate": 9.694399361339752e-05, + "loss": 0.07113839387893676, + "mean_token_accuracy": 0.9725669205188752, + "num_tokens": 17464326.0, + "step": 850 + }, + { + "entropy": 0.8856533527374267, + "epoch": 1.583793738489871, + "grad_norm": 0.7409216165542603, + "learning_rate": 9.683828857112627e-05, + "loss": 0.07077333331108093, + "mean_token_accuracy": 0.9731084644794464, + "num_tokens": 17669537.0, + "step": 860 + }, + { + "entropy": 0.8613030433654785, + "epoch": 1.6022099447513813, + "grad_norm": 0.6801561713218689, + "learning_rate": 9.673084598111789e-05, + "loss": 0.06885308027267456, + "mean_token_accuracy": 0.97266526222229, + "num_tokens": 17875289.0, + "step": 870 + }, + { + "entropy": 0.8692965865135193, + "epoch": 1.6206261510128912, + "grad_norm": 1.1621277332305908, + "learning_rate": 9.662166982914203e-05, + "loss": 0.07017780542373657, + "mean_token_accuracy": 0.9733059942722321, + "num_tokens": 18080404.0, + "step": 880 + }, + { + "entropy": 0.8671502113342285, + "epoch": 1.6390423572744015, + "grad_norm": 0.7518903613090515, + "learning_rate": 9.651076416527787e-05, + "loss": 0.06977018713951111, + "mean_token_accuracy": 0.9730017304420471, + "num_tokens": 18285699.0, + "step": 890 + }, + { + "entropy": 0.8662045657634735, + "epoch": 1.6574585635359116, + "grad_norm": 0.6622698903083801, + "learning_rate": 9.639813310376378e-05, + "loss": 0.06620995998382569, + "mean_token_accuracy": 0.9737491130828857, + "num_tokens": 18491097.0, + "step": 900 + }, + { + "entropy": 0.8548173069953918, + "epoch": 1.6758747697974217, + "grad_norm": 0.8941843509674072, + "learning_rate": 9.628378082284479e-05, + "loss": 0.06711119413375854, + "mean_token_accuracy": 0.9740589797496796, + "num_tokens": 18696827.0, + "step": 910 + }, + { + "entropy": 0.8763562262058258, + "epoch": 1.694290976058932, + "grad_norm": 0.7571700215339661, + "learning_rate": 9.616771156461755e-05, + "loss": 0.07263468503952027, + "mean_token_accuracy": 0.9717419981956482, + "num_tokens": 18902513.0, + "step": 920 + }, + { + "entropy": 0.8663733780384064, + "epoch": 1.7127071823204418, + "grad_norm": 0.7886489629745483, + "learning_rate": 9.604992963487298e-05, + "loss": 0.07074605226516724, + "mean_token_accuracy": 0.9724965393543243, + "num_tokens": 19107812.0, + "step": 930 + }, + { + "entropy": 0.8673004627227783, + "epoch": 1.7311233885819521, + "grad_norm": 0.8180726170539856, + "learning_rate": 9.593043940293647e-05, + "loss": 0.06831735372543335, + "mean_token_accuracy": 0.9733696818351746, + "num_tokens": 19313330.0, + "step": 940 + }, + { + "entropy": 0.8525971233844757, + "epoch": 1.7495395948434622, + "grad_norm": 0.6576228737831116, + "learning_rate": 9.580924530150595e-05, + "loss": 0.06567002534866333, + "mean_token_accuracy": 0.9745754361152649, + "num_tokens": 19518671.0, + "step": 950 + }, + { + "entropy": 0.8605451703071594, + "epoch": 1.7679558011049723, + "grad_norm": 0.7171661257743835, + "learning_rate": 9.568635182648725e-05, + "loss": 0.06872050762176514, + "mean_token_accuracy": 0.9732091546058654, + "num_tokens": 19724135.0, + "step": 960 + }, + { + "entropy": 0.8642210960388184, + "epoch": 1.7863720073664826, + "grad_norm": 0.7603147029876709, + "learning_rate": 9.556176353682746e-05, + "loss": 0.06766576766967773, + "mean_token_accuracy": 0.9728681743144989, + "num_tokens": 19928785.0, + "step": 970 + }, + { + "entropy": 0.8543185651302337, + "epoch": 1.8047882136279927, + "grad_norm": 0.7280875444412231, + "learning_rate": 9.543548505434581e-05, + "loss": 0.06851862668991089, + "mean_token_accuracy": 0.9737437188625335, + "num_tokens": 20134195.0, + "step": 980 + }, + { + "entropy": 0.8744745373725891, + "epoch": 1.8232044198895028, + "grad_norm": 0.5897248983383179, + "learning_rate": 9.530752106356209e-05, + "loss": 0.06809053421020508, + "mean_token_accuracy": 0.9733593761920929, + "num_tokens": 20339517.0, + "step": 990 + }, + { + "entropy": 0.8623859465122223, + "epoch": 1.8416206261510129, + "grad_norm": 0.7515265345573425, + "learning_rate": 9.517787631152298e-05, + "loss": 0.07257847785949707, + "mean_token_accuracy": 0.9714054942131043, + "num_tokens": 20545249.0, + "step": 1000 + }, + { + "entropy": 0.8669404804706573, + "epoch": 1.860036832412523, + "grad_norm": 0.7144560813903809, + "learning_rate": 9.504655560762596e-05, + "loss": 0.06832354068756104, + "mean_token_accuracy": 0.9735779523849487, + "num_tokens": 20750507.0, + "step": 1010 + }, + { + "entropy": 0.8493516445159912, + "epoch": 1.8784530386740332, + "grad_norm": 0.6559189558029175, + "learning_rate": 9.491356382344081e-05, + "loss": 0.0629766047000885, + "mean_token_accuracy": 0.9754977762699127, + "num_tokens": 20955956.0, + "step": 1020 + }, + { + "entropy": 0.8599376022815705, + "epoch": 1.8968692449355433, + "grad_norm": 0.6792973279953003, + "learning_rate": 9.477890589252895e-05, + "loss": 0.0666757881641388, + "mean_token_accuracy": 0.974083811044693, + "num_tokens": 21161163.0, + "step": 1030 + }, + { + "entropy": 0.8458438158035279, + "epoch": 1.9152854511970534, + "grad_norm": 0.6941778659820557, + "learning_rate": 9.464258681026042e-05, + "loss": 0.06307152509689332, + "mean_token_accuracy": 0.9757042229175568, + "num_tokens": 21366525.0, + "step": 1040 + }, + { + "entropy": 0.848515909910202, + "epoch": 1.9337016574585635, + "grad_norm": 0.7307806611061096, + "learning_rate": 9.450461163362855e-05, + "loss": 0.06307026147842407, + "mean_token_accuracy": 0.9750974595546722, + "num_tokens": 21572238.0, + "step": 1050 + }, + { + "entropy": 0.8563454031944275, + "epoch": 1.9521178637200736, + "grad_norm": 0.7222106456756592, + "learning_rate": 9.436498548106236e-05, + "loss": 0.0647726058959961, + "mean_token_accuracy": 0.974629694223404, + "num_tokens": 21777633.0, + "step": 1060 + }, + { + "entropy": 0.8656457483768463, + "epoch": 1.9705340699815839, + "grad_norm": 0.67178875207901, + "learning_rate": 9.422371353223674e-05, + "loss": 0.06573554277420043, + "mean_token_accuracy": 0.9745908617973328, + "num_tokens": 21983116.0, + "step": 1070 + }, + { + "entropy": 0.8630891263484954, + "epoch": 1.988950276243094, + "grad_norm": 0.6956593990325928, + "learning_rate": 9.408080102788016e-05, + "loss": 0.06630704402923585, + "mean_token_accuracy": 0.9741333484649658, + "num_tokens": 22188662.0, + "step": 1080 + }, + { + "epoch": 2.0, + "eval_entropy": 0.8560857042022373, + "eval_loss": 0.06494329869747162, + "eval_mean_token_accuracy": 0.9745692672936813, + "eval_num_tokens": 22311800.0, + "eval_runtime": 10.129, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 11.354, + "step": 1086 + }, + { + "entropy": 0.8616272270679474, + "epoch": 2.007366482504604, + "grad_norm": 0.7778105139732361, + "learning_rate": 9.393625326958041e-05, + "loss": 0.054407155513763426, + "mean_token_accuracy": 0.9792074799537659, + "num_tokens": 22394215.0, + "step": 1090 + }, + { + "entropy": 0.8496910452842712, + "epoch": 2.0257826887661143, + "grad_norm": 0.7422528266906738, + "learning_rate": 9.379007561958792e-05, + "loss": 0.051881587505340575, + "mean_token_accuracy": 0.9799090325832367, + "num_tokens": 22599599.0, + "step": 1100 + }, + { + "entropy": 0.8531602442264556, + "epoch": 2.044198895027624, + "grad_norm": 0.9075332880020142, + "learning_rate": 9.36422735006167e-05, + "loss": 0.05190724730491638, + "mean_token_accuracy": 0.979931116104126, + "num_tokens": 22805318.0, + "step": 1110 + }, + { + "entropy": 0.8657277703285218, + "epoch": 2.0626151012891345, + "grad_norm": 0.9466913938522339, + "learning_rate": 9.349285239564325e-05, + "loss": 0.053853434324264524, + "mean_token_accuracy": 0.9796103596687317, + "num_tokens": 23010438.0, + "step": 1120 + }, + { + "entropy": 0.8578485429286957, + "epoch": 2.0810313075506444, + "grad_norm": 0.6903054714202881, + "learning_rate": 9.334181784770326e-05, + "loss": 0.05228850841522217, + "mean_token_accuracy": 0.9802409887313843, + "num_tokens": 23215795.0, + "step": 1130 + }, + { + "entropy": 0.8450767934322357, + "epoch": 2.0994475138121547, + "grad_norm": 0.6615211367607117, + "learning_rate": 9.318917545968581e-05, + "loss": 0.050570905208587646, + "mean_token_accuracy": 0.9802053451538086, + "num_tokens": 23421157.0, + "step": 1140 + }, + { + "entropy": 0.8325044393539429, + "epoch": 2.117863720073665, + "grad_norm": 0.760960578918457, + "learning_rate": 9.303493089412564e-05, + "loss": 0.051966112852096555, + "mean_token_accuracy": 0.9796205997467041, + "num_tokens": 23626584.0, + "step": 1150 + }, + { + "entropy": 0.8416404843330383, + "epoch": 2.136279926335175, + "grad_norm": 0.6947009563446045, + "learning_rate": 9.287908987299306e-05, + "loss": 0.05144861936569214, + "mean_token_accuracy": 0.9800034642219544, + "num_tokens": 23832137.0, + "step": 1160 + }, + { + "entropy": 0.8564540028572083, + "epoch": 2.154696132596685, + "grad_norm": 0.733252763748169, + "learning_rate": 9.272165817748164e-05, + "loss": 0.04944799542427063, + "mean_token_accuracy": 0.9808157980442047, + "num_tokens": 24038006.0, + "step": 1170 + }, + { + "entropy": 0.8575525343418121, + "epoch": 2.1731123388581954, + "grad_norm": 0.8911028504371643, + "learning_rate": 9.25626416477938e-05, + "loss": 0.05037952661514282, + "mean_token_accuracy": 0.980946284532547, + "num_tokens": 24243374.0, + "step": 1180 + }, + { + "entropy": 0.8599720418453216, + "epoch": 2.1915285451197053, + "grad_norm": 0.7713524103164673, + "learning_rate": 9.240204618292416e-05, + "loss": 0.050603735446929934, + "mean_token_accuracy": 0.980896121263504, + "num_tokens": 24448585.0, + "step": 1190 + }, + { + "entropy": 0.8566664934158326, + "epoch": 2.2099447513812156, + "grad_norm": 0.8439353704452515, + "learning_rate": 9.223987774044066e-05, + "loss": 0.054171699285507205, + "mean_token_accuracy": 0.9796543836593627, + "num_tokens": 24653863.0, + "step": 1200 + }, + { + "entropy": 0.846601277589798, + "epoch": 2.2283609576427255, + "grad_norm": 0.7025637030601501, + "learning_rate": 9.207614233626356e-05, + "loss": 0.048924127221107484, + "mean_token_accuracy": 0.9809681415557862, + "num_tokens": 24859801.0, + "step": 1210 + }, + { + "entropy": 0.8564423739910125, + "epoch": 2.2467771639042358, + "grad_norm": 0.7788274884223938, + "learning_rate": 9.191084604444233e-05, + "loss": 0.05260283350944519, + "mean_token_accuracy": 0.9793797850608825, + "num_tokens": 25065368.0, + "step": 1220 + }, + { + "entropy": 0.865056723356247, + "epoch": 2.265193370165746, + "grad_norm": 0.8728818297386169, + "learning_rate": 9.174399499693027e-05, + "loss": 0.05016371011734009, + "mean_token_accuracy": 0.9807134211063385, + "num_tokens": 25270945.0, + "step": 1230 + }, + { + "entropy": 0.8642262935638427, + "epoch": 2.283609576427256, + "grad_norm": 1.0582489967346191, + "learning_rate": 9.157559538335703e-05, + "loss": 0.05316779017448425, + "mean_token_accuracy": 0.9794209063053131, + "num_tokens": 25476575.0, + "step": 1240 + }, + { + "entropy": 0.8677761554718018, + "epoch": 2.3020257826887662, + "grad_norm": 0.760109543800354, + "learning_rate": 9.140565345079901e-05, + "loss": 0.05115479230880737, + "mean_token_accuracy": 0.9802310705184937, + "num_tokens": 25682814.0, + "step": 1250 + }, + { + "entropy": 0.8592945456504821, + "epoch": 2.320441988950276, + "grad_norm": 0.6537907123565674, + "learning_rate": 9.123417550354761e-05, + "loss": 0.050543540716171266, + "mean_token_accuracy": 0.9806945025920868, + "num_tokens": 25887575.0, + "step": 1260 + }, + { + "entropy": 0.8692500293254852, + "epoch": 2.3388581952117864, + "grad_norm": 0.7771905064582825, + "learning_rate": 9.106116790287541e-05, + "loss": 0.049718713760375975, + "mean_token_accuracy": 0.9805168390274048, + "num_tokens": 26092950.0, + "step": 1270 + }, + { + "entropy": 0.8841261565685272, + "epoch": 2.3572744014732967, + "grad_norm": 0.7791076898574829, + "learning_rate": 9.08866370668001e-05, + "loss": 0.0527400553226471, + "mean_token_accuracy": 0.9796754539012908, + "num_tokens": 26298182.0, + "step": 1280 + }, + { + "entropy": 0.8675022900104523, + "epoch": 2.3756906077348066, + "grad_norm": 0.8481605648994446, + "learning_rate": 9.07105894698464e-05, + "loss": 0.05320838689804077, + "mean_token_accuracy": 0.9792274832725525, + "num_tokens": 26503425.0, + "step": 1290 + }, + { + "entropy": 0.8704026222229004, + "epoch": 2.394106813996317, + "grad_norm": 0.8235505819320679, + "learning_rate": 9.053303164280602e-05, + "loss": 0.055045205354690555, + "mean_token_accuracy": 0.9788750648498535, + "num_tokens": 26708755.0, + "step": 1300 + }, + { + "entropy": 0.8525134027004242, + "epoch": 2.4125230202578267, + "grad_norm": 0.7611598968505859, + "learning_rate": 9.035397017249518e-05, + "loss": 0.05029621124267578, + "mean_token_accuracy": 0.9802757322788238, + "num_tokens": 26914704.0, + "step": 1310 + }, + { + "entropy": 0.8630305290222168, + "epoch": 2.430939226519337, + "grad_norm": 0.790408194065094, + "learning_rate": 9.017341170151041e-05, + "loss": 0.04856040775775909, + "mean_token_accuracy": 0.9809690833091735, + "num_tokens": 27120151.0, + "step": 1320 + }, + { + "entropy": 0.8579159140586853, + "epoch": 2.4493554327808473, + "grad_norm": 0.781972348690033, + "learning_rate": 8.999136292798207e-05, + "loss": 0.04869682788848877, + "mean_token_accuracy": 0.9816130697727203, + "num_tokens": 27325673.0, + "step": 1330 + }, + { + "entropy": 0.8634716987609863, + "epoch": 2.467771639042357, + "grad_norm": 0.8500784039497375, + "learning_rate": 8.980783060532588e-05, + "loss": 0.05050289034843445, + "mean_token_accuracy": 0.980079609155655, + "num_tokens": 27531270.0, + "step": 1340 + }, + { + "entropy": 0.8660618126392364, + "epoch": 2.4861878453038675, + "grad_norm": 0.719760537147522, + "learning_rate": 8.96228215419924e-05, + "loss": 0.04892141819000244, + "mean_token_accuracy": 0.9814020991325378, + "num_tokens": 27736542.0, + "step": 1350 + }, + { + "entropy": 0.8572284400463104, + "epoch": 2.5046040515653774, + "grad_norm": 1.0197229385375977, + "learning_rate": 8.943634260121442e-05, + "loss": 0.05104702711105347, + "mean_token_accuracy": 0.9798846662044525, + "num_tokens": 27941566.0, + "step": 1360 + }, + { + "entropy": 0.8702241241931915, + "epoch": 2.5230202578268877, + "grad_norm": 0.7136003375053406, + "learning_rate": 8.924840070075247e-05, + "loss": 0.04855787754058838, + "mean_token_accuracy": 0.9811685383319855, + "num_tokens": 28146943.0, + "step": 1370 + }, + { + "entropy": 0.874957013130188, + "epoch": 2.541436464088398, + "grad_norm": 0.8775497674942017, + "learning_rate": 8.905900281263804e-05, + "loss": 0.052434295415878296, + "mean_token_accuracy": 0.9795438170433044, + "num_tokens": 28352640.0, + "step": 1380 + }, + { + "entropy": 0.8776536166667939, + "epoch": 2.559852670349908, + "grad_norm": 0.8895741105079651, + "learning_rate": 8.8868155962915e-05, + "loss": 0.05282890796661377, + "mean_token_accuracy": 0.9790538609027862, + "num_tokens": 28558153.0, + "step": 1390 + }, + { + "entropy": 0.8738743245601654, + "epoch": 2.578268876611418, + "grad_norm": 0.788800060749054, + "learning_rate": 8.867586723137906e-05, + "loss": 0.048841872811317445, + "mean_token_accuracy": 0.9809149026870727, + "num_tokens": 28763613.0, + "step": 1400 + }, + { + "entropy": 0.8750253796577454, + "epoch": 2.596685082872928, + "grad_norm": 0.8738002777099609, + "learning_rate": 8.848214375131497e-05, + "loss": 0.048261132836341855, + "mean_token_accuracy": 0.980789190530777, + "num_tokens": 28969248.0, + "step": 1410 + }, + { + "entropy": 0.8624245524406433, + "epoch": 2.6151012891344383, + "grad_norm": 0.6404895186424255, + "learning_rate": 8.828699270923196e-05, + "loss": 0.04970468282699585, + "mean_token_accuracy": 0.9807762265205383, + "num_tokens": 29174779.0, + "step": 1420 + }, + { + "entropy": 0.8792938470840455, + "epoch": 2.6335174953959486, + "grad_norm": 0.7856965661048889, + "learning_rate": 8.80904213445972e-05, + "loss": 0.053334391117095946, + "mean_token_accuracy": 0.9790222108364105, + "num_tokens": 29380474.0, + "step": 1430 + }, + { + "entropy": 0.8831034600734711, + "epoch": 2.6519337016574585, + "grad_norm": 0.7739618420600891, + "learning_rate": 8.789243694956716e-05, + "loss": 0.04959054589271546, + "mean_token_accuracy": 0.9803965091705322, + "num_tokens": 29585985.0, + "step": 1440 + }, + { + "entropy": 0.8934672951698304, + "epoch": 2.6703499079189688, + "grad_norm": 0.6999697089195251, + "learning_rate": 8.769304686871719e-05, + "loss": 0.05165250301361084, + "mean_token_accuracy": 0.9798884153366089, + "num_tokens": 29791238.0, + "step": 1450 + }, + { + "entropy": 0.9053199410438537, + "epoch": 2.6887661141804786, + "grad_norm": 0.9199564456939697, + "learning_rate": 8.749225849876892e-05, + "loss": 0.04924143850803375, + "mean_token_accuracy": 0.9810785710811615, + "num_tokens": 29996589.0, + "step": 1460 + }, + { + "entropy": 0.888091403245926, + "epoch": 2.707182320441989, + "grad_norm": 0.7480106353759766, + "learning_rate": 8.729007928831597e-05, + "loss": 0.04948916733264923, + "mean_token_accuracy": 0.9809579730033875, + "num_tokens": 30201875.0, + "step": 1470 + }, + { + "entropy": 0.8723407983779907, + "epoch": 2.7255985267034992, + "grad_norm": 0.9506945013999939, + "learning_rate": 8.708651673754763e-05, + "loss": 0.048927539587020875, + "mean_token_accuracy": 0.980553150177002, + "num_tokens": 30407550.0, + "step": 1480 + }, + { + "entropy": 0.8737521529197693, + "epoch": 2.744014732965009, + "grad_norm": 0.8015706539154053, + "learning_rate": 8.688157839797062e-05, + "loss": 0.04963063597679138, + "mean_token_accuracy": 0.9809738755226135, + "num_tokens": 30612839.0, + "step": 1490 + }, + { + "entropy": 0.8800762951374054, + "epoch": 2.7624309392265194, + "grad_norm": 0.9429986476898193, + "learning_rate": 8.667527187212885e-05, + "loss": 0.0524174690246582, + "mean_token_accuracy": 0.9788767337799072, + "num_tokens": 30818578.0, + "step": 1500 + }, + { + "entropy": 0.8871055901050567, + "epoch": 2.7808471454880292, + "grad_norm": 0.5909196138381958, + "learning_rate": 8.646760481332157e-05, + "loss": 0.05166680812835693, + "mean_token_accuracy": 0.980216771364212, + "num_tokens": 31023829.0, + "step": 1510 + }, + { + "entropy": 0.8908755779266357, + "epoch": 2.7992633517495396, + "grad_norm": 0.9154611229896545, + "learning_rate": 8.625858492531931e-05, + "loss": 0.04951836466789246, + "mean_token_accuracy": 0.9801484227180481, + "num_tokens": 31229635.0, + "step": 1520 + }, + { + "entropy": 0.92480548620224, + "epoch": 2.81767955801105, + "grad_norm": 0.5989938378334045, + "learning_rate": 8.604821996207819e-05, + "loss": 0.04799881279468536, + "mean_token_accuracy": 0.9817522585391998, + "num_tokens": 31435456.0, + "step": 1530 + }, + { + "entropy": 0.9173881888389588, + "epoch": 2.8360957642725597, + "grad_norm": 0.899413526058197, + "learning_rate": 8.58365177274522e-05, + "loss": 0.0487445592880249, + "mean_token_accuracy": 0.9812625288963318, + "num_tokens": 31640904.0, + "step": 1540 + }, + { + "entropy": 0.9076135993003845, + "epoch": 2.85451197053407, + "grad_norm": 0.8494166135787964, + "learning_rate": 8.562348607490376e-05, + "loss": 0.05005228519439697, + "mean_token_accuracy": 0.9806681036949157, + "num_tokens": 31845807.0, + "step": 1550 + }, + { + "entropy": 0.9092245221138, + "epoch": 2.87292817679558, + "grad_norm": 0.8225123286247253, + "learning_rate": 8.540913290721234e-05, + "loss": 0.048654764890670776, + "mean_token_accuracy": 0.9805659353733063, + "num_tokens": 32051523.0, + "step": 1560 + }, + { + "entropy": 0.9062779664993286, + "epoch": 2.89134438305709, + "grad_norm": 0.7074014544487, + "learning_rate": 8.519346617618134e-05, + "loss": 0.049209845066070554, + "mean_token_accuracy": 0.9807434439659118, + "num_tokens": 32256895.0, + "step": 1570 + }, + { + "entropy": 0.9190246641635895, + "epoch": 2.9097605893186005, + "grad_norm": 0.8860642910003662, + "learning_rate": 8.497649388234304e-05, + "loss": 0.051211881637573245, + "mean_token_accuracy": 0.9802342295646668, + "num_tokens": 32462031.0, + "step": 1580 + }, + { + "entropy": 0.9088015079498291, + "epoch": 2.9281767955801103, + "grad_norm": 0.8062726855278015, + "learning_rate": 8.475822407466188e-05, + "loss": 0.053512704372406, + "mean_token_accuracy": 0.979486483335495, + "num_tokens": 32667533.0, + "step": 1590 + }, + { + "entropy": 0.9462027847766876, + "epoch": 2.9465930018416207, + "grad_norm": 0.7962909936904907, + "learning_rate": 8.453866485023579e-05, + "loss": 0.0501457154750824, + "mean_token_accuracy": 0.9803222417831421, + "num_tokens": 32872900.0, + "step": 1600 + }, + { + "entropy": 0.9671471297740937, + "epoch": 2.9650092081031305, + "grad_norm": 0.7641744017601013, + "learning_rate": 8.431782435399587e-05, + "loss": 0.04629061222076416, + "mean_token_accuracy": 0.9823175370693207, + "num_tokens": 33077850.0, + "step": 1610 + }, + { + "entropy": 0.955865204334259, + "epoch": 2.983425414364641, + "grad_norm": 0.6772348880767822, + "learning_rate": 8.409571077840426e-05, + "loss": 0.048368623852729796, + "mean_token_accuracy": 0.9808700799942016, + "num_tokens": 33283117.0, + "step": 1620 + }, + { + "epoch": 3.0, + "eval_entropy": 0.9563225186389426, + "eval_loss": 0.059064481407403946, + "eval_mean_token_accuracy": 0.9773589429648026, + "eval_num_tokens": 33467712.0, + "eval_runtime": 10.1471, + "eval_samples_per_second": 360.499, + "eval_steps_per_second": 11.333, + "step": 1629 + }, + { + "entropy": 0.9337226033210755, + "epoch": 3.001841620626151, + "grad_norm": 0.646203875541687, + "learning_rate": 8.387233236315016e-05, + "loss": 0.043352216482162476, + "mean_token_accuracy": 0.9830620110034942, + "num_tokens": 33488302.0, + "step": 1630 + }, + { + "entropy": 0.9734923839569092, + "epoch": 3.020257826887661, + "grad_norm": 0.7564226984977722, + "learning_rate": 8.364769739484416e-05, + "loss": 0.033932483196258544, + "mean_token_accuracy": 0.9872806966304779, + "num_tokens": 33693531.0, + "step": 1640 + }, + { + "entropy": 0.9669206500053406, + "epoch": 3.0386740331491713, + "grad_norm": 0.7126886248588562, + "learning_rate": 8.342181420671096e-05, + "loss": 0.03818287253379822, + "mean_token_accuracy": 0.9852082908153534, + "num_tokens": 33899305.0, + "step": 1650 + }, + { + "entropy": 0.9522916138172149, + "epoch": 3.0570902394106816, + "grad_norm": 1.0571653842926025, + "learning_rate": 8.319469117828007e-05, + "loss": 0.03456039130687714, + "mean_token_accuracy": 0.9867027878761292, + "num_tokens": 34104585.0, + "step": 1660 + }, + { + "entropy": 0.9568560004234314, + "epoch": 3.0755064456721914, + "grad_norm": 0.780940592288971, + "learning_rate": 8.296633673507505e-05, + "loss": 0.03551802039146423, + "mean_token_accuracy": 0.9867531359195709, + "num_tokens": 34309516.0, + "step": 1670 + }, + { + "entropy": 0.9590656876564025, + "epoch": 3.0939226519337018, + "grad_norm": 0.8330219388008118, + "learning_rate": 8.273675934830094e-05, + "loss": 0.03674865961074829, + "mean_token_accuracy": 0.9864118576049805, + "num_tokens": 34515170.0, + "step": 1680 + }, + { + "entropy": 0.975881814956665, + "epoch": 3.1123388581952116, + "grad_norm": 0.7010637521743774, + "learning_rate": 8.250596753453e-05, + "loss": 0.03550414443016052, + "mean_token_accuracy": 0.9864102602005005, + "num_tokens": 34720896.0, + "step": 1690 + }, + { + "entropy": 0.9599562883377075, + "epoch": 3.130755064456722, + "grad_norm": 0.6694278717041016, + "learning_rate": 8.227396985538578e-05, + "loss": 0.035564273595809937, + "mean_token_accuracy": 0.9867321848869324, + "num_tokens": 34925970.0, + "step": 1700 + }, + { + "entropy": 0.9582216143608093, + "epoch": 3.149171270718232, + "grad_norm": 0.9333199262619019, + "learning_rate": 8.204077491722546e-05, + "loss": 0.035575729608535764, + "mean_token_accuracy": 0.9862452208995819, + "num_tokens": 35131543.0, + "step": 1710 + }, + { + "entropy": 0.9579678058624268, + "epoch": 3.167587476979742, + "grad_norm": 0.9450218081474304, + "learning_rate": 8.180639137082066e-05, + "loss": 0.0385298490524292, + "mean_token_accuracy": 0.98538036942482, + "num_tokens": 35336790.0, + "step": 1720 + }, + { + "entropy": 0.9640831351280212, + "epoch": 3.1860036832412524, + "grad_norm": 0.8551534414291382, + "learning_rate": 8.157082791103649e-05, + "loss": 0.03702138364315033, + "mean_token_accuracy": 0.9852015495300293, + "num_tokens": 35542294.0, + "step": 1730 + }, + { + "entropy": 0.9867071211338043, + "epoch": 3.2044198895027622, + "grad_norm": 0.7138128876686096, + "learning_rate": 8.133409327650897e-05, + "loss": 0.035626694560050964, + "mean_token_accuracy": 0.986064875125885, + "num_tokens": 35747447.0, + "step": 1740 + }, + { + "entropy": 0.9639089345932007, + "epoch": 3.2228360957642725, + "grad_norm": 0.7131415009498596, + "learning_rate": 8.109619624932092e-05, + "loss": 0.035885071754455565, + "mean_token_accuracy": 0.986273056268692, + "num_tokens": 35952258.0, + "step": 1750 + }, + { + "entropy": 0.9516046345233917, + "epoch": 3.241252302025783, + "grad_norm": 0.6900200843811035, + "learning_rate": 8.085714565467611e-05, + "loss": 0.03535219430923462, + "mean_token_accuracy": 0.985836285352707, + "num_tokens": 36157938.0, + "step": 1760 + }, + { + "entropy": 0.9373646557331086, + "epoch": 3.2596685082872927, + "grad_norm": 0.6101690530776978, + "learning_rate": 8.061695036057191e-05, + "loss": 0.034940996766090394, + "mean_token_accuracy": 0.9863743901252746, + "num_tokens": 36363825.0, + "step": 1770 + }, + { + "entropy": 0.9444344758987426, + "epoch": 3.278084714548803, + "grad_norm": 0.7518529295921326, + "learning_rate": 8.03756192774703e-05, + "loss": 0.03404279053211212, + "mean_token_accuracy": 0.9866396844387054, + "num_tokens": 36568961.0, + "step": 1780 + }, + { + "entropy": 0.9550357758998871, + "epoch": 3.2965009208103133, + "grad_norm": 0.7687555551528931, + "learning_rate": 8.013316135796734e-05, + "loss": 0.038447052240371704, + "mean_token_accuracy": 0.985325163602829, + "num_tokens": 36774514.0, + "step": 1790 + }, + { + "entropy": 0.9477231681346894, + "epoch": 3.314917127071823, + "grad_norm": 0.7521633505821228, + "learning_rate": 7.988958559646102e-05, + "loss": 0.03746694028377533, + "mean_token_accuracy": 0.9853165090084076, + "num_tokens": 36979660.0, + "step": 1800 + }, + { + "entropy": 0.925805002450943, + "epoch": 3.3333333333333335, + "grad_norm": 0.9333297610282898, + "learning_rate": 7.964490102881768e-05, + "loss": 0.03700103759765625, + "mean_token_accuracy": 0.9850880861282348, + "num_tokens": 37185191.0, + "step": 1810 + }, + { + "entropy": 0.9225482225418091, + "epoch": 3.3517495395948433, + "grad_norm": 0.7928622961044312, + "learning_rate": 7.939911673203665e-05, + "loss": 0.03825801610946655, + "mean_token_accuracy": 0.9850241422653199, + "num_tokens": 37390749.0, + "step": 1820 + }, + { + "entropy": 0.9597147881984711, + "epoch": 3.3701657458563536, + "grad_norm": 0.7658583521842957, + "learning_rate": 7.915224182391375e-05, + "loss": 0.039855146408081056, + "mean_token_accuracy": 0.9845879554748536, + "num_tokens": 37596052.0, + "step": 1830 + }, + { + "entropy": 0.9485619068145752, + "epoch": 3.388581952117864, + "grad_norm": 0.8492130637168884, + "learning_rate": 7.890428546270278e-05, + "loss": 0.039359599351882935, + "mean_token_accuracy": 0.9847265422344208, + "num_tokens": 37802063.0, + "step": 1840 + }, + { + "entropy": 0.9670301914215088, + "epoch": 3.406998158379374, + "grad_norm": 0.7527599930763245, + "learning_rate": 7.865525684677608e-05, + "loss": 0.03752985596656799, + "mean_token_accuracy": 0.9855137526988983, + "num_tokens": 38007432.0, + "step": 1850 + }, + { + "entropy": 0.9681244969367981, + "epoch": 3.425414364640884, + "grad_norm": 0.7599612474441528, + "learning_rate": 7.840516521428303e-05, + "loss": 0.03653894364833832, + "mean_token_accuracy": 0.9858933389186859, + "num_tokens": 38212923.0, + "step": 1860 + }, + { + "entropy": 0.9706049561500549, + "epoch": 3.443830570902394, + "grad_norm": 0.7678127884864807, + "learning_rate": 7.815401984280748e-05, + "loss": 0.0366938978433609, + "mean_token_accuracy": 0.9854713797569274, + "num_tokens": 38418422.0, + "step": 1870 + }, + { + "entropy": 0.9637093842029572, + "epoch": 3.4622467771639043, + "grad_norm": 0.762824535369873, + "learning_rate": 7.790183004902359e-05, + "loss": 0.03516915142536163, + "mean_token_accuracy": 0.9866003453731537, + "num_tokens": 38624389.0, + "step": 1880 + }, + { + "entropy": 0.9373565018177032, + "epoch": 3.4806629834254146, + "grad_norm": 0.8221780061721802, + "learning_rate": 7.764860518835014e-05, + "loss": 0.04049026966094971, + "mean_token_accuracy": 0.984089481830597, + "num_tokens": 38829654.0, + "step": 1890 + }, + { + "entropy": 0.9356025457382202, + "epoch": 3.4990791896869244, + "grad_norm": 0.7583426237106323, + "learning_rate": 7.739435465460356e-05, + "loss": 0.03658481240272522, + "mean_token_accuracy": 0.9857318818569183, + "num_tokens": 39034638.0, + "step": 1900 + }, + { + "entropy": 0.9740163326263428, + "epoch": 3.5174953959484347, + "grad_norm": 0.7332878112792969, + "learning_rate": 7.713908787964937e-05, + "loss": 0.03508963882923126, + "mean_token_accuracy": 0.9863419532775879, + "num_tokens": 39240265.0, + "step": 1910 + }, + { + "entropy": 0.9528286933898926, + "epoch": 3.5359116022099446, + "grad_norm": 0.6515451669692993, + "learning_rate": 7.688281433305233e-05, + "loss": 0.036055779457092284, + "mean_token_accuracy": 0.9860979080200195, + "num_tokens": 39445546.0, + "step": 1920 + }, + { + "entropy": 0.9480705261230469, + "epoch": 3.554327808471455, + "grad_norm": 0.7725827097892761, + "learning_rate": 7.662554352172515e-05, + "loss": 0.037101513147354125, + "mean_token_accuracy": 0.985782790184021, + "num_tokens": 39651078.0, + "step": 1930 + }, + { + "entropy": 0.9655321061611175, + "epoch": 3.572744014732965, + "grad_norm": 0.7756506204605103, + "learning_rate": 7.636728498957581e-05, + "loss": 0.03721855878829956, + "mean_token_accuracy": 0.9857951939105988, + "num_tokens": 39856542.0, + "step": 1940 + }, + { + "entropy": 0.9772682309150695, + "epoch": 3.591160220994475, + "grad_norm": 0.9084987640380859, + "learning_rate": 7.610804831715355e-05, + "loss": 0.03570749163627625, + "mean_token_accuracy": 0.9863450109958649, + "num_tokens": 40061913.0, + "step": 1950 + }, + { + "entropy": 0.9579685389995575, + "epoch": 3.6095764272559854, + "grad_norm": 0.6358487606048584, + "learning_rate": 7.584784312129334e-05, + "loss": 0.038210684061050416, + "mean_token_accuracy": 0.9850837290287018, + "num_tokens": 40267398.0, + "step": 1960 + }, + { + "entropy": 0.9605201721191406, + "epoch": 3.6279926335174952, + "grad_norm": 0.6263149976730347, + "learning_rate": 7.558667905475927e-05, + "loss": 0.03509160876274109, + "mean_token_accuracy": 0.9868143379688263, + "num_tokens": 40472827.0, + "step": 1970 + }, + { + "entropy": 0.964026153087616, + "epoch": 3.6464088397790055, + "grad_norm": 0.90068119764328, + "learning_rate": 7.532456580588638e-05, + "loss": 0.036211782693862916, + "mean_token_accuracy": 0.9858468770980835, + "num_tokens": 40677935.0, + "step": 1980 + }, + { + "entropy": 0.9494135618209839, + "epoch": 3.664825046040516, + "grad_norm": 0.760134756565094, + "learning_rate": 7.50615130982213e-05, + "loss": 0.03786201477050781, + "mean_token_accuracy": 0.9852500438690186, + "num_tokens": 40883750.0, + "step": 1990 + }, + { + "entropy": 0.9527071297168732, + "epoch": 3.6832412523020257, + "grad_norm": 0.9812107682228088, + "learning_rate": 7.479753069016152e-05, + "loss": 0.03803159594535828, + "mean_token_accuracy": 0.9852405369281769, + "num_tokens": 41089115.0, + "step": 2000 + }, + { + "entropy": 0.9639330863952636, + "epoch": 3.701657458563536, + "grad_norm": 0.7164933681488037, + "learning_rate": 7.453262837459332e-05, + "loss": 0.03912568986415863, + "mean_token_accuracy": 0.9849458575248718, + "num_tokens": 41294694.0, + "step": 2010 + }, + { + "entropy": 0.9536987483501435, + "epoch": 3.720073664825046, + "grad_norm": 0.6804596185684204, + "learning_rate": 7.426681597852863e-05, + "loss": 0.036410006880760196, + "mean_token_accuracy": 0.985712206363678, + "num_tokens": 41499817.0, + "step": 2020 + }, + { + "entropy": 0.9478164672851562, + "epoch": 3.738489871086556, + "grad_norm": 0.8799397349357605, + "learning_rate": 7.400010336274037e-05, + "loss": 0.03801035583019256, + "mean_token_accuracy": 0.9850274682044983, + "num_tokens": 41704932.0, + "step": 2030 + }, + { + "entropy": 0.9383447647094727, + "epoch": 3.7569060773480665, + "grad_norm": 0.8386216163635254, + "learning_rate": 7.373250042139664e-05, + "loss": 0.0373637855052948, + "mean_token_accuracy": 0.9854822158813477, + "num_tokens": 41910804.0, + "step": 2040 + }, + { + "entropy": 0.925172996520996, + "epoch": 3.7753222836095763, + "grad_norm": 0.7599324584007263, + "learning_rate": 7.346401708169377e-05, + "loss": 0.03585260808467865, + "mean_token_accuracy": 0.9860672950744629, + "num_tokens": 42116706.0, + "step": 2050 + }, + { + "entropy": 0.9463765442371368, + "epoch": 3.7937384898710866, + "grad_norm": 0.9030149579048157, + "learning_rate": 7.319466330348797e-05, + "loss": 0.035877206921577455, + "mean_token_accuracy": 0.9863968968391419, + "num_tokens": 42322670.0, + "step": 2060 + }, + { + "entropy": 0.9942441761493683, + "epoch": 3.8121546961325965, + "grad_norm": 0.6400449275970459, + "learning_rate": 7.292444907892587e-05, + "loss": 0.037310433387756345, + "mean_token_accuracy": 0.9854151606559753, + "num_tokens": 42527752.0, + "step": 2070 + }, + { + "entropy": 0.9577703952789307, + "epoch": 3.830570902394107, + "grad_norm": 0.6193167567253113, + "learning_rate": 7.265338443207387e-05, + "loss": 0.03648848831653595, + "mean_token_accuracy": 0.9856530070304871, + "num_tokens": 42732981.0, + "step": 2080 + }, + { + "entropy": 0.9663952767848969, + "epoch": 3.848987108655617, + "grad_norm": 0.759611189365387, + "learning_rate": 7.238147941854625e-05, + "loss": 0.036112996935844424, + "mean_token_accuracy": 0.9862765550613404, + "num_tokens": 42938619.0, + "step": 2090 + }, + { + "entropy": 0.9484863519668579, + "epoch": 3.867403314917127, + "grad_norm": 0.7420705556869507, + "learning_rate": 7.210874412513218e-05, + "loss": 0.03703283965587616, + "mean_token_accuracy": 0.9857317566871643, + "num_tokens": 43143753.0, + "step": 2100 + }, + { + "entropy": 0.964326673746109, + "epoch": 3.8858195211786373, + "grad_norm": 0.8779639601707458, + "learning_rate": 7.183518866942147e-05, + "loss": 0.03739701807498932, + "mean_token_accuracy": 0.9852154791355133, + "num_tokens": 43349451.0, + "step": 2110 + }, + { + "entropy": 0.9729791641235351, + "epoch": 3.904235727440147, + "grad_norm": 0.7582741379737854, + "learning_rate": 7.156082319942929e-05, + "loss": 0.03894525766372681, + "mean_token_accuracy": 0.9847454309463501, + "num_tokens": 43554598.0, + "step": 2120 + }, + { + "entropy": 0.9860592544078827, + "epoch": 3.9226519337016574, + "grad_norm": 0.860698938369751, + "learning_rate": 7.128565789321969e-05, + "loss": 0.0365300178527832, + "mean_token_accuracy": 0.9859121859073638, + "num_tokens": 43760081.0, + "step": 2130 + }, + { + "entropy": 0.9916551172733307, + "epoch": 3.9410681399631677, + "grad_norm": 0.8363776206970215, + "learning_rate": 7.100970295852805e-05, + "loss": 0.036221379041671754, + "mean_token_accuracy": 0.9859034180641174, + "num_tokens": 43965432.0, + "step": 2140 + }, + { + "entropy": 0.9553558886051178, + "epoch": 3.9594843462246776, + "grad_norm": 0.9627474546432495, + "learning_rate": 7.073296863238242e-05, + "loss": 0.03684481382369995, + "mean_token_accuracy": 0.9857315957546234, + "num_tokens": 44171232.0, + "step": 2150 + }, + { + "entropy": 0.9538035809993743, + "epoch": 3.977900552486188, + "grad_norm": 0.8399474620819092, + "learning_rate": 7.045546518072366e-05, + "loss": 0.03825397789478302, + "mean_token_accuracy": 0.9846831560134888, + "num_tokens": 44376723.0, + "step": 2160 + }, + { + "entropy": 0.9476235210895538, + "epoch": 3.9963167587476978, + "grad_norm": 0.708739697933197, + "learning_rate": 7.017720289802472e-05, + "loss": 0.03618018329143524, + "mean_token_accuracy": 0.9861325800418854, + "num_tokens": 44582407.0, + "step": 2170 + }, + { + "epoch": 4.0, + "eval_entropy": 0.9569619194321011, + "eval_loss": 0.059838198125362396, + "eval_mean_token_accuracy": 0.9777795366618944, + "eval_num_tokens": 44623647.0, + "eval_runtime": 10.0379, + "eval_samples_per_second": 364.42, + "eval_steps_per_second": 11.457, + "step": 2172 + }, + { + "entropy": 0.9558675646781921, + "epoch": 4.014732965009208, + "grad_norm": 0.7347508668899536, + "learning_rate": 6.989819210690872e-05, + "loss": 0.02886659502983093, + "mean_token_accuracy": 0.9892994821071625, + "num_tokens": 44788219.0, + "step": 2180 + }, + { + "entropy": 1.0037677466869355, + "epoch": 4.033149171270718, + "grad_norm": 0.7403206825256348, + "learning_rate": 6.961844315776596e-05, + "loss": 0.02395295798778534, + "mean_token_accuracy": 0.9906026899814606, + "num_tokens": 44993505.0, + "step": 2190 + }, + { + "entropy": 1.0068290829658508, + "epoch": 4.051565377532229, + "grad_norm": 0.7979726195335388, + "learning_rate": 6.933796642837003e-05, + "loss": 0.02605988085269928, + "mean_token_accuracy": 0.9899706900119781, + "num_tokens": 45199193.0, + "step": 2200 + }, + { + "entropy": 0.9942211747169495, + "epoch": 4.069981583793738, + "grad_norm": 0.6460402011871338, + "learning_rate": 6.905677232349278e-05, + "loss": 0.025350230932235717, + "mean_token_accuracy": 0.9899386286735534, + "num_tokens": 45404030.0, + "step": 2210 + }, + { + "entropy": 0.9783595442771912, + "epoch": 4.088397790055248, + "grad_norm": 0.8177055716514587, + "learning_rate": 6.877487127451834e-05, + "loss": 0.02696993052959442, + "mean_token_accuracy": 0.9896106541156768, + "num_tokens": 45609763.0, + "step": 2220 + }, + { + "entropy": 0.9801763832569123, + "epoch": 4.106813996316759, + "grad_norm": 0.6608165502548218, + "learning_rate": 6.849227373905618e-05, + "loss": 0.025101393461227417, + "mean_token_accuracy": 0.9904372334480286, + "num_tokens": 45814941.0, + "step": 2230 + }, + { + "entropy": 0.9695689737796783, + "epoch": 4.125230202578269, + "grad_norm": 0.8036547899246216, + "learning_rate": 6.820899020055314e-05, + "loss": 0.027827343344688414, + "mean_token_accuracy": 0.9890337705612182, + "num_tokens": 46020535.0, + "step": 2240 + }, + { + "entropy": 0.9828635334968567, + "epoch": 4.143646408839779, + "grad_norm": 0.7729921936988831, + "learning_rate": 6.792503116790455e-05, + "loss": 0.02779492735862732, + "mean_token_accuracy": 0.9894372522830963, + "num_tokens": 46226013.0, + "step": 2250 + }, + { + "entropy": 0.9978842556476593, + "epoch": 4.162062615101289, + "grad_norm": 0.7334664463996887, + "learning_rate": 6.764040717506432e-05, + "loss": 0.025673511624336242, + "mean_token_accuracy": 0.9899355113506317, + "num_tokens": 46432087.0, + "step": 2260 + }, + { + "entropy": 1.0116403937339782, + "epoch": 4.180478821362799, + "grad_norm": 0.6769368052482605, + "learning_rate": 6.735512878065427e-05, + "loss": 0.024705511331558228, + "mean_token_accuracy": 0.9906128525733948, + "num_tokens": 46637478.0, + "step": 2270 + }, + { + "entropy": 0.9985016226768494, + "epoch": 4.198895027624309, + "grad_norm": 0.8301573991775513, + "learning_rate": 6.706920656757234e-05, + "loss": 0.02455987185239792, + "mean_token_accuracy": 0.9905728340148926, + "num_tokens": 46842562.0, + "step": 2280 + }, + { + "entropy": 0.9909430682659149, + "epoch": 4.21731123388582, + "grad_norm": 0.656026303768158, + "learning_rate": 6.67826511426001e-05, + "loss": 0.022711564600467683, + "mean_token_accuracy": 0.9910893619060517, + "num_tokens": 47048071.0, + "step": 2290 + }, + { + "entropy": 0.9868666052818298, + "epoch": 4.23572744014733, + "grad_norm": 0.7614991068840027, + "learning_rate": 6.649547313600916e-05, + "loss": 0.02453812211751938, + "mean_token_accuracy": 0.9908901154994965, + "num_tokens": 47253507.0, + "step": 2300 + }, + { + "entropy": 0.9870487153530121, + "epoch": 4.25414364640884, + "grad_norm": 0.7617276906967163, + "learning_rate": 6.62076832011669e-05, + "loss": 0.025818097591400146, + "mean_token_accuracy": 0.990347957611084, + "num_tokens": 47458747.0, + "step": 2310 + }, + { + "entropy": 0.9691080570220947, + "epoch": 4.27255985267035, + "grad_norm": 0.6743029952049255, + "learning_rate": 6.591929201414124e-05, + "loss": 0.02456912100315094, + "mean_token_accuracy": 0.9905289709568024, + "num_tokens": 47663643.0, + "step": 2320 + }, + { + "entropy": 0.9701108932495117, + "epoch": 4.29097605893186, + "grad_norm": 0.6964483261108398, + "learning_rate": 6.56303102733046e-05, + "loss": 0.02575681209564209, + "mean_token_accuracy": 0.9898503363132477, + "num_tokens": 47868982.0, + "step": 2330 + }, + { + "entropy": 0.969528192281723, + "epoch": 4.30939226519337, + "grad_norm": 0.7521987557411194, + "learning_rate": 6.5340748698937e-05, + "loss": 0.02678089737892151, + "mean_token_accuracy": 0.9898572087287902, + "num_tokens": 48074314.0, + "step": 2340 + }, + { + "entropy": 0.9921871721744537, + "epoch": 4.327808471454881, + "grad_norm": 0.6944513320922852, + "learning_rate": 6.505061803282844e-05, + "loss": 0.025553321838378905, + "mean_token_accuracy": 0.9907529592514038, + "num_tokens": 48279731.0, + "step": 2350 + }, + { + "entropy": 0.9768964886665344, + "epoch": 4.346224677716391, + "grad_norm": 0.6553092002868652, + "learning_rate": 6.47599290378803e-05, + "loss": 0.0250235915184021, + "mean_token_accuracy": 0.9904054701328278, + "num_tokens": 48485401.0, + "step": 2360 + }, + { + "entropy": 0.9612838506698609, + "epoch": 4.3646408839779, + "grad_norm": 0.916820228099823, + "learning_rate": 6.446869249770619e-05, + "loss": 0.028156182169914244, + "mean_token_accuracy": 0.9888657331466675, + "num_tokens": 48691047.0, + "step": 2370 + }, + { + "entropy": 0.9665832936763763, + "epoch": 4.383057090239411, + "grad_norm": 0.9197776913642883, + "learning_rate": 6.417691921623185e-05, + "loss": 0.025303921103477477, + "mean_token_accuracy": 0.989986252784729, + "num_tokens": 48896234.0, + "step": 2380 + }, + { + "entropy": 0.9686589121818543, + "epoch": 4.401473296500921, + "grad_norm": 0.8505764603614807, + "learning_rate": 6.388462001729434e-05, + "loss": 0.024816396832466125, + "mean_token_accuracy": 0.9909265041351318, + "num_tokens": 49101893.0, + "step": 2390 + }, + { + "entropy": 0.9625210344791413, + "epoch": 4.419889502762431, + "grad_norm": 1.0601766109466553, + "learning_rate": 6.359180574424062e-05, + "loss": 0.02706078290939331, + "mean_token_accuracy": 0.9895522117614746, + "num_tokens": 49307467.0, + "step": 2400 + }, + { + "entropy": 0.9679551541805267, + "epoch": 4.4383057090239415, + "grad_norm": 0.776253879070282, + "learning_rate": 6.329848725952514e-05, + "loss": 0.02693203091621399, + "mean_token_accuracy": 0.9893981635570526, + "num_tokens": 49513020.0, + "step": 2410 + }, + { + "entropy": 0.9704959928989411, + "epoch": 4.456721915285451, + "grad_norm": 0.5459668636322021, + "learning_rate": 6.3004675444307e-05, + "loss": 0.0279473751783371, + "mean_token_accuracy": 0.9894329369068146, + "num_tokens": 49718405.0, + "step": 2420 + }, + { + "entropy": 0.961863350868225, + "epoch": 4.475138121546961, + "grad_norm": 0.9338833093643188, + "learning_rate": 6.27103811980462e-05, + "loss": 0.026478803157806395, + "mean_token_accuracy": 0.9902269721031189, + "num_tokens": 49923375.0, + "step": 2430 + }, + { + "entropy": 0.9708506822586059, + "epoch": 4.4935543278084715, + "grad_norm": 0.9073707461357117, + "learning_rate": 6.241561543809947e-05, + "loss": 0.025289520621299744, + "mean_token_accuracy": 0.9904769957065582, + "num_tokens": 50128901.0, + "step": 2440 + }, + { + "entropy": 0.984996622800827, + "epoch": 4.511970534069982, + "grad_norm": 0.8674206733703613, + "learning_rate": 6.212038909931503e-05, + "loss": 0.026442551612854005, + "mean_token_accuracy": 0.9905101835727692, + "num_tokens": 50334449.0, + "step": 2450 + }, + { + "entropy": 0.9926377475261688, + "epoch": 4.530386740331492, + "grad_norm": 0.7571811079978943, + "learning_rate": 6.182471313362717e-05, + "loss": 0.026819539070129395, + "mean_token_accuracy": 0.9898989200592041, + "num_tokens": 50539597.0, + "step": 2460 + }, + { + "entropy": 0.9450563549995422, + "epoch": 4.5488029465930016, + "grad_norm": 0.6651087403297424, + "learning_rate": 6.15285985096498e-05, + "loss": 0.02665227949619293, + "mean_token_accuracy": 0.9897156655788422, + "num_tokens": 50744926.0, + "step": 2470 + }, + { + "entropy": 0.9715635657310486, + "epoch": 4.567219152854512, + "grad_norm": 0.7445545196533203, + "learning_rate": 6.12320562122697e-05, + "loss": 0.026212453842163086, + "mean_token_accuracy": 0.9904700636863708, + "num_tokens": 50950152.0, + "step": 2480 + }, + { + "entropy": 0.9613442063331604, + "epoch": 4.585635359116022, + "grad_norm": 0.7168459296226501, + "learning_rate": 6.0935097242238837e-05, + "loss": 0.02508128583431244, + "mean_token_accuracy": 0.9901923894882202, + "num_tokens": 51155430.0, + "step": 2490 + }, + { + "entropy": 0.9571944534778595, + "epoch": 4.6040515653775325, + "grad_norm": 0.7590732574462891, + "learning_rate": 6.063773261576646e-05, + "loss": 0.025445500016212465, + "mean_token_accuracy": 0.9902949810028077, + "num_tokens": 51360826.0, + "step": 2500 + }, + { + "entropy": 0.947079461812973, + "epoch": 4.622467771639043, + "grad_norm": 0.6942175030708313, + "learning_rate": 6.033997336411035e-05, + "loss": 0.026132801175117494, + "mean_token_accuracy": 0.9900939345359803, + "num_tokens": 51566095.0, + "step": 2510 + }, + { + "entropy": 0.970003741979599, + "epoch": 4.640883977900552, + "grad_norm": 0.6562672257423401, + "learning_rate": 6.00418305331675e-05, + "loss": 0.024759869277477264, + "mean_token_accuracy": 0.9905019223690033, + "num_tokens": 51771177.0, + "step": 2520 + }, + { + "entropy": 0.9715348601341247, + "epoch": 4.6593001841620625, + "grad_norm": 0.6151639819145203, + "learning_rate": 5.9743315183064564e-05, + "loss": 0.024138522148132325, + "mean_token_accuracy": 0.9910101473331452, + "num_tokens": 51976349.0, + "step": 2530 + }, + { + "entropy": 0.9552160143852234, + "epoch": 4.677716390423573, + "grad_norm": 0.968815267086029, + "learning_rate": 5.9444438387747336e-05, + "loss": 0.027274739742279053, + "mean_token_accuracy": 0.9896075248718261, + "num_tokens": 52181820.0, + "step": 2540 + }, + { + "entropy": 0.9265012145042419, + "epoch": 4.696132596685083, + "grad_norm": 0.8966720700263977, + "learning_rate": 5.914521123457015e-05, + "loss": 0.0291823148727417, + "mean_token_accuracy": 0.9886700630187988, + "num_tokens": 52387511.0, + "step": 2550 + }, + { + "entropy": 0.9156096875667572, + "epoch": 4.714548802946593, + "grad_norm": 0.7747519612312317, + "learning_rate": 5.88456448238844e-05, + "loss": 0.02809179127216339, + "mean_token_accuracy": 0.9891100466251374, + "num_tokens": 52592737.0, + "step": 2560 + }, + { + "entropy": 0.924511456489563, + "epoch": 4.732965009208103, + "grad_norm": 1.0087049007415771, + "learning_rate": 5.8545750268626844e-05, + "loss": 0.02683232128620148, + "mean_token_accuracy": 0.9896528899669648, + "num_tokens": 52798814.0, + "step": 2570 + }, + { + "entropy": 0.9662951111793519, + "epoch": 4.751381215469613, + "grad_norm": 0.7709590792655945, + "learning_rate": 5.824553869390734e-05, + "loss": 0.02503817081451416, + "mean_token_accuracy": 0.9900161385536194, + "num_tokens": 53004478.0, + "step": 2580 + }, + { + "entropy": 0.9889141619205475, + "epoch": 4.769797421731123, + "grad_norm": 0.815858006477356, + "learning_rate": 5.794502123659613e-05, + "loss": 0.026327347755432128, + "mean_token_accuracy": 0.9900785744190216, + "num_tokens": 53209888.0, + "step": 2590 + }, + { + "entropy": 0.9785685896873474, + "epoch": 4.788213627992634, + "grad_norm": 0.6514431238174438, + "learning_rate": 5.7644209044910735e-05, + "loss": 0.025033789873123168, + "mean_token_accuracy": 0.9902650475502014, + "num_tokens": 53415533.0, + "step": 2600 + }, + { + "entropy": 0.9723869919776916, + "epoch": 4.806629834254144, + "grad_norm": 0.8778963685035706, + "learning_rate": 5.7343113278002284e-05, + "loss": 0.02379843294620514, + "mean_token_accuracy": 0.9909472465515137, + "num_tokens": 53620850.0, + "step": 2610 + }, + { + "entropy": 0.9572711050510406, + "epoch": 4.8250460405156534, + "grad_norm": 0.8927134871482849, + "learning_rate": 5.70417451055417e-05, + "loss": 0.024856947362422943, + "mean_token_accuracy": 0.9904125213623047, + "num_tokens": 53826259.0, + "step": 2620 + }, + { + "entropy": 0.9523135125637054, + "epoch": 4.843462246777164, + "grad_norm": 0.6832691431045532, + "learning_rate": 5.674011570730523e-05, + "loss": 0.025352203845977785, + "mean_token_accuracy": 0.990432596206665, + "num_tokens": 54031531.0, + "step": 2630 + }, + { + "entropy": 0.9735220730304718, + "epoch": 4.861878453038674, + "grad_norm": 0.6399164795875549, + "learning_rate": 5.643823627275972e-05, + "loss": 0.026541513204574586, + "mean_token_accuracy": 0.9900369107723236, + "num_tokens": 54237155.0, + "step": 2640 + }, + { + "entropy": 0.9566517114639282, + "epoch": 4.880294659300184, + "grad_norm": 0.8725414276123047, + "learning_rate": 5.6136118000647616e-05, + "loss": 0.02675778865814209, + "mean_token_accuracy": 0.9894899427890778, + "num_tokens": 54442739.0, + "step": 2650 + }, + { + "entropy": 0.9447909593582153, + "epoch": 4.898710865561695, + "grad_norm": 0.8169302344322205, + "learning_rate": 5.583377209857138e-05, + "loss": 0.02642086148262024, + "mean_token_accuracy": 0.989885401725769, + "num_tokens": 54648098.0, + "step": 2660 + }, + { + "entropy": 0.9180052697658538, + "epoch": 4.917127071823204, + "grad_norm": 0.7768753170967102, + "learning_rate": 5.553120978257787e-05, + "loss": 0.02552323341369629, + "mean_token_accuracy": 0.9899512350559234, + "num_tokens": 54854281.0, + "step": 2670 + }, + { + "entropy": 0.917166668176651, + "epoch": 4.935543278084714, + "grad_norm": 0.8241410851478577, + "learning_rate": 5.5228442276742153e-05, + "loss": 0.02788199484348297, + "mean_token_accuracy": 0.989625746011734, + "num_tokens": 55059495.0, + "step": 2680 + }, + { + "entropy": 0.9345465302467346, + "epoch": 4.953959484346225, + "grad_norm": 0.7645496129989624, + "learning_rate": 5.4925480812751166e-05, + "loss": 0.02517639398574829, + "mean_token_accuracy": 0.9902283847332001, + "num_tokens": 55265381.0, + "step": 2690 + }, + { + "entropy": 0.9386432528495788, + "epoch": 4.972375690607735, + "grad_norm": 0.8371859192848206, + "learning_rate": 5.46223366294871e-05, + "loss": 0.025585666298866272, + "mean_token_accuracy": 0.9903791427612305, + "num_tokens": 55471210.0, + "step": 2700 + }, + { + "entropy": 0.9267561137676239, + "epoch": 4.990791896869245, + "grad_norm": 0.6789297461509705, + "learning_rate": 5.43190209726104e-05, + "loss": 0.024646708369255067, + "mean_token_accuracy": 0.9904700815677643, + "num_tokens": 55676877.0, + "step": 2710 + }, + { + "epoch": 5.0, + "eval_entropy": 0.9283919717954553, + "eval_loss": 0.06225527077913284, + "eval_mean_token_accuracy": 0.9784110421719758, + "eval_num_tokens": 55779559.0, + "eval_runtime": 10.0613, + "eval_samples_per_second": 363.573, + "eval_steps_per_second": 11.43, + "step": 2715 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6591019550449336e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2715/training_args.bin b/checkpoint-2715/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-2715/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/checkpoint-3258/README.md b/checkpoint-3258/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-3258/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3258/adapter_config.json b/checkpoint-3258/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-3258/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3258/adapter_model.safetensors b/checkpoint-3258/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f27b00c4d6100a9fc71d7adbdda5f5139ea7b293 --- /dev/null +++ b/checkpoint-3258/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342d90add5af5dfae9087ea9a560f86a7ebd48116022794da4d371303756af39 +size 80792096 diff --git a/checkpoint-3258/chat_template.jinja b/checkpoint-3258/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-3258/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-3258/tokenizer.json b/checkpoint-3258/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-3258/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-3258/tokenizer_config.json b/checkpoint-3258/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-3258/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-3258/trainer_state.json b/checkpoint-3258/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e6fc06294443d3496db256d8e7637f81727d006b --- /dev/null +++ b/checkpoint-3258/trainer_state.json @@ -0,0 +1,3350 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 3258, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + }, + { + "entropy": 0.9047818422317505, + "epoch": 1.0128913443830572, + "grad_norm": 0.6781501173973083, + "learning_rate": 9.929008521021325e-05, + "loss": 0.0863916516304016, + "mean_token_accuracy": 0.9673655688762665, + "num_tokens": 11299715.0, + "step": 550 + }, + { + "entropy": 0.8856981039047241, + "epoch": 1.0313075506445673, + "grad_norm": 0.7143136858940125, + "learning_rate": 9.923803553055937e-05, + "loss": 0.08632323145866394, + "mean_token_accuracy": 0.9677783191204071, + "num_tokens": 11505059.0, + "step": 560 + }, + { + "entropy": 0.8937099635601043, + "epoch": 1.0497237569060773, + "grad_norm": 0.7751694321632385, + "learning_rate": 9.918415928038325e-05, + "loss": 0.08178263902664185, + "mean_token_accuracy": 0.9694291114807129, + "num_tokens": 11710464.0, + "step": 570 + }, + { + "entropy": 0.8858704209327698, + "epoch": 1.0681399631675874, + "grad_norm": 0.7492292523384094, + "learning_rate": 9.912845845831805e-05, + "loss": 0.08074211478233337, + "mean_token_accuracy": 0.9692470014095307, + "num_tokens": 11915959.0, + "step": 580 + }, + { + "entropy": 0.8948039829730987, + "epoch": 1.0865561694290977, + "grad_norm": 0.8116479516029358, + "learning_rate": 9.907093513068259e-05, + "loss": 0.08712012171745301, + "mean_token_accuracy": 0.9669980227947235, + "num_tokens": 12121499.0, + "step": 590 + }, + { + "entropy": 0.8846789538860321, + "epoch": 1.1049723756906078, + "grad_norm": 0.7295626997947693, + "learning_rate": 9.901159143140471e-05, + "loss": 0.08444435596466064, + "mean_token_accuracy": 0.9674544095993042, + "num_tokens": 12327061.0, + "step": 600 + }, + { + "entropy": 0.8734103918075562, + "epoch": 1.1233885819521179, + "grad_norm": 0.9585768580436707, + "learning_rate": 9.89504295619421e-05, + "loss": 0.08022565841674804, + "mean_token_accuracy": 0.969569206237793, + "num_tokens": 12532305.0, + "step": 610 + }, + { + "entropy": 0.8640486001968384, + "epoch": 1.141804788213628, + "grad_norm": 0.7891159057617188, + "learning_rate": 9.88874517912006e-05, + "loss": 0.08415375947952271, + "mean_token_accuracy": 0.9678892493247986, + "num_tokens": 12737828.0, + "step": 620 + }, + { + "entropy": 0.8599755525588989, + "epoch": 1.160220994475138, + "grad_norm": 0.5801345109939575, + "learning_rate": 9.882266045545012e-05, + "loss": 0.08100489974021911, + "mean_token_accuracy": 0.9688023269176483, + "num_tokens": 12943343.0, + "step": 630 + }, + { + "entropy": 0.86524977684021, + "epoch": 1.1786372007366483, + "grad_norm": 0.7633041143417358, + "learning_rate": 9.87560579582379e-05, + "loss": 0.07859406471252442, + "mean_token_accuracy": 0.9702189445495606, + "num_tokens": 13148473.0, + "step": 640 + }, + { + "entropy": 0.8466695249080658, + "epoch": 1.1970534069981584, + "grad_norm": 0.8672215938568115, + "learning_rate": 9.868764677029934e-05, + "loss": 0.08082623481750488, + "mean_token_accuracy": 0.9689972400665283, + "num_tokens": 13353890.0, + "step": 650 + }, + { + "entropy": 0.8596941530704498, + "epoch": 1.2154696132596685, + "grad_norm": 0.7524124383926392, + "learning_rate": 9.861742942946639e-05, + "loss": 0.0789935290813446, + "mean_token_accuracy": 0.9693858206272126, + "num_tokens": 13559475.0, + "step": 660 + }, + { + "entropy": 0.8708749234676361, + "epoch": 1.2338858195211786, + "grad_norm": 0.5777031183242798, + "learning_rate": 9.854540854057337e-05, + "loss": 0.07773642539978028, + "mean_token_accuracy": 0.970385092496872, + "num_tokens": 13765076.0, + "step": 670 + }, + { + "entropy": 0.8651713371276856, + "epoch": 1.2523020257826887, + "grad_norm": 0.7924166321754456, + "learning_rate": 9.847158677536034e-05, + "loss": 0.0766686737537384, + "mean_token_accuracy": 0.9702267110347748, + "num_tokens": 13970642.0, + "step": 680 + }, + { + "entropy": 0.8763024985790253, + "epoch": 1.270718232044199, + "grad_norm": 0.741219162940979, + "learning_rate": 9.839596687237403e-05, + "loss": 0.07189929485321045, + "mean_token_accuracy": 0.9727097094058991, + "num_tokens": 14176556.0, + "step": 690 + }, + { + "entropy": 0.8556921362876893, + "epoch": 1.289134438305709, + "grad_norm": 0.6298198103904724, + "learning_rate": 9.831855163686618e-05, + "loss": 0.07608137726783752, + "mean_token_accuracy": 0.9716399371623993, + "num_tokens": 14381686.0, + "step": 700 + }, + { + "entropy": 0.869178420305252, + "epoch": 1.3075506445672191, + "grad_norm": 0.5850273370742798, + "learning_rate": 9.823934394068952e-05, + "loss": 0.07437651753425598, + "mean_token_accuracy": 0.9709566533565521, + "num_tokens": 14586814.0, + "step": 710 + }, + { + "entropy": 0.8708595156669616, + "epoch": 1.3259668508287292, + "grad_norm": 0.6580632328987122, + "learning_rate": 9.815834672219127e-05, + "loss": 0.07518917322158813, + "mean_token_accuracy": 0.9717426657676697, + "num_tokens": 14792321.0, + "step": 720 + }, + { + "entropy": 0.8826817810535431, + "epoch": 1.3443830570902393, + "grad_norm": 0.8788532018661499, + "learning_rate": 9.807556298610404e-05, + "loss": 0.07579240798950196, + "mean_token_accuracy": 0.9706341981887817, + "num_tokens": 14997810.0, + "step": 730 + }, + { + "entropy": 0.9012470185756684, + "epoch": 1.3627992633517496, + "grad_norm": 0.7022138237953186, + "learning_rate": 9.799099580343441e-05, + "loss": 0.0775588572025299, + "mean_token_accuracy": 0.9699241399765015, + "num_tokens": 15203795.0, + "step": 740 + }, + { + "entropy": 0.886955714225769, + "epoch": 1.3812154696132597, + "grad_norm": 0.7881133556365967, + "learning_rate": 9.790464831134903e-05, + "loss": 0.07125020027160645, + "mean_token_accuracy": 0.9723815560340882, + "num_tokens": 15408974.0, + "step": 750 + }, + { + "entropy": 0.9047374844551086, + "epoch": 1.3996316758747698, + "grad_norm": 0.9082005023956299, + "learning_rate": 9.781652371305824e-05, + "loss": 0.07004334926605224, + "mean_token_accuracy": 0.9725580036640167, + "num_tokens": 15614399.0, + "step": 760 + }, + { + "entropy": 0.9039053857326508, + "epoch": 1.4180478821362799, + "grad_norm": 0.8060817122459412, + "learning_rate": 9.77266252776972e-05, + "loss": 0.07103485465049744, + "mean_token_accuracy": 0.9721468150615692, + "num_tokens": 15819895.0, + "step": 770 + }, + { + "entropy": 0.8998047232627868, + "epoch": 1.43646408839779, + "grad_norm": 1.0152642726898193, + "learning_rate": 9.763495634020467e-05, + "loss": 0.07411704063415528, + "mean_token_accuracy": 0.9711063146591187, + "num_tokens": 16025297.0, + "step": 780 + }, + { + "entropy": 0.9120213568210602, + "epoch": 1.4548802946593002, + "grad_norm": 0.6288319826126099, + "learning_rate": 9.754152030119921e-05, + "loss": 0.07223712205886841, + "mean_token_accuracy": 0.9722476422786712, + "num_tokens": 16230656.0, + "step": 790 + }, + { + "entropy": 0.9142370820045471, + "epoch": 1.4732965009208103, + "grad_norm": 0.7854700088500977, + "learning_rate": 9.744632062685311e-05, + "loss": 0.07186744809150696, + "mean_token_accuracy": 0.972247713804245, + "num_tokens": 16435943.0, + "step": 800 + }, + { + "entropy": 0.8920814216136932, + "epoch": 1.4917127071823204, + "grad_norm": 0.6227074265480042, + "learning_rate": 9.734936084876383e-05, + "loss": 0.07016961574554444, + "mean_token_accuracy": 0.9725603640079499, + "num_tokens": 16641635.0, + "step": 810 + }, + { + "entropy": 0.891328877210617, + "epoch": 1.5101289134438307, + "grad_norm": 0.7601346969604492, + "learning_rate": 9.725064456382283e-05, + "loss": 0.07137494087219239, + "mean_token_accuracy": 0.9722997546195984, + "num_tokens": 16847194.0, + "step": 820 + }, + { + "entropy": 0.8921217978000641, + "epoch": 1.5285451197053406, + "grad_norm": 0.7813850045204163, + "learning_rate": 9.715017543408233e-05, + "loss": 0.06890199184417725, + "mean_token_accuracy": 0.9735044002532959, + "num_tokens": 17052807.0, + "step": 830 + }, + { + "entropy": 0.9085914671421051, + "epoch": 1.5469613259668509, + "grad_norm": 0.6184289455413818, + "learning_rate": 9.704795718661939e-05, + "loss": 0.07043765187263488, + "mean_token_accuracy": 0.9725716531276702, + "num_tokens": 17258284.0, + "step": 840 + }, + { + "entropy": 0.9029861629009247, + "epoch": 1.565377532228361, + "grad_norm": 0.7082377076148987, + "learning_rate": 9.694399361339752e-05, + "loss": 0.07113839387893676, + "mean_token_accuracy": 0.9725669205188752, + "num_tokens": 17464326.0, + "step": 850 + }, + { + "entropy": 0.8856533527374267, + "epoch": 1.583793738489871, + "grad_norm": 0.7409216165542603, + "learning_rate": 9.683828857112627e-05, + "loss": 0.07077333331108093, + "mean_token_accuracy": 0.9731084644794464, + "num_tokens": 17669537.0, + "step": 860 + }, + { + "entropy": 0.8613030433654785, + "epoch": 1.6022099447513813, + "grad_norm": 0.6801561713218689, + "learning_rate": 9.673084598111789e-05, + "loss": 0.06885308027267456, + "mean_token_accuracy": 0.97266526222229, + "num_tokens": 17875289.0, + "step": 870 + }, + { + "entropy": 0.8692965865135193, + "epoch": 1.6206261510128912, + "grad_norm": 1.1621277332305908, + "learning_rate": 9.662166982914203e-05, + "loss": 0.07017780542373657, + "mean_token_accuracy": 0.9733059942722321, + "num_tokens": 18080404.0, + "step": 880 + }, + { + "entropy": 0.8671502113342285, + "epoch": 1.6390423572744015, + "grad_norm": 0.7518903613090515, + "learning_rate": 9.651076416527787e-05, + "loss": 0.06977018713951111, + "mean_token_accuracy": 0.9730017304420471, + "num_tokens": 18285699.0, + "step": 890 + }, + { + "entropy": 0.8662045657634735, + "epoch": 1.6574585635359116, + "grad_norm": 0.6622698903083801, + "learning_rate": 9.639813310376378e-05, + "loss": 0.06620995998382569, + "mean_token_accuracy": 0.9737491130828857, + "num_tokens": 18491097.0, + "step": 900 + }, + { + "entropy": 0.8548173069953918, + "epoch": 1.6758747697974217, + "grad_norm": 0.8941843509674072, + "learning_rate": 9.628378082284479e-05, + "loss": 0.06711119413375854, + "mean_token_accuracy": 0.9740589797496796, + "num_tokens": 18696827.0, + "step": 910 + }, + { + "entropy": 0.8763562262058258, + "epoch": 1.694290976058932, + "grad_norm": 0.7571700215339661, + "learning_rate": 9.616771156461755e-05, + "loss": 0.07263468503952027, + "mean_token_accuracy": 0.9717419981956482, + "num_tokens": 18902513.0, + "step": 920 + }, + { + "entropy": 0.8663733780384064, + "epoch": 1.7127071823204418, + "grad_norm": 0.7886489629745483, + "learning_rate": 9.604992963487298e-05, + "loss": 0.07074605226516724, + "mean_token_accuracy": 0.9724965393543243, + "num_tokens": 19107812.0, + "step": 930 + }, + { + "entropy": 0.8673004627227783, + "epoch": 1.7311233885819521, + "grad_norm": 0.8180726170539856, + "learning_rate": 9.593043940293647e-05, + "loss": 0.06831735372543335, + "mean_token_accuracy": 0.9733696818351746, + "num_tokens": 19313330.0, + "step": 940 + }, + { + "entropy": 0.8525971233844757, + "epoch": 1.7495395948434622, + "grad_norm": 0.6576228737831116, + "learning_rate": 9.580924530150595e-05, + "loss": 0.06567002534866333, + "mean_token_accuracy": 0.9745754361152649, + "num_tokens": 19518671.0, + "step": 950 + }, + { + "entropy": 0.8605451703071594, + "epoch": 1.7679558011049723, + "grad_norm": 0.7171661257743835, + "learning_rate": 9.568635182648725e-05, + "loss": 0.06872050762176514, + "mean_token_accuracy": 0.9732091546058654, + "num_tokens": 19724135.0, + "step": 960 + }, + { + "entropy": 0.8642210960388184, + "epoch": 1.7863720073664826, + "grad_norm": 0.7603147029876709, + "learning_rate": 9.556176353682746e-05, + "loss": 0.06766576766967773, + "mean_token_accuracy": 0.9728681743144989, + "num_tokens": 19928785.0, + "step": 970 + }, + { + "entropy": 0.8543185651302337, + "epoch": 1.8047882136279927, + "grad_norm": 0.7280875444412231, + "learning_rate": 9.543548505434581e-05, + "loss": 0.06851862668991089, + "mean_token_accuracy": 0.9737437188625335, + "num_tokens": 20134195.0, + "step": 980 + }, + { + "entropy": 0.8744745373725891, + "epoch": 1.8232044198895028, + "grad_norm": 0.5897248983383179, + "learning_rate": 9.530752106356209e-05, + "loss": 0.06809053421020508, + "mean_token_accuracy": 0.9733593761920929, + "num_tokens": 20339517.0, + "step": 990 + }, + { + "entropy": 0.8623859465122223, + "epoch": 1.8416206261510129, + "grad_norm": 0.7515265345573425, + "learning_rate": 9.517787631152298e-05, + "loss": 0.07257847785949707, + "mean_token_accuracy": 0.9714054942131043, + "num_tokens": 20545249.0, + "step": 1000 + }, + { + "entropy": 0.8669404804706573, + "epoch": 1.860036832412523, + "grad_norm": 0.7144560813903809, + "learning_rate": 9.504655560762596e-05, + "loss": 0.06832354068756104, + "mean_token_accuracy": 0.9735779523849487, + "num_tokens": 20750507.0, + "step": 1010 + }, + { + "entropy": 0.8493516445159912, + "epoch": 1.8784530386740332, + "grad_norm": 0.6559189558029175, + "learning_rate": 9.491356382344081e-05, + "loss": 0.0629766047000885, + "mean_token_accuracy": 0.9754977762699127, + "num_tokens": 20955956.0, + "step": 1020 + }, + { + "entropy": 0.8599376022815705, + "epoch": 1.8968692449355433, + "grad_norm": 0.6792973279953003, + "learning_rate": 9.477890589252895e-05, + "loss": 0.0666757881641388, + "mean_token_accuracy": 0.974083811044693, + "num_tokens": 21161163.0, + "step": 1030 + }, + { + "entropy": 0.8458438158035279, + "epoch": 1.9152854511970534, + "grad_norm": 0.6941778659820557, + "learning_rate": 9.464258681026042e-05, + "loss": 0.06307152509689332, + "mean_token_accuracy": 0.9757042229175568, + "num_tokens": 21366525.0, + "step": 1040 + }, + { + "entropy": 0.848515909910202, + "epoch": 1.9337016574585635, + "grad_norm": 0.7307806611061096, + "learning_rate": 9.450461163362855e-05, + "loss": 0.06307026147842407, + "mean_token_accuracy": 0.9750974595546722, + "num_tokens": 21572238.0, + "step": 1050 + }, + { + "entropy": 0.8563454031944275, + "epoch": 1.9521178637200736, + "grad_norm": 0.7222106456756592, + "learning_rate": 9.436498548106236e-05, + "loss": 0.0647726058959961, + "mean_token_accuracy": 0.974629694223404, + "num_tokens": 21777633.0, + "step": 1060 + }, + { + "entropy": 0.8656457483768463, + "epoch": 1.9705340699815839, + "grad_norm": 0.67178875207901, + "learning_rate": 9.422371353223674e-05, + "loss": 0.06573554277420043, + "mean_token_accuracy": 0.9745908617973328, + "num_tokens": 21983116.0, + "step": 1070 + }, + { + "entropy": 0.8630891263484954, + "epoch": 1.988950276243094, + "grad_norm": 0.6956593990325928, + "learning_rate": 9.408080102788016e-05, + "loss": 0.06630704402923585, + "mean_token_accuracy": 0.9741333484649658, + "num_tokens": 22188662.0, + "step": 1080 + }, + { + "epoch": 2.0, + "eval_entropy": 0.8560857042022373, + "eval_loss": 0.06494329869747162, + "eval_mean_token_accuracy": 0.9745692672936813, + "eval_num_tokens": 22311800.0, + "eval_runtime": 10.129, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 11.354, + "step": 1086 + }, + { + "entropy": 0.8616272270679474, + "epoch": 2.007366482504604, + "grad_norm": 0.7778105139732361, + "learning_rate": 9.393625326958041e-05, + "loss": 0.054407155513763426, + "mean_token_accuracy": 0.9792074799537659, + "num_tokens": 22394215.0, + "step": 1090 + }, + { + "entropy": 0.8496910452842712, + "epoch": 2.0257826887661143, + "grad_norm": 0.7422528266906738, + "learning_rate": 9.379007561958792e-05, + "loss": 0.051881587505340575, + "mean_token_accuracy": 0.9799090325832367, + "num_tokens": 22599599.0, + "step": 1100 + }, + { + "entropy": 0.8531602442264556, + "epoch": 2.044198895027624, + "grad_norm": 0.9075332880020142, + "learning_rate": 9.36422735006167e-05, + "loss": 0.05190724730491638, + "mean_token_accuracy": 0.979931116104126, + "num_tokens": 22805318.0, + "step": 1110 + }, + { + "entropy": 0.8657277703285218, + "epoch": 2.0626151012891345, + "grad_norm": 0.9466913938522339, + "learning_rate": 9.349285239564325e-05, + "loss": 0.053853434324264524, + "mean_token_accuracy": 0.9796103596687317, + "num_tokens": 23010438.0, + "step": 1120 + }, + { + "entropy": 0.8578485429286957, + "epoch": 2.0810313075506444, + "grad_norm": 0.6903054714202881, + "learning_rate": 9.334181784770326e-05, + "loss": 0.05228850841522217, + "mean_token_accuracy": 0.9802409887313843, + "num_tokens": 23215795.0, + "step": 1130 + }, + { + "entropy": 0.8450767934322357, + "epoch": 2.0994475138121547, + "grad_norm": 0.6615211367607117, + "learning_rate": 9.318917545968581e-05, + "loss": 0.050570905208587646, + "mean_token_accuracy": 0.9802053451538086, + "num_tokens": 23421157.0, + "step": 1140 + }, + { + "entropy": 0.8325044393539429, + "epoch": 2.117863720073665, + "grad_norm": 0.760960578918457, + "learning_rate": 9.303493089412564e-05, + "loss": 0.051966112852096555, + "mean_token_accuracy": 0.9796205997467041, + "num_tokens": 23626584.0, + "step": 1150 + }, + { + "entropy": 0.8416404843330383, + "epoch": 2.136279926335175, + "grad_norm": 0.6947009563446045, + "learning_rate": 9.287908987299306e-05, + "loss": 0.05144861936569214, + "mean_token_accuracy": 0.9800034642219544, + "num_tokens": 23832137.0, + "step": 1160 + }, + { + "entropy": 0.8564540028572083, + "epoch": 2.154696132596685, + "grad_norm": 0.733252763748169, + "learning_rate": 9.272165817748164e-05, + "loss": 0.04944799542427063, + "mean_token_accuracy": 0.9808157980442047, + "num_tokens": 24038006.0, + "step": 1170 + }, + { + "entropy": 0.8575525343418121, + "epoch": 2.1731123388581954, + "grad_norm": 0.8911028504371643, + "learning_rate": 9.25626416477938e-05, + "loss": 0.05037952661514282, + "mean_token_accuracy": 0.980946284532547, + "num_tokens": 24243374.0, + "step": 1180 + }, + { + "entropy": 0.8599720418453216, + "epoch": 2.1915285451197053, + "grad_norm": 0.7713524103164673, + "learning_rate": 9.240204618292416e-05, + "loss": 0.050603735446929934, + "mean_token_accuracy": 0.980896121263504, + "num_tokens": 24448585.0, + "step": 1190 + }, + { + "entropy": 0.8566664934158326, + "epoch": 2.2099447513812156, + "grad_norm": 0.8439353704452515, + "learning_rate": 9.223987774044066e-05, + "loss": 0.054171699285507205, + "mean_token_accuracy": 0.9796543836593627, + "num_tokens": 24653863.0, + "step": 1200 + }, + { + "entropy": 0.846601277589798, + "epoch": 2.2283609576427255, + "grad_norm": 0.7025637030601501, + "learning_rate": 9.207614233626356e-05, + "loss": 0.048924127221107484, + "mean_token_accuracy": 0.9809681415557862, + "num_tokens": 24859801.0, + "step": 1210 + }, + { + "entropy": 0.8564423739910125, + "epoch": 2.2467771639042358, + "grad_norm": 0.7788274884223938, + "learning_rate": 9.191084604444233e-05, + "loss": 0.05260283350944519, + "mean_token_accuracy": 0.9793797850608825, + "num_tokens": 25065368.0, + "step": 1220 + }, + { + "entropy": 0.865056723356247, + "epoch": 2.265193370165746, + "grad_norm": 0.8728818297386169, + "learning_rate": 9.174399499693027e-05, + "loss": 0.05016371011734009, + "mean_token_accuracy": 0.9807134211063385, + "num_tokens": 25270945.0, + "step": 1230 + }, + { + "entropy": 0.8642262935638427, + "epoch": 2.283609576427256, + "grad_norm": 1.0582489967346191, + "learning_rate": 9.157559538335703e-05, + "loss": 0.05316779017448425, + "mean_token_accuracy": 0.9794209063053131, + "num_tokens": 25476575.0, + "step": 1240 + }, + { + "entropy": 0.8677761554718018, + "epoch": 2.3020257826887662, + "grad_norm": 0.760109543800354, + "learning_rate": 9.140565345079901e-05, + "loss": 0.05115479230880737, + "mean_token_accuracy": 0.9802310705184937, + "num_tokens": 25682814.0, + "step": 1250 + }, + { + "entropy": 0.8592945456504821, + "epoch": 2.320441988950276, + "grad_norm": 0.6537907123565674, + "learning_rate": 9.123417550354761e-05, + "loss": 0.050543540716171266, + "mean_token_accuracy": 0.9806945025920868, + "num_tokens": 25887575.0, + "step": 1260 + }, + { + "entropy": 0.8692500293254852, + "epoch": 2.3388581952117864, + "grad_norm": 0.7771905064582825, + "learning_rate": 9.106116790287541e-05, + "loss": 0.049718713760375975, + "mean_token_accuracy": 0.9805168390274048, + "num_tokens": 26092950.0, + "step": 1270 + }, + { + "entropy": 0.8841261565685272, + "epoch": 2.3572744014732967, + "grad_norm": 0.7791076898574829, + "learning_rate": 9.08866370668001e-05, + "loss": 0.0527400553226471, + "mean_token_accuracy": 0.9796754539012908, + "num_tokens": 26298182.0, + "step": 1280 + }, + { + "entropy": 0.8675022900104523, + "epoch": 2.3756906077348066, + "grad_norm": 0.8481605648994446, + "learning_rate": 9.07105894698464e-05, + "loss": 0.05320838689804077, + "mean_token_accuracy": 0.9792274832725525, + "num_tokens": 26503425.0, + "step": 1290 + }, + { + "entropy": 0.8704026222229004, + "epoch": 2.394106813996317, + "grad_norm": 0.8235505819320679, + "learning_rate": 9.053303164280602e-05, + "loss": 0.055045205354690555, + "mean_token_accuracy": 0.9788750648498535, + "num_tokens": 26708755.0, + "step": 1300 + }, + { + "entropy": 0.8525134027004242, + "epoch": 2.4125230202578267, + "grad_norm": 0.7611598968505859, + "learning_rate": 9.035397017249518e-05, + "loss": 0.05029621124267578, + "mean_token_accuracy": 0.9802757322788238, + "num_tokens": 26914704.0, + "step": 1310 + }, + { + "entropy": 0.8630305290222168, + "epoch": 2.430939226519337, + "grad_norm": 0.790408194065094, + "learning_rate": 9.017341170151041e-05, + "loss": 0.04856040775775909, + "mean_token_accuracy": 0.9809690833091735, + "num_tokens": 27120151.0, + "step": 1320 + }, + { + "entropy": 0.8579159140586853, + "epoch": 2.4493554327808473, + "grad_norm": 0.781972348690033, + "learning_rate": 8.999136292798207e-05, + "loss": 0.04869682788848877, + "mean_token_accuracy": 0.9816130697727203, + "num_tokens": 27325673.0, + "step": 1330 + }, + { + "entropy": 0.8634716987609863, + "epoch": 2.467771639042357, + "grad_norm": 0.8500784039497375, + "learning_rate": 8.980783060532588e-05, + "loss": 0.05050289034843445, + "mean_token_accuracy": 0.980079609155655, + "num_tokens": 27531270.0, + "step": 1340 + }, + { + "entropy": 0.8660618126392364, + "epoch": 2.4861878453038675, + "grad_norm": 0.719760537147522, + "learning_rate": 8.96228215419924e-05, + "loss": 0.04892141819000244, + "mean_token_accuracy": 0.9814020991325378, + "num_tokens": 27736542.0, + "step": 1350 + }, + { + "entropy": 0.8572284400463104, + "epoch": 2.5046040515653774, + "grad_norm": 1.0197229385375977, + "learning_rate": 8.943634260121442e-05, + "loss": 0.05104702711105347, + "mean_token_accuracy": 0.9798846662044525, + "num_tokens": 27941566.0, + "step": 1360 + }, + { + "entropy": 0.8702241241931915, + "epoch": 2.5230202578268877, + "grad_norm": 0.7136003375053406, + "learning_rate": 8.924840070075247e-05, + "loss": 0.04855787754058838, + "mean_token_accuracy": 0.9811685383319855, + "num_tokens": 28146943.0, + "step": 1370 + }, + { + "entropy": 0.874957013130188, + "epoch": 2.541436464088398, + "grad_norm": 0.8775497674942017, + "learning_rate": 8.905900281263804e-05, + "loss": 0.052434295415878296, + "mean_token_accuracy": 0.9795438170433044, + "num_tokens": 28352640.0, + "step": 1380 + }, + { + "entropy": 0.8776536166667939, + "epoch": 2.559852670349908, + "grad_norm": 0.8895741105079651, + "learning_rate": 8.8868155962915e-05, + "loss": 0.05282890796661377, + "mean_token_accuracy": 0.9790538609027862, + "num_tokens": 28558153.0, + "step": 1390 + }, + { + "entropy": 0.8738743245601654, + "epoch": 2.578268876611418, + "grad_norm": 0.788800060749054, + "learning_rate": 8.867586723137906e-05, + "loss": 0.048841872811317445, + "mean_token_accuracy": 0.9809149026870727, + "num_tokens": 28763613.0, + "step": 1400 + }, + { + "entropy": 0.8750253796577454, + "epoch": 2.596685082872928, + "grad_norm": 0.8738002777099609, + "learning_rate": 8.848214375131497e-05, + "loss": 0.048261132836341855, + "mean_token_accuracy": 0.980789190530777, + "num_tokens": 28969248.0, + "step": 1410 + }, + { + "entropy": 0.8624245524406433, + "epoch": 2.6151012891344383, + "grad_norm": 0.6404895186424255, + "learning_rate": 8.828699270923196e-05, + "loss": 0.04970468282699585, + "mean_token_accuracy": 0.9807762265205383, + "num_tokens": 29174779.0, + "step": 1420 + }, + { + "entropy": 0.8792938470840455, + "epoch": 2.6335174953959486, + "grad_norm": 0.7856965661048889, + "learning_rate": 8.80904213445972e-05, + "loss": 0.053334391117095946, + "mean_token_accuracy": 0.9790222108364105, + "num_tokens": 29380474.0, + "step": 1430 + }, + { + "entropy": 0.8831034600734711, + "epoch": 2.6519337016574585, + "grad_norm": 0.7739618420600891, + "learning_rate": 8.789243694956716e-05, + "loss": 0.04959054589271546, + "mean_token_accuracy": 0.9803965091705322, + "num_tokens": 29585985.0, + "step": 1440 + }, + { + "entropy": 0.8934672951698304, + "epoch": 2.6703499079189688, + "grad_norm": 0.6999697089195251, + "learning_rate": 8.769304686871719e-05, + "loss": 0.05165250301361084, + "mean_token_accuracy": 0.9798884153366089, + "num_tokens": 29791238.0, + "step": 1450 + }, + { + "entropy": 0.9053199410438537, + "epoch": 2.6887661141804786, + "grad_norm": 0.9199564456939697, + "learning_rate": 8.749225849876892e-05, + "loss": 0.04924143850803375, + "mean_token_accuracy": 0.9810785710811615, + "num_tokens": 29996589.0, + "step": 1460 + }, + { + "entropy": 0.888091403245926, + "epoch": 2.707182320441989, + "grad_norm": 0.7480106353759766, + "learning_rate": 8.729007928831597e-05, + "loss": 0.04948916733264923, + "mean_token_accuracy": 0.9809579730033875, + "num_tokens": 30201875.0, + "step": 1470 + }, + { + "entropy": 0.8723407983779907, + "epoch": 2.7255985267034992, + "grad_norm": 0.9506945013999939, + "learning_rate": 8.708651673754763e-05, + "loss": 0.048927539587020875, + "mean_token_accuracy": 0.980553150177002, + "num_tokens": 30407550.0, + "step": 1480 + }, + { + "entropy": 0.8737521529197693, + "epoch": 2.744014732965009, + "grad_norm": 0.8015706539154053, + "learning_rate": 8.688157839797062e-05, + "loss": 0.04963063597679138, + "mean_token_accuracy": 0.9809738755226135, + "num_tokens": 30612839.0, + "step": 1490 + }, + { + "entropy": 0.8800762951374054, + "epoch": 2.7624309392265194, + "grad_norm": 0.9429986476898193, + "learning_rate": 8.667527187212885e-05, + "loss": 0.0524174690246582, + "mean_token_accuracy": 0.9788767337799072, + "num_tokens": 30818578.0, + "step": 1500 + }, + { + "entropy": 0.8871055901050567, + "epoch": 2.7808471454880292, + "grad_norm": 0.5909196138381958, + "learning_rate": 8.646760481332157e-05, + "loss": 0.05166680812835693, + "mean_token_accuracy": 0.980216771364212, + "num_tokens": 31023829.0, + "step": 1510 + }, + { + "entropy": 0.8908755779266357, + "epoch": 2.7992633517495396, + "grad_norm": 0.9154611229896545, + "learning_rate": 8.625858492531931e-05, + "loss": 0.04951836466789246, + "mean_token_accuracy": 0.9801484227180481, + "num_tokens": 31229635.0, + "step": 1520 + }, + { + "entropy": 0.92480548620224, + "epoch": 2.81767955801105, + "grad_norm": 0.5989938378334045, + "learning_rate": 8.604821996207819e-05, + "loss": 0.04799881279468536, + "mean_token_accuracy": 0.9817522585391998, + "num_tokens": 31435456.0, + "step": 1530 + }, + { + "entropy": 0.9173881888389588, + "epoch": 2.8360957642725597, + "grad_norm": 0.899413526058197, + "learning_rate": 8.58365177274522e-05, + "loss": 0.0487445592880249, + "mean_token_accuracy": 0.9812625288963318, + "num_tokens": 31640904.0, + "step": 1540 + }, + { + "entropy": 0.9076135993003845, + "epoch": 2.85451197053407, + "grad_norm": 0.8494166135787964, + "learning_rate": 8.562348607490376e-05, + "loss": 0.05005228519439697, + "mean_token_accuracy": 0.9806681036949157, + "num_tokens": 31845807.0, + "step": 1550 + }, + { + "entropy": 0.9092245221138, + "epoch": 2.87292817679558, + "grad_norm": 0.8225123286247253, + "learning_rate": 8.540913290721234e-05, + "loss": 0.048654764890670776, + "mean_token_accuracy": 0.9805659353733063, + "num_tokens": 32051523.0, + "step": 1560 + }, + { + "entropy": 0.9062779664993286, + "epoch": 2.89134438305709, + "grad_norm": 0.7074014544487, + "learning_rate": 8.519346617618134e-05, + "loss": 0.049209845066070554, + "mean_token_accuracy": 0.9807434439659118, + "num_tokens": 32256895.0, + "step": 1570 + }, + { + "entropy": 0.9190246641635895, + "epoch": 2.9097605893186005, + "grad_norm": 0.8860642910003662, + "learning_rate": 8.497649388234304e-05, + "loss": 0.051211881637573245, + "mean_token_accuracy": 0.9802342295646668, + "num_tokens": 32462031.0, + "step": 1580 + }, + { + "entropy": 0.9088015079498291, + "epoch": 2.9281767955801103, + "grad_norm": 0.8062726855278015, + "learning_rate": 8.475822407466188e-05, + "loss": 0.053512704372406, + "mean_token_accuracy": 0.979486483335495, + "num_tokens": 32667533.0, + "step": 1590 + }, + { + "entropy": 0.9462027847766876, + "epoch": 2.9465930018416207, + "grad_norm": 0.7962909936904907, + "learning_rate": 8.453866485023579e-05, + "loss": 0.0501457154750824, + "mean_token_accuracy": 0.9803222417831421, + "num_tokens": 32872900.0, + "step": 1600 + }, + { + "entropy": 0.9671471297740937, + "epoch": 2.9650092081031305, + "grad_norm": 0.7641744017601013, + "learning_rate": 8.431782435399587e-05, + "loss": 0.04629061222076416, + "mean_token_accuracy": 0.9823175370693207, + "num_tokens": 33077850.0, + "step": 1610 + }, + { + "entropy": 0.955865204334259, + "epoch": 2.983425414364641, + "grad_norm": 0.6772348880767822, + "learning_rate": 8.409571077840426e-05, + "loss": 0.048368623852729796, + "mean_token_accuracy": 0.9808700799942016, + "num_tokens": 33283117.0, + "step": 1620 + }, + { + "epoch": 3.0, + "eval_entropy": 0.9563225186389426, + "eval_loss": 0.059064481407403946, + "eval_mean_token_accuracy": 0.9773589429648026, + "eval_num_tokens": 33467712.0, + "eval_runtime": 10.1471, + "eval_samples_per_second": 360.499, + "eval_steps_per_second": 11.333, + "step": 1629 + }, + { + "entropy": 0.9337226033210755, + "epoch": 3.001841620626151, + "grad_norm": 0.646203875541687, + "learning_rate": 8.387233236315016e-05, + "loss": 0.043352216482162476, + "mean_token_accuracy": 0.9830620110034942, + "num_tokens": 33488302.0, + "step": 1630 + }, + { + "entropy": 0.9734923839569092, + "epoch": 3.020257826887661, + "grad_norm": 0.7564226984977722, + "learning_rate": 8.364769739484416e-05, + "loss": 0.033932483196258544, + "mean_token_accuracy": 0.9872806966304779, + "num_tokens": 33693531.0, + "step": 1640 + }, + { + "entropy": 0.9669206500053406, + "epoch": 3.0386740331491713, + "grad_norm": 0.7126886248588562, + "learning_rate": 8.342181420671096e-05, + "loss": 0.03818287253379822, + "mean_token_accuracy": 0.9852082908153534, + "num_tokens": 33899305.0, + "step": 1650 + }, + { + "entropy": 0.9522916138172149, + "epoch": 3.0570902394106816, + "grad_norm": 1.0571653842926025, + "learning_rate": 8.319469117828007e-05, + "loss": 0.03456039130687714, + "mean_token_accuracy": 0.9867027878761292, + "num_tokens": 34104585.0, + "step": 1660 + }, + { + "entropy": 0.9568560004234314, + "epoch": 3.0755064456721914, + "grad_norm": 0.780940592288971, + "learning_rate": 8.296633673507505e-05, + "loss": 0.03551802039146423, + "mean_token_accuracy": 0.9867531359195709, + "num_tokens": 34309516.0, + "step": 1670 + }, + { + "entropy": 0.9590656876564025, + "epoch": 3.0939226519337018, + "grad_norm": 0.8330219388008118, + "learning_rate": 8.273675934830094e-05, + "loss": 0.03674865961074829, + "mean_token_accuracy": 0.9864118576049805, + "num_tokens": 34515170.0, + "step": 1680 + }, + { + "entropy": 0.975881814956665, + "epoch": 3.1123388581952116, + "grad_norm": 0.7010637521743774, + "learning_rate": 8.250596753453e-05, + "loss": 0.03550414443016052, + "mean_token_accuracy": 0.9864102602005005, + "num_tokens": 34720896.0, + "step": 1690 + }, + { + "entropy": 0.9599562883377075, + "epoch": 3.130755064456722, + "grad_norm": 0.6694278717041016, + "learning_rate": 8.227396985538578e-05, + "loss": 0.035564273595809937, + "mean_token_accuracy": 0.9867321848869324, + "num_tokens": 34925970.0, + "step": 1700 + }, + { + "entropy": 0.9582216143608093, + "epoch": 3.149171270718232, + "grad_norm": 0.9333199262619019, + "learning_rate": 8.204077491722546e-05, + "loss": 0.035575729608535764, + "mean_token_accuracy": 0.9862452208995819, + "num_tokens": 35131543.0, + "step": 1710 + }, + { + "entropy": 0.9579678058624268, + "epoch": 3.167587476979742, + "grad_norm": 0.9450218081474304, + "learning_rate": 8.180639137082066e-05, + "loss": 0.0385298490524292, + "mean_token_accuracy": 0.98538036942482, + "num_tokens": 35336790.0, + "step": 1720 + }, + { + "entropy": 0.9640831351280212, + "epoch": 3.1860036832412524, + "grad_norm": 0.8551534414291382, + "learning_rate": 8.157082791103649e-05, + "loss": 0.03702138364315033, + "mean_token_accuracy": 0.9852015495300293, + "num_tokens": 35542294.0, + "step": 1730 + }, + { + "entropy": 0.9867071211338043, + "epoch": 3.2044198895027622, + "grad_norm": 0.7138128876686096, + "learning_rate": 8.133409327650897e-05, + "loss": 0.035626694560050964, + "mean_token_accuracy": 0.986064875125885, + "num_tokens": 35747447.0, + "step": 1740 + }, + { + "entropy": 0.9639089345932007, + "epoch": 3.2228360957642725, + "grad_norm": 0.7131415009498596, + "learning_rate": 8.109619624932092e-05, + "loss": 0.035885071754455565, + "mean_token_accuracy": 0.986273056268692, + "num_tokens": 35952258.0, + "step": 1750 + }, + { + "entropy": 0.9516046345233917, + "epoch": 3.241252302025783, + "grad_norm": 0.6900200843811035, + "learning_rate": 8.085714565467611e-05, + "loss": 0.03535219430923462, + "mean_token_accuracy": 0.985836285352707, + "num_tokens": 36157938.0, + "step": 1760 + }, + { + "entropy": 0.9373646557331086, + "epoch": 3.2596685082872927, + "grad_norm": 0.6101690530776978, + "learning_rate": 8.061695036057191e-05, + "loss": 0.034940996766090394, + "mean_token_accuracy": 0.9863743901252746, + "num_tokens": 36363825.0, + "step": 1770 + }, + { + "entropy": 0.9444344758987426, + "epoch": 3.278084714548803, + "grad_norm": 0.7518529295921326, + "learning_rate": 8.03756192774703e-05, + "loss": 0.03404279053211212, + "mean_token_accuracy": 0.9866396844387054, + "num_tokens": 36568961.0, + "step": 1780 + }, + { + "entropy": 0.9550357758998871, + "epoch": 3.2965009208103133, + "grad_norm": 0.7687555551528931, + "learning_rate": 8.013316135796734e-05, + "loss": 0.038447052240371704, + "mean_token_accuracy": 0.985325163602829, + "num_tokens": 36774514.0, + "step": 1790 + }, + { + "entropy": 0.9477231681346894, + "epoch": 3.314917127071823, + "grad_norm": 0.7521633505821228, + "learning_rate": 7.988958559646102e-05, + "loss": 0.03746694028377533, + "mean_token_accuracy": 0.9853165090084076, + "num_tokens": 36979660.0, + "step": 1800 + }, + { + "entropy": 0.925805002450943, + "epoch": 3.3333333333333335, + "grad_norm": 0.9333297610282898, + "learning_rate": 7.964490102881768e-05, + "loss": 0.03700103759765625, + "mean_token_accuracy": 0.9850880861282348, + "num_tokens": 37185191.0, + "step": 1810 + }, + { + "entropy": 0.9225482225418091, + "epoch": 3.3517495395948433, + "grad_norm": 0.7928622961044312, + "learning_rate": 7.939911673203665e-05, + "loss": 0.03825801610946655, + "mean_token_accuracy": 0.9850241422653199, + "num_tokens": 37390749.0, + "step": 1820 + }, + { + "entropy": 0.9597147881984711, + "epoch": 3.3701657458563536, + "grad_norm": 0.7658583521842957, + "learning_rate": 7.915224182391375e-05, + "loss": 0.039855146408081056, + "mean_token_accuracy": 0.9845879554748536, + "num_tokens": 37596052.0, + "step": 1830 + }, + { + "entropy": 0.9485619068145752, + "epoch": 3.388581952117864, + "grad_norm": 0.8492130637168884, + "learning_rate": 7.890428546270278e-05, + "loss": 0.039359599351882935, + "mean_token_accuracy": 0.9847265422344208, + "num_tokens": 37802063.0, + "step": 1840 + }, + { + "entropy": 0.9670301914215088, + "epoch": 3.406998158379374, + "grad_norm": 0.7527599930763245, + "learning_rate": 7.865525684677608e-05, + "loss": 0.03752985596656799, + "mean_token_accuracy": 0.9855137526988983, + "num_tokens": 38007432.0, + "step": 1850 + }, + { + "entropy": 0.9681244969367981, + "epoch": 3.425414364640884, + "grad_norm": 0.7599612474441528, + "learning_rate": 7.840516521428303e-05, + "loss": 0.03653894364833832, + "mean_token_accuracy": 0.9858933389186859, + "num_tokens": 38212923.0, + "step": 1860 + }, + { + "entropy": 0.9706049561500549, + "epoch": 3.443830570902394, + "grad_norm": 0.7678127884864807, + "learning_rate": 7.815401984280748e-05, + "loss": 0.0366938978433609, + "mean_token_accuracy": 0.9854713797569274, + "num_tokens": 38418422.0, + "step": 1870 + }, + { + "entropy": 0.9637093842029572, + "epoch": 3.4622467771639043, + "grad_norm": 0.762824535369873, + "learning_rate": 7.790183004902359e-05, + "loss": 0.03516915142536163, + "mean_token_accuracy": 0.9866003453731537, + "num_tokens": 38624389.0, + "step": 1880 + }, + { + "entropy": 0.9373565018177032, + "epoch": 3.4806629834254146, + "grad_norm": 0.8221780061721802, + "learning_rate": 7.764860518835014e-05, + "loss": 0.04049026966094971, + "mean_token_accuracy": 0.984089481830597, + "num_tokens": 38829654.0, + "step": 1890 + }, + { + "entropy": 0.9356025457382202, + "epoch": 3.4990791896869244, + "grad_norm": 0.7583426237106323, + "learning_rate": 7.739435465460356e-05, + "loss": 0.03658481240272522, + "mean_token_accuracy": 0.9857318818569183, + "num_tokens": 39034638.0, + "step": 1900 + }, + { + "entropy": 0.9740163326263428, + "epoch": 3.5174953959484347, + "grad_norm": 0.7332878112792969, + "learning_rate": 7.713908787964937e-05, + "loss": 0.03508963882923126, + "mean_token_accuracy": 0.9863419532775879, + "num_tokens": 39240265.0, + "step": 1910 + }, + { + "entropy": 0.9528286933898926, + "epoch": 3.5359116022099446, + "grad_norm": 0.6515451669692993, + "learning_rate": 7.688281433305233e-05, + "loss": 0.036055779457092284, + "mean_token_accuracy": 0.9860979080200195, + "num_tokens": 39445546.0, + "step": 1920 + }, + { + "entropy": 0.9480705261230469, + "epoch": 3.554327808471455, + "grad_norm": 0.7725827097892761, + "learning_rate": 7.662554352172515e-05, + "loss": 0.037101513147354125, + "mean_token_accuracy": 0.985782790184021, + "num_tokens": 39651078.0, + "step": 1930 + }, + { + "entropy": 0.9655321061611175, + "epoch": 3.572744014732965, + "grad_norm": 0.7756506204605103, + "learning_rate": 7.636728498957581e-05, + "loss": 0.03721855878829956, + "mean_token_accuracy": 0.9857951939105988, + "num_tokens": 39856542.0, + "step": 1940 + }, + { + "entropy": 0.9772682309150695, + "epoch": 3.591160220994475, + "grad_norm": 0.9084987640380859, + "learning_rate": 7.610804831715355e-05, + "loss": 0.03570749163627625, + "mean_token_accuracy": 0.9863450109958649, + "num_tokens": 40061913.0, + "step": 1950 + }, + { + "entropy": 0.9579685389995575, + "epoch": 3.6095764272559854, + "grad_norm": 0.6358487606048584, + "learning_rate": 7.584784312129334e-05, + "loss": 0.038210684061050416, + "mean_token_accuracy": 0.9850837290287018, + "num_tokens": 40267398.0, + "step": 1960 + }, + { + "entropy": 0.9605201721191406, + "epoch": 3.6279926335174952, + "grad_norm": 0.6263149976730347, + "learning_rate": 7.558667905475927e-05, + "loss": 0.03509160876274109, + "mean_token_accuracy": 0.9868143379688263, + "num_tokens": 40472827.0, + "step": 1970 + }, + { + "entropy": 0.964026153087616, + "epoch": 3.6464088397790055, + "grad_norm": 0.90068119764328, + "learning_rate": 7.532456580588638e-05, + "loss": 0.036211782693862916, + "mean_token_accuracy": 0.9858468770980835, + "num_tokens": 40677935.0, + "step": 1980 + }, + { + "entropy": 0.9494135618209839, + "epoch": 3.664825046040516, + "grad_norm": 0.760134756565094, + "learning_rate": 7.50615130982213e-05, + "loss": 0.03786201477050781, + "mean_token_accuracy": 0.9852500438690186, + "num_tokens": 40883750.0, + "step": 1990 + }, + { + "entropy": 0.9527071297168732, + "epoch": 3.6832412523020257, + "grad_norm": 0.9812107682228088, + "learning_rate": 7.479753069016152e-05, + "loss": 0.03803159594535828, + "mean_token_accuracy": 0.9852405369281769, + "num_tokens": 41089115.0, + "step": 2000 + }, + { + "entropy": 0.9639330863952636, + "epoch": 3.701657458563536, + "grad_norm": 0.7164933681488037, + "learning_rate": 7.453262837459332e-05, + "loss": 0.03912568986415863, + "mean_token_accuracy": 0.9849458575248718, + "num_tokens": 41294694.0, + "step": 2010 + }, + { + "entropy": 0.9536987483501435, + "epoch": 3.720073664825046, + "grad_norm": 0.6804596185684204, + "learning_rate": 7.426681597852863e-05, + "loss": 0.036410006880760196, + "mean_token_accuracy": 0.985712206363678, + "num_tokens": 41499817.0, + "step": 2020 + }, + { + "entropy": 0.9478164672851562, + "epoch": 3.738489871086556, + "grad_norm": 0.8799397349357605, + "learning_rate": 7.400010336274037e-05, + "loss": 0.03801035583019256, + "mean_token_accuracy": 0.9850274682044983, + "num_tokens": 41704932.0, + "step": 2030 + }, + { + "entropy": 0.9383447647094727, + "epoch": 3.7569060773480665, + "grad_norm": 0.8386216163635254, + "learning_rate": 7.373250042139664e-05, + "loss": 0.0373637855052948, + "mean_token_accuracy": 0.9854822158813477, + "num_tokens": 41910804.0, + "step": 2040 + }, + { + "entropy": 0.925172996520996, + "epoch": 3.7753222836095763, + "grad_norm": 0.7599324584007263, + "learning_rate": 7.346401708169377e-05, + "loss": 0.03585260808467865, + "mean_token_accuracy": 0.9860672950744629, + "num_tokens": 42116706.0, + "step": 2050 + }, + { + "entropy": 0.9463765442371368, + "epoch": 3.7937384898710866, + "grad_norm": 0.9030149579048157, + "learning_rate": 7.319466330348797e-05, + "loss": 0.035877206921577455, + "mean_token_accuracy": 0.9863968968391419, + "num_tokens": 42322670.0, + "step": 2060 + }, + { + "entropy": 0.9942441761493683, + "epoch": 3.8121546961325965, + "grad_norm": 0.6400449275970459, + "learning_rate": 7.292444907892587e-05, + "loss": 0.037310433387756345, + "mean_token_accuracy": 0.9854151606559753, + "num_tokens": 42527752.0, + "step": 2070 + }, + { + "entropy": 0.9577703952789307, + "epoch": 3.830570902394107, + "grad_norm": 0.6193167567253113, + "learning_rate": 7.265338443207387e-05, + "loss": 0.03648848831653595, + "mean_token_accuracy": 0.9856530070304871, + "num_tokens": 42732981.0, + "step": 2080 + }, + { + "entropy": 0.9663952767848969, + "epoch": 3.848987108655617, + "grad_norm": 0.759611189365387, + "learning_rate": 7.238147941854625e-05, + "loss": 0.036112996935844424, + "mean_token_accuracy": 0.9862765550613404, + "num_tokens": 42938619.0, + "step": 2090 + }, + { + "entropy": 0.9484863519668579, + "epoch": 3.867403314917127, + "grad_norm": 0.7420705556869507, + "learning_rate": 7.210874412513218e-05, + "loss": 0.03703283965587616, + "mean_token_accuracy": 0.9857317566871643, + "num_tokens": 43143753.0, + "step": 2100 + }, + { + "entropy": 0.964326673746109, + "epoch": 3.8858195211786373, + "grad_norm": 0.8779639601707458, + "learning_rate": 7.183518866942147e-05, + "loss": 0.03739701807498932, + "mean_token_accuracy": 0.9852154791355133, + "num_tokens": 43349451.0, + "step": 2110 + }, + { + "entropy": 0.9729791641235351, + "epoch": 3.904235727440147, + "grad_norm": 0.7582741379737854, + "learning_rate": 7.156082319942929e-05, + "loss": 0.03894525766372681, + "mean_token_accuracy": 0.9847454309463501, + "num_tokens": 43554598.0, + "step": 2120 + }, + { + "entropy": 0.9860592544078827, + "epoch": 3.9226519337016574, + "grad_norm": 0.860698938369751, + "learning_rate": 7.128565789321969e-05, + "loss": 0.0365300178527832, + "mean_token_accuracy": 0.9859121859073638, + "num_tokens": 43760081.0, + "step": 2130 + }, + { + "entropy": 0.9916551172733307, + "epoch": 3.9410681399631677, + "grad_norm": 0.8363776206970215, + "learning_rate": 7.100970295852805e-05, + "loss": 0.036221379041671754, + "mean_token_accuracy": 0.9859034180641174, + "num_tokens": 43965432.0, + "step": 2140 + }, + { + "entropy": 0.9553558886051178, + "epoch": 3.9594843462246776, + "grad_norm": 0.9627474546432495, + "learning_rate": 7.073296863238242e-05, + "loss": 0.03684481382369995, + "mean_token_accuracy": 0.9857315957546234, + "num_tokens": 44171232.0, + "step": 2150 + }, + { + "entropy": 0.9538035809993743, + "epoch": 3.977900552486188, + "grad_norm": 0.8399474620819092, + "learning_rate": 7.045546518072366e-05, + "loss": 0.03825397789478302, + "mean_token_accuracy": 0.9846831560134888, + "num_tokens": 44376723.0, + "step": 2160 + }, + { + "entropy": 0.9476235210895538, + "epoch": 3.9963167587476978, + "grad_norm": 0.708739697933197, + "learning_rate": 7.017720289802472e-05, + "loss": 0.03618018329143524, + "mean_token_accuracy": 0.9861325800418854, + "num_tokens": 44582407.0, + "step": 2170 + }, + { + "epoch": 4.0, + "eval_entropy": 0.9569619194321011, + "eval_loss": 0.059838198125362396, + "eval_mean_token_accuracy": 0.9777795366618944, + "eval_num_tokens": 44623647.0, + "eval_runtime": 10.0379, + "eval_samples_per_second": 364.42, + "eval_steps_per_second": 11.457, + "step": 2172 + }, + { + "entropy": 0.9558675646781921, + "epoch": 4.014732965009208, + "grad_norm": 0.7347508668899536, + "learning_rate": 6.989819210690872e-05, + "loss": 0.02886659502983093, + "mean_token_accuracy": 0.9892994821071625, + "num_tokens": 44788219.0, + "step": 2180 + }, + { + "entropy": 1.0037677466869355, + "epoch": 4.033149171270718, + "grad_norm": 0.7403206825256348, + "learning_rate": 6.961844315776596e-05, + "loss": 0.02395295798778534, + "mean_token_accuracy": 0.9906026899814606, + "num_tokens": 44993505.0, + "step": 2190 + }, + { + "entropy": 1.0068290829658508, + "epoch": 4.051565377532229, + "grad_norm": 0.7979726195335388, + "learning_rate": 6.933796642837003e-05, + "loss": 0.02605988085269928, + "mean_token_accuracy": 0.9899706900119781, + "num_tokens": 45199193.0, + "step": 2200 + }, + { + "entropy": 0.9942211747169495, + "epoch": 4.069981583793738, + "grad_norm": 0.6460402011871338, + "learning_rate": 6.905677232349278e-05, + "loss": 0.025350230932235717, + "mean_token_accuracy": 0.9899386286735534, + "num_tokens": 45404030.0, + "step": 2210 + }, + { + "entropy": 0.9783595442771912, + "epoch": 4.088397790055248, + "grad_norm": 0.8177055716514587, + "learning_rate": 6.877487127451834e-05, + "loss": 0.02696993052959442, + "mean_token_accuracy": 0.9896106541156768, + "num_tokens": 45609763.0, + "step": 2220 + }, + { + "entropy": 0.9801763832569123, + "epoch": 4.106813996316759, + "grad_norm": 0.6608165502548218, + "learning_rate": 6.849227373905618e-05, + "loss": 0.025101393461227417, + "mean_token_accuracy": 0.9904372334480286, + "num_tokens": 45814941.0, + "step": 2230 + }, + { + "entropy": 0.9695689737796783, + "epoch": 4.125230202578269, + "grad_norm": 0.8036547899246216, + "learning_rate": 6.820899020055314e-05, + "loss": 0.027827343344688414, + "mean_token_accuracy": 0.9890337705612182, + "num_tokens": 46020535.0, + "step": 2240 + }, + { + "entropy": 0.9828635334968567, + "epoch": 4.143646408839779, + "grad_norm": 0.7729921936988831, + "learning_rate": 6.792503116790455e-05, + "loss": 0.02779492735862732, + "mean_token_accuracy": 0.9894372522830963, + "num_tokens": 46226013.0, + "step": 2250 + }, + { + "entropy": 0.9978842556476593, + "epoch": 4.162062615101289, + "grad_norm": 0.7334664463996887, + "learning_rate": 6.764040717506432e-05, + "loss": 0.025673511624336242, + "mean_token_accuracy": 0.9899355113506317, + "num_tokens": 46432087.0, + "step": 2260 + }, + { + "entropy": 1.0116403937339782, + "epoch": 4.180478821362799, + "grad_norm": 0.6769368052482605, + "learning_rate": 6.735512878065427e-05, + "loss": 0.024705511331558228, + "mean_token_accuracy": 0.9906128525733948, + "num_tokens": 46637478.0, + "step": 2270 + }, + { + "entropy": 0.9985016226768494, + "epoch": 4.198895027624309, + "grad_norm": 0.8301573991775513, + "learning_rate": 6.706920656757234e-05, + "loss": 0.02455987185239792, + "mean_token_accuracy": 0.9905728340148926, + "num_tokens": 46842562.0, + "step": 2280 + }, + { + "entropy": 0.9909430682659149, + "epoch": 4.21731123388582, + "grad_norm": 0.656026303768158, + "learning_rate": 6.67826511426001e-05, + "loss": 0.022711564600467683, + "mean_token_accuracy": 0.9910893619060517, + "num_tokens": 47048071.0, + "step": 2290 + }, + { + "entropy": 0.9868666052818298, + "epoch": 4.23572744014733, + "grad_norm": 0.7614991068840027, + "learning_rate": 6.649547313600916e-05, + "loss": 0.02453812211751938, + "mean_token_accuracy": 0.9908901154994965, + "num_tokens": 47253507.0, + "step": 2300 + }, + { + "entropy": 0.9870487153530121, + "epoch": 4.25414364640884, + "grad_norm": 0.7617276906967163, + "learning_rate": 6.62076832011669e-05, + "loss": 0.025818097591400146, + "mean_token_accuracy": 0.990347957611084, + "num_tokens": 47458747.0, + "step": 2310 + }, + { + "entropy": 0.9691080570220947, + "epoch": 4.27255985267035, + "grad_norm": 0.6743029952049255, + "learning_rate": 6.591929201414124e-05, + "loss": 0.02456912100315094, + "mean_token_accuracy": 0.9905289709568024, + "num_tokens": 47663643.0, + "step": 2320 + }, + { + "entropy": 0.9701108932495117, + "epoch": 4.29097605893186, + "grad_norm": 0.6964483261108398, + "learning_rate": 6.56303102733046e-05, + "loss": 0.02575681209564209, + "mean_token_accuracy": 0.9898503363132477, + "num_tokens": 47868982.0, + "step": 2330 + }, + { + "entropy": 0.969528192281723, + "epoch": 4.30939226519337, + "grad_norm": 0.7521987557411194, + "learning_rate": 6.5340748698937e-05, + "loss": 0.02678089737892151, + "mean_token_accuracy": 0.9898572087287902, + "num_tokens": 48074314.0, + "step": 2340 + }, + { + "entropy": 0.9921871721744537, + "epoch": 4.327808471454881, + "grad_norm": 0.6944513320922852, + "learning_rate": 6.505061803282844e-05, + "loss": 0.025553321838378905, + "mean_token_accuracy": 0.9907529592514038, + "num_tokens": 48279731.0, + "step": 2350 + }, + { + "entropy": 0.9768964886665344, + "epoch": 4.346224677716391, + "grad_norm": 0.6553092002868652, + "learning_rate": 6.47599290378803e-05, + "loss": 0.0250235915184021, + "mean_token_accuracy": 0.9904054701328278, + "num_tokens": 48485401.0, + "step": 2360 + }, + { + "entropy": 0.9612838506698609, + "epoch": 4.3646408839779, + "grad_norm": 0.916820228099823, + "learning_rate": 6.446869249770619e-05, + "loss": 0.028156182169914244, + "mean_token_accuracy": 0.9888657331466675, + "num_tokens": 48691047.0, + "step": 2370 + }, + { + "entropy": 0.9665832936763763, + "epoch": 4.383057090239411, + "grad_norm": 0.9197776913642883, + "learning_rate": 6.417691921623185e-05, + "loss": 0.025303921103477477, + "mean_token_accuracy": 0.989986252784729, + "num_tokens": 48896234.0, + "step": 2380 + }, + { + "entropy": 0.9686589121818543, + "epoch": 4.401473296500921, + "grad_norm": 0.8505764603614807, + "learning_rate": 6.388462001729434e-05, + "loss": 0.024816396832466125, + "mean_token_accuracy": 0.9909265041351318, + "num_tokens": 49101893.0, + "step": 2390 + }, + { + "entropy": 0.9625210344791413, + "epoch": 4.419889502762431, + "grad_norm": 1.0601766109466553, + "learning_rate": 6.359180574424062e-05, + "loss": 0.02706078290939331, + "mean_token_accuracy": 0.9895522117614746, + "num_tokens": 49307467.0, + "step": 2400 + }, + { + "entropy": 0.9679551541805267, + "epoch": 4.4383057090239415, + "grad_norm": 0.776253879070282, + "learning_rate": 6.329848725952514e-05, + "loss": 0.02693203091621399, + "mean_token_accuracy": 0.9893981635570526, + "num_tokens": 49513020.0, + "step": 2410 + }, + { + "entropy": 0.9704959928989411, + "epoch": 4.456721915285451, + "grad_norm": 0.5459668636322021, + "learning_rate": 6.3004675444307e-05, + "loss": 0.0279473751783371, + "mean_token_accuracy": 0.9894329369068146, + "num_tokens": 49718405.0, + "step": 2420 + }, + { + "entropy": 0.961863350868225, + "epoch": 4.475138121546961, + "grad_norm": 0.9338833093643188, + "learning_rate": 6.27103811980462e-05, + "loss": 0.026478803157806395, + "mean_token_accuracy": 0.9902269721031189, + "num_tokens": 49923375.0, + "step": 2430 + }, + { + "entropy": 0.9708506822586059, + "epoch": 4.4935543278084715, + "grad_norm": 0.9073707461357117, + "learning_rate": 6.241561543809947e-05, + "loss": 0.025289520621299744, + "mean_token_accuracy": 0.9904769957065582, + "num_tokens": 50128901.0, + "step": 2440 + }, + { + "entropy": 0.984996622800827, + "epoch": 4.511970534069982, + "grad_norm": 0.8674206733703613, + "learning_rate": 6.212038909931503e-05, + "loss": 0.026442551612854005, + "mean_token_accuracy": 0.9905101835727692, + "num_tokens": 50334449.0, + "step": 2450 + }, + { + "entropy": 0.9926377475261688, + "epoch": 4.530386740331492, + "grad_norm": 0.7571811079978943, + "learning_rate": 6.182471313362717e-05, + "loss": 0.026819539070129395, + "mean_token_accuracy": 0.9898989200592041, + "num_tokens": 50539597.0, + "step": 2460 + }, + { + "entropy": 0.9450563549995422, + "epoch": 4.5488029465930016, + "grad_norm": 0.6651087403297424, + "learning_rate": 6.15285985096498e-05, + "loss": 0.02665227949619293, + "mean_token_accuracy": 0.9897156655788422, + "num_tokens": 50744926.0, + "step": 2470 + }, + { + "entropy": 0.9715635657310486, + "epoch": 4.567219152854512, + "grad_norm": 0.7445545196533203, + "learning_rate": 6.12320562122697e-05, + "loss": 0.026212453842163086, + "mean_token_accuracy": 0.9904700636863708, + "num_tokens": 50950152.0, + "step": 2480 + }, + { + "entropy": 0.9613442063331604, + "epoch": 4.585635359116022, + "grad_norm": 0.7168459296226501, + "learning_rate": 6.0935097242238837e-05, + "loss": 0.02508128583431244, + "mean_token_accuracy": 0.9901923894882202, + "num_tokens": 51155430.0, + "step": 2490 + }, + { + "entropy": 0.9571944534778595, + "epoch": 4.6040515653775325, + "grad_norm": 0.7590732574462891, + "learning_rate": 6.063773261576646e-05, + "loss": 0.025445500016212465, + "mean_token_accuracy": 0.9902949810028077, + "num_tokens": 51360826.0, + "step": 2500 + }, + { + "entropy": 0.947079461812973, + "epoch": 4.622467771639043, + "grad_norm": 0.6942175030708313, + "learning_rate": 6.033997336411035e-05, + "loss": 0.026132801175117494, + "mean_token_accuracy": 0.9900939345359803, + "num_tokens": 51566095.0, + "step": 2510 + }, + { + "entropy": 0.970003741979599, + "epoch": 4.640883977900552, + "grad_norm": 0.6562672257423401, + "learning_rate": 6.00418305331675e-05, + "loss": 0.024759869277477264, + "mean_token_accuracy": 0.9905019223690033, + "num_tokens": 51771177.0, + "step": 2520 + }, + { + "entropy": 0.9715348601341247, + "epoch": 4.6593001841620625, + "grad_norm": 0.6151639819145203, + "learning_rate": 5.9743315183064564e-05, + "loss": 0.024138522148132325, + "mean_token_accuracy": 0.9910101473331452, + "num_tokens": 51976349.0, + "step": 2530 + }, + { + "entropy": 0.9552160143852234, + "epoch": 4.677716390423573, + "grad_norm": 0.968815267086029, + "learning_rate": 5.9444438387747336e-05, + "loss": 0.027274739742279053, + "mean_token_accuracy": 0.9896075248718261, + "num_tokens": 52181820.0, + "step": 2540 + }, + { + "entropy": 0.9265012145042419, + "epoch": 4.696132596685083, + "grad_norm": 0.8966720700263977, + "learning_rate": 5.914521123457015e-05, + "loss": 0.0291823148727417, + "mean_token_accuracy": 0.9886700630187988, + "num_tokens": 52387511.0, + "step": 2550 + }, + { + "entropy": 0.9156096875667572, + "epoch": 4.714548802946593, + "grad_norm": 0.7747519612312317, + "learning_rate": 5.88456448238844e-05, + "loss": 0.02809179127216339, + "mean_token_accuracy": 0.9891100466251374, + "num_tokens": 52592737.0, + "step": 2560 + }, + { + "entropy": 0.924511456489563, + "epoch": 4.732965009208103, + "grad_norm": 1.0087049007415771, + "learning_rate": 5.8545750268626844e-05, + "loss": 0.02683232128620148, + "mean_token_accuracy": 0.9896528899669648, + "num_tokens": 52798814.0, + "step": 2570 + }, + { + "entropy": 0.9662951111793519, + "epoch": 4.751381215469613, + "grad_norm": 0.7709590792655945, + "learning_rate": 5.824553869390734e-05, + "loss": 0.02503817081451416, + "mean_token_accuracy": 0.9900161385536194, + "num_tokens": 53004478.0, + "step": 2580 + }, + { + "entropy": 0.9889141619205475, + "epoch": 4.769797421731123, + "grad_norm": 0.815858006477356, + "learning_rate": 5.794502123659613e-05, + "loss": 0.026327347755432128, + "mean_token_accuracy": 0.9900785744190216, + "num_tokens": 53209888.0, + "step": 2590 + }, + { + "entropy": 0.9785685896873474, + "epoch": 4.788213627992634, + "grad_norm": 0.6514431238174438, + "learning_rate": 5.7644209044910735e-05, + "loss": 0.025033789873123168, + "mean_token_accuracy": 0.9902650475502014, + "num_tokens": 53415533.0, + "step": 2600 + }, + { + "entropy": 0.9723869919776916, + "epoch": 4.806629834254144, + "grad_norm": 0.8778963685035706, + "learning_rate": 5.7343113278002284e-05, + "loss": 0.02379843294620514, + "mean_token_accuracy": 0.9909472465515137, + "num_tokens": 53620850.0, + "step": 2610 + }, + { + "entropy": 0.9572711050510406, + "epoch": 4.8250460405156534, + "grad_norm": 0.8927134871482849, + "learning_rate": 5.70417451055417e-05, + "loss": 0.024856947362422943, + "mean_token_accuracy": 0.9904125213623047, + "num_tokens": 53826259.0, + "step": 2620 + }, + { + "entropy": 0.9523135125637054, + "epoch": 4.843462246777164, + "grad_norm": 0.6832691431045532, + "learning_rate": 5.674011570730523e-05, + "loss": 0.025352203845977785, + "mean_token_accuracy": 0.990432596206665, + "num_tokens": 54031531.0, + "step": 2630 + }, + { + "entropy": 0.9735220730304718, + "epoch": 4.861878453038674, + "grad_norm": 0.6399164795875549, + "learning_rate": 5.643823627275972e-05, + "loss": 0.026541513204574586, + "mean_token_accuracy": 0.9900369107723236, + "num_tokens": 54237155.0, + "step": 2640 + }, + { + "entropy": 0.9566517114639282, + "epoch": 4.880294659300184, + "grad_norm": 0.8725414276123047, + "learning_rate": 5.6136118000647616e-05, + "loss": 0.02675778865814209, + "mean_token_accuracy": 0.9894899427890778, + "num_tokens": 54442739.0, + "step": 2650 + }, + { + "entropy": 0.9447909593582153, + "epoch": 4.898710865561695, + "grad_norm": 0.8169302344322205, + "learning_rate": 5.583377209857138e-05, + "loss": 0.02642086148262024, + "mean_token_accuracy": 0.989885401725769, + "num_tokens": 54648098.0, + "step": 2660 + }, + { + "entropy": 0.9180052697658538, + "epoch": 4.917127071823204, + "grad_norm": 0.7768753170967102, + "learning_rate": 5.553120978257787e-05, + "loss": 0.02552323341369629, + "mean_token_accuracy": 0.9899512350559234, + "num_tokens": 54854281.0, + "step": 2670 + }, + { + "entropy": 0.917166668176651, + "epoch": 4.935543278084714, + "grad_norm": 0.8241410851478577, + "learning_rate": 5.5228442276742153e-05, + "loss": 0.02788199484348297, + "mean_token_accuracy": 0.989625746011734, + "num_tokens": 55059495.0, + "step": 2680 + }, + { + "entropy": 0.9345465302467346, + "epoch": 4.953959484346225, + "grad_norm": 0.7645496129989624, + "learning_rate": 5.4925480812751166e-05, + "loss": 0.02517639398574829, + "mean_token_accuracy": 0.9902283847332001, + "num_tokens": 55265381.0, + "step": 2690 + }, + { + "entropy": 0.9386432528495788, + "epoch": 4.972375690607735, + "grad_norm": 0.8371859192848206, + "learning_rate": 5.46223366294871e-05, + "loss": 0.025585666298866272, + "mean_token_accuracy": 0.9903791427612305, + "num_tokens": 55471210.0, + "step": 2700 + }, + { + "entropy": 0.9267561137676239, + "epoch": 4.990791896869245, + "grad_norm": 0.6789297461509705, + "learning_rate": 5.43190209726104e-05, + "loss": 0.024646708369255067, + "mean_token_accuracy": 0.9904700815677643, + "num_tokens": 55676877.0, + "step": 2710 + }, + { + "epoch": 5.0, + "eval_entropy": 0.9283919717954553, + "eval_loss": 0.06225527077913284, + "eval_mean_token_accuracy": 0.9784110421719758, + "eval_num_tokens": 55779559.0, + "eval_runtime": 10.0613, + "eval_samples_per_second": 363.573, + "eval_steps_per_second": 11.43, + "step": 2715 + }, + { + "entropy": 0.9269404351711273, + "epoch": 5.009208103130755, + "grad_norm": 0.540570080280304, + "learning_rate": 5.401554509414264e-05, + "loss": 0.019513805210590363, + "mean_token_accuracy": 0.9927033007144928, + "num_tokens": 55882241.0, + "step": 2720 + }, + { + "entropy": 0.9377441763877868, + "epoch": 5.027624309392265, + "grad_norm": 0.5840998888015747, + "learning_rate": 5.3711920252049085e-05, + "loss": 0.015180909633636474, + "mean_token_accuracy": 0.9944471418857574, + "num_tokens": 56087470.0, + "step": 2730 + }, + { + "entropy": 0.949122017621994, + "epoch": 5.046040515653775, + "grad_norm": 0.6938672065734863, + "learning_rate": 5.340815770982106e-05, + "loss": 0.0153742715716362, + "mean_token_accuracy": 0.9941534519195556, + "num_tokens": 56292226.0, + "step": 2740 + }, + { + "entropy": 0.9394402146339417, + "epoch": 5.064456721915286, + "grad_norm": 0.8259939551353455, + "learning_rate": 5.310426873605814e-05, + "loss": 0.014350908994674682, + "mean_token_accuracy": 0.9945570707321167, + "num_tokens": 56497839.0, + "step": 2750 + }, + { + "entropy": 0.9323545396327972, + "epoch": 5.082872928176796, + "grad_norm": 0.9675024747848511, + "learning_rate": 5.280026460405005e-05, + "loss": 0.016550135612487794, + "mean_token_accuracy": 0.9938908398151398, + "num_tokens": 56702932.0, + "step": 2760 + }, + { + "entropy": 0.89125554561615, + "epoch": 5.101289134438305, + "grad_norm": 0.8347184658050537, + "learning_rate": 5.2496156591358566e-05, + "loss": 0.017917826771736145, + "mean_token_accuracy": 0.9934309899806977, + "num_tokens": 56908644.0, + "step": 2770 + }, + { + "entropy": 0.8773505449295044, + "epoch": 5.119705340699816, + "grad_norm": 0.8869524598121643, + "learning_rate": 5.219195597939908e-05, + "loss": 0.017221055924892426, + "mean_token_accuracy": 0.993448656797409, + "num_tokens": 57114171.0, + "step": 2780 + }, + { + "entropy": 0.8874686002731323, + "epoch": 5.138121546961326, + "grad_norm": 1.0294251441955566, + "learning_rate": 5.1887674053022084e-05, + "loss": 0.018111808598041533, + "mean_token_accuracy": 0.9931293666362763, + "num_tokens": 57319158.0, + "step": 2790 + }, + { + "entropy": 0.8893351197242737, + "epoch": 5.156537753222836, + "grad_norm": 0.6253597736358643, + "learning_rate": 5.15833221000946e-05, + "loss": 0.017256538569927215, + "mean_token_accuracy": 0.9936724424362182, + "num_tokens": 57524901.0, + "step": 2800 + }, + { + "entropy": 0.9157109141349793, + "epoch": 5.1749539594843466, + "grad_norm": 0.6379142999649048, + "learning_rate": 5.12789114110814e-05, + "loss": 0.016415870189666747, + "mean_token_accuracy": 0.9939744889736175, + "num_tokens": 57730135.0, + "step": 2810 + }, + { + "entropy": 0.9157932877540589, + "epoch": 5.193370165745856, + "grad_norm": 0.7195688486099243, + "learning_rate": 5.097445327862619e-05, + "loss": 0.01577536463737488, + "mean_token_accuracy": 0.9941773355007172, + "num_tokens": 57936210.0, + "step": 2820 + }, + { + "entropy": 0.9179767727851867, + "epoch": 5.211786372007366, + "grad_norm": 0.7149335741996765, + "learning_rate": 5.066995899713264e-05, + "loss": 0.01606254279613495, + "mean_token_accuracy": 0.9937664806842804, + "num_tokens": 58141736.0, + "step": 2830 + }, + { + "entropy": 0.895512479543686, + "epoch": 5.230202578268877, + "grad_norm": 0.6460169553756714, + "learning_rate": 5.036543986234543e-05, + "loss": 0.01605578660964966, + "mean_token_accuracy": 0.994063013792038, + "num_tokens": 58347178.0, + "step": 2840 + }, + { + "entropy": 0.8883109211921691, + "epoch": 5.248618784530387, + "grad_norm": 0.72477787733078, + "learning_rate": 5.006090717093128e-05, + "loss": 0.016773784160614015, + "mean_token_accuracy": 0.9940340936183929, + "num_tokens": 58552952.0, + "step": 2850 + }, + { + "entropy": 0.8942575633525849, + "epoch": 5.267034990791897, + "grad_norm": 0.7344926595687866, + "learning_rate": 4.9756372220059736e-05, + "loss": 0.01604126989841461, + "mean_token_accuracy": 0.994256991147995, + "num_tokens": 58758449.0, + "step": 2860 + }, + { + "entropy": 0.8854099690914154, + "epoch": 5.285451197053407, + "grad_norm": 0.6142122149467468, + "learning_rate": 4.9451846306984214e-05, + "loss": 0.016244474053382873, + "mean_token_accuracy": 0.9938375532627106, + "num_tokens": 58963691.0, + "step": 2870 + }, + { + "entropy": 0.8745675146579742, + "epoch": 5.303867403314917, + "grad_norm": 0.8025366067886353, + "learning_rate": 4.9147340728622816e-05, + "loss": 0.01611460596323013, + "mean_token_accuracy": 0.9941173672676087, + "num_tokens": 59169484.0, + "step": 2880 + }, + { + "entropy": 0.8812389194965362, + "epoch": 5.322283609576427, + "grad_norm": 0.7699193358421326, + "learning_rate": 4.884286678113935e-05, + "loss": 0.016995206475257874, + "mean_token_accuracy": 0.9937523245811463, + "num_tokens": 59374627.0, + "step": 2890 + }, + { + "entropy": 0.8924362242221833, + "epoch": 5.3406998158379375, + "grad_norm": 0.7516226172447205, + "learning_rate": 4.853843575952414e-05, + "loss": 0.01652217358350754, + "mean_token_accuracy": 0.9936819314956665, + "num_tokens": 59580135.0, + "step": 2900 + }, + { + "entropy": 0.8972602427005768, + "epoch": 5.359116022099448, + "grad_norm": 0.7781681418418884, + "learning_rate": 4.823405895717521e-05, + "loss": 0.017360319197177888, + "mean_token_accuracy": 0.9935634732246399, + "num_tokens": 59785392.0, + "step": 2910 + }, + { + "entropy": 0.900998342037201, + "epoch": 5.377532228360957, + "grad_norm": 0.6837047934532166, + "learning_rate": 4.792974766547911e-05, + "loss": 0.017162233591079712, + "mean_token_accuracy": 0.993264091014862, + "num_tokens": 59991448.0, + "step": 2920 + }, + { + "entropy": 0.9239763855934143, + "epoch": 5.3959484346224675, + "grad_norm": 0.7188259363174438, + "learning_rate": 4.762551317339226e-05, + "loss": 0.01718595027923584, + "mean_token_accuracy": 0.9933857440948486, + "num_tokens": 60197079.0, + "step": 2930 + }, + { + "entropy": 0.9056789398193359, + "epoch": 5.414364640883978, + "grad_norm": 0.6922260522842407, + "learning_rate": 4.732136676702198e-05, + "loss": 0.016596907377243043, + "mean_token_accuracy": 0.9937462329864502, + "num_tokens": 60402299.0, + "step": 2940 + }, + { + "entropy": 0.9038522362709045, + "epoch": 5.432780847145488, + "grad_norm": 0.7937009930610657, + "learning_rate": 4.7017319729207954e-05, + "loss": 0.016130413115024566, + "mean_token_accuracy": 0.9941940546035767, + "num_tokens": 60607907.0, + "step": 2950 + }, + { + "entropy": 0.8911147236824035, + "epoch": 5.4511970534069984, + "grad_norm": 0.6261171698570251, + "learning_rate": 4.671338333910359e-05, + "loss": 0.01622493863105774, + "mean_token_accuracy": 0.9937945663928985, + "num_tokens": 60813428.0, + "step": 2960 + }, + { + "entropy": 0.8894322276115417, + "epoch": 5.469613259668508, + "grad_norm": 0.6898378729820251, + "learning_rate": 4.6409568871757645e-05, + "loss": 0.016513559222221374, + "mean_token_accuracy": 0.9936174690723419, + "num_tokens": 61018404.0, + "step": 2970 + }, + { + "entropy": 0.9026601016521454, + "epoch": 5.488029465930018, + "grad_norm": 0.7027897834777832, + "learning_rate": 4.610588759769593e-05, + "loss": 0.016727012395858765, + "mean_token_accuracy": 0.9941417872905731, + "num_tokens": 61223660.0, + "step": 2980 + }, + { + "entropy": 0.8903301954269409, + "epoch": 5.5064456721915285, + "grad_norm": 0.9087063074111938, + "learning_rate": 4.5802350782503196e-05, + "loss": 0.016929233074188234, + "mean_token_accuracy": 0.9935264468193055, + "num_tokens": 61429438.0, + "step": 2990 + }, + { + "entropy": 0.8886692762374878, + "epoch": 5.524861878453039, + "grad_norm": 0.8283822536468506, + "learning_rate": 4.5498969686405266e-05, + "loss": 0.015396638214588166, + "mean_token_accuracy": 0.99433131814003, + "num_tokens": 61635274.0, + "step": 3000 + }, + { + "entropy": 0.8902086555957794, + "epoch": 5.543278084714549, + "grad_norm": 0.7676647305488586, + "learning_rate": 4.5195755563851336e-05, + "loss": 0.01673731654882431, + "mean_token_accuracy": 0.9938134133815766, + "num_tokens": 61840778.0, + "step": 3010 + }, + { + "entropy": 0.8941606819629669, + "epoch": 5.5616942909760585, + "grad_norm": 0.7026392221450806, + "learning_rate": 4.489271966309634e-05, + "loss": 0.01694796681404114, + "mean_token_accuracy": 0.9936233103275299, + "num_tokens": 62046355.0, + "step": 3020 + }, + { + "entropy": 0.90918750166893, + "epoch": 5.580110497237569, + "grad_norm": 0.7146924734115601, + "learning_rate": 4.4589873225783806e-05, + "loss": 0.01852080672979355, + "mean_token_accuracy": 0.9928994178771973, + "num_tokens": 62251709.0, + "step": 3030 + }, + { + "entropy": 0.8946544349193573, + "epoch": 5.598526703499079, + "grad_norm": 0.607246458530426, + "learning_rate": 4.428722748652881e-05, + "loss": 0.016636812686920167, + "mean_token_accuracy": 0.9939334273338318, + "num_tokens": 62456680.0, + "step": 3040 + }, + { + "entropy": 0.8854653835296631, + "epoch": 5.616942909760589, + "grad_norm": 0.7457882165908813, + "learning_rate": 4.3984793672501124e-05, + "loss": 0.016008296608924867, + "mean_token_accuracy": 0.9940589666366577, + "num_tokens": 62662038.0, + "step": 3050 + }, + { + "entropy": 0.8921085000038147, + "epoch": 5.6353591160221, + "grad_norm": 0.7707350254058838, + "learning_rate": 4.368258300300888e-05, + "loss": 0.016655120253562927, + "mean_token_accuracy": 0.993935889005661, + "num_tokens": 62867272.0, + "step": 3060 + }, + { + "entropy": 0.8768653869628906, + "epoch": 5.653775322283609, + "grad_norm": 0.6994554996490479, + "learning_rate": 4.3380606689082166e-05, + "loss": 0.015841150283813478, + "mean_token_accuracy": 0.9944550096988678, + "num_tokens": 63072403.0, + "step": 3070 + }, + { + "entropy": 0.8767679035663605, + "epoch": 5.672191528545119, + "grad_norm": 0.8327192068099976, + "learning_rate": 4.307887593305733e-05, + "loss": 0.015826576948165895, + "mean_token_accuracy": 0.9941202461719513, + "num_tokens": 63277635.0, + "step": 3080 + }, + { + "entropy": 0.8763292253017425, + "epoch": 5.69060773480663, + "grad_norm": 0.7224747538566589, + "learning_rate": 4.277740192816127e-05, + "loss": 0.015298140048980714, + "mean_token_accuracy": 0.9943080008029938, + "num_tokens": 63483196.0, + "step": 3090 + }, + { + "entropy": 0.8834661841392517, + "epoch": 5.70902394106814, + "grad_norm": 0.9508277773857117, + "learning_rate": 4.247619585809627e-05, + "loss": 0.01658404469490051, + "mean_token_accuracy": 0.9934300124645233, + "num_tokens": 63688721.0, + "step": 3100 + }, + { + "entropy": 0.899389523267746, + "epoch": 5.72744014732965, + "grad_norm": 0.7170981168746948, + "learning_rate": 4.217526889662512e-05, + "loss": 0.015803813934326172, + "mean_token_accuracy": 0.9940325975418091, + "num_tokens": 63894220.0, + "step": 3110 + }, + { + "entropy": 0.8968011736869812, + "epoch": 5.74585635359116, + "grad_norm": 0.6686251163482666, + "learning_rate": 4.187463220715659e-05, + "loss": 0.015874400734901428, + "mean_token_accuracy": 0.9940970957279205, + "num_tokens": 64099768.0, + "step": 3120 + }, + { + "entropy": 0.8900792479515076, + "epoch": 5.76427255985267, + "grad_norm": 0.5979828238487244, + "learning_rate": 4.157429694233128e-05, + "loss": 0.01613767147064209, + "mean_token_accuracy": 0.9942961037158966, + "num_tokens": 64305055.0, + "step": 3130 + }, + { + "entropy": 0.8899810135364532, + "epoch": 5.78268876611418, + "grad_norm": 0.7330048084259033, + "learning_rate": 4.127427424360794e-05, + "loss": 0.016168563067913054, + "mean_token_accuracy": 0.9941077649593353, + "num_tokens": 64510002.0, + "step": 3140 + }, + { + "entropy": 0.8805335581302642, + "epoch": 5.801104972375691, + "grad_norm": 0.5978623032569885, + "learning_rate": 4.09745752408501e-05, + "loss": 0.01524556577205658, + "mean_token_accuracy": 0.994326776266098, + "num_tokens": 64715431.0, + "step": 3150 + }, + { + "entropy": 0.878781646490097, + "epoch": 5.819521178637201, + "grad_norm": 0.6749313473701477, + "learning_rate": 4.067521105191331e-05, + "loss": 0.015209287405014038, + "mean_token_accuracy": 0.9942974805831909, + "num_tokens": 64921579.0, + "step": 3160 + }, + { + "entropy": 0.8844729900360108, + "epoch": 5.83793738489871, + "grad_norm": 0.6887196898460388, + "learning_rate": 4.037619278223255e-05, + "loss": 0.01619938760995865, + "mean_token_accuracy": 0.9937683045864105, + "num_tokens": 65127007.0, + "step": 3170 + }, + { + "entropy": 0.8780498623847961, + "epoch": 5.856353591160221, + "grad_norm": 0.6962174773216248, + "learning_rate": 4.0077531524410304e-05, + "loss": 0.015934592485427855, + "mean_token_accuracy": 0.9935103774070739, + "num_tokens": 65332418.0, + "step": 3180 + }, + { + "entropy": 0.8889612555503845, + "epoch": 5.874769797421731, + "grad_norm": 0.6049854159355164, + "learning_rate": 3.977923835780517e-05, + "loss": 0.01600206792354584, + "mean_token_accuracy": 0.9937360048294067, + "num_tokens": 65537845.0, + "step": 3190 + }, + { + "entropy": 0.8960810244083405, + "epoch": 5.893186003683241, + "grad_norm": 0.6341013312339783, + "learning_rate": 3.948132434812065e-05, + "loss": 0.0143389493227005, + "mean_token_accuracy": 0.9948007702827454, + "num_tokens": 65743412.0, + "step": 3200 + }, + { + "entropy": 0.887304550409317, + "epoch": 5.911602209944752, + "grad_norm": 0.7564852237701416, + "learning_rate": 3.9183800546994886e-05, + "loss": 0.016044440865516662, + "mean_token_accuracy": 0.9939335525035858, + "num_tokens": 65948884.0, + "step": 3210 + }, + { + "entropy": 0.8823239862918854, + "epoch": 5.930018416206261, + "grad_norm": 0.6525556445121765, + "learning_rate": 3.8886677991590435e-05, + "loss": 0.016112390160560607, + "mean_token_accuracy": 0.9938134670257568, + "num_tokens": 66153768.0, + "step": 3220 + }, + { + "entropy": 0.8712829887866974, + "epoch": 5.948434622467771, + "grad_norm": 0.676167368888855, + "learning_rate": 3.858996770418504e-05, + "loss": 0.015146306157112122, + "mean_token_accuracy": 0.9944733619689942, + "num_tokens": 66359661.0, + "step": 3230 + }, + { + "entropy": 0.8734102070331573, + "epoch": 5.966850828729282, + "grad_norm": 0.6284340023994446, + "learning_rate": 3.829368069176257e-05, + "loss": 0.017269474267959595, + "mean_token_accuracy": 0.9938443183898926, + "num_tokens": 66565228.0, + "step": 3240 + }, + { + "entropy": 0.8757335782051087, + "epoch": 5.985267034990792, + "grad_norm": 0.722522497177124, + "learning_rate": 3.799782794560484e-05, + "loss": 0.015032704174518585, + "mean_token_accuracy": 0.9942249894142151, + "num_tokens": 66770718.0, + "step": 3250 + }, + { + "epoch": 6.0, + "eval_entropy": 0.8780099873957427, + "eval_loss": 0.06740746647119522, + "eval_mean_token_accuracy": 0.9795082377350849, + "eval_num_tokens": 66935435.0, + "eval_runtime": 10.0955, + "eval_samples_per_second": 362.34, + "eval_steps_per_second": 11.391, + "step": 3258 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.190938138876838e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3258/training_args.bin b/checkpoint-3258/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-3258/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/checkpoint-3801/README.md b/checkpoint-3801/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-3801/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3801/adapter_config.json b/checkpoint-3801/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-3801/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3801/adapter_model.safetensors b/checkpoint-3801/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fad066cd11dea180adcd18bf26f23c7a0c03596c --- /dev/null +++ b/checkpoint-3801/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddf199d0ea570dc0a07cf04650028719a3b079854d9cd9c9c87302cd1e916916 +size 80792096 diff --git a/checkpoint-3801/chat_template.jinja b/checkpoint-3801/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-3801/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-3801/tokenizer.json b/checkpoint-3801/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-3801/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-3801/tokenizer_config.json b/checkpoint-3801/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-3801/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-3801/trainer_state.json b/checkpoint-3801/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a69b93042bcfdb7ab0113d02b9fabece89b6d21d --- /dev/null +++ b/checkpoint-3801/trainer_state.json @@ -0,0 +1,3911 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 3801, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + }, + { + "entropy": 0.9047818422317505, + "epoch": 1.0128913443830572, + "grad_norm": 0.6781501173973083, + "learning_rate": 9.929008521021325e-05, + "loss": 0.0863916516304016, + "mean_token_accuracy": 0.9673655688762665, + "num_tokens": 11299715.0, + "step": 550 + }, + { + "entropy": 0.8856981039047241, + "epoch": 1.0313075506445673, + "grad_norm": 0.7143136858940125, + "learning_rate": 9.923803553055937e-05, + "loss": 0.08632323145866394, + "mean_token_accuracy": 0.9677783191204071, + "num_tokens": 11505059.0, + "step": 560 + }, + { + "entropy": 0.8937099635601043, + "epoch": 1.0497237569060773, + "grad_norm": 0.7751694321632385, + "learning_rate": 9.918415928038325e-05, + "loss": 0.08178263902664185, + "mean_token_accuracy": 0.9694291114807129, + "num_tokens": 11710464.0, + "step": 570 + }, + { + "entropy": 0.8858704209327698, + "epoch": 1.0681399631675874, + "grad_norm": 0.7492292523384094, + "learning_rate": 9.912845845831805e-05, + "loss": 0.08074211478233337, + "mean_token_accuracy": 0.9692470014095307, + "num_tokens": 11915959.0, + "step": 580 + }, + { + "entropy": 0.8948039829730987, + "epoch": 1.0865561694290977, + "grad_norm": 0.8116479516029358, + "learning_rate": 9.907093513068259e-05, + "loss": 0.08712012171745301, + "mean_token_accuracy": 0.9669980227947235, + "num_tokens": 12121499.0, + "step": 590 + }, + { + "entropy": 0.8846789538860321, + "epoch": 1.1049723756906078, + "grad_norm": 0.7295626997947693, + "learning_rate": 9.901159143140471e-05, + "loss": 0.08444435596466064, + "mean_token_accuracy": 0.9674544095993042, + "num_tokens": 12327061.0, + "step": 600 + }, + { + "entropy": 0.8734103918075562, + "epoch": 1.1233885819521179, + "grad_norm": 0.9585768580436707, + "learning_rate": 9.89504295619421e-05, + "loss": 0.08022565841674804, + "mean_token_accuracy": 0.969569206237793, + "num_tokens": 12532305.0, + "step": 610 + }, + { + "entropy": 0.8640486001968384, + "epoch": 1.141804788213628, + "grad_norm": 0.7891159057617188, + "learning_rate": 9.88874517912006e-05, + "loss": 0.08415375947952271, + "mean_token_accuracy": 0.9678892493247986, + "num_tokens": 12737828.0, + "step": 620 + }, + { + "entropy": 0.8599755525588989, + "epoch": 1.160220994475138, + "grad_norm": 0.5801345109939575, + "learning_rate": 9.882266045545012e-05, + "loss": 0.08100489974021911, + "mean_token_accuracy": 0.9688023269176483, + "num_tokens": 12943343.0, + "step": 630 + }, + { + "entropy": 0.86524977684021, + "epoch": 1.1786372007366483, + "grad_norm": 0.7633041143417358, + "learning_rate": 9.87560579582379e-05, + "loss": 0.07859406471252442, + "mean_token_accuracy": 0.9702189445495606, + "num_tokens": 13148473.0, + "step": 640 + }, + { + "entropy": 0.8466695249080658, + "epoch": 1.1970534069981584, + "grad_norm": 0.8672215938568115, + "learning_rate": 9.868764677029934e-05, + "loss": 0.08082623481750488, + "mean_token_accuracy": 0.9689972400665283, + "num_tokens": 13353890.0, + "step": 650 + }, + { + "entropy": 0.8596941530704498, + "epoch": 1.2154696132596685, + "grad_norm": 0.7524124383926392, + "learning_rate": 9.861742942946639e-05, + "loss": 0.0789935290813446, + "mean_token_accuracy": 0.9693858206272126, + "num_tokens": 13559475.0, + "step": 660 + }, + { + "entropy": 0.8708749234676361, + "epoch": 1.2338858195211786, + "grad_norm": 0.5777031183242798, + "learning_rate": 9.854540854057337e-05, + "loss": 0.07773642539978028, + "mean_token_accuracy": 0.970385092496872, + "num_tokens": 13765076.0, + "step": 670 + }, + { + "entropy": 0.8651713371276856, + "epoch": 1.2523020257826887, + "grad_norm": 0.7924166321754456, + "learning_rate": 9.847158677536034e-05, + "loss": 0.0766686737537384, + "mean_token_accuracy": 0.9702267110347748, + "num_tokens": 13970642.0, + "step": 680 + }, + { + "entropy": 0.8763024985790253, + "epoch": 1.270718232044199, + "grad_norm": 0.741219162940979, + "learning_rate": 9.839596687237403e-05, + "loss": 0.07189929485321045, + "mean_token_accuracy": 0.9727097094058991, + "num_tokens": 14176556.0, + "step": 690 + }, + { + "entropy": 0.8556921362876893, + "epoch": 1.289134438305709, + "grad_norm": 0.6298198103904724, + "learning_rate": 9.831855163686618e-05, + "loss": 0.07608137726783752, + "mean_token_accuracy": 0.9716399371623993, + "num_tokens": 14381686.0, + "step": 700 + }, + { + "entropy": 0.869178420305252, + "epoch": 1.3075506445672191, + "grad_norm": 0.5850273370742798, + "learning_rate": 9.823934394068952e-05, + "loss": 0.07437651753425598, + "mean_token_accuracy": 0.9709566533565521, + "num_tokens": 14586814.0, + "step": 710 + }, + { + "entropy": 0.8708595156669616, + "epoch": 1.3259668508287292, + "grad_norm": 0.6580632328987122, + "learning_rate": 9.815834672219127e-05, + "loss": 0.07518917322158813, + "mean_token_accuracy": 0.9717426657676697, + "num_tokens": 14792321.0, + "step": 720 + }, + { + "entropy": 0.8826817810535431, + "epoch": 1.3443830570902393, + "grad_norm": 0.8788532018661499, + "learning_rate": 9.807556298610404e-05, + "loss": 0.07579240798950196, + "mean_token_accuracy": 0.9706341981887817, + "num_tokens": 14997810.0, + "step": 730 + }, + { + "entropy": 0.9012470185756684, + "epoch": 1.3627992633517496, + "grad_norm": 0.7022138237953186, + "learning_rate": 9.799099580343441e-05, + "loss": 0.0775588572025299, + "mean_token_accuracy": 0.9699241399765015, + "num_tokens": 15203795.0, + "step": 740 + }, + { + "entropy": 0.886955714225769, + "epoch": 1.3812154696132597, + "grad_norm": 0.7881133556365967, + "learning_rate": 9.790464831134903e-05, + "loss": 0.07125020027160645, + "mean_token_accuracy": 0.9723815560340882, + "num_tokens": 15408974.0, + "step": 750 + }, + { + "entropy": 0.9047374844551086, + "epoch": 1.3996316758747698, + "grad_norm": 0.9082005023956299, + "learning_rate": 9.781652371305824e-05, + "loss": 0.07004334926605224, + "mean_token_accuracy": 0.9725580036640167, + "num_tokens": 15614399.0, + "step": 760 + }, + { + "entropy": 0.9039053857326508, + "epoch": 1.4180478821362799, + "grad_norm": 0.8060817122459412, + "learning_rate": 9.77266252776972e-05, + "loss": 0.07103485465049744, + "mean_token_accuracy": 0.9721468150615692, + "num_tokens": 15819895.0, + "step": 770 + }, + { + "entropy": 0.8998047232627868, + "epoch": 1.43646408839779, + "grad_norm": 1.0152642726898193, + "learning_rate": 9.763495634020467e-05, + "loss": 0.07411704063415528, + "mean_token_accuracy": 0.9711063146591187, + "num_tokens": 16025297.0, + "step": 780 + }, + { + "entropy": 0.9120213568210602, + "epoch": 1.4548802946593002, + "grad_norm": 0.6288319826126099, + "learning_rate": 9.754152030119921e-05, + "loss": 0.07223712205886841, + "mean_token_accuracy": 0.9722476422786712, + "num_tokens": 16230656.0, + "step": 790 + }, + { + "entropy": 0.9142370820045471, + "epoch": 1.4732965009208103, + "grad_norm": 0.7854700088500977, + "learning_rate": 9.744632062685311e-05, + "loss": 0.07186744809150696, + "mean_token_accuracy": 0.972247713804245, + "num_tokens": 16435943.0, + "step": 800 + }, + { + "entropy": 0.8920814216136932, + "epoch": 1.4917127071823204, + "grad_norm": 0.6227074265480042, + "learning_rate": 9.734936084876383e-05, + "loss": 0.07016961574554444, + "mean_token_accuracy": 0.9725603640079499, + "num_tokens": 16641635.0, + "step": 810 + }, + { + "entropy": 0.891328877210617, + "epoch": 1.5101289134438307, + "grad_norm": 0.7601346969604492, + "learning_rate": 9.725064456382283e-05, + "loss": 0.07137494087219239, + "mean_token_accuracy": 0.9722997546195984, + "num_tokens": 16847194.0, + "step": 820 + }, + { + "entropy": 0.8921217978000641, + "epoch": 1.5285451197053406, + "grad_norm": 0.7813850045204163, + "learning_rate": 9.715017543408233e-05, + "loss": 0.06890199184417725, + "mean_token_accuracy": 0.9735044002532959, + "num_tokens": 17052807.0, + "step": 830 + }, + { + "entropy": 0.9085914671421051, + "epoch": 1.5469613259668509, + "grad_norm": 0.6184289455413818, + "learning_rate": 9.704795718661939e-05, + "loss": 0.07043765187263488, + "mean_token_accuracy": 0.9725716531276702, + "num_tokens": 17258284.0, + "step": 840 + }, + { + "entropy": 0.9029861629009247, + "epoch": 1.565377532228361, + "grad_norm": 0.7082377076148987, + "learning_rate": 9.694399361339752e-05, + "loss": 0.07113839387893676, + "mean_token_accuracy": 0.9725669205188752, + "num_tokens": 17464326.0, + "step": 850 + }, + { + "entropy": 0.8856533527374267, + "epoch": 1.583793738489871, + "grad_norm": 0.7409216165542603, + "learning_rate": 9.683828857112627e-05, + "loss": 0.07077333331108093, + "mean_token_accuracy": 0.9731084644794464, + "num_tokens": 17669537.0, + "step": 860 + }, + { + "entropy": 0.8613030433654785, + "epoch": 1.6022099447513813, + "grad_norm": 0.6801561713218689, + "learning_rate": 9.673084598111789e-05, + "loss": 0.06885308027267456, + "mean_token_accuracy": 0.97266526222229, + "num_tokens": 17875289.0, + "step": 870 + }, + { + "entropy": 0.8692965865135193, + "epoch": 1.6206261510128912, + "grad_norm": 1.1621277332305908, + "learning_rate": 9.662166982914203e-05, + "loss": 0.07017780542373657, + "mean_token_accuracy": 0.9733059942722321, + "num_tokens": 18080404.0, + "step": 880 + }, + { + "entropy": 0.8671502113342285, + "epoch": 1.6390423572744015, + "grad_norm": 0.7518903613090515, + "learning_rate": 9.651076416527787e-05, + "loss": 0.06977018713951111, + "mean_token_accuracy": 0.9730017304420471, + "num_tokens": 18285699.0, + "step": 890 + }, + { + "entropy": 0.8662045657634735, + "epoch": 1.6574585635359116, + "grad_norm": 0.6622698903083801, + "learning_rate": 9.639813310376378e-05, + "loss": 0.06620995998382569, + "mean_token_accuracy": 0.9737491130828857, + "num_tokens": 18491097.0, + "step": 900 + }, + { + "entropy": 0.8548173069953918, + "epoch": 1.6758747697974217, + "grad_norm": 0.8941843509674072, + "learning_rate": 9.628378082284479e-05, + "loss": 0.06711119413375854, + "mean_token_accuracy": 0.9740589797496796, + "num_tokens": 18696827.0, + "step": 910 + }, + { + "entropy": 0.8763562262058258, + "epoch": 1.694290976058932, + "grad_norm": 0.7571700215339661, + "learning_rate": 9.616771156461755e-05, + "loss": 0.07263468503952027, + "mean_token_accuracy": 0.9717419981956482, + "num_tokens": 18902513.0, + "step": 920 + }, + { + "entropy": 0.8663733780384064, + "epoch": 1.7127071823204418, + "grad_norm": 0.7886489629745483, + "learning_rate": 9.604992963487298e-05, + "loss": 0.07074605226516724, + "mean_token_accuracy": 0.9724965393543243, + "num_tokens": 19107812.0, + "step": 930 + }, + { + "entropy": 0.8673004627227783, + "epoch": 1.7311233885819521, + "grad_norm": 0.8180726170539856, + "learning_rate": 9.593043940293647e-05, + "loss": 0.06831735372543335, + "mean_token_accuracy": 0.9733696818351746, + "num_tokens": 19313330.0, + "step": 940 + }, + { + "entropy": 0.8525971233844757, + "epoch": 1.7495395948434622, + "grad_norm": 0.6576228737831116, + "learning_rate": 9.580924530150595e-05, + "loss": 0.06567002534866333, + "mean_token_accuracy": 0.9745754361152649, + "num_tokens": 19518671.0, + "step": 950 + }, + { + "entropy": 0.8605451703071594, + "epoch": 1.7679558011049723, + "grad_norm": 0.7171661257743835, + "learning_rate": 9.568635182648725e-05, + "loss": 0.06872050762176514, + "mean_token_accuracy": 0.9732091546058654, + "num_tokens": 19724135.0, + "step": 960 + }, + { + "entropy": 0.8642210960388184, + "epoch": 1.7863720073664826, + "grad_norm": 0.7603147029876709, + "learning_rate": 9.556176353682746e-05, + "loss": 0.06766576766967773, + "mean_token_accuracy": 0.9728681743144989, + "num_tokens": 19928785.0, + "step": 970 + }, + { + "entropy": 0.8543185651302337, + "epoch": 1.8047882136279927, + "grad_norm": 0.7280875444412231, + "learning_rate": 9.543548505434581e-05, + "loss": 0.06851862668991089, + "mean_token_accuracy": 0.9737437188625335, + "num_tokens": 20134195.0, + "step": 980 + }, + { + "entropy": 0.8744745373725891, + "epoch": 1.8232044198895028, + "grad_norm": 0.5897248983383179, + "learning_rate": 9.530752106356209e-05, + "loss": 0.06809053421020508, + "mean_token_accuracy": 0.9733593761920929, + "num_tokens": 20339517.0, + "step": 990 + }, + { + "entropy": 0.8623859465122223, + "epoch": 1.8416206261510129, + "grad_norm": 0.7515265345573425, + "learning_rate": 9.517787631152298e-05, + "loss": 0.07257847785949707, + "mean_token_accuracy": 0.9714054942131043, + "num_tokens": 20545249.0, + "step": 1000 + }, + { + "entropy": 0.8669404804706573, + "epoch": 1.860036832412523, + "grad_norm": 0.7144560813903809, + "learning_rate": 9.504655560762596e-05, + "loss": 0.06832354068756104, + "mean_token_accuracy": 0.9735779523849487, + "num_tokens": 20750507.0, + "step": 1010 + }, + { + "entropy": 0.8493516445159912, + "epoch": 1.8784530386740332, + "grad_norm": 0.6559189558029175, + "learning_rate": 9.491356382344081e-05, + "loss": 0.0629766047000885, + "mean_token_accuracy": 0.9754977762699127, + "num_tokens": 20955956.0, + "step": 1020 + }, + { + "entropy": 0.8599376022815705, + "epoch": 1.8968692449355433, + "grad_norm": 0.6792973279953003, + "learning_rate": 9.477890589252895e-05, + "loss": 0.0666757881641388, + "mean_token_accuracy": 0.974083811044693, + "num_tokens": 21161163.0, + "step": 1030 + }, + { + "entropy": 0.8458438158035279, + "epoch": 1.9152854511970534, + "grad_norm": 0.6941778659820557, + "learning_rate": 9.464258681026042e-05, + "loss": 0.06307152509689332, + "mean_token_accuracy": 0.9757042229175568, + "num_tokens": 21366525.0, + "step": 1040 + }, + { + "entropy": 0.848515909910202, + "epoch": 1.9337016574585635, + "grad_norm": 0.7307806611061096, + "learning_rate": 9.450461163362855e-05, + "loss": 0.06307026147842407, + "mean_token_accuracy": 0.9750974595546722, + "num_tokens": 21572238.0, + "step": 1050 + }, + { + "entropy": 0.8563454031944275, + "epoch": 1.9521178637200736, + "grad_norm": 0.7222106456756592, + "learning_rate": 9.436498548106236e-05, + "loss": 0.0647726058959961, + "mean_token_accuracy": 0.974629694223404, + "num_tokens": 21777633.0, + "step": 1060 + }, + { + "entropy": 0.8656457483768463, + "epoch": 1.9705340699815839, + "grad_norm": 0.67178875207901, + "learning_rate": 9.422371353223674e-05, + "loss": 0.06573554277420043, + "mean_token_accuracy": 0.9745908617973328, + "num_tokens": 21983116.0, + "step": 1070 + }, + { + "entropy": 0.8630891263484954, + "epoch": 1.988950276243094, + "grad_norm": 0.6956593990325928, + "learning_rate": 9.408080102788016e-05, + "loss": 0.06630704402923585, + "mean_token_accuracy": 0.9741333484649658, + "num_tokens": 22188662.0, + "step": 1080 + }, + { + "epoch": 2.0, + "eval_entropy": 0.8560857042022373, + "eval_loss": 0.06494329869747162, + "eval_mean_token_accuracy": 0.9745692672936813, + "eval_num_tokens": 22311800.0, + "eval_runtime": 10.129, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 11.354, + "step": 1086 + }, + { + "entropy": 0.8616272270679474, + "epoch": 2.007366482504604, + "grad_norm": 0.7778105139732361, + "learning_rate": 9.393625326958041e-05, + "loss": 0.054407155513763426, + "mean_token_accuracy": 0.9792074799537659, + "num_tokens": 22394215.0, + "step": 1090 + }, + { + "entropy": 0.8496910452842712, + "epoch": 2.0257826887661143, + "grad_norm": 0.7422528266906738, + "learning_rate": 9.379007561958792e-05, + "loss": 0.051881587505340575, + "mean_token_accuracy": 0.9799090325832367, + "num_tokens": 22599599.0, + "step": 1100 + }, + { + "entropy": 0.8531602442264556, + "epoch": 2.044198895027624, + "grad_norm": 0.9075332880020142, + "learning_rate": 9.36422735006167e-05, + "loss": 0.05190724730491638, + "mean_token_accuracy": 0.979931116104126, + "num_tokens": 22805318.0, + "step": 1110 + }, + { + "entropy": 0.8657277703285218, + "epoch": 2.0626151012891345, + "grad_norm": 0.9466913938522339, + "learning_rate": 9.349285239564325e-05, + "loss": 0.053853434324264524, + "mean_token_accuracy": 0.9796103596687317, + "num_tokens": 23010438.0, + "step": 1120 + }, + { + "entropy": 0.8578485429286957, + "epoch": 2.0810313075506444, + "grad_norm": 0.6903054714202881, + "learning_rate": 9.334181784770326e-05, + "loss": 0.05228850841522217, + "mean_token_accuracy": 0.9802409887313843, + "num_tokens": 23215795.0, + "step": 1130 + }, + { + "entropy": 0.8450767934322357, + "epoch": 2.0994475138121547, + "grad_norm": 0.6615211367607117, + "learning_rate": 9.318917545968581e-05, + "loss": 0.050570905208587646, + "mean_token_accuracy": 0.9802053451538086, + "num_tokens": 23421157.0, + "step": 1140 + }, + { + "entropy": 0.8325044393539429, + "epoch": 2.117863720073665, + "grad_norm": 0.760960578918457, + "learning_rate": 9.303493089412564e-05, + "loss": 0.051966112852096555, + "mean_token_accuracy": 0.9796205997467041, + "num_tokens": 23626584.0, + "step": 1150 + }, + { + "entropy": 0.8416404843330383, + "epoch": 2.136279926335175, + "grad_norm": 0.6947009563446045, + "learning_rate": 9.287908987299306e-05, + "loss": 0.05144861936569214, + "mean_token_accuracy": 0.9800034642219544, + "num_tokens": 23832137.0, + "step": 1160 + }, + { + "entropy": 0.8564540028572083, + "epoch": 2.154696132596685, + "grad_norm": 0.733252763748169, + "learning_rate": 9.272165817748164e-05, + "loss": 0.04944799542427063, + "mean_token_accuracy": 0.9808157980442047, + "num_tokens": 24038006.0, + "step": 1170 + }, + { + "entropy": 0.8575525343418121, + "epoch": 2.1731123388581954, + "grad_norm": 0.8911028504371643, + "learning_rate": 9.25626416477938e-05, + "loss": 0.05037952661514282, + "mean_token_accuracy": 0.980946284532547, + "num_tokens": 24243374.0, + "step": 1180 + }, + { + "entropy": 0.8599720418453216, + "epoch": 2.1915285451197053, + "grad_norm": 0.7713524103164673, + "learning_rate": 9.240204618292416e-05, + "loss": 0.050603735446929934, + "mean_token_accuracy": 0.980896121263504, + "num_tokens": 24448585.0, + "step": 1190 + }, + { + "entropy": 0.8566664934158326, + "epoch": 2.2099447513812156, + "grad_norm": 0.8439353704452515, + "learning_rate": 9.223987774044066e-05, + "loss": 0.054171699285507205, + "mean_token_accuracy": 0.9796543836593627, + "num_tokens": 24653863.0, + "step": 1200 + }, + { + "entropy": 0.846601277589798, + "epoch": 2.2283609576427255, + "grad_norm": 0.7025637030601501, + "learning_rate": 9.207614233626356e-05, + "loss": 0.048924127221107484, + "mean_token_accuracy": 0.9809681415557862, + "num_tokens": 24859801.0, + "step": 1210 + }, + { + "entropy": 0.8564423739910125, + "epoch": 2.2467771639042358, + "grad_norm": 0.7788274884223938, + "learning_rate": 9.191084604444233e-05, + "loss": 0.05260283350944519, + "mean_token_accuracy": 0.9793797850608825, + "num_tokens": 25065368.0, + "step": 1220 + }, + { + "entropy": 0.865056723356247, + "epoch": 2.265193370165746, + "grad_norm": 0.8728818297386169, + "learning_rate": 9.174399499693027e-05, + "loss": 0.05016371011734009, + "mean_token_accuracy": 0.9807134211063385, + "num_tokens": 25270945.0, + "step": 1230 + }, + { + "entropy": 0.8642262935638427, + "epoch": 2.283609576427256, + "grad_norm": 1.0582489967346191, + "learning_rate": 9.157559538335703e-05, + "loss": 0.05316779017448425, + "mean_token_accuracy": 0.9794209063053131, + "num_tokens": 25476575.0, + "step": 1240 + }, + { + "entropy": 0.8677761554718018, + "epoch": 2.3020257826887662, + "grad_norm": 0.760109543800354, + "learning_rate": 9.140565345079901e-05, + "loss": 0.05115479230880737, + "mean_token_accuracy": 0.9802310705184937, + "num_tokens": 25682814.0, + "step": 1250 + }, + { + "entropy": 0.8592945456504821, + "epoch": 2.320441988950276, + "grad_norm": 0.6537907123565674, + "learning_rate": 9.123417550354761e-05, + "loss": 0.050543540716171266, + "mean_token_accuracy": 0.9806945025920868, + "num_tokens": 25887575.0, + "step": 1260 + }, + { + "entropy": 0.8692500293254852, + "epoch": 2.3388581952117864, + "grad_norm": 0.7771905064582825, + "learning_rate": 9.106116790287541e-05, + "loss": 0.049718713760375975, + "mean_token_accuracy": 0.9805168390274048, + "num_tokens": 26092950.0, + "step": 1270 + }, + { + "entropy": 0.8841261565685272, + "epoch": 2.3572744014732967, + "grad_norm": 0.7791076898574829, + "learning_rate": 9.08866370668001e-05, + "loss": 0.0527400553226471, + "mean_token_accuracy": 0.9796754539012908, + "num_tokens": 26298182.0, + "step": 1280 + }, + { + "entropy": 0.8675022900104523, + "epoch": 2.3756906077348066, + "grad_norm": 0.8481605648994446, + "learning_rate": 9.07105894698464e-05, + "loss": 0.05320838689804077, + "mean_token_accuracy": 0.9792274832725525, + "num_tokens": 26503425.0, + "step": 1290 + }, + { + "entropy": 0.8704026222229004, + "epoch": 2.394106813996317, + "grad_norm": 0.8235505819320679, + "learning_rate": 9.053303164280602e-05, + "loss": 0.055045205354690555, + "mean_token_accuracy": 0.9788750648498535, + "num_tokens": 26708755.0, + "step": 1300 + }, + { + "entropy": 0.8525134027004242, + "epoch": 2.4125230202578267, + "grad_norm": 0.7611598968505859, + "learning_rate": 9.035397017249518e-05, + "loss": 0.05029621124267578, + "mean_token_accuracy": 0.9802757322788238, + "num_tokens": 26914704.0, + "step": 1310 + }, + { + "entropy": 0.8630305290222168, + "epoch": 2.430939226519337, + "grad_norm": 0.790408194065094, + "learning_rate": 9.017341170151041e-05, + "loss": 0.04856040775775909, + "mean_token_accuracy": 0.9809690833091735, + "num_tokens": 27120151.0, + "step": 1320 + }, + { + "entropy": 0.8579159140586853, + "epoch": 2.4493554327808473, + "grad_norm": 0.781972348690033, + "learning_rate": 8.999136292798207e-05, + "loss": 0.04869682788848877, + "mean_token_accuracy": 0.9816130697727203, + "num_tokens": 27325673.0, + "step": 1330 + }, + { + "entropy": 0.8634716987609863, + "epoch": 2.467771639042357, + "grad_norm": 0.8500784039497375, + "learning_rate": 8.980783060532588e-05, + "loss": 0.05050289034843445, + "mean_token_accuracy": 0.980079609155655, + "num_tokens": 27531270.0, + "step": 1340 + }, + { + "entropy": 0.8660618126392364, + "epoch": 2.4861878453038675, + "grad_norm": 0.719760537147522, + "learning_rate": 8.96228215419924e-05, + "loss": 0.04892141819000244, + "mean_token_accuracy": 0.9814020991325378, + "num_tokens": 27736542.0, + "step": 1350 + }, + { + "entropy": 0.8572284400463104, + "epoch": 2.5046040515653774, + "grad_norm": 1.0197229385375977, + "learning_rate": 8.943634260121442e-05, + "loss": 0.05104702711105347, + "mean_token_accuracy": 0.9798846662044525, + "num_tokens": 27941566.0, + "step": 1360 + }, + { + "entropy": 0.8702241241931915, + "epoch": 2.5230202578268877, + "grad_norm": 0.7136003375053406, + "learning_rate": 8.924840070075247e-05, + "loss": 0.04855787754058838, + "mean_token_accuracy": 0.9811685383319855, + "num_tokens": 28146943.0, + "step": 1370 + }, + { + "entropy": 0.874957013130188, + "epoch": 2.541436464088398, + "grad_norm": 0.8775497674942017, + "learning_rate": 8.905900281263804e-05, + "loss": 0.052434295415878296, + "mean_token_accuracy": 0.9795438170433044, + "num_tokens": 28352640.0, + "step": 1380 + }, + { + "entropy": 0.8776536166667939, + "epoch": 2.559852670349908, + "grad_norm": 0.8895741105079651, + "learning_rate": 8.8868155962915e-05, + "loss": 0.05282890796661377, + "mean_token_accuracy": 0.9790538609027862, + "num_tokens": 28558153.0, + "step": 1390 + }, + { + "entropy": 0.8738743245601654, + "epoch": 2.578268876611418, + "grad_norm": 0.788800060749054, + "learning_rate": 8.867586723137906e-05, + "loss": 0.048841872811317445, + "mean_token_accuracy": 0.9809149026870727, + "num_tokens": 28763613.0, + "step": 1400 + }, + { + "entropy": 0.8750253796577454, + "epoch": 2.596685082872928, + "grad_norm": 0.8738002777099609, + "learning_rate": 8.848214375131497e-05, + "loss": 0.048261132836341855, + "mean_token_accuracy": 0.980789190530777, + "num_tokens": 28969248.0, + "step": 1410 + }, + { + "entropy": 0.8624245524406433, + "epoch": 2.6151012891344383, + "grad_norm": 0.6404895186424255, + "learning_rate": 8.828699270923196e-05, + "loss": 0.04970468282699585, + "mean_token_accuracy": 0.9807762265205383, + "num_tokens": 29174779.0, + "step": 1420 + }, + { + "entropy": 0.8792938470840455, + "epoch": 2.6335174953959486, + "grad_norm": 0.7856965661048889, + "learning_rate": 8.80904213445972e-05, + "loss": 0.053334391117095946, + "mean_token_accuracy": 0.9790222108364105, + "num_tokens": 29380474.0, + "step": 1430 + }, + { + "entropy": 0.8831034600734711, + "epoch": 2.6519337016574585, + "grad_norm": 0.7739618420600891, + "learning_rate": 8.789243694956716e-05, + "loss": 0.04959054589271546, + "mean_token_accuracy": 0.9803965091705322, + "num_tokens": 29585985.0, + "step": 1440 + }, + { + "entropy": 0.8934672951698304, + "epoch": 2.6703499079189688, + "grad_norm": 0.6999697089195251, + "learning_rate": 8.769304686871719e-05, + "loss": 0.05165250301361084, + "mean_token_accuracy": 0.9798884153366089, + "num_tokens": 29791238.0, + "step": 1450 + }, + { + "entropy": 0.9053199410438537, + "epoch": 2.6887661141804786, + "grad_norm": 0.9199564456939697, + "learning_rate": 8.749225849876892e-05, + "loss": 0.04924143850803375, + "mean_token_accuracy": 0.9810785710811615, + "num_tokens": 29996589.0, + "step": 1460 + }, + { + "entropy": 0.888091403245926, + "epoch": 2.707182320441989, + "grad_norm": 0.7480106353759766, + "learning_rate": 8.729007928831597e-05, + "loss": 0.04948916733264923, + "mean_token_accuracy": 0.9809579730033875, + "num_tokens": 30201875.0, + "step": 1470 + }, + { + "entropy": 0.8723407983779907, + "epoch": 2.7255985267034992, + "grad_norm": 0.9506945013999939, + "learning_rate": 8.708651673754763e-05, + "loss": 0.048927539587020875, + "mean_token_accuracy": 0.980553150177002, + "num_tokens": 30407550.0, + "step": 1480 + }, + { + "entropy": 0.8737521529197693, + "epoch": 2.744014732965009, + "grad_norm": 0.8015706539154053, + "learning_rate": 8.688157839797062e-05, + "loss": 0.04963063597679138, + "mean_token_accuracy": 0.9809738755226135, + "num_tokens": 30612839.0, + "step": 1490 + }, + { + "entropy": 0.8800762951374054, + "epoch": 2.7624309392265194, + "grad_norm": 0.9429986476898193, + "learning_rate": 8.667527187212885e-05, + "loss": 0.0524174690246582, + "mean_token_accuracy": 0.9788767337799072, + "num_tokens": 30818578.0, + "step": 1500 + }, + { + "entropy": 0.8871055901050567, + "epoch": 2.7808471454880292, + "grad_norm": 0.5909196138381958, + "learning_rate": 8.646760481332157e-05, + "loss": 0.05166680812835693, + "mean_token_accuracy": 0.980216771364212, + "num_tokens": 31023829.0, + "step": 1510 + }, + { + "entropy": 0.8908755779266357, + "epoch": 2.7992633517495396, + "grad_norm": 0.9154611229896545, + "learning_rate": 8.625858492531931e-05, + "loss": 0.04951836466789246, + "mean_token_accuracy": 0.9801484227180481, + "num_tokens": 31229635.0, + "step": 1520 + }, + { + "entropy": 0.92480548620224, + "epoch": 2.81767955801105, + "grad_norm": 0.5989938378334045, + "learning_rate": 8.604821996207819e-05, + "loss": 0.04799881279468536, + "mean_token_accuracy": 0.9817522585391998, + "num_tokens": 31435456.0, + "step": 1530 + }, + { + "entropy": 0.9173881888389588, + "epoch": 2.8360957642725597, + "grad_norm": 0.899413526058197, + "learning_rate": 8.58365177274522e-05, + "loss": 0.0487445592880249, + "mean_token_accuracy": 0.9812625288963318, + "num_tokens": 31640904.0, + "step": 1540 + }, + { + "entropy": 0.9076135993003845, + "epoch": 2.85451197053407, + "grad_norm": 0.8494166135787964, + "learning_rate": 8.562348607490376e-05, + "loss": 0.05005228519439697, + "mean_token_accuracy": 0.9806681036949157, + "num_tokens": 31845807.0, + "step": 1550 + }, + { + "entropy": 0.9092245221138, + "epoch": 2.87292817679558, + "grad_norm": 0.8225123286247253, + "learning_rate": 8.540913290721234e-05, + "loss": 0.048654764890670776, + "mean_token_accuracy": 0.9805659353733063, + "num_tokens": 32051523.0, + "step": 1560 + }, + { + "entropy": 0.9062779664993286, + "epoch": 2.89134438305709, + "grad_norm": 0.7074014544487, + "learning_rate": 8.519346617618134e-05, + "loss": 0.049209845066070554, + "mean_token_accuracy": 0.9807434439659118, + "num_tokens": 32256895.0, + "step": 1570 + }, + { + "entropy": 0.9190246641635895, + "epoch": 2.9097605893186005, + "grad_norm": 0.8860642910003662, + "learning_rate": 8.497649388234304e-05, + "loss": 0.051211881637573245, + "mean_token_accuracy": 0.9802342295646668, + "num_tokens": 32462031.0, + "step": 1580 + }, + { + "entropy": 0.9088015079498291, + "epoch": 2.9281767955801103, + "grad_norm": 0.8062726855278015, + "learning_rate": 8.475822407466188e-05, + "loss": 0.053512704372406, + "mean_token_accuracy": 0.979486483335495, + "num_tokens": 32667533.0, + "step": 1590 + }, + { + "entropy": 0.9462027847766876, + "epoch": 2.9465930018416207, + "grad_norm": 0.7962909936904907, + "learning_rate": 8.453866485023579e-05, + "loss": 0.0501457154750824, + "mean_token_accuracy": 0.9803222417831421, + "num_tokens": 32872900.0, + "step": 1600 + }, + { + "entropy": 0.9671471297740937, + "epoch": 2.9650092081031305, + "grad_norm": 0.7641744017601013, + "learning_rate": 8.431782435399587e-05, + "loss": 0.04629061222076416, + "mean_token_accuracy": 0.9823175370693207, + "num_tokens": 33077850.0, + "step": 1610 + }, + { + "entropy": 0.955865204334259, + "epoch": 2.983425414364641, + "grad_norm": 0.6772348880767822, + "learning_rate": 8.409571077840426e-05, + "loss": 0.048368623852729796, + "mean_token_accuracy": 0.9808700799942016, + "num_tokens": 33283117.0, + "step": 1620 + }, + { + "epoch": 3.0, + "eval_entropy": 0.9563225186389426, + "eval_loss": 0.059064481407403946, + "eval_mean_token_accuracy": 0.9773589429648026, + "eval_num_tokens": 33467712.0, + "eval_runtime": 10.1471, + "eval_samples_per_second": 360.499, + "eval_steps_per_second": 11.333, + "step": 1629 + }, + { + "entropy": 0.9337226033210755, + "epoch": 3.001841620626151, + "grad_norm": 0.646203875541687, + "learning_rate": 8.387233236315016e-05, + "loss": 0.043352216482162476, + "mean_token_accuracy": 0.9830620110034942, + "num_tokens": 33488302.0, + "step": 1630 + }, + { + "entropy": 0.9734923839569092, + "epoch": 3.020257826887661, + "grad_norm": 0.7564226984977722, + "learning_rate": 8.364769739484416e-05, + "loss": 0.033932483196258544, + "mean_token_accuracy": 0.9872806966304779, + "num_tokens": 33693531.0, + "step": 1640 + }, + { + "entropy": 0.9669206500053406, + "epoch": 3.0386740331491713, + "grad_norm": 0.7126886248588562, + "learning_rate": 8.342181420671096e-05, + "loss": 0.03818287253379822, + "mean_token_accuracy": 0.9852082908153534, + "num_tokens": 33899305.0, + "step": 1650 + }, + { + "entropy": 0.9522916138172149, + "epoch": 3.0570902394106816, + "grad_norm": 1.0571653842926025, + "learning_rate": 8.319469117828007e-05, + "loss": 0.03456039130687714, + "mean_token_accuracy": 0.9867027878761292, + "num_tokens": 34104585.0, + "step": 1660 + }, + { + "entropy": 0.9568560004234314, + "epoch": 3.0755064456721914, + "grad_norm": 0.780940592288971, + "learning_rate": 8.296633673507505e-05, + "loss": 0.03551802039146423, + "mean_token_accuracy": 0.9867531359195709, + "num_tokens": 34309516.0, + "step": 1670 + }, + { + "entropy": 0.9590656876564025, + "epoch": 3.0939226519337018, + "grad_norm": 0.8330219388008118, + "learning_rate": 8.273675934830094e-05, + "loss": 0.03674865961074829, + "mean_token_accuracy": 0.9864118576049805, + "num_tokens": 34515170.0, + "step": 1680 + }, + { + "entropy": 0.975881814956665, + "epoch": 3.1123388581952116, + "grad_norm": 0.7010637521743774, + "learning_rate": 8.250596753453e-05, + "loss": 0.03550414443016052, + "mean_token_accuracy": 0.9864102602005005, + "num_tokens": 34720896.0, + "step": 1690 + }, + { + "entropy": 0.9599562883377075, + "epoch": 3.130755064456722, + "grad_norm": 0.6694278717041016, + "learning_rate": 8.227396985538578e-05, + "loss": 0.035564273595809937, + "mean_token_accuracy": 0.9867321848869324, + "num_tokens": 34925970.0, + "step": 1700 + }, + { + "entropy": 0.9582216143608093, + "epoch": 3.149171270718232, + "grad_norm": 0.9333199262619019, + "learning_rate": 8.204077491722546e-05, + "loss": 0.035575729608535764, + "mean_token_accuracy": 0.9862452208995819, + "num_tokens": 35131543.0, + "step": 1710 + }, + { + "entropy": 0.9579678058624268, + "epoch": 3.167587476979742, + "grad_norm": 0.9450218081474304, + "learning_rate": 8.180639137082066e-05, + "loss": 0.0385298490524292, + "mean_token_accuracy": 0.98538036942482, + "num_tokens": 35336790.0, + "step": 1720 + }, + { + "entropy": 0.9640831351280212, + "epoch": 3.1860036832412524, + "grad_norm": 0.8551534414291382, + "learning_rate": 8.157082791103649e-05, + "loss": 0.03702138364315033, + "mean_token_accuracy": 0.9852015495300293, + "num_tokens": 35542294.0, + "step": 1730 + }, + { + "entropy": 0.9867071211338043, + "epoch": 3.2044198895027622, + "grad_norm": 0.7138128876686096, + "learning_rate": 8.133409327650897e-05, + "loss": 0.035626694560050964, + "mean_token_accuracy": 0.986064875125885, + "num_tokens": 35747447.0, + "step": 1740 + }, + { + "entropy": 0.9639089345932007, + "epoch": 3.2228360957642725, + "grad_norm": 0.7131415009498596, + "learning_rate": 8.109619624932092e-05, + "loss": 0.035885071754455565, + "mean_token_accuracy": 0.986273056268692, + "num_tokens": 35952258.0, + "step": 1750 + }, + { + "entropy": 0.9516046345233917, + "epoch": 3.241252302025783, + "grad_norm": 0.6900200843811035, + "learning_rate": 8.085714565467611e-05, + "loss": 0.03535219430923462, + "mean_token_accuracy": 0.985836285352707, + "num_tokens": 36157938.0, + "step": 1760 + }, + { + "entropy": 0.9373646557331086, + "epoch": 3.2596685082872927, + "grad_norm": 0.6101690530776978, + "learning_rate": 8.061695036057191e-05, + "loss": 0.034940996766090394, + "mean_token_accuracy": 0.9863743901252746, + "num_tokens": 36363825.0, + "step": 1770 + }, + { + "entropy": 0.9444344758987426, + "epoch": 3.278084714548803, + "grad_norm": 0.7518529295921326, + "learning_rate": 8.03756192774703e-05, + "loss": 0.03404279053211212, + "mean_token_accuracy": 0.9866396844387054, + "num_tokens": 36568961.0, + "step": 1780 + }, + { + "entropy": 0.9550357758998871, + "epoch": 3.2965009208103133, + "grad_norm": 0.7687555551528931, + "learning_rate": 8.013316135796734e-05, + "loss": 0.038447052240371704, + "mean_token_accuracy": 0.985325163602829, + "num_tokens": 36774514.0, + "step": 1790 + }, + { + "entropy": 0.9477231681346894, + "epoch": 3.314917127071823, + "grad_norm": 0.7521633505821228, + "learning_rate": 7.988958559646102e-05, + "loss": 0.03746694028377533, + "mean_token_accuracy": 0.9853165090084076, + "num_tokens": 36979660.0, + "step": 1800 + }, + { + "entropy": 0.925805002450943, + "epoch": 3.3333333333333335, + "grad_norm": 0.9333297610282898, + "learning_rate": 7.964490102881768e-05, + "loss": 0.03700103759765625, + "mean_token_accuracy": 0.9850880861282348, + "num_tokens": 37185191.0, + "step": 1810 + }, + { + "entropy": 0.9225482225418091, + "epoch": 3.3517495395948433, + "grad_norm": 0.7928622961044312, + "learning_rate": 7.939911673203665e-05, + "loss": 0.03825801610946655, + "mean_token_accuracy": 0.9850241422653199, + "num_tokens": 37390749.0, + "step": 1820 + }, + { + "entropy": 0.9597147881984711, + "epoch": 3.3701657458563536, + "grad_norm": 0.7658583521842957, + "learning_rate": 7.915224182391375e-05, + "loss": 0.039855146408081056, + "mean_token_accuracy": 0.9845879554748536, + "num_tokens": 37596052.0, + "step": 1830 + }, + { + "entropy": 0.9485619068145752, + "epoch": 3.388581952117864, + "grad_norm": 0.8492130637168884, + "learning_rate": 7.890428546270278e-05, + "loss": 0.039359599351882935, + "mean_token_accuracy": 0.9847265422344208, + "num_tokens": 37802063.0, + "step": 1840 + }, + { + "entropy": 0.9670301914215088, + "epoch": 3.406998158379374, + "grad_norm": 0.7527599930763245, + "learning_rate": 7.865525684677608e-05, + "loss": 0.03752985596656799, + "mean_token_accuracy": 0.9855137526988983, + "num_tokens": 38007432.0, + "step": 1850 + }, + { + "entropy": 0.9681244969367981, + "epoch": 3.425414364640884, + "grad_norm": 0.7599612474441528, + "learning_rate": 7.840516521428303e-05, + "loss": 0.03653894364833832, + "mean_token_accuracy": 0.9858933389186859, + "num_tokens": 38212923.0, + "step": 1860 + }, + { + "entropy": 0.9706049561500549, + "epoch": 3.443830570902394, + "grad_norm": 0.7678127884864807, + "learning_rate": 7.815401984280748e-05, + "loss": 0.0366938978433609, + "mean_token_accuracy": 0.9854713797569274, + "num_tokens": 38418422.0, + "step": 1870 + }, + { + "entropy": 0.9637093842029572, + "epoch": 3.4622467771639043, + "grad_norm": 0.762824535369873, + "learning_rate": 7.790183004902359e-05, + "loss": 0.03516915142536163, + "mean_token_accuracy": 0.9866003453731537, + "num_tokens": 38624389.0, + "step": 1880 + }, + { + "entropy": 0.9373565018177032, + "epoch": 3.4806629834254146, + "grad_norm": 0.8221780061721802, + "learning_rate": 7.764860518835014e-05, + "loss": 0.04049026966094971, + "mean_token_accuracy": 0.984089481830597, + "num_tokens": 38829654.0, + "step": 1890 + }, + { + "entropy": 0.9356025457382202, + "epoch": 3.4990791896869244, + "grad_norm": 0.7583426237106323, + "learning_rate": 7.739435465460356e-05, + "loss": 0.03658481240272522, + "mean_token_accuracy": 0.9857318818569183, + "num_tokens": 39034638.0, + "step": 1900 + }, + { + "entropy": 0.9740163326263428, + "epoch": 3.5174953959484347, + "grad_norm": 0.7332878112792969, + "learning_rate": 7.713908787964937e-05, + "loss": 0.03508963882923126, + "mean_token_accuracy": 0.9863419532775879, + "num_tokens": 39240265.0, + "step": 1910 + }, + { + "entropy": 0.9528286933898926, + "epoch": 3.5359116022099446, + "grad_norm": 0.6515451669692993, + "learning_rate": 7.688281433305233e-05, + "loss": 0.036055779457092284, + "mean_token_accuracy": 0.9860979080200195, + "num_tokens": 39445546.0, + "step": 1920 + }, + { + "entropy": 0.9480705261230469, + "epoch": 3.554327808471455, + "grad_norm": 0.7725827097892761, + "learning_rate": 7.662554352172515e-05, + "loss": 0.037101513147354125, + "mean_token_accuracy": 0.985782790184021, + "num_tokens": 39651078.0, + "step": 1930 + }, + { + "entropy": 0.9655321061611175, + "epoch": 3.572744014732965, + "grad_norm": 0.7756506204605103, + "learning_rate": 7.636728498957581e-05, + "loss": 0.03721855878829956, + "mean_token_accuracy": 0.9857951939105988, + "num_tokens": 39856542.0, + "step": 1940 + }, + { + "entropy": 0.9772682309150695, + "epoch": 3.591160220994475, + "grad_norm": 0.9084987640380859, + "learning_rate": 7.610804831715355e-05, + "loss": 0.03570749163627625, + "mean_token_accuracy": 0.9863450109958649, + "num_tokens": 40061913.0, + "step": 1950 + }, + { + "entropy": 0.9579685389995575, + "epoch": 3.6095764272559854, + "grad_norm": 0.6358487606048584, + "learning_rate": 7.584784312129334e-05, + "loss": 0.038210684061050416, + "mean_token_accuracy": 0.9850837290287018, + "num_tokens": 40267398.0, + "step": 1960 + }, + { + "entropy": 0.9605201721191406, + "epoch": 3.6279926335174952, + "grad_norm": 0.6263149976730347, + "learning_rate": 7.558667905475927e-05, + "loss": 0.03509160876274109, + "mean_token_accuracy": 0.9868143379688263, + "num_tokens": 40472827.0, + "step": 1970 + }, + { + "entropy": 0.964026153087616, + "epoch": 3.6464088397790055, + "grad_norm": 0.90068119764328, + "learning_rate": 7.532456580588638e-05, + "loss": 0.036211782693862916, + "mean_token_accuracy": 0.9858468770980835, + "num_tokens": 40677935.0, + "step": 1980 + }, + { + "entropy": 0.9494135618209839, + "epoch": 3.664825046040516, + "grad_norm": 0.760134756565094, + "learning_rate": 7.50615130982213e-05, + "loss": 0.03786201477050781, + "mean_token_accuracy": 0.9852500438690186, + "num_tokens": 40883750.0, + "step": 1990 + }, + { + "entropy": 0.9527071297168732, + "epoch": 3.6832412523020257, + "grad_norm": 0.9812107682228088, + "learning_rate": 7.479753069016152e-05, + "loss": 0.03803159594535828, + "mean_token_accuracy": 0.9852405369281769, + "num_tokens": 41089115.0, + "step": 2000 + }, + { + "entropy": 0.9639330863952636, + "epoch": 3.701657458563536, + "grad_norm": 0.7164933681488037, + "learning_rate": 7.453262837459332e-05, + "loss": 0.03912568986415863, + "mean_token_accuracy": 0.9849458575248718, + "num_tokens": 41294694.0, + "step": 2010 + }, + { + "entropy": 0.9536987483501435, + "epoch": 3.720073664825046, + "grad_norm": 0.6804596185684204, + "learning_rate": 7.426681597852863e-05, + "loss": 0.036410006880760196, + "mean_token_accuracy": 0.985712206363678, + "num_tokens": 41499817.0, + "step": 2020 + }, + { + "entropy": 0.9478164672851562, + "epoch": 3.738489871086556, + "grad_norm": 0.8799397349357605, + "learning_rate": 7.400010336274037e-05, + "loss": 0.03801035583019256, + "mean_token_accuracy": 0.9850274682044983, + "num_tokens": 41704932.0, + "step": 2030 + }, + { + "entropy": 0.9383447647094727, + "epoch": 3.7569060773480665, + "grad_norm": 0.8386216163635254, + "learning_rate": 7.373250042139664e-05, + "loss": 0.0373637855052948, + "mean_token_accuracy": 0.9854822158813477, + "num_tokens": 41910804.0, + "step": 2040 + }, + { + "entropy": 0.925172996520996, + "epoch": 3.7753222836095763, + "grad_norm": 0.7599324584007263, + "learning_rate": 7.346401708169377e-05, + "loss": 0.03585260808467865, + "mean_token_accuracy": 0.9860672950744629, + "num_tokens": 42116706.0, + "step": 2050 + }, + { + "entropy": 0.9463765442371368, + "epoch": 3.7937384898710866, + "grad_norm": 0.9030149579048157, + "learning_rate": 7.319466330348797e-05, + "loss": 0.035877206921577455, + "mean_token_accuracy": 0.9863968968391419, + "num_tokens": 42322670.0, + "step": 2060 + }, + { + "entropy": 0.9942441761493683, + "epoch": 3.8121546961325965, + "grad_norm": 0.6400449275970459, + "learning_rate": 7.292444907892587e-05, + "loss": 0.037310433387756345, + "mean_token_accuracy": 0.9854151606559753, + "num_tokens": 42527752.0, + "step": 2070 + }, + { + "entropy": 0.9577703952789307, + "epoch": 3.830570902394107, + "grad_norm": 0.6193167567253113, + "learning_rate": 7.265338443207387e-05, + "loss": 0.03648848831653595, + "mean_token_accuracy": 0.9856530070304871, + "num_tokens": 42732981.0, + "step": 2080 + }, + { + "entropy": 0.9663952767848969, + "epoch": 3.848987108655617, + "grad_norm": 0.759611189365387, + "learning_rate": 7.238147941854625e-05, + "loss": 0.036112996935844424, + "mean_token_accuracy": 0.9862765550613404, + "num_tokens": 42938619.0, + "step": 2090 + }, + { + "entropy": 0.9484863519668579, + "epoch": 3.867403314917127, + "grad_norm": 0.7420705556869507, + "learning_rate": 7.210874412513218e-05, + "loss": 0.03703283965587616, + "mean_token_accuracy": 0.9857317566871643, + "num_tokens": 43143753.0, + "step": 2100 + }, + { + "entropy": 0.964326673746109, + "epoch": 3.8858195211786373, + "grad_norm": 0.8779639601707458, + "learning_rate": 7.183518866942147e-05, + "loss": 0.03739701807498932, + "mean_token_accuracy": 0.9852154791355133, + "num_tokens": 43349451.0, + "step": 2110 + }, + { + "entropy": 0.9729791641235351, + "epoch": 3.904235727440147, + "grad_norm": 0.7582741379737854, + "learning_rate": 7.156082319942929e-05, + "loss": 0.03894525766372681, + "mean_token_accuracy": 0.9847454309463501, + "num_tokens": 43554598.0, + "step": 2120 + }, + { + "entropy": 0.9860592544078827, + "epoch": 3.9226519337016574, + "grad_norm": 0.860698938369751, + "learning_rate": 7.128565789321969e-05, + "loss": 0.0365300178527832, + "mean_token_accuracy": 0.9859121859073638, + "num_tokens": 43760081.0, + "step": 2130 + }, + { + "entropy": 0.9916551172733307, + "epoch": 3.9410681399631677, + "grad_norm": 0.8363776206970215, + "learning_rate": 7.100970295852805e-05, + "loss": 0.036221379041671754, + "mean_token_accuracy": 0.9859034180641174, + "num_tokens": 43965432.0, + "step": 2140 + }, + { + "entropy": 0.9553558886051178, + "epoch": 3.9594843462246776, + "grad_norm": 0.9627474546432495, + "learning_rate": 7.073296863238242e-05, + "loss": 0.03684481382369995, + "mean_token_accuracy": 0.9857315957546234, + "num_tokens": 44171232.0, + "step": 2150 + }, + { + "entropy": 0.9538035809993743, + "epoch": 3.977900552486188, + "grad_norm": 0.8399474620819092, + "learning_rate": 7.045546518072366e-05, + "loss": 0.03825397789478302, + "mean_token_accuracy": 0.9846831560134888, + "num_tokens": 44376723.0, + "step": 2160 + }, + { + "entropy": 0.9476235210895538, + "epoch": 3.9963167587476978, + "grad_norm": 0.708739697933197, + "learning_rate": 7.017720289802472e-05, + "loss": 0.03618018329143524, + "mean_token_accuracy": 0.9861325800418854, + "num_tokens": 44582407.0, + "step": 2170 + }, + { + "epoch": 4.0, + "eval_entropy": 0.9569619194321011, + "eval_loss": 0.059838198125362396, + "eval_mean_token_accuracy": 0.9777795366618944, + "eval_num_tokens": 44623647.0, + "eval_runtime": 10.0379, + "eval_samples_per_second": 364.42, + "eval_steps_per_second": 11.457, + "step": 2172 + }, + { + "entropy": 0.9558675646781921, + "epoch": 4.014732965009208, + "grad_norm": 0.7347508668899536, + "learning_rate": 6.989819210690872e-05, + "loss": 0.02886659502983093, + "mean_token_accuracy": 0.9892994821071625, + "num_tokens": 44788219.0, + "step": 2180 + }, + { + "entropy": 1.0037677466869355, + "epoch": 4.033149171270718, + "grad_norm": 0.7403206825256348, + "learning_rate": 6.961844315776596e-05, + "loss": 0.02395295798778534, + "mean_token_accuracy": 0.9906026899814606, + "num_tokens": 44993505.0, + "step": 2190 + }, + { + "entropy": 1.0068290829658508, + "epoch": 4.051565377532229, + "grad_norm": 0.7979726195335388, + "learning_rate": 6.933796642837003e-05, + "loss": 0.02605988085269928, + "mean_token_accuracy": 0.9899706900119781, + "num_tokens": 45199193.0, + "step": 2200 + }, + { + "entropy": 0.9942211747169495, + "epoch": 4.069981583793738, + "grad_norm": 0.6460402011871338, + "learning_rate": 6.905677232349278e-05, + "loss": 0.025350230932235717, + "mean_token_accuracy": 0.9899386286735534, + "num_tokens": 45404030.0, + "step": 2210 + }, + { + "entropy": 0.9783595442771912, + "epoch": 4.088397790055248, + "grad_norm": 0.8177055716514587, + "learning_rate": 6.877487127451834e-05, + "loss": 0.02696993052959442, + "mean_token_accuracy": 0.9896106541156768, + "num_tokens": 45609763.0, + "step": 2220 + }, + { + "entropy": 0.9801763832569123, + "epoch": 4.106813996316759, + "grad_norm": 0.6608165502548218, + "learning_rate": 6.849227373905618e-05, + "loss": 0.025101393461227417, + "mean_token_accuracy": 0.9904372334480286, + "num_tokens": 45814941.0, + "step": 2230 + }, + { + "entropy": 0.9695689737796783, + "epoch": 4.125230202578269, + "grad_norm": 0.8036547899246216, + "learning_rate": 6.820899020055314e-05, + "loss": 0.027827343344688414, + "mean_token_accuracy": 0.9890337705612182, + "num_tokens": 46020535.0, + "step": 2240 + }, + { + "entropy": 0.9828635334968567, + "epoch": 4.143646408839779, + "grad_norm": 0.7729921936988831, + "learning_rate": 6.792503116790455e-05, + "loss": 0.02779492735862732, + "mean_token_accuracy": 0.9894372522830963, + "num_tokens": 46226013.0, + "step": 2250 + }, + { + "entropy": 0.9978842556476593, + "epoch": 4.162062615101289, + "grad_norm": 0.7334664463996887, + "learning_rate": 6.764040717506432e-05, + "loss": 0.025673511624336242, + "mean_token_accuracy": 0.9899355113506317, + "num_tokens": 46432087.0, + "step": 2260 + }, + { + "entropy": 1.0116403937339782, + "epoch": 4.180478821362799, + "grad_norm": 0.6769368052482605, + "learning_rate": 6.735512878065427e-05, + "loss": 0.024705511331558228, + "mean_token_accuracy": 0.9906128525733948, + "num_tokens": 46637478.0, + "step": 2270 + }, + { + "entropy": 0.9985016226768494, + "epoch": 4.198895027624309, + "grad_norm": 0.8301573991775513, + "learning_rate": 6.706920656757234e-05, + "loss": 0.02455987185239792, + "mean_token_accuracy": 0.9905728340148926, + "num_tokens": 46842562.0, + "step": 2280 + }, + { + "entropy": 0.9909430682659149, + "epoch": 4.21731123388582, + "grad_norm": 0.656026303768158, + "learning_rate": 6.67826511426001e-05, + "loss": 0.022711564600467683, + "mean_token_accuracy": 0.9910893619060517, + "num_tokens": 47048071.0, + "step": 2290 + }, + { + "entropy": 0.9868666052818298, + "epoch": 4.23572744014733, + "grad_norm": 0.7614991068840027, + "learning_rate": 6.649547313600916e-05, + "loss": 0.02453812211751938, + "mean_token_accuracy": 0.9908901154994965, + "num_tokens": 47253507.0, + "step": 2300 + }, + { + "entropy": 0.9870487153530121, + "epoch": 4.25414364640884, + "grad_norm": 0.7617276906967163, + "learning_rate": 6.62076832011669e-05, + "loss": 0.025818097591400146, + "mean_token_accuracy": 0.990347957611084, + "num_tokens": 47458747.0, + "step": 2310 + }, + { + "entropy": 0.9691080570220947, + "epoch": 4.27255985267035, + "grad_norm": 0.6743029952049255, + "learning_rate": 6.591929201414124e-05, + "loss": 0.02456912100315094, + "mean_token_accuracy": 0.9905289709568024, + "num_tokens": 47663643.0, + "step": 2320 + }, + { + "entropy": 0.9701108932495117, + "epoch": 4.29097605893186, + "grad_norm": 0.6964483261108398, + "learning_rate": 6.56303102733046e-05, + "loss": 0.02575681209564209, + "mean_token_accuracy": 0.9898503363132477, + "num_tokens": 47868982.0, + "step": 2330 + }, + { + "entropy": 0.969528192281723, + "epoch": 4.30939226519337, + "grad_norm": 0.7521987557411194, + "learning_rate": 6.5340748698937e-05, + "loss": 0.02678089737892151, + "mean_token_accuracy": 0.9898572087287902, + "num_tokens": 48074314.0, + "step": 2340 + }, + { + "entropy": 0.9921871721744537, + "epoch": 4.327808471454881, + "grad_norm": 0.6944513320922852, + "learning_rate": 6.505061803282844e-05, + "loss": 0.025553321838378905, + "mean_token_accuracy": 0.9907529592514038, + "num_tokens": 48279731.0, + "step": 2350 + }, + { + "entropy": 0.9768964886665344, + "epoch": 4.346224677716391, + "grad_norm": 0.6553092002868652, + "learning_rate": 6.47599290378803e-05, + "loss": 0.0250235915184021, + "mean_token_accuracy": 0.9904054701328278, + "num_tokens": 48485401.0, + "step": 2360 + }, + { + "entropy": 0.9612838506698609, + "epoch": 4.3646408839779, + "grad_norm": 0.916820228099823, + "learning_rate": 6.446869249770619e-05, + "loss": 0.028156182169914244, + "mean_token_accuracy": 0.9888657331466675, + "num_tokens": 48691047.0, + "step": 2370 + }, + { + "entropy": 0.9665832936763763, + "epoch": 4.383057090239411, + "grad_norm": 0.9197776913642883, + "learning_rate": 6.417691921623185e-05, + "loss": 0.025303921103477477, + "mean_token_accuracy": 0.989986252784729, + "num_tokens": 48896234.0, + "step": 2380 + }, + { + "entropy": 0.9686589121818543, + "epoch": 4.401473296500921, + "grad_norm": 0.8505764603614807, + "learning_rate": 6.388462001729434e-05, + "loss": 0.024816396832466125, + "mean_token_accuracy": 0.9909265041351318, + "num_tokens": 49101893.0, + "step": 2390 + }, + { + "entropy": 0.9625210344791413, + "epoch": 4.419889502762431, + "grad_norm": 1.0601766109466553, + "learning_rate": 6.359180574424062e-05, + "loss": 0.02706078290939331, + "mean_token_accuracy": 0.9895522117614746, + "num_tokens": 49307467.0, + "step": 2400 + }, + { + "entropy": 0.9679551541805267, + "epoch": 4.4383057090239415, + "grad_norm": 0.776253879070282, + "learning_rate": 6.329848725952514e-05, + "loss": 0.02693203091621399, + "mean_token_accuracy": 0.9893981635570526, + "num_tokens": 49513020.0, + "step": 2410 + }, + { + "entropy": 0.9704959928989411, + "epoch": 4.456721915285451, + "grad_norm": 0.5459668636322021, + "learning_rate": 6.3004675444307e-05, + "loss": 0.0279473751783371, + "mean_token_accuracy": 0.9894329369068146, + "num_tokens": 49718405.0, + "step": 2420 + }, + { + "entropy": 0.961863350868225, + "epoch": 4.475138121546961, + "grad_norm": 0.9338833093643188, + "learning_rate": 6.27103811980462e-05, + "loss": 0.026478803157806395, + "mean_token_accuracy": 0.9902269721031189, + "num_tokens": 49923375.0, + "step": 2430 + }, + { + "entropy": 0.9708506822586059, + "epoch": 4.4935543278084715, + "grad_norm": 0.9073707461357117, + "learning_rate": 6.241561543809947e-05, + "loss": 0.025289520621299744, + "mean_token_accuracy": 0.9904769957065582, + "num_tokens": 50128901.0, + "step": 2440 + }, + { + "entropy": 0.984996622800827, + "epoch": 4.511970534069982, + "grad_norm": 0.8674206733703613, + "learning_rate": 6.212038909931503e-05, + "loss": 0.026442551612854005, + "mean_token_accuracy": 0.9905101835727692, + "num_tokens": 50334449.0, + "step": 2450 + }, + { + "entropy": 0.9926377475261688, + "epoch": 4.530386740331492, + "grad_norm": 0.7571811079978943, + "learning_rate": 6.182471313362717e-05, + "loss": 0.026819539070129395, + "mean_token_accuracy": 0.9898989200592041, + "num_tokens": 50539597.0, + "step": 2460 + }, + { + "entropy": 0.9450563549995422, + "epoch": 4.5488029465930016, + "grad_norm": 0.6651087403297424, + "learning_rate": 6.15285985096498e-05, + "loss": 0.02665227949619293, + "mean_token_accuracy": 0.9897156655788422, + "num_tokens": 50744926.0, + "step": 2470 + }, + { + "entropy": 0.9715635657310486, + "epoch": 4.567219152854512, + "grad_norm": 0.7445545196533203, + "learning_rate": 6.12320562122697e-05, + "loss": 0.026212453842163086, + "mean_token_accuracy": 0.9904700636863708, + "num_tokens": 50950152.0, + "step": 2480 + }, + { + "entropy": 0.9613442063331604, + "epoch": 4.585635359116022, + "grad_norm": 0.7168459296226501, + "learning_rate": 6.0935097242238837e-05, + "loss": 0.02508128583431244, + "mean_token_accuracy": 0.9901923894882202, + "num_tokens": 51155430.0, + "step": 2490 + }, + { + "entropy": 0.9571944534778595, + "epoch": 4.6040515653775325, + "grad_norm": 0.7590732574462891, + "learning_rate": 6.063773261576646e-05, + "loss": 0.025445500016212465, + "mean_token_accuracy": 0.9902949810028077, + "num_tokens": 51360826.0, + "step": 2500 + }, + { + "entropy": 0.947079461812973, + "epoch": 4.622467771639043, + "grad_norm": 0.6942175030708313, + "learning_rate": 6.033997336411035e-05, + "loss": 0.026132801175117494, + "mean_token_accuracy": 0.9900939345359803, + "num_tokens": 51566095.0, + "step": 2510 + }, + { + "entropy": 0.970003741979599, + "epoch": 4.640883977900552, + "grad_norm": 0.6562672257423401, + "learning_rate": 6.00418305331675e-05, + "loss": 0.024759869277477264, + "mean_token_accuracy": 0.9905019223690033, + "num_tokens": 51771177.0, + "step": 2520 + }, + { + "entropy": 0.9715348601341247, + "epoch": 4.6593001841620625, + "grad_norm": 0.6151639819145203, + "learning_rate": 5.9743315183064564e-05, + "loss": 0.024138522148132325, + "mean_token_accuracy": 0.9910101473331452, + "num_tokens": 51976349.0, + "step": 2530 + }, + { + "entropy": 0.9552160143852234, + "epoch": 4.677716390423573, + "grad_norm": 0.968815267086029, + "learning_rate": 5.9444438387747336e-05, + "loss": 0.027274739742279053, + "mean_token_accuracy": 0.9896075248718261, + "num_tokens": 52181820.0, + "step": 2540 + }, + { + "entropy": 0.9265012145042419, + "epoch": 4.696132596685083, + "grad_norm": 0.8966720700263977, + "learning_rate": 5.914521123457015e-05, + "loss": 0.0291823148727417, + "mean_token_accuracy": 0.9886700630187988, + "num_tokens": 52387511.0, + "step": 2550 + }, + { + "entropy": 0.9156096875667572, + "epoch": 4.714548802946593, + "grad_norm": 0.7747519612312317, + "learning_rate": 5.88456448238844e-05, + "loss": 0.02809179127216339, + "mean_token_accuracy": 0.9891100466251374, + "num_tokens": 52592737.0, + "step": 2560 + }, + { + "entropy": 0.924511456489563, + "epoch": 4.732965009208103, + "grad_norm": 1.0087049007415771, + "learning_rate": 5.8545750268626844e-05, + "loss": 0.02683232128620148, + "mean_token_accuracy": 0.9896528899669648, + "num_tokens": 52798814.0, + "step": 2570 + }, + { + "entropy": 0.9662951111793519, + "epoch": 4.751381215469613, + "grad_norm": 0.7709590792655945, + "learning_rate": 5.824553869390734e-05, + "loss": 0.02503817081451416, + "mean_token_accuracy": 0.9900161385536194, + "num_tokens": 53004478.0, + "step": 2580 + }, + { + "entropy": 0.9889141619205475, + "epoch": 4.769797421731123, + "grad_norm": 0.815858006477356, + "learning_rate": 5.794502123659613e-05, + "loss": 0.026327347755432128, + "mean_token_accuracy": 0.9900785744190216, + "num_tokens": 53209888.0, + "step": 2590 + }, + { + "entropy": 0.9785685896873474, + "epoch": 4.788213627992634, + "grad_norm": 0.6514431238174438, + "learning_rate": 5.7644209044910735e-05, + "loss": 0.025033789873123168, + "mean_token_accuracy": 0.9902650475502014, + "num_tokens": 53415533.0, + "step": 2600 + }, + { + "entropy": 0.9723869919776916, + "epoch": 4.806629834254144, + "grad_norm": 0.8778963685035706, + "learning_rate": 5.7343113278002284e-05, + "loss": 0.02379843294620514, + "mean_token_accuracy": 0.9909472465515137, + "num_tokens": 53620850.0, + "step": 2610 + }, + { + "entropy": 0.9572711050510406, + "epoch": 4.8250460405156534, + "grad_norm": 0.8927134871482849, + "learning_rate": 5.70417451055417e-05, + "loss": 0.024856947362422943, + "mean_token_accuracy": 0.9904125213623047, + "num_tokens": 53826259.0, + "step": 2620 + }, + { + "entropy": 0.9523135125637054, + "epoch": 4.843462246777164, + "grad_norm": 0.6832691431045532, + "learning_rate": 5.674011570730523e-05, + "loss": 0.025352203845977785, + "mean_token_accuracy": 0.990432596206665, + "num_tokens": 54031531.0, + "step": 2630 + }, + { + "entropy": 0.9735220730304718, + "epoch": 4.861878453038674, + "grad_norm": 0.6399164795875549, + "learning_rate": 5.643823627275972e-05, + "loss": 0.026541513204574586, + "mean_token_accuracy": 0.9900369107723236, + "num_tokens": 54237155.0, + "step": 2640 + }, + { + "entropy": 0.9566517114639282, + "epoch": 4.880294659300184, + "grad_norm": 0.8725414276123047, + "learning_rate": 5.6136118000647616e-05, + "loss": 0.02675778865814209, + "mean_token_accuracy": 0.9894899427890778, + "num_tokens": 54442739.0, + "step": 2650 + }, + { + "entropy": 0.9447909593582153, + "epoch": 4.898710865561695, + "grad_norm": 0.8169302344322205, + "learning_rate": 5.583377209857138e-05, + "loss": 0.02642086148262024, + "mean_token_accuracy": 0.989885401725769, + "num_tokens": 54648098.0, + "step": 2660 + }, + { + "entropy": 0.9180052697658538, + "epoch": 4.917127071823204, + "grad_norm": 0.7768753170967102, + "learning_rate": 5.553120978257787e-05, + "loss": 0.02552323341369629, + "mean_token_accuracy": 0.9899512350559234, + "num_tokens": 54854281.0, + "step": 2670 + }, + { + "entropy": 0.917166668176651, + "epoch": 4.935543278084714, + "grad_norm": 0.8241410851478577, + "learning_rate": 5.5228442276742153e-05, + "loss": 0.02788199484348297, + "mean_token_accuracy": 0.989625746011734, + "num_tokens": 55059495.0, + "step": 2680 + }, + { + "entropy": 0.9345465302467346, + "epoch": 4.953959484346225, + "grad_norm": 0.7645496129989624, + "learning_rate": 5.4925480812751166e-05, + "loss": 0.02517639398574829, + "mean_token_accuracy": 0.9902283847332001, + "num_tokens": 55265381.0, + "step": 2690 + }, + { + "entropy": 0.9386432528495788, + "epoch": 4.972375690607735, + "grad_norm": 0.8371859192848206, + "learning_rate": 5.46223366294871e-05, + "loss": 0.025585666298866272, + "mean_token_accuracy": 0.9903791427612305, + "num_tokens": 55471210.0, + "step": 2700 + }, + { + "entropy": 0.9267561137676239, + "epoch": 4.990791896869245, + "grad_norm": 0.6789297461509705, + "learning_rate": 5.43190209726104e-05, + "loss": 0.024646708369255067, + "mean_token_accuracy": 0.9904700815677643, + "num_tokens": 55676877.0, + "step": 2710 + }, + { + "epoch": 5.0, + "eval_entropy": 0.9283919717954553, + "eval_loss": 0.06225527077913284, + "eval_mean_token_accuracy": 0.9784110421719758, + "eval_num_tokens": 55779559.0, + "eval_runtime": 10.0613, + "eval_samples_per_second": 363.573, + "eval_steps_per_second": 11.43, + "step": 2715 + }, + { + "entropy": 0.9269404351711273, + "epoch": 5.009208103130755, + "grad_norm": 0.540570080280304, + "learning_rate": 5.401554509414264e-05, + "loss": 0.019513805210590363, + "mean_token_accuracy": 0.9927033007144928, + "num_tokens": 55882241.0, + "step": 2720 + }, + { + "entropy": 0.9377441763877868, + "epoch": 5.027624309392265, + "grad_norm": 0.5840998888015747, + "learning_rate": 5.3711920252049085e-05, + "loss": 0.015180909633636474, + "mean_token_accuracy": 0.9944471418857574, + "num_tokens": 56087470.0, + "step": 2730 + }, + { + "entropy": 0.949122017621994, + "epoch": 5.046040515653775, + "grad_norm": 0.6938672065734863, + "learning_rate": 5.340815770982106e-05, + "loss": 0.0153742715716362, + "mean_token_accuracy": 0.9941534519195556, + "num_tokens": 56292226.0, + "step": 2740 + }, + { + "entropy": 0.9394402146339417, + "epoch": 5.064456721915286, + "grad_norm": 0.8259939551353455, + "learning_rate": 5.310426873605814e-05, + "loss": 0.014350908994674682, + "mean_token_accuracy": 0.9945570707321167, + "num_tokens": 56497839.0, + "step": 2750 + }, + { + "entropy": 0.9323545396327972, + "epoch": 5.082872928176796, + "grad_norm": 0.9675024747848511, + "learning_rate": 5.280026460405005e-05, + "loss": 0.016550135612487794, + "mean_token_accuracy": 0.9938908398151398, + "num_tokens": 56702932.0, + "step": 2760 + }, + { + "entropy": 0.89125554561615, + "epoch": 5.101289134438305, + "grad_norm": 0.8347184658050537, + "learning_rate": 5.2496156591358566e-05, + "loss": 0.017917826771736145, + "mean_token_accuracy": 0.9934309899806977, + "num_tokens": 56908644.0, + "step": 2770 + }, + { + "entropy": 0.8773505449295044, + "epoch": 5.119705340699816, + "grad_norm": 0.8869524598121643, + "learning_rate": 5.219195597939908e-05, + "loss": 0.017221055924892426, + "mean_token_accuracy": 0.993448656797409, + "num_tokens": 57114171.0, + "step": 2780 + }, + { + "entropy": 0.8874686002731323, + "epoch": 5.138121546961326, + "grad_norm": 1.0294251441955566, + "learning_rate": 5.1887674053022084e-05, + "loss": 0.018111808598041533, + "mean_token_accuracy": 0.9931293666362763, + "num_tokens": 57319158.0, + "step": 2790 + }, + { + "entropy": 0.8893351197242737, + "epoch": 5.156537753222836, + "grad_norm": 0.6253597736358643, + "learning_rate": 5.15833221000946e-05, + "loss": 0.017256538569927215, + "mean_token_accuracy": 0.9936724424362182, + "num_tokens": 57524901.0, + "step": 2800 + }, + { + "entropy": 0.9157109141349793, + "epoch": 5.1749539594843466, + "grad_norm": 0.6379142999649048, + "learning_rate": 5.12789114110814e-05, + "loss": 0.016415870189666747, + "mean_token_accuracy": 0.9939744889736175, + "num_tokens": 57730135.0, + "step": 2810 + }, + { + "entropy": 0.9157932877540589, + "epoch": 5.193370165745856, + "grad_norm": 0.7195688486099243, + "learning_rate": 5.097445327862619e-05, + "loss": 0.01577536463737488, + "mean_token_accuracy": 0.9941773355007172, + "num_tokens": 57936210.0, + "step": 2820 + }, + { + "entropy": 0.9179767727851867, + "epoch": 5.211786372007366, + "grad_norm": 0.7149335741996765, + "learning_rate": 5.066995899713264e-05, + "loss": 0.01606254279613495, + "mean_token_accuracy": 0.9937664806842804, + "num_tokens": 58141736.0, + "step": 2830 + }, + { + "entropy": 0.895512479543686, + "epoch": 5.230202578268877, + "grad_norm": 0.6460169553756714, + "learning_rate": 5.036543986234543e-05, + "loss": 0.01605578660964966, + "mean_token_accuracy": 0.994063013792038, + "num_tokens": 58347178.0, + "step": 2840 + }, + { + "entropy": 0.8883109211921691, + "epoch": 5.248618784530387, + "grad_norm": 0.72477787733078, + "learning_rate": 5.006090717093128e-05, + "loss": 0.016773784160614015, + "mean_token_accuracy": 0.9940340936183929, + "num_tokens": 58552952.0, + "step": 2850 + }, + { + "entropy": 0.8942575633525849, + "epoch": 5.267034990791897, + "grad_norm": 0.7344926595687866, + "learning_rate": 4.9756372220059736e-05, + "loss": 0.01604126989841461, + "mean_token_accuracy": 0.994256991147995, + "num_tokens": 58758449.0, + "step": 2860 + }, + { + "entropy": 0.8854099690914154, + "epoch": 5.285451197053407, + "grad_norm": 0.6142122149467468, + "learning_rate": 4.9451846306984214e-05, + "loss": 0.016244474053382873, + "mean_token_accuracy": 0.9938375532627106, + "num_tokens": 58963691.0, + "step": 2870 + }, + { + "entropy": 0.8745675146579742, + "epoch": 5.303867403314917, + "grad_norm": 0.8025366067886353, + "learning_rate": 4.9147340728622816e-05, + "loss": 0.01611460596323013, + "mean_token_accuracy": 0.9941173672676087, + "num_tokens": 59169484.0, + "step": 2880 + }, + { + "entropy": 0.8812389194965362, + "epoch": 5.322283609576427, + "grad_norm": 0.7699193358421326, + "learning_rate": 4.884286678113935e-05, + "loss": 0.016995206475257874, + "mean_token_accuracy": 0.9937523245811463, + "num_tokens": 59374627.0, + "step": 2890 + }, + { + "entropy": 0.8924362242221833, + "epoch": 5.3406998158379375, + "grad_norm": 0.7516226172447205, + "learning_rate": 4.853843575952414e-05, + "loss": 0.01652217358350754, + "mean_token_accuracy": 0.9936819314956665, + "num_tokens": 59580135.0, + "step": 2900 + }, + { + "entropy": 0.8972602427005768, + "epoch": 5.359116022099448, + "grad_norm": 0.7781681418418884, + "learning_rate": 4.823405895717521e-05, + "loss": 0.017360319197177888, + "mean_token_accuracy": 0.9935634732246399, + "num_tokens": 59785392.0, + "step": 2910 + }, + { + "entropy": 0.900998342037201, + "epoch": 5.377532228360957, + "grad_norm": 0.6837047934532166, + "learning_rate": 4.792974766547911e-05, + "loss": 0.017162233591079712, + "mean_token_accuracy": 0.993264091014862, + "num_tokens": 59991448.0, + "step": 2920 + }, + { + "entropy": 0.9239763855934143, + "epoch": 5.3959484346224675, + "grad_norm": 0.7188259363174438, + "learning_rate": 4.762551317339226e-05, + "loss": 0.01718595027923584, + "mean_token_accuracy": 0.9933857440948486, + "num_tokens": 60197079.0, + "step": 2930 + }, + { + "entropy": 0.9056789398193359, + "epoch": 5.414364640883978, + "grad_norm": 0.6922260522842407, + "learning_rate": 4.732136676702198e-05, + "loss": 0.016596907377243043, + "mean_token_accuracy": 0.9937462329864502, + "num_tokens": 60402299.0, + "step": 2940 + }, + { + "entropy": 0.9038522362709045, + "epoch": 5.432780847145488, + "grad_norm": 0.7937009930610657, + "learning_rate": 4.7017319729207954e-05, + "loss": 0.016130413115024566, + "mean_token_accuracy": 0.9941940546035767, + "num_tokens": 60607907.0, + "step": 2950 + }, + { + "entropy": 0.8911147236824035, + "epoch": 5.4511970534069984, + "grad_norm": 0.6261171698570251, + "learning_rate": 4.671338333910359e-05, + "loss": 0.01622493863105774, + "mean_token_accuracy": 0.9937945663928985, + "num_tokens": 60813428.0, + "step": 2960 + }, + { + "entropy": 0.8894322276115417, + "epoch": 5.469613259668508, + "grad_norm": 0.6898378729820251, + "learning_rate": 4.6409568871757645e-05, + "loss": 0.016513559222221374, + "mean_token_accuracy": 0.9936174690723419, + "num_tokens": 61018404.0, + "step": 2970 + }, + { + "entropy": 0.9026601016521454, + "epoch": 5.488029465930018, + "grad_norm": 0.7027897834777832, + "learning_rate": 4.610588759769593e-05, + "loss": 0.016727012395858765, + "mean_token_accuracy": 0.9941417872905731, + "num_tokens": 61223660.0, + "step": 2980 + }, + { + "entropy": 0.8903301954269409, + "epoch": 5.5064456721915285, + "grad_norm": 0.9087063074111938, + "learning_rate": 4.5802350782503196e-05, + "loss": 0.016929233074188234, + "mean_token_accuracy": 0.9935264468193055, + "num_tokens": 61429438.0, + "step": 2990 + }, + { + "entropy": 0.8886692762374878, + "epoch": 5.524861878453039, + "grad_norm": 0.8283822536468506, + "learning_rate": 4.5498969686405266e-05, + "loss": 0.015396638214588166, + "mean_token_accuracy": 0.99433131814003, + "num_tokens": 61635274.0, + "step": 3000 + }, + { + "entropy": 0.8902086555957794, + "epoch": 5.543278084714549, + "grad_norm": 0.7676647305488586, + "learning_rate": 4.5195755563851336e-05, + "loss": 0.01673731654882431, + "mean_token_accuracy": 0.9938134133815766, + "num_tokens": 61840778.0, + "step": 3010 + }, + { + "entropy": 0.8941606819629669, + "epoch": 5.5616942909760585, + "grad_norm": 0.7026392221450806, + "learning_rate": 4.489271966309634e-05, + "loss": 0.01694796681404114, + "mean_token_accuracy": 0.9936233103275299, + "num_tokens": 62046355.0, + "step": 3020 + }, + { + "entropy": 0.90918750166893, + "epoch": 5.580110497237569, + "grad_norm": 0.7146924734115601, + "learning_rate": 4.4589873225783806e-05, + "loss": 0.01852080672979355, + "mean_token_accuracy": 0.9928994178771973, + "num_tokens": 62251709.0, + "step": 3030 + }, + { + "entropy": 0.8946544349193573, + "epoch": 5.598526703499079, + "grad_norm": 0.607246458530426, + "learning_rate": 4.428722748652881e-05, + "loss": 0.016636812686920167, + "mean_token_accuracy": 0.9939334273338318, + "num_tokens": 62456680.0, + "step": 3040 + }, + { + "entropy": 0.8854653835296631, + "epoch": 5.616942909760589, + "grad_norm": 0.7457882165908813, + "learning_rate": 4.3984793672501124e-05, + "loss": 0.016008296608924867, + "mean_token_accuracy": 0.9940589666366577, + "num_tokens": 62662038.0, + "step": 3050 + }, + { + "entropy": 0.8921085000038147, + "epoch": 5.6353591160221, + "grad_norm": 0.7707350254058838, + "learning_rate": 4.368258300300888e-05, + "loss": 0.016655120253562927, + "mean_token_accuracy": 0.993935889005661, + "num_tokens": 62867272.0, + "step": 3060 + }, + { + "entropy": 0.8768653869628906, + "epoch": 5.653775322283609, + "grad_norm": 0.6994554996490479, + "learning_rate": 4.3380606689082166e-05, + "loss": 0.015841150283813478, + "mean_token_accuracy": 0.9944550096988678, + "num_tokens": 63072403.0, + "step": 3070 + }, + { + "entropy": 0.8767679035663605, + "epoch": 5.672191528545119, + "grad_norm": 0.8327192068099976, + "learning_rate": 4.307887593305733e-05, + "loss": 0.015826576948165895, + "mean_token_accuracy": 0.9941202461719513, + "num_tokens": 63277635.0, + "step": 3080 + }, + { + "entropy": 0.8763292253017425, + "epoch": 5.69060773480663, + "grad_norm": 0.7224747538566589, + "learning_rate": 4.277740192816127e-05, + "loss": 0.015298140048980714, + "mean_token_accuracy": 0.9943080008029938, + "num_tokens": 63483196.0, + "step": 3090 + }, + { + "entropy": 0.8834661841392517, + "epoch": 5.70902394106814, + "grad_norm": 0.9508277773857117, + "learning_rate": 4.247619585809627e-05, + "loss": 0.01658404469490051, + "mean_token_accuracy": 0.9934300124645233, + "num_tokens": 63688721.0, + "step": 3100 + }, + { + "entropy": 0.899389523267746, + "epoch": 5.72744014732965, + "grad_norm": 0.7170981168746948, + "learning_rate": 4.217526889662512e-05, + "loss": 0.015803813934326172, + "mean_token_accuracy": 0.9940325975418091, + "num_tokens": 63894220.0, + "step": 3110 + }, + { + "entropy": 0.8968011736869812, + "epoch": 5.74585635359116, + "grad_norm": 0.6686251163482666, + "learning_rate": 4.187463220715659e-05, + "loss": 0.015874400734901428, + "mean_token_accuracy": 0.9940970957279205, + "num_tokens": 64099768.0, + "step": 3120 + }, + { + "entropy": 0.8900792479515076, + "epoch": 5.76427255985267, + "grad_norm": 0.5979828238487244, + "learning_rate": 4.157429694233128e-05, + "loss": 0.01613767147064209, + "mean_token_accuracy": 0.9942961037158966, + "num_tokens": 64305055.0, + "step": 3130 + }, + { + "entropy": 0.8899810135364532, + "epoch": 5.78268876611418, + "grad_norm": 0.7330048084259033, + "learning_rate": 4.127427424360794e-05, + "loss": 0.016168563067913054, + "mean_token_accuracy": 0.9941077649593353, + "num_tokens": 64510002.0, + "step": 3140 + }, + { + "entropy": 0.8805335581302642, + "epoch": 5.801104972375691, + "grad_norm": 0.5978623032569885, + "learning_rate": 4.09745752408501e-05, + "loss": 0.01524556577205658, + "mean_token_accuracy": 0.994326776266098, + "num_tokens": 64715431.0, + "step": 3150 + }, + { + "entropy": 0.878781646490097, + "epoch": 5.819521178637201, + "grad_norm": 0.6749313473701477, + "learning_rate": 4.067521105191331e-05, + "loss": 0.015209287405014038, + "mean_token_accuracy": 0.9942974805831909, + "num_tokens": 64921579.0, + "step": 3160 + }, + { + "entropy": 0.8844729900360108, + "epoch": 5.83793738489871, + "grad_norm": 0.6887196898460388, + "learning_rate": 4.037619278223255e-05, + "loss": 0.01619938760995865, + "mean_token_accuracy": 0.9937683045864105, + "num_tokens": 65127007.0, + "step": 3170 + }, + { + "entropy": 0.8780498623847961, + "epoch": 5.856353591160221, + "grad_norm": 0.6962174773216248, + "learning_rate": 4.0077531524410304e-05, + "loss": 0.015934592485427855, + "mean_token_accuracy": 0.9935103774070739, + "num_tokens": 65332418.0, + "step": 3180 + }, + { + "entropy": 0.8889612555503845, + "epoch": 5.874769797421731, + "grad_norm": 0.6049854159355164, + "learning_rate": 3.977923835780517e-05, + "loss": 0.01600206792354584, + "mean_token_accuracy": 0.9937360048294067, + "num_tokens": 65537845.0, + "step": 3190 + }, + { + "entropy": 0.8960810244083405, + "epoch": 5.893186003683241, + "grad_norm": 0.6341013312339783, + "learning_rate": 3.948132434812065e-05, + "loss": 0.0143389493227005, + "mean_token_accuracy": 0.9948007702827454, + "num_tokens": 65743412.0, + "step": 3200 + }, + { + "entropy": 0.887304550409317, + "epoch": 5.911602209944752, + "grad_norm": 0.7564852237701416, + "learning_rate": 3.9183800546994886e-05, + "loss": 0.016044440865516662, + "mean_token_accuracy": 0.9939335525035858, + "num_tokens": 65948884.0, + "step": 3210 + }, + { + "entropy": 0.8823239862918854, + "epoch": 5.930018416206261, + "grad_norm": 0.6525556445121765, + "learning_rate": 3.8886677991590435e-05, + "loss": 0.016112390160560607, + "mean_token_accuracy": 0.9938134670257568, + "num_tokens": 66153768.0, + "step": 3220 + }, + { + "entropy": 0.8712829887866974, + "epoch": 5.948434622467771, + "grad_norm": 0.676167368888855, + "learning_rate": 3.858996770418504e-05, + "loss": 0.015146306157112122, + "mean_token_accuracy": 0.9944733619689942, + "num_tokens": 66359661.0, + "step": 3230 + }, + { + "entropy": 0.8734102070331573, + "epoch": 5.966850828729282, + "grad_norm": 0.6284340023994446, + "learning_rate": 3.829368069176257e-05, + "loss": 0.017269474267959595, + "mean_token_accuracy": 0.9938443183898926, + "num_tokens": 66565228.0, + "step": 3240 + }, + { + "entropy": 0.8757335782051087, + "epoch": 5.985267034990792, + "grad_norm": 0.722522497177124, + "learning_rate": 3.799782794560484e-05, + "loss": 0.015032704174518585, + "mean_token_accuracy": 0.9942249894142151, + "num_tokens": 66770718.0, + "step": 3250 + }, + { + "epoch": 6.0, + "eval_entropy": 0.8780099873957427, + "eval_loss": 0.06740746647119522, + "eval_mean_token_accuracy": 0.9795082377350849, + "eval_num_tokens": 66935435.0, + "eval_runtime": 10.0955, + "eval_samples_per_second": 362.34, + "eval_steps_per_second": 11.391, + "step": 3258 + }, + { + "entropy": 0.8773481965065002, + "epoch": 6.003683241252302, + "grad_norm": 0.49184396862983704, + "learning_rate": 3.770242044088375e-05, + "loss": 0.013721099495887757, + "mean_token_accuracy": 0.9951768457889557, + "num_tokens": 66976478.0, + "step": 3260 + }, + { + "entropy": 0.8643155217170715, + "epoch": 6.0220994475138125, + "grad_norm": 0.4903622567653656, + "learning_rate": 3.7407469136254234e-05, + "loss": 0.009165047109127045, + "mean_token_accuracy": 0.9969388306140899, + "num_tokens": 67182251.0, + "step": 3270 + }, + { + "entropy": 0.8560326337814331, + "epoch": 6.040515653775322, + "grad_norm": 0.6360073685646057, + "learning_rate": 3.711298497344766e-05, + "loss": 0.010200753808021545, + "mean_token_accuracy": 0.9964211463928223, + "num_tokens": 67387493.0, + "step": 3280 + }, + { + "entropy": 0.8391405165195465, + "epoch": 6.058931860036832, + "grad_norm": 0.519554853439331, + "learning_rate": 3.6818978876865984e-05, + "loss": 0.008906974643468856, + "mean_token_accuracy": 0.9966452360153198, + "num_tokens": 67593314.0, + "step": 3290 + }, + { + "entropy": 0.8300552070140839, + "epoch": 6.077348066298343, + "grad_norm": 0.5294632911682129, + "learning_rate": 3.6525461753176426e-05, + "loss": 0.008088209480047227, + "mean_token_accuracy": 0.9971098065376282, + "num_tokens": 67798634.0, + "step": 3300 + }, + { + "entropy": 0.822588461637497, + "epoch": 6.095764272559853, + "grad_norm": 0.46423637866973877, + "learning_rate": 3.623244449090697e-05, + "loss": 0.008058926463127137, + "mean_token_accuracy": 0.9970856845378876, + "num_tokens": 68003683.0, + "step": 3310 + }, + { + "entropy": 0.8130167067050934, + "epoch": 6.114180478821363, + "grad_norm": 0.497258722782135, + "learning_rate": 3.5939937960042314e-05, + "loss": 0.008712668716907502, + "mean_token_accuracy": 0.996820193529129, + "num_tokens": 68208795.0, + "step": 3320 + }, + { + "entropy": 0.8035802125930787, + "epoch": 6.132596685082873, + "grad_norm": 0.43152952194213867, + "learning_rate": 3.5647953011620716e-05, + "loss": 0.008366625010967254, + "mean_token_accuracy": 0.9969616234302521, + "num_tokens": 68414414.0, + "step": 3330 + }, + { + "entropy": 0.8028200149536133, + "epoch": 6.151012891344383, + "grad_norm": 0.6057612299919128, + "learning_rate": 3.535650047733141e-05, + "loss": 0.00967741459608078, + "mean_token_accuracy": 0.9963694036006927, + "num_tokens": 68619806.0, + "step": 3340 + }, + { + "entropy": 0.7982640087604522, + "epoch": 6.169429097605893, + "grad_norm": 0.4966030716896057, + "learning_rate": 3.5065591169112785e-05, + "loss": 0.00934397652745247, + "mean_token_accuracy": 0.9969527781009674, + "num_tokens": 68825254.0, + "step": 3350 + }, + { + "entropy": 0.7859498977661132, + "epoch": 6.1878453038674035, + "grad_norm": 0.6962474584579468, + "learning_rate": 3.477523587875139e-05, + "loss": 0.010414297878742217, + "mean_token_accuracy": 0.996203750371933, + "num_tokens": 69031043.0, + "step": 3360 + }, + { + "entropy": 0.7869667530059814, + "epoch": 6.206261510128914, + "grad_norm": 0.6078894734382629, + "learning_rate": 3.448544537748143e-05, + "loss": 0.008547455072402954, + "mean_token_accuracy": 0.9968406975269317, + "num_tokens": 69236559.0, + "step": 3370 + }, + { + "entropy": 0.8036401033401489, + "epoch": 6.224677716390423, + "grad_norm": 0.5990306735038757, + "learning_rate": 3.4196230415585337e-05, + "loss": 0.00924447700381279, + "mean_token_accuracy": 0.9967190623283386, + "num_tokens": 69441764.0, + "step": 3380 + }, + { + "entropy": 0.8085561394691467, + "epoch": 6.2430939226519335, + "grad_norm": 0.46029484272003174, + "learning_rate": 3.390760172199486e-05, + "loss": 0.008379801362752914, + "mean_token_accuracy": 0.9970395743846894, + "num_tokens": 69647075.0, + "step": 3390 + }, + { + "entropy": 0.8132422208786011, + "epoch": 6.261510128913444, + "grad_norm": 0.5699496865272522, + "learning_rate": 3.361957000389315e-05, + "loss": 0.009426499903202056, + "mean_token_accuracy": 0.996586662530899, + "num_tokens": 69852809.0, + "step": 3400 + }, + { + "entropy": 0.8065890491008758, + "epoch": 6.279926335174954, + "grad_norm": 0.6212234497070312, + "learning_rate": 3.33321459463175e-05, + "loss": 0.009811153262853622, + "mean_token_accuracy": 0.9965905249118805, + "num_tokens": 70057955.0, + "step": 3410 + }, + { + "entropy": 0.7904254853725433, + "epoch": 6.298342541436464, + "grad_norm": 0.8000790476799011, + "learning_rate": 3.304534021176299e-05, + "loss": 0.00957801640033722, + "mean_token_accuracy": 0.9964518308639526, + "num_tokens": 70263517.0, + "step": 3420 + }, + { + "entropy": 0.7971100151538849, + "epoch": 6.316758747697974, + "grad_norm": 0.6359512209892273, + "learning_rate": 3.275916343978689e-05, + "loss": 0.009681916236877442, + "mean_token_accuracy": 0.9967545390129089, + "num_tokens": 70468400.0, + "step": 3430 + }, + { + "entropy": 0.7987187504768372, + "epoch": 6.335174953959484, + "grad_norm": 0.5094901919364929, + "learning_rate": 3.247362624661406e-05, + "loss": 0.009966370463371278, + "mean_token_accuracy": 0.9964035987854004, + "num_tokens": 70673648.0, + "step": 3440 + }, + { + "entropy": 0.7850228011608124, + "epoch": 6.3535911602209945, + "grad_norm": 0.5554385185241699, + "learning_rate": 3.218873922474303e-05, + "loss": 0.009521079063415528, + "mean_token_accuracy": 0.9966651916503906, + "num_tokens": 70879452.0, + "step": 3450 + }, + { + "entropy": 0.7885844230651855, + "epoch": 6.372007366482505, + "grad_norm": 0.5217951536178589, + "learning_rate": 3.190451294255314e-05, + "loss": 0.00949474424123764, + "mean_token_accuracy": 0.9966598808765411, + "num_tokens": 71085217.0, + "step": 3460 + }, + { + "entropy": 0.797072297334671, + "epoch": 6.390423572744015, + "grad_norm": 0.5385560393333435, + "learning_rate": 3.162095794391241e-05, + "loss": 0.009810312837362289, + "mean_token_accuracy": 0.9965846955776214, + "num_tokens": 71290955.0, + "step": 3470 + }, + { + "entropy": 0.8024774849414825, + "epoch": 6.4088397790055245, + "grad_norm": 0.5419294238090515, + "learning_rate": 3.1338084747786456e-05, + "loss": 0.009127366542816161, + "mean_token_accuracy": 0.9968222141265869, + "num_tokens": 71496654.0, + "step": 3480 + }, + { + "entropy": 0.8082470417022705, + "epoch": 6.427255985267035, + "grad_norm": 0.7315362095832825, + "learning_rate": 3.105590384784821e-05, + "loss": 0.008642691373825073, + "mean_token_accuracy": 0.9970867097377777, + "num_tokens": 71701725.0, + "step": 3490 + }, + { + "entropy": 0.804630172252655, + "epoch": 6.445672191528545, + "grad_norm": 0.6668549180030823, + "learning_rate": 3.0774425712088676e-05, + "loss": 0.008679335564374923, + "mean_token_accuracy": 0.9969714701175689, + "num_tokens": 71907003.0, + "step": 3500 + }, + { + "entropy": 0.7939219176769257, + "epoch": 6.464088397790055, + "grad_norm": 0.8388434648513794, + "learning_rate": 3.049366078242864e-05, + "loss": 0.009249264001846313, + "mean_token_accuracy": 0.99674671292305, + "num_tokens": 72112532.0, + "step": 3510 + }, + { + "entropy": 0.78477823138237, + "epoch": 6.482504604051566, + "grad_norm": 0.4963231682777405, + "learning_rate": 3.021361947433125e-05, + "loss": 0.009192919731140137, + "mean_token_accuracy": 0.9965968191623688, + "num_tokens": 72318518.0, + "step": 3520 + }, + { + "entropy": 0.7841647148132325, + "epoch": 6.500920810313076, + "grad_norm": 0.5681823492050171, + "learning_rate": 2.9934312176415636e-05, + "loss": 0.008821797370910645, + "mean_token_accuracy": 0.9968703150749206, + "num_tokens": 72524548.0, + "step": 3530 + }, + { + "entropy": 0.7848304688930512, + "epoch": 6.519337016574585, + "grad_norm": 0.7126080393791199, + "learning_rate": 2.965574925007154e-05, + "loss": 0.009742744266986847, + "mean_token_accuracy": 0.9964317202568054, + "num_tokens": 72729659.0, + "step": 3540 + }, + { + "entropy": 0.7899512410163879, + "epoch": 6.537753222836096, + "grad_norm": 0.7015056014060974, + "learning_rate": 2.9377941029074986e-05, + "loss": 0.008977667987346649, + "mean_token_accuracy": 0.9968570172786713, + "num_tokens": 72934865.0, + "step": 3550 + }, + { + "entropy": 0.797937935590744, + "epoch": 6.556169429097606, + "grad_norm": 0.700501024723053, + "learning_rate": 2.910089781920486e-05, + "loss": 0.00973074734210968, + "mean_token_accuracy": 0.996515303850174, + "num_tokens": 73139684.0, + "step": 3560 + }, + { + "entropy": 0.7993333518505097, + "epoch": 6.574585635359116, + "grad_norm": 0.44471475481987, + "learning_rate": 2.882462989786061e-05, + "loss": 0.008206719905138016, + "mean_token_accuracy": 0.9968972980976105, + "num_tokens": 73345547.0, + "step": 3570 + }, + { + "entropy": 0.7961922466754914, + "epoch": 6.593001841620627, + "grad_norm": 0.5011329054832458, + "learning_rate": 2.854914751368109e-05, + "loss": 0.009073075652122498, + "mean_token_accuracy": 0.9968676805496216, + "num_tokens": 73550822.0, + "step": 3580 + }, + { + "entropy": 0.8064342319965363, + "epoch": 6.611418047882136, + "grad_norm": 0.6375740766525269, + "learning_rate": 2.82744608861642e-05, + "loss": 0.009309899061918259, + "mean_token_accuracy": 0.9967096745967865, + "num_tokens": 73756564.0, + "step": 3590 + }, + { + "entropy": 0.8025432348251342, + "epoch": 6.629834254143646, + "grad_norm": 0.8449372053146362, + "learning_rate": 2.8000580205287874e-05, + "loss": 0.009333166480064391, + "mean_token_accuracy": 0.9967890501022338, + "num_tokens": 73961849.0, + "step": 3600 + }, + { + "entropy": 0.8010810256004334, + "epoch": 6.648250460405157, + "grad_norm": 0.9473148584365845, + "learning_rate": 2.772751563113213e-05, + "loss": 0.00938543900847435, + "mean_token_accuracy": 0.996571558713913, + "num_tokens": 74167074.0, + "step": 3610 + }, + { + "entropy": 0.7954266011714936, + "epoch": 6.666666666666667, + "grad_norm": 0.4197849631309509, + "learning_rate": 2.7455277293502007e-05, + "loss": 0.008846811950206757, + "mean_token_accuracy": 0.9970432996749878, + "num_tokens": 74372119.0, + "step": 3620 + }, + { + "entropy": 0.7859483778476715, + "epoch": 6.685082872928177, + "grad_norm": 0.5353069305419922, + "learning_rate": 2.7183875291551892e-05, + "loss": 0.008807064592838287, + "mean_token_accuracy": 0.9969651758670807, + "num_tokens": 74577516.0, + "step": 3630 + }, + { + "entropy": 0.7940182387828827, + "epoch": 6.703499079189687, + "grad_norm": 0.5789965391159058, + "learning_rate": 2.6913319693410828e-05, + "loss": 0.008173662424087524, + "mean_token_accuracy": 0.9970715939998627, + "num_tokens": 74783031.0, + "step": 3640 + }, + { + "entropy": 0.7871349632740021, + "epoch": 6.721915285451197, + "grad_norm": 0.5887596011161804, + "learning_rate": 2.6643620535809076e-05, + "loss": 0.008517104387283325, + "mean_token_accuracy": 0.9969267845153809, + "num_tokens": 74988646.0, + "step": 3650 + }, + { + "entropy": 0.783170485496521, + "epoch": 6.740331491712707, + "grad_norm": 0.6228395104408264, + "learning_rate": 2.637478782370574e-05, + "loss": 0.008941689878702164, + "mean_token_accuracy": 0.9967794418334961, + "num_tokens": 75193938.0, + "step": 3660 + }, + { + "entropy": 0.7779926240444184, + "epoch": 6.758747697974218, + "grad_norm": 0.7367292642593384, + "learning_rate": 2.61068315299176e-05, + "loss": 0.009662539511919022, + "mean_token_accuracy": 0.9965554535388946, + "num_tokens": 75399816.0, + "step": 3670 + }, + { + "entropy": 0.7756146490573883, + "epoch": 6.777163904235728, + "grad_norm": 0.7478228807449341, + "learning_rate": 2.5839761594749167e-05, + "loss": 0.008691602945327758, + "mean_token_accuracy": 0.996806287765503, + "num_tokens": 75605531.0, + "step": 3680 + }, + { + "entropy": 0.7793804049491883, + "epoch": 6.795580110497237, + "grad_norm": 0.580205500125885, + "learning_rate": 2.5573587925623964e-05, + "loss": 0.00922732800245285, + "mean_token_accuracy": 0.9966219186782836, + "num_tokens": 75811143.0, + "step": 3690 + }, + { + "entropy": 0.7817609786987305, + "epoch": 6.813996316758748, + "grad_norm": 0.3849862813949585, + "learning_rate": 2.530832039671694e-05, + "loss": 0.00812167227268219, + "mean_token_accuracy": 0.9970280706882477, + "num_tokens": 76016366.0, + "step": 3700 + }, + { + "entropy": 0.7793294489383698, + "epoch": 6.832412523020258, + "grad_norm": 0.4873282313346863, + "learning_rate": 2.504396884858825e-05, + "loss": 0.008183138072490692, + "mean_token_accuracy": 0.9973145961761475, + "num_tokens": 76221692.0, + "step": 3710 + }, + { + "entropy": 0.7805068492889404, + "epoch": 6.850828729281768, + "grad_norm": 0.6652786135673523, + "learning_rate": 2.478054308781807e-05, + "loss": 0.009141853451728821, + "mean_token_accuracy": 0.9968416154384613, + "num_tokens": 76427170.0, + "step": 3720 + }, + { + "entropy": 0.7799863159656525, + "epoch": 6.8692449355432785, + "grad_norm": 0.5895428657531738, + "learning_rate": 2.451805288664298e-05, + "loss": 0.009343943744897842, + "mean_token_accuracy": 0.9968909084796905, + "num_tokens": 76632450.0, + "step": 3730 + }, + { + "entropy": 0.7819362223148346, + "epoch": 6.887661141804788, + "grad_norm": 0.6007734537124634, + "learning_rate": 2.425650798259327e-05, + "loss": 0.008081933856010437, + "mean_token_accuracy": 0.9972956955432892, + "num_tokens": 76837993.0, + "step": 3740 + }, + { + "entropy": 0.7859819054603576, + "epoch": 6.906077348066298, + "grad_norm": 0.5510725975036621, + "learning_rate": 2.39959180781318e-05, + "loss": 0.008848348259925842, + "mean_token_accuracy": 0.9968287885189057, + "num_tokens": 77043697.0, + "step": 3750 + }, + { + "entropy": 0.7813855290412903, + "epoch": 6.9244935543278086, + "grad_norm": 0.5004434585571289, + "learning_rate": 2.3736292840294122e-05, + "loss": 0.00795777291059494, + "mean_token_accuracy": 0.9973017990589141, + "num_tokens": 77248720.0, + "step": 3760 + }, + { + "entropy": 0.774254196882248, + "epoch": 6.942909760589319, + "grad_norm": 0.7068622708320618, + "learning_rate": 2.347764190032974e-05, + "loss": 0.007790238410234451, + "mean_token_accuracy": 0.997188663482666, + "num_tokens": 77454096.0, + "step": 3770 + }, + { + "entropy": 0.7675817251205445, + "epoch": 6.961325966850829, + "grad_norm": 0.5110977292060852, + "learning_rate": 2.3219974853344905e-05, + "loss": 0.008631375432014466, + "mean_token_accuracy": 0.9967362582683563, + "num_tokens": 77659811.0, + "step": 3780 + }, + { + "entropy": 0.7719516515731811, + "epoch": 6.979742173112339, + "grad_norm": 0.6288211941719055, + "learning_rate": 2.2963301257946622e-05, + "loss": 0.00804171860218048, + "mean_token_accuracy": 0.9971263229846954, + "num_tokens": 77865539.0, + "step": 3790 + }, + { + "entropy": 0.7786632418632508, + "epoch": 6.998158379373849, + "grad_norm": 0.5279833078384399, + "learning_rate": 2.270763063588814e-05, + "loss": 0.007490953803062439, + "mean_token_accuracy": 0.9974353730678558, + "num_tokens": 78070767.0, + "step": 3800 + }, + { + "epoch": 7.0, + "eval_entropy": 0.7808213239130767, + "eval_loss": 0.07382760941982269, + "eval_mean_token_accuracy": 0.9800234224485315, + "eval_num_tokens": 78091327.0, + "eval_runtime": 10.072, + "eval_samples_per_second": 363.186, + "eval_steps_per_second": 11.418, + "step": 3801 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.7228641768471265e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3801/training_args.bin b/checkpoint-3801/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-3801/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/checkpoint-4344/README.md b/checkpoint-4344/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-4344/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4344/adapter_config.json b/checkpoint-4344/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-4344/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4344/adapter_model.safetensors b/checkpoint-4344/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cea87c29fdcfd9028a220d3f7b97c68bfd23c9ba --- /dev/null +++ b/checkpoint-4344/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f1d44459dcf6919f1f87aba8b1741a5e1f59eaeff47af9fa14e37c2ec2aded8 +size 80792096 diff --git a/checkpoint-4344/chat_template.jinja b/checkpoint-4344/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-4344/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-4344/tokenizer.json b/checkpoint-4344/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-4344/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-4344/tokenizer_config.json b/checkpoint-4344/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-4344/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-4344/trainer_state.json b/checkpoint-4344/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4a195d7f7e56a30f5a0c4530f3f48f314b74b368 --- /dev/null +++ b/checkpoint-4344/trainer_state.json @@ -0,0 +1,4462 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 4344, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + }, + { + "entropy": 0.9047818422317505, + "epoch": 1.0128913443830572, + "grad_norm": 0.6781501173973083, + "learning_rate": 9.929008521021325e-05, + "loss": 0.0863916516304016, + "mean_token_accuracy": 0.9673655688762665, + "num_tokens": 11299715.0, + "step": 550 + }, + { + "entropy": 0.8856981039047241, + "epoch": 1.0313075506445673, + "grad_norm": 0.7143136858940125, + "learning_rate": 9.923803553055937e-05, + "loss": 0.08632323145866394, + "mean_token_accuracy": 0.9677783191204071, + "num_tokens": 11505059.0, + "step": 560 + }, + { + "entropy": 0.8937099635601043, + "epoch": 1.0497237569060773, + "grad_norm": 0.7751694321632385, + "learning_rate": 9.918415928038325e-05, + "loss": 0.08178263902664185, + "mean_token_accuracy": 0.9694291114807129, + "num_tokens": 11710464.0, + "step": 570 + }, + { + "entropy": 0.8858704209327698, + "epoch": 1.0681399631675874, + "grad_norm": 0.7492292523384094, + "learning_rate": 9.912845845831805e-05, + "loss": 0.08074211478233337, + "mean_token_accuracy": 0.9692470014095307, + "num_tokens": 11915959.0, + "step": 580 + }, + { + "entropy": 0.8948039829730987, + "epoch": 1.0865561694290977, + "grad_norm": 0.8116479516029358, + "learning_rate": 9.907093513068259e-05, + "loss": 0.08712012171745301, + "mean_token_accuracy": 0.9669980227947235, + "num_tokens": 12121499.0, + "step": 590 + }, + { + "entropy": 0.8846789538860321, + "epoch": 1.1049723756906078, + "grad_norm": 0.7295626997947693, + "learning_rate": 9.901159143140471e-05, + "loss": 0.08444435596466064, + "mean_token_accuracy": 0.9674544095993042, + "num_tokens": 12327061.0, + "step": 600 + }, + { + "entropy": 0.8734103918075562, + "epoch": 1.1233885819521179, + "grad_norm": 0.9585768580436707, + "learning_rate": 9.89504295619421e-05, + "loss": 0.08022565841674804, + "mean_token_accuracy": 0.969569206237793, + "num_tokens": 12532305.0, + "step": 610 + }, + { + "entropy": 0.8640486001968384, + "epoch": 1.141804788213628, + "grad_norm": 0.7891159057617188, + "learning_rate": 9.88874517912006e-05, + "loss": 0.08415375947952271, + "mean_token_accuracy": 0.9678892493247986, + "num_tokens": 12737828.0, + "step": 620 + }, + { + "entropy": 0.8599755525588989, + "epoch": 1.160220994475138, + "grad_norm": 0.5801345109939575, + "learning_rate": 9.882266045545012e-05, + "loss": 0.08100489974021911, + "mean_token_accuracy": 0.9688023269176483, + "num_tokens": 12943343.0, + "step": 630 + }, + { + "entropy": 0.86524977684021, + "epoch": 1.1786372007366483, + "grad_norm": 0.7633041143417358, + "learning_rate": 9.87560579582379e-05, + "loss": 0.07859406471252442, + "mean_token_accuracy": 0.9702189445495606, + "num_tokens": 13148473.0, + "step": 640 + }, + { + "entropy": 0.8466695249080658, + "epoch": 1.1970534069981584, + "grad_norm": 0.8672215938568115, + "learning_rate": 9.868764677029934e-05, + "loss": 0.08082623481750488, + "mean_token_accuracy": 0.9689972400665283, + "num_tokens": 13353890.0, + "step": 650 + }, + { + "entropy": 0.8596941530704498, + "epoch": 1.2154696132596685, + "grad_norm": 0.7524124383926392, + "learning_rate": 9.861742942946639e-05, + "loss": 0.0789935290813446, + "mean_token_accuracy": 0.9693858206272126, + "num_tokens": 13559475.0, + "step": 660 + }, + { + "entropy": 0.8708749234676361, + "epoch": 1.2338858195211786, + "grad_norm": 0.5777031183242798, + "learning_rate": 9.854540854057337e-05, + "loss": 0.07773642539978028, + "mean_token_accuracy": 0.970385092496872, + "num_tokens": 13765076.0, + "step": 670 + }, + { + "entropy": 0.8651713371276856, + "epoch": 1.2523020257826887, + "grad_norm": 0.7924166321754456, + "learning_rate": 9.847158677536034e-05, + "loss": 0.0766686737537384, + "mean_token_accuracy": 0.9702267110347748, + "num_tokens": 13970642.0, + "step": 680 + }, + { + "entropy": 0.8763024985790253, + "epoch": 1.270718232044199, + "grad_norm": 0.741219162940979, + "learning_rate": 9.839596687237403e-05, + "loss": 0.07189929485321045, + "mean_token_accuracy": 0.9727097094058991, + "num_tokens": 14176556.0, + "step": 690 + }, + { + "entropy": 0.8556921362876893, + "epoch": 1.289134438305709, + "grad_norm": 0.6298198103904724, + "learning_rate": 9.831855163686618e-05, + "loss": 0.07608137726783752, + "mean_token_accuracy": 0.9716399371623993, + "num_tokens": 14381686.0, + "step": 700 + }, + { + "entropy": 0.869178420305252, + "epoch": 1.3075506445672191, + "grad_norm": 0.5850273370742798, + "learning_rate": 9.823934394068952e-05, + "loss": 0.07437651753425598, + "mean_token_accuracy": 0.9709566533565521, + "num_tokens": 14586814.0, + "step": 710 + }, + { + "entropy": 0.8708595156669616, + "epoch": 1.3259668508287292, + "grad_norm": 0.6580632328987122, + "learning_rate": 9.815834672219127e-05, + "loss": 0.07518917322158813, + "mean_token_accuracy": 0.9717426657676697, + "num_tokens": 14792321.0, + "step": 720 + }, + { + "entropy": 0.8826817810535431, + "epoch": 1.3443830570902393, + "grad_norm": 0.8788532018661499, + "learning_rate": 9.807556298610404e-05, + "loss": 0.07579240798950196, + "mean_token_accuracy": 0.9706341981887817, + "num_tokens": 14997810.0, + "step": 730 + }, + { + "entropy": 0.9012470185756684, + "epoch": 1.3627992633517496, + "grad_norm": 0.7022138237953186, + "learning_rate": 9.799099580343441e-05, + "loss": 0.0775588572025299, + "mean_token_accuracy": 0.9699241399765015, + "num_tokens": 15203795.0, + "step": 740 + }, + { + "entropy": 0.886955714225769, + "epoch": 1.3812154696132597, + "grad_norm": 0.7881133556365967, + "learning_rate": 9.790464831134903e-05, + "loss": 0.07125020027160645, + "mean_token_accuracy": 0.9723815560340882, + "num_tokens": 15408974.0, + "step": 750 + }, + { + "entropy": 0.9047374844551086, + "epoch": 1.3996316758747698, + "grad_norm": 0.9082005023956299, + "learning_rate": 9.781652371305824e-05, + "loss": 0.07004334926605224, + "mean_token_accuracy": 0.9725580036640167, + "num_tokens": 15614399.0, + "step": 760 + }, + { + "entropy": 0.9039053857326508, + "epoch": 1.4180478821362799, + "grad_norm": 0.8060817122459412, + "learning_rate": 9.77266252776972e-05, + "loss": 0.07103485465049744, + "mean_token_accuracy": 0.9721468150615692, + "num_tokens": 15819895.0, + "step": 770 + }, + { + "entropy": 0.8998047232627868, + "epoch": 1.43646408839779, + "grad_norm": 1.0152642726898193, + "learning_rate": 9.763495634020467e-05, + "loss": 0.07411704063415528, + "mean_token_accuracy": 0.9711063146591187, + "num_tokens": 16025297.0, + "step": 780 + }, + { + "entropy": 0.9120213568210602, + "epoch": 1.4548802946593002, + "grad_norm": 0.6288319826126099, + "learning_rate": 9.754152030119921e-05, + "loss": 0.07223712205886841, + "mean_token_accuracy": 0.9722476422786712, + "num_tokens": 16230656.0, + "step": 790 + }, + { + "entropy": 0.9142370820045471, + "epoch": 1.4732965009208103, + "grad_norm": 0.7854700088500977, + "learning_rate": 9.744632062685311e-05, + "loss": 0.07186744809150696, + "mean_token_accuracy": 0.972247713804245, + "num_tokens": 16435943.0, + "step": 800 + }, + { + "entropy": 0.8920814216136932, + "epoch": 1.4917127071823204, + "grad_norm": 0.6227074265480042, + "learning_rate": 9.734936084876383e-05, + "loss": 0.07016961574554444, + "mean_token_accuracy": 0.9725603640079499, + "num_tokens": 16641635.0, + "step": 810 + }, + { + "entropy": 0.891328877210617, + "epoch": 1.5101289134438307, + "grad_norm": 0.7601346969604492, + "learning_rate": 9.725064456382283e-05, + "loss": 0.07137494087219239, + "mean_token_accuracy": 0.9722997546195984, + "num_tokens": 16847194.0, + "step": 820 + }, + { + "entropy": 0.8921217978000641, + "epoch": 1.5285451197053406, + "grad_norm": 0.7813850045204163, + "learning_rate": 9.715017543408233e-05, + "loss": 0.06890199184417725, + "mean_token_accuracy": 0.9735044002532959, + "num_tokens": 17052807.0, + "step": 830 + }, + { + "entropy": 0.9085914671421051, + "epoch": 1.5469613259668509, + "grad_norm": 0.6184289455413818, + "learning_rate": 9.704795718661939e-05, + "loss": 0.07043765187263488, + "mean_token_accuracy": 0.9725716531276702, + "num_tokens": 17258284.0, + "step": 840 + }, + { + "entropy": 0.9029861629009247, + "epoch": 1.565377532228361, + "grad_norm": 0.7082377076148987, + "learning_rate": 9.694399361339752e-05, + "loss": 0.07113839387893676, + "mean_token_accuracy": 0.9725669205188752, + "num_tokens": 17464326.0, + "step": 850 + }, + { + "entropy": 0.8856533527374267, + "epoch": 1.583793738489871, + "grad_norm": 0.7409216165542603, + "learning_rate": 9.683828857112627e-05, + "loss": 0.07077333331108093, + "mean_token_accuracy": 0.9731084644794464, + "num_tokens": 17669537.0, + "step": 860 + }, + { + "entropy": 0.8613030433654785, + "epoch": 1.6022099447513813, + "grad_norm": 0.6801561713218689, + "learning_rate": 9.673084598111789e-05, + "loss": 0.06885308027267456, + "mean_token_accuracy": 0.97266526222229, + "num_tokens": 17875289.0, + "step": 870 + }, + { + "entropy": 0.8692965865135193, + "epoch": 1.6206261510128912, + "grad_norm": 1.1621277332305908, + "learning_rate": 9.662166982914203e-05, + "loss": 0.07017780542373657, + "mean_token_accuracy": 0.9733059942722321, + "num_tokens": 18080404.0, + "step": 880 + }, + { + "entropy": 0.8671502113342285, + "epoch": 1.6390423572744015, + "grad_norm": 0.7518903613090515, + "learning_rate": 9.651076416527787e-05, + "loss": 0.06977018713951111, + "mean_token_accuracy": 0.9730017304420471, + "num_tokens": 18285699.0, + "step": 890 + }, + { + "entropy": 0.8662045657634735, + "epoch": 1.6574585635359116, + "grad_norm": 0.6622698903083801, + "learning_rate": 9.639813310376378e-05, + "loss": 0.06620995998382569, + "mean_token_accuracy": 0.9737491130828857, + "num_tokens": 18491097.0, + "step": 900 + }, + { + "entropy": 0.8548173069953918, + "epoch": 1.6758747697974217, + "grad_norm": 0.8941843509674072, + "learning_rate": 9.628378082284479e-05, + "loss": 0.06711119413375854, + "mean_token_accuracy": 0.9740589797496796, + "num_tokens": 18696827.0, + "step": 910 + }, + { + "entropy": 0.8763562262058258, + "epoch": 1.694290976058932, + "grad_norm": 0.7571700215339661, + "learning_rate": 9.616771156461755e-05, + "loss": 0.07263468503952027, + "mean_token_accuracy": 0.9717419981956482, + "num_tokens": 18902513.0, + "step": 920 + }, + { + "entropy": 0.8663733780384064, + "epoch": 1.7127071823204418, + "grad_norm": 0.7886489629745483, + "learning_rate": 9.604992963487298e-05, + "loss": 0.07074605226516724, + "mean_token_accuracy": 0.9724965393543243, + "num_tokens": 19107812.0, + "step": 930 + }, + { + "entropy": 0.8673004627227783, + "epoch": 1.7311233885819521, + "grad_norm": 0.8180726170539856, + "learning_rate": 9.593043940293647e-05, + "loss": 0.06831735372543335, + "mean_token_accuracy": 0.9733696818351746, + "num_tokens": 19313330.0, + "step": 940 + }, + { + "entropy": 0.8525971233844757, + "epoch": 1.7495395948434622, + "grad_norm": 0.6576228737831116, + "learning_rate": 9.580924530150595e-05, + "loss": 0.06567002534866333, + "mean_token_accuracy": 0.9745754361152649, + "num_tokens": 19518671.0, + "step": 950 + }, + { + "entropy": 0.8605451703071594, + "epoch": 1.7679558011049723, + "grad_norm": 0.7171661257743835, + "learning_rate": 9.568635182648725e-05, + "loss": 0.06872050762176514, + "mean_token_accuracy": 0.9732091546058654, + "num_tokens": 19724135.0, + "step": 960 + }, + { + "entropy": 0.8642210960388184, + "epoch": 1.7863720073664826, + "grad_norm": 0.7603147029876709, + "learning_rate": 9.556176353682746e-05, + "loss": 0.06766576766967773, + "mean_token_accuracy": 0.9728681743144989, + "num_tokens": 19928785.0, + "step": 970 + }, + { + "entropy": 0.8543185651302337, + "epoch": 1.8047882136279927, + "grad_norm": 0.7280875444412231, + "learning_rate": 9.543548505434581e-05, + "loss": 0.06851862668991089, + "mean_token_accuracy": 0.9737437188625335, + "num_tokens": 20134195.0, + "step": 980 + }, + { + "entropy": 0.8744745373725891, + "epoch": 1.8232044198895028, + "grad_norm": 0.5897248983383179, + "learning_rate": 9.530752106356209e-05, + "loss": 0.06809053421020508, + "mean_token_accuracy": 0.9733593761920929, + "num_tokens": 20339517.0, + "step": 990 + }, + { + "entropy": 0.8623859465122223, + "epoch": 1.8416206261510129, + "grad_norm": 0.7515265345573425, + "learning_rate": 9.517787631152298e-05, + "loss": 0.07257847785949707, + "mean_token_accuracy": 0.9714054942131043, + "num_tokens": 20545249.0, + "step": 1000 + }, + { + "entropy": 0.8669404804706573, + "epoch": 1.860036832412523, + "grad_norm": 0.7144560813903809, + "learning_rate": 9.504655560762596e-05, + "loss": 0.06832354068756104, + "mean_token_accuracy": 0.9735779523849487, + "num_tokens": 20750507.0, + "step": 1010 + }, + { + "entropy": 0.8493516445159912, + "epoch": 1.8784530386740332, + "grad_norm": 0.6559189558029175, + "learning_rate": 9.491356382344081e-05, + "loss": 0.0629766047000885, + "mean_token_accuracy": 0.9754977762699127, + "num_tokens": 20955956.0, + "step": 1020 + }, + { + "entropy": 0.8599376022815705, + "epoch": 1.8968692449355433, + "grad_norm": 0.6792973279953003, + "learning_rate": 9.477890589252895e-05, + "loss": 0.0666757881641388, + "mean_token_accuracy": 0.974083811044693, + "num_tokens": 21161163.0, + "step": 1030 + }, + { + "entropy": 0.8458438158035279, + "epoch": 1.9152854511970534, + "grad_norm": 0.6941778659820557, + "learning_rate": 9.464258681026042e-05, + "loss": 0.06307152509689332, + "mean_token_accuracy": 0.9757042229175568, + "num_tokens": 21366525.0, + "step": 1040 + }, + { + "entropy": 0.848515909910202, + "epoch": 1.9337016574585635, + "grad_norm": 0.7307806611061096, + "learning_rate": 9.450461163362855e-05, + "loss": 0.06307026147842407, + "mean_token_accuracy": 0.9750974595546722, + "num_tokens": 21572238.0, + "step": 1050 + }, + { + "entropy": 0.8563454031944275, + "epoch": 1.9521178637200736, + "grad_norm": 0.7222106456756592, + "learning_rate": 9.436498548106236e-05, + "loss": 0.0647726058959961, + "mean_token_accuracy": 0.974629694223404, + "num_tokens": 21777633.0, + "step": 1060 + }, + { + "entropy": 0.8656457483768463, + "epoch": 1.9705340699815839, + "grad_norm": 0.67178875207901, + "learning_rate": 9.422371353223674e-05, + "loss": 0.06573554277420043, + "mean_token_accuracy": 0.9745908617973328, + "num_tokens": 21983116.0, + "step": 1070 + }, + { + "entropy": 0.8630891263484954, + "epoch": 1.988950276243094, + "grad_norm": 0.6956593990325928, + "learning_rate": 9.408080102788016e-05, + "loss": 0.06630704402923585, + "mean_token_accuracy": 0.9741333484649658, + "num_tokens": 22188662.0, + "step": 1080 + }, + { + "epoch": 2.0, + "eval_entropy": 0.8560857042022373, + "eval_loss": 0.06494329869747162, + "eval_mean_token_accuracy": 0.9745692672936813, + "eval_num_tokens": 22311800.0, + "eval_runtime": 10.129, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 11.354, + "step": 1086 + }, + { + "entropy": 0.8616272270679474, + "epoch": 2.007366482504604, + "grad_norm": 0.7778105139732361, + "learning_rate": 9.393625326958041e-05, + "loss": 0.054407155513763426, + "mean_token_accuracy": 0.9792074799537659, + "num_tokens": 22394215.0, + "step": 1090 + }, + { + "entropy": 0.8496910452842712, + "epoch": 2.0257826887661143, + "grad_norm": 0.7422528266906738, + "learning_rate": 9.379007561958792e-05, + "loss": 0.051881587505340575, + "mean_token_accuracy": 0.9799090325832367, + "num_tokens": 22599599.0, + "step": 1100 + }, + { + "entropy": 0.8531602442264556, + "epoch": 2.044198895027624, + "grad_norm": 0.9075332880020142, + "learning_rate": 9.36422735006167e-05, + "loss": 0.05190724730491638, + "mean_token_accuracy": 0.979931116104126, + "num_tokens": 22805318.0, + "step": 1110 + }, + { + "entropy": 0.8657277703285218, + "epoch": 2.0626151012891345, + "grad_norm": 0.9466913938522339, + "learning_rate": 9.349285239564325e-05, + "loss": 0.053853434324264524, + "mean_token_accuracy": 0.9796103596687317, + "num_tokens": 23010438.0, + "step": 1120 + }, + { + "entropy": 0.8578485429286957, + "epoch": 2.0810313075506444, + "grad_norm": 0.6903054714202881, + "learning_rate": 9.334181784770326e-05, + "loss": 0.05228850841522217, + "mean_token_accuracy": 0.9802409887313843, + "num_tokens": 23215795.0, + "step": 1130 + }, + { + "entropy": 0.8450767934322357, + "epoch": 2.0994475138121547, + "grad_norm": 0.6615211367607117, + "learning_rate": 9.318917545968581e-05, + "loss": 0.050570905208587646, + "mean_token_accuracy": 0.9802053451538086, + "num_tokens": 23421157.0, + "step": 1140 + }, + { + "entropy": 0.8325044393539429, + "epoch": 2.117863720073665, + "grad_norm": 0.760960578918457, + "learning_rate": 9.303493089412564e-05, + "loss": 0.051966112852096555, + "mean_token_accuracy": 0.9796205997467041, + "num_tokens": 23626584.0, + "step": 1150 + }, + { + "entropy": 0.8416404843330383, + "epoch": 2.136279926335175, + "grad_norm": 0.6947009563446045, + "learning_rate": 9.287908987299306e-05, + "loss": 0.05144861936569214, + "mean_token_accuracy": 0.9800034642219544, + "num_tokens": 23832137.0, + "step": 1160 + }, + { + "entropy": 0.8564540028572083, + "epoch": 2.154696132596685, + "grad_norm": 0.733252763748169, + "learning_rate": 9.272165817748164e-05, + "loss": 0.04944799542427063, + "mean_token_accuracy": 0.9808157980442047, + "num_tokens": 24038006.0, + "step": 1170 + }, + { + "entropy": 0.8575525343418121, + "epoch": 2.1731123388581954, + "grad_norm": 0.8911028504371643, + "learning_rate": 9.25626416477938e-05, + "loss": 0.05037952661514282, + "mean_token_accuracy": 0.980946284532547, + "num_tokens": 24243374.0, + "step": 1180 + }, + { + "entropy": 0.8599720418453216, + "epoch": 2.1915285451197053, + "grad_norm": 0.7713524103164673, + "learning_rate": 9.240204618292416e-05, + "loss": 0.050603735446929934, + "mean_token_accuracy": 0.980896121263504, + "num_tokens": 24448585.0, + "step": 1190 + }, + { + "entropy": 0.8566664934158326, + "epoch": 2.2099447513812156, + "grad_norm": 0.8439353704452515, + "learning_rate": 9.223987774044066e-05, + "loss": 0.054171699285507205, + "mean_token_accuracy": 0.9796543836593627, + "num_tokens": 24653863.0, + "step": 1200 + }, + { + "entropy": 0.846601277589798, + "epoch": 2.2283609576427255, + "grad_norm": 0.7025637030601501, + "learning_rate": 9.207614233626356e-05, + "loss": 0.048924127221107484, + "mean_token_accuracy": 0.9809681415557862, + "num_tokens": 24859801.0, + "step": 1210 + }, + { + "entropy": 0.8564423739910125, + "epoch": 2.2467771639042358, + "grad_norm": 0.7788274884223938, + "learning_rate": 9.191084604444233e-05, + "loss": 0.05260283350944519, + "mean_token_accuracy": 0.9793797850608825, + "num_tokens": 25065368.0, + "step": 1220 + }, + { + "entropy": 0.865056723356247, + "epoch": 2.265193370165746, + "grad_norm": 0.8728818297386169, + "learning_rate": 9.174399499693027e-05, + "loss": 0.05016371011734009, + "mean_token_accuracy": 0.9807134211063385, + "num_tokens": 25270945.0, + "step": 1230 + }, + { + "entropy": 0.8642262935638427, + "epoch": 2.283609576427256, + "grad_norm": 1.0582489967346191, + "learning_rate": 9.157559538335703e-05, + "loss": 0.05316779017448425, + "mean_token_accuracy": 0.9794209063053131, + "num_tokens": 25476575.0, + "step": 1240 + }, + { + "entropy": 0.8677761554718018, + "epoch": 2.3020257826887662, + "grad_norm": 0.760109543800354, + "learning_rate": 9.140565345079901e-05, + "loss": 0.05115479230880737, + "mean_token_accuracy": 0.9802310705184937, + "num_tokens": 25682814.0, + "step": 1250 + }, + { + "entropy": 0.8592945456504821, + "epoch": 2.320441988950276, + "grad_norm": 0.6537907123565674, + "learning_rate": 9.123417550354761e-05, + "loss": 0.050543540716171266, + "mean_token_accuracy": 0.9806945025920868, + "num_tokens": 25887575.0, + "step": 1260 + }, + { + "entropy": 0.8692500293254852, + "epoch": 2.3388581952117864, + "grad_norm": 0.7771905064582825, + "learning_rate": 9.106116790287541e-05, + "loss": 0.049718713760375975, + "mean_token_accuracy": 0.9805168390274048, + "num_tokens": 26092950.0, + "step": 1270 + }, + { + "entropy": 0.8841261565685272, + "epoch": 2.3572744014732967, + "grad_norm": 0.7791076898574829, + "learning_rate": 9.08866370668001e-05, + "loss": 0.0527400553226471, + "mean_token_accuracy": 0.9796754539012908, + "num_tokens": 26298182.0, + "step": 1280 + }, + { + "entropy": 0.8675022900104523, + "epoch": 2.3756906077348066, + "grad_norm": 0.8481605648994446, + "learning_rate": 9.07105894698464e-05, + "loss": 0.05320838689804077, + "mean_token_accuracy": 0.9792274832725525, + "num_tokens": 26503425.0, + "step": 1290 + }, + { + "entropy": 0.8704026222229004, + "epoch": 2.394106813996317, + "grad_norm": 0.8235505819320679, + "learning_rate": 9.053303164280602e-05, + "loss": 0.055045205354690555, + "mean_token_accuracy": 0.9788750648498535, + "num_tokens": 26708755.0, + "step": 1300 + }, + { + "entropy": 0.8525134027004242, + "epoch": 2.4125230202578267, + "grad_norm": 0.7611598968505859, + "learning_rate": 9.035397017249518e-05, + "loss": 0.05029621124267578, + "mean_token_accuracy": 0.9802757322788238, + "num_tokens": 26914704.0, + "step": 1310 + }, + { + "entropy": 0.8630305290222168, + "epoch": 2.430939226519337, + "grad_norm": 0.790408194065094, + "learning_rate": 9.017341170151041e-05, + "loss": 0.04856040775775909, + "mean_token_accuracy": 0.9809690833091735, + "num_tokens": 27120151.0, + "step": 1320 + }, + { + "entropy": 0.8579159140586853, + "epoch": 2.4493554327808473, + "grad_norm": 0.781972348690033, + "learning_rate": 8.999136292798207e-05, + "loss": 0.04869682788848877, + "mean_token_accuracy": 0.9816130697727203, + "num_tokens": 27325673.0, + "step": 1330 + }, + { + "entropy": 0.8634716987609863, + "epoch": 2.467771639042357, + "grad_norm": 0.8500784039497375, + "learning_rate": 8.980783060532588e-05, + "loss": 0.05050289034843445, + "mean_token_accuracy": 0.980079609155655, + "num_tokens": 27531270.0, + "step": 1340 + }, + { + "entropy": 0.8660618126392364, + "epoch": 2.4861878453038675, + "grad_norm": 0.719760537147522, + "learning_rate": 8.96228215419924e-05, + "loss": 0.04892141819000244, + "mean_token_accuracy": 0.9814020991325378, + "num_tokens": 27736542.0, + "step": 1350 + }, + { + "entropy": 0.8572284400463104, + "epoch": 2.5046040515653774, + "grad_norm": 1.0197229385375977, + "learning_rate": 8.943634260121442e-05, + "loss": 0.05104702711105347, + "mean_token_accuracy": 0.9798846662044525, + "num_tokens": 27941566.0, + "step": 1360 + }, + { + "entropy": 0.8702241241931915, + "epoch": 2.5230202578268877, + "grad_norm": 0.7136003375053406, + "learning_rate": 8.924840070075247e-05, + "loss": 0.04855787754058838, + "mean_token_accuracy": 0.9811685383319855, + "num_tokens": 28146943.0, + "step": 1370 + }, + { + "entropy": 0.874957013130188, + "epoch": 2.541436464088398, + "grad_norm": 0.8775497674942017, + "learning_rate": 8.905900281263804e-05, + "loss": 0.052434295415878296, + "mean_token_accuracy": 0.9795438170433044, + "num_tokens": 28352640.0, + "step": 1380 + }, + { + "entropy": 0.8776536166667939, + "epoch": 2.559852670349908, + "grad_norm": 0.8895741105079651, + "learning_rate": 8.8868155962915e-05, + "loss": 0.05282890796661377, + "mean_token_accuracy": 0.9790538609027862, + "num_tokens": 28558153.0, + "step": 1390 + }, + { + "entropy": 0.8738743245601654, + "epoch": 2.578268876611418, + "grad_norm": 0.788800060749054, + "learning_rate": 8.867586723137906e-05, + "loss": 0.048841872811317445, + "mean_token_accuracy": 0.9809149026870727, + "num_tokens": 28763613.0, + "step": 1400 + }, + { + "entropy": 0.8750253796577454, + "epoch": 2.596685082872928, + "grad_norm": 0.8738002777099609, + "learning_rate": 8.848214375131497e-05, + "loss": 0.048261132836341855, + "mean_token_accuracy": 0.980789190530777, + "num_tokens": 28969248.0, + "step": 1410 + }, + { + "entropy": 0.8624245524406433, + "epoch": 2.6151012891344383, + "grad_norm": 0.6404895186424255, + "learning_rate": 8.828699270923196e-05, + "loss": 0.04970468282699585, + "mean_token_accuracy": 0.9807762265205383, + "num_tokens": 29174779.0, + "step": 1420 + }, + { + "entropy": 0.8792938470840455, + "epoch": 2.6335174953959486, + "grad_norm": 0.7856965661048889, + "learning_rate": 8.80904213445972e-05, + "loss": 0.053334391117095946, + "mean_token_accuracy": 0.9790222108364105, + "num_tokens": 29380474.0, + "step": 1430 + }, + { + "entropy": 0.8831034600734711, + "epoch": 2.6519337016574585, + "grad_norm": 0.7739618420600891, + "learning_rate": 8.789243694956716e-05, + "loss": 0.04959054589271546, + "mean_token_accuracy": 0.9803965091705322, + "num_tokens": 29585985.0, + "step": 1440 + }, + { + "entropy": 0.8934672951698304, + "epoch": 2.6703499079189688, + "grad_norm": 0.6999697089195251, + "learning_rate": 8.769304686871719e-05, + "loss": 0.05165250301361084, + "mean_token_accuracy": 0.9798884153366089, + "num_tokens": 29791238.0, + "step": 1450 + }, + { + "entropy": 0.9053199410438537, + "epoch": 2.6887661141804786, + "grad_norm": 0.9199564456939697, + "learning_rate": 8.749225849876892e-05, + "loss": 0.04924143850803375, + "mean_token_accuracy": 0.9810785710811615, + "num_tokens": 29996589.0, + "step": 1460 + }, + { + "entropy": 0.888091403245926, + "epoch": 2.707182320441989, + "grad_norm": 0.7480106353759766, + "learning_rate": 8.729007928831597e-05, + "loss": 0.04948916733264923, + "mean_token_accuracy": 0.9809579730033875, + "num_tokens": 30201875.0, + "step": 1470 + }, + { + "entropy": 0.8723407983779907, + "epoch": 2.7255985267034992, + "grad_norm": 0.9506945013999939, + "learning_rate": 8.708651673754763e-05, + "loss": 0.048927539587020875, + "mean_token_accuracy": 0.980553150177002, + "num_tokens": 30407550.0, + "step": 1480 + }, + { + "entropy": 0.8737521529197693, + "epoch": 2.744014732965009, + "grad_norm": 0.8015706539154053, + "learning_rate": 8.688157839797062e-05, + "loss": 0.04963063597679138, + "mean_token_accuracy": 0.9809738755226135, + "num_tokens": 30612839.0, + "step": 1490 + }, + { + "entropy": 0.8800762951374054, + "epoch": 2.7624309392265194, + "grad_norm": 0.9429986476898193, + "learning_rate": 8.667527187212885e-05, + "loss": 0.0524174690246582, + "mean_token_accuracy": 0.9788767337799072, + "num_tokens": 30818578.0, + "step": 1500 + }, + { + "entropy": 0.8871055901050567, + "epoch": 2.7808471454880292, + "grad_norm": 0.5909196138381958, + "learning_rate": 8.646760481332157e-05, + "loss": 0.05166680812835693, + "mean_token_accuracy": 0.980216771364212, + "num_tokens": 31023829.0, + "step": 1510 + }, + { + "entropy": 0.8908755779266357, + "epoch": 2.7992633517495396, + "grad_norm": 0.9154611229896545, + "learning_rate": 8.625858492531931e-05, + "loss": 0.04951836466789246, + "mean_token_accuracy": 0.9801484227180481, + "num_tokens": 31229635.0, + "step": 1520 + }, + { + "entropy": 0.92480548620224, + "epoch": 2.81767955801105, + "grad_norm": 0.5989938378334045, + "learning_rate": 8.604821996207819e-05, + "loss": 0.04799881279468536, + "mean_token_accuracy": 0.9817522585391998, + "num_tokens": 31435456.0, + "step": 1530 + }, + { + "entropy": 0.9173881888389588, + "epoch": 2.8360957642725597, + "grad_norm": 0.899413526058197, + "learning_rate": 8.58365177274522e-05, + "loss": 0.0487445592880249, + "mean_token_accuracy": 0.9812625288963318, + "num_tokens": 31640904.0, + "step": 1540 + }, + { + "entropy": 0.9076135993003845, + "epoch": 2.85451197053407, + "grad_norm": 0.8494166135787964, + "learning_rate": 8.562348607490376e-05, + "loss": 0.05005228519439697, + "mean_token_accuracy": 0.9806681036949157, + "num_tokens": 31845807.0, + "step": 1550 + }, + { + "entropy": 0.9092245221138, + "epoch": 2.87292817679558, + "grad_norm": 0.8225123286247253, + "learning_rate": 8.540913290721234e-05, + "loss": 0.048654764890670776, + "mean_token_accuracy": 0.9805659353733063, + "num_tokens": 32051523.0, + "step": 1560 + }, + { + "entropy": 0.9062779664993286, + "epoch": 2.89134438305709, + "grad_norm": 0.7074014544487, + "learning_rate": 8.519346617618134e-05, + "loss": 0.049209845066070554, + "mean_token_accuracy": 0.9807434439659118, + "num_tokens": 32256895.0, + "step": 1570 + }, + { + "entropy": 0.9190246641635895, + "epoch": 2.9097605893186005, + "grad_norm": 0.8860642910003662, + "learning_rate": 8.497649388234304e-05, + "loss": 0.051211881637573245, + "mean_token_accuracy": 0.9802342295646668, + "num_tokens": 32462031.0, + "step": 1580 + }, + { + "entropy": 0.9088015079498291, + "epoch": 2.9281767955801103, + "grad_norm": 0.8062726855278015, + "learning_rate": 8.475822407466188e-05, + "loss": 0.053512704372406, + "mean_token_accuracy": 0.979486483335495, + "num_tokens": 32667533.0, + "step": 1590 + }, + { + "entropy": 0.9462027847766876, + "epoch": 2.9465930018416207, + "grad_norm": 0.7962909936904907, + "learning_rate": 8.453866485023579e-05, + "loss": 0.0501457154750824, + "mean_token_accuracy": 0.9803222417831421, + "num_tokens": 32872900.0, + "step": 1600 + }, + { + "entropy": 0.9671471297740937, + "epoch": 2.9650092081031305, + "grad_norm": 0.7641744017601013, + "learning_rate": 8.431782435399587e-05, + "loss": 0.04629061222076416, + "mean_token_accuracy": 0.9823175370693207, + "num_tokens": 33077850.0, + "step": 1610 + }, + { + "entropy": 0.955865204334259, + "epoch": 2.983425414364641, + "grad_norm": 0.6772348880767822, + "learning_rate": 8.409571077840426e-05, + "loss": 0.048368623852729796, + "mean_token_accuracy": 0.9808700799942016, + "num_tokens": 33283117.0, + "step": 1620 + }, + { + "epoch": 3.0, + "eval_entropy": 0.9563225186389426, + "eval_loss": 0.059064481407403946, + "eval_mean_token_accuracy": 0.9773589429648026, + "eval_num_tokens": 33467712.0, + "eval_runtime": 10.1471, + "eval_samples_per_second": 360.499, + "eval_steps_per_second": 11.333, + "step": 1629 + }, + { + "entropy": 0.9337226033210755, + "epoch": 3.001841620626151, + "grad_norm": 0.646203875541687, + "learning_rate": 8.387233236315016e-05, + "loss": 0.043352216482162476, + "mean_token_accuracy": 0.9830620110034942, + "num_tokens": 33488302.0, + "step": 1630 + }, + { + "entropy": 0.9734923839569092, + "epoch": 3.020257826887661, + "grad_norm": 0.7564226984977722, + "learning_rate": 8.364769739484416e-05, + "loss": 0.033932483196258544, + "mean_token_accuracy": 0.9872806966304779, + "num_tokens": 33693531.0, + "step": 1640 + }, + { + "entropy": 0.9669206500053406, + "epoch": 3.0386740331491713, + "grad_norm": 0.7126886248588562, + "learning_rate": 8.342181420671096e-05, + "loss": 0.03818287253379822, + "mean_token_accuracy": 0.9852082908153534, + "num_tokens": 33899305.0, + "step": 1650 + }, + { + "entropy": 0.9522916138172149, + "epoch": 3.0570902394106816, + "grad_norm": 1.0571653842926025, + "learning_rate": 8.319469117828007e-05, + "loss": 0.03456039130687714, + "mean_token_accuracy": 0.9867027878761292, + "num_tokens": 34104585.0, + "step": 1660 + }, + { + "entropy": 0.9568560004234314, + "epoch": 3.0755064456721914, + "grad_norm": 0.780940592288971, + "learning_rate": 8.296633673507505e-05, + "loss": 0.03551802039146423, + "mean_token_accuracy": 0.9867531359195709, + "num_tokens": 34309516.0, + "step": 1670 + }, + { + "entropy": 0.9590656876564025, + "epoch": 3.0939226519337018, + "grad_norm": 0.8330219388008118, + "learning_rate": 8.273675934830094e-05, + "loss": 0.03674865961074829, + "mean_token_accuracy": 0.9864118576049805, + "num_tokens": 34515170.0, + "step": 1680 + }, + { + "entropy": 0.975881814956665, + "epoch": 3.1123388581952116, + "grad_norm": 0.7010637521743774, + "learning_rate": 8.250596753453e-05, + "loss": 0.03550414443016052, + "mean_token_accuracy": 0.9864102602005005, + "num_tokens": 34720896.0, + "step": 1690 + }, + { + "entropy": 0.9599562883377075, + "epoch": 3.130755064456722, + "grad_norm": 0.6694278717041016, + "learning_rate": 8.227396985538578e-05, + "loss": 0.035564273595809937, + "mean_token_accuracy": 0.9867321848869324, + "num_tokens": 34925970.0, + "step": 1700 + }, + { + "entropy": 0.9582216143608093, + "epoch": 3.149171270718232, + "grad_norm": 0.9333199262619019, + "learning_rate": 8.204077491722546e-05, + "loss": 0.035575729608535764, + "mean_token_accuracy": 0.9862452208995819, + "num_tokens": 35131543.0, + "step": 1710 + }, + { + "entropy": 0.9579678058624268, + "epoch": 3.167587476979742, + "grad_norm": 0.9450218081474304, + "learning_rate": 8.180639137082066e-05, + "loss": 0.0385298490524292, + "mean_token_accuracy": 0.98538036942482, + "num_tokens": 35336790.0, + "step": 1720 + }, + { + "entropy": 0.9640831351280212, + "epoch": 3.1860036832412524, + "grad_norm": 0.8551534414291382, + "learning_rate": 8.157082791103649e-05, + "loss": 0.03702138364315033, + "mean_token_accuracy": 0.9852015495300293, + "num_tokens": 35542294.0, + "step": 1730 + }, + { + "entropy": 0.9867071211338043, + "epoch": 3.2044198895027622, + "grad_norm": 0.7138128876686096, + "learning_rate": 8.133409327650897e-05, + "loss": 0.035626694560050964, + "mean_token_accuracy": 0.986064875125885, + "num_tokens": 35747447.0, + "step": 1740 + }, + { + "entropy": 0.9639089345932007, + "epoch": 3.2228360957642725, + "grad_norm": 0.7131415009498596, + "learning_rate": 8.109619624932092e-05, + "loss": 0.035885071754455565, + "mean_token_accuracy": 0.986273056268692, + "num_tokens": 35952258.0, + "step": 1750 + }, + { + "entropy": 0.9516046345233917, + "epoch": 3.241252302025783, + "grad_norm": 0.6900200843811035, + "learning_rate": 8.085714565467611e-05, + "loss": 0.03535219430923462, + "mean_token_accuracy": 0.985836285352707, + "num_tokens": 36157938.0, + "step": 1760 + }, + { + "entropy": 0.9373646557331086, + "epoch": 3.2596685082872927, + "grad_norm": 0.6101690530776978, + "learning_rate": 8.061695036057191e-05, + "loss": 0.034940996766090394, + "mean_token_accuracy": 0.9863743901252746, + "num_tokens": 36363825.0, + "step": 1770 + }, + { + "entropy": 0.9444344758987426, + "epoch": 3.278084714548803, + "grad_norm": 0.7518529295921326, + "learning_rate": 8.03756192774703e-05, + "loss": 0.03404279053211212, + "mean_token_accuracy": 0.9866396844387054, + "num_tokens": 36568961.0, + "step": 1780 + }, + { + "entropy": 0.9550357758998871, + "epoch": 3.2965009208103133, + "grad_norm": 0.7687555551528931, + "learning_rate": 8.013316135796734e-05, + "loss": 0.038447052240371704, + "mean_token_accuracy": 0.985325163602829, + "num_tokens": 36774514.0, + "step": 1790 + }, + { + "entropy": 0.9477231681346894, + "epoch": 3.314917127071823, + "grad_norm": 0.7521633505821228, + "learning_rate": 7.988958559646102e-05, + "loss": 0.03746694028377533, + "mean_token_accuracy": 0.9853165090084076, + "num_tokens": 36979660.0, + "step": 1800 + }, + { + "entropy": 0.925805002450943, + "epoch": 3.3333333333333335, + "grad_norm": 0.9333297610282898, + "learning_rate": 7.964490102881768e-05, + "loss": 0.03700103759765625, + "mean_token_accuracy": 0.9850880861282348, + "num_tokens": 37185191.0, + "step": 1810 + }, + { + "entropy": 0.9225482225418091, + "epoch": 3.3517495395948433, + "grad_norm": 0.7928622961044312, + "learning_rate": 7.939911673203665e-05, + "loss": 0.03825801610946655, + "mean_token_accuracy": 0.9850241422653199, + "num_tokens": 37390749.0, + "step": 1820 + }, + { + "entropy": 0.9597147881984711, + "epoch": 3.3701657458563536, + "grad_norm": 0.7658583521842957, + "learning_rate": 7.915224182391375e-05, + "loss": 0.039855146408081056, + "mean_token_accuracy": 0.9845879554748536, + "num_tokens": 37596052.0, + "step": 1830 + }, + { + "entropy": 0.9485619068145752, + "epoch": 3.388581952117864, + "grad_norm": 0.8492130637168884, + "learning_rate": 7.890428546270278e-05, + "loss": 0.039359599351882935, + "mean_token_accuracy": 0.9847265422344208, + "num_tokens": 37802063.0, + "step": 1840 + }, + { + "entropy": 0.9670301914215088, + "epoch": 3.406998158379374, + "grad_norm": 0.7527599930763245, + "learning_rate": 7.865525684677608e-05, + "loss": 0.03752985596656799, + "mean_token_accuracy": 0.9855137526988983, + "num_tokens": 38007432.0, + "step": 1850 + }, + { + "entropy": 0.9681244969367981, + "epoch": 3.425414364640884, + "grad_norm": 0.7599612474441528, + "learning_rate": 7.840516521428303e-05, + "loss": 0.03653894364833832, + "mean_token_accuracy": 0.9858933389186859, + "num_tokens": 38212923.0, + "step": 1860 + }, + { + "entropy": 0.9706049561500549, + "epoch": 3.443830570902394, + "grad_norm": 0.7678127884864807, + "learning_rate": 7.815401984280748e-05, + "loss": 0.0366938978433609, + "mean_token_accuracy": 0.9854713797569274, + "num_tokens": 38418422.0, + "step": 1870 + }, + { + "entropy": 0.9637093842029572, + "epoch": 3.4622467771639043, + "grad_norm": 0.762824535369873, + "learning_rate": 7.790183004902359e-05, + "loss": 0.03516915142536163, + "mean_token_accuracy": 0.9866003453731537, + "num_tokens": 38624389.0, + "step": 1880 + }, + { + "entropy": 0.9373565018177032, + "epoch": 3.4806629834254146, + "grad_norm": 0.8221780061721802, + "learning_rate": 7.764860518835014e-05, + "loss": 0.04049026966094971, + "mean_token_accuracy": 0.984089481830597, + "num_tokens": 38829654.0, + "step": 1890 + }, + { + "entropy": 0.9356025457382202, + "epoch": 3.4990791896869244, + "grad_norm": 0.7583426237106323, + "learning_rate": 7.739435465460356e-05, + "loss": 0.03658481240272522, + "mean_token_accuracy": 0.9857318818569183, + "num_tokens": 39034638.0, + "step": 1900 + }, + { + "entropy": 0.9740163326263428, + "epoch": 3.5174953959484347, + "grad_norm": 0.7332878112792969, + "learning_rate": 7.713908787964937e-05, + "loss": 0.03508963882923126, + "mean_token_accuracy": 0.9863419532775879, + "num_tokens": 39240265.0, + "step": 1910 + }, + { + "entropy": 0.9528286933898926, + "epoch": 3.5359116022099446, + "grad_norm": 0.6515451669692993, + "learning_rate": 7.688281433305233e-05, + "loss": 0.036055779457092284, + "mean_token_accuracy": 0.9860979080200195, + "num_tokens": 39445546.0, + "step": 1920 + }, + { + "entropy": 0.9480705261230469, + "epoch": 3.554327808471455, + "grad_norm": 0.7725827097892761, + "learning_rate": 7.662554352172515e-05, + "loss": 0.037101513147354125, + "mean_token_accuracy": 0.985782790184021, + "num_tokens": 39651078.0, + "step": 1930 + }, + { + "entropy": 0.9655321061611175, + "epoch": 3.572744014732965, + "grad_norm": 0.7756506204605103, + "learning_rate": 7.636728498957581e-05, + "loss": 0.03721855878829956, + "mean_token_accuracy": 0.9857951939105988, + "num_tokens": 39856542.0, + "step": 1940 + }, + { + "entropy": 0.9772682309150695, + "epoch": 3.591160220994475, + "grad_norm": 0.9084987640380859, + "learning_rate": 7.610804831715355e-05, + "loss": 0.03570749163627625, + "mean_token_accuracy": 0.9863450109958649, + "num_tokens": 40061913.0, + "step": 1950 + }, + { + "entropy": 0.9579685389995575, + "epoch": 3.6095764272559854, + "grad_norm": 0.6358487606048584, + "learning_rate": 7.584784312129334e-05, + "loss": 0.038210684061050416, + "mean_token_accuracy": 0.9850837290287018, + "num_tokens": 40267398.0, + "step": 1960 + }, + { + "entropy": 0.9605201721191406, + "epoch": 3.6279926335174952, + "grad_norm": 0.6263149976730347, + "learning_rate": 7.558667905475927e-05, + "loss": 0.03509160876274109, + "mean_token_accuracy": 0.9868143379688263, + "num_tokens": 40472827.0, + "step": 1970 + }, + { + "entropy": 0.964026153087616, + "epoch": 3.6464088397790055, + "grad_norm": 0.90068119764328, + "learning_rate": 7.532456580588638e-05, + "loss": 0.036211782693862916, + "mean_token_accuracy": 0.9858468770980835, + "num_tokens": 40677935.0, + "step": 1980 + }, + { + "entropy": 0.9494135618209839, + "epoch": 3.664825046040516, + "grad_norm": 0.760134756565094, + "learning_rate": 7.50615130982213e-05, + "loss": 0.03786201477050781, + "mean_token_accuracy": 0.9852500438690186, + "num_tokens": 40883750.0, + "step": 1990 + }, + { + "entropy": 0.9527071297168732, + "epoch": 3.6832412523020257, + "grad_norm": 0.9812107682228088, + "learning_rate": 7.479753069016152e-05, + "loss": 0.03803159594535828, + "mean_token_accuracy": 0.9852405369281769, + "num_tokens": 41089115.0, + "step": 2000 + }, + { + "entropy": 0.9639330863952636, + "epoch": 3.701657458563536, + "grad_norm": 0.7164933681488037, + "learning_rate": 7.453262837459332e-05, + "loss": 0.03912568986415863, + "mean_token_accuracy": 0.9849458575248718, + "num_tokens": 41294694.0, + "step": 2010 + }, + { + "entropy": 0.9536987483501435, + "epoch": 3.720073664825046, + "grad_norm": 0.6804596185684204, + "learning_rate": 7.426681597852863e-05, + "loss": 0.036410006880760196, + "mean_token_accuracy": 0.985712206363678, + "num_tokens": 41499817.0, + "step": 2020 + }, + { + "entropy": 0.9478164672851562, + "epoch": 3.738489871086556, + "grad_norm": 0.8799397349357605, + "learning_rate": 7.400010336274037e-05, + "loss": 0.03801035583019256, + "mean_token_accuracy": 0.9850274682044983, + "num_tokens": 41704932.0, + "step": 2030 + }, + { + "entropy": 0.9383447647094727, + "epoch": 3.7569060773480665, + "grad_norm": 0.8386216163635254, + "learning_rate": 7.373250042139664e-05, + "loss": 0.0373637855052948, + "mean_token_accuracy": 0.9854822158813477, + "num_tokens": 41910804.0, + "step": 2040 + }, + { + "entropy": 0.925172996520996, + "epoch": 3.7753222836095763, + "grad_norm": 0.7599324584007263, + "learning_rate": 7.346401708169377e-05, + "loss": 0.03585260808467865, + "mean_token_accuracy": 0.9860672950744629, + "num_tokens": 42116706.0, + "step": 2050 + }, + { + "entropy": 0.9463765442371368, + "epoch": 3.7937384898710866, + "grad_norm": 0.9030149579048157, + "learning_rate": 7.319466330348797e-05, + "loss": 0.035877206921577455, + "mean_token_accuracy": 0.9863968968391419, + "num_tokens": 42322670.0, + "step": 2060 + }, + { + "entropy": 0.9942441761493683, + "epoch": 3.8121546961325965, + "grad_norm": 0.6400449275970459, + "learning_rate": 7.292444907892587e-05, + "loss": 0.037310433387756345, + "mean_token_accuracy": 0.9854151606559753, + "num_tokens": 42527752.0, + "step": 2070 + }, + { + "entropy": 0.9577703952789307, + "epoch": 3.830570902394107, + "grad_norm": 0.6193167567253113, + "learning_rate": 7.265338443207387e-05, + "loss": 0.03648848831653595, + "mean_token_accuracy": 0.9856530070304871, + "num_tokens": 42732981.0, + "step": 2080 + }, + { + "entropy": 0.9663952767848969, + "epoch": 3.848987108655617, + "grad_norm": 0.759611189365387, + "learning_rate": 7.238147941854625e-05, + "loss": 0.036112996935844424, + "mean_token_accuracy": 0.9862765550613404, + "num_tokens": 42938619.0, + "step": 2090 + }, + { + "entropy": 0.9484863519668579, + "epoch": 3.867403314917127, + "grad_norm": 0.7420705556869507, + "learning_rate": 7.210874412513218e-05, + "loss": 0.03703283965587616, + "mean_token_accuracy": 0.9857317566871643, + "num_tokens": 43143753.0, + "step": 2100 + }, + { + "entropy": 0.964326673746109, + "epoch": 3.8858195211786373, + "grad_norm": 0.8779639601707458, + "learning_rate": 7.183518866942147e-05, + "loss": 0.03739701807498932, + "mean_token_accuracy": 0.9852154791355133, + "num_tokens": 43349451.0, + "step": 2110 + }, + { + "entropy": 0.9729791641235351, + "epoch": 3.904235727440147, + "grad_norm": 0.7582741379737854, + "learning_rate": 7.156082319942929e-05, + "loss": 0.03894525766372681, + "mean_token_accuracy": 0.9847454309463501, + "num_tokens": 43554598.0, + "step": 2120 + }, + { + "entropy": 0.9860592544078827, + "epoch": 3.9226519337016574, + "grad_norm": 0.860698938369751, + "learning_rate": 7.128565789321969e-05, + "loss": 0.0365300178527832, + "mean_token_accuracy": 0.9859121859073638, + "num_tokens": 43760081.0, + "step": 2130 + }, + { + "entropy": 0.9916551172733307, + "epoch": 3.9410681399631677, + "grad_norm": 0.8363776206970215, + "learning_rate": 7.100970295852805e-05, + "loss": 0.036221379041671754, + "mean_token_accuracy": 0.9859034180641174, + "num_tokens": 43965432.0, + "step": 2140 + }, + { + "entropy": 0.9553558886051178, + "epoch": 3.9594843462246776, + "grad_norm": 0.9627474546432495, + "learning_rate": 7.073296863238242e-05, + "loss": 0.03684481382369995, + "mean_token_accuracy": 0.9857315957546234, + "num_tokens": 44171232.0, + "step": 2150 + }, + { + "entropy": 0.9538035809993743, + "epoch": 3.977900552486188, + "grad_norm": 0.8399474620819092, + "learning_rate": 7.045546518072366e-05, + "loss": 0.03825397789478302, + "mean_token_accuracy": 0.9846831560134888, + "num_tokens": 44376723.0, + "step": 2160 + }, + { + "entropy": 0.9476235210895538, + "epoch": 3.9963167587476978, + "grad_norm": 0.708739697933197, + "learning_rate": 7.017720289802472e-05, + "loss": 0.03618018329143524, + "mean_token_accuracy": 0.9861325800418854, + "num_tokens": 44582407.0, + "step": 2170 + }, + { + "epoch": 4.0, + "eval_entropy": 0.9569619194321011, + "eval_loss": 0.059838198125362396, + "eval_mean_token_accuracy": 0.9777795366618944, + "eval_num_tokens": 44623647.0, + "eval_runtime": 10.0379, + "eval_samples_per_second": 364.42, + "eval_steps_per_second": 11.457, + "step": 2172 + }, + { + "entropy": 0.9558675646781921, + "epoch": 4.014732965009208, + "grad_norm": 0.7347508668899536, + "learning_rate": 6.989819210690872e-05, + "loss": 0.02886659502983093, + "mean_token_accuracy": 0.9892994821071625, + "num_tokens": 44788219.0, + "step": 2180 + }, + { + "entropy": 1.0037677466869355, + "epoch": 4.033149171270718, + "grad_norm": 0.7403206825256348, + "learning_rate": 6.961844315776596e-05, + "loss": 0.02395295798778534, + "mean_token_accuracy": 0.9906026899814606, + "num_tokens": 44993505.0, + "step": 2190 + }, + { + "entropy": 1.0068290829658508, + "epoch": 4.051565377532229, + "grad_norm": 0.7979726195335388, + "learning_rate": 6.933796642837003e-05, + "loss": 0.02605988085269928, + "mean_token_accuracy": 0.9899706900119781, + "num_tokens": 45199193.0, + "step": 2200 + }, + { + "entropy": 0.9942211747169495, + "epoch": 4.069981583793738, + "grad_norm": 0.6460402011871338, + "learning_rate": 6.905677232349278e-05, + "loss": 0.025350230932235717, + "mean_token_accuracy": 0.9899386286735534, + "num_tokens": 45404030.0, + "step": 2210 + }, + { + "entropy": 0.9783595442771912, + "epoch": 4.088397790055248, + "grad_norm": 0.8177055716514587, + "learning_rate": 6.877487127451834e-05, + "loss": 0.02696993052959442, + "mean_token_accuracy": 0.9896106541156768, + "num_tokens": 45609763.0, + "step": 2220 + }, + { + "entropy": 0.9801763832569123, + "epoch": 4.106813996316759, + "grad_norm": 0.6608165502548218, + "learning_rate": 6.849227373905618e-05, + "loss": 0.025101393461227417, + "mean_token_accuracy": 0.9904372334480286, + "num_tokens": 45814941.0, + "step": 2230 + }, + { + "entropy": 0.9695689737796783, + "epoch": 4.125230202578269, + "grad_norm": 0.8036547899246216, + "learning_rate": 6.820899020055314e-05, + "loss": 0.027827343344688414, + "mean_token_accuracy": 0.9890337705612182, + "num_tokens": 46020535.0, + "step": 2240 + }, + { + "entropy": 0.9828635334968567, + "epoch": 4.143646408839779, + "grad_norm": 0.7729921936988831, + "learning_rate": 6.792503116790455e-05, + "loss": 0.02779492735862732, + "mean_token_accuracy": 0.9894372522830963, + "num_tokens": 46226013.0, + "step": 2250 + }, + { + "entropy": 0.9978842556476593, + "epoch": 4.162062615101289, + "grad_norm": 0.7334664463996887, + "learning_rate": 6.764040717506432e-05, + "loss": 0.025673511624336242, + "mean_token_accuracy": 0.9899355113506317, + "num_tokens": 46432087.0, + "step": 2260 + }, + { + "entropy": 1.0116403937339782, + "epoch": 4.180478821362799, + "grad_norm": 0.6769368052482605, + "learning_rate": 6.735512878065427e-05, + "loss": 0.024705511331558228, + "mean_token_accuracy": 0.9906128525733948, + "num_tokens": 46637478.0, + "step": 2270 + }, + { + "entropy": 0.9985016226768494, + "epoch": 4.198895027624309, + "grad_norm": 0.8301573991775513, + "learning_rate": 6.706920656757234e-05, + "loss": 0.02455987185239792, + "mean_token_accuracy": 0.9905728340148926, + "num_tokens": 46842562.0, + "step": 2280 + }, + { + "entropy": 0.9909430682659149, + "epoch": 4.21731123388582, + "grad_norm": 0.656026303768158, + "learning_rate": 6.67826511426001e-05, + "loss": 0.022711564600467683, + "mean_token_accuracy": 0.9910893619060517, + "num_tokens": 47048071.0, + "step": 2290 + }, + { + "entropy": 0.9868666052818298, + "epoch": 4.23572744014733, + "grad_norm": 0.7614991068840027, + "learning_rate": 6.649547313600916e-05, + "loss": 0.02453812211751938, + "mean_token_accuracy": 0.9908901154994965, + "num_tokens": 47253507.0, + "step": 2300 + }, + { + "entropy": 0.9870487153530121, + "epoch": 4.25414364640884, + "grad_norm": 0.7617276906967163, + "learning_rate": 6.62076832011669e-05, + "loss": 0.025818097591400146, + "mean_token_accuracy": 0.990347957611084, + "num_tokens": 47458747.0, + "step": 2310 + }, + { + "entropy": 0.9691080570220947, + "epoch": 4.27255985267035, + "grad_norm": 0.6743029952049255, + "learning_rate": 6.591929201414124e-05, + "loss": 0.02456912100315094, + "mean_token_accuracy": 0.9905289709568024, + "num_tokens": 47663643.0, + "step": 2320 + }, + { + "entropy": 0.9701108932495117, + "epoch": 4.29097605893186, + "grad_norm": 0.6964483261108398, + "learning_rate": 6.56303102733046e-05, + "loss": 0.02575681209564209, + "mean_token_accuracy": 0.9898503363132477, + "num_tokens": 47868982.0, + "step": 2330 + }, + { + "entropy": 0.969528192281723, + "epoch": 4.30939226519337, + "grad_norm": 0.7521987557411194, + "learning_rate": 6.5340748698937e-05, + "loss": 0.02678089737892151, + "mean_token_accuracy": 0.9898572087287902, + "num_tokens": 48074314.0, + "step": 2340 + }, + { + "entropy": 0.9921871721744537, + "epoch": 4.327808471454881, + "grad_norm": 0.6944513320922852, + "learning_rate": 6.505061803282844e-05, + "loss": 0.025553321838378905, + "mean_token_accuracy": 0.9907529592514038, + "num_tokens": 48279731.0, + "step": 2350 + }, + { + "entropy": 0.9768964886665344, + "epoch": 4.346224677716391, + "grad_norm": 0.6553092002868652, + "learning_rate": 6.47599290378803e-05, + "loss": 0.0250235915184021, + "mean_token_accuracy": 0.9904054701328278, + "num_tokens": 48485401.0, + "step": 2360 + }, + { + "entropy": 0.9612838506698609, + "epoch": 4.3646408839779, + "grad_norm": 0.916820228099823, + "learning_rate": 6.446869249770619e-05, + "loss": 0.028156182169914244, + "mean_token_accuracy": 0.9888657331466675, + "num_tokens": 48691047.0, + "step": 2370 + }, + { + "entropy": 0.9665832936763763, + "epoch": 4.383057090239411, + "grad_norm": 0.9197776913642883, + "learning_rate": 6.417691921623185e-05, + "loss": 0.025303921103477477, + "mean_token_accuracy": 0.989986252784729, + "num_tokens": 48896234.0, + "step": 2380 + }, + { + "entropy": 0.9686589121818543, + "epoch": 4.401473296500921, + "grad_norm": 0.8505764603614807, + "learning_rate": 6.388462001729434e-05, + "loss": 0.024816396832466125, + "mean_token_accuracy": 0.9909265041351318, + "num_tokens": 49101893.0, + "step": 2390 + }, + { + "entropy": 0.9625210344791413, + "epoch": 4.419889502762431, + "grad_norm": 1.0601766109466553, + "learning_rate": 6.359180574424062e-05, + "loss": 0.02706078290939331, + "mean_token_accuracy": 0.9895522117614746, + "num_tokens": 49307467.0, + "step": 2400 + }, + { + "entropy": 0.9679551541805267, + "epoch": 4.4383057090239415, + "grad_norm": 0.776253879070282, + "learning_rate": 6.329848725952514e-05, + "loss": 0.02693203091621399, + "mean_token_accuracy": 0.9893981635570526, + "num_tokens": 49513020.0, + "step": 2410 + }, + { + "entropy": 0.9704959928989411, + "epoch": 4.456721915285451, + "grad_norm": 0.5459668636322021, + "learning_rate": 6.3004675444307e-05, + "loss": 0.0279473751783371, + "mean_token_accuracy": 0.9894329369068146, + "num_tokens": 49718405.0, + "step": 2420 + }, + { + "entropy": 0.961863350868225, + "epoch": 4.475138121546961, + "grad_norm": 0.9338833093643188, + "learning_rate": 6.27103811980462e-05, + "loss": 0.026478803157806395, + "mean_token_accuracy": 0.9902269721031189, + "num_tokens": 49923375.0, + "step": 2430 + }, + { + "entropy": 0.9708506822586059, + "epoch": 4.4935543278084715, + "grad_norm": 0.9073707461357117, + "learning_rate": 6.241561543809947e-05, + "loss": 0.025289520621299744, + "mean_token_accuracy": 0.9904769957065582, + "num_tokens": 50128901.0, + "step": 2440 + }, + { + "entropy": 0.984996622800827, + "epoch": 4.511970534069982, + "grad_norm": 0.8674206733703613, + "learning_rate": 6.212038909931503e-05, + "loss": 0.026442551612854005, + "mean_token_accuracy": 0.9905101835727692, + "num_tokens": 50334449.0, + "step": 2450 + }, + { + "entropy": 0.9926377475261688, + "epoch": 4.530386740331492, + "grad_norm": 0.7571811079978943, + "learning_rate": 6.182471313362717e-05, + "loss": 0.026819539070129395, + "mean_token_accuracy": 0.9898989200592041, + "num_tokens": 50539597.0, + "step": 2460 + }, + { + "entropy": 0.9450563549995422, + "epoch": 4.5488029465930016, + "grad_norm": 0.6651087403297424, + "learning_rate": 6.15285985096498e-05, + "loss": 0.02665227949619293, + "mean_token_accuracy": 0.9897156655788422, + "num_tokens": 50744926.0, + "step": 2470 + }, + { + "entropy": 0.9715635657310486, + "epoch": 4.567219152854512, + "grad_norm": 0.7445545196533203, + "learning_rate": 6.12320562122697e-05, + "loss": 0.026212453842163086, + "mean_token_accuracy": 0.9904700636863708, + "num_tokens": 50950152.0, + "step": 2480 + }, + { + "entropy": 0.9613442063331604, + "epoch": 4.585635359116022, + "grad_norm": 0.7168459296226501, + "learning_rate": 6.0935097242238837e-05, + "loss": 0.02508128583431244, + "mean_token_accuracy": 0.9901923894882202, + "num_tokens": 51155430.0, + "step": 2490 + }, + { + "entropy": 0.9571944534778595, + "epoch": 4.6040515653775325, + "grad_norm": 0.7590732574462891, + "learning_rate": 6.063773261576646e-05, + "loss": 0.025445500016212465, + "mean_token_accuracy": 0.9902949810028077, + "num_tokens": 51360826.0, + "step": 2500 + }, + { + "entropy": 0.947079461812973, + "epoch": 4.622467771639043, + "grad_norm": 0.6942175030708313, + "learning_rate": 6.033997336411035e-05, + "loss": 0.026132801175117494, + "mean_token_accuracy": 0.9900939345359803, + "num_tokens": 51566095.0, + "step": 2510 + }, + { + "entropy": 0.970003741979599, + "epoch": 4.640883977900552, + "grad_norm": 0.6562672257423401, + "learning_rate": 6.00418305331675e-05, + "loss": 0.024759869277477264, + "mean_token_accuracy": 0.9905019223690033, + "num_tokens": 51771177.0, + "step": 2520 + }, + { + "entropy": 0.9715348601341247, + "epoch": 4.6593001841620625, + "grad_norm": 0.6151639819145203, + "learning_rate": 5.9743315183064564e-05, + "loss": 0.024138522148132325, + "mean_token_accuracy": 0.9910101473331452, + "num_tokens": 51976349.0, + "step": 2530 + }, + { + "entropy": 0.9552160143852234, + "epoch": 4.677716390423573, + "grad_norm": 0.968815267086029, + "learning_rate": 5.9444438387747336e-05, + "loss": 0.027274739742279053, + "mean_token_accuracy": 0.9896075248718261, + "num_tokens": 52181820.0, + "step": 2540 + }, + { + "entropy": 0.9265012145042419, + "epoch": 4.696132596685083, + "grad_norm": 0.8966720700263977, + "learning_rate": 5.914521123457015e-05, + "loss": 0.0291823148727417, + "mean_token_accuracy": 0.9886700630187988, + "num_tokens": 52387511.0, + "step": 2550 + }, + { + "entropy": 0.9156096875667572, + "epoch": 4.714548802946593, + "grad_norm": 0.7747519612312317, + "learning_rate": 5.88456448238844e-05, + "loss": 0.02809179127216339, + "mean_token_accuracy": 0.9891100466251374, + "num_tokens": 52592737.0, + "step": 2560 + }, + { + "entropy": 0.924511456489563, + "epoch": 4.732965009208103, + "grad_norm": 1.0087049007415771, + "learning_rate": 5.8545750268626844e-05, + "loss": 0.02683232128620148, + "mean_token_accuracy": 0.9896528899669648, + "num_tokens": 52798814.0, + "step": 2570 + }, + { + "entropy": 0.9662951111793519, + "epoch": 4.751381215469613, + "grad_norm": 0.7709590792655945, + "learning_rate": 5.824553869390734e-05, + "loss": 0.02503817081451416, + "mean_token_accuracy": 0.9900161385536194, + "num_tokens": 53004478.0, + "step": 2580 + }, + { + "entropy": 0.9889141619205475, + "epoch": 4.769797421731123, + "grad_norm": 0.815858006477356, + "learning_rate": 5.794502123659613e-05, + "loss": 0.026327347755432128, + "mean_token_accuracy": 0.9900785744190216, + "num_tokens": 53209888.0, + "step": 2590 + }, + { + "entropy": 0.9785685896873474, + "epoch": 4.788213627992634, + "grad_norm": 0.6514431238174438, + "learning_rate": 5.7644209044910735e-05, + "loss": 0.025033789873123168, + "mean_token_accuracy": 0.9902650475502014, + "num_tokens": 53415533.0, + "step": 2600 + }, + { + "entropy": 0.9723869919776916, + "epoch": 4.806629834254144, + "grad_norm": 0.8778963685035706, + "learning_rate": 5.7343113278002284e-05, + "loss": 0.02379843294620514, + "mean_token_accuracy": 0.9909472465515137, + "num_tokens": 53620850.0, + "step": 2610 + }, + { + "entropy": 0.9572711050510406, + "epoch": 4.8250460405156534, + "grad_norm": 0.8927134871482849, + "learning_rate": 5.70417451055417e-05, + "loss": 0.024856947362422943, + "mean_token_accuracy": 0.9904125213623047, + "num_tokens": 53826259.0, + "step": 2620 + }, + { + "entropy": 0.9523135125637054, + "epoch": 4.843462246777164, + "grad_norm": 0.6832691431045532, + "learning_rate": 5.674011570730523e-05, + "loss": 0.025352203845977785, + "mean_token_accuracy": 0.990432596206665, + "num_tokens": 54031531.0, + "step": 2630 + }, + { + "entropy": 0.9735220730304718, + "epoch": 4.861878453038674, + "grad_norm": 0.6399164795875549, + "learning_rate": 5.643823627275972e-05, + "loss": 0.026541513204574586, + "mean_token_accuracy": 0.9900369107723236, + "num_tokens": 54237155.0, + "step": 2640 + }, + { + "entropy": 0.9566517114639282, + "epoch": 4.880294659300184, + "grad_norm": 0.8725414276123047, + "learning_rate": 5.6136118000647616e-05, + "loss": 0.02675778865814209, + "mean_token_accuracy": 0.9894899427890778, + "num_tokens": 54442739.0, + "step": 2650 + }, + { + "entropy": 0.9447909593582153, + "epoch": 4.898710865561695, + "grad_norm": 0.8169302344322205, + "learning_rate": 5.583377209857138e-05, + "loss": 0.02642086148262024, + "mean_token_accuracy": 0.989885401725769, + "num_tokens": 54648098.0, + "step": 2660 + }, + { + "entropy": 0.9180052697658538, + "epoch": 4.917127071823204, + "grad_norm": 0.7768753170967102, + "learning_rate": 5.553120978257787e-05, + "loss": 0.02552323341369629, + "mean_token_accuracy": 0.9899512350559234, + "num_tokens": 54854281.0, + "step": 2670 + }, + { + "entropy": 0.917166668176651, + "epoch": 4.935543278084714, + "grad_norm": 0.8241410851478577, + "learning_rate": 5.5228442276742153e-05, + "loss": 0.02788199484348297, + "mean_token_accuracy": 0.989625746011734, + "num_tokens": 55059495.0, + "step": 2680 + }, + { + "entropy": 0.9345465302467346, + "epoch": 4.953959484346225, + "grad_norm": 0.7645496129989624, + "learning_rate": 5.4925480812751166e-05, + "loss": 0.02517639398574829, + "mean_token_accuracy": 0.9902283847332001, + "num_tokens": 55265381.0, + "step": 2690 + }, + { + "entropy": 0.9386432528495788, + "epoch": 4.972375690607735, + "grad_norm": 0.8371859192848206, + "learning_rate": 5.46223366294871e-05, + "loss": 0.025585666298866272, + "mean_token_accuracy": 0.9903791427612305, + "num_tokens": 55471210.0, + "step": 2700 + }, + { + "entropy": 0.9267561137676239, + "epoch": 4.990791896869245, + "grad_norm": 0.6789297461509705, + "learning_rate": 5.43190209726104e-05, + "loss": 0.024646708369255067, + "mean_token_accuracy": 0.9904700815677643, + "num_tokens": 55676877.0, + "step": 2710 + }, + { + "epoch": 5.0, + "eval_entropy": 0.9283919717954553, + "eval_loss": 0.06225527077913284, + "eval_mean_token_accuracy": 0.9784110421719758, + "eval_num_tokens": 55779559.0, + "eval_runtime": 10.0613, + "eval_samples_per_second": 363.573, + "eval_steps_per_second": 11.43, + "step": 2715 + }, + { + "entropy": 0.9269404351711273, + "epoch": 5.009208103130755, + "grad_norm": 0.540570080280304, + "learning_rate": 5.401554509414264e-05, + "loss": 0.019513805210590363, + "mean_token_accuracy": 0.9927033007144928, + "num_tokens": 55882241.0, + "step": 2720 + }, + { + "entropy": 0.9377441763877868, + "epoch": 5.027624309392265, + "grad_norm": 0.5840998888015747, + "learning_rate": 5.3711920252049085e-05, + "loss": 0.015180909633636474, + "mean_token_accuracy": 0.9944471418857574, + "num_tokens": 56087470.0, + "step": 2730 + }, + { + "entropy": 0.949122017621994, + "epoch": 5.046040515653775, + "grad_norm": 0.6938672065734863, + "learning_rate": 5.340815770982106e-05, + "loss": 0.0153742715716362, + "mean_token_accuracy": 0.9941534519195556, + "num_tokens": 56292226.0, + "step": 2740 + }, + { + "entropy": 0.9394402146339417, + "epoch": 5.064456721915286, + "grad_norm": 0.8259939551353455, + "learning_rate": 5.310426873605814e-05, + "loss": 0.014350908994674682, + "mean_token_accuracy": 0.9945570707321167, + "num_tokens": 56497839.0, + "step": 2750 + }, + { + "entropy": 0.9323545396327972, + "epoch": 5.082872928176796, + "grad_norm": 0.9675024747848511, + "learning_rate": 5.280026460405005e-05, + "loss": 0.016550135612487794, + "mean_token_accuracy": 0.9938908398151398, + "num_tokens": 56702932.0, + "step": 2760 + }, + { + "entropy": 0.89125554561615, + "epoch": 5.101289134438305, + "grad_norm": 0.8347184658050537, + "learning_rate": 5.2496156591358566e-05, + "loss": 0.017917826771736145, + "mean_token_accuracy": 0.9934309899806977, + "num_tokens": 56908644.0, + "step": 2770 + }, + { + "entropy": 0.8773505449295044, + "epoch": 5.119705340699816, + "grad_norm": 0.8869524598121643, + "learning_rate": 5.219195597939908e-05, + "loss": 0.017221055924892426, + "mean_token_accuracy": 0.993448656797409, + "num_tokens": 57114171.0, + "step": 2780 + }, + { + "entropy": 0.8874686002731323, + "epoch": 5.138121546961326, + "grad_norm": 1.0294251441955566, + "learning_rate": 5.1887674053022084e-05, + "loss": 0.018111808598041533, + "mean_token_accuracy": 0.9931293666362763, + "num_tokens": 57319158.0, + "step": 2790 + }, + { + "entropy": 0.8893351197242737, + "epoch": 5.156537753222836, + "grad_norm": 0.6253597736358643, + "learning_rate": 5.15833221000946e-05, + "loss": 0.017256538569927215, + "mean_token_accuracy": 0.9936724424362182, + "num_tokens": 57524901.0, + "step": 2800 + }, + { + "entropy": 0.9157109141349793, + "epoch": 5.1749539594843466, + "grad_norm": 0.6379142999649048, + "learning_rate": 5.12789114110814e-05, + "loss": 0.016415870189666747, + "mean_token_accuracy": 0.9939744889736175, + "num_tokens": 57730135.0, + "step": 2810 + }, + { + "entropy": 0.9157932877540589, + "epoch": 5.193370165745856, + "grad_norm": 0.7195688486099243, + "learning_rate": 5.097445327862619e-05, + "loss": 0.01577536463737488, + "mean_token_accuracy": 0.9941773355007172, + "num_tokens": 57936210.0, + "step": 2820 + }, + { + "entropy": 0.9179767727851867, + "epoch": 5.211786372007366, + "grad_norm": 0.7149335741996765, + "learning_rate": 5.066995899713264e-05, + "loss": 0.01606254279613495, + "mean_token_accuracy": 0.9937664806842804, + "num_tokens": 58141736.0, + "step": 2830 + }, + { + "entropy": 0.895512479543686, + "epoch": 5.230202578268877, + "grad_norm": 0.6460169553756714, + "learning_rate": 5.036543986234543e-05, + "loss": 0.01605578660964966, + "mean_token_accuracy": 0.994063013792038, + "num_tokens": 58347178.0, + "step": 2840 + }, + { + "entropy": 0.8883109211921691, + "epoch": 5.248618784530387, + "grad_norm": 0.72477787733078, + "learning_rate": 5.006090717093128e-05, + "loss": 0.016773784160614015, + "mean_token_accuracy": 0.9940340936183929, + "num_tokens": 58552952.0, + "step": 2850 + }, + { + "entropy": 0.8942575633525849, + "epoch": 5.267034990791897, + "grad_norm": 0.7344926595687866, + "learning_rate": 4.9756372220059736e-05, + "loss": 0.01604126989841461, + "mean_token_accuracy": 0.994256991147995, + "num_tokens": 58758449.0, + "step": 2860 + }, + { + "entropy": 0.8854099690914154, + "epoch": 5.285451197053407, + "grad_norm": 0.6142122149467468, + "learning_rate": 4.9451846306984214e-05, + "loss": 0.016244474053382873, + "mean_token_accuracy": 0.9938375532627106, + "num_tokens": 58963691.0, + "step": 2870 + }, + { + "entropy": 0.8745675146579742, + "epoch": 5.303867403314917, + "grad_norm": 0.8025366067886353, + "learning_rate": 4.9147340728622816e-05, + "loss": 0.01611460596323013, + "mean_token_accuracy": 0.9941173672676087, + "num_tokens": 59169484.0, + "step": 2880 + }, + { + "entropy": 0.8812389194965362, + "epoch": 5.322283609576427, + "grad_norm": 0.7699193358421326, + "learning_rate": 4.884286678113935e-05, + "loss": 0.016995206475257874, + "mean_token_accuracy": 0.9937523245811463, + "num_tokens": 59374627.0, + "step": 2890 + }, + { + "entropy": 0.8924362242221833, + "epoch": 5.3406998158379375, + "grad_norm": 0.7516226172447205, + "learning_rate": 4.853843575952414e-05, + "loss": 0.01652217358350754, + "mean_token_accuracy": 0.9936819314956665, + "num_tokens": 59580135.0, + "step": 2900 + }, + { + "entropy": 0.8972602427005768, + "epoch": 5.359116022099448, + "grad_norm": 0.7781681418418884, + "learning_rate": 4.823405895717521e-05, + "loss": 0.017360319197177888, + "mean_token_accuracy": 0.9935634732246399, + "num_tokens": 59785392.0, + "step": 2910 + }, + { + "entropy": 0.900998342037201, + "epoch": 5.377532228360957, + "grad_norm": 0.6837047934532166, + "learning_rate": 4.792974766547911e-05, + "loss": 0.017162233591079712, + "mean_token_accuracy": 0.993264091014862, + "num_tokens": 59991448.0, + "step": 2920 + }, + { + "entropy": 0.9239763855934143, + "epoch": 5.3959484346224675, + "grad_norm": 0.7188259363174438, + "learning_rate": 4.762551317339226e-05, + "loss": 0.01718595027923584, + "mean_token_accuracy": 0.9933857440948486, + "num_tokens": 60197079.0, + "step": 2930 + }, + { + "entropy": 0.9056789398193359, + "epoch": 5.414364640883978, + "grad_norm": 0.6922260522842407, + "learning_rate": 4.732136676702198e-05, + "loss": 0.016596907377243043, + "mean_token_accuracy": 0.9937462329864502, + "num_tokens": 60402299.0, + "step": 2940 + }, + { + "entropy": 0.9038522362709045, + "epoch": 5.432780847145488, + "grad_norm": 0.7937009930610657, + "learning_rate": 4.7017319729207954e-05, + "loss": 0.016130413115024566, + "mean_token_accuracy": 0.9941940546035767, + "num_tokens": 60607907.0, + "step": 2950 + }, + { + "entropy": 0.8911147236824035, + "epoch": 5.4511970534069984, + "grad_norm": 0.6261171698570251, + "learning_rate": 4.671338333910359e-05, + "loss": 0.01622493863105774, + "mean_token_accuracy": 0.9937945663928985, + "num_tokens": 60813428.0, + "step": 2960 + }, + { + "entropy": 0.8894322276115417, + "epoch": 5.469613259668508, + "grad_norm": 0.6898378729820251, + "learning_rate": 4.6409568871757645e-05, + "loss": 0.016513559222221374, + "mean_token_accuracy": 0.9936174690723419, + "num_tokens": 61018404.0, + "step": 2970 + }, + { + "entropy": 0.9026601016521454, + "epoch": 5.488029465930018, + "grad_norm": 0.7027897834777832, + "learning_rate": 4.610588759769593e-05, + "loss": 0.016727012395858765, + "mean_token_accuracy": 0.9941417872905731, + "num_tokens": 61223660.0, + "step": 2980 + }, + { + "entropy": 0.8903301954269409, + "epoch": 5.5064456721915285, + "grad_norm": 0.9087063074111938, + "learning_rate": 4.5802350782503196e-05, + "loss": 0.016929233074188234, + "mean_token_accuracy": 0.9935264468193055, + "num_tokens": 61429438.0, + "step": 2990 + }, + { + "entropy": 0.8886692762374878, + "epoch": 5.524861878453039, + "grad_norm": 0.8283822536468506, + "learning_rate": 4.5498969686405266e-05, + "loss": 0.015396638214588166, + "mean_token_accuracy": 0.99433131814003, + "num_tokens": 61635274.0, + "step": 3000 + }, + { + "entropy": 0.8902086555957794, + "epoch": 5.543278084714549, + "grad_norm": 0.7676647305488586, + "learning_rate": 4.5195755563851336e-05, + "loss": 0.01673731654882431, + "mean_token_accuracy": 0.9938134133815766, + "num_tokens": 61840778.0, + "step": 3010 + }, + { + "entropy": 0.8941606819629669, + "epoch": 5.5616942909760585, + "grad_norm": 0.7026392221450806, + "learning_rate": 4.489271966309634e-05, + "loss": 0.01694796681404114, + "mean_token_accuracy": 0.9936233103275299, + "num_tokens": 62046355.0, + "step": 3020 + }, + { + "entropy": 0.90918750166893, + "epoch": 5.580110497237569, + "grad_norm": 0.7146924734115601, + "learning_rate": 4.4589873225783806e-05, + "loss": 0.01852080672979355, + "mean_token_accuracy": 0.9928994178771973, + "num_tokens": 62251709.0, + "step": 3030 + }, + { + "entropy": 0.8946544349193573, + "epoch": 5.598526703499079, + "grad_norm": 0.607246458530426, + "learning_rate": 4.428722748652881e-05, + "loss": 0.016636812686920167, + "mean_token_accuracy": 0.9939334273338318, + "num_tokens": 62456680.0, + "step": 3040 + }, + { + "entropy": 0.8854653835296631, + "epoch": 5.616942909760589, + "grad_norm": 0.7457882165908813, + "learning_rate": 4.3984793672501124e-05, + "loss": 0.016008296608924867, + "mean_token_accuracy": 0.9940589666366577, + "num_tokens": 62662038.0, + "step": 3050 + }, + { + "entropy": 0.8921085000038147, + "epoch": 5.6353591160221, + "grad_norm": 0.7707350254058838, + "learning_rate": 4.368258300300888e-05, + "loss": 0.016655120253562927, + "mean_token_accuracy": 0.993935889005661, + "num_tokens": 62867272.0, + "step": 3060 + }, + { + "entropy": 0.8768653869628906, + "epoch": 5.653775322283609, + "grad_norm": 0.6994554996490479, + "learning_rate": 4.3380606689082166e-05, + "loss": 0.015841150283813478, + "mean_token_accuracy": 0.9944550096988678, + "num_tokens": 63072403.0, + "step": 3070 + }, + { + "entropy": 0.8767679035663605, + "epoch": 5.672191528545119, + "grad_norm": 0.8327192068099976, + "learning_rate": 4.307887593305733e-05, + "loss": 0.015826576948165895, + "mean_token_accuracy": 0.9941202461719513, + "num_tokens": 63277635.0, + "step": 3080 + }, + { + "entropy": 0.8763292253017425, + "epoch": 5.69060773480663, + "grad_norm": 0.7224747538566589, + "learning_rate": 4.277740192816127e-05, + "loss": 0.015298140048980714, + "mean_token_accuracy": 0.9943080008029938, + "num_tokens": 63483196.0, + "step": 3090 + }, + { + "entropy": 0.8834661841392517, + "epoch": 5.70902394106814, + "grad_norm": 0.9508277773857117, + "learning_rate": 4.247619585809627e-05, + "loss": 0.01658404469490051, + "mean_token_accuracy": 0.9934300124645233, + "num_tokens": 63688721.0, + "step": 3100 + }, + { + "entropy": 0.899389523267746, + "epoch": 5.72744014732965, + "grad_norm": 0.7170981168746948, + "learning_rate": 4.217526889662512e-05, + "loss": 0.015803813934326172, + "mean_token_accuracy": 0.9940325975418091, + "num_tokens": 63894220.0, + "step": 3110 + }, + { + "entropy": 0.8968011736869812, + "epoch": 5.74585635359116, + "grad_norm": 0.6686251163482666, + "learning_rate": 4.187463220715659e-05, + "loss": 0.015874400734901428, + "mean_token_accuracy": 0.9940970957279205, + "num_tokens": 64099768.0, + "step": 3120 + }, + { + "entropy": 0.8900792479515076, + "epoch": 5.76427255985267, + "grad_norm": 0.5979828238487244, + "learning_rate": 4.157429694233128e-05, + "loss": 0.01613767147064209, + "mean_token_accuracy": 0.9942961037158966, + "num_tokens": 64305055.0, + "step": 3130 + }, + { + "entropy": 0.8899810135364532, + "epoch": 5.78268876611418, + "grad_norm": 0.7330048084259033, + "learning_rate": 4.127427424360794e-05, + "loss": 0.016168563067913054, + "mean_token_accuracy": 0.9941077649593353, + "num_tokens": 64510002.0, + "step": 3140 + }, + { + "entropy": 0.8805335581302642, + "epoch": 5.801104972375691, + "grad_norm": 0.5978623032569885, + "learning_rate": 4.09745752408501e-05, + "loss": 0.01524556577205658, + "mean_token_accuracy": 0.994326776266098, + "num_tokens": 64715431.0, + "step": 3150 + }, + { + "entropy": 0.878781646490097, + "epoch": 5.819521178637201, + "grad_norm": 0.6749313473701477, + "learning_rate": 4.067521105191331e-05, + "loss": 0.015209287405014038, + "mean_token_accuracy": 0.9942974805831909, + "num_tokens": 64921579.0, + "step": 3160 + }, + { + "entropy": 0.8844729900360108, + "epoch": 5.83793738489871, + "grad_norm": 0.6887196898460388, + "learning_rate": 4.037619278223255e-05, + "loss": 0.01619938760995865, + "mean_token_accuracy": 0.9937683045864105, + "num_tokens": 65127007.0, + "step": 3170 + }, + { + "entropy": 0.8780498623847961, + "epoch": 5.856353591160221, + "grad_norm": 0.6962174773216248, + "learning_rate": 4.0077531524410304e-05, + "loss": 0.015934592485427855, + "mean_token_accuracy": 0.9935103774070739, + "num_tokens": 65332418.0, + "step": 3180 + }, + { + "entropy": 0.8889612555503845, + "epoch": 5.874769797421731, + "grad_norm": 0.6049854159355164, + "learning_rate": 3.977923835780517e-05, + "loss": 0.01600206792354584, + "mean_token_accuracy": 0.9937360048294067, + "num_tokens": 65537845.0, + "step": 3190 + }, + { + "entropy": 0.8960810244083405, + "epoch": 5.893186003683241, + "grad_norm": 0.6341013312339783, + "learning_rate": 3.948132434812065e-05, + "loss": 0.0143389493227005, + "mean_token_accuracy": 0.9948007702827454, + "num_tokens": 65743412.0, + "step": 3200 + }, + { + "entropy": 0.887304550409317, + "epoch": 5.911602209944752, + "grad_norm": 0.7564852237701416, + "learning_rate": 3.9183800546994886e-05, + "loss": 0.016044440865516662, + "mean_token_accuracy": 0.9939335525035858, + "num_tokens": 65948884.0, + "step": 3210 + }, + { + "entropy": 0.8823239862918854, + "epoch": 5.930018416206261, + "grad_norm": 0.6525556445121765, + "learning_rate": 3.8886677991590435e-05, + "loss": 0.016112390160560607, + "mean_token_accuracy": 0.9938134670257568, + "num_tokens": 66153768.0, + "step": 3220 + }, + { + "entropy": 0.8712829887866974, + "epoch": 5.948434622467771, + "grad_norm": 0.676167368888855, + "learning_rate": 3.858996770418504e-05, + "loss": 0.015146306157112122, + "mean_token_accuracy": 0.9944733619689942, + "num_tokens": 66359661.0, + "step": 3230 + }, + { + "entropy": 0.8734102070331573, + "epoch": 5.966850828729282, + "grad_norm": 0.6284340023994446, + "learning_rate": 3.829368069176257e-05, + "loss": 0.017269474267959595, + "mean_token_accuracy": 0.9938443183898926, + "num_tokens": 66565228.0, + "step": 3240 + }, + { + "entropy": 0.8757335782051087, + "epoch": 5.985267034990792, + "grad_norm": 0.722522497177124, + "learning_rate": 3.799782794560484e-05, + "loss": 0.015032704174518585, + "mean_token_accuracy": 0.9942249894142151, + "num_tokens": 66770718.0, + "step": 3250 + }, + { + "epoch": 6.0, + "eval_entropy": 0.8780099873957427, + "eval_loss": 0.06740746647119522, + "eval_mean_token_accuracy": 0.9795082377350849, + "eval_num_tokens": 66935435.0, + "eval_runtime": 10.0955, + "eval_samples_per_second": 362.34, + "eval_steps_per_second": 11.391, + "step": 3258 + }, + { + "entropy": 0.8773481965065002, + "epoch": 6.003683241252302, + "grad_norm": 0.49184396862983704, + "learning_rate": 3.770242044088375e-05, + "loss": 0.013721099495887757, + "mean_token_accuracy": 0.9951768457889557, + "num_tokens": 66976478.0, + "step": 3260 + }, + { + "entropy": 0.8643155217170715, + "epoch": 6.0220994475138125, + "grad_norm": 0.4903622567653656, + "learning_rate": 3.7407469136254234e-05, + "loss": 0.009165047109127045, + "mean_token_accuracy": 0.9969388306140899, + "num_tokens": 67182251.0, + "step": 3270 + }, + { + "entropy": 0.8560326337814331, + "epoch": 6.040515653775322, + "grad_norm": 0.6360073685646057, + "learning_rate": 3.711298497344766e-05, + "loss": 0.010200753808021545, + "mean_token_accuracy": 0.9964211463928223, + "num_tokens": 67387493.0, + "step": 3280 + }, + { + "entropy": 0.8391405165195465, + "epoch": 6.058931860036832, + "grad_norm": 0.519554853439331, + "learning_rate": 3.6818978876865984e-05, + "loss": 0.008906974643468856, + "mean_token_accuracy": 0.9966452360153198, + "num_tokens": 67593314.0, + "step": 3290 + }, + { + "entropy": 0.8300552070140839, + "epoch": 6.077348066298343, + "grad_norm": 0.5294632911682129, + "learning_rate": 3.6525461753176426e-05, + "loss": 0.008088209480047227, + "mean_token_accuracy": 0.9971098065376282, + "num_tokens": 67798634.0, + "step": 3300 + }, + { + "entropy": 0.822588461637497, + "epoch": 6.095764272559853, + "grad_norm": 0.46423637866973877, + "learning_rate": 3.623244449090697e-05, + "loss": 0.008058926463127137, + "mean_token_accuracy": 0.9970856845378876, + "num_tokens": 68003683.0, + "step": 3310 + }, + { + "entropy": 0.8130167067050934, + "epoch": 6.114180478821363, + "grad_norm": 0.497258722782135, + "learning_rate": 3.5939937960042314e-05, + "loss": 0.008712668716907502, + "mean_token_accuracy": 0.996820193529129, + "num_tokens": 68208795.0, + "step": 3320 + }, + { + "entropy": 0.8035802125930787, + "epoch": 6.132596685082873, + "grad_norm": 0.43152952194213867, + "learning_rate": 3.5647953011620716e-05, + "loss": 0.008366625010967254, + "mean_token_accuracy": 0.9969616234302521, + "num_tokens": 68414414.0, + "step": 3330 + }, + { + "entropy": 0.8028200149536133, + "epoch": 6.151012891344383, + "grad_norm": 0.6057612299919128, + "learning_rate": 3.535650047733141e-05, + "loss": 0.00967741459608078, + "mean_token_accuracy": 0.9963694036006927, + "num_tokens": 68619806.0, + "step": 3340 + }, + { + "entropy": 0.7982640087604522, + "epoch": 6.169429097605893, + "grad_norm": 0.4966030716896057, + "learning_rate": 3.5065591169112785e-05, + "loss": 0.00934397652745247, + "mean_token_accuracy": 0.9969527781009674, + "num_tokens": 68825254.0, + "step": 3350 + }, + { + "entropy": 0.7859498977661132, + "epoch": 6.1878453038674035, + "grad_norm": 0.6962474584579468, + "learning_rate": 3.477523587875139e-05, + "loss": 0.010414297878742217, + "mean_token_accuracy": 0.996203750371933, + "num_tokens": 69031043.0, + "step": 3360 + }, + { + "entropy": 0.7869667530059814, + "epoch": 6.206261510128914, + "grad_norm": 0.6078894734382629, + "learning_rate": 3.448544537748143e-05, + "loss": 0.008547455072402954, + "mean_token_accuracy": 0.9968406975269317, + "num_tokens": 69236559.0, + "step": 3370 + }, + { + "entropy": 0.8036401033401489, + "epoch": 6.224677716390423, + "grad_norm": 0.5990306735038757, + "learning_rate": 3.4196230415585337e-05, + "loss": 0.00924447700381279, + "mean_token_accuracy": 0.9967190623283386, + "num_tokens": 69441764.0, + "step": 3380 + }, + { + "entropy": 0.8085561394691467, + "epoch": 6.2430939226519335, + "grad_norm": 0.46029484272003174, + "learning_rate": 3.390760172199486e-05, + "loss": 0.008379801362752914, + "mean_token_accuracy": 0.9970395743846894, + "num_tokens": 69647075.0, + "step": 3390 + }, + { + "entropy": 0.8132422208786011, + "epoch": 6.261510128913444, + "grad_norm": 0.5699496865272522, + "learning_rate": 3.361957000389315e-05, + "loss": 0.009426499903202056, + "mean_token_accuracy": 0.996586662530899, + "num_tokens": 69852809.0, + "step": 3400 + }, + { + "entropy": 0.8065890491008758, + "epoch": 6.279926335174954, + "grad_norm": 0.6212234497070312, + "learning_rate": 3.33321459463175e-05, + "loss": 0.009811153262853622, + "mean_token_accuracy": 0.9965905249118805, + "num_tokens": 70057955.0, + "step": 3410 + }, + { + "entropy": 0.7904254853725433, + "epoch": 6.298342541436464, + "grad_norm": 0.8000790476799011, + "learning_rate": 3.304534021176299e-05, + "loss": 0.00957801640033722, + "mean_token_accuracy": 0.9964518308639526, + "num_tokens": 70263517.0, + "step": 3420 + }, + { + "entropy": 0.7971100151538849, + "epoch": 6.316758747697974, + "grad_norm": 0.6359512209892273, + "learning_rate": 3.275916343978689e-05, + "loss": 0.009681916236877442, + "mean_token_accuracy": 0.9967545390129089, + "num_tokens": 70468400.0, + "step": 3430 + }, + { + "entropy": 0.7987187504768372, + "epoch": 6.335174953959484, + "grad_norm": 0.5094901919364929, + "learning_rate": 3.247362624661406e-05, + "loss": 0.009966370463371278, + "mean_token_accuracy": 0.9964035987854004, + "num_tokens": 70673648.0, + "step": 3440 + }, + { + "entropy": 0.7850228011608124, + "epoch": 6.3535911602209945, + "grad_norm": 0.5554385185241699, + "learning_rate": 3.218873922474303e-05, + "loss": 0.009521079063415528, + "mean_token_accuracy": 0.9966651916503906, + "num_tokens": 70879452.0, + "step": 3450 + }, + { + "entropy": 0.7885844230651855, + "epoch": 6.372007366482505, + "grad_norm": 0.5217951536178589, + "learning_rate": 3.190451294255314e-05, + "loss": 0.00949474424123764, + "mean_token_accuracy": 0.9966598808765411, + "num_tokens": 71085217.0, + "step": 3460 + }, + { + "entropy": 0.797072297334671, + "epoch": 6.390423572744015, + "grad_norm": 0.5385560393333435, + "learning_rate": 3.162095794391241e-05, + "loss": 0.009810312837362289, + "mean_token_accuracy": 0.9965846955776214, + "num_tokens": 71290955.0, + "step": 3470 + }, + { + "entropy": 0.8024774849414825, + "epoch": 6.4088397790055245, + "grad_norm": 0.5419294238090515, + "learning_rate": 3.1338084747786456e-05, + "loss": 0.009127366542816161, + "mean_token_accuracy": 0.9968222141265869, + "num_tokens": 71496654.0, + "step": 3480 + }, + { + "entropy": 0.8082470417022705, + "epoch": 6.427255985267035, + "grad_norm": 0.7315362095832825, + "learning_rate": 3.105590384784821e-05, + "loss": 0.008642691373825073, + "mean_token_accuracy": 0.9970867097377777, + "num_tokens": 71701725.0, + "step": 3490 + }, + { + "entropy": 0.804630172252655, + "epoch": 6.445672191528545, + "grad_norm": 0.6668549180030823, + "learning_rate": 3.0774425712088676e-05, + "loss": 0.008679335564374923, + "mean_token_accuracy": 0.9969714701175689, + "num_tokens": 71907003.0, + "step": 3500 + }, + { + "entropy": 0.7939219176769257, + "epoch": 6.464088397790055, + "grad_norm": 0.8388434648513794, + "learning_rate": 3.049366078242864e-05, + "loss": 0.009249264001846313, + "mean_token_accuracy": 0.99674671292305, + "num_tokens": 72112532.0, + "step": 3510 + }, + { + "entropy": 0.78477823138237, + "epoch": 6.482504604051566, + "grad_norm": 0.4963231682777405, + "learning_rate": 3.021361947433125e-05, + "loss": 0.009192919731140137, + "mean_token_accuracy": 0.9965968191623688, + "num_tokens": 72318518.0, + "step": 3520 + }, + { + "entropy": 0.7841647148132325, + "epoch": 6.500920810313076, + "grad_norm": 0.5681823492050171, + "learning_rate": 2.9934312176415636e-05, + "loss": 0.008821797370910645, + "mean_token_accuracy": 0.9968703150749206, + "num_tokens": 72524548.0, + "step": 3530 + }, + { + "entropy": 0.7848304688930512, + "epoch": 6.519337016574585, + "grad_norm": 0.7126080393791199, + "learning_rate": 2.965574925007154e-05, + "loss": 0.009742744266986847, + "mean_token_accuracy": 0.9964317202568054, + "num_tokens": 72729659.0, + "step": 3540 + }, + { + "entropy": 0.7899512410163879, + "epoch": 6.537753222836096, + "grad_norm": 0.7015056014060974, + "learning_rate": 2.9377941029074986e-05, + "loss": 0.008977667987346649, + "mean_token_accuracy": 0.9968570172786713, + "num_tokens": 72934865.0, + "step": 3550 + }, + { + "entropy": 0.797937935590744, + "epoch": 6.556169429097606, + "grad_norm": 0.700501024723053, + "learning_rate": 2.910089781920486e-05, + "loss": 0.00973074734210968, + "mean_token_accuracy": 0.996515303850174, + "num_tokens": 73139684.0, + "step": 3560 + }, + { + "entropy": 0.7993333518505097, + "epoch": 6.574585635359116, + "grad_norm": 0.44471475481987, + "learning_rate": 2.882462989786061e-05, + "loss": 0.008206719905138016, + "mean_token_accuracy": 0.9968972980976105, + "num_tokens": 73345547.0, + "step": 3570 + }, + { + "entropy": 0.7961922466754914, + "epoch": 6.593001841620627, + "grad_norm": 0.5011329054832458, + "learning_rate": 2.854914751368109e-05, + "loss": 0.009073075652122498, + "mean_token_accuracy": 0.9968676805496216, + "num_tokens": 73550822.0, + "step": 3580 + }, + { + "entropy": 0.8064342319965363, + "epoch": 6.611418047882136, + "grad_norm": 0.6375740766525269, + "learning_rate": 2.82744608861642e-05, + "loss": 0.009309899061918259, + "mean_token_accuracy": 0.9967096745967865, + "num_tokens": 73756564.0, + "step": 3590 + }, + { + "entropy": 0.8025432348251342, + "epoch": 6.629834254143646, + "grad_norm": 0.8449372053146362, + "learning_rate": 2.8000580205287874e-05, + "loss": 0.009333166480064391, + "mean_token_accuracy": 0.9967890501022338, + "num_tokens": 73961849.0, + "step": 3600 + }, + { + "entropy": 0.8010810256004334, + "epoch": 6.648250460405157, + "grad_norm": 0.9473148584365845, + "learning_rate": 2.772751563113213e-05, + "loss": 0.00938543900847435, + "mean_token_accuracy": 0.996571558713913, + "num_tokens": 74167074.0, + "step": 3610 + }, + { + "entropy": 0.7954266011714936, + "epoch": 6.666666666666667, + "grad_norm": 0.4197849631309509, + "learning_rate": 2.7455277293502007e-05, + "loss": 0.008846811950206757, + "mean_token_accuracy": 0.9970432996749878, + "num_tokens": 74372119.0, + "step": 3620 + }, + { + "entropy": 0.7859483778476715, + "epoch": 6.685082872928177, + "grad_norm": 0.5353069305419922, + "learning_rate": 2.7183875291551892e-05, + "loss": 0.008807064592838287, + "mean_token_accuracy": 0.9969651758670807, + "num_tokens": 74577516.0, + "step": 3630 + }, + { + "entropy": 0.7940182387828827, + "epoch": 6.703499079189687, + "grad_norm": 0.5789965391159058, + "learning_rate": 2.6913319693410828e-05, + "loss": 0.008173662424087524, + "mean_token_accuracy": 0.9970715939998627, + "num_tokens": 74783031.0, + "step": 3640 + }, + { + "entropy": 0.7871349632740021, + "epoch": 6.721915285451197, + "grad_norm": 0.5887596011161804, + "learning_rate": 2.6643620535809076e-05, + "loss": 0.008517104387283325, + "mean_token_accuracy": 0.9969267845153809, + "num_tokens": 74988646.0, + "step": 3650 + }, + { + "entropy": 0.783170485496521, + "epoch": 6.740331491712707, + "grad_norm": 0.6228395104408264, + "learning_rate": 2.637478782370574e-05, + "loss": 0.008941689878702164, + "mean_token_accuracy": 0.9967794418334961, + "num_tokens": 75193938.0, + "step": 3660 + }, + { + "entropy": 0.7779926240444184, + "epoch": 6.758747697974218, + "grad_norm": 0.7367292642593384, + "learning_rate": 2.61068315299176e-05, + "loss": 0.009662539511919022, + "mean_token_accuracy": 0.9965554535388946, + "num_tokens": 75399816.0, + "step": 3670 + }, + { + "entropy": 0.7756146490573883, + "epoch": 6.777163904235728, + "grad_norm": 0.7478228807449341, + "learning_rate": 2.5839761594749167e-05, + "loss": 0.008691602945327758, + "mean_token_accuracy": 0.996806287765503, + "num_tokens": 75605531.0, + "step": 3680 + }, + { + "entropy": 0.7793804049491883, + "epoch": 6.795580110497237, + "grad_norm": 0.580205500125885, + "learning_rate": 2.5573587925623964e-05, + "loss": 0.00922732800245285, + "mean_token_accuracy": 0.9966219186782836, + "num_tokens": 75811143.0, + "step": 3690 + }, + { + "entropy": 0.7817609786987305, + "epoch": 6.813996316758748, + "grad_norm": 0.3849862813949585, + "learning_rate": 2.530832039671694e-05, + "loss": 0.00812167227268219, + "mean_token_accuracy": 0.9970280706882477, + "num_tokens": 76016366.0, + "step": 3700 + }, + { + "entropy": 0.7793294489383698, + "epoch": 6.832412523020258, + "grad_norm": 0.4873282313346863, + "learning_rate": 2.504396884858825e-05, + "loss": 0.008183138072490692, + "mean_token_accuracy": 0.9973145961761475, + "num_tokens": 76221692.0, + "step": 3710 + }, + { + "entropy": 0.7805068492889404, + "epoch": 6.850828729281768, + "grad_norm": 0.6652786135673523, + "learning_rate": 2.478054308781807e-05, + "loss": 0.009141853451728821, + "mean_token_accuracy": 0.9968416154384613, + "num_tokens": 76427170.0, + "step": 3720 + }, + { + "entropy": 0.7799863159656525, + "epoch": 6.8692449355432785, + "grad_norm": 0.5895428657531738, + "learning_rate": 2.451805288664298e-05, + "loss": 0.009343943744897842, + "mean_token_accuracy": 0.9968909084796905, + "num_tokens": 76632450.0, + "step": 3730 + }, + { + "entropy": 0.7819362223148346, + "epoch": 6.887661141804788, + "grad_norm": 0.6007734537124634, + "learning_rate": 2.425650798259327e-05, + "loss": 0.008081933856010437, + "mean_token_accuracy": 0.9972956955432892, + "num_tokens": 76837993.0, + "step": 3740 + }, + { + "entropy": 0.7859819054603576, + "epoch": 6.906077348066298, + "grad_norm": 0.5510725975036621, + "learning_rate": 2.39959180781318e-05, + "loss": 0.008848348259925842, + "mean_token_accuracy": 0.9968287885189057, + "num_tokens": 77043697.0, + "step": 3750 + }, + { + "entropy": 0.7813855290412903, + "epoch": 6.9244935543278086, + "grad_norm": 0.5004434585571289, + "learning_rate": 2.3736292840294122e-05, + "loss": 0.00795777291059494, + "mean_token_accuracy": 0.9973017990589141, + "num_tokens": 77248720.0, + "step": 3760 + }, + { + "entropy": 0.774254196882248, + "epoch": 6.942909760589319, + "grad_norm": 0.7068622708320618, + "learning_rate": 2.347764190032974e-05, + "loss": 0.007790238410234451, + "mean_token_accuracy": 0.997188663482666, + "num_tokens": 77454096.0, + "step": 3770 + }, + { + "entropy": 0.7675817251205445, + "epoch": 6.961325966850829, + "grad_norm": 0.5110977292060852, + "learning_rate": 2.3219974853344905e-05, + "loss": 0.008631375432014466, + "mean_token_accuracy": 0.9967362582683563, + "num_tokens": 77659811.0, + "step": 3780 + }, + { + "entropy": 0.7719516515731811, + "epoch": 6.979742173112339, + "grad_norm": 0.6288211941719055, + "learning_rate": 2.2963301257946622e-05, + "loss": 0.00804171860218048, + "mean_token_accuracy": 0.9971263229846954, + "num_tokens": 77865539.0, + "step": 3790 + }, + { + "entropy": 0.7786632418632508, + "epoch": 6.998158379373849, + "grad_norm": 0.5279833078384399, + "learning_rate": 2.270763063588814e-05, + "loss": 0.007490953803062439, + "mean_token_accuracy": 0.9974353730678558, + "num_tokens": 78070767.0, + "step": 3800 + }, + { + "epoch": 7.0, + "eval_entropy": 0.7808213239130767, + "eval_loss": 0.07382760941982269, + "eval_mean_token_accuracy": 0.9800234224485315, + "eval_num_tokens": 78091327.0, + "eval_runtime": 10.072, + "eval_samples_per_second": 363.186, + "eval_steps_per_second": 11.418, + "step": 3801 + }, + { + "entropy": 0.7750412881374359, + "epoch": 7.016574585635359, + "grad_norm": 0.3850567936897278, + "learning_rate": 2.2452972471715644e-05, + "loss": 0.005539501458406449, + "mean_token_accuracy": 0.9983771502971649, + "num_tokens": 78276132.0, + "step": 3810 + }, + { + "entropy": 0.7673899948596954, + "epoch": 7.0349907918968695, + "grad_norm": 0.4390123188495636, + "learning_rate": 2.2199336212416406e-05, + "loss": 0.0051019065082073215, + "mean_token_accuracy": 0.9984941363334656, + "num_tokens": 78481842.0, + "step": 3820 + }, + { + "entropy": 0.7712743639945984, + "epoch": 7.05340699815838, + "grad_norm": 0.48151132464408875, + "learning_rate": 2.1946731267068386e-05, + "loss": 0.005401181802153587, + "mean_token_accuracy": 0.9984619855880738, + "num_tokens": 78687831.0, + "step": 3830 + }, + { + "entropy": 0.772344833612442, + "epoch": 7.071823204419889, + "grad_norm": 0.3234920799732208, + "learning_rate": 2.169516700649115e-05, + "loss": 0.004806514084339142, + "mean_token_accuracy": 0.9984551191329956, + "num_tokens": 78893336.0, + "step": 3840 + }, + { + "entropy": 0.7736243844032288, + "epoch": 7.0902394106813995, + "grad_norm": 0.4605523645877838, + "learning_rate": 2.1444652762898242e-05, + "loss": 0.0041438989341259, + "mean_token_accuracy": 0.9988476693630218, + "num_tokens": 79098566.0, + "step": 3850 + }, + { + "entropy": 0.7667625486850739, + "epoch": 7.10865561694291, + "grad_norm": 0.43270638585090637, + "learning_rate": 2.119519782955105e-05, + "loss": 0.004775972291827202, + "mean_token_accuracy": 0.9984218835830688, + "num_tokens": 79303871.0, + "step": 3860 + }, + { + "entropy": 0.7631282329559326, + "epoch": 7.12707182320442, + "grad_norm": 0.35699328780174255, + "learning_rate": 2.094681146041394e-05, + "loss": 0.00421409159898758, + "mean_token_accuracy": 0.9988139629364013, + "num_tokens": 79509090.0, + "step": 3870 + }, + { + "entropy": 0.7578658938407898, + "epoch": 7.14548802946593, + "grad_norm": 0.6432686448097229, + "learning_rate": 2.06995028698111e-05, + "loss": 0.004374136403203011, + "mean_token_accuracy": 0.998746919631958, + "num_tokens": 79714834.0, + "step": 3880 + }, + { + "entropy": 0.7558880388736725, + "epoch": 7.16390423572744, + "grad_norm": 0.7402953505516052, + "learning_rate": 2.0453281232084586e-05, + "loss": 0.004856631904840469, + "mean_token_accuracy": 0.9985869526863098, + "num_tokens": 79920226.0, + "step": 3890 + }, + { + "entropy": 0.7542947113513947, + "epoch": 7.18232044198895, + "grad_norm": 0.3336258828639984, + "learning_rate": 2.0208155681254076e-05, + "loss": 0.0044605318456888195, + "mean_token_accuracy": 0.9986851871013641, + "num_tokens": 80125096.0, + "step": 3900 + }, + { + "entropy": 0.7547785460948944, + "epoch": 7.2007366482504604, + "grad_norm": 0.48785507678985596, + "learning_rate": 1.9964135310678017e-05, + "loss": 0.004243453219532967, + "mean_token_accuracy": 0.99877148270607, + "num_tokens": 80330544.0, + "step": 3910 + }, + { + "entropy": 0.7504124104976654, + "epoch": 7.219152854511971, + "grad_norm": 0.462425172328949, + "learning_rate": 1.9721229172716245e-05, + "loss": 0.004574070125818253, + "mean_token_accuracy": 0.9984130024909973, + "num_tokens": 80536072.0, + "step": 3920 + }, + { + "entropy": 0.7527936816215515, + "epoch": 7.237569060773481, + "grad_norm": 0.38035058975219727, + "learning_rate": 1.9479446278394208e-05, + "loss": 0.004055039957165718, + "mean_token_accuracy": 0.9987563371658326, + "num_tokens": 80741305.0, + "step": 3930 + }, + { + "entropy": 0.7502566337585449, + "epoch": 7.2559852670349905, + "grad_norm": 0.3040870428085327, + "learning_rate": 1.9238795597068665e-05, + "loss": 0.0041418131440877914, + "mean_token_accuracy": 0.998740965127945, + "num_tokens": 80946776.0, + "step": 3940 + }, + { + "entropy": 0.7453009426593781, + "epoch": 7.274401473296501, + "grad_norm": 0.39008331298828125, + "learning_rate": 1.8999286056095e-05, + "loss": 0.003949865326285362, + "mean_token_accuracy": 0.9989946007728576, + "num_tokens": 81151930.0, + "step": 3950 + }, + { + "entropy": 0.7451439797878265, + "epoch": 7.292817679558011, + "grad_norm": 0.44502392411231995, + "learning_rate": 1.8760926540496006e-05, + "loss": 0.0047814734280109406, + "mean_token_accuracy": 0.9984888076782227, + "num_tokens": 81357559.0, + "step": 3960 + }, + { + "entropy": 0.749438214302063, + "epoch": 7.311233885819521, + "grad_norm": 0.5195235013961792, + "learning_rate": 1.8523725892632253e-05, + "loss": 0.004281774908304214, + "mean_token_accuracy": 0.9987141609191894, + "num_tokens": 81563294.0, + "step": 3970 + }, + { + "entropy": 0.7536393761634826, + "epoch": 7.329650092081032, + "grad_norm": 0.34366822242736816, + "learning_rate": 1.828769291187413e-05, + "loss": 0.0038790594786405562, + "mean_token_accuracy": 0.9988636136054992, + "num_tokens": 81768219.0, + "step": 3980 + }, + { + "entropy": 0.750888729095459, + "epoch": 7.348066298342541, + "grad_norm": 0.3888038098812103, + "learning_rate": 1.8052836354275355e-05, + "loss": 0.0046471841633319855, + "mean_token_accuracy": 0.9985671877861023, + "num_tokens": 81973285.0, + "step": 3990 + }, + { + "entropy": 0.7512096881866455, + "epoch": 7.366482504604051, + "grad_norm": 0.4684146046638489, + "learning_rate": 1.7819164932248194e-05, + "loss": 0.0049116648733615875, + "mean_token_accuracy": 0.9983943462371826, + "num_tokens": 82178663.0, + "step": 4000 + }, + { + "entropy": 0.7514511585235596, + "epoch": 7.384898710865562, + "grad_norm": 0.571757435798645, + "learning_rate": 1.7586687314240296e-05, + "loss": 0.005085925757884979, + "mean_token_accuracy": 0.9984491765499115, + "num_tokens": 82384703.0, + "step": 4010 + }, + { + "entropy": 0.7534020125865937, + "epoch": 7.403314917127072, + "grad_norm": 0.3526608943939209, + "learning_rate": 1.7355412124412988e-05, + "loss": 0.004772019758820534, + "mean_token_accuracy": 0.9984830975532532, + "num_tokens": 82590547.0, + "step": 4020 + }, + { + "entropy": 0.7495484054088593, + "epoch": 7.421731123388582, + "grad_norm": 0.389273464679718, + "learning_rate": 1.7125347942321523e-05, + "loss": 0.004463380947709083, + "mean_token_accuracy": 0.9985935151576996, + "num_tokens": 82795801.0, + "step": 4030 + }, + { + "entropy": 0.7503954172134399, + "epoch": 7.440147329650092, + "grad_norm": 0.44278526306152344, + "learning_rate": 1.689650330259665e-05, + "loss": 0.004496005177497864, + "mean_token_accuracy": 0.9985817015171051, + "num_tokens": 83001332.0, + "step": 4040 + }, + { + "entropy": 0.7515525698661805, + "epoch": 7.458563535911602, + "grad_norm": 0.418070524930954, + "learning_rate": 1.666888669462809e-05, + "loss": 0.004161220416426659, + "mean_token_accuracy": 0.9988651812076569, + "num_tokens": 83206211.0, + "step": 4050 + }, + { + "entropy": 0.747660368680954, + "epoch": 7.476979742173112, + "grad_norm": 0.52656090259552, + "learning_rate": 1.6442506562249622e-05, + "loss": 0.0042838241904973985, + "mean_token_accuracy": 0.9985848963260651, + "num_tokens": 83411773.0, + "step": 4060 + }, + { + "entropy": 0.7506191194057464, + "epoch": 7.495395948434623, + "grad_norm": 0.29261597990989685, + "learning_rate": 1.621737130342578e-05, + "loss": 0.003957664594054222, + "mean_token_accuracy": 0.9986599206924438, + "num_tokens": 83617231.0, + "step": 4070 + }, + { + "entropy": 0.7549885094165802, + "epoch": 7.513812154696133, + "grad_norm": 0.35437923669815063, + "learning_rate": 1.599348926994036e-05, + "loss": 0.0034299422055482864, + "mean_token_accuracy": 0.9990102112293243, + "num_tokens": 83822893.0, + "step": 4080 + }, + { + "entropy": 0.7578713536262512, + "epoch": 7.532228360957642, + "grad_norm": 0.38587674498558044, + "learning_rate": 1.5770868767086567e-05, + "loss": 0.003320001810789108, + "mean_token_accuracy": 0.9990382492542267, + "num_tokens": 84028376.0, + "step": 4090 + }, + { + "entropy": 0.754696124792099, + "epoch": 7.550644567219153, + "grad_norm": 0.39879584312438965, + "learning_rate": 1.554951805335897e-05, + "loss": 0.004190019145607948, + "mean_token_accuracy": 0.9987861573696136, + "num_tokens": 84233768.0, + "step": 4100 + }, + { + "entropy": 0.7515742480754852, + "epoch": 7.569060773480663, + "grad_norm": 0.47624126076698303, + "learning_rate": 1.5329445340147096e-05, + "loss": 0.00403064489364624, + "mean_token_accuracy": 0.9986754775047302, + "num_tokens": 84439923.0, + "step": 4110 + }, + { + "entropy": 0.7527350902557373, + "epoch": 7.587476979742173, + "grad_norm": 0.4721614718437195, + "learning_rate": 1.5110658791430804e-05, + "loss": 0.004505171626806259, + "mean_token_accuracy": 0.9985378623008728, + "num_tokens": 84645432.0, + "step": 4120 + }, + { + "entropy": 0.747485089302063, + "epoch": 7.605893186003684, + "grad_norm": 0.49411219358444214, + "learning_rate": 1.4893166523477448e-05, + "loss": 0.0038516007363796232, + "mean_token_accuracy": 0.9987127304077148, + "num_tokens": 84850968.0, + "step": 4130 + }, + { + "entropy": 0.7463041722774506, + "epoch": 7.624309392265193, + "grad_norm": 0.4478297531604767, + "learning_rate": 1.4676976604540787e-05, + "loss": 0.00429936945438385, + "mean_token_accuracy": 0.9987892746925354, + "num_tokens": 85056363.0, + "step": 4140 + }, + { + "entropy": 0.7416360318660736, + "epoch": 7.642725598526703, + "grad_norm": 0.4913847744464874, + "learning_rate": 1.4462097054561675e-05, + "loss": 0.0036755587905645372, + "mean_token_accuracy": 0.9989015281200408, + "num_tokens": 85262252.0, + "step": 4150 + }, + { + "entropy": 0.7403277635574341, + "epoch": 7.661141804788214, + "grad_norm": 0.49693024158477783, + "learning_rate": 1.4248535844870586e-05, + "loss": 0.0037889480590820312, + "mean_token_accuracy": 0.99878990650177, + "num_tokens": 85468067.0, + "step": 4160 + }, + { + "entropy": 0.7400458335876465, + "epoch": 7.679558011049724, + "grad_norm": 0.4609115421772003, + "learning_rate": 1.4036300897891819e-05, + "loss": 0.004160438477993011, + "mean_token_accuracy": 0.9985541105270386, + "num_tokens": 85673442.0, + "step": 4170 + }, + { + "entropy": 0.7405486226081848, + "epoch": 7.697974217311234, + "grad_norm": 0.4713679254055023, + "learning_rate": 1.3825400086849693e-05, + "loss": 0.004131061211228371, + "mean_token_accuracy": 0.9986487686634063, + "num_tokens": 85878837.0, + "step": 4180 + }, + { + "entropy": 0.7426068425178528, + "epoch": 7.716390423572744, + "grad_norm": 0.36043041944503784, + "learning_rate": 1.3615841235476423e-05, + "loss": 0.004306273162364959, + "mean_token_accuracy": 0.9986724078655242, + "num_tokens": 86083884.0, + "step": 4190 + }, + { + "entropy": 0.7389791548252106, + "epoch": 7.734806629834254, + "grad_norm": 0.4564935863018036, + "learning_rate": 1.3407632117721858e-05, + "loss": 0.003909315168857575, + "mean_token_accuracy": 0.998731005191803, + "num_tokens": 86289698.0, + "step": 4200 + }, + { + "entropy": 0.7440706253051758, + "epoch": 7.753222836095764, + "grad_norm": 0.442862331867218, + "learning_rate": 1.3200780457465211e-05, + "loss": 0.0041195075958967205, + "mean_token_accuracy": 0.9987683832645416, + "num_tokens": 86494738.0, + "step": 4210 + }, + { + "entropy": 0.738617730140686, + "epoch": 7.7716390423572745, + "grad_norm": 0.4486972391605377, + "learning_rate": 1.2995293928228385e-05, + "loss": 0.003850420191884041, + "mean_token_accuracy": 0.998667311668396, + "num_tokens": 86700730.0, + "step": 4220 + }, + { + "entropy": 0.7406526923179626, + "epoch": 7.790055248618785, + "grad_norm": 0.5896158218383789, + "learning_rate": 1.2791180152891396e-05, + "loss": 0.004078804701566696, + "mean_token_accuracy": 0.9987831771373749, + "num_tokens": 86905871.0, + "step": 4230 + }, + { + "entropy": 0.7394271969795227, + "epoch": 7.808471454880294, + "grad_norm": 0.5551350116729736, + "learning_rate": 1.2588446703409552e-05, + "loss": 0.004226792231202125, + "mean_token_accuracy": 0.9985223591327668, + "num_tokens": 87111128.0, + "step": 4240 + }, + { + "entropy": 0.7396899223327636, + "epoch": 7.826887661141805, + "grad_norm": 0.48465287685394287, + "learning_rate": 1.23871011005326e-05, + "loss": 0.004565985128283501, + "mean_token_accuracy": 0.9984864890575409, + "num_tokens": 87316264.0, + "step": 4250 + }, + { + "entropy": 0.7409733414649964, + "epoch": 7.845303867403315, + "grad_norm": 0.44551214575767517, + "learning_rate": 1.218715081352571e-05, + "loss": 0.004014456272125244, + "mean_token_accuracy": 0.9988965094089508, + "num_tokens": 87521547.0, + "step": 4260 + }, + { + "entropy": 0.7397344529628753, + "epoch": 7.863720073664825, + "grad_norm": 0.5410996079444885, + "learning_rate": 1.198860325989235e-05, + "loss": 0.0038732051849365234, + "mean_token_accuracy": 0.9988058865070343, + "num_tokens": 87727075.0, + "step": 4270 + }, + { + "entropy": 0.7383821964263916, + "epoch": 7.8821362799263355, + "grad_norm": 0.3724612891674042, + "learning_rate": 1.1791465805099183e-05, + "loss": 0.0038180787116289137, + "mean_token_accuracy": 0.9988650500774383, + "num_tokens": 87932399.0, + "step": 4280 + }, + { + "entropy": 0.73704674243927, + "epoch": 7.900552486187845, + "grad_norm": 0.44809216260910034, + "learning_rate": 1.1595745762302779e-05, + "loss": 0.0037666790187358854, + "mean_token_accuracy": 0.9988551497459411, + "num_tokens": 88138197.0, + "step": 4290 + }, + { + "entropy": 0.7348978996276856, + "epoch": 7.918968692449355, + "grad_norm": 0.3487635850906372, + "learning_rate": 1.140145039207836e-05, + "loss": 0.0034891828894615174, + "mean_token_accuracy": 0.9990123450756073, + "num_tokens": 88343771.0, + "step": 4300 + }, + { + "entropy": 0.7349947333335877, + "epoch": 7.9373848987108655, + "grad_norm": 0.45597800612449646, + "learning_rate": 1.1208586902150458e-05, + "loss": 0.0037573061883449553, + "mean_token_accuracy": 0.9988206088542938, + "num_tokens": 88549078.0, + "step": 4310 + }, + { + "entropy": 0.7384503066539765, + "epoch": 7.955801104972376, + "grad_norm": 0.4761682450771332, + "learning_rate": 1.1017162447125484e-05, + "loss": 0.004058422148227691, + "mean_token_accuracy": 0.9988016963005066, + "num_tokens": 88754354.0, + "step": 4320 + }, + { + "entropy": 0.7428180873394012, + "epoch": 7.974217311233886, + "grad_norm": 0.4685237407684326, + "learning_rate": 1.0827184128226392e-05, + "loss": 0.003867045044898987, + "mean_token_accuracy": 0.998711907863617, + "num_tokens": 88959534.0, + "step": 4330 + }, + { + "entropy": 0.7457234025001526, + "epoch": 7.9926335174953955, + "grad_norm": 0.35596850514411926, + "learning_rate": 1.0638658993029154e-05, + "loss": 0.0037776529788970947, + "mean_token_accuracy": 0.9989305913448334, + "num_tokens": 89164857.0, + "step": 4340 + }, + { + "epoch": 8.0, + "eval_entropy": 0.7504938332930855, + "eval_loss": 0.08306439220905304, + "eval_mean_token_accuracy": 0.980349570253621, + "eval_num_tokens": 89247188.0, + "eval_runtime": 10.0529, + "eval_samples_per_second": 363.875, + "eval_steps_per_second": 11.439, + "step": 4344 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.2545465251510354e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4344/training_args.bin b/checkpoint-4344/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-4344/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/checkpoint-4887/README.md b/checkpoint-4887/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-4887/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4887/adapter_config.json b/checkpoint-4887/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-4887/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4887/adapter_model.safetensors b/checkpoint-4887/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa6b3d5563d1dd4e0ea15ef2feb9aff353870675 --- /dev/null +++ b/checkpoint-4887/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c269f0085f232e646c2122f94ff4b91b649dee880f799ec1eeab03dc06bc430 +size 80792096 diff --git a/checkpoint-4887/chat_template.jinja b/checkpoint-4887/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-4887/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-4887/tokenizer.json b/checkpoint-4887/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-4887/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-4887/tokenizer_config.json b/checkpoint-4887/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-4887/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-4887/trainer_state.json b/checkpoint-4887/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..df8f775e3d9dae711387688d9f3cb982431c848a --- /dev/null +++ b/checkpoint-4887/trainer_state.json @@ -0,0 +1,5013 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 4887, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + }, + { + "entropy": 0.9047818422317505, + "epoch": 1.0128913443830572, + "grad_norm": 0.6781501173973083, + "learning_rate": 9.929008521021325e-05, + "loss": 0.0863916516304016, + "mean_token_accuracy": 0.9673655688762665, + "num_tokens": 11299715.0, + "step": 550 + }, + { + "entropy": 0.8856981039047241, + "epoch": 1.0313075506445673, + "grad_norm": 0.7143136858940125, + "learning_rate": 9.923803553055937e-05, + "loss": 0.08632323145866394, + "mean_token_accuracy": 0.9677783191204071, + "num_tokens": 11505059.0, + "step": 560 + }, + { + "entropy": 0.8937099635601043, + "epoch": 1.0497237569060773, + "grad_norm": 0.7751694321632385, + "learning_rate": 9.918415928038325e-05, + "loss": 0.08178263902664185, + "mean_token_accuracy": 0.9694291114807129, + "num_tokens": 11710464.0, + "step": 570 + }, + { + "entropy": 0.8858704209327698, + "epoch": 1.0681399631675874, + "grad_norm": 0.7492292523384094, + "learning_rate": 9.912845845831805e-05, + "loss": 0.08074211478233337, + "mean_token_accuracy": 0.9692470014095307, + "num_tokens": 11915959.0, + "step": 580 + }, + { + "entropy": 0.8948039829730987, + "epoch": 1.0865561694290977, + "grad_norm": 0.8116479516029358, + "learning_rate": 9.907093513068259e-05, + "loss": 0.08712012171745301, + "mean_token_accuracy": 0.9669980227947235, + "num_tokens": 12121499.0, + "step": 590 + }, + { + "entropy": 0.8846789538860321, + "epoch": 1.1049723756906078, + "grad_norm": 0.7295626997947693, + "learning_rate": 9.901159143140471e-05, + "loss": 0.08444435596466064, + "mean_token_accuracy": 0.9674544095993042, + "num_tokens": 12327061.0, + "step": 600 + }, + { + "entropy": 0.8734103918075562, + "epoch": 1.1233885819521179, + "grad_norm": 0.9585768580436707, + "learning_rate": 9.89504295619421e-05, + "loss": 0.08022565841674804, + "mean_token_accuracy": 0.969569206237793, + "num_tokens": 12532305.0, + "step": 610 + }, + { + "entropy": 0.8640486001968384, + "epoch": 1.141804788213628, + "grad_norm": 0.7891159057617188, + "learning_rate": 9.88874517912006e-05, + "loss": 0.08415375947952271, + "mean_token_accuracy": 0.9678892493247986, + "num_tokens": 12737828.0, + "step": 620 + }, + { + "entropy": 0.8599755525588989, + "epoch": 1.160220994475138, + "grad_norm": 0.5801345109939575, + "learning_rate": 9.882266045545012e-05, + "loss": 0.08100489974021911, + "mean_token_accuracy": 0.9688023269176483, + "num_tokens": 12943343.0, + "step": 630 + }, + { + "entropy": 0.86524977684021, + "epoch": 1.1786372007366483, + "grad_norm": 0.7633041143417358, + "learning_rate": 9.87560579582379e-05, + "loss": 0.07859406471252442, + "mean_token_accuracy": 0.9702189445495606, + "num_tokens": 13148473.0, + "step": 640 + }, + { + "entropy": 0.8466695249080658, + "epoch": 1.1970534069981584, + "grad_norm": 0.8672215938568115, + "learning_rate": 9.868764677029934e-05, + "loss": 0.08082623481750488, + "mean_token_accuracy": 0.9689972400665283, + "num_tokens": 13353890.0, + "step": 650 + }, + { + "entropy": 0.8596941530704498, + "epoch": 1.2154696132596685, + "grad_norm": 0.7524124383926392, + "learning_rate": 9.861742942946639e-05, + "loss": 0.0789935290813446, + "mean_token_accuracy": 0.9693858206272126, + "num_tokens": 13559475.0, + "step": 660 + }, + { + "entropy": 0.8708749234676361, + "epoch": 1.2338858195211786, + "grad_norm": 0.5777031183242798, + "learning_rate": 9.854540854057337e-05, + "loss": 0.07773642539978028, + "mean_token_accuracy": 0.970385092496872, + "num_tokens": 13765076.0, + "step": 670 + }, + { + "entropy": 0.8651713371276856, + "epoch": 1.2523020257826887, + "grad_norm": 0.7924166321754456, + "learning_rate": 9.847158677536034e-05, + "loss": 0.0766686737537384, + "mean_token_accuracy": 0.9702267110347748, + "num_tokens": 13970642.0, + "step": 680 + }, + { + "entropy": 0.8763024985790253, + "epoch": 1.270718232044199, + "grad_norm": 0.741219162940979, + "learning_rate": 9.839596687237403e-05, + "loss": 0.07189929485321045, + "mean_token_accuracy": 0.9727097094058991, + "num_tokens": 14176556.0, + "step": 690 + }, + { + "entropy": 0.8556921362876893, + "epoch": 1.289134438305709, + "grad_norm": 0.6298198103904724, + "learning_rate": 9.831855163686618e-05, + "loss": 0.07608137726783752, + "mean_token_accuracy": 0.9716399371623993, + "num_tokens": 14381686.0, + "step": 700 + }, + { + "entropy": 0.869178420305252, + "epoch": 1.3075506445672191, + "grad_norm": 0.5850273370742798, + "learning_rate": 9.823934394068952e-05, + "loss": 0.07437651753425598, + "mean_token_accuracy": 0.9709566533565521, + "num_tokens": 14586814.0, + "step": 710 + }, + { + "entropy": 0.8708595156669616, + "epoch": 1.3259668508287292, + "grad_norm": 0.6580632328987122, + "learning_rate": 9.815834672219127e-05, + "loss": 0.07518917322158813, + "mean_token_accuracy": 0.9717426657676697, + "num_tokens": 14792321.0, + "step": 720 + }, + { + "entropy": 0.8826817810535431, + "epoch": 1.3443830570902393, + "grad_norm": 0.8788532018661499, + "learning_rate": 9.807556298610404e-05, + "loss": 0.07579240798950196, + "mean_token_accuracy": 0.9706341981887817, + "num_tokens": 14997810.0, + "step": 730 + }, + { + "entropy": 0.9012470185756684, + "epoch": 1.3627992633517496, + "grad_norm": 0.7022138237953186, + "learning_rate": 9.799099580343441e-05, + "loss": 0.0775588572025299, + "mean_token_accuracy": 0.9699241399765015, + "num_tokens": 15203795.0, + "step": 740 + }, + { + "entropy": 0.886955714225769, + "epoch": 1.3812154696132597, + "grad_norm": 0.7881133556365967, + "learning_rate": 9.790464831134903e-05, + "loss": 0.07125020027160645, + "mean_token_accuracy": 0.9723815560340882, + "num_tokens": 15408974.0, + "step": 750 + }, + { + "entropy": 0.9047374844551086, + "epoch": 1.3996316758747698, + "grad_norm": 0.9082005023956299, + "learning_rate": 9.781652371305824e-05, + "loss": 0.07004334926605224, + "mean_token_accuracy": 0.9725580036640167, + "num_tokens": 15614399.0, + "step": 760 + }, + { + "entropy": 0.9039053857326508, + "epoch": 1.4180478821362799, + "grad_norm": 0.8060817122459412, + "learning_rate": 9.77266252776972e-05, + "loss": 0.07103485465049744, + "mean_token_accuracy": 0.9721468150615692, + "num_tokens": 15819895.0, + "step": 770 + }, + { + "entropy": 0.8998047232627868, + "epoch": 1.43646408839779, + "grad_norm": 1.0152642726898193, + "learning_rate": 9.763495634020467e-05, + "loss": 0.07411704063415528, + "mean_token_accuracy": 0.9711063146591187, + "num_tokens": 16025297.0, + "step": 780 + }, + { + "entropy": 0.9120213568210602, + "epoch": 1.4548802946593002, + "grad_norm": 0.6288319826126099, + "learning_rate": 9.754152030119921e-05, + "loss": 0.07223712205886841, + "mean_token_accuracy": 0.9722476422786712, + "num_tokens": 16230656.0, + "step": 790 + }, + { + "entropy": 0.9142370820045471, + "epoch": 1.4732965009208103, + "grad_norm": 0.7854700088500977, + "learning_rate": 9.744632062685311e-05, + "loss": 0.07186744809150696, + "mean_token_accuracy": 0.972247713804245, + "num_tokens": 16435943.0, + "step": 800 + }, + { + "entropy": 0.8920814216136932, + "epoch": 1.4917127071823204, + "grad_norm": 0.6227074265480042, + "learning_rate": 9.734936084876383e-05, + "loss": 0.07016961574554444, + "mean_token_accuracy": 0.9725603640079499, + "num_tokens": 16641635.0, + "step": 810 + }, + { + "entropy": 0.891328877210617, + "epoch": 1.5101289134438307, + "grad_norm": 0.7601346969604492, + "learning_rate": 9.725064456382283e-05, + "loss": 0.07137494087219239, + "mean_token_accuracy": 0.9722997546195984, + "num_tokens": 16847194.0, + "step": 820 + }, + { + "entropy": 0.8921217978000641, + "epoch": 1.5285451197053406, + "grad_norm": 0.7813850045204163, + "learning_rate": 9.715017543408233e-05, + "loss": 0.06890199184417725, + "mean_token_accuracy": 0.9735044002532959, + "num_tokens": 17052807.0, + "step": 830 + }, + { + "entropy": 0.9085914671421051, + "epoch": 1.5469613259668509, + "grad_norm": 0.6184289455413818, + "learning_rate": 9.704795718661939e-05, + "loss": 0.07043765187263488, + "mean_token_accuracy": 0.9725716531276702, + "num_tokens": 17258284.0, + "step": 840 + }, + { + "entropy": 0.9029861629009247, + "epoch": 1.565377532228361, + "grad_norm": 0.7082377076148987, + "learning_rate": 9.694399361339752e-05, + "loss": 0.07113839387893676, + "mean_token_accuracy": 0.9725669205188752, + "num_tokens": 17464326.0, + "step": 850 + }, + { + "entropy": 0.8856533527374267, + "epoch": 1.583793738489871, + "grad_norm": 0.7409216165542603, + "learning_rate": 9.683828857112627e-05, + "loss": 0.07077333331108093, + "mean_token_accuracy": 0.9731084644794464, + "num_tokens": 17669537.0, + "step": 860 + }, + { + "entropy": 0.8613030433654785, + "epoch": 1.6022099447513813, + "grad_norm": 0.6801561713218689, + "learning_rate": 9.673084598111789e-05, + "loss": 0.06885308027267456, + "mean_token_accuracy": 0.97266526222229, + "num_tokens": 17875289.0, + "step": 870 + }, + { + "entropy": 0.8692965865135193, + "epoch": 1.6206261510128912, + "grad_norm": 1.1621277332305908, + "learning_rate": 9.662166982914203e-05, + "loss": 0.07017780542373657, + "mean_token_accuracy": 0.9733059942722321, + "num_tokens": 18080404.0, + "step": 880 + }, + { + "entropy": 0.8671502113342285, + "epoch": 1.6390423572744015, + "grad_norm": 0.7518903613090515, + "learning_rate": 9.651076416527787e-05, + "loss": 0.06977018713951111, + "mean_token_accuracy": 0.9730017304420471, + "num_tokens": 18285699.0, + "step": 890 + }, + { + "entropy": 0.8662045657634735, + "epoch": 1.6574585635359116, + "grad_norm": 0.6622698903083801, + "learning_rate": 9.639813310376378e-05, + "loss": 0.06620995998382569, + "mean_token_accuracy": 0.9737491130828857, + "num_tokens": 18491097.0, + "step": 900 + }, + { + "entropy": 0.8548173069953918, + "epoch": 1.6758747697974217, + "grad_norm": 0.8941843509674072, + "learning_rate": 9.628378082284479e-05, + "loss": 0.06711119413375854, + "mean_token_accuracy": 0.9740589797496796, + "num_tokens": 18696827.0, + "step": 910 + }, + { + "entropy": 0.8763562262058258, + "epoch": 1.694290976058932, + "grad_norm": 0.7571700215339661, + "learning_rate": 9.616771156461755e-05, + "loss": 0.07263468503952027, + "mean_token_accuracy": 0.9717419981956482, + "num_tokens": 18902513.0, + "step": 920 + }, + { + "entropy": 0.8663733780384064, + "epoch": 1.7127071823204418, + "grad_norm": 0.7886489629745483, + "learning_rate": 9.604992963487298e-05, + "loss": 0.07074605226516724, + "mean_token_accuracy": 0.9724965393543243, + "num_tokens": 19107812.0, + "step": 930 + }, + { + "entropy": 0.8673004627227783, + "epoch": 1.7311233885819521, + "grad_norm": 0.8180726170539856, + "learning_rate": 9.593043940293647e-05, + "loss": 0.06831735372543335, + "mean_token_accuracy": 0.9733696818351746, + "num_tokens": 19313330.0, + "step": 940 + }, + { + "entropy": 0.8525971233844757, + "epoch": 1.7495395948434622, + "grad_norm": 0.6576228737831116, + "learning_rate": 9.580924530150595e-05, + "loss": 0.06567002534866333, + "mean_token_accuracy": 0.9745754361152649, + "num_tokens": 19518671.0, + "step": 950 + }, + { + "entropy": 0.8605451703071594, + "epoch": 1.7679558011049723, + "grad_norm": 0.7171661257743835, + "learning_rate": 9.568635182648725e-05, + "loss": 0.06872050762176514, + "mean_token_accuracy": 0.9732091546058654, + "num_tokens": 19724135.0, + "step": 960 + }, + { + "entropy": 0.8642210960388184, + "epoch": 1.7863720073664826, + "grad_norm": 0.7603147029876709, + "learning_rate": 9.556176353682746e-05, + "loss": 0.06766576766967773, + "mean_token_accuracy": 0.9728681743144989, + "num_tokens": 19928785.0, + "step": 970 + }, + { + "entropy": 0.8543185651302337, + "epoch": 1.8047882136279927, + "grad_norm": 0.7280875444412231, + "learning_rate": 9.543548505434581e-05, + "loss": 0.06851862668991089, + "mean_token_accuracy": 0.9737437188625335, + "num_tokens": 20134195.0, + "step": 980 + }, + { + "entropy": 0.8744745373725891, + "epoch": 1.8232044198895028, + "grad_norm": 0.5897248983383179, + "learning_rate": 9.530752106356209e-05, + "loss": 0.06809053421020508, + "mean_token_accuracy": 0.9733593761920929, + "num_tokens": 20339517.0, + "step": 990 + }, + { + "entropy": 0.8623859465122223, + "epoch": 1.8416206261510129, + "grad_norm": 0.7515265345573425, + "learning_rate": 9.517787631152298e-05, + "loss": 0.07257847785949707, + "mean_token_accuracy": 0.9714054942131043, + "num_tokens": 20545249.0, + "step": 1000 + }, + { + "entropy": 0.8669404804706573, + "epoch": 1.860036832412523, + "grad_norm": 0.7144560813903809, + "learning_rate": 9.504655560762596e-05, + "loss": 0.06832354068756104, + "mean_token_accuracy": 0.9735779523849487, + "num_tokens": 20750507.0, + "step": 1010 + }, + { + "entropy": 0.8493516445159912, + "epoch": 1.8784530386740332, + "grad_norm": 0.6559189558029175, + "learning_rate": 9.491356382344081e-05, + "loss": 0.0629766047000885, + "mean_token_accuracy": 0.9754977762699127, + "num_tokens": 20955956.0, + "step": 1020 + }, + { + "entropy": 0.8599376022815705, + "epoch": 1.8968692449355433, + "grad_norm": 0.6792973279953003, + "learning_rate": 9.477890589252895e-05, + "loss": 0.0666757881641388, + "mean_token_accuracy": 0.974083811044693, + "num_tokens": 21161163.0, + "step": 1030 + }, + { + "entropy": 0.8458438158035279, + "epoch": 1.9152854511970534, + "grad_norm": 0.6941778659820557, + "learning_rate": 9.464258681026042e-05, + "loss": 0.06307152509689332, + "mean_token_accuracy": 0.9757042229175568, + "num_tokens": 21366525.0, + "step": 1040 + }, + { + "entropy": 0.848515909910202, + "epoch": 1.9337016574585635, + "grad_norm": 0.7307806611061096, + "learning_rate": 9.450461163362855e-05, + "loss": 0.06307026147842407, + "mean_token_accuracy": 0.9750974595546722, + "num_tokens": 21572238.0, + "step": 1050 + }, + { + "entropy": 0.8563454031944275, + "epoch": 1.9521178637200736, + "grad_norm": 0.7222106456756592, + "learning_rate": 9.436498548106236e-05, + "loss": 0.0647726058959961, + "mean_token_accuracy": 0.974629694223404, + "num_tokens": 21777633.0, + "step": 1060 + }, + { + "entropy": 0.8656457483768463, + "epoch": 1.9705340699815839, + "grad_norm": 0.67178875207901, + "learning_rate": 9.422371353223674e-05, + "loss": 0.06573554277420043, + "mean_token_accuracy": 0.9745908617973328, + "num_tokens": 21983116.0, + "step": 1070 + }, + { + "entropy": 0.8630891263484954, + "epoch": 1.988950276243094, + "grad_norm": 0.6956593990325928, + "learning_rate": 9.408080102788016e-05, + "loss": 0.06630704402923585, + "mean_token_accuracy": 0.9741333484649658, + "num_tokens": 22188662.0, + "step": 1080 + }, + { + "epoch": 2.0, + "eval_entropy": 0.8560857042022373, + "eval_loss": 0.06494329869747162, + "eval_mean_token_accuracy": 0.9745692672936813, + "eval_num_tokens": 22311800.0, + "eval_runtime": 10.129, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 11.354, + "step": 1086 + }, + { + "entropy": 0.8616272270679474, + "epoch": 2.007366482504604, + "grad_norm": 0.7778105139732361, + "learning_rate": 9.393625326958041e-05, + "loss": 0.054407155513763426, + "mean_token_accuracy": 0.9792074799537659, + "num_tokens": 22394215.0, + "step": 1090 + }, + { + "entropy": 0.8496910452842712, + "epoch": 2.0257826887661143, + "grad_norm": 0.7422528266906738, + "learning_rate": 9.379007561958792e-05, + "loss": 0.051881587505340575, + "mean_token_accuracy": 0.9799090325832367, + "num_tokens": 22599599.0, + "step": 1100 + }, + { + "entropy": 0.8531602442264556, + "epoch": 2.044198895027624, + "grad_norm": 0.9075332880020142, + "learning_rate": 9.36422735006167e-05, + "loss": 0.05190724730491638, + "mean_token_accuracy": 0.979931116104126, + "num_tokens": 22805318.0, + "step": 1110 + }, + { + "entropy": 0.8657277703285218, + "epoch": 2.0626151012891345, + "grad_norm": 0.9466913938522339, + "learning_rate": 9.349285239564325e-05, + "loss": 0.053853434324264524, + "mean_token_accuracy": 0.9796103596687317, + "num_tokens": 23010438.0, + "step": 1120 + }, + { + "entropy": 0.8578485429286957, + "epoch": 2.0810313075506444, + "grad_norm": 0.6903054714202881, + "learning_rate": 9.334181784770326e-05, + "loss": 0.05228850841522217, + "mean_token_accuracy": 0.9802409887313843, + "num_tokens": 23215795.0, + "step": 1130 + }, + { + "entropy": 0.8450767934322357, + "epoch": 2.0994475138121547, + "grad_norm": 0.6615211367607117, + "learning_rate": 9.318917545968581e-05, + "loss": 0.050570905208587646, + "mean_token_accuracy": 0.9802053451538086, + "num_tokens": 23421157.0, + "step": 1140 + }, + { + "entropy": 0.8325044393539429, + "epoch": 2.117863720073665, + "grad_norm": 0.760960578918457, + "learning_rate": 9.303493089412564e-05, + "loss": 0.051966112852096555, + "mean_token_accuracy": 0.9796205997467041, + "num_tokens": 23626584.0, + "step": 1150 + }, + { + "entropy": 0.8416404843330383, + "epoch": 2.136279926335175, + "grad_norm": 0.6947009563446045, + "learning_rate": 9.287908987299306e-05, + "loss": 0.05144861936569214, + "mean_token_accuracy": 0.9800034642219544, + "num_tokens": 23832137.0, + "step": 1160 + }, + { + "entropy": 0.8564540028572083, + "epoch": 2.154696132596685, + "grad_norm": 0.733252763748169, + "learning_rate": 9.272165817748164e-05, + "loss": 0.04944799542427063, + "mean_token_accuracy": 0.9808157980442047, + "num_tokens": 24038006.0, + "step": 1170 + }, + { + "entropy": 0.8575525343418121, + "epoch": 2.1731123388581954, + "grad_norm": 0.8911028504371643, + "learning_rate": 9.25626416477938e-05, + "loss": 0.05037952661514282, + "mean_token_accuracy": 0.980946284532547, + "num_tokens": 24243374.0, + "step": 1180 + }, + { + "entropy": 0.8599720418453216, + "epoch": 2.1915285451197053, + "grad_norm": 0.7713524103164673, + "learning_rate": 9.240204618292416e-05, + "loss": 0.050603735446929934, + "mean_token_accuracy": 0.980896121263504, + "num_tokens": 24448585.0, + "step": 1190 + }, + { + "entropy": 0.8566664934158326, + "epoch": 2.2099447513812156, + "grad_norm": 0.8439353704452515, + "learning_rate": 9.223987774044066e-05, + "loss": 0.054171699285507205, + "mean_token_accuracy": 0.9796543836593627, + "num_tokens": 24653863.0, + "step": 1200 + }, + { + "entropy": 0.846601277589798, + "epoch": 2.2283609576427255, + "grad_norm": 0.7025637030601501, + "learning_rate": 9.207614233626356e-05, + "loss": 0.048924127221107484, + "mean_token_accuracy": 0.9809681415557862, + "num_tokens": 24859801.0, + "step": 1210 + }, + { + "entropy": 0.8564423739910125, + "epoch": 2.2467771639042358, + "grad_norm": 0.7788274884223938, + "learning_rate": 9.191084604444233e-05, + "loss": 0.05260283350944519, + "mean_token_accuracy": 0.9793797850608825, + "num_tokens": 25065368.0, + "step": 1220 + }, + { + "entropy": 0.865056723356247, + "epoch": 2.265193370165746, + "grad_norm": 0.8728818297386169, + "learning_rate": 9.174399499693027e-05, + "loss": 0.05016371011734009, + "mean_token_accuracy": 0.9807134211063385, + "num_tokens": 25270945.0, + "step": 1230 + }, + { + "entropy": 0.8642262935638427, + "epoch": 2.283609576427256, + "grad_norm": 1.0582489967346191, + "learning_rate": 9.157559538335703e-05, + "loss": 0.05316779017448425, + "mean_token_accuracy": 0.9794209063053131, + "num_tokens": 25476575.0, + "step": 1240 + }, + { + "entropy": 0.8677761554718018, + "epoch": 2.3020257826887662, + "grad_norm": 0.760109543800354, + "learning_rate": 9.140565345079901e-05, + "loss": 0.05115479230880737, + "mean_token_accuracy": 0.9802310705184937, + "num_tokens": 25682814.0, + "step": 1250 + }, + { + "entropy": 0.8592945456504821, + "epoch": 2.320441988950276, + "grad_norm": 0.6537907123565674, + "learning_rate": 9.123417550354761e-05, + "loss": 0.050543540716171266, + "mean_token_accuracy": 0.9806945025920868, + "num_tokens": 25887575.0, + "step": 1260 + }, + { + "entropy": 0.8692500293254852, + "epoch": 2.3388581952117864, + "grad_norm": 0.7771905064582825, + "learning_rate": 9.106116790287541e-05, + "loss": 0.049718713760375975, + "mean_token_accuracy": 0.9805168390274048, + "num_tokens": 26092950.0, + "step": 1270 + }, + { + "entropy": 0.8841261565685272, + "epoch": 2.3572744014732967, + "grad_norm": 0.7791076898574829, + "learning_rate": 9.08866370668001e-05, + "loss": 0.0527400553226471, + "mean_token_accuracy": 0.9796754539012908, + "num_tokens": 26298182.0, + "step": 1280 + }, + { + "entropy": 0.8675022900104523, + "epoch": 2.3756906077348066, + "grad_norm": 0.8481605648994446, + "learning_rate": 9.07105894698464e-05, + "loss": 0.05320838689804077, + "mean_token_accuracy": 0.9792274832725525, + "num_tokens": 26503425.0, + "step": 1290 + }, + { + "entropy": 0.8704026222229004, + "epoch": 2.394106813996317, + "grad_norm": 0.8235505819320679, + "learning_rate": 9.053303164280602e-05, + "loss": 0.055045205354690555, + "mean_token_accuracy": 0.9788750648498535, + "num_tokens": 26708755.0, + "step": 1300 + }, + { + "entropy": 0.8525134027004242, + "epoch": 2.4125230202578267, + "grad_norm": 0.7611598968505859, + "learning_rate": 9.035397017249518e-05, + "loss": 0.05029621124267578, + "mean_token_accuracy": 0.9802757322788238, + "num_tokens": 26914704.0, + "step": 1310 + }, + { + "entropy": 0.8630305290222168, + "epoch": 2.430939226519337, + "grad_norm": 0.790408194065094, + "learning_rate": 9.017341170151041e-05, + "loss": 0.04856040775775909, + "mean_token_accuracy": 0.9809690833091735, + "num_tokens": 27120151.0, + "step": 1320 + }, + { + "entropy": 0.8579159140586853, + "epoch": 2.4493554327808473, + "grad_norm": 0.781972348690033, + "learning_rate": 8.999136292798207e-05, + "loss": 0.04869682788848877, + "mean_token_accuracy": 0.9816130697727203, + "num_tokens": 27325673.0, + "step": 1330 + }, + { + "entropy": 0.8634716987609863, + "epoch": 2.467771639042357, + "grad_norm": 0.8500784039497375, + "learning_rate": 8.980783060532588e-05, + "loss": 0.05050289034843445, + "mean_token_accuracy": 0.980079609155655, + "num_tokens": 27531270.0, + "step": 1340 + }, + { + "entropy": 0.8660618126392364, + "epoch": 2.4861878453038675, + "grad_norm": 0.719760537147522, + "learning_rate": 8.96228215419924e-05, + "loss": 0.04892141819000244, + "mean_token_accuracy": 0.9814020991325378, + "num_tokens": 27736542.0, + "step": 1350 + }, + { + "entropy": 0.8572284400463104, + "epoch": 2.5046040515653774, + "grad_norm": 1.0197229385375977, + "learning_rate": 8.943634260121442e-05, + "loss": 0.05104702711105347, + "mean_token_accuracy": 0.9798846662044525, + "num_tokens": 27941566.0, + "step": 1360 + }, + { + "entropy": 0.8702241241931915, + "epoch": 2.5230202578268877, + "grad_norm": 0.7136003375053406, + "learning_rate": 8.924840070075247e-05, + "loss": 0.04855787754058838, + "mean_token_accuracy": 0.9811685383319855, + "num_tokens": 28146943.0, + "step": 1370 + }, + { + "entropy": 0.874957013130188, + "epoch": 2.541436464088398, + "grad_norm": 0.8775497674942017, + "learning_rate": 8.905900281263804e-05, + "loss": 0.052434295415878296, + "mean_token_accuracy": 0.9795438170433044, + "num_tokens": 28352640.0, + "step": 1380 + }, + { + "entropy": 0.8776536166667939, + "epoch": 2.559852670349908, + "grad_norm": 0.8895741105079651, + "learning_rate": 8.8868155962915e-05, + "loss": 0.05282890796661377, + "mean_token_accuracy": 0.9790538609027862, + "num_tokens": 28558153.0, + "step": 1390 + }, + { + "entropy": 0.8738743245601654, + "epoch": 2.578268876611418, + "grad_norm": 0.788800060749054, + "learning_rate": 8.867586723137906e-05, + "loss": 0.048841872811317445, + "mean_token_accuracy": 0.9809149026870727, + "num_tokens": 28763613.0, + "step": 1400 + }, + { + "entropy": 0.8750253796577454, + "epoch": 2.596685082872928, + "grad_norm": 0.8738002777099609, + "learning_rate": 8.848214375131497e-05, + "loss": 0.048261132836341855, + "mean_token_accuracy": 0.980789190530777, + "num_tokens": 28969248.0, + "step": 1410 + }, + { + "entropy": 0.8624245524406433, + "epoch": 2.6151012891344383, + "grad_norm": 0.6404895186424255, + "learning_rate": 8.828699270923196e-05, + "loss": 0.04970468282699585, + "mean_token_accuracy": 0.9807762265205383, + "num_tokens": 29174779.0, + "step": 1420 + }, + { + "entropy": 0.8792938470840455, + "epoch": 2.6335174953959486, + "grad_norm": 0.7856965661048889, + "learning_rate": 8.80904213445972e-05, + "loss": 0.053334391117095946, + "mean_token_accuracy": 0.9790222108364105, + "num_tokens": 29380474.0, + "step": 1430 + }, + { + "entropy": 0.8831034600734711, + "epoch": 2.6519337016574585, + "grad_norm": 0.7739618420600891, + "learning_rate": 8.789243694956716e-05, + "loss": 0.04959054589271546, + "mean_token_accuracy": 0.9803965091705322, + "num_tokens": 29585985.0, + "step": 1440 + }, + { + "entropy": 0.8934672951698304, + "epoch": 2.6703499079189688, + "grad_norm": 0.6999697089195251, + "learning_rate": 8.769304686871719e-05, + "loss": 0.05165250301361084, + "mean_token_accuracy": 0.9798884153366089, + "num_tokens": 29791238.0, + "step": 1450 + }, + { + "entropy": 0.9053199410438537, + "epoch": 2.6887661141804786, + "grad_norm": 0.9199564456939697, + "learning_rate": 8.749225849876892e-05, + "loss": 0.04924143850803375, + "mean_token_accuracy": 0.9810785710811615, + "num_tokens": 29996589.0, + "step": 1460 + }, + { + "entropy": 0.888091403245926, + "epoch": 2.707182320441989, + "grad_norm": 0.7480106353759766, + "learning_rate": 8.729007928831597e-05, + "loss": 0.04948916733264923, + "mean_token_accuracy": 0.9809579730033875, + "num_tokens": 30201875.0, + "step": 1470 + }, + { + "entropy": 0.8723407983779907, + "epoch": 2.7255985267034992, + "grad_norm": 0.9506945013999939, + "learning_rate": 8.708651673754763e-05, + "loss": 0.048927539587020875, + "mean_token_accuracy": 0.980553150177002, + "num_tokens": 30407550.0, + "step": 1480 + }, + { + "entropy": 0.8737521529197693, + "epoch": 2.744014732965009, + "grad_norm": 0.8015706539154053, + "learning_rate": 8.688157839797062e-05, + "loss": 0.04963063597679138, + "mean_token_accuracy": 0.9809738755226135, + "num_tokens": 30612839.0, + "step": 1490 + }, + { + "entropy": 0.8800762951374054, + "epoch": 2.7624309392265194, + "grad_norm": 0.9429986476898193, + "learning_rate": 8.667527187212885e-05, + "loss": 0.0524174690246582, + "mean_token_accuracy": 0.9788767337799072, + "num_tokens": 30818578.0, + "step": 1500 + }, + { + "entropy": 0.8871055901050567, + "epoch": 2.7808471454880292, + "grad_norm": 0.5909196138381958, + "learning_rate": 8.646760481332157e-05, + "loss": 0.05166680812835693, + "mean_token_accuracy": 0.980216771364212, + "num_tokens": 31023829.0, + "step": 1510 + }, + { + "entropy": 0.8908755779266357, + "epoch": 2.7992633517495396, + "grad_norm": 0.9154611229896545, + "learning_rate": 8.625858492531931e-05, + "loss": 0.04951836466789246, + "mean_token_accuracy": 0.9801484227180481, + "num_tokens": 31229635.0, + "step": 1520 + }, + { + "entropy": 0.92480548620224, + "epoch": 2.81767955801105, + "grad_norm": 0.5989938378334045, + "learning_rate": 8.604821996207819e-05, + "loss": 0.04799881279468536, + "mean_token_accuracy": 0.9817522585391998, + "num_tokens": 31435456.0, + "step": 1530 + }, + { + "entropy": 0.9173881888389588, + "epoch": 2.8360957642725597, + "grad_norm": 0.899413526058197, + "learning_rate": 8.58365177274522e-05, + "loss": 0.0487445592880249, + "mean_token_accuracy": 0.9812625288963318, + "num_tokens": 31640904.0, + "step": 1540 + }, + { + "entropy": 0.9076135993003845, + "epoch": 2.85451197053407, + "grad_norm": 0.8494166135787964, + "learning_rate": 8.562348607490376e-05, + "loss": 0.05005228519439697, + "mean_token_accuracy": 0.9806681036949157, + "num_tokens": 31845807.0, + "step": 1550 + }, + { + "entropy": 0.9092245221138, + "epoch": 2.87292817679558, + "grad_norm": 0.8225123286247253, + "learning_rate": 8.540913290721234e-05, + "loss": 0.048654764890670776, + "mean_token_accuracy": 0.9805659353733063, + "num_tokens": 32051523.0, + "step": 1560 + }, + { + "entropy": 0.9062779664993286, + "epoch": 2.89134438305709, + "grad_norm": 0.7074014544487, + "learning_rate": 8.519346617618134e-05, + "loss": 0.049209845066070554, + "mean_token_accuracy": 0.9807434439659118, + "num_tokens": 32256895.0, + "step": 1570 + }, + { + "entropy": 0.9190246641635895, + "epoch": 2.9097605893186005, + "grad_norm": 0.8860642910003662, + "learning_rate": 8.497649388234304e-05, + "loss": 0.051211881637573245, + "mean_token_accuracy": 0.9802342295646668, + "num_tokens": 32462031.0, + "step": 1580 + }, + { + "entropy": 0.9088015079498291, + "epoch": 2.9281767955801103, + "grad_norm": 0.8062726855278015, + "learning_rate": 8.475822407466188e-05, + "loss": 0.053512704372406, + "mean_token_accuracy": 0.979486483335495, + "num_tokens": 32667533.0, + "step": 1590 + }, + { + "entropy": 0.9462027847766876, + "epoch": 2.9465930018416207, + "grad_norm": 0.7962909936904907, + "learning_rate": 8.453866485023579e-05, + "loss": 0.0501457154750824, + "mean_token_accuracy": 0.9803222417831421, + "num_tokens": 32872900.0, + "step": 1600 + }, + { + "entropy": 0.9671471297740937, + "epoch": 2.9650092081031305, + "grad_norm": 0.7641744017601013, + "learning_rate": 8.431782435399587e-05, + "loss": 0.04629061222076416, + "mean_token_accuracy": 0.9823175370693207, + "num_tokens": 33077850.0, + "step": 1610 + }, + { + "entropy": 0.955865204334259, + "epoch": 2.983425414364641, + "grad_norm": 0.6772348880767822, + "learning_rate": 8.409571077840426e-05, + "loss": 0.048368623852729796, + "mean_token_accuracy": 0.9808700799942016, + "num_tokens": 33283117.0, + "step": 1620 + }, + { + "epoch": 3.0, + "eval_entropy": 0.9563225186389426, + "eval_loss": 0.059064481407403946, + "eval_mean_token_accuracy": 0.9773589429648026, + "eval_num_tokens": 33467712.0, + "eval_runtime": 10.1471, + "eval_samples_per_second": 360.499, + "eval_steps_per_second": 11.333, + "step": 1629 + }, + { + "entropy": 0.9337226033210755, + "epoch": 3.001841620626151, + "grad_norm": 0.646203875541687, + "learning_rate": 8.387233236315016e-05, + "loss": 0.043352216482162476, + "mean_token_accuracy": 0.9830620110034942, + "num_tokens": 33488302.0, + "step": 1630 + }, + { + "entropy": 0.9734923839569092, + "epoch": 3.020257826887661, + "grad_norm": 0.7564226984977722, + "learning_rate": 8.364769739484416e-05, + "loss": 0.033932483196258544, + "mean_token_accuracy": 0.9872806966304779, + "num_tokens": 33693531.0, + "step": 1640 + }, + { + "entropy": 0.9669206500053406, + "epoch": 3.0386740331491713, + "grad_norm": 0.7126886248588562, + "learning_rate": 8.342181420671096e-05, + "loss": 0.03818287253379822, + "mean_token_accuracy": 0.9852082908153534, + "num_tokens": 33899305.0, + "step": 1650 + }, + { + "entropy": 0.9522916138172149, + "epoch": 3.0570902394106816, + "grad_norm": 1.0571653842926025, + "learning_rate": 8.319469117828007e-05, + "loss": 0.03456039130687714, + "mean_token_accuracy": 0.9867027878761292, + "num_tokens": 34104585.0, + "step": 1660 + }, + { + "entropy": 0.9568560004234314, + "epoch": 3.0755064456721914, + "grad_norm": 0.780940592288971, + "learning_rate": 8.296633673507505e-05, + "loss": 0.03551802039146423, + "mean_token_accuracy": 0.9867531359195709, + "num_tokens": 34309516.0, + "step": 1670 + }, + { + "entropy": 0.9590656876564025, + "epoch": 3.0939226519337018, + "grad_norm": 0.8330219388008118, + "learning_rate": 8.273675934830094e-05, + "loss": 0.03674865961074829, + "mean_token_accuracy": 0.9864118576049805, + "num_tokens": 34515170.0, + "step": 1680 + }, + { + "entropy": 0.975881814956665, + "epoch": 3.1123388581952116, + "grad_norm": 0.7010637521743774, + "learning_rate": 8.250596753453e-05, + "loss": 0.03550414443016052, + "mean_token_accuracy": 0.9864102602005005, + "num_tokens": 34720896.0, + "step": 1690 + }, + { + "entropy": 0.9599562883377075, + "epoch": 3.130755064456722, + "grad_norm": 0.6694278717041016, + "learning_rate": 8.227396985538578e-05, + "loss": 0.035564273595809937, + "mean_token_accuracy": 0.9867321848869324, + "num_tokens": 34925970.0, + "step": 1700 + }, + { + "entropy": 0.9582216143608093, + "epoch": 3.149171270718232, + "grad_norm": 0.9333199262619019, + "learning_rate": 8.204077491722546e-05, + "loss": 0.035575729608535764, + "mean_token_accuracy": 0.9862452208995819, + "num_tokens": 35131543.0, + "step": 1710 + }, + { + "entropy": 0.9579678058624268, + "epoch": 3.167587476979742, + "grad_norm": 0.9450218081474304, + "learning_rate": 8.180639137082066e-05, + "loss": 0.0385298490524292, + "mean_token_accuracy": 0.98538036942482, + "num_tokens": 35336790.0, + "step": 1720 + }, + { + "entropy": 0.9640831351280212, + "epoch": 3.1860036832412524, + "grad_norm": 0.8551534414291382, + "learning_rate": 8.157082791103649e-05, + "loss": 0.03702138364315033, + "mean_token_accuracy": 0.9852015495300293, + "num_tokens": 35542294.0, + "step": 1730 + }, + { + "entropy": 0.9867071211338043, + "epoch": 3.2044198895027622, + "grad_norm": 0.7138128876686096, + "learning_rate": 8.133409327650897e-05, + "loss": 0.035626694560050964, + "mean_token_accuracy": 0.986064875125885, + "num_tokens": 35747447.0, + "step": 1740 + }, + { + "entropy": 0.9639089345932007, + "epoch": 3.2228360957642725, + "grad_norm": 0.7131415009498596, + "learning_rate": 8.109619624932092e-05, + "loss": 0.035885071754455565, + "mean_token_accuracy": 0.986273056268692, + "num_tokens": 35952258.0, + "step": 1750 + }, + { + "entropy": 0.9516046345233917, + "epoch": 3.241252302025783, + "grad_norm": 0.6900200843811035, + "learning_rate": 8.085714565467611e-05, + "loss": 0.03535219430923462, + "mean_token_accuracy": 0.985836285352707, + "num_tokens": 36157938.0, + "step": 1760 + }, + { + "entropy": 0.9373646557331086, + "epoch": 3.2596685082872927, + "grad_norm": 0.6101690530776978, + "learning_rate": 8.061695036057191e-05, + "loss": 0.034940996766090394, + "mean_token_accuracy": 0.9863743901252746, + "num_tokens": 36363825.0, + "step": 1770 + }, + { + "entropy": 0.9444344758987426, + "epoch": 3.278084714548803, + "grad_norm": 0.7518529295921326, + "learning_rate": 8.03756192774703e-05, + "loss": 0.03404279053211212, + "mean_token_accuracy": 0.9866396844387054, + "num_tokens": 36568961.0, + "step": 1780 + }, + { + "entropy": 0.9550357758998871, + "epoch": 3.2965009208103133, + "grad_norm": 0.7687555551528931, + "learning_rate": 8.013316135796734e-05, + "loss": 0.038447052240371704, + "mean_token_accuracy": 0.985325163602829, + "num_tokens": 36774514.0, + "step": 1790 + }, + { + "entropy": 0.9477231681346894, + "epoch": 3.314917127071823, + "grad_norm": 0.7521633505821228, + "learning_rate": 7.988958559646102e-05, + "loss": 0.03746694028377533, + "mean_token_accuracy": 0.9853165090084076, + "num_tokens": 36979660.0, + "step": 1800 + }, + { + "entropy": 0.925805002450943, + "epoch": 3.3333333333333335, + "grad_norm": 0.9333297610282898, + "learning_rate": 7.964490102881768e-05, + "loss": 0.03700103759765625, + "mean_token_accuracy": 0.9850880861282348, + "num_tokens": 37185191.0, + "step": 1810 + }, + { + "entropy": 0.9225482225418091, + "epoch": 3.3517495395948433, + "grad_norm": 0.7928622961044312, + "learning_rate": 7.939911673203665e-05, + "loss": 0.03825801610946655, + "mean_token_accuracy": 0.9850241422653199, + "num_tokens": 37390749.0, + "step": 1820 + }, + { + "entropy": 0.9597147881984711, + "epoch": 3.3701657458563536, + "grad_norm": 0.7658583521842957, + "learning_rate": 7.915224182391375e-05, + "loss": 0.039855146408081056, + "mean_token_accuracy": 0.9845879554748536, + "num_tokens": 37596052.0, + "step": 1830 + }, + { + "entropy": 0.9485619068145752, + "epoch": 3.388581952117864, + "grad_norm": 0.8492130637168884, + "learning_rate": 7.890428546270278e-05, + "loss": 0.039359599351882935, + "mean_token_accuracy": 0.9847265422344208, + "num_tokens": 37802063.0, + "step": 1840 + }, + { + "entropy": 0.9670301914215088, + "epoch": 3.406998158379374, + "grad_norm": 0.7527599930763245, + "learning_rate": 7.865525684677608e-05, + "loss": 0.03752985596656799, + "mean_token_accuracy": 0.9855137526988983, + "num_tokens": 38007432.0, + "step": 1850 + }, + { + "entropy": 0.9681244969367981, + "epoch": 3.425414364640884, + "grad_norm": 0.7599612474441528, + "learning_rate": 7.840516521428303e-05, + "loss": 0.03653894364833832, + "mean_token_accuracy": 0.9858933389186859, + "num_tokens": 38212923.0, + "step": 1860 + }, + { + "entropy": 0.9706049561500549, + "epoch": 3.443830570902394, + "grad_norm": 0.7678127884864807, + "learning_rate": 7.815401984280748e-05, + "loss": 0.0366938978433609, + "mean_token_accuracy": 0.9854713797569274, + "num_tokens": 38418422.0, + "step": 1870 + }, + { + "entropy": 0.9637093842029572, + "epoch": 3.4622467771639043, + "grad_norm": 0.762824535369873, + "learning_rate": 7.790183004902359e-05, + "loss": 0.03516915142536163, + "mean_token_accuracy": 0.9866003453731537, + "num_tokens": 38624389.0, + "step": 1880 + }, + { + "entropy": 0.9373565018177032, + "epoch": 3.4806629834254146, + "grad_norm": 0.8221780061721802, + "learning_rate": 7.764860518835014e-05, + "loss": 0.04049026966094971, + "mean_token_accuracy": 0.984089481830597, + "num_tokens": 38829654.0, + "step": 1890 + }, + { + "entropy": 0.9356025457382202, + "epoch": 3.4990791896869244, + "grad_norm": 0.7583426237106323, + "learning_rate": 7.739435465460356e-05, + "loss": 0.03658481240272522, + "mean_token_accuracy": 0.9857318818569183, + "num_tokens": 39034638.0, + "step": 1900 + }, + { + "entropy": 0.9740163326263428, + "epoch": 3.5174953959484347, + "grad_norm": 0.7332878112792969, + "learning_rate": 7.713908787964937e-05, + "loss": 0.03508963882923126, + "mean_token_accuracy": 0.9863419532775879, + "num_tokens": 39240265.0, + "step": 1910 + }, + { + "entropy": 0.9528286933898926, + "epoch": 3.5359116022099446, + "grad_norm": 0.6515451669692993, + "learning_rate": 7.688281433305233e-05, + "loss": 0.036055779457092284, + "mean_token_accuracy": 0.9860979080200195, + "num_tokens": 39445546.0, + "step": 1920 + }, + { + "entropy": 0.9480705261230469, + "epoch": 3.554327808471455, + "grad_norm": 0.7725827097892761, + "learning_rate": 7.662554352172515e-05, + "loss": 0.037101513147354125, + "mean_token_accuracy": 0.985782790184021, + "num_tokens": 39651078.0, + "step": 1930 + }, + { + "entropy": 0.9655321061611175, + "epoch": 3.572744014732965, + "grad_norm": 0.7756506204605103, + "learning_rate": 7.636728498957581e-05, + "loss": 0.03721855878829956, + "mean_token_accuracy": 0.9857951939105988, + "num_tokens": 39856542.0, + "step": 1940 + }, + { + "entropy": 0.9772682309150695, + "epoch": 3.591160220994475, + "grad_norm": 0.9084987640380859, + "learning_rate": 7.610804831715355e-05, + "loss": 0.03570749163627625, + "mean_token_accuracy": 0.9863450109958649, + "num_tokens": 40061913.0, + "step": 1950 + }, + { + "entropy": 0.9579685389995575, + "epoch": 3.6095764272559854, + "grad_norm": 0.6358487606048584, + "learning_rate": 7.584784312129334e-05, + "loss": 0.038210684061050416, + "mean_token_accuracy": 0.9850837290287018, + "num_tokens": 40267398.0, + "step": 1960 + }, + { + "entropy": 0.9605201721191406, + "epoch": 3.6279926335174952, + "grad_norm": 0.6263149976730347, + "learning_rate": 7.558667905475927e-05, + "loss": 0.03509160876274109, + "mean_token_accuracy": 0.9868143379688263, + "num_tokens": 40472827.0, + "step": 1970 + }, + { + "entropy": 0.964026153087616, + "epoch": 3.6464088397790055, + "grad_norm": 0.90068119764328, + "learning_rate": 7.532456580588638e-05, + "loss": 0.036211782693862916, + "mean_token_accuracy": 0.9858468770980835, + "num_tokens": 40677935.0, + "step": 1980 + }, + { + "entropy": 0.9494135618209839, + "epoch": 3.664825046040516, + "grad_norm": 0.760134756565094, + "learning_rate": 7.50615130982213e-05, + "loss": 0.03786201477050781, + "mean_token_accuracy": 0.9852500438690186, + "num_tokens": 40883750.0, + "step": 1990 + }, + { + "entropy": 0.9527071297168732, + "epoch": 3.6832412523020257, + "grad_norm": 0.9812107682228088, + "learning_rate": 7.479753069016152e-05, + "loss": 0.03803159594535828, + "mean_token_accuracy": 0.9852405369281769, + "num_tokens": 41089115.0, + "step": 2000 + }, + { + "entropy": 0.9639330863952636, + "epoch": 3.701657458563536, + "grad_norm": 0.7164933681488037, + "learning_rate": 7.453262837459332e-05, + "loss": 0.03912568986415863, + "mean_token_accuracy": 0.9849458575248718, + "num_tokens": 41294694.0, + "step": 2010 + }, + { + "entropy": 0.9536987483501435, + "epoch": 3.720073664825046, + "grad_norm": 0.6804596185684204, + "learning_rate": 7.426681597852863e-05, + "loss": 0.036410006880760196, + "mean_token_accuracy": 0.985712206363678, + "num_tokens": 41499817.0, + "step": 2020 + }, + { + "entropy": 0.9478164672851562, + "epoch": 3.738489871086556, + "grad_norm": 0.8799397349357605, + "learning_rate": 7.400010336274037e-05, + "loss": 0.03801035583019256, + "mean_token_accuracy": 0.9850274682044983, + "num_tokens": 41704932.0, + "step": 2030 + }, + { + "entropy": 0.9383447647094727, + "epoch": 3.7569060773480665, + "grad_norm": 0.8386216163635254, + "learning_rate": 7.373250042139664e-05, + "loss": 0.0373637855052948, + "mean_token_accuracy": 0.9854822158813477, + "num_tokens": 41910804.0, + "step": 2040 + }, + { + "entropy": 0.925172996520996, + "epoch": 3.7753222836095763, + "grad_norm": 0.7599324584007263, + "learning_rate": 7.346401708169377e-05, + "loss": 0.03585260808467865, + "mean_token_accuracy": 0.9860672950744629, + "num_tokens": 42116706.0, + "step": 2050 + }, + { + "entropy": 0.9463765442371368, + "epoch": 3.7937384898710866, + "grad_norm": 0.9030149579048157, + "learning_rate": 7.319466330348797e-05, + "loss": 0.035877206921577455, + "mean_token_accuracy": 0.9863968968391419, + "num_tokens": 42322670.0, + "step": 2060 + }, + { + "entropy": 0.9942441761493683, + "epoch": 3.8121546961325965, + "grad_norm": 0.6400449275970459, + "learning_rate": 7.292444907892587e-05, + "loss": 0.037310433387756345, + "mean_token_accuracy": 0.9854151606559753, + "num_tokens": 42527752.0, + "step": 2070 + }, + { + "entropy": 0.9577703952789307, + "epoch": 3.830570902394107, + "grad_norm": 0.6193167567253113, + "learning_rate": 7.265338443207387e-05, + "loss": 0.03648848831653595, + "mean_token_accuracy": 0.9856530070304871, + "num_tokens": 42732981.0, + "step": 2080 + }, + { + "entropy": 0.9663952767848969, + "epoch": 3.848987108655617, + "grad_norm": 0.759611189365387, + "learning_rate": 7.238147941854625e-05, + "loss": 0.036112996935844424, + "mean_token_accuracy": 0.9862765550613404, + "num_tokens": 42938619.0, + "step": 2090 + }, + { + "entropy": 0.9484863519668579, + "epoch": 3.867403314917127, + "grad_norm": 0.7420705556869507, + "learning_rate": 7.210874412513218e-05, + "loss": 0.03703283965587616, + "mean_token_accuracy": 0.9857317566871643, + "num_tokens": 43143753.0, + "step": 2100 + }, + { + "entropy": 0.964326673746109, + "epoch": 3.8858195211786373, + "grad_norm": 0.8779639601707458, + "learning_rate": 7.183518866942147e-05, + "loss": 0.03739701807498932, + "mean_token_accuracy": 0.9852154791355133, + "num_tokens": 43349451.0, + "step": 2110 + }, + { + "entropy": 0.9729791641235351, + "epoch": 3.904235727440147, + "grad_norm": 0.7582741379737854, + "learning_rate": 7.156082319942929e-05, + "loss": 0.03894525766372681, + "mean_token_accuracy": 0.9847454309463501, + "num_tokens": 43554598.0, + "step": 2120 + }, + { + "entropy": 0.9860592544078827, + "epoch": 3.9226519337016574, + "grad_norm": 0.860698938369751, + "learning_rate": 7.128565789321969e-05, + "loss": 0.0365300178527832, + "mean_token_accuracy": 0.9859121859073638, + "num_tokens": 43760081.0, + "step": 2130 + }, + { + "entropy": 0.9916551172733307, + "epoch": 3.9410681399631677, + "grad_norm": 0.8363776206970215, + "learning_rate": 7.100970295852805e-05, + "loss": 0.036221379041671754, + "mean_token_accuracy": 0.9859034180641174, + "num_tokens": 43965432.0, + "step": 2140 + }, + { + "entropy": 0.9553558886051178, + "epoch": 3.9594843462246776, + "grad_norm": 0.9627474546432495, + "learning_rate": 7.073296863238242e-05, + "loss": 0.03684481382369995, + "mean_token_accuracy": 0.9857315957546234, + "num_tokens": 44171232.0, + "step": 2150 + }, + { + "entropy": 0.9538035809993743, + "epoch": 3.977900552486188, + "grad_norm": 0.8399474620819092, + "learning_rate": 7.045546518072366e-05, + "loss": 0.03825397789478302, + "mean_token_accuracy": 0.9846831560134888, + "num_tokens": 44376723.0, + "step": 2160 + }, + { + "entropy": 0.9476235210895538, + "epoch": 3.9963167587476978, + "grad_norm": 0.708739697933197, + "learning_rate": 7.017720289802472e-05, + "loss": 0.03618018329143524, + "mean_token_accuracy": 0.9861325800418854, + "num_tokens": 44582407.0, + "step": 2170 + }, + { + "epoch": 4.0, + "eval_entropy": 0.9569619194321011, + "eval_loss": 0.059838198125362396, + "eval_mean_token_accuracy": 0.9777795366618944, + "eval_num_tokens": 44623647.0, + "eval_runtime": 10.0379, + "eval_samples_per_second": 364.42, + "eval_steps_per_second": 11.457, + "step": 2172 + }, + { + "entropy": 0.9558675646781921, + "epoch": 4.014732965009208, + "grad_norm": 0.7347508668899536, + "learning_rate": 6.989819210690872e-05, + "loss": 0.02886659502983093, + "mean_token_accuracy": 0.9892994821071625, + "num_tokens": 44788219.0, + "step": 2180 + }, + { + "entropy": 1.0037677466869355, + "epoch": 4.033149171270718, + "grad_norm": 0.7403206825256348, + "learning_rate": 6.961844315776596e-05, + "loss": 0.02395295798778534, + "mean_token_accuracy": 0.9906026899814606, + "num_tokens": 44993505.0, + "step": 2190 + }, + { + "entropy": 1.0068290829658508, + "epoch": 4.051565377532229, + "grad_norm": 0.7979726195335388, + "learning_rate": 6.933796642837003e-05, + "loss": 0.02605988085269928, + "mean_token_accuracy": 0.9899706900119781, + "num_tokens": 45199193.0, + "step": 2200 + }, + { + "entropy": 0.9942211747169495, + "epoch": 4.069981583793738, + "grad_norm": 0.6460402011871338, + "learning_rate": 6.905677232349278e-05, + "loss": 0.025350230932235717, + "mean_token_accuracy": 0.9899386286735534, + "num_tokens": 45404030.0, + "step": 2210 + }, + { + "entropy": 0.9783595442771912, + "epoch": 4.088397790055248, + "grad_norm": 0.8177055716514587, + "learning_rate": 6.877487127451834e-05, + "loss": 0.02696993052959442, + "mean_token_accuracy": 0.9896106541156768, + "num_tokens": 45609763.0, + "step": 2220 + }, + { + "entropy": 0.9801763832569123, + "epoch": 4.106813996316759, + "grad_norm": 0.6608165502548218, + "learning_rate": 6.849227373905618e-05, + "loss": 0.025101393461227417, + "mean_token_accuracy": 0.9904372334480286, + "num_tokens": 45814941.0, + "step": 2230 + }, + { + "entropy": 0.9695689737796783, + "epoch": 4.125230202578269, + "grad_norm": 0.8036547899246216, + "learning_rate": 6.820899020055314e-05, + "loss": 0.027827343344688414, + "mean_token_accuracy": 0.9890337705612182, + "num_tokens": 46020535.0, + "step": 2240 + }, + { + "entropy": 0.9828635334968567, + "epoch": 4.143646408839779, + "grad_norm": 0.7729921936988831, + "learning_rate": 6.792503116790455e-05, + "loss": 0.02779492735862732, + "mean_token_accuracy": 0.9894372522830963, + "num_tokens": 46226013.0, + "step": 2250 + }, + { + "entropy": 0.9978842556476593, + "epoch": 4.162062615101289, + "grad_norm": 0.7334664463996887, + "learning_rate": 6.764040717506432e-05, + "loss": 0.025673511624336242, + "mean_token_accuracy": 0.9899355113506317, + "num_tokens": 46432087.0, + "step": 2260 + }, + { + "entropy": 1.0116403937339782, + "epoch": 4.180478821362799, + "grad_norm": 0.6769368052482605, + "learning_rate": 6.735512878065427e-05, + "loss": 0.024705511331558228, + "mean_token_accuracy": 0.9906128525733948, + "num_tokens": 46637478.0, + "step": 2270 + }, + { + "entropy": 0.9985016226768494, + "epoch": 4.198895027624309, + "grad_norm": 0.8301573991775513, + "learning_rate": 6.706920656757234e-05, + "loss": 0.02455987185239792, + "mean_token_accuracy": 0.9905728340148926, + "num_tokens": 46842562.0, + "step": 2280 + }, + { + "entropy": 0.9909430682659149, + "epoch": 4.21731123388582, + "grad_norm": 0.656026303768158, + "learning_rate": 6.67826511426001e-05, + "loss": 0.022711564600467683, + "mean_token_accuracy": 0.9910893619060517, + "num_tokens": 47048071.0, + "step": 2290 + }, + { + "entropy": 0.9868666052818298, + "epoch": 4.23572744014733, + "grad_norm": 0.7614991068840027, + "learning_rate": 6.649547313600916e-05, + "loss": 0.02453812211751938, + "mean_token_accuracy": 0.9908901154994965, + "num_tokens": 47253507.0, + "step": 2300 + }, + { + "entropy": 0.9870487153530121, + "epoch": 4.25414364640884, + "grad_norm": 0.7617276906967163, + "learning_rate": 6.62076832011669e-05, + "loss": 0.025818097591400146, + "mean_token_accuracy": 0.990347957611084, + "num_tokens": 47458747.0, + "step": 2310 + }, + { + "entropy": 0.9691080570220947, + "epoch": 4.27255985267035, + "grad_norm": 0.6743029952049255, + "learning_rate": 6.591929201414124e-05, + "loss": 0.02456912100315094, + "mean_token_accuracy": 0.9905289709568024, + "num_tokens": 47663643.0, + "step": 2320 + }, + { + "entropy": 0.9701108932495117, + "epoch": 4.29097605893186, + "grad_norm": 0.6964483261108398, + "learning_rate": 6.56303102733046e-05, + "loss": 0.02575681209564209, + "mean_token_accuracy": 0.9898503363132477, + "num_tokens": 47868982.0, + "step": 2330 + }, + { + "entropy": 0.969528192281723, + "epoch": 4.30939226519337, + "grad_norm": 0.7521987557411194, + "learning_rate": 6.5340748698937e-05, + "loss": 0.02678089737892151, + "mean_token_accuracy": 0.9898572087287902, + "num_tokens": 48074314.0, + "step": 2340 + }, + { + "entropy": 0.9921871721744537, + "epoch": 4.327808471454881, + "grad_norm": 0.6944513320922852, + "learning_rate": 6.505061803282844e-05, + "loss": 0.025553321838378905, + "mean_token_accuracy": 0.9907529592514038, + "num_tokens": 48279731.0, + "step": 2350 + }, + { + "entropy": 0.9768964886665344, + "epoch": 4.346224677716391, + "grad_norm": 0.6553092002868652, + "learning_rate": 6.47599290378803e-05, + "loss": 0.0250235915184021, + "mean_token_accuracy": 0.9904054701328278, + "num_tokens": 48485401.0, + "step": 2360 + }, + { + "entropy": 0.9612838506698609, + "epoch": 4.3646408839779, + "grad_norm": 0.916820228099823, + "learning_rate": 6.446869249770619e-05, + "loss": 0.028156182169914244, + "mean_token_accuracy": 0.9888657331466675, + "num_tokens": 48691047.0, + "step": 2370 + }, + { + "entropy": 0.9665832936763763, + "epoch": 4.383057090239411, + "grad_norm": 0.9197776913642883, + "learning_rate": 6.417691921623185e-05, + "loss": 0.025303921103477477, + "mean_token_accuracy": 0.989986252784729, + "num_tokens": 48896234.0, + "step": 2380 + }, + { + "entropy": 0.9686589121818543, + "epoch": 4.401473296500921, + "grad_norm": 0.8505764603614807, + "learning_rate": 6.388462001729434e-05, + "loss": 0.024816396832466125, + "mean_token_accuracy": 0.9909265041351318, + "num_tokens": 49101893.0, + "step": 2390 + }, + { + "entropy": 0.9625210344791413, + "epoch": 4.419889502762431, + "grad_norm": 1.0601766109466553, + "learning_rate": 6.359180574424062e-05, + "loss": 0.02706078290939331, + "mean_token_accuracy": 0.9895522117614746, + "num_tokens": 49307467.0, + "step": 2400 + }, + { + "entropy": 0.9679551541805267, + "epoch": 4.4383057090239415, + "grad_norm": 0.776253879070282, + "learning_rate": 6.329848725952514e-05, + "loss": 0.02693203091621399, + "mean_token_accuracy": 0.9893981635570526, + "num_tokens": 49513020.0, + "step": 2410 + }, + { + "entropy": 0.9704959928989411, + "epoch": 4.456721915285451, + "grad_norm": 0.5459668636322021, + "learning_rate": 6.3004675444307e-05, + "loss": 0.0279473751783371, + "mean_token_accuracy": 0.9894329369068146, + "num_tokens": 49718405.0, + "step": 2420 + }, + { + "entropy": 0.961863350868225, + "epoch": 4.475138121546961, + "grad_norm": 0.9338833093643188, + "learning_rate": 6.27103811980462e-05, + "loss": 0.026478803157806395, + "mean_token_accuracy": 0.9902269721031189, + "num_tokens": 49923375.0, + "step": 2430 + }, + { + "entropy": 0.9708506822586059, + "epoch": 4.4935543278084715, + "grad_norm": 0.9073707461357117, + "learning_rate": 6.241561543809947e-05, + "loss": 0.025289520621299744, + "mean_token_accuracy": 0.9904769957065582, + "num_tokens": 50128901.0, + "step": 2440 + }, + { + "entropy": 0.984996622800827, + "epoch": 4.511970534069982, + "grad_norm": 0.8674206733703613, + "learning_rate": 6.212038909931503e-05, + "loss": 0.026442551612854005, + "mean_token_accuracy": 0.9905101835727692, + "num_tokens": 50334449.0, + "step": 2450 + }, + { + "entropy": 0.9926377475261688, + "epoch": 4.530386740331492, + "grad_norm": 0.7571811079978943, + "learning_rate": 6.182471313362717e-05, + "loss": 0.026819539070129395, + "mean_token_accuracy": 0.9898989200592041, + "num_tokens": 50539597.0, + "step": 2460 + }, + { + "entropy": 0.9450563549995422, + "epoch": 4.5488029465930016, + "grad_norm": 0.6651087403297424, + "learning_rate": 6.15285985096498e-05, + "loss": 0.02665227949619293, + "mean_token_accuracy": 0.9897156655788422, + "num_tokens": 50744926.0, + "step": 2470 + }, + { + "entropy": 0.9715635657310486, + "epoch": 4.567219152854512, + "grad_norm": 0.7445545196533203, + "learning_rate": 6.12320562122697e-05, + "loss": 0.026212453842163086, + "mean_token_accuracy": 0.9904700636863708, + "num_tokens": 50950152.0, + "step": 2480 + }, + { + "entropy": 0.9613442063331604, + "epoch": 4.585635359116022, + "grad_norm": 0.7168459296226501, + "learning_rate": 6.0935097242238837e-05, + "loss": 0.02508128583431244, + "mean_token_accuracy": 0.9901923894882202, + "num_tokens": 51155430.0, + "step": 2490 + }, + { + "entropy": 0.9571944534778595, + "epoch": 4.6040515653775325, + "grad_norm": 0.7590732574462891, + "learning_rate": 6.063773261576646e-05, + "loss": 0.025445500016212465, + "mean_token_accuracy": 0.9902949810028077, + "num_tokens": 51360826.0, + "step": 2500 + }, + { + "entropy": 0.947079461812973, + "epoch": 4.622467771639043, + "grad_norm": 0.6942175030708313, + "learning_rate": 6.033997336411035e-05, + "loss": 0.026132801175117494, + "mean_token_accuracy": 0.9900939345359803, + "num_tokens": 51566095.0, + "step": 2510 + }, + { + "entropy": 0.970003741979599, + "epoch": 4.640883977900552, + "grad_norm": 0.6562672257423401, + "learning_rate": 6.00418305331675e-05, + "loss": 0.024759869277477264, + "mean_token_accuracy": 0.9905019223690033, + "num_tokens": 51771177.0, + "step": 2520 + }, + { + "entropy": 0.9715348601341247, + "epoch": 4.6593001841620625, + "grad_norm": 0.6151639819145203, + "learning_rate": 5.9743315183064564e-05, + "loss": 0.024138522148132325, + "mean_token_accuracy": 0.9910101473331452, + "num_tokens": 51976349.0, + "step": 2530 + }, + { + "entropy": 0.9552160143852234, + "epoch": 4.677716390423573, + "grad_norm": 0.968815267086029, + "learning_rate": 5.9444438387747336e-05, + "loss": 0.027274739742279053, + "mean_token_accuracy": 0.9896075248718261, + "num_tokens": 52181820.0, + "step": 2540 + }, + { + "entropy": 0.9265012145042419, + "epoch": 4.696132596685083, + "grad_norm": 0.8966720700263977, + "learning_rate": 5.914521123457015e-05, + "loss": 0.0291823148727417, + "mean_token_accuracy": 0.9886700630187988, + "num_tokens": 52387511.0, + "step": 2550 + }, + { + "entropy": 0.9156096875667572, + "epoch": 4.714548802946593, + "grad_norm": 0.7747519612312317, + "learning_rate": 5.88456448238844e-05, + "loss": 0.02809179127216339, + "mean_token_accuracy": 0.9891100466251374, + "num_tokens": 52592737.0, + "step": 2560 + }, + { + "entropy": 0.924511456489563, + "epoch": 4.732965009208103, + "grad_norm": 1.0087049007415771, + "learning_rate": 5.8545750268626844e-05, + "loss": 0.02683232128620148, + "mean_token_accuracy": 0.9896528899669648, + "num_tokens": 52798814.0, + "step": 2570 + }, + { + "entropy": 0.9662951111793519, + "epoch": 4.751381215469613, + "grad_norm": 0.7709590792655945, + "learning_rate": 5.824553869390734e-05, + "loss": 0.02503817081451416, + "mean_token_accuracy": 0.9900161385536194, + "num_tokens": 53004478.0, + "step": 2580 + }, + { + "entropy": 0.9889141619205475, + "epoch": 4.769797421731123, + "grad_norm": 0.815858006477356, + "learning_rate": 5.794502123659613e-05, + "loss": 0.026327347755432128, + "mean_token_accuracy": 0.9900785744190216, + "num_tokens": 53209888.0, + "step": 2590 + }, + { + "entropy": 0.9785685896873474, + "epoch": 4.788213627992634, + "grad_norm": 0.6514431238174438, + "learning_rate": 5.7644209044910735e-05, + "loss": 0.025033789873123168, + "mean_token_accuracy": 0.9902650475502014, + "num_tokens": 53415533.0, + "step": 2600 + }, + { + "entropy": 0.9723869919776916, + "epoch": 4.806629834254144, + "grad_norm": 0.8778963685035706, + "learning_rate": 5.7343113278002284e-05, + "loss": 0.02379843294620514, + "mean_token_accuracy": 0.9909472465515137, + "num_tokens": 53620850.0, + "step": 2610 + }, + { + "entropy": 0.9572711050510406, + "epoch": 4.8250460405156534, + "grad_norm": 0.8927134871482849, + "learning_rate": 5.70417451055417e-05, + "loss": 0.024856947362422943, + "mean_token_accuracy": 0.9904125213623047, + "num_tokens": 53826259.0, + "step": 2620 + }, + { + "entropy": 0.9523135125637054, + "epoch": 4.843462246777164, + "grad_norm": 0.6832691431045532, + "learning_rate": 5.674011570730523e-05, + "loss": 0.025352203845977785, + "mean_token_accuracy": 0.990432596206665, + "num_tokens": 54031531.0, + "step": 2630 + }, + { + "entropy": 0.9735220730304718, + "epoch": 4.861878453038674, + "grad_norm": 0.6399164795875549, + "learning_rate": 5.643823627275972e-05, + "loss": 0.026541513204574586, + "mean_token_accuracy": 0.9900369107723236, + "num_tokens": 54237155.0, + "step": 2640 + }, + { + "entropy": 0.9566517114639282, + "epoch": 4.880294659300184, + "grad_norm": 0.8725414276123047, + "learning_rate": 5.6136118000647616e-05, + "loss": 0.02675778865814209, + "mean_token_accuracy": 0.9894899427890778, + "num_tokens": 54442739.0, + "step": 2650 + }, + { + "entropy": 0.9447909593582153, + "epoch": 4.898710865561695, + "grad_norm": 0.8169302344322205, + "learning_rate": 5.583377209857138e-05, + "loss": 0.02642086148262024, + "mean_token_accuracy": 0.989885401725769, + "num_tokens": 54648098.0, + "step": 2660 + }, + { + "entropy": 0.9180052697658538, + "epoch": 4.917127071823204, + "grad_norm": 0.7768753170967102, + "learning_rate": 5.553120978257787e-05, + "loss": 0.02552323341369629, + "mean_token_accuracy": 0.9899512350559234, + "num_tokens": 54854281.0, + "step": 2670 + }, + { + "entropy": 0.917166668176651, + "epoch": 4.935543278084714, + "grad_norm": 0.8241410851478577, + "learning_rate": 5.5228442276742153e-05, + "loss": 0.02788199484348297, + "mean_token_accuracy": 0.989625746011734, + "num_tokens": 55059495.0, + "step": 2680 + }, + { + "entropy": 0.9345465302467346, + "epoch": 4.953959484346225, + "grad_norm": 0.7645496129989624, + "learning_rate": 5.4925480812751166e-05, + "loss": 0.02517639398574829, + "mean_token_accuracy": 0.9902283847332001, + "num_tokens": 55265381.0, + "step": 2690 + }, + { + "entropy": 0.9386432528495788, + "epoch": 4.972375690607735, + "grad_norm": 0.8371859192848206, + "learning_rate": 5.46223366294871e-05, + "loss": 0.025585666298866272, + "mean_token_accuracy": 0.9903791427612305, + "num_tokens": 55471210.0, + "step": 2700 + }, + { + "entropy": 0.9267561137676239, + "epoch": 4.990791896869245, + "grad_norm": 0.6789297461509705, + "learning_rate": 5.43190209726104e-05, + "loss": 0.024646708369255067, + "mean_token_accuracy": 0.9904700815677643, + "num_tokens": 55676877.0, + "step": 2710 + }, + { + "epoch": 5.0, + "eval_entropy": 0.9283919717954553, + "eval_loss": 0.06225527077913284, + "eval_mean_token_accuracy": 0.9784110421719758, + "eval_num_tokens": 55779559.0, + "eval_runtime": 10.0613, + "eval_samples_per_second": 363.573, + "eval_steps_per_second": 11.43, + "step": 2715 + }, + { + "entropy": 0.9269404351711273, + "epoch": 5.009208103130755, + "grad_norm": 0.540570080280304, + "learning_rate": 5.401554509414264e-05, + "loss": 0.019513805210590363, + "mean_token_accuracy": 0.9927033007144928, + "num_tokens": 55882241.0, + "step": 2720 + }, + { + "entropy": 0.9377441763877868, + "epoch": 5.027624309392265, + "grad_norm": 0.5840998888015747, + "learning_rate": 5.3711920252049085e-05, + "loss": 0.015180909633636474, + "mean_token_accuracy": 0.9944471418857574, + "num_tokens": 56087470.0, + "step": 2730 + }, + { + "entropy": 0.949122017621994, + "epoch": 5.046040515653775, + "grad_norm": 0.6938672065734863, + "learning_rate": 5.340815770982106e-05, + "loss": 0.0153742715716362, + "mean_token_accuracy": 0.9941534519195556, + "num_tokens": 56292226.0, + "step": 2740 + }, + { + "entropy": 0.9394402146339417, + "epoch": 5.064456721915286, + "grad_norm": 0.8259939551353455, + "learning_rate": 5.310426873605814e-05, + "loss": 0.014350908994674682, + "mean_token_accuracy": 0.9945570707321167, + "num_tokens": 56497839.0, + "step": 2750 + }, + { + "entropy": 0.9323545396327972, + "epoch": 5.082872928176796, + "grad_norm": 0.9675024747848511, + "learning_rate": 5.280026460405005e-05, + "loss": 0.016550135612487794, + "mean_token_accuracy": 0.9938908398151398, + "num_tokens": 56702932.0, + "step": 2760 + }, + { + "entropy": 0.89125554561615, + "epoch": 5.101289134438305, + "grad_norm": 0.8347184658050537, + "learning_rate": 5.2496156591358566e-05, + "loss": 0.017917826771736145, + "mean_token_accuracy": 0.9934309899806977, + "num_tokens": 56908644.0, + "step": 2770 + }, + { + "entropy": 0.8773505449295044, + "epoch": 5.119705340699816, + "grad_norm": 0.8869524598121643, + "learning_rate": 5.219195597939908e-05, + "loss": 0.017221055924892426, + "mean_token_accuracy": 0.993448656797409, + "num_tokens": 57114171.0, + "step": 2780 + }, + { + "entropy": 0.8874686002731323, + "epoch": 5.138121546961326, + "grad_norm": 1.0294251441955566, + "learning_rate": 5.1887674053022084e-05, + "loss": 0.018111808598041533, + "mean_token_accuracy": 0.9931293666362763, + "num_tokens": 57319158.0, + "step": 2790 + }, + { + "entropy": 0.8893351197242737, + "epoch": 5.156537753222836, + "grad_norm": 0.6253597736358643, + "learning_rate": 5.15833221000946e-05, + "loss": 0.017256538569927215, + "mean_token_accuracy": 0.9936724424362182, + "num_tokens": 57524901.0, + "step": 2800 + }, + { + "entropy": 0.9157109141349793, + "epoch": 5.1749539594843466, + "grad_norm": 0.6379142999649048, + "learning_rate": 5.12789114110814e-05, + "loss": 0.016415870189666747, + "mean_token_accuracy": 0.9939744889736175, + "num_tokens": 57730135.0, + "step": 2810 + }, + { + "entropy": 0.9157932877540589, + "epoch": 5.193370165745856, + "grad_norm": 0.7195688486099243, + "learning_rate": 5.097445327862619e-05, + "loss": 0.01577536463737488, + "mean_token_accuracy": 0.9941773355007172, + "num_tokens": 57936210.0, + "step": 2820 + }, + { + "entropy": 0.9179767727851867, + "epoch": 5.211786372007366, + "grad_norm": 0.7149335741996765, + "learning_rate": 5.066995899713264e-05, + "loss": 0.01606254279613495, + "mean_token_accuracy": 0.9937664806842804, + "num_tokens": 58141736.0, + "step": 2830 + }, + { + "entropy": 0.895512479543686, + "epoch": 5.230202578268877, + "grad_norm": 0.6460169553756714, + "learning_rate": 5.036543986234543e-05, + "loss": 0.01605578660964966, + "mean_token_accuracy": 0.994063013792038, + "num_tokens": 58347178.0, + "step": 2840 + }, + { + "entropy": 0.8883109211921691, + "epoch": 5.248618784530387, + "grad_norm": 0.72477787733078, + "learning_rate": 5.006090717093128e-05, + "loss": 0.016773784160614015, + "mean_token_accuracy": 0.9940340936183929, + "num_tokens": 58552952.0, + "step": 2850 + }, + { + "entropy": 0.8942575633525849, + "epoch": 5.267034990791897, + "grad_norm": 0.7344926595687866, + "learning_rate": 4.9756372220059736e-05, + "loss": 0.01604126989841461, + "mean_token_accuracy": 0.994256991147995, + "num_tokens": 58758449.0, + "step": 2860 + }, + { + "entropy": 0.8854099690914154, + "epoch": 5.285451197053407, + "grad_norm": 0.6142122149467468, + "learning_rate": 4.9451846306984214e-05, + "loss": 0.016244474053382873, + "mean_token_accuracy": 0.9938375532627106, + "num_tokens": 58963691.0, + "step": 2870 + }, + { + "entropy": 0.8745675146579742, + "epoch": 5.303867403314917, + "grad_norm": 0.8025366067886353, + "learning_rate": 4.9147340728622816e-05, + "loss": 0.01611460596323013, + "mean_token_accuracy": 0.9941173672676087, + "num_tokens": 59169484.0, + "step": 2880 + }, + { + "entropy": 0.8812389194965362, + "epoch": 5.322283609576427, + "grad_norm": 0.7699193358421326, + "learning_rate": 4.884286678113935e-05, + "loss": 0.016995206475257874, + "mean_token_accuracy": 0.9937523245811463, + "num_tokens": 59374627.0, + "step": 2890 + }, + { + "entropy": 0.8924362242221833, + "epoch": 5.3406998158379375, + "grad_norm": 0.7516226172447205, + "learning_rate": 4.853843575952414e-05, + "loss": 0.01652217358350754, + "mean_token_accuracy": 0.9936819314956665, + "num_tokens": 59580135.0, + "step": 2900 + }, + { + "entropy": 0.8972602427005768, + "epoch": 5.359116022099448, + "grad_norm": 0.7781681418418884, + "learning_rate": 4.823405895717521e-05, + "loss": 0.017360319197177888, + "mean_token_accuracy": 0.9935634732246399, + "num_tokens": 59785392.0, + "step": 2910 + }, + { + "entropy": 0.900998342037201, + "epoch": 5.377532228360957, + "grad_norm": 0.6837047934532166, + "learning_rate": 4.792974766547911e-05, + "loss": 0.017162233591079712, + "mean_token_accuracy": 0.993264091014862, + "num_tokens": 59991448.0, + "step": 2920 + }, + { + "entropy": 0.9239763855934143, + "epoch": 5.3959484346224675, + "grad_norm": 0.7188259363174438, + "learning_rate": 4.762551317339226e-05, + "loss": 0.01718595027923584, + "mean_token_accuracy": 0.9933857440948486, + "num_tokens": 60197079.0, + "step": 2930 + }, + { + "entropy": 0.9056789398193359, + "epoch": 5.414364640883978, + "grad_norm": 0.6922260522842407, + "learning_rate": 4.732136676702198e-05, + "loss": 0.016596907377243043, + "mean_token_accuracy": 0.9937462329864502, + "num_tokens": 60402299.0, + "step": 2940 + }, + { + "entropy": 0.9038522362709045, + "epoch": 5.432780847145488, + "grad_norm": 0.7937009930610657, + "learning_rate": 4.7017319729207954e-05, + "loss": 0.016130413115024566, + "mean_token_accuracy": 0.9941940546035767, + "num_tokens": 60607907.0, + "step": 2950 + }, + { + "entropy": 0.8911147236824035, + "epoch": 5.4511970534069984, + "grad_norm": 0.6261171698570251, + "learning_rate": 4.671338333910359e-05, + "loss": 0.01622493863105774, + "mean_token_accuracy": 0.9937945663928985, + "num_tokens": 60813428.0, + "step": 2960 + }, + { + "entropy": 0.8894322276115417, + "epoch": 5.469613259668508, + "grad_norm": 0.6898378729820251, + "learning_rate": 4.6409568871757645e-05, + "loss": 0.016513559222221374, + "mean_token_accuracy": 0.9936174690723419, + "num_tokens": 61018404.0, + "step": 2970 + }, + { + "entropy": 0.9026601016521454, + "epoch": 5.488029465930018, + "grad_norm": 0.7027897834777832, + "learning_rate": 4.610588759769593e-05, + "loss": 0.016727012395858765, + "mean_token_accuracy": 0.9941417872905731, + "num_tokens": 61223660.0, + "step": 2980 + }, + { + "entropy": 0.8903301954269409, + "epoch": 5.5064456721915285, + "grad_norm": 0.9087063074111938, + "learning_rate": 4.5802350782503196e-05, + "loss": 0.016929233074188234, + "mean_token_accuracy": 0.9935264468193055, + "num_tokens": 61429438.0, + "step": 2990 + }, + { + "entropy": 0.8886692762374878, + "epoch": 5.524861878453039, + "grad_norm": 0.8283822536468506, + "learning_rate": 4.5498969686405266e-05, + "loss": 0.015396638214588166, + "mean_token_accuracy": 0.99433131814003, + "num_tokens": 61635274.0, + "step": 3000 + }, + { + "entropy": 0.8902086555957794, + "epoch": 5.543278084714549, + "grad_norm": 0.7676647305488586, + "learning_rate": 4.5195755563851336e-05, + "loss": 0.01673731654882431, + "mean_token_accuracy": 0.9938134133815766, + "num_tokens": 61840778.0, + "step": 3010 + }, + { + "entropy": 0.8941606819629669, + "epoch": 5.5616942909760585, + "grad_norm": 0.7026392221450806, + "learning_rate": 4.489271966309634e-05, + "loss": 0.01694796681404114, + "mean_token_accuracy": 0.9936233103275299, + "num_tokens": 62046355.0, + "step": 3020 + }, + { + "entropy": 0.90918750166893, + "epoch": 5.580110497237569, + "grad_norm": 0.7146924734115601, + "learning_rate": 4.4589873225783806e-05, + "loss": 0.01852080672979355, + "mean_token_accuracy": 0.9928994178771973, + "num_tokens": 62251709.0, + "step": 3030 + }, + { + "entropy": 0.8946544349193573, + "epoch": 5.598526703499079, + "grad_norm": 0.607246458530426, + "learning_rate": 4.428722748652881e-05, + "loss": 0.016636812686920167, + "mean_token_accuracy": 0.9939334273338318, + "num_tokens": 62456680.0, + "step": 3040 + }, + { + "entropy": 0.8854653835296631, + "epoch": 5.616942909760589, + "grad_norm": 0.7457882165908813, + "learning_rate": 4.3984793672501124e-05, + "loss": 0.016008296608924867, + "mean_token_accuracy": 0.9940589666366577, + "num_tokens": 62662038.0, + "step": 3050 + }, + { + "entropy": 0.8921085000038147, + "epoch": 5.6353591160221, + "grad_norm": 0.7707350254058838, + "learning_rate": 4.368258300300888e-05, + "loss": 0.016655120253562927, + "mean_token_accuracy": 0.993935889005661, + "num_tokens": 62867272.0, + "step": 3060 + }, + { + "entropy": 0.8768653869628906, + "epoch": 5.653775322283609, + "grad_norm": 0.6994554996490479, + "learning_rate": 4.3380606689082166e-05, + "loss": 0.015841150283813478, + "mean_token_accuracy": 0.9944550096988678, + "num_tokens": 63072403.0, + "step": 3070 + }, + { + "entropy": 0.8767679035663605, + "epoch": 5.672191528545119, + "grad_norm": 0.8327192068099976, + "learning_rate": 4.307887593305733e-05, + "loss": 0.015826576948165895, + "mean_token_accuracy": 0.9941202461719513, + "num_tokens": 63277635.0, + "step": 3080 + }, + { + "entropy": 0.8763292253017425, + "epoch": 5.69060773480663, + "grad_norm": 0.7224747538566589, + "learning_rate": 4.277740192816127e-05, + "loss": 0.015298140048980714, + "mean_token_accuracy": 0.9943080008029938, + "num_tokens": 63483196.0, + "step": 3090 + }, + { + "entropy": 0.8834661841392517, + "epoch": 5.70902394106814, + "grad_norm": 0.9508277773857117, + "learning_rate": 4.247619585809627e-05, + "loss": 0.01658404469490051, + "mean_token_accuracy": 0.9934300124645233, + "num_tokens": 63688721.0, + "step": 3100 + }, + { + "entropy": 0.899389523267746, + "epoch": 5.72744014732965, + "grad_norm": 0.7170981168746948, + "learning_rate": 4.217526889662512e-05, + "loss": 0.015803813934326172, + "mean_token_accuracy": 0.9940325975418091, + "num_tokens": 63894220.0, + "step": 3110 + }, + { + "entropy": 0.8968011736869812, + "epoch": 5.74585635359116, + "grad_norm": 0.6686251163482666, + "learning_rate": 4.187463220715659e-05, + "loss": 0.015874400734901428, + "mean_token_accuracy": 0.9940970957279205, + "num_tokens": 64099768.0, + "step": 3120 + }, + { + "entropy": 0.8900792479515076, + "epoch": 5.76427255985267, + "grad_norm": 0.5979828238487244, + "learning_rate": 4.157429694233128e-05, + "loss": 0.01613767147064209, + "mean_token_accuracy": 0.9942961037158966, + "num_tokens": 64305055.0, + "step": 3130 + }, + { + "entropy": 0.8899810135364532, + "epoch": 5.78268876611418, + "grad_norm": 0.7330048084259033, + "learning_rate": 4.127427424360794e-05, + "loss": 0.016168563067913054, + "mean_token_accuracy": 0.9941077649593353, + "num_tokens": 64510002.0, + "step": 3140 + }, + { + "entropy": 0.8805335581302642, + "epoch": 5.801104972375691, + "grad_norm": 0.5978623032569885, + "learning_rate": 4.09745752408501e-05, + "loss": 0.01524556577205658, + "mean_token_accuracy": 0.994326776266098, + "num_tokens": 64715431.0, + "step": 3150 + }, + { + "entropy": 0.878781646490097, + "epoch": 5.819521178637201, + "grad_norm": 0.6749313473701477, + "learning_rate": 4.067521105191331e-05, + "loss": 0.015209287405014038, + "mean_token_accuracy": 0.9942974805831909, + "num_tokens": 64921579.0, + "step": 3160 + }, + { + "entropy": 0.8844729900360108, + "epoch": 5.83793738489871, + "grad_norm": 0.6887196898460388, + "learning_rate": 4.037619278223255e-05, + "loss": 0.01619938760995865, + "mean_token_accuracy": 0.9937683045864105, + "num_tokens": 65127007.0, + "step": 3170 + }, + { + "entropy": 0.8780498623847961, + "epoch": 5.856353591160221, + "grad_norm": 0.6962174773216248, + "learning_rate": 4.0077531524410304e-05, + "loss": 0.015934592485427855, + "mean_token_accuracy": 0.9935103774070739, + "num_tokens": 65332418.0, + "step": 3180 + }, + { + "entropy": 0.8889612555503845, + "epoch": 5.874769797421731, + "grad_norm": 0.6049854159355164, + "learning_rate": 3.977923835780517e-05, + "loss": 0.01600206792354584, + "mean_token_accuracy": 0.9937360048294067, + "num_tokens": 65537845.0, + "step": 3190 + }, + { + "entropy": 0.8960810244083405, + "epoch": 5.893186003683241, + "grad_norm": 0.6341013312339783, + "learning_rate": 3.948132434812065e-05, + "loss": 0.0143389493227005, + "mean_token_accuracy": 0.9948007702827454, + "num_tokens": 65743412.0, + "step": 3200 + }, + { + "entropy": 0.887304550409317, + "epoch": 5.911602209944752, + "grad_norm": 0.7564852237701416, + "learning_rate": 3.9183800546994886e-05, + "loss": 0.016044440865516662, + "mean_token_accuracy": 0.9939335525035858, + "num_tokens": 65948884.0, + "step": 3210 + }, + { + "entropy": 0.8823239862918854, + "epoch": 5.930018416206261, + "grad_norm": 0.6525556445121765, + "learning_rate": 3.8886677991590435e-05, + "loss": 0.016112390160560607, + "mean_token_accuracy": 0.9938134670257568, + "num_tokens": 66153768.0, + "step": 3220 + }, + { + "entropy": 0.8712829887866974, + "epoch": 5.948434622467771, + "grad_norm": 0.676167368888855, + "learning_rate": 3.858996770418504e-05, + "loss": 0.015146306157112122, + "mean_token_accuracy": 0.9944733619689942, + "num_tokens": 66359661.0, + "step": 3230 + }, + { + "entropy": 0.8734102070331573, + "epoch": 5.966850828729282, + "grad_norm": 0.6284340023994446, + "learning_rate": 3.829368069176257e-05, + "loss": 0.017269474267959595, + "mean_token_accuracy": 0.9938443183898926, + "num_tokens": 66565228.0, + "step": 3240 + }, + { + "entropy": 0.8757335782051087, + "epoch": 5.985267034990792, + "grad_norm": 0.722522497177124, + "learning_rate": 3.799782794560484e-05, + "loss": 0.015032704174518585, + "mean_token_accuracy": 0.9942249894142151, + "num_tokens": 66770718.0, + "step": 3250 + }, + { + "epoch": 6.0, + "eval_entropy": 0.8780099873957427, + "eval_loss": 0.06740746647119522, + "eval_mean_token_accuracy": 0.9795082377350849, + "eval_num_tokens": 66935435.0, + "eval_runtime": 10.0955, + "eval_samples_per_second": 362.34, + "eval_steps_per_second": 11.391, + "step": 3258 + }, + { + "entropy": 0.8773481965065002, + "epoch": 6.003683241252302, + "grad_norm": 0.49184396862983704, + "learning_rate": 3.770242044088375e-05, + "loss": 0.013721099495887757, + "mean_token_accuracy": 0.9951768457889557, + "num_tokens": 66976478.0, + "step": 3260 + }, + { + "entropy": 0.8643155217170715, + "epoch": 6.0220994475138125, + "grad_norm": 0.4903622567653656, + "learning_rate": 3.7407469136254234e-05, + "loss": 0.009165047109127045, + "mean_token_accuracy": 0.9969388306140899, + "num_tokens": 67182251.0, + "step": 3270 + }, + { + "entropy": 0.8560326337814331, + "epoch": 6.040515653775322, + "grad_norm": 0.6360073685646057, + "learning_rate": 3.711298497344766e-05, + "loss": 0.010200753808021545, + "mean_token_accuracy": 0.9964211463928223, + "num_tokens": 67387493.0, + "step": 3280 + }, + { + "entropy": 0.8391405165195465, + "epoch": 6.058931860036832, + "grad_norm": 0.519554853439331, + "learning_rate": 3.6818978876865984e-05, + "loss": 0.008906974643468856, + "mean_token_accuracy": 0.9966452360153198, + "num_tokens": 67593314.0, + "step": 3290 + }, + { + "entropy": 0.8300552070140839, + "epoch": 6.077348066298343, + "grad_norm": 0.5294632911682129, + "learning_rate": 3.6525461753176426e-05, + "loss": 0.008088209480047227, + "mean_token_accuracy": 0.9971098065376282, + "num_tokens": 67798634.0, + "step": 3300 + }, + { + "entropy": 0.822588461637497, + "epoch": 6.095764272559853, + "grad_norm": 0.46423637866973877, + "learning_rate": 3.623244449090697e-05, + "loss": 0.008058926463127137, + "mean_token_accuracy": 0.9970856845378876, + "num_tokens": 68003683.0, + "step": 3310 + }, + { + "entropy": 0.8130167067050934, + "epoch": 6.114180478821363, + "grad_norm": 0.497258722782135, + "learning_rate": 3.5939937960042314e-05, + "loss": 0.008712668716907502, + "mean_token_accuracy": 0.996820193529129, + "num_tokens": 68208795.0, + "step": 3320 + }, + { + "entropy": 0.8035802125930787, + "epoch": 6.132596685082873, + "grad_norm": 0.43152952194213867, + "learning_rate": 3.5647953011620716e-05, + "loss": 0.008366625010967254, + "mean_token_accuracy": 0.9969616234302521, + "num_tokens": 68414414.0, + "step": 3330 + }, + { + "entropy": 0.8028200149536133, + "epoch": 6.151012891344383, + "grad_norm": 0.6057612299919128, + "learning_rate": 3.535650047733141e-05, + "loss": 0.00967741459608078, + "mean_token_accuracy": 0.9963694036006927, + "num_tokens": 68619806.0, + "step": 3340 + }, + { + "entropy": 0.7982640087604522, + "epoch": 6.169429097605893, + "grad_norm": 0.4966030716896057, + "learning_rate": 3.5065591169112785e-05, + "loss": 0.00934397652745247, + "mean_token_accuracy": 0.9969527781009674, + "num_tokens": 68825254.0, + "step": 3350 + }, + { + "entropy": 0.7859498977661132, + "epoch": 6.1878453038674035, + "grad_norm": 0.6962474584579468, + "learning_rate": 3.477523587875139e-05, + "loss": 0.010414297878742217, + "mean_token_accuracy": 0.996203750371933, + "num_tokens": 69031043.0, + "step": 3360 + }, + { + "entropy": 0.7869667530059814, + "epoch": 6.206261510128914, + "grad_norm": 0.6078894734382629, + "learning_rate": 3.448544537748143e-05, + "loss": 0.008547455072402954, + "mean_token_accuracy": 0.9968406975269317, + "num_tokens": 69236559.0, + "step": 3370 + }, + { + "entropy": 0.8036401033401489, + "epoch": 6.224677716390423, + "grad_norm": 0.5990306735038757, + "learning_rate": 3.4196230415585337e-05, + "loss": 0.00924447700381279, + "mean_token_accuracy": 0.9967190623283386, + "num_tokens": 69441764.0, + "step": 3380 + }, + { + "entropy": 0.8085561394691467, + "epoch": 6.2430939226519335, + "grad_norm": 0.46029484272003174, + "learning_rate": 3.390760172199486e-05, + "loss": 0.008379801362752914, + "mean_token_accuracy": 0.9970395743846894, + "num_tokens": 69647075.0, + "step": 3390 + }, + { + "entropy": 0.8132422208786011, + "epoch": 6.261510128913444, + "grad_norm": 0.5699496865272522, + "learning_rate": 3.361957000389315e-05, + "loss": 0.009426499903202056, + "mean_token_accuracy": 0.996586662530899, + "num_tokens": 69852809.0, + "step": 3400 + }, + { + "entropy": 0.8065890491008758, + "epoch": 6.279926335174954, + "grad_norm": 0.6212234497070312, + "learning_rate": 3.33321459463175e-05, + "loss": 0.009811153262853622, + "mean_token_accuracy": 0.9965905249118805, + "num_tokens": 70057955.0, + "step": 3410 + }, + { + "entropy": 0.7904254853725433, + "epoch": 6.298342541436464, + "grad_norm": 0.8000790476799011, + "learning_rate": 3.304534021176299e-05, + "loss": 0.00957801640033722, + "mean_token_accuracy": 0.9964518308639526, + "num_tokens": 70263517.0, + "step": 3420 + }, + { + "entropy": 0.7971100151538849, + "epoch": 6.316758747697974, + "grad_norm": 0.6359512209892273, + "learning_rate": 3.275916343978689e-05, + "loss": 0.009681916236877442, + "mean_token_accuracy": 0.9967545390129089, + "num_tokens": 70468400.0, + "step": 3430 + }, + { + "entropy": 0.7987187504768372, + "epoch": 6.335174953959484, + "grad_norm": 0.5094901919364929, + "learning_rate": 3.247362624661406e-05, + "loss": 0.009966370463371278, + "mean_token_accuracy": 0.9964035987854004, + "num_tokens": 70673648.0, + "step": 3440 + }, + { + "entropy": 0.7850228011608124, + "epoch": 6.3535911602209945, + "grad_norm": 0.5554385185241699, + "learning_rate": 3.218873922474303e-05, + "loss": 0.009521079063415528, + "mean_token_accuracy": 0.9966651916503906, + "num_tokens": 70879452.0, + "step": 3450 + }, + { + "entropy": 0.7885844230651855, + "epoch": 6.372007366482505, + "grad_norm": 0.5217951536178589, + "learning_rate": 3.190451294255314e-05, + "loss": 0.00949474424123764, + "mean_token_accuracy": 0.9966598808765411, + "num_tokens": 71085217.0, + "step": 3460 + }, + { + "entropy": 0.797072297334671, + "epoch": 6.390423572744015, + "grad_norm": 0.5385560393333435, + "learning_rate": 3.162095794391241e-05, + "loss": 0.009810312837362289, + "mean_token_accuracy": 0.9965846955776214, + "num_tokens": 71290955.0, + "step": 3470 + }, + { + "entropy": 0.8024774849414825, + "epoch": 6.4088397790055245, + "grad_norm": 0.5419294238090515, + "learning_rate": 3.1338084747786456e-05, + "loss": 0.009127366542816161, + "mean_token_accuracy": 0.9968222141265869, + "num_tokens": 71496654.0, + "step": 3480 + }, + { + "entropy": 0.8082470417022705, + "epoch": 6.427255985267035, + "grad_norm": 0.7315362095832825, + "learning_rate": 3.105590384784821e-05, + "loss": 0.008642691373825073, + "mean_token_accuracy": 0.9970867097377777, + "num_tokens": 71701725.0, + "step": 3490 + }, + { + "entropy": 0.804630172252655, + "epoch": 6.445672191528545, + "grad_norm": 0.6668549180030823, + "learning_rate": 3.0774425712088676e-05, + "loss": 0.008679335564374923, + "mean_token_accuracy": 0.9969714701175689, + "num_tokens": 71907003.0, + "step": 3500 + }, + { + "entropy": 0.7939219176769257, + "epoch": 6.464088397790055, + "grad_norm": 0.8388434648513794, + "learning_rate": 3.049366078242864e-05, + "loss": 0.009249264001846313, + "mean_token_accuracy": 0.99674671292305, + "num_tokens": 72112532.0, + "step": 3510 + }, + { + "entropy": 0.78477823138237, + "epoch": 6.482504604051566, + "grad_norm": 0.4963231682777405, + "learning_rate": 3.021361947433125e-05, + "loss": 0.009192919731140137, + "mean_token_accuracy": 0.9965968191623688, + "num_tokens": 72318518.0, + "step": 3520 + }, + { + "entropy": 0.7841647148132325, + "epoch": 6.500920810313076, + "grad_norm": 0.5681823492050171, + "learning_rate": 2.9934312176415636e-05, + "loss": 0.008821797370910645, + "mean_token_accuracy": 0.9968703150749206, + "num_tokens": 72524548.0, + "step": 3530 + }, + { + "entropy": 0.7848304688930512, + "epoch": 6.519337016574585, + "grad_norm": 0.7126080393791199, + "learning_rate": 2.965574925007154e-05, + "loss": 0.009742744266986847, + "mean_token_accuracy": 0.9964317202568054, + "num_tokens": 72729659.0, + "step": 3540 + }, + { + "entropy": 0.7899512410163879, + "epoch": 6.537753222836096, + "grad_norm": 0.7015056014060974, + "learning_rate": 2.9377941029074986e-05, + "loss": 0.008977667987346649, + "mean_token_accuracy": 0.9968570172786713, + "num_tokens": 72934865.0, + "step": 3550 + }, + { + "entropy": 0.797937935590744, + "epoch": 6.556169429097606, + "grad_norm": 0.700501024723053, + "learning_rate": 2.910089781920486e-05, + "loss": 0.00973074734210968, + "mean_token_accuracy": 0.996515303850174, + "num_tokens": 73139684.0, + "step": 3560 + }, + { + "entropy": 0.7993333518505097, + "epoch": 6.574585635359116, + "grad_norm": 0.44471475481987, + "learning_rate": 2.882462989786061e-05, + "loss": 0.008206719905138016, + "mean_token_accuracy": 0.9968972980976105, + "num_tokens": 73345547.0, + "step": 3570 + }, + { + "entropy": 0.7961922466754914, + "epoch": 6.593001841620627, + "grad_norm": 0.5011329054832458, + "learning_rate": 2.854914751368109e-05, + "loss": 0.009073075652122498, + "mean_token_accuracy": 0.9968676805496216, + "num_tokens": 73550822.0, + "step": 3580 + }, + { + "entropy": 0.8064342319965363, + "epoch": 6.611418047882136, + "grad_norm": 0.6375740766525269, + "learning_rate": 2.82744608861642e-05, + "loss": 0.009309899061918259, + "mean_token_accuracy": 0.9967096745967865, + "num_tokens": 73756564.0, + "step": 3590 + }, + { + "entropy": 0.8025432348251342, + "epoch": 6.629834254143646, + "grad_norm": 0.8449372053146362, + "learning_rate": 2.8000580205287874e-05, + "loss": 0.009333166480064391, + "mean_token_accuracy": 0.9967890501022338, + "num_tokens": 73961849.0, + "step": 3600 + }, + { + "entropy": 0.8010810256004334, + "epoch": 6.648250460405157, + "grad_norm": 0.9473148584365845, + "learning_rate": 2.772751563113213e-05, + "loss": 0.00938543900847435, + "mean_token_accuracy": 0.996571558713913, + "num_tokens": 74167074.0, + "step": 3610 + }, + { + "entropy": 0.7954266011714936, + "epoch": 6.666666666666667, + "grad_norm": 0.4197849631309509, + "learning_rate": 2.7455277293502007e-05, + "loss": 0.008846811950206757, + "mean_token_accuracy": 0.9970432996749878, + "num_tokens": 74372119.0, + "step": 3620 + }, + { + "entropy": 0.7859483778476715, + "epoch": 6.685082872928177, + "grad_norm": 0.5353069305419922, + "learning_rate": 2.7183875291551892e-05, + "loss": 0.008807064592838287, + "mean_token_accuracy": 0.9969651758670807, + "num_tokens": 74577516.0, + "step": 3630 + }, + { + "entropy": 0.7940182387828827, + "epoch": 6.703499079189687, + "grad_norm": 0.5789965391159058, + "learning_rate": 2.6913319693410828e-05, + "loss": 0.008173662424087524, + "mean_token_accuracy": 0.9970715939998627, + "num_tokens": 74783031.0, + "step": 3640 + }, + { + "entropy": 0.7871349632740021, + "epoch": 6.721915285451197, + "grad_norm": 0.5887596011161804, + "learning_rate": 2.6643620535809076e-05, + "loss": 0.008517104387283325, + "mean_token_accuracy": 0.9969267845153809, + "num_tokens": 74988646.0, + "step": 3650 + }, + { + "entropy": 0.783170485496521, + "epoch": 6.740331491712707, + "grad_norm": 0.6228395104408264, + "learning_rate": 2.637478782370574e-05, + "loss": 0.008941689878702164, + "mean_token_accuracy": 0.9967794418334961, + "num_tokens": 75193938.0, + "step": 3660 + }, + { + "entropy": 0.7779926240444184, + "epoch": 6.758747697974218, + "grad_norm": 0.7367292642593384, + "learning_rate": 2.61068315299176e-05, + "loss": 0.009662539511919022, + "mean_token_accuracy": 0.9965554535388946, + "num_tokens": 75399816.0, + "step": 3670 + }, + { + "entropy": 0.7756146490573883, + "epoch": 6.777163904235728, + "grad_norm": 0.7478228807449341, + "learning_rate": 2.5839761594749167e-05, + "loss": 0.008691602945327758, + "mean_token_accuracy": 0.996806287765503, + "num_tokens": 75605531.0, + "step": 3680 + }, + { + "entropy": 0.7793804049491883, + "epoch": 6.795580110497237, + "grad_norm": 0.580205500125885, + "learning_rate": 2.5573587925623964e-05, + "loss": 0.00922732800245285, + "mean_token_accuracy": 0.9966219186782836, + "num_tokens": 75811143.0, + "step": 3690 + }, + { + "entropy": 0.7817609786987305, + "epoch": 6.813996316758748, + "grad_norm": 0.3849862813949585, + "learning_rate": 2.530832039671694e-05, + "loss": 0.00812167227268219, + "mean_token_accuracy": 0.9970280706882477, + "num_tokens": 76016366.0, + "step": 3700 + }, + { + "entropy": 0.7793294489383698, + "epoch": 6.832412523020258, + "grad_norm": 0.4873282313346863, + "learning_rate": 2.504396884858825e-05, + "loss": 0.008183138072490692, + "mean_token_accuracy": 0.9973145961761475, + "num_tokens": 76221692.0, + "step": 3710 + }, + { + "entropy": 0.7805068492889404, + "epoch": 6.850828729281768, + "grad_norm": 0.6652786135673523, + "learning_rate": 2.478054308781807e-05, + "loss": 0.009141853451728821, + "mean_token_accuracy": 0.9968416154384613, + "num_tokens": 76427170.0, + "step": 3720 + }, + { + "entropy": 0.7799863159656525, + "epoch": 6.8692449355432785, + "grad_norm": 0.5895428657531738, + "learning_rate": 2.451805288664298e-05, + "loss": 0.009343943744897842, + "mean_token_accuracy": 0.9968909084796905, + "num_tokens": 76632450.0, + "step": 3730 + }, + { + "entropy": 0.7819362223148346, + "epoch": 6.887661141804788, + "grad_norm": 0.6007734537124634, + "learning_rate": 2.425650798259327e-05, + "loss": 0.008081933856010437, + "mean_token_accuracy": 0.9972956955432892, + "num_tokens": 76837993.0, + "step": 3740 + }, + { + "entropy": 0.7859819054603576, + "epoch": 6.906077348066298, + "grad_norm": 0.5510725975036621, + "learning_rate": 2.39959180781318e-05, + "loss": 0.008848348259925842, + "mean_token_accuracy": 0.9968287885189057, + "num_tokens": 77043697.0, + "step": 3750 + }, + { + "entropy": 0.7813855290412903, + "epoch": 6.9244935543278086, + "grad_norm": 0.5004434585571289, + "learning_rate": 2.3736292840294122e-05, + "loss": 0.00795777291059494, + "mean_token_accuracy": 0.9973017990589141, + "num_tokens": 77248720.0, + "step": 3760 + }, + { + "entropy": 0.774254196882248, + "epoch": 6.942909760589319, + "grad_norm": 0.7068622708320618, + "learning_rate": 2.347764190032974e-05, + "loss": 0.007790238410234451, + "mean_token_accuracy": 0.997188663482666, + "num_tokens": 77454096.0, + "step": 3770 + }, + { + "entropy": 0.7675817251205445, + "epoch": 6.961325966850829, + "grad_norm": 0.5110977292060852, + "learning_rate": 2.3219974853344905e-05, + "loss": 0.008631375432014466, + "mean_token_accuracy": 0.9967362582683563, + "num_tokens": 77659811.0, + "step": 3780 + }, + { + "entropy": 0.7719516515731811, + "epoch": 6.979742173112339, + "grad_norm": 0.6288211941719055, + "learning_rate": 2.2963301257946622e-05, + "loss": 0.00804171860218048, + "mean_token_accuracy": 0.9971263229846954, + "num_tokens": 77865539.0, + "step": 3790 + }, + { + "entropy": 0.7786632418632508, + "epoch": 6.998158379373849, + "grad_norm": 0.5279833078384399, + "learning_rate": 2.270763063588814e-05, + "loss": 0.007490953803062439, + "mean_token_accuracy": 0.9974353730678558, + "num_tokens": 78070767.0, + "step": 3800 + }, + { + "epoch": 7.0, + "eval_entropy": 0.7808213239130767, + "eval_loss": 0.07382760941982269, + "eval_mean_token_accuracy": 0.9800234224485315, + "eval_num_tokens": 78091327.0, + "eval_runtime": 10.072, + "eval_samples_per_second": 363.186, + "eval_steps_per_second": 11.418, + "step": 3801 + }, + { + "entropy": 0.7750412881374359, + "epoch": 7.016574585635359, + "grad_norm": 0.3850567936897278, + "learning_rate": 2.2452972471715644e-05, + "loss": 0.005539501458406449, + "mean_token_accuracy": 0.9983771502971649, + "num_tokens": 78276132.0, + "step": 3810 + }, + { + "entropy": 0.7673899948596954, + "epoch": 7.0349907918968695, + "grad_norm": 0.4390123188495636, + "learning_rate": 2.2199336212416406e-05, + "loss": 0.0051019065082073215, + "mean_token_accuracy": 0.9984941363334656, + "num_tokens": 78481842.0, + "step": 3820 + }, + { + "entropy": 0.7712743639945984, + "epoch": 7.05340699815838, + "grad_norm": 0.48151132464408875, + "learning_rate": 2.1946731267068386e-05, + "loss": 0.005401181802153587, + "mean_token_accuracy": 0.9984619855880738, + "num_tokens": 78687831.0, + "step": 3830 + }, + { + "entropy": 0.772344833612442, + "epoch": 7.071823204419889, + "grad_norm": 0.3234920799732208, + "learning_rate": 2.169516700649115e-05, + "loss": 0.004806514084339142, + "mean_token_accuracy": 0.9984551191329956, + "num_tokens": 78893336.0, + "step": 3840 + }, + { + "entropy": 0.7736243844032288, + "epoch": 7.0902394106813995, + "grad_norm": 0.4605523645877838, + "learning_rate": 2.1444652762898242e-05, + "loss": 0.0041438989341259, + "mean_token_accuracy": 0.9988476693630218, + "num_tokens": 79098566.0, + "step": 3850 + }, + { + "entropy": 0.7667625486850739, + "epoch": 7.10865561694291, + "grad_norm": 0.43270638585090637, + "learning_rate": 2.119519782955105e-05, + "loss": 0.004775972291827202, + "mean_token_accuracy": 0.9984218835830688, + "num_tokens": 79303871.0, + "step": 3860 + }, + { + "entropy": 0.7631282329559326, + "epoch": 7.12707182320442, + "grad_norm": 0.35699328780174255, + "learning_rate": 2.094681146041394e-05, + "loss": 0.00421409159898758, + "mean_token_accuracy": 0.9988139629364013, + "num_tokens": 79509090.0, + "step": 3870 + }, + { + "entropy": 0.7578658938407898, + "epoch": 7.14548802946593, + "grad_norm": 0.6432686448097229, + "learning_rate": 2.06995028698111e-05, + "loss": 0.004374136403203011, + "mean_token_accuracy": 0.998746919631958, + "num_tokens": 79714834.0, + "step": 3880 + }, + { + "entropy": 0.7558880388736725, + "epoch": 7.16390423572744, + "grad_norm": 0.7402953505516052, + "learning_rate": 2.0453281232084586e-05, + "loss": 0.004856631904840469, + "mean_token_accuracy": 0.9985869526863098, + "num_tokens": 79920226.0, + "step": 3890 + }, + { + "entropy": 0.7542947113513947, + "epoch": 7.18232044198895, + "grad_norm": 0.3336258828639984, + "learning_rate": 2.0208155681254076e-05, + "loss": 0.0044605318456888195, + "mean_token_accuracy": 0.9986851871013641, + "num_tokens": 80125096.0, + "step": 3900 + }, + { + "entropy": 0.7547785460948944, + "epoch": 7.2007366482504604, + "grad_norm": 0.48785507678985596, + "learning_rate": 1.9964135310678017e-05, + "loss": 0.004243453219532967, + "mean_token_accuracy": 0.99877148270607, + "num_tokens": 80330544.0, + "step": 3910 + }, + { + "entropy": 0.7504124104976654, + "epoch": 7.219152854511971, + "grad_norm": 0.462425172328949, + "learning_rate": 1.9721229172716245e-05, + "loss": 0.004574070125818253, + "mean_token_accuracy": 0.9984130024909973, + "num_tokens": 80536072.0, + "step": 3920 + }, + { + "entropy": 0.7527936816215515, + "epoch": 7.237569060773481, + "grad_norm": 0.38035058975219727, + "learning_rate": 1.9479446278394208e-05, + "loss": 0.004055039957165718, + "mean_token_accuracy": 0.9987563371658326, + "num_tokens": 80741305.0, + "step": 3930 + }, + { + "entropy": 0.7502566337585449, + "epoch": 7.2559852670349905, + "grad_norm": 0.3040870428085327, + "learning_rate": 1.9238795597068665e-05, + "loss": 0.0041418131440877914, + "mean_token_accuracy": 0.998740965127945, + "num_tokens": 80946776.0, + "step": 3940 + }, + { + "entropy": 0.7453009426593781, + "epoch": 7.274401473296501, + "grad_norm": 0.39008331298828125, + "learning_rate": 1.8999286056095e-05, + "loss": 0.003949865326285362, + "mean_token_accuracy": 0.9989946007728576, + "num_tokens": 81151930.0, + "step": 3950 + }, + { + "entropy": 0.7451439797878265, + "epoch": 7.292817679558011, + "grad_norm": 0.44502392411231995, + "learning_rate": 1.8760926540496006e-05, + "loss": 0.0047814734280109406, + "mean_token_accuracy": 0.9984888076782227, + "num_tokens": 81357559.0, + "step": 3960 + }, + { + "entropy": 0.749438214302063, + "epoch": 7.311233885819521, + "grad_norm": 0.5195235013961792, + "learning_rate": 1.8523725892632253e-05, + "loss": 0.004281774908304214, + "mean_token_accuracy": 0.9987141609191894, + "num_tokens": 81563294.0, + "step": 3970 + }, + { + "entropy": 0.7536393761634826, + "epoch": 7.329650092081032, + "grad_norm": 0.34366822242736816, + "learning_rate": 1.828769291187413e-05, + "loss": 0.0038790594786405562, + "mean_token_accuracy": 0.9988636136054992, + "num_tokens": 81768219.0, + "step": 3980 + }, + { + "entropy": 0.750888729095459, + "epoch": 7.348066298342541, + "grad_norm": 0.3888038098812103, + "learning_rate": 1.8052836354275355e-05, + "loss": 0.0046471841633319855, + "mean_token_accuracy": 0.9985671877861023, + "num_tokens": 81973285.0, + "step": 3990 + }, + { + "entropy": 0.7512096881866455, + "epoch": 7.366482504604051, + "grad_norm": 0.4684146046638489, + "learning_rate": 1.7819164932248194e-05, + "loss": 0.0049116648733615875, + "mean_token_accuracy": 0.9983943462371826, + "num_tokens": 82178663.0, + "step": 4000 + }, + { + "entropy": 0.7514511585235596, + "epoch": 7.384898710865562, + "grad_norm": 0.571757435798645, + "learning_rate": 1.7586687314240296e-05, + "loss": 0.005085925757884979, + "mean_token_accuracy": 0.9984491765499115, + "num_tokens": 82384703.0, + "step": 4010 + }, + { + "entropy": 0.7534020125865937, + "epoch": 7.403314917127072, + "grad_norm": 0.3526608943939209, + "learning_rate": 1.7355412124412988e-05, + "loss": 0.004772019758820534, + "mean_token_accuracy": 0.9984830975532532, + "num_tokens": 82590547.0, + "step": 4020 + }, + { + "entropy": 0.7495484054088593, + "epoch": 7.421731123388582, + "grad_norm": 0.389273464679718, + "learning_rate": 1.7125347942321523e-05, + "loss": 0.004463380947709083, + "mean_token_accuracy": 0.9985935151576996, + "num_tokens": 82795801.0, + "step": 4030 + }, + { + "entropy": 0.7503954172134399, + "epoch": 7.440147329650092, + "grad_norm": 0.44278526306152344, + "learning_rate": 1.689650330259665e-05, + "loss": 0.004496005177497864, + "mean_token_accuracy": 0.9985817015171051, + "num_tokens": 83001332.0, + "step": 4040 + }, + { + "entropy": 0.7515525698661805, + "epoch": 7.458563535911602, + "grad_norm": 0.418070524930954, + "learning_rate": 1.666888669462809e-05, + "loss": 0.004161220416426659, + "mean_token_accuracy": 0.9988651812076569, + "num_tokens": 83206211.0, + "step": 4050 + }, + { + "entropy": 0.747660368680954, + "epoch": 7.476979742173112, + "grad_norm": 0.52656090259552, + "learning_rate": 1.6442506562249622e-05, + "loss": 0.0042838241904973985, + "mean_token_accuracy": 0.9985848963260651, + "num_tokens": 83411773.0, + "step": 4060 + }, + { + "entropy": 0.7506191194057464, + "epoch": 7.495395948434623, + "grad_norm": 0.29261597990989685, + "learning_rate": 1.621737130342578e-05, + "loss": 0.003957664594054222, + "mean_token_accuracy": 0.9986599206924438, + "num_tokens": 83617231.0, + "step": 4070 + }, + { + "entropy": 0.7549885094165802, + "epoch": 7.513812154696133, + "grad_norm": 0.35437923669815063, + "learning_rate": 1.599348926994036e-05, + "loss": 0.0034299422055482864, + "mean_token_accuracy": 0.9990102112293243, + "num_tokens": 83822893.0, + "step": 4080 + }, + { + "entropy": 0.7578713536262512, + "epoch": 7.532228360957642, + "grad_norm": 0.38587674498558044, + "learning_rate": 1.5770868767086567e-05, + "loss": 0.003320001810789108, + "mean_token_accuracy": 0.9990382492542267, + "num_tokens": 84028376.0, + "step": 4090 + }, + { + "entropy": 0.754696124792099, + "epoch": 7.550644567219153, + "grad_norm": 0.39879584312438965, + "learning_rate": 1.554951805335897e-05, + "loss": 0.004190019145607948, + "mean_token_accuracy": 0.9987861573696136, + "num_tokens": 84233768.0, + "step": 4100 + }, + { + "entropy": 0.7515742480754852, + "epoch": 7.569060773480663, + "grad_norm": 0.47624126076698303, + "learning_rate": 1.5329445340147096e-05, + "loss": 0.00403064489364624, + "mean_token_accuracy": 0.9986754775047302, + "num_tokens": 84439923.0, + "step": 4110 + }, + { + "entropy": 0.7527350902557373, + "epoch": 7.587476979742173, + "grad_norm": 0.4721614718437195, + "learning_rate": 1.5110658791430804e-05, + "loss": 0.004505171626806259, + "mean_token_accuracy": 0.9985378623008728, + "num_tokens": 84645432.0, + "step": 4120 + }, + { + "entropy": 0.747485089302063, + "epoch": 7.605893186003684, + "grad_norm": 0.49411219358444214, + "learning_rate": 1.4893166523477448e-05, + "loss": 0.0038516007363796232, + "mean_token_accuracy": 0.9987127304077148, + "num_tokens": 84850968.0, + "step": 4130 + }, + { + "entropy": 0.7463041722774506, + "epoch": 7.624309392265193, + "grad_norm": 0.4478297531604767, + "learning_rate": 1.4676976604540787e-05, + "loss": 0.00429936945438385, + "mean_token_accuracy": 0.9987892746925354, + "num_tokens": 85056363.0, + "step": 4140 + }, + { + "entropy": 0.7416360318660736, + "epoch": 7.642725598526703, + "grad_norm": 0.4913847744464874, + "learning_rate": 1.4462097054561675e-05, + "loss": 0.0036755587905645372, + "mean_token_accuracy": 0.9989015281200408, + "num_tokens": 85262252.0, + "step": 4150 + }, + { + "entropy": 0.7403277635574341, + "epoch": 7.661141804788214, + "grad_norm": 0.49693024158477783, + "learning_rate": 1.4248535844870586e-05, + "loss": 0.0037889480590820312, + "mean_token_accuracy": 0.99878990650177, + "num_tokens": 85468067.0, + "step": 4160 + }, + { + "entropy": 0.7400458335876465, + "epoch": 7.679558011049724, + "grad_norm": 0.4609115421772003, + "learning_rate": 1.4036300897891819e-05, + "loss": 0.004160438477993011, + "mean_token_accuracy": 0.9985541105270386, + "num_tokens": 85673442.0, + "step": 4170 + }, + { + "entropy": 0.7405486226081848, + "epoch": 7.697974217311234, + "grad_norm": 0.4713679254055023, + "learning_rate": 1.3825400086849693e-05, + "loss": 0.004131061211228371, + "mean_token_accuracy": 0.9986487686634063, + "num_tokens": 85878837.0, + "step": 4180 + }, + { + "entropy": 0.7426068425178528, + "epoch": 7.716390423572744, + "grad_norm": 0.36043041944503784, + "learning_rate": 1.3615841235476423e-05, + "loss": 0.004306273162364959, + "mean_token_accuracy": 0.9986724078655242, + "num_tokens": 86083884.0, + "step": 4190 + }, + { + "entropy": 0.7389791548252106, + "epoch": 7.734806629834254, + "grad_norm": 0.4564935863018036, + "learning_rate": 1.3407632117721858e-05, + "loss": 0.003909315168857575, + "mean_token_accuracy": 0.998731005191803, + "num_tokens": 86289698.0, + "step": 4200 + }, + { + "entropy": 0.7440706253051758, + "epoch": 7.753222836095764, + "grad_norm": 0.442862331867218, + "learning_rate": 1.3200780457465211e-05, + "loss": 0.0041195075958967205, + "mean_token_accuracy": 0.9987683832645416, + "num_tokens": 86494738.0, + "step": 4210 + }, + { + "entropy": 0.738617730140686, + "epoch": 7.7716390423572745, + "grad_norm": 0.4486972391605377, + "learning_rate": 1.2995293928228385e-05, + "loss": 0.003850420191884041, + "mean_token_accuracy": 0.998667311668396, + "num_tokens": 86700730.0, + "step": 4220 + }, + { + "entropy": 0.7406526923179626, + "epoch": 7.790055248618785, + "grad_norm": 0.5896158218383789, + "learning_rate": 1.2791180152891396e-05, + "loss": 0.004078804701566696, + "mean_token_accuracy": 0.9987831771373749, + "num_tokens": 86905871.0, + "step": 4230 + }, + { + "entropy": 0.7394271969795227, + "epoch": 7.808471454880294, + "grad_norm": 0.5551350116729736, + "learning_rate": 1.2588446703409552e-05, + "loss": 0.004226792231202125, + "mean_token_accuracy": 0.9985223591327668, + "num_tokens": 87111128.0, + "step": 4240 + }, + { + "entropy": 0.7396899223327636, + "epoch": 7.826887661141805, + "grad_norm": 0.48465287685394287, + "learning_rate": 1.23871011005326e-05, + "loss": 0.004565985128283501, + "mean_token_accuracy": 0.9984864890575409, + "num_tokens": 87316264.0, + "step": 4250 + }, + { + "entropy": 0.7409733414649964, + "epoch": 7.845303867403315, + "grad_norm": 0.44551214575767517, + "learning_rate": 1.218715081352571e-05, + "loss": 0.004014456272125244, + "mean_token_accuracy": 0.9988965094089508, + "num_tokens": 87521547.0, + "step": 4260 + }, + { + "entropy": 0.7397344529628753, + "epoch": 7.863720073664825, + "grad_norm": 0.5410996079444885, + "learning_rate": 1.198860325989235e-05, + "loss": 0.0038732051849365234, + "mean_token_accuracy": 0.9988058865070343, + "num_tokens": 87727075.0, + "step": 4270 + }, + { + "entropy": 0.7383821964263916, + "epoch": 7.8821362799263355, + "grad_norm": 0.3724612891674042, + "learning_rate": 1.1791465805099183e-05, + "loss": 0.0038180787116289137, + "mean_token_accuracy": 0.9988650500774383, + "num_tokens": 87932399.0, + "step": 4280 + }, + { + "entropy": 0.73704674243927, + "epoch": 7.900552486187845, + "grad_norm": 0.44809216260910034, + "learning_rate": 1.1595745762302779e-05, + "loss": 0.0037666790187358854, + "mean_token_accuracy": 0.9988551497459411, + "num_tokens": 88138197.0, + "step": 4290 + }, + { + "entropy": 0.7348978996276856, + "epoch": 7.918968692449355, + "grad_norm": 0.3487635850906372, + "learning_rate": 1.140145039207836e-05, + "loss": 0.0034891828894615174, + "mean_token_accuracy": 0.9990123450756073, + "num_tokens": 88343771.0, + "step": 4300 + }, + { + "entropy": 0.7349947333335877, + "epoch": 7.9373848987108655, + "grad_norm": 0.45597800612449646, + "learning_rate": 1.1208586902150458e-05, + "loss": 0.0037573061883449553, + "mean_token_accuracy": 0.9988206088542938, + "num_tokens": 88549078.0, + "step": 4310 + }, + { + "entropy": 0.7384503066539765, + "epoch": 7.955801104972376, + "grad_norm": 0.4761682450771332, + "learning_rate": 1.1017162447125484e-05, + "loss": 0.004058422148227691, + "mean_token_accuracy": 0.9988016963005066, + "num_tokens": 88754354.0, + "step": 4320 + }, + { + "entropy": 0.7428180873394012, + "epoch": 7.974217311233886, + "grad_norm": 0.4685237407684326, + "learning_rate": 1.0827184128226392e-05, + "loss": 0.003867045044898987, + "mean_token_accuracy": 0.998711907863617, + "num_tokens": 88959534.0, + "step": 4330 + }, + { + "entropy": 0.7457234025001526, + "epoch": 7.9926335174953955, + "grad_norm": 0.35596850514411926, + "learning_rate": 1.0638658993029154e-05, + "loss": 0.0037776529788970947, + "mean_token_accuracy": 0.9989305913448334, + "num_tokens": 89164857.0, + "step": 4340 + }, + { + "epoch": 8.0, + "eval_entropy": 0.7504938332930855, + "eval_loss": 0.08306439220905304, + "eval_mean_token_accuracy": 0.980349570253621, + "eval_num_tokens": 89247188.0, + "eval_runtime": 10.0529, + "eval_samples_per_second": 363.875, + "eval_steps_per_second": 11.439, + "step": 4344 + }, + { + "entropy": 0.7459707975387573, + "epoch": 8.011049723756907, + "grad_norm": 0.2889060974121094, + "learning_rate": 1.0451594035201378e-05, + "loss": 0.003062780387699604, + "mean_token_accuracy": 0.9991089224815368, + "num_tokens": 89370847.0, + "step": 4350 + }, + { + "entropy": 0.7482452511787414, + "epoch": 8.029465930018416, + "grad_norm": 0.2335795760154724, + "learning_rate": 1.0265996194242888e-05, + "loss": 0.0023574704304337502, + "mean_token_accuracy": 0.9994185745716095, + "num_tokens": 89576321.0, + "step": 4360 + }, + { + "entropy": 0.7470004737377167, + "epoch": 8.047882136279926, + "grad_norm": 0.2035285383462906, + "learning_rate": 1.0081872355228228e-05, + "loss": 0.0021786754950881004, + "mean_token_accuracy": 0.9996390819549561, + "num_tokens": 89781865.0, + "step": 4370 + }, + { + "entropy": 0.7470916926860809, + "epoch": 8.066298342541437, + "grad_norm": 0.2530227303504944, + "learning_rate": 9.899229348551275e-06, + "loss": 0.0022982701659202574, + "mean_token_accuracy": 0.9995273351669312, + "num_tokens": 89987004.0, + "step": 4380 + }, + { + "entropy": 0.7481065988540649, + "epoch": 8.084714548802946, + "grad_norm": 0.269809752702713, + "learning_rate": 9.718073949671857e-06, + "loss": 0.0022342003881931304, + "mean_token_accuracy": 0.9994472205638886, + "num_tokens": 90191825.0, + "step": 4390 + }, + { + "entropy": 0.745439088344574, + "epoch": 8.103130755064457, + "grad_norm": 0.29525282979011536, + "learning_rate": 9.538412878864423e-06, + "loss": 0.002189977839589119, + "mean_token_accuracy": 0.9995133578777313, + "num_tokens": 90397588.0, + "step": 4400 + }, + { + "entropy": 0.745042335987091, + "epoch": 8.121546961325967, + "grad_norm": 0.26536691188812256, + "learning_rate": 9.360252800968717e-06, + "loss": 0.0021448172628879547, + "mean_token_accuracy": 0.9994488894939423, + "num_tokens": 90602602.0, + "step": 4410 + }, + { + "entropy": 0.7413490653038025, + "epoch": 8.139963167587476, + "grad_norm": 0.5203945636749268, + "learning_rate": 9.183600325142538e-06, + "loss": 0.002386796101927757, + "mean_token_accuracy": 0.999308317899704, + "num_tokens": 90808417.0, + "step": 4420 + }, + { + "entropy": 0.7447272837162018, + "epoch": 8.158379373848987, + "grad_norm": 0.23244412243366241, + "learning_rate": 9.008462004616558e-06, + "loss": 0.0021626869216561317, + "mean_token_accuracy": 0.9995910286903381, + "num_tokens": 91013911.0, + "step": 4430 + }, + { + "entropy": 0.7487411558628082, + "epoch": 8.176795580110497, + "grad_norm": 0.2531150281429291, + "learning_rate": 8.834844336451237e-06, + "loss": 0.0023509185761213303, + "mean_token_accuracy": 0.9994486331939697, + "num_tokens": 91219200.0, + "step": 4440 + }, + { + "entropy": 0.7508959770202637, + "epoch": 8.195211786372008, + "grad_norm": 0.3283107876777649, + "learning_rate": 8.662753761295772e-06, + "loss": 0.0020494431257247923, + "mean_token_accuracy": 0.9996370017528534, + "num_tokens": 91424124.0, + "step": 4450 + }, + { + "entropy": 0.7488435864448547, + "epoch": 8.213627992633517, + "grad_norm": 0.394175261259079, + "learning_rate": 8.492196663149232e-06, + "loss": 0.0024197638034820558, + "mean_token_accuracy": 0.999306058883667, + "num_tokens": 91629365.0, + "step": 4460 + }, + { + "entropy": 0.7459348142147064, + "epoch": 8.232044198895027, + "grad_norm": 0.36789506673812866, + "learning_rate": 8.32317936912364e-06, + "loss": 0.0022338634356856347, + "mean_token_accuracy": 0.9994658648967742, + "num_tokens": 91834941.0, + "step": 4470 + }, + { + "entropy": 0.7455970704555511, + "epoch": 8.250460405156538, + "grad_norm": 0.2969403564929962, + "learning_rate": 8.155708149209362e-06, + "loss": 0.0021990347653627396, + "mean_token_accuracy": 0.9994347035884857, + "num_tokens": 92040366.0, + "step": 4480 + }, + { + "entropy": 0.7435900688171386, + "epoch": 8.268876611418047, + "grad_norm": 0.35975271463394165, + "learning_rate": 7.989789216042415e-06, + "loss": 0.002438249811530113, + "mean_token_accuracy": 0.9993879854679107, + "num_tokens": 92246051.0, + "step": 4490 + }, + { + "entropy": 0.742461520433426, + "epoch": 8.287292817679559, + "grad_norm": 0.33160826563835144, + "learning_rate": 7.825428724674043e-06, + "loss": 0.0023146603256464005, + "mean_token_accuracy": 0.9994353473186492, + "num_tokens": 92451596.0, + "step": 4500 + }, + { + "entropy": 0.7435722470283508, + "epoch": 8.305709023941068, + "grad_norm": 0.3440259099006653, + "learning_rate": 7.662632772342415e-06, + "loss": 0.0021770250052213667, + "mean_token_accuracy": 0.9994809687137604, + "num_tokens": 92657419.0, + "step": 4510 + }, + { + "entropy": 0.7460585415363312, + "epoch": 8.324125230202577, + "grad_norm": 0.24974678456783295, + "learning_rate": 7.501407398246369e-06, + "loss": 0.0020514041185379027, + "mean_token_accuracy": 0.9994971275329589, + "num_tokens": 92863210.0, + "step": 4520 + }, + { + "entropy": 0.7475444614887238, + "epoch": 8.342541436464089, + "grad_norm": 0.31347015500068665, + "learning_rate": 7.3417585833214346e-06, + "loss": 0.0022453794255852698, + "mean_token_accuracy": 0.9995283901691436, + "num_tokens": 93068502.0, + "step": 4530 + }, + { + "entropy": 0.7466561555862427, + "epoch": 8.360957642725598, + "grad_norm": 0.2352103888988495, + "learning_rate": 7.183692250017915e-06, + "loss": 0.0020875211805105208, + "mean_token_accuracy": 0.999623715877533, + "num_tokens": 93274067.0, + "step": 4540 + }, + { + "entropy": 0.7491689443588256, + "epoch": 8.37937384898711, + "grad_norm": 0.2557368576526642, + "learning_rate": 7.027214262081239e-06, + "loss": 0.002048984169960022, + "mean_token_accuracy": 0.999556976556778, + "num_tokens": 93479279.0, + "step": 4550 + }, + { + "entropy": 0.744943630695343, + "epoch": 8.397790055248619, + "grad_norm": 0.2464229315519333, + "learning_rate": 6.872330424334395e-06, + "loss": 0.002088337019085884, + "mean_token_accuracy": 0.9994827687740326, + "num_tokens": 93684930.0, + "step": 4560 + }, + { + "entropy": 0.7461286425590515, + "epoch": 8.416206261510128, + "grad_norm": 0.2721276879310608, + "learning_rate": 6.719046482462571e-06, + "loss": 0.0020654335618019103, + "mean_token_accuracy": 0.9994162619113922, + "num_tokens": 93889972.0, + "step": 4570 + }, + { + "entropy": 0.7432287812232972, + "epoch": 8.43462246777164, + "grad_norm": 0.2500509023666382, + "learning_rate": 6.567368122800072e-06, + "loss": 0.002229658514261246, + "mean_token_accuracy": 0.9993383109569549, + "num_tokens": 94095488.0, + "step": 4580 + }, + { + "entropy": 0.742755651473999, + "epoch": 8.453038674033149, + "grad_norm": 0.17173363268375397, + "learning_rate": 6.4173009721193115e-06, + "loss": 0.0021433889865875243, + "mean_token_accuracy": 0.9994635343551636, + "num_tokens": 94300651.0, + "step": 4590 + }, + { + "entropy": 0.7388956308364868, + "epoch": 8.47145488029466, + "grad_norm": 0.274360716342926, + "learning_rate": 6.26885059742211e-06, + "loss": 0.0022360695526003837, + "mean_token_accuracy": 0.9994634568691254, + "num_tokens": 94506048.0, + "step": 4600 + }, + { + "entropy": 0.7357347309589386, + "epoch": 8.48987108655617, + "grad_norm": 0.24777138233184814, + "learning_rate": 6.122022505733205e-06, + "loss": 0.0022206470370292664, + "mean_token_accuracy": 0.9994794666767121, + "num_tokens": 94711514.0, + "step": 4610 + }, + { + "entropy": 0.735917067527771, + "epoch": 8.50828729281768, + "grad_norm": 0.3161139488220215, + "learning_rate": 5.976822143895872e-06, + "loss": 0.002088923379778862, + "mean_token_accuracy": 0.9994975507259369, + "num_tokens": 94917150.0, + "step": 4620 + }, + { + "entropy": 0.7372638165950776, + "epoch": 8.52670349907919, + "grad_norm": 0.20879895985126495, + "learning_rate": 5.833254898369972e-06, + "loss": 0.0024028895422816277, + "mean_token_accuracy": 0.9993885040283204, + "num_tokens": 95122554.0, + "step": 4630 + }, + { + "entropy": 0.7353293180465699, + "epoch": 8.5451197053407, + "grad_norm": 0.21544156968593597, + "learning_rate": 5.69132609503204e-06, + "loss": 0.0022050481289625167, + "mean_token_accuracy": 0.9993851661682129, + "num_tokens": 95327539.0, + "step": 4640 + }, + { + "entropy": 0.7328085541725159, + "epoch": 8.56353591160221, + "grad_norm": 0.2792287766933441, + "learning_rate": 5.551040998977747e-06, + "loss": 0.0022569041699171065, + "mean_token_accuracy": 0.999373483657837, + "num_tokens": 95533608.0, + "step": 4650 + }, + { + "entropy": 0.7355918467044831, + "epoch": 8.58195211786372, + "grad_norm": 0.35147684812545776, + "learning_rate": 5.412404814326633e-06, + "loss": 0.001975206658244133, + "mean_token_accuracy": 0.9996055364608765, + "num_tokens": 95738975.0, + "step": 4660 + }, + { + "entropy": 0.7370502591133118, + "epoch": 8.600368324125231, + "grad_norm": 0.2416532039642334, + "learning_rate": 5.2754226840289415e-06, + "loss": 0.002513406053185463, + "mean_token_accuracy": 0.9993549644947052, + "num_tokens": 95944370.0, + "step": 4670 + }, + { + "entropy": 0.737632417678833, + "epoch": 8.61878453038674, + "grad_norm": 0.3952357769012451, + "learning_rate": 5.140099689674926e-06, + "loss": 0.001944526843726635, + "mean_token_accuracy": 0.9995273172855377, + "num_tokens": 96149509.0, + "step": 4680 + }, + { + "entropy": 0.7384664714336395, + "epoch": 8.63720073664825, + "grad_norm": 0.40997347235679626, + "learning_rate": 5.006440851306315e-06, + "loss": 0.00224909633398056, + "mean_token_accuracy": 0.9993718564510345, + "num_tokens": 96354896.0, + "step": 4690 + }, + { + "entropy": 0.7366916120052338, + "epoch": 8.655616942909761, + "grad_norm": 0.3221156597137451, + "learning_rate": 4.874451127230057e-06, + "loss": 0.001974274218082428, + "mean_token_accuracy": 0.9994946360588074, + "num_tokens": 96559981.0, + "step": 4700 + }, + { + "entropy": 0.7354292273521423, + "epoch": 8.67403314917127, + "grad_norm": 0.25105226039886475, + "learning_rate": 4.744135413834427e-06, + "loss": 0.002092510275542736, + "mean_token_accuracy": 0.9995289027690888, + "num_tokens": 96765568.0, + "step": 4710 + }, + { + "entropy": 0.7376088976860047, + "epoch": 8.692449355432782, + "grad_norm": 0.2917911410331726, + "learning_rate": 4.615498545407343e-06, + "loss": 0.0022462595254182814, + "mean_token_accuracy": 0.9993582189083099, + "num_tokens": 96971275.0, + "step": 4720 + }, + { + "entropy": 0.74048610329628, + "epoch": 8.710865561694291, + "grad_norm": 0.27348214387893677, + "learning_rate": 4.4885452939570585e-06, + "loss": 0.0022259410470724106, + "mean_token_accuracy": 0.9994804978370666, + "num_tokens": 97176487.0, + "step": 4730 + }, + { + "entropy": 0.7381387889385224, + "epoch": 8.7292817679558, + "grad_norm": 0.43880945444107056, + "learning_rate": 4.363280369035128e-06, + "loss": 0.002389534562826157, + "mean_token_accuracy": 0.9994222104549408, + "num_tokens": 97382263.0, + "step": 4740 + }, + { + "entropy": 0.7372830450534821, + "epoch": 8.747697974217312, + "grad_norm": 0.23599931597709656, + "learning_rate": 4.2397084175616885e-06, + "loss": 0.0020194988697767257, + "mean_token_accuracy": 0.9995136618614197, + "num_tokens": 97588269.0, + "step": 4750 + }, + { + "entropy": 0.7386971414089203, + "epoch": 8.766114180478821, + "grad_norm": 0.42252203822135925, + "learning_rate": 4.117834023653117e-06, + "loss": 0.0021715080365538597, + "mean_token_accuracy": 0.9993864893913269, + "num_tokens": 97793832.0, + "step": 4760 + }, + { + "entropy": 0.7398471057415008, + "epoch": 8.784530386740332, + "grad_norm": 0.2709618806838989, + "learning_rate": 3.9976617084519e-06, + "loss": 0.0023746009916067123, + "mean_token_accuracy": 0.9993088185787201, + "num_tokens": 97999276.0, + "step": 4770 + }, + { + "entropy": 0.7391557276248932, + "epoch": 8.802946593001842, + "grad_norm": 0.30807623267173767, + "learning_rate": 3.8791959299589895e-06, + "loss": 0.001992644742131233, + "mean_token_accuracy": 0.9994801640510559, + "num_tokens": 98204933.0, + "step": 4780 + }, + { + "entropy": 0.7370819032192231, + "epoch": 8.821362799263351, + "grad_norm": 0.25062650442123413, + "learning_rate": 3.762441082868373e-06, + "loss": 0.002125708945095539, + "mean_token_accuracy": 0.999417644739151, + "num_tokens": 98410574.0, + "step": 4790 + }, + { + "entropy": 0.7399656236171722, + "epoch": 8.839779005524862, + "grad_norm": 0.32383784651756287, + "learning_rate": 3.647401498404052e-06, + "loss": 0.0019155235961079598, + "mean_token_accuracy": 0.9995088458061219, + "num_tokens": 98615509.0, + "step": 4800 + }, + { + "entropy": 0.7371088266372681, + "epoch": 8.858195211786372, + "grad_norm": 0.2771705985069275, + "learning_rate": 3.5340814441594207e-06, + "loss": 0.0020927552133798598, + "mean_token_accuracy": 0.9995276153087616, + "num_tokens": 98820673.0, + "step": 4810 + }, + { + "entropy": 0.7376474678516388, + "epoch": 8.876611418047883, + "grad_norm": 0.37182608246803284, + "learning_rate": 3.422485123938862e-06, + "loss": 0.0021230582147836684, + "mean_token_accuracy": 0.9994983315467835, + "num_tokens": 99026388.0, + "step": 4820 + }, + { + "entropy": 0.7386386632919312, + "epoch": 8.895027624309392, + "grad_norm": 0.29156213998794556, + "learning_rate": 3.3126166776018763e-06, + "loss": 0.0022009313106536865, + "mean_token_accuracy": 0.9994311630725861, + "num_tokens": 99231572.0, + "step": 4830 + }, + { + "entropy": 0.7364113330841064, + "epoch": 8.913443830570902, + "grad_norm": 0.32613781094551086, + "learning_rate": 3.2044801809094805e-06, + "loss": 0.0021279999986290933, + "mean_token_accuracy": 0.999450010061264, + "num_tokens": 99437158.0, + "step": 4840 + }, + { + "entropy": 0.7359442055225373, + "epoch": 8.931860036832413, + "grad_norm": 0.34959548711776733, + "learning_rate": 3.098079645372992e-06, + "loss": 0.002106292359530926, + "mean_token_accuracy": 0.9993851602077484, + "num_tokens": 99642599.0, + "step": 4850 + }, + { + "entropy": 0.7357041001319885, + "epoch": 8.950276243093922, + "grad_norm": 0.3114016056060791, + "learning_rate": 2.993419018105248e-06, + "loss": 0.002017174661159515, + "mean_token_accuracy": 0.9994815111160278, + "num_tokens": 99848146.0, + "step": 4860 + }, + { + "entropy": 0.7360438585281373, + "epoch": 8.968692449355434, + "grad_norm": 0.1850976198911667, + "learning_rate": 2.890502181674154e-06, + "loss": 0.0017870433628559112, + "mean_token_accuracy": 0.9996066927909851, + "num_tokens": 100053727.0, + "step": 4870 + }, + { + "entropy": 0.7401692926883697, + "epoch": 8.987108655616943, + "grad_norm": 0.22801977396011353, + "learning_rate": 2.7893329539586678e-06, + "loss": 0.0018131747841835023, + "mean_token_accuracy": 0.9995904862880707, + "num_tokens": 100259212.0, + "step": 4880 + }, + { + "epoch": 9.0, + "eval_entropy": 0.7392174223194952, + "eval_loss": 0.08729223161935806, + "eval_mean_token_accuracy": 0.9803872994754625, + "eval_num_tokens": 100403043.0, + "eval_runtime": 10.0788, + "eval_samples_per_second": 362.941, + "eval_steps_per_second": 11.41, + "step": 4887 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.786260182155919e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4887/training_args.bin b/checkpoint-4887/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-4887/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/checkpoint-543/README.md b/checkpoint-543/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-543/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-543/adapter_config.json b/checkpoint-543/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-543/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-543/adapter_model.safetensors b/checkpoint-543/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cb61270e62785c45ccb7b4a159567da5b65ac8b4 --- /dev/null +++ b/checkpoint-543/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3759f99479bb7f6c67e389ea7cd94c4a1f4be8277648e8b2db827ea9eb3bcf3a +size 80792096 diff --git a/checkpoint-543/chat_template.jinja b/checkpoint-543/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-543/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-543/tokenizer.json b/checkpoint-543/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-543/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-543/tokenizer_config.json b/checkpoint-543/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-543/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-543/trainer_state.json b/checkpoint-543/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..63a4f411d09e0131754beb134a1c5c1cc38b33ce --- /dev/null +++ b/checkpoint-543/trainer_state.json @@ -0,0 +1,585 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 543, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.319709655427973e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-543/training_args.bin b/checkpoint-543/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-543/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/checkpoint-5430/README.md b/checkpoint-5430/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd49221a54a197d9f0cab86c7982e2b2fe0776b3 --- /dev/null +++ b/checkpoint-5430/README.md @@ -0,0 +1,209 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-5430/adapter_config.json b/checkpoint-5430/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88178e58c6d24c2bb788f871d548c8bff3b8b16a --- /dev/null +++ b/checkpoint-5430/adapter_config.json @@ -0,0 +1,48 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "gate_proj", + "k_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5430/adapter_model.safetensors b/checkpoint-5430/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c57a413b91fad886bd899d552a4aace9704085e4 --- /dev/null +++ b/checkpoint-5430/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44a37f4e701d0d74b2f3087cbbb4d2cce354e5bfcb1651dbb4d48e82fd2234d7 +size 80792096 diff --git a/checkpoint-5430/chat_template.jinja b/checkpoint-5430/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..bdf7919a96cfe43d50914a007b9c0877bd0ec27e --- /dev/null +++ b/checkpoint-5430/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/checkpoint-5430/tokenizer.json b/checkpoint-5430/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/checkpoint-5430/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/checkpoint-5430/tokenizer_config.json b/checkpoint-5430/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/checkpoint-5430/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-5430/trainer_state.json b/checkpoint-5430/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fd65f1af6f476b68fe0c8ca2ad93c7ea9563fd34 --- /dev/null +++ b/checkpoint-5430/trainer_state.json @@ -0,0 +1,5574 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 5430, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2237394809722901, + "epoch": 0.01841620626151013, + "grad_norm": 5.082435607910156, + "learning_rate": 3.308823529411765e-06, + "loss": 0.9237876892089844, + "mean_token_accuracy": 0.7685343027114868, + "num_tokens": 205423.0, + "step": 10 + }, + { + "entropy": 1.2295925617218018, + "epoch": 0.03683241252302026, + "grad_norm": 4.672000408172607, + "learning_rate": 6.985294117647059e-06, + "loss": 0.8900892257690429, + "mean_token_accuracy": 0.7677771031856537, + "num_tokens": 410849.0, + "step": 20 + }, + { + "entropy": 1.2285718679428101, + "epoch": 0.055248618784530384, + "grad_norm": 1.4828118085861206, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.5975452899932862, + "mean_token_accuracy": 0.8146551787853241, + "num_tokens": 616438.0, + "step": 30 + }, + { + "entropy": 1.210776400566101, + "epoch": 0.07366482504604052, + "grad_norm": 0.7761328816413879, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.40664992332458494, + "mean_token_accuracy": 0.8699092030525207, + "num_tokens": 822118.0, + "step": 40 + }, + { + "entropy": 1.200321125984192, + "epoch": 0.09208103130755065, + "grad_norm": 0.5363371968269348, + "learning_rate": 1.8014705882352943e-05, + "loss": 0.3313469409942627, + "mean_token_accuracy": 0.8904915869235992, + "num_tokens": 1027941.0, + "step": 50 + }, + { + "entropy": 1.1809936046600342, + "epoch": 0.11049723756906077, + "grad_norm": 0.39541518688201904, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.27568228244781495, + "mean_token_accuracy": 0.9047131836414337, + "num_tokens": 1233620.0, + "step": 60 + }, + { + "entropy": 1.169810914993286, + "epoch": 0.1289134438305709, + "grad_norm": 0.341960072517395, + "learning_rate": 2.536764705882353e-05, + "loss": 0.245219087600708, + "mean_token_accuracy": 0.9150686681270599, + "num_tokens": 1438656.0, + "step": 70 + }, + { + "entropy": 1.1652960777282715, + "epoch": 0.14732965009208104, + "grad_norm": 0.36872178316116333, + "learning_rate": 2.9044117647058828e-05, + "loss": 0.2220149040222168, + "mean_token_accuracy": 0.9224777698516846, + "num_tokens": 1643877.0, + "step": 80 + }, + { + "entropy": 1.154341197013855, + "epoch": 0.16574585635359115, + "grad_norm": 0.4152425229549408, + "learning_rate": 3.272058823529412e-05, + "loss": 0.2002798557281494, + "mean_token_accuracy": 0.9285802960395813, + "num_tokens": 1849506.0, + "step": 90 + }, + { + "entropy": 1.1507258892059327, + "epoch": 0.1841620626151013, + "grad_norm": 0.47647765278816223, + "learning_rate": 3.639705882352941e-05, + "loss": 0.18871363401412963, + "mean_token_accuracy": 0.9318056285381318, + "num_tokens": 2055071.0, + "step": 100 + }, + { + "entropy": 1.1455535531044005, + "epoch": 0.20257826887661143, + "grad_norm": 0.4853009581565857, + "learning_rate": 4.007352941176471e-05, + "loss": 0.17836341857910157, + "mean_token_accuracy": 0.9367631554603577, + "num_tokens": 2260643.0, + "step": 110 + }, + { + "entropy": 1.1402526497840881, + "epoch": 0.22099447513812154, + "grad_norm": 0.4455392360687256, + "learning_rate": 4.375e-05, + "loss": 0.16921783685684205, + "mean_token_accuracy": 0.9386959195137023, + "num_tokens": 2466085.0, + "step": 120 + }, + { + "entropy": 1.1374777555465698, + "epoch": 0.23941068139963168, + "grad_norm": 0.5880279541015625, + "learning_rate": 4.742647058823529e-05, + "loss": 0.15989291667938232, + "mean_token_accuracy": 0.9421182632446289, + "num_tokens": 2671024.0, + "step": 130 + }, + { + "entropy": 1.1273940205574036, + "epoch": 0.2578268876611418, + "grad_norm": 0.612959086894989, + "learning_rate": 5.110294117647059e-05, + "loss": 0.14701461791992188, + "mean_token_accuracy": 0.9463540315628052, + "num_tokens": 2876848.0, + "step": 140 + }, + { + "entropy": 1.1263513088226318, + "epoch": 0.27624309392265195, + "grad_norm": 0.5695255398750305, + "learning_rate": 5.477941176470589e-05, + "loss": 0.14604382514953612, + "mean_token_accuracy": 0.946351945400238, + "num_tokens": 3082589.0, + "step": 150 + }, + { + "entropy": 1.1290789365768432, + "epoch": 0.2946593001841621, + "grad_norm": 0.6608090996742249, + "learning_rate": 5.845588235294118e-05, + "loss": 0.1409450054168701, + "mean_token_accuracy": 0.9481450319290161, + "num_tokens": 3287459.0, + "step": 160 + }, + { + "entropy": 1.1291529774665832, + "epoch": 0.31307550644567217, + "grad_norm": 0.652715802192688, + "learning_rate": 6.213235294117647e-05, + "loss": 0.14441155195236205, + "mean_token_accuracy": 0.9466125547885895, + "num_tokens": 3493682.0, + "step": 170 + }, + { + "entropy": 1.1244838953018188, + "epoch": 0.3314917127071823, + "grad_norm": 0.7815241813659668, + "learning_rate": 6.580882352941177e-05, + "loss": 0.13361064195632935, + "mean_token_accuracy": 0.9512295544147491, + "num_tokens": 3699573.0, + "step": 180 + }, + { + "entropy": 1.1217721104621887, + "epoch": 0.34990791896869244, + "grad_norm": 0.7933160066604614, + "learning_rate": 6.948529411764706e-05, + "loss": 0.13089522123336791, + "mean_token_accuracy": 0.9520221531391144, + "num_tokens": 3905156.0, + "step": 190 + }, + { + "entropy": 1.1206679105758668, + "epoch": 0.3683241252302026, + "grad_norm": 0.6815240383148193, + "learning_rate": 7.316176470588236e-05, + "loss": 0.13400404453277587, + "mean_token_accuracy": 0.9501322209835052, + "num_tokens": 4110570.0, + "step": 200 + }, + { + "entropy": 1.1161052227020263, + "epoch": 0.3867403314917127, + "grad_norm": 0.8297767639160156, + "learning_rate": 7.683823529411766e-05, + "loss": 0.13389937877655028, + "mean_token_accuracy": 0.9501932203769684, + "num_tokens": 4315834.0, + "step": 210 + }, + { + "entropy": 1.1098745942115784, + "epoch": 0.40515653775322286, + "grad_norm": 0.5943381786346436, + "learning_rate": 8.051470588235294e-05, + "loss": 0.13452907800674438, + "mean_token_accuracy": 0.9503286242485046, + "num_tokens": 4520807.0, + "step": 220 + }, + { + "entropy": 1.100480353832245, + "epoch": 0.42357274401473294, + "grad_norm": 0.6094359755516052, + "learning_rate": 8.419117647058824e-05, + "loss": 0.12827746868133544, + "mean_token_accuracy": 0.952492094039917, + "num_tokens": 4725867.0, + "step": 230 + }, + { + "entropy": 1.0901286959648133, + "epoch": 0.4419889502762431, + "grad_norm": 0.7240597605705261, + "learning_rate": 8.786764705882353e-05, + "loss": 0.12171242237091065, + "mean_token_accuracy": 0.953943532705307, + "num_tokens": 4931629.0, + "step": 240 + }, + { + "entropy": 1.0885071873664856, + "epoch": 0.4604051565377532, + "grad_norm": 0.6939547657966614, + "learning_rate": 9.154411764705882e-05, + "loss": 0.12155698537826538, + "mean_token_accuracy": 0.9545870959758759, + "num_tokens": 5137285.0, + "step": 250 + }, + { + "entropy": 1.086272156238556, + "epoch": 0.47882136279926335, + "grad_norm": 0.5752800703048706, + "learning_rate": 9.522058823529412e-05, + "loss": 0.12157790660858155, + "mean_token_accuracy": 0.9541126549243927, + "num_tokens": 5342575.0, + "step": 260 + }, + { + "entropy": 1.0857678413391114, + "epoch": 0.4972375690607735, + "grad_norm": 0.7565123438835144, + "learning_rate": 9.889705882352942e-05, + "loss": 0.12349612712860107, + "mean_token_accuracy": 0.9535140514373779, + "num_tokens": 5547995.0, + "step": 270 + }, + { + "entropy": 1.079762625694275, + "epoch": 0.5156537753222836, + "grad_norm": 0.6972768306732178, + "learning_rate": 9.999954556423843e-05, + "loss": 0.11875582933425903, + "mean_token_accuracy": 0.9556483089923858, + "num_tokens": 5753195.0, + "step": 280 + }, + { + "entropy": 1.0742079138755798, + "epoch": 0.5340699815837937, + "grad_norm": 0.7821696996688843, + "learning_rate": 9.999731977631227e-05, + "loss": 0.11824090480804443, + "mean_token_accuracy": 0.9557521045207977, + "num_tokens": 5958236.0, + "step": 290 + }, + { + "entropy": 1.0679773569107056, + "epoch": 0.5524861878453039, + "grad_norm": 0.5846888422966003, + "learning_rate": 9.999323925089486e-05, + "loss": 0.11707355976104736, + "mean_token_accuracy": 0.9554719448089599, + "num_tokens": 6163992.0, + "step": 300 + }, + { + "entropy": 1.0655727863311768, + "epoch": 0.570902394106814, + "grad_norm": 0.5812502503395081, + "learning_rate": 9.998730413936037e-05, + "loss": 0.11371417045593261, + "mean_token_accuracy": 0.9576376020908356, + "num_tokens": 6369456.0, + "step": 310 + }, + { + "entropy": 1.0607039332389832, + "epoch": 0.5893186003683242, + "grad_norm": 0.6238475441932678, + "learning_rate": 9.99795146618821e-05, + "loss": 0.11775733232498169, + "mean_token_accuracy": 0.9557221591472626, + "num_tokens": 6574833.0, + "step": 320 + }, + { + "entropy": 1.0504255175590516, + "epoch": 0.6077348066298343, + "grad_norm": 0.6496815085411072, + "learning_rate": 9.996987110742422e-05, + "loss": 0.10904088020324706, + "mean_token_accuracy": 0.9585366368293762, + "num_tokens": 6780108.0, + "step": 330 + }, + { + "entropy": 1.0456081986427308, + "epoch": 0.6261510128913443, + "grad_norm": 0.786702573299408, + "learning_rate": 9.995837383373119e-05, + "loss": 0.10642309188842773, + "mean_token_accuracy": 0.9596696078777314, + "num_tokens": 6985920.0, + "step": 340 + }, + { + "entropy": 1.0455098271369934, + "epoch": 0.6445672191528545, + "grad_norm": 0.5473790168762207, + "learning_rate": 9.994502326731434e-05, + "loss": 0.10822961330413819, + "mean_token_accuracy": 0.959563136100769, + "num_tokens": 7191465.0, + "step": 350 + }, + { + "entropy": 1.04240562915802, + "epoch": 0.6629834254143646, + "grad_norm": 0.6672356128692627, + "learning_rate": 9.992981990343614e-05, + "loss": 0.1110004186630249, + "mean_token_accuracy": 0.9582514643669129, + "num_tokens": 7396877.0, + "step": 360 + }, + { + "entropy": 1.0386811256408692, + "epoch": 0.6813996316758748, + "grad_norm": 0.698539674282074, + "learning_rate": 9.99127643060918e-05, + "loss": 0.107539963722229, + "mean_token_accuracy": 0.9593036234378814, + "num_tokens": 7602437.0, + "step": 370 + }, + { + "entropy": 1.0311225533485413, + "epoch": 0.6998158379373849, + "grad_norm": 0.6629284024238586, + "learning_rate": 9.989385710798837e-05, + "loss": 0.1064023494720459, + "mean_token_accuracy": 0.9602205216884613, + "num_tokens": 7808142.0, + "step": 380 + }, + { + "entropy": 1.030210506916046, + "epoch": 0.7182320441988951, + "grad_norm": 0.5616748929023743, + "learning_rate": 9.987309901052121e-05, + "loss": 0.10717041492462158, + "mean_token_accuracy": 0.9599347949028015, + "num_tokens": 8013407.0, + "step": 390 + }, + { + "entropy": 1.0208017826080322, + "epoch": 0.7366482504604052, + "grad_norm": 0.6329049468040466, + "learning_rate": 9.985049078374806e-05, + "loss": 0.10359601974487305, + "mean_token_accuracy": 0.9603756129741668, + "num_tokens": 8219040.0, + "step": 400 + }, + { + "entropy": 1.015640377998352, + "epoch": 0.7550644567219152, + "grad_norm": 0.6516013741493225, + "learning_rate": 9.982603326636037e-05, + "loss": 0.10146439075469971, + "mean_token_accuracy": 0.9627702474594116, + "num_tokens": 8424678.0, + "step": 410 + }, + { + "entropy": 1.0105359435081482, + "epoch": 0.7734806629834254, + "grad_norm": 0.6920603513717651, + "learning_rate": 9.979972736565226e-05, + "loss": 0.10770498514175415, + "mean_token_accuracy": 0.9591470420360565, + "num_tokens": 8629868.0, + "step": 420 + }, + { + "entropy": 0.9966452836990356, + "epoch": 0.7918968692449355, + "grad_norm": 0.6857476234436035, + "learning_rate": 9.977157405748687e-05, + "loss": 0.10282524824142455, + "mean_token_accuracy": 0.9612209022045135, + "num_tokens": 8835320.0, + "step": 430 + }, + { + "entropy": 0.9945534646511078, + "epoch": 0.8103130755064457, + "grad_norm": 0.7208472490310669, + "learning_rate": 9.974157438626008e-05, + "loss": 0.10069938898086547, + "mean_token_accuracy": 0.9620070576667785, + "num_tokens": 9041123.0, + "step": 440 + }, + { + "entropy": 0.979461395740509, + "epoch": 0.8287292817679558, + "grad_norm": 0.5071915984153748, + "learning_rate": 9.970972946486185e-05, + "loss": 0.09799174070358277, + "mean_token_accuracy": 0.9620374023914338, + "num_tokens": 9246361.0, + "step": 450 + }, + { + "entropy": 0.9830998003482818, + "epoch": 0.8471454880294659, + "grad_norm": 0.8660802245140076, + "learning_rate": 9.967604047463493e-05, + "loss": 0.10378165245056152, + "mean_token_accuracy": 0.9606865763664245, + "num_tokens": 9451845.0, + "step": 460 + }, + { + "entropy": 0.9813413023948669, + "epoch": 0.8655616942909761, + "grad_norm": 0.7642477750778198, + "learning_rate": 9.964050866533094e-05, + "loss": 0.1010061264038086, + "mean_token_accuracy": 0.9608745336532593, + "num_tokens": 9656802.0, + "step": 470 + }, + { + "entropy": 0.967874163389206, + "epoch": 0.8839779005524862, + "grad_norm": 0.5987281799316406, + "learning_rate": 9.960313535506411e-05, + "loss": 0.10169394016265869, + "mean_token_accuracy": 0.9611998200416565, + "num_tokens": 9861719.0, + "step": 480 + }, + { + "entropy": 0.9663491308689117, + "epoch": 0.9023941068139963, + "grad_norm": 0.6124638319015503, + "learning_rate": 9.956392193026239e-05, + "loss": 0.102389657497406, + "mean_token_accuracy": 0.9611884355545044, + "num_tokens": 10066673.0, + "step": 490 + }, + { + "entropy": 0.959654438495636, + "epoch": 0.9208103130755064, + "grad_norm": 0.7873051762580872, + "learning_rate": 9.952286984561592e-05, + "loss": 0.10170392990112305, + "mean_token_accuracy": 0.9610928475856781, + "num_tokens": 10272091.0, + "step": 500 + }, + { + "entropy": 0.9550537407398224, + "epoch": 0.9392265193370166, + "grad_norm": 0.6071968078613281, + "learning_rate": 9.947998062402313e-05, + "loss": 0.09448277950286865, + "mean_token_accuracy": 0.9648977637290954, + "num_tokens": 10477632.0, + "step": 510 + }, + { + "entropy": 0.9538533687591553, + "epoch": 0.9576427255985267, + "grad_norm": 0.6317242980003357, + "learning_rate": 9.943525585653428e-05, + "loss": 0.09542192220687866, + "mean_token_accuracy": 0.9635261118412017, + "num_tokens": 10682828.0, + "step": 520 + }, + { + "entropy": 0.9362513542175293, + "epoch": 0.9760589318600368, + "grad_norm": 0.6421944499015808, + "learning_rate": 9.938869720229234e-05, + "loss": 0.09382058382034301, + "mean_token_accuracy": 0.9648073971271515, + "num_tokens": 10888741.0, + "step": 530 + }, + { + "entropy": 0.9235438346862793, + "epoch": 0.994475138121547, + "grad_norm": 0.7986873388290405, + "learning_rate": 9.934030638847155e-05, + "loss": 0.09827429056167603, + "mean_token_accuracy": 0.9621128737926483, + "num_tokens": 11094387.0, + "step": 540 + }, + { + "epoch": 1.0, + "eval_entropy": 0.9137652366057686, + "eval_loss": 0.09368764609098434, + "eval_mean_token_accuracy": 0.9640816880309063, + "eval_num_tokens": 11155908.0, + "eval_runtime": 10.4701, + "eval_samples_per_second": 349.377, + "eval_steps_per_second": 10.984, + "step": 543 + }, + { + "entropy": 0.9047818422317505, + "epoch": 1.0128913443830572, + "grad_norm": 0.6781501173973083, + "learning_rate": 9.929008521021325e-05, + "loss": 0.0863916516304016, + "mean_token_accuracy": 0.9673655688762665, + "num_tokens": 11299715.0, + "step": 550 + }, + { + "entropy": 0.8856981039047241, + "epoch": 1.0313075506445673, + "grad_norm": 0.7143136858940125, + "learning_rate": 9.923803553055937e-05, + "loss": 0.08632323145866394, + "mean_token_accuracy": 0.9677783191204071, + "num_tokens": 11505059.0, + "step": 560 + }, + { + "entropy": 0.8937099635601043, + "epoch": 1.0497237569060773, + "grad_norm": 0.7751694321632385, + "learning_rate": 9.918415928038325e-05, + "loss": 0.08178263902664185, + "mean_token_accuracy": 0.9694291114807129, + "num_tokens": 11710464.0, + "step": 570 + }, + { + "entropy": 0.8858704209327698, + "epoch": 1.0681399631675874, + "grad_norm": 0.7492292523384094, + "learning_rate": 9.912845845831805e-05, + "loss": 0.08074211478233337, + "mean_token_accuracy": 0.9692470014095307, + "num_tokens": 11915959.0, + "step": 580 + }, + { + "entropy": 0.8948039829730987, + "epoch": 1.0865561694290977, + "grad_norm": 0.8116479516029358, + "learning_rate": 9.907093513068259e-05, + "loss": 0.08712012171745301, + "mean_token_accuracy": 0.9669980227947235, + "num_tokens": 12121499.0, + "step": 590 + }, + { + "entropy": 0.8846789538860321, + "epoch": 1.1049723756906078, + "grad_norm": 0.7295626997947693, + "learning_rate": 9.901159143140471e-05, + "loss": 0.08444435596466064, + "mean_token_accuracy": 0.9674544095993042, + "num_tokens": 12327061.0, + "step": 600 + }, + { + "entropy": 0.8734103918075562, + "epoch": 1.1233885819521179, + "grad_norm": 0.9585768580436707, + "learning_rate": 9.89504295619421e-05, + "loss": 0.08022565841674804, + "mean_token_accuracy": 0.969569206237793, + "num_tokens": 12532305.0, + "step": 610 + }, + { + "entropy": 0.8640486001968384, + "epoch": 1.141804788213628, + "grad_norm": 0.7891159057617188, + "learning_rate": 9.88874517912006e-05, + "loss": 0.08415375947952271, + "mean_token_accuracy": 0.9678892493247986, + "num_tokens": 12737828.0, + "step": 620 + }, + { + "entropy": 0.8599755525588989, + "epoch": 1.160220994475138, + "grad_norm": 0.5801345109939575, + "learning_rate": 9.882266045545012e-05, + "loss": 0.08100489974021911, + "mean_token_accuracy": 0.9688023269176483, + "num_tokens": 12943343.0, + "step": 630 + }, + { + "entropy": 0.86524977684021, + "epoch": 1.1786372007366483, + "grad_norm": 0.7633041143417358, + "learning_rate": 9.87560579582379e-05, + "loss": 0.07859406471252442, + "mean_token_accuracy": 0.9702189445495606, + "num_tokens": 13148473.0, + "step": 640 + }, + { + "entropy": 0.8466695249080658, + "epoch": 1.1970534069981584, + "grad_norm": 0.8672215938568115, + "learning_rate": 9.868764677029934e-05, + "loss": 0.08082623481750488, + "mean_token_accuracy": 0.9689972400665283, + "num_tokens": 13353890.0, + "step": 650 + }, + { + "entropy": 0.8596941530704498, + "epoch": 1.2154696132596685, + "grad_norm": 0.7524124383926392, + "learning_rate": 9.861742942946639e-05, + "loss": 0.0789935290813446, + "mean_token_accuracy": 0.9693858206272126, + "num_tokens": 13559475.0, + "step": 660 + }, + { + "entropy": 0.8708749234676361, + "epoch": 1.2338858195211786, + "grad_norm": 0.5777031183242798, + "learning_rate": 9.854540854057337e-05, + "loss": 0.07773642539978028, + "mean_token_accuracy": 0.970385092496872, + "num_tokens": 13765076.0, + "step": 670 + }, + { + "entropy": 0.8651713371276856, + "epoch": 1.2523020257826887, + "grad_norm": 0.7924166321754456, + "learning_rate": 9.847158677536034e-05, + "loss": 0.0766686737537384, + "mean_token_accuracy": 0.9702267110347748, + "num_tokens": 13970642.0, + "step": 680 + }, + { + "entropy": 0.8763024985790253, + "epoch": 1.270718232044199, + "grad_norm": 0.741219162940979, + "learning_rate": 9.839596687237403e-05, + "loss": 0.07189929485321045, + "mean_token_accuracy": 0.9727097094058991, + "num_tokens": 14176556.0, + "step": 690 + }, + { + "entropy": 0.8556921362876893, + "epoch": 1.289134438305709, + "grad_norm": 0.6298198103904724, + "learning_rate": 9.831855163686618e-05, + "loss": 0.07608137726783752, + "mean_token_accuracy": 0.9716399371623993, + "num_tokens": 14381686.0, + "step": 700 + }, + { + "entropy": 0.869178420305252, + "epoch": 1.3075506445672191, + "grad_norm": 0.5850273370742798, + "learning_rate": 9.823934394068952e-05, + "loss": 0.07437651753425598, + "mean_token_accuracy": 0.9709566533565521, + "num_tokens": 14586814.0, + "step": 710 + }, + { + "entropy": 0.8708595156669616, + "epoch": 1.3259668508287292, + "grad_norm": 0.6580632328987122, + "learning_rate": 9.815834672219127e-05, + "loss": 0.07518917322158813, + "mean_token_accuracy": 0.9717426657676697, + "num_tokens": 14792321.0, + "step": 720 + }, + { + "entropy": 0.8826817810535431, + "epoch": 1.3443830570902393, + "grad_norm": 0.8788532018661499, + "learning_rate": 9.807556298610404e-05, + "loss": 0.07579240798950196, + "mean_token_accuracy": 0.9706341981887817, + "num_tokens": 14997810.0, + "step": 730 + }, + { + "entropy": 0.9012470185756684, + "epoch": 1.3627992633517496, + "grad_norm": 0.7022138237953186, + "learning_rate": 9.799099580343441e-05, + "loss": 0.0775588572025299, + "mean_token_accuracy": 0.9699241399765015, + "num_tokens": 15203795.0, + "step": 740 + }, + { + "entropy": 0.886955714225769, + "epoch": 1.3812154696132597, + "grad_norm": 0.7881133556365967, + "learning_rate": 9.790464831134903e-05, + "loss": 0.07125020027160645, + "mean_token_accuracy": 0.9723815560340882, + "num_tokens": 15408974.0, + "step": 750 + }, + { + "entropy": 0.9047374844551086, + "epoch": 1.3996316758747698, + "grad_norm": 0.9082005023956299, + "learning_rate": 9.781652371305824e-05, + "loss": 0.07004334926605224, + "mean_token_accuracy": 0.9725580036640167, + "num_tokens": 15614399.0, + "step": 760 + }, + { + "entropy": 0.9039053857326508, + "epoch": 1.4180478821362799, + "grad_norm": 0.8060817122459412, + "learning_rate": 9.77266252776972e-05, + "loss": 0.07103485465049744, + "mean_token_accuracy": 0.9721468150615692, + "num_tokens": 15819895.0, + "step": 770 + }, + { + "entropy": 0.8998047232627868, + "epoch": 1.43646408839779, + "grad_norm": 1.0152642726898193, + "learning_rate": 9.763495634020467e-05, + "loss": 0.07411704063415528, + "mean_token_accuracy": 0.9711063146591187, + "num_tokens": 16025297.0, + "step": 780 + }, + { + "entropy": 0.9120213568210602, + "epoch": 1.4548802946593002, + "grad_norm": 0.6288319826126099, + "learning_rate": 9.754152030119921e-05, + "loss": 0.07223712205886841, + "mean_token_accuracy": 0.9722476422786712, + "num_tokens": 16230656.0, + "step": 790 + }, + { + "entropy": 0.9142370820045471, + "epoch": 1.4732965009208103, + "grad_norm": 0.7854700088500977, + "learning_rate": 9.744632062685311e-05, + "loss": 0.07186744809150696, + "mean_token_accuracy": 0.972247713804245, + "num_tokens": 16435943.0, + "step": 800 + }, + { + "entropy": 0.8920814216136932, + "epoch": 1.4917127071823204, + "grad_norm": 0.6227074265480042, + "learning_rate": 9.734936084876383e-05, + "loss": 0.07016961574554444, + "mean_token_accuracy": 0.9725603640079499, + "num_tokens": 16641635.0, + "step": 810 + }, + { + "entropy": 0.891328877210617, + "epoch": 1.5101289134438307, + "grad_norm": 0.7601346969604492, + "learning_rate": 9.725064456382283e-05, + "loss": 0.07137494087219239, + "mean_token_accuracy": 0.9722997546195984, + "num_tokens": 16847194.0, + "step": 820 + }, + { + "entropy": 0.8921217978000641, + "epoch": 1.5285451197053406, + "grad_norm": 0.7813850045204163, + "learning_rate": 9.715017543408233e-05, + "loss": 0.06890199184417725, + "mean_token_accuracy": 0.9735044002532959, + "num_tokens": 17052807.0, + "step": 830 + }, + { + "entropy": 0.9085914671421051, + "epoch": 1.5469613259668509, + "grad_norm": 0.6184289455413818, + "learning_rate": 9.704795718661939e-05, + "loss": 0.07043765187263488, + "mean_token_accuracy": 0.9725716531276702, + "num_tokens": 17258284.0, + "step": 840 + }, + { + "entropy": 0.9029861629009247, + "epoch": 1.565377532228361, + "grad_norm": 0.7082377076148987, + "learning_rate": 9.694399361339752e-05, + "loss": 0.07113839387893676, + "mean_token_accuracy": 0.9725669205188752, + "num_tokens": 17464326.0, + "step": 850 + }, + { + "entropy": 0.8856533527374267, + "epoch": 1.583793738489871, + "grad_norm": 0.7409216165542603, + "learning_rate": 9.683828857112627e-05, + "loss": 0.07077333331108093, + "mean_token_accuracy": 0.9731084644794464, + "num_tokens": 17669537.0, + "step": 860 + }, + { + "entropy": 0.8613030433654785, + "epoch": 1.6022099447513813, + "grad_norm": 0.6801561713218689, + "learning_rate": 9.673084598111789e-05, + "loss": 0.06885308027267456, + "mean_token_accuracy": 0.97266526222229, + "num_tokens": 17875289.0, + "step": 870 + }, + { + "entropy": 0.8692965865135193, + "epoch": 1.6206261510128912, + "grad_norm": 1.1621277332305908, + "learning_rate": 9.662166982914203e-05, + "loss": 0.07017780542373657, + "mean_token_accuracy": 0.9733059942722321, + "num_tokens": 18080404.0, + "step": 880 + }, + { + "entropy": 0.8671502113342285, + "epoch": 1.6390423572744015, + "grad_norm": 0.7518903613090515, + "learning_rate": 9.651076416527787e-05, + "loss": 0.06977018713951111, + "mean_token_accuracy": 0.9730017304420471, + "num_tokens": 18285699.0, + "step": 890 + }, + { + "entropy": 0.8662045657634735, + "epoch": 1.6574585635359116, + "grad_norm": 0.6622698903083801, + "learning_rate": 9.639813310376378e-05, + "loss": 0.06620995998382569, + "mean_token_accuracy": 0.9737491130828857, + "num_tokens": 18491097.0, + "step": 900 + }, + { + "entropy": 0.8548173069953918, + "epoch": 1.6758747697974217, + "grad_norm": 0.8941843509674072, + "learning_rate": 9.628378082284479e-05, + "loss": 0.06711119413375854, + "mean_token_accuracy": 0.9740589797496796, + "num_tokens": 18696827.0, + "step": 910 + }, + { + "entropy": 0.8763562262058258, + "epoch": 1.694290976058932, + "grad_norm": 0.7571700215339661, + "learning_rate": 9.616771156461755e-05, + "loss": 0.07263468503952027, + "mean_token_accuracy": 0.9717419981956482, + "num_tokens": 18902513.0, + "step": 920 + }, + { + "entropy": 0.8663733780384064, + "epoch": 1.7127071823204418, + "grad_norm": 0.7886489629745483, + "learning_rate": 9.604992963487298e-05, + "loss": 0.07074605226516724, + "mean_token_accuracy": 0.9724965393543243, + "num_tokens": 19107812.0, + "step": 930 + }, + { + "entropy": 0.8673004627227783, + "epoch": 1.7311233885819521, + "grad_norm": 0.8180726170539856, + "learning_rate": 9.593043940293647e-05, + "loss": 0.06831735372543335, + "mean_token_accuracy": 0.9733696818351746, + "num_tokens": 19313330.0, + "step": 940 + }, + { + "entropy": 0.8525971233844757, + "epoch": 1.7495395948434622, + "grad_norm": 0.6576228737831116, + "learning_rate": 9.580924530150595e-05, + "loss": 0.06567002534866333, + "mean_token_accuracy": 0.9745754361152649, + "num_tokens": 19518671.0, + "step": 950 + }, + { + "entropy": 0.8605451703071594, + "epoch": 1.7679558011049723, + "grad_norm": 0.7171661257743835, + "learning_rate": 9.568635182648725e-05, + "loss": 0.06872050762176514, + "mean_token_accuracy": 0.9732091546058654, + "num_tokens": 19724135.0, + "step": 960 + }, + { + "entropy": 0.8642210960388184, + "epoch": 1.7863720073664826, + "grad_norm": 0.7603147029876709, + "learning_rate": 9.556176353682746e-05, + "loss": 0.06766576766967773, + "mean_token_accuracy": 0.9728681743144989, + "num_tokens": 19928785.0, + "step": 970 + }, + { + "entropy": 0.8543185651302337, + "epoch": 1.8047882136279927, + "grad_norm": 0.7280875444412231, + "learning_rate": 9.543548505434581e-05, + "loss": 0.06851862668991089, + "mean_token_accuracy": 0.9737437188625335, + "num_tokens": 20134195.0, + "step": 980 + }, + { + "entropy": 0.8744745373725891, + "epoch": 1.8232044198895028, + "grad_norm": 0.5897248983383179, + "learning_rate": 9.530752106356209e-05, + "loss": 0.06809053421020508, + "mean_token_accuracy": 0.9733593761920929, + "num_tokens": 20339517.0, + "step": 990 + }, + { + "entropy": 0.8623859465122223, + "epoch": 1.8416206261510129, + "grad_norm": 0.7515265345573425, + "learning_rate": 9.517787631152298e-05, + "loss": 0.07257847785949707, + "mean_token_accuracy": 0.9714054942131043, + "num_tokens": 20545249.0, + "step": 1000 + }, + { + "entropy": 0.8669404804706573, + "epoch": 1.860036832412523, + "grad_norm": 0.7144560813903809, + "learning_rate": 9.504655560762596e-05, + "loss": 0.06832354068756104, + "mean_token_accuracy": 0.9735779523849487, + "num_tokens": 20750507.0, + "step": 1010 + }, + { + "entropy": 0.8493516445159912, + "epoch": 1.8784530386740332, + "grad_norm": 0.6559189558029175, + "learning_rate": 9.491356382344081e-05, + "loss": 0.0629766047000885, + "mean_token_accuracy": 0.9754977762699127, + "num_tokens": 20955956.0, + "step": 1020 + }, + { + "entropy": 0.8599376022815705, + "epoch": 1.8968692449355433, + "grad_norm": 0.6792973279953003, + "learning_rate": 9.477890589252895e-05, + "loss": 0.0666757881641388, + "mean_token_accuracy": 0.974083811044693, + "num_tokens": 21161163.0, + "step": 1030 + }, + { + "entropy": 0.8458438158035279, + "epoch": 1.9152854511970534, + "grad_norm": 0.6941778659820557, + "learning_rate": 9.464258681026042e-05, + "loss": 0.06307152509689332, + "mean_token_accuracy": 0.9757042229175568, + "num_tokens": 21366525.0, + "step": 1040 + }, + { + "entropy": 0.848515909910202, + "epoch": 1.9337016574585635, + "grad_norm": 0.7307806611061096, + "learning_rate": 9.450461163362855e-05, + "loss": 0.06307026147842407, + "mean_token_accuracy": 0.9750974595546722, + "num_tokens": 21572238.0, + "step": 1050 + }, + { + "entropy": 0.8563454031944275, + "epoch": 1.9521178637200736, + "grad_norm": 0.7222106456756592, + "learning_rate": 9.436498548106236e-05, + "loss": 0.0647726058959961, + "mean_token_accuracy": 0.974629694223404, + "num_tokens": 21777633.0, + "step": 1060 + }, + { + "entropy": 0.8656457483768463, + "epoch": 1.9705340699815839, + "grad_norm": 0.67178875207901, + "learning_rate": 9.422371353223674e-05, + "loss": 0.06573554277420043, + "mean_token_accuracy": 0.9745908617973328, + "num_tokens": 21983116.0, + "step": 1070 + }, + { + "entropy": 0.8630891263484954, + "epoch": 1.988950276243094, + "grad_norm": 0.6956593990325928, + "learning_rate": 9.408080102788016e-05, + "loss": 0.06630704402923585, + "mean_token_accuracy": 0.9741333484649658, + "num_tokens": 22188662.0, + "step": 1080 + }, + { + "epoch": 2.0, + "eval_entropy": 0.8560857042022373, + "eval_loss": 0.06494329869747162, + "eval_mean_token_accuracy": 0.9745692672936813, + "eval_num_tokens": 22311800.0, + "eval_runtime": 10.129, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 11.354, + "step": 1086 + }, + { + "entropy": 0.8616272270679474, + "epoch": 2.007366482504604, + "grad_norm": 0.7778105139732361, + "learning_rate": 9.393625326958041e-05, + "loss": 0.054407155513763426, + "mean_token_accuracy": 0.9792074799537659, + "num_tokens": 22394215.0, + "step": 1090 + }, + { + "entropy": 0.8496910452842712, + "epoch": 2.0257826887661143, + "grad_norm": 0.7422528266906738, + "learning_rate": 9.379007561958792e-05, + "loss": 0.051881587505340575, + "mean_token_accuracy": 0.9799090325832367, + "num_tokens": 22599599.0, + "step": 1100 + }, + { + "entropy": 0.8531602442264556, + "epoch": 2.044198895027624, + "grad_norm": 0.9075332880020142, + "learning_rate": 9.36422735006167e-05, + "loss": 0.05190724730491638, + "mean_token_accuracy": 0.979931116104126, + "num_tokens": 22805318.0, + "step": 1110 + }, + { + "entropy": 0.8657277703285218, + "epoch": 2.0626151012891345, + "grad_norm": 0.9466913938522339, + "learning_rate": 9.349285239564325e-05, + "loss": 0.053853434324264524, + "mean_token_accuracy": 0.9796103596687317, + "num_tokens": 23010438.0, + "step": 1120 + }, + { + "entropy": 0.8578485429286957, + "epoch": 2.0810313075506444, + "grad_norm": 0.6903054714202881, + "learning_rate": 9.334181784770326e-05, + "loss": 0.05228850841522217, + "mean_token_accuracy": 0.9802409887313843, + "num_tokens": 23215795.0, + "step": 1130 + }, + { + "entropy": 0.8450767934322357, + "epoch": 2.0994475138121547, + "grad_norm": 0.6615211367607117, + "learning_rate": 9.318917545968581e-05, + "loss": 0.050570905208587646, + "mean_token_accuracy": 0.9802053451538086, + "num_tokens": 23421157.0, + "step": 1140 + }, + { + "entropy": 0.8325044393539429, + "epoch": 2.117863720073665, + "grad_norm": 0.760960578918457, + "learning_rate": 9.303493089412564e-05, + "loss": 0.051966112852096555, + "mean_token_accuracy": 0.9796205997467041, + "num_tokens": 23626584.0, + "step": 1150 + }, + { + "entropy": 0.8416404843330383, + "epoch": 2.136279926335175, + "grad_norm": 0.6947009563446045, + "learning_rate": 9.287908987299306e-05, + "loss": 0.05144861936569214, + "mean_token_accuracy": 0.9800034642219544, + "num_tokens": 23832137.0, + "step": 1160 + }, + { + "entropy": 0.8564540028572083, + "epoch": 2.154696132596685, + "grad_norm": 0.733252763748169, + "learning_rate": 9.272165817748164e-05, + "loss": 0.04944799542427063, + "mean_token_accuracy": 0.9808157980442047, + "num_tokens": 24038006.0, + "step": 1170 + }, + { + "entropy": 0.8575525343418121, + "epoch": 2.1731123388581954, + "grad_norm": 0.8911028504371643, + "learning_rate": 9.25626416477938e-05, + "loss": 0.05037952661514282, + "mean_token_accuracy": 0.980946284532547, + "num_tokens": 24243374.0, + "step": 1180 + }, + { + "entropy": 0.8599720418453216, + "epoch": 2.1915285451197053, + "grad_norm": 0.7713524103164673, + "learning_rate": 9.240204618292416e-05, + "loss": 0.050603735446929934, + "mean_token_accuracy": 0.980896121263504, + "num_tokens": 24448585.0, + "step": 1190 + }, + { + "entropy": 0.8566664934158326, + "epoch": 2.2099447513812156, + "grad_norm": 0.8439353704452515, + "learning_rate": 9.223987774044066e-05, + "loss": 0.054171699285507205, + "mean_token_accuracy": 0.9796543836593627, + "num_tokens": 24653863.0, + "step": 1200 + }, + { + "entropy": 0.846601277589798, + "epoch": 2.2283609576427255, + "grad_norm": 0.7025637030601501, + "learning_rate": 9.207614233626356e-05, + "loss": 0.048924127221107484, + "mean_token_accuracy": 0.9809681415557862, + "num_tokens": 24859801.0, + "step": 1210 + }, + { + "entropy": 0.8564423739910125, + "epoch": 2.2467771639042358, + "grad_norm": 0.7788274884223938, + "learning_rate": 9.191084604444233e-05, + "loss": 0.05260283350944519, + "mean_token_accuracy": 0.9793797850608825, + "num_tokens": 25065368.0, + "step": 1220 + }, + { + "entropy": 0.865056723356247, + "epoch": 2.265193370165746, + "grad_norm": 0.8728818297386169, + "learning_rate": 9.174399499693027e-05, + "loss": 0.05016371011734009, + "mean_token_accuracy": 0.9807134211063385, + "num_tokens": 25270945.0, + "step": 1230 + }, + { + "entropy": 0.8642262935638427, + "epoch": 2.283609576427256, + "grad_norm": 1.0582489967346191, + "learning_rate": 9.157559538335703e-05, + "loss": 0.05316779017448425, + "mean_token_accuracy": 0.9794209063053131, + "num_tokens": 25476575.0, + "step": 1240 + }, + { + "entropy": 0.8677761554718018, + "epoch": 2.3020257826887662, + "grad_norm": 0.760109543800354, + "learning_rate": 9.140565345079901e-05, + "loss": 0.05115479230880737, + "mean_token_accuracy": 0.9802310705184937, + "num_tokens": 25682814.0, + "step": 1250 + }, + { + "entropy": 0.8592945456504821, + "epoch": 2.320441988950276, + "grad_norm": 0.6537907123565674, + "learning_rate": 9.123417550354761e-05, + "loss": 0.050543540716171266, + "mean_token_accuracy": 0.9806945025920868, + "num_tokens": 25887575.0, + "step": 1260 + }, + { + "entropy": 0.8692500293254852, + "epoch": 2.3388581952117864, + "grad_norm": 0.7771905064582825, + "learning_rate": 9.106116790287541e-05, + "loss": 0.049718713760375975, + "mean_token_accuracy": 0.9805168390274048, + "num_tokens": 26092950.0, + "step": 1270 + }, + { + "entropy": 0.8841261565685272, + "epoch": 2.3572744014732967, + "grad_norm": 0.7791076898574829, + "learning_rate": 9.08866370668001e-05, + "loss": 0.0527400553226471, + "mean_token_accuracy": 0.9796754539012908, + "num_tokens": 26298182.0, + "step": 1280 + }, + { + "entropy": 0.8675022900104523, + "epoch": 2.3756906077348066, + "grad_norm": 0.8481605648994446, + "learning_rate": 9.07105894698464e-05, + "loss": 0.05320838689804077, + "mean_token_accuracy": 0.9792274832725525, + "num_tokens": 26503425.0, + "step": 1290 + }, + { + "entropy": 0.8704026222229004, + "epoch": 2.394106813996317, + "grad_norm": 0.8235505819320679, + "learning_rate": 9.053303164280602e-05, + "loss": 0.055045205354690555, + "mean_token_accuracy": 0.9788750648498535, + "num_tokens": 26708755.0, + "step": 1300 + }, + { + "entropy": 0.8525134027004242, + "epoch": 2.4125230202578267, + "grad_norm": 0.7611598968505859, + "learning_rate": 9.035397017249518e-05, + "loss": 0.05029621124267578, + "mean_token_accuracy": 0.9802757322788238, + "num_tokens": 26914704.0, + "step": 1310 + }, + { + "entropy": 0.8630305290222168, + "epoch": 2.430939226519337, + "grad_norm": 0.790408194065094, + "learning_rate": 9.017341170151041e-05, + "loss": 0.04856040775775909, + "mean_token_accuracy": 0.9809690833091735, + "num_tokens": 27120151.0, + "step": 1320 + }, + { + "entropy": 0.8579159140586853, + "epoch": 2.4493554327808473, + "grad_norm": 0.781972348690033, + "learning_rate": 8.999136292798207e-05, + "loss": 0.04869682788848877, + "mean_token_accuracy": 0.9816130697727203, + "num_tokens": 27325673.0, + "step": 1330 + }, + { + "entropy": 0.8634716987609863, + "epoch": 2.467771639042357, + "grad_norm": 0.8500784039497375, + "learning_rate": 8.980783060532588e-05, + "loss": 0.05050289034843445, + "mean_token_accuracy": 0.980079609155655, + "num_tokens": 27531270.0, + "step": 1340 + }, + { + "entropy": 0.8660618126392364, + "epoch": 2.4861878453038675, + "grad_norm": 0.719760537147522, + "learning_rate": 8.96228215419924e-05, + "loss": 0.04892141819000244, + "mean_token_accuracy": 0.9814020991325378, + "num_tokens": 27736542.0, + "step": 1350 + }, + { + "entropy": 0.8572284400463104, + "epoch": 2.5046040515653774, + "grad_norm": 1.0197229385375977, + "learning_rate": 8.943634260121442e-05, + "loss": 0.05104702711105347, + "mean_token_accuracy": 0.9798846662044525, + "num_tokens": 27941566.0, + "step": 1360 + }, + { + "entropy": 0.8702241241931915, + "epoch": 2.5230202578268877, + "grad_norm": 0.7136003375053406, + "learning_rate": 8.924840070075247e-05, + "loss": 0.04855787754058838, + "mean_token_accuracy": 0.9811685383319855, + "num_tokens": 28146943.0, + "step": 1370 + }, + { + "entropy": 0.874957013130188, + "epoch": 2.541436464088398, + "grad_norm": 0.8775497674942017, + "learning_rate": 8.905900281263804e-05, + "loss": 0.052434295415878296, + "mean_token_accuracy": 0.9795438170433044, + "num_tokens": 28352640.0, + "step": 1380 + }, + { + "entropy": 0.8776536166667939, + "epoch": 2.559852670349908, + "grad_norm": 0.8895741105079651, + "learning_rate": 8.8868155962915e-05, + "loss": 0.05282890796661377, + "mean_token_accuracy": 0.9790538609027862, + "num_tokens": 28558153.0, + "step": 1390 + }, + { + "entropy": 0.8738743245601654, + "epoch": 2.578268876611418, + "grad_norm": 0.788800060749054, + "learning_rate": 8.867586723137906e-05, + "loss": 0.048841872811317445, + "mean_token_accuracy": 0.9809149026870727, + "num_tokens": 28763613.0, + "step": 1400 + }, + { + "entropy": 0.8750253796577454, + "epoch": 2.596685082872928, + "grad_norm": 0.8738002777099609, + "learning_rate": 8.848214375131497e-05, + "loss": 0.048261132836341855, + "mean_token_accuracy": 0.980789190530777, + "num_tokens": 28969248.0, + "step": 1410 + }, + { + "entropy": 0.8624245524406433, + "epoch": 2.6151012891344383, + "grad_norm": 0.6404895186424255, + "learning_rate": 8.828699270923196e-05, + "loss": 0.04970468282699585, + "mean_token_accuracy": 0.9807762265205383, + "num_tokens": 29174779.0, + "step": 1420 + }, + { + "entropy": 0.8792938470840455, + "epoch": 2.6335174953959486, + "grad_norm": 0.7856965661048889, + "learning_rate": 8.80904213445972e-05, + "loss": 0.053334391117095946, + "mean_token_accuracy": 0.9790222108364105, + "num_tokens": 29380474.0, + "step": 1430 + }, + { + "entropy": 0.8831034600734711, + "epoch": 2.6519337016574585, + "grad_norm": 0.7739618420600891, + "learning_rate": 8.789243694956716e-05, + "loss": 0.04959054589271546, + "mean_token_accuracy": 0.9803965091705322, + "num_tokens": 29585985.0, + "step": 1440 + }, + { + "entropy": 0.8934672951698304, + "epoch": 2.6703499079189688, + "grad_norm": 0.6999697089195251, + "learning_rate": 8.769304686871719e-05, + "loss": 0.05165250301361084, + "mean_token_accuracy": 0.9798884153366089, + "num_tokens": 29791238.0, + "step": 1450 + }, + { + "entropy": 0.9053199410438537, + "epoch": 2.6887661141804786, + "grad_norm": 0.9199564456939697, + "learning_rate": 8.749225849876892e-05, + "loss": 0.04924143850803375, + "mean_token_accuracy": 0.9810785710811615, + "num_tokens": 29996589.0, + "step": 1460 + }, + { + "entropy": 0.888091403245926, + "epoch": 2.707182320441989, + "grad_norm": 0.7480106353759766, + "learning_rate": 8.729007928831597e-05, + "loss": 0.04948916733264923, + "mean_token_accuracy": 0.9809579730033875, + "num_tokens": 30201875.0, + "step": 1470 + }, + { + "entropy": 0.8723407983779907, + "epoch": 2.7255985267034992, + "grad_norm": 0.9506945013999939, + "learning_rate": 8.708651673754763e-05, + "loss": 0.048927539587020875, + "mean_token_accuracy": 0.980553150177002, + "num_tokens": 30407550.0, + "step": 1480 + }, + { + "entropy": 0.8737521529197693, + "epoch": 2.744014732965009, + "grad_norm": 0.8015706539154053, + "learning_rate": 8.688157839797062e-05, + "loss": 0.04963063597679138, + "mean_token_accuracy": 0.9809738755226135, + "num_tokens": 30612839.0, + "step": 1490 + }, + { + "entropy": 0.8800762951374054, + "epoch": 2.7624309392265194, + "grad_norm": 0.9429986476898193, + "learning_rate": 8.667527187212885e-05, + "loss": 0.0524174690246582, + "mean_token_accuracy": 0.9788767337799072, + "num_tokens": 30818578.0, + "step": 1500 + }, + { + "entropy": 0.8871055901050567, + "epoch": 2.7808471454880292, + "grad_norm": 0.5909196138381958, + "learning_rate": 8.646760481332157e-05, + "loss": 0.05166680812835693, + "mean_token_accuracy": 0.980216771364212, + "num_tokens": 31023829.0, + "step": 1510 + }, + { + "entropy": 0.8908755779266357, + "epoch": 2.7992633517495396, + "grad_norm": 0.9154611229896545, + "learning_rate": 8.625858492531931e-05, + "loss": 0.04951836466789246, + "mean_token_accuracy": 0.9801484227180481, + "num_tokens": 31229635.0, + "step": 1520 + }, + { + "entropy": 0.92480548620224, + "epoch": 2.81767955801105, + "grad_norm": 0.5989938378334045, + "learning_rate": 8.604821996207819e-05, + "loss": 0.04799881279468536, + "mean_token_accuracy": 0.9817522585391998, + "num_tokens": 31435456.0, + "step": 1530 + }, + { + "entropy": 0.9173881888389588, + "epoch": 2.8360957642725597, + "grad_norm": 0.899413526058197, + "learning_rate": 8.58365177274522e-05, + "loss": 0.0487445592880249, + "mean_token_accuracy": 0.9812625288963318, + "num_tokens": 31640904.0, + "step": 1540 + }, + { + "entropy": 0.9076135993003845, + "epoch": 2.85451197053407, + "grad_norm": 0.8494166135787964, + "learning_rate": 8.562348607490376e-05, + "loss": 0.05005228519439697, + "mean_token_accuracy": 0.9806681036949157, + "num_tokens": 31845807.0, + "step": 1550 + }, + { + "entropy": 0.9092245221138, + "epoch": 2.87292817679558, + "grad_norm": 0.8225123286247253, + "learning_rate": 8.540913290721234e-05, + "loss": 0.048654764890670776, + "mean_token_accuracy": 0.9805659353733063, + "num_tokens": 32051523.0, + "step": 1560 + }, + { + "entropy": 0.9062779664993286, + "epoch": 2.89134438305709, + "grad_norm": 0.7074014544487, + "learning_rate": 8.519346617618134e-05, + "loss": 0.049209845066070554, + "mean_token_accuracy": 0.9807434439659118, + "num_tokens": 32256895.0, + "step": 1570 + }, + { + "entropy": 0.9190246641635895, + "epoch": 2.9097605893186005, + "grad_norm": 0.8860642910003662, + "learning_rate": 8.497649388234304e-05, + "loss": 0.051211881637573245, + "mean_token_accuracy": 0.9802342295646668, + "num_tokens": 32462031.0, + "step": 1580 + }, + { + "entropy": 0.9088015079498291, + "epoch": 2.9281767955801103, + "grad_norm": 0.8062726855278015, + "learning_rate": 8.475822407466188e-05, + "loss": 0.053512704372406, + "mean_token_accuracy": 0.979486483335495, + "num_tokens": 32667533.0, + "step": 1590 + }, + { + "entropy": 0.9462027847766876, + "epoch": 2.9465930018416207, + "grad_norm": 0.7962909936904907, + "learning_rate": 8.453866485023579e-05, + "loss": 0.0501457154750824, + "mean_token_accuracy": 0.9803222417831421, + "num_tokens": 32872900.0, + "step": 1600 + }, + { + "entropy": 0.9671471297740937, + "epoch": 2.9650092081031305, + "grad_norm": 0.7641744017601013, + "learning_rate": 8.431782435399587e-05, + "loss": 0.04629061222076416, + "mean_token_accuracy": 0.9823175370693207, + "num_tokens": 33077850.0, + "step": 1610 + }, + { + "entropy": 0.955865204334259, + "epoch": 2.983425414364641, + "grad_norm": 0.6772348880767822, + "learning_rate": 8.409571077840426e-05, + "loss": 0.048368623852729796, + "mean_token_accuracy": 0.9808700799942016, + "num_tokens": 33283117.0, + "step": 1620 + }, + { + "epoch": 3.0, + "eval_entropy": 0.9563225186389426, + "eval_loss": 0.059064481407403946, + "eval_mean_token_accuracy": 0.9773589429648026, + "eval_num_tokens": 33467712.0, + "eval_runtime": 10.1471, + "eval_samples_per_second": 360.499, + "eval_steps_per_second": 11.333, + "step": 1629 + }, + { + "entropy": 0.9337226033210755, + "epoch": 3.001841620626151, + "grad_norm": 0.646203875541687, + "learning_rate": 8.387233236315016e-05, + "loss": 0.043352216482162476, + "mean_token_accuracy": 0.9830620110034942, + "num_tokens": 33488302.0, + "step": 1630 + }, + { + "entropy": 0.9734923839569092, + "epoch": 3.020257826887661, + "grad_norm": 0.7564226984977722, + "learning_rate": 8.364769739484416e-05, + "loss": 0.033932483196258544, + "mean_token_accuracy": 0.9872806966304779, + "num_tokens": 33693531.0, + "step": 1640 + }, + { + "entropy": 0.9669206500053406, + "epoch": 3.0386740331491713, + "grad_norm": 0.7126886248588562, + "learning_rate": 8.342181420671096e-05, + "loss": 0.03818287253379822, + "mean_token_accuracy": 0.9852082908153534, + "num_tokens": 33899305.0, + "step": 1650 + }, + { + "entropy": 0.9522916138172149, + "epoch": 3.0570902394106816, + "grad_norm": 1.0571653842926025, + "learning_rate": 8.319469117828007e-05, + "loss": 0.03456039130687714, + "mean_token_accuracy": 0.9867027878761292, + "num_tokens": 34104585.0, + "step": 1660 + }, + { + "entropy": 0.9568560004234314, + "epoch": 3.0755064456721914, + "grad_norm": 0.780940592288971, + "learning_rate": 8.296633673507505e-05, + "loss": 0.03551802039146423, + "mean_token_accuracy": 0.9867531359195709, + "num_tokens": 34309516.0, + "step": 1670 + }, + { + "entropy": 0.9590656876564025, + "epoch": 3.0939226519337018, + "grad_norm": 0.8330219388008118, + "learning_rate": 8.273675934830094e-05, + "loss": 0.03674865961074829, + "mean_token_accuracy": 0.9864118576049805, + "num_tokens": 34515170.0, + "step": 1680 + }, + { + "entropy": 0.975881814956665, + "epoch": 3.1123388581952116, + "grad_norm": 0.7010637521743774, + "learning_rate": 8.250596753453e-05, + "loss": 0.03550414443016052, + "mean_token_accuracy": 0.9864102602005005, + "num_tokens": 34720896.0, + "step": 1690 + }, + { + "entropy": 0.9599562883377075, + "epoch": 3.130755064456722, + "grad_norm": 0.6694278717041016, + "learning_rate": 8.227396985538578e-05, + "loss": 0.035564273595809937, + "mean_token_accuracy": 0.9867321848869324, + "num_tokens": 34925970.0, + "step": 1700 + }, + { + "entropy": 0.9582216143608093, + "epoch": 3.149171270718232, + "grad_norm": 0.9333199262619019, + "learning_rate": 8.204077491722546e-05, + "loss": 0.035575729608535764, + "mean_token_accuracy": 0.9862452208995819, + "num_tokens": 35131543.0, + "step": 1710 + }, + { + "entropy": 0.9579678058624268, + "epoch": 3.167587476979742, + "grad_norm": 0.9450218081474304, + "learning_rate": 8.180639137082066e-05, + "loss": 0.0385298490524292, + "mean_token_accuracy": 0.98538036942482, + "num_tokens": 35336790.0, + "step": 1720 + }, + { + "entropy": 0.9640831351280212, + "epoch": 3.1860036832412524, + "grad_norm": 0.8551534414291382, + "learning_rate": 8.157082791103649e-05, + "loss": 0.03702138364315033, + "mean_token_accuracy": 0.9852015495300293, + "num_tokens": 35542294.0, + "step": 1730 + }, + { + "entropy": 0.9867071211338043, + "epoch": 3.2044198895027622, + "grad_norm": 0.7138128876686096, + "learning_rate": 8.133409327650897e-05, + "loss": 0.035626694560050964, + "mean_token_accuracy": 0.986064875125885, + "num_tokens": 35747447.0, + "step": 1740 + }, + { + "entropy": 0.9639089345932007, + "epoch": 3.2228360957642725, + "grad_norm": 0.7131415009498596, + "learning_rate": 8.109619624932092e-05, + "loss": 0.035885071754455565, + "mean_token_accuracy": 0.986273056268692, + "num_tokens": 35952258.0, + "step": 1750 + }, + { + "entropy": 0.9516046345233917, + "epoch": 3.241252302025783, + "grad_norm": 0.6900200843811035, + "learning_rate": 8.085714565467611e-05, + "loss": 0.03535219430923462, + "mean_token_accuracy": 0.985836285352707, + "num_tokens": 36157938.0, + "step": 1760 + }, + { + "entropy": 0.9373646557331086, + "epoch": 3.2596685082872927, + "grad_norm": 0.6101690530776978, + "learning_rate": 8.061695036057191e-05, + "loss": 0.034940996766090394, + "mean_token_accuracy": 0.9863743901252746, + "num_tokens": 36363825.0, + "step": 1770 + }, + { + "entropy": 0.9444344758987426, + "epoch": 3.278084714548803, + "grad_norm": 0.7518529295921326, + "learning_rate": 8.03756192774703e-05, + "loss": 0.03404279053211212, + "mean_token_accuracy": 0.9866396844387054, + "num_tokens": 36568961.0, + "step": 1780 + }, + { + "entropy": 0.9550357758998871, + "epoch": 3.2965009208103133, + "grad_norm": 0.7687555551528931, + "learning_rate": 8.013316135796734e-05, + "loss": 0.038447052240371704, + "mean_token_accuracy": 0.985325163602829, + "num_tokens": 36774514.0, + "step": 1790 + }, + { + "entropy": 0.9477231681346894, + "epoch": 3.314917127071823, + "grad_norm": 0.7521633505821228, + "learning_rate": 7.988958559646102e-05, + "loss": 0.03746694028377533, + "mean_token_accuracy": 0.9853165090084076, + "num_tokens": 36979660.0, + "step": 1800 + }, + { + "entropy": 0.925805002450943, + "epoch": 3.3333333333333335, + "grad_norm": 0.9333297610282898, + "learning_rate": 7.964490102881768e-05, + "loss": 0.03700103759765625, + "mean_token_accuracy": 0.9850880861282348, + "num_tokens": 37185191.0, + "step": 1810 + }, + { + "entropy": 0.9225482225418091, + "epoch": 3.3517495395948433, + "grad_norm": 0.7928622961044312, + "learning_rate": 7.939911673203665e-05, + "loss": 0.03825801610946655, + "mean_token_accuracy": 0.9850241422653199, + "num_tokens": 37390749.0, + "step": 1820 + }, + { + "entropy": 0.9597147881984711, + "epoch": 3.3701657458563536, + "grad_norm": 0.7658583521842957, + "learning_rate": 7.915224182391375e-05, + "loss": 0.039855146408081056, + "mean_token_accuracy": 0.9845879554748536, + "num_tokens": 37596052.0, + "step": 1830 + }, + { + "entropy": 0.9485619068145752, + "epoch": 3.388581952117864, + "grad_norm": 0.8492130637168884, + "learning_rate": 7.890428546270278e-05, + "loss": 0.039359599351882935, + "mean_token_accuracy": 0.9847265422344208, + "num_tokens": 37802063.0, + "step": 1840 + }, + { + "entropy": 0.9670301914215088, + "epoch": 3.406998158379374, + "grad_norm": 0.7527599930763245, + "learning_rate": 7.865525684677608e-05, + "loss": 0.03752985596656799, + "mean_token_accuracy": 0.9855137526988983, + "num_tokens": 38007432.0, + "step": 1850 + }, + { + "entropy": 0.9681244969367981, + "epoch": 3.425414364640884, + "grad_norm": 0.7599612474441528, + "learning_rate": 7.840516521428303e-05, + "loss": 0.03653894364833832, + "mean_token_accuracy": 0.9858933389186859, + "num_tokens": 38212923.0, + "step": 1860 + }, + { + "entropy": 0.9706049561500549, + "epoch": 3.443830570902394, + "grad_norm": 0.7678127884864807, + "learning_rate": 7.815401984280748e-05, + "loss": 0.0366938978433609, + "mean_token_accuracy": 0.9854713797569274, + "num_tokens": 38418422.0, + "step": 1870 + }, + { + "entropy": 0.9637093842029572, + "epoch": 3.4622467771639043, + "grad_norm": 0.762824535369873, + "learning_rate": 7.790183004902359e-05, + "loss": 0.03516915142536163, + "mean_token_accuracy": 0.9866003453731537, + "num_tokens": 38624389.0, + "step": 1880 + }, + { + "entropy": 0.9373565018177032, + "epoch": 3.4806629834254146, + "grad_norm": 0.8221780061721802, + "learning_rate": 7.764860518835014e-05, + "loss": 0.04049026966094971, + "mean_token_accuracy": 0.984089481830597, + "num_tokens": 38829654.0, + "step": 1890 + }, + { + "entropy": 0.9356025457382202, + "epoch": 3.4990791896869244, + "grad_norm": 0.7583426237106323, + "learning_rate": 7.739435465460356e-05, + "loss": 0.03658481240272522, + "mean_token_accuracy": 0.9857318818569183, + "num_tokens": 39034638.0, + "step": 1900 + }, + { + "entropy": 0.9740163326263428, + "epoch": 3.5174953959484347, + "grad_norm": 0.7332878112792969, + "learning_rate": 7.713908787964937e-05, + "loss": 0.03508963882923126, + "mean_token_accuracy": 0.9863419532775879, + "num_tokens": 39240265.0, + "step": 1910 + }, + { + "entropy": 0.9528286933898926, + "epoch": 3.5359116022099446, + "grad_norm": 0.6515451669692993, + "learning_rate": 7.688281433305233e-05, + "loss": 0.036055779457092284, + "mean_token_accuracy": 0.9860979080200195, + "num_tokens": 39445546.0, + "step": 1920 + }, + { + "entropy": 0.9480705261230469, + "epoch": 3.554327808471455, + "grad_norm": 0.7725827097892761, + "learning_rate": 7.662554352172515e-05, + "loss": 0.037101513147354125, + "mean_token_accuracy": 0.985782790184021, + "num_tokens": 39651078.0, + "step": 1930 + }, + { + "entropy": 0.9655321061611175, + "epoch": 3.572744014732965, + "grad_norm": 0.7756506204605103, + "learning_rate": 7.636728498957581e-05, + "loss": 0.03721855878829956, + "mean_token_accuracy": 0.9857951939105988, + "num_tokens": 39856542.0, + "step": 1940 + }, + { + "entropy": 0.9772682309150695, + "epoch": 3.591160220994475, + "grad_norm": 0.9084987640380859, + "learning_rate": 7.610804831715355e-05, + "loss": 0.03570749163627625, + "mean_token_accuracy": 0.9863450109958649, + "num_tokens": 40061913.0, + "step": 1950 + }, + { + "entropy": 0.9579685389995575, + "epoch": 3.6095764272559854, + "grad_norm": 0.6358487606048584, + "learning_rate": 7.584784312129334e-05, + "loss": 0.038210684061050416, + "mean_token_accuracy": 0.9850837290287018, + "num_tokens": 40267398.0, + "step": 1960 + }, + { + "entropy": 0.9605201721191406, + "epoch": 3.6279926335174952, + "grad_norm": 0.6263149976730347, + "learning_rate": 7.558667905475927e-05, + "loss": 0.03509160876274109, + "mean_token_accuracy": 0.9868143379688263, + "num_tokens": 40472827.0, + "step": 1970 + }, + { + "entropy": 0.964026153087616, + "epoch": 3.6464088397790055, + "grad_norm": 0.90068119764328, + "learning_rate": 7.532456580588638e-05, + "loss": 0.036211782693862916, + "mean_token_accuracy": 0.9858468770980835, + "num_tokens": 40677935.0, + "step": 1980 + }, + { + "entropy": 0.9494135618209839, + "epoch": 3.664825046040516, + "grad_norm": 0.760134756565094, + "learning_rate": 7.50615130982213e-05, + "loss": 0.03786201477050781, + "mean_token_accuracy": 0.9852500438690186, + "num_tokens": 40883750.0, + "step": 1990 + }, + { + "entropy": 0.9527071297168732, + "epoch": 3.6832412523020257, + "grad_norm": 0.9812107682228088, + "learning_rate": 7.479753069016152e-05, + "loss": 0.03803159594535828, + "mean_token_accuracy": 0.9852405369281769, + "num_tokens": 41089115.0, + "step": 2000 + }, + { + "entropy": 0.9639330863952636, + "epoch": 3.701657458563536, + "grad_norm": 0.7164933681488037, + "learning_rate": 7.453262837459332e-05, + "loss": 0.03912568986415863, + "mean_token_accuracy": 0.9849458575248718, + "num_tokens": 41294694.0, + "step": 2010 + }, + { + "entropy": 0.9536987483501435, + "epoch": 3.720073664825046, + "grad_norm": 0.6804596185684204, + "learning_rate": 7.426681597852863e-05, + "loss": 0.036410006880760196, + "mean_token_accuracy": 0.985712206363678, + "num_tokens": 41499817.0, + "step": 2020 + }, + { + "entropy": 0.9478164672851562, + "epoch": 3.738489871086556, + "grad_norm": 0.8799397349357605, + "learning_rate": 7.400010336274037e-05, + "loss": 0.03801035583019256, + "mean_token_accuracy": 0.9850274682044983, + "num_tokens": 41704932.0, + "step": 2030 + }, + { + "entropy": 0.9383447647094727, + "epoch": 3.7569060773480665, + "grad_norm": 0.8386216163635254, + "learning_rate": 7.373250042139664e-05, + "loss": 0.0373637855052948, + "mean_token_accuracy": 0.9854822158813477, + "num_tokens": 41910804.0, + "step": 2040 + }, + { + "entropy": 0.925172996520996, + "epoch": 3.7753222836095763, + "grad_norm": 0.7599324584007263, + "learning_rate": 7.346401708169377e-05, + "loss": 0.03585260808467865, + "mean_token_accuracy": 0.9860672950744629, + "num_tokens": 42116706.0, + "step": 2050 + }, + { + "entropy": 0.9463765442371368, + "epoch": 3.7937384898710866, + "grad_norm": 0.9030149579048157, + "learning_rate": 7.319466330348797e-05, + "loss": 0.035877206921577455, + "mean_token_accuracy": 0.9863968968391419, + "num_tokens": 42322670.0, + "step": 2060 + }, + { + "entropy": 0.9942441761493683, + "epoch": 3.8121546961325965, + "grad_norm": 0.6400449275970459, + "learning_rate": 7.292444907892587e-05, + "loss": 0.037310433387756345, + "mean_token_accuracy": 0.9854151606559753, + "num_tokens": 42527752.0, + "step": 2070 + }, + { + "entropy": 0.9577703952789307, + "epoch": 3.830570902394107, + "grad_norm": 0.6193167567253113, + "learning_rate": 7.265338443207387e-05, + "loss": 0.03648848831653595, + "mean_token_accuracy": 0.9856530070304871, + "num_tokens": 42732981.0, + "step": 2080 + }, + { + "entropy": 0.9663952767848969, + "epoch": 3.848987108655617, + "grad_norm": 0.759611189365387, + "learning_rate": 7.238147941854625e-05, + "loss": 0.036112996935844424, + "mean_token_accuracy": 0.9862765550613404, + "num_tokens": 42938619.0, + "step": 2090 + }, + { + "entropy": 0.9484863519668579, + "epoch": 3.867403314917127, + "grad_norm": 0.7420705556869507, + "learning_rate": 7.210874412513218e-05, + "loss": 0.03703283965587616, + "mean_token_accuracy": 0.9857317566871643, + "num_tokens": 43143753.0, + "step": 2100 + }, + { + "entropy": 0.964326673746109, + "epoch": 3.8858195211786373, + "grad_norm": 0.8779639601707458, + "learning_rate": 7.183518866942147e-05, + "loss": 0.03739701807498932, + "mean_token_accuracy": 0.9852154791355133, + "num_tokens": 43349451.0, + "step": 2110 + }, + { + "entropy": 0.9729791641235351, + "epoch": 3.904235727440147, + "grad_norm": 0.7582741379737854, + "learning_rate": 7.156082319942929e-05, + "loss": 0.03894525766372681, + "mean_token_accuracy": 0.9847454309463501, + "num_tokens": 43554598.0, + "step": 2120 + }, + { + "entropy": 0.9860592544078827, + "epoch": 3.9226519337016574, + "grad_norm": 0.860698938369751, + "learning_rate": 7.128565789321969e-05, + "loss": 0.0365300178527832, + "mean_token_accuracy": 0.9859121859073638, + "num_tokens": 43760081.0, + "step": 2130 + }, + { + "entropy": 0.9916551172733307, + "epoch": 3.9410681399631677, + "grad_norm": 0.8363776206970215, + "learning_rate": 7.100970295852805e-05, + "loss": 0.036221379041671754, + "mean_token_accuracy": 0.9859034180641174, + "num_tokens": 43965432.0, + "step": 2140 + }, + { + "entropy": 0.9553558886051178, + "epoch": 3.9594843462246776, + "grad_norm": 0.9627474546432495, + "learning_rate": 7.073296863238242e-05, + "loss": 0.03684481382369995, + "mean_token_accuracy": 0.9857315957546234, + "num_tokens": 44171232.0, + "step": 2150 + }, + { + "entropy": 0.9538035809993743, + "epoch": 3.977900552486188, + "grad_norm": 0.8399474620819092, + "learning_rate": 7.045546518072366e-05, + "loss": 0.03825397789478302, + "mean_token_accuracy": 0.9846831560134888, + "num_tokens": 44376723.0, + "step": 2160 + }, + { + "entropy": 0.9476235210895538, + "epoch": 3.9963167587476978, + "grad_norm": 0.708739697933197, + "learning_rate": 7.017720289802472e-05, + "loss": 0.03618018329143524, + "mean_token_accuracy": 0.9861325800418854, + "num_tokens": 44582407.0, + "step": 2170 + }, + { + "epoch": 4.0, + "eval_entropy": 0.9569619194321011, + "eval_loss": 0.059838198125362396, + "eval_mean_token_accuracy": 0.9777795366618944, + "eval_num_tokens": 44623647.0, + "eval_runtime": 10.0379, + "eval_samples_per_second": 364.42, + "eval_steps_per_second": 11.457, + "step": 2172 + }, + { + "entropy": 0.9558675646781921, + "epoch": 4.014732965009208, + "grad_norm": 0.7347508668899536, + "learning_rate": 6.989819210690872e-05, + "loss": 0.02886659502983093, + "mean_token_accuracy": 0.9892994821071625, + "num_tokens": 44788219.0, + "step": 2180 + }, + { + "entropy": 1.0037677466869355, + "epoch": 4.033149171270718, + "grad_norm": 0.7403206825256348, + "learning_rate": 6.961844315776596e-05, + "loss": 0.02395295798778534, + "mean_token_accuracy": 0.9906026899814606, + "num_tokens": 44993505.0, + "step": 2190 + }, + { + "entropy": 1.0068290829658508, + "epoch": 4.051565377532229, + "grad_norm": 0.7979726195335388, + "learning_rate": 6.933796642837003e-05, + "loss": 0.02605988085269928, + "mean_token_accuracy": 0.9899706900119781, + "num_tokens": 45199193.0, + "step": 2200 + }, + { + "entropy": 0.9942211747169495, + "epoch": 4.069981583793738, + "grad_norm": 0.6460402011871338, + "learning_rate": 6.905677232349278e-05, + "loss": 0.025350230932235717, + "mean_token_accuracy": 0.9899386286735534, + "num_tokens": 45404030.0, + "step": 2210 + }, + { + "entropy": 0.9783595442771912, + "epoch": 4.088397790055248, + "grad_norm": 0.8177055716514587, + "learning_rate": 6.877487127451834e-05, + "loss": 0.02696993052959442, + "mean_token_accuracy": 0.9896106541156768, + "num_tokens": 45609763.0, + "step": 2220 + }, + { + "entropy": 0.9801763832569123, + "epoch": 4.106813996316759, + "grad_norm": 0.6608165502548218, + "learning_rate": 6.849227373905618e-05, + "loss": 0.025101393461227417, + "mean_token_accuracy": 0.9904372334480286, + "num_tokens": 45814941.0, + "step": 2230 + }, + { + "entropy": 0.9695689737796783, + "epoch": 4.125230202578269, + "grad_norm": 0.8036547899246216, + "learning_rate": 6.820899020055314e-05, + "loss": 0.027827343344688414, + "mean_token_accuracy": 0.9890337705612182, + "num_tokens": 46020535.0, + "step": 2240 + }, + { + "entropy": 0.9828635334968567, + "epoch": 4.143646408839779, + "grad_norm": 0.7729921936988831, + "learning_rate": 6.792503116790455e-05, + "loss": 0.02779492735862732, + "mean_token_accuracy": 0.9894372522830963, + "num_tokens": 46226013.0, + "step": 2250 + }, + { + "entropy": 0.9978842556476593, + "epoch": 4.162062615101289, + "grad_norm": 0.7334664463996887, + "learning_rate": 6.764040717506432e-05, + "loss": 0.025673511624336242, + "mean_token_accuracy": 0.9899355113506317, + "num_tokens": 46432087.0, + "step": 2260 + }, + { + "entropy": 1.0116403937339782, + "epoch": 4.180478821362799, + "grad_norm": 0.6769368052482605, + "learning_rate": 6.735512878065427e-05, + "loss": 0.024705511331558228, + "mean_token_accuracy": 0.9906128525733948, + "num_tokens": 46637478.0, + "step": 2270 + }, + { + "entropy": 0.9985016226768494, + "epoch": 4.198895027624309, + "grad_norm": 0.8301573991775513, + "learning_rate": 6.706920656757234e-05, + "loss": 0.02455987185239792, + "mean_token_accuracy": 0.9905728340148926, + "num_tokens": 46842562.0, + "step": 2280 + }, + { + "entropy": 0.9909430682659149, + "epoch": 4.21731123388582, + "grad_norm": 0.656026303768158, + "learning_rate": 6.67826511426001e-05, + "loss": 0.022711564600467683, + "mean_token_accuracy": 0.9910893619060517, + "num_tokens": 47048071.0, + "step": 2290 + }, + { + "entropy": 0.9868666052818298, + "epoch": 4.23572744014733, + "grad_norm": 0.7614991068840027, + "learning_rate": 6.649547313600916e-05, + "loss": 0.02453812211751938, + "mean_token_accuracy": 0.9908901154994965, + "num_tokens": 47253507.0, + "step": 2300 + }, + { + "entropy": 0.9870487153530121, + "epoch": 4.25414364640884, + "grad_norm": 0.7617276906967163, + "learning_rate": 6.62076832011669e-05, + "loss": 0.025818097591400146, + "mean_token_accuracy": 0.990347957611084, + "num_tokens": 47458747.0, + "step": 2310 + }, + { + "entropy": 0.9691080570220947, + "epoch": 4.27255985267035, + "grad_norm": 0.6743029952049255, + "learning_rate": 6.591929201414124e-05, + "loss": 0.02456912100315094, + "mean_token_accuracy": 0.9905289709568024, + "num_tokens": 47663643.0, + "step": 2320 + }, + { + "entropy": 0.9701108932495117, + "epoch": 4.29097605893186, + "grad_norm": 0.6964483261108398, + "learning_rate": 6.56303102733046e-05, + "loss": 0.02575681209564209, + "mean_token_accuracy": 0.9898503363132477, + "num_tokens": 47868982.0, + "step": 2330 + }, + { + "entropy": 0.969528192281723, + "epoch": 4.30939226519337, + "grad_norm": 0.7521987557411194, + "learning_rate": 6.5340748698937e-05, + "loss": 0.02678089737892151, + "mean_token_accuracy": 0.9898572087287902, + "num_tokens": 48074314.0, + "step": 2340 + }, + { + "entropy": 0.9921871721744537, + "epoch": 4.327808471454881, + "grad_norm": 0.6944513320922852, + "learning_rate": 6.505061803282844e-05, + "loss": 0.025553321838378905, + "mean_token_accuracy": 0.9907529592514038, + "num_tokens": 48279731.0, + "step": 2350 + }, + { + "entropy": 0.9768964886665344, + "epoch": 4.346224677716391, + "grad_norm": 0.6553092002868652, + "learning_rate": 6.47599290378803e-05, + "loss": 0.0250235915184021, + "mean_token_accuracy": 0.9904054701328278, + "num_tokens": 48485401.0, + "step": 2360 + }, + { + "entropy": 0.9612838506698609, + "epoch": 4.3646408839779, + "grad_norm": 0.916820228099823, + "learning_rate": 6.446869249770619e-05, + "loss": 0.028156182169914244, + "mean_token_accuracy": 0.9888657331466675, + "num_tokens": 48691047.0, + "step": 2370 + }, + { + "entropy": 0.9665832936763763, + "epoch": 4.383057090239411, + "grad_norm": 0.9197776913642883, + "learning_rate": 6.417691921623185e-05, + "loss": 0.025303921103477477, + "mean_token_accuracy": 0.989986252784729, + "num_tokens": 48896234.0, + "step": 2380 + }, + { + "entropy": 0.9686589121818543, + "epoch": 4.401473296500921, + "grad_norm": 0.8505764603614807, + "learning_rate": 6.388462001729434e-05, + "loss": 0.024816396832466125, + "mean_token_accuracy": 0.9909265041351318, + "num_tokens": 49101893.0, + "step": 2390 + }, + { + "entropy": 0.9625210344791413, + "epoch": 4.419889502762431, + "grad_norm": 1.0601766109466553, + "learning_rate": 6.359180574424062e-05, + "loss": 0.02706078290939331, + "mean_token_accuracy": 0.9895522117614746, + "num_tokens": 49307467.0, + "step": 2400 + }, + { + "entropy": 0.9679551541805267, + "epoch": 4.4383057090239415, + "grad_norm": 0.776253879070282, + "learning_rate": 6.329848725952514e-05, + "loss": 0.02693203091621399, + "mean_token_accuracy": 0.9893981635570526, + "num_tokens": 49513020.0, + "step": 2410 + }, + { + "entropy": 0.9704959928989411, + "epoch": 4.456721915285451, + "grad_norm": 0.5459668636322021, + "learning_rate": 6.3004675444307e-05, + "loss": 0.0279473751783371, + "mean_token_accuracy": 0.9894329369068146, + "num_tokens": 49718405.0, + "step": 2420 + }, + { + "entropy": 0.961863350868225, + "epoch": 4.475138121546961, + "grad_norm": 0.9338833093643188, + "learning_rate": 6.27103811980462e-05, + "loss": 0.026478803157806395, + "mean_token_accuracy": 0.9902269721031189, + "num_tokens": 49923375.0, + "step": 2430 + }, + { + "entropy": 0.9708506822586059, + "epoch": 4.4935543278084715, + "grad_norm": 0.9073707461357117, + "learning_rate": 6.241561543809947e-05, + "loss": 0.025289520621299744, + "mean_token_accuracy": 0.9904769957065582, + "num_tokens": 50128901.0, + "step": 2440 + }, + { + "entropy": 0.984996622800827, + "epoch": 4.511970534069982, + "grad_norm": 0.8674206733703613, + "learning_rate": 6.212038909931503e-05, + "loss": 0.026442551612854005, + "mean_token_accuracy": 0.9905101835727692, + "num_tokens": 50334449.0, + "step": 2450 + }, + { + "entropy": 0.9926377475261688, + "epoch": 4.530386740331492, + "grad_norm": 0.7571811079978943, + "learning_rate": 6.182471313362717e-05, + "loss": 0.026819539070129395, + "mean_token_accuracy": 0.9898989200592041, + "num_tokens": 50539597.0, + "step": 2460 + }, + { + "entropy": 0.9450563549995422, + "epoch": 4.5488029465930016, + "grad_norm": 0.6651087403297424, + "learning_rate": 6.15285985096498e-05, + "loss": 0.02665227949619293, + "mean_token_accuracy": 0.9897156655788422, + "num_tokens": 50744926.0, + "step": 2470 + }, + { + "entropy": 0.9715635657310486, + "epoch": 4.567219152854512, + "grad_norm": 0.7445545196533203, + "learning_rate": 6.12320562122697e-05, + "loss": 0.026212453842163086, + "mean_token_accuracy": 0.9904700636863708, + "num_tokens": 50950152.0, + "step": 2480 + }, + { + "entropy": 0.9613442063331604, + "epoch": 4.585635359116022, + "grad_norm": 0.7168459296226501, + "learning_rate": 6.0935097242238837e-05, + "loss": 0.02508128583431244, + "mean_token_accuracy": 0.9901923894882202, + "num_tokens": 51155430.0, + "step": 2490 + }, + { + "entropy": 0.9571944534778595, + "epoch": 4.6040515653775325, + "grad_norm": 0.7590732574462891, + "learning_rate": 6.063773261576646e-05, + "loss": 0.025445500016212465, + "mean_token_accuracy": 0.9902949810028077, + "num_tokens": 51360826.0, + "step": 2500 + }, + { + "entropy": 0.947079461812973, + "epoch": 4.622467771639043, + "grad_norm": 0.6942175030708313, + "learning_rate": 6.033997336411035e-05, + "loss": 0.026132801175117494, + "mean_token_accuracy": 0.9900939345359803, + "num_tokens": 51566095.0, + "step": 2510 + }, + { + "entropy": 0.970003741979599, + "epoch": 4.640883977900552, + "grad_norm": 0.6562672257423401, + "learning_rate": 6.00418305331675e-05, + "loss": 0.024759869277477264, + "mean_token_accuracy": 0.9905019223690033, + "num_tokens": 51771177.0, + "step": 2520 + }, + { + "entropy": 0.9715348601341247, + "epoch": 4.6593001841620625, + "grad_norm": 0.6151639819145203, + "learning_rate": 5.9743315183064564e-05, + "loss": 0.024138522148132325, + "mean_token_accuracy": 0.9910101473331452, + "num_tokens": 51976349.0, + "step": 2530 + }, + { + "entropy": 0.9552160143852234, + "epoch": 4.677716390423573, + "grad_norm": 0.968815267086029, + "learning_rate": 5.9444438387747336e-05, + "loss": 0.027274739742279053, + "mean_token_accuracy": 0.9896075248718261, + "num_tokens": 52181820.0, + "step": 2540 + }, + { + "entropy": 0.9265012145042419, + "epoch": 4.696132596685083, + "grad_norm": 0.8966720700263977, + "learning_rate": 5.914521123457015e-05, + "loss": 0.0291823148727417, + "mean_token_accuracy": 0.9886700630187988, + "num_tokens": 52387511.0, + "step": 2550 + }, + { + "entropy": 0.9156096875667572, + "epoch": 4.714548802946593, + "grad_norm": 0.7747519612312317, + "learning_rate": 5.88456448238844e-05, + "loss": 0.02809179127216339, + "mean_token_accuracy": 0.9891100466251374, + "num_tokens": 52592737.0, + "step": 2560 + }, + { + "entropy": 0.924511456489563, + "epoch": 4.732965009208103, + "grad_norm": 1.0087049007415771, + "learning_rate": 5.8545750268626844e-05, + "loss": 0.02683232128620148, + "mean_token_accuracy": 0.9896528899669648, + "num_tokens": 52798814.0, + "step": 2570 + }, + { + "entropy": 0.9662951111793519, + "epoch": 4.751381215469613, + "grad_norm": 0.7709590792655945, + "learning_rate": 5.824553869390734e-05, + "loss": 0.02503817081451416, + "mean_token_accuracy": 0.9900161385536194, + "num_tokens": 53004478.0, + "step": 2580 + }, + { + "entropy": 0.9889141619205475, + "epoch": 4.769797421731123, + "grad_norm": 0.815858006477356, + "learning_rate": 5.794502123659613e-05, + "loss": 0.026327347755432128, + "mean_token_accuracy": 0.9900785744190216, + "num_tokens": 53209888.0, + "step": 2590 + }, + { + "entropy": 0.9785685896873474, + "epoch": 4.788213627992634, + "grad_norm": 0.6514431238174438, + "learning_rate": 5.7644209044910735e-05, + "loss": 0.025033789873123168, + "mean_token_accuracy": 0.9902650475502014, + "num_tokens": 53415533.0, + "step": 2600 + }, + { + "entropy": 0.9723869919776916, + "epoch": 4.806629834254144, + "grad_norm": 0.8778963685035706, + "learning_rate": 5.7343113278002284e-05, + "loss": 0.02379843294620514, + "mean_token_accuracy": 0.9909472465515137, + "num_tokens": 53620850.0, + "step": 2610 + }, + { + "entropy": 0.9572711050510406, + "epoch": 4.8250460405156534, + "grad_norm": 0.8927134871482849, + "learning_rate": 5.70417451055417e-05, + "loss": 0.024856947362422943, + "mean_token_accuracy": 0.9904125213623047, + "num_tokens": 53826259.0, + "step": 2620 + }, + { + "entropy": 0.9523135125637054, + "epoch": 4.843462246777164, + "grad_norm": 0.6832691431045532, + "learning_rate": 5.674011570730523e-05, + "loss": 0.025352203845977785, + "mean_token_accuracy": 0.990432596206665, + "num_tokens": 54031531.0, + "step": 2630 + }, + { + "entropy": 0.9735220730304718, + "epoch": 4.861878453038674, + "grad_norm": 0.6399164795875549, + "learning_rate": 5.643823627275972e-05, + "loss": 0.026541513204574586, + "mean_token_accuracy": 0.9900369107723236, + "num_tokens": 54237155.0, + "step": 2640 + }, + { + "entropy": 0.9566517114639282, + "epoch": 4.880294659300184, + "grad_norm": 0.8725414276123047, + "learning_rate": 5.6136118000647616e-05, + "loss": 0.02675778865814209, + "mean_token_accuracy": 0.9894899427890778, + "num_tokens": 54442739.0, + "step": 2650 + }, + { + "entropy": 0.9447909593582153, + "epoch": 4.898710865561695, + "grad_norm": 0.8169302344322205, + "learning_rate": 5.583377209857138e-05, + "loss": 0.02642086148262024, + "mean_token_accuracy": 0.989885401725769, + "num_tokens": 54648098.0, + "step": 2660 + }, + { + "entropy": 0.9180052697658538, + "epoch": 4.917127071823204, + "grad_norm": 0.7768753170967102, + "learning_rate": 5.553120978257787e-05, + "loss": 0.02552323341369629, + "mean_token_accuracy": 0.9899512350559234, + "num_tokens": 54854281.0, + "step": 2670 + }, + { + "entropy": 0.917166668176651, + "epoch": 4.935543278084714, + "grad_norm": 0.8241410851478577, + "learning_rate": 5.5228442276742153e-05, + "loss": 0.02788199484348297, + "mean_token_accuracy": 0.989625746011734, + "num_tokens": 55059495.0, + "step": 2680 + }, + { + "entropy": 0.9345465302467346, + "epoch": 4.953959484346225, + "grad_norm": 0.7645496129989624, + "learning_rate": 5.4925480812751166e-05, + "loss": 0.02517639398574829, + "mean_token_accuracy": 0.9902283847332001, + "num_tokens": 55265381.0, + "step": 2690 + }, + { + "entropy": 0.9386432528495788, + "epoch": 4.972375690607735, + "grad_norm": 0.8371859192848206, + "learning_rate": 5.46223366294871e-05, + "loss": 0.025585666298866272, + "mean_token_accuracy": 0.9903791427612305, + "num_tokens": 55471210.0, + "step": 2700 + }, + { + "entropy": 0.9267561137676239, + "epoch": 4.990791896869245, + "grad_norm": 0.6789297461509705, + "learning_rate": 5.43190209726104e-05, + "loss": 0.024646708369255067, + "mean_token_accuracy": 0.9904700815677643, + "num_tokens": 55676877.0, + "step": 2710 + }, + { + "epoch": 5.0, + "eval_entropy": 0.9283919717954553, + "eval_loss": 0.06225527077913284, + "eval_mean_token_accuracy": 0.9784110421719758, + "eval_num_tokens": 55779559.0, + "eval_runtime": 10.0613, + "eval_samples_per_second": 363.573, + "eval_steps_per_second": 11.43, + "step": 2715 + }, + { + "entropy": 0.9269404351711273, + "epoch": 5.009208103130755, + "grad_norm": 0.540570080280304, + "learning_rate": 5.401554509414264e-05, + "loss": 0.019513805210590363, + "mean_token_accuracy": 0.9927033007144928, + "num_tokens": 55882241.0, + "step": 2720 + }, + { + "entropy": 0.9377441763877868, + "epoch": 5.027624309392265, + "grad_norm": 0.5840998888015747, + "learning_rate": 5.3711920252049085e-05, + "loss": 0.015180909633636474, + "mean_token_accuracy": 0.9944471418857574, + "num_tokens": 56087470.0, + "step": 2730 + }, + { + "entropy": 0.949122017621994, + "epoch": 5.046040515653775, + "grad_norm": 0.6938672065734863, + "learning_rate": 5.340815770982106e-05, + "loss": 0.0153742715716362, + "mean_token_accuracy": 0.9941534519195556, + "num_tokens": 56292226.0, + "step": 2740 + }, + { + "entropy": 0.9394402146339417, + "epoch": 5.064456721915286, + "grad_norm": 0.8259939551353455, + "learning_rate": 5.310426873605814e-05, + "loss": 0.014350908994674682, + "mean_token_accuracy": 0.9945570707321167, + "num_tokens": 56497839.0, + "step": 2750 + }, + { + "entropy": 0.9323545396327972, + "epoch": 5.082872928176796, + "grad_norm": 0.9675024747848511, + "learning_rate": 5.280026460405005e-05, + "loss": 0.016550135612487794, + "mean_token_accuracy": 0.9938908398151398, + "num_tokens": 56702932.0, + "step": 2760 + }, + { + "entropy": 0.89125554561615, + "epoch": 5.101289134438305, + "grad_norm": 0.8347184658050537, + "learning_rate": 5.2496156591358566e-05, + "loss": 0.017917826771736145, + "mean_token_accuracy": 0.9934309899806977, + "num_tokens": 56908644.0, + "step": 2770 + }, + { + "entropy": 0.8773505449295044, + "epoch": 5.119705340699816, + "grad_norm": 0.8869524598121643, + "learning_rate": 5.219195597939908e-05, + "loss": 0.017221055924892426, + "mean_token_accuracy": 0.993448656797409, + "num_tokens": 57114171.0, + "step": 2780 + }, + { + "entropy": 0.8874686002731323, + "epoch": 5.138121546961326, + "grad_norm": 1.0294251441955566, + "learning_rate": 5.1887674053022084e-05, + "loss": 0.018111808598041533, + "mean_token_accuracy": 0.9931293666362763, + "num_tokens": 57319158.0, + "step": 2790 + }, + { + "entropy": 0.8893351197242737, + "epoch": 5.156537753222836, + "grad_norm": 0.6253597736358643, + "learning_rate": 5.15833221000946e-05, + "loss": 0.017256538569927215, + "mean_token_accuracy": 0.9936724424362182, + "num_tokens": 57524901.0, + "step": 2800 + }, + { + "entropy": 0.9157109141349793, + "epoch": 5.1749539594843466, + "grad_norm": 0.6379142999649048, + "learning_rate": 5.12789114110814e-05, + "loss": 0.016415870189666747, + "mean_token_accuracy": 0.9939744889736175, + "num_tokens": 57730135.0, + "step": 2810 + }, + { + "entropy": 0.9157932877540589, + "epoch": 5.193370165745856, + "grad_norm": 0.7195688486099243, + "learning_rate": 5.097445327862619e-05, + "loss": 0.01577536463737488, + "mean_token_accuracy": 0.9941773355007172, + "num_tokens": 57936210.0, + "step": 2820 + }, + { + "entropy": 0.9179767727851867, + "epoch": 5.211786372007366, + "grad_norm": 0.7149335741996765, + "learning_rate": 5.066995899713264e-05, + "loss": 0.01606254279613495, + "mean_token_accuracy": 0.9937664806842804, + "num_tokens": 58141736.0, + "step": 2830 + }, + { + "entropy": 0.895512479543686, + "epoch": 5.230202578268877, + "grad_norm": 0.6460169553756714, + "learning_rate": 5.036543986234543e-05, + "loss": 0.01605578660964966, + "mean_token_accuracy": 0.994063013792038, + "num_tokens": 58347178.0, + "step": 2840 + }, + { + "entropy": 0.8883109211921691, + "epoch": 5.248618784530387, + "grad_norm": 0.72477787733078, + "learning_rate": 5.006090717093128e-05, + "loss": 0.016773784160614015, + "mean_token_accuracy": 0.9940340936183929, + "num_tokens": 58552952.0, + "step": 2850 + }, + { + "entropy": 0.8942575633525849, + "epoch": 5.267034990791897, + "grad_norm": 0.7344926595687866, + "learning_rate": 4.9756372220059736e-05, + "loss": 0.01604126989841461, + "mean_token_accuracy": 0.994256991147995, + "num_tokens": 58758449.0, + "step": 2860 + }, + { + "entropy": 0.8854099690914154, + "epoch": 5.285451197053407, + "grad_norm": 0.6142122149467468, + "learning_rate": 4.9451846306984214e-05, + "loss": 0.016244474053382873, + "mean_token_accuracy": 0.9938375532627106, + "num_tokens": 58963691.0, + "step": 2870 + }, + { + "entropy": 0.8745675146579742, + "epoch": 5.303867403314917, + "grad_norm": 0.8025366067886353, + "learning_rate": 4.9147340728622816e-05, + "loss": 0.01611460596323013, + "mean_token_accuracy": 0.9941173672676087, + "num_tokens": 59169484.0, + "step": 2880 + }, + { + "entropy": 0.8812389194965362, + "epoch": 5.322283609576427, + "grad_norm": 0.7699193358421326, + "learning_rate": 4.884286678113935e-05, + "loss": 0.016995206475257874, + "mean_token_accuracy": 0.9937523245811463, + "num_tokens": 59374627.0, + "step": 2890 + }, + { + "entropy": 0.8924362242221833, + "epoch": 5.3406998158379375, + "grad_norm": 0.7516226172447205, + "learning_rate": 4.853843575952414e-05, + "loss": 0.01652217358350754, + "mean_token_accuracy": 0.9936819314956665, + "num_tokens": 59580135.0, + "step": 2900 + }, + { + "entropy": 0.8972602427005768, + "epoch": 5.359116022099448, + "grad_norm": 0.7781681418418884, + "learning_rate": 4.823405895717521e-05, + "loss": 0.017360319197177888, + "mean_token_accuracy": 0.9935634732246399, + "num_tokens": 59785392.0, + "step": 2910 + }, + { + "entropy": 0.900998342037201, + "epoch": 5.377532228360957, + "grad_norm": 0.6837047934532166, + "learning_rate": 4.792974766547911e-05, + "loss": 0.017162233591079712, + "mean_token_accuracy": 0.993264091014862, + "num_tokens": 59991448.0, + "step": 2920 + }, + { + "entropy": 0.9239763855934143, + "epoch": 5.3959484346224675, + "grad_norm": 0.7188259363174438, + "learning_rate": 4.762551317339226e-05, + "loss": 0.01718595027923584, + "mean_token_accuracy": 0.9933857440948486, + "num_tokens": 60197079.0, + "step": 2930 + }, + { + "entropy": 0.9056789398193359, + "epoch": 5.414364640883978, + "grad_norm": 0.6922260522842407, + "learning_rate": 4.732136676702198e-05, + "loss": 0.016596907377243043, + "mean_token_accuracy": 0.9937462329864502, + "num_tokens": 60402299.0, + "step": 2940 + }, + { + "entropy": 0.9038522362709045, + "epoch": 5.432780847145488, + "grad_norm": 0.7937009930610657, + "learning_rate": 4.7017319729207954e-05, + "loss": 0.016130413115024566, + "mean_token_accuracy": 0.9941940546035767, + "num_tokens": 60607907.0, + "step": 2950 + }, + { + "entropy": 0.8911147236824035, + "epoch": 5.4511970534069984, + "grad_norm": 0.6261171698570251, + "learning_rate": 4.671338333910359e-05, + "loss": 0.01622493863105774, + "mean_token_accuracy": 0.9937945663928985, + "num_tokens": 60813428.0, + "step": 2960 + }, + { + "entropy": 0.8894322276115417, + "epoch": 5.469613259668508, + "grad_norm": 0.6898378729820251, + "learning_rate": 4.6409568871757645e-05, + "loss": 0.016513559222221374, + "mean_token_accuracy": 0.9936174690723419, + "num_tokens": 61018404.0, + "step": 2970 + }, + { + "entropy": 0.9026601016521454, + "epoch": 5.488029465930018, + "grad_norm": 0.7027897834777832, + "learning_rate": 4.610588759769593e-05, + "loss": 0.016727012395858765, + "mean_token_accuracy": 0.9941417872905731, + "num_tokens": 61223660.0, + "step": 2980 + }, + { + "entropy": 0.8903301954269409, + "epoch": 5.5064456721915285, + "grad_norm": 0.9087063074111938, + "learning_rate": 4.5802350782503196e-05, + "loss": 0.016929233074188234, + "mean_token_accuracy": 0.9935264468193055, + "num_tokens": 61429438.0, + "step": 2990 + }, + { + "entropy": 0.8886692762374878, + "epoch": 5.524861878453039, + "grad_norm": 0.8283822536468506, + "learning_rate": 4.5498969686405266e-05, + "loss": 0.015396638214588166, + "mean_token_accuracy": 0.99433131814003, + "num_tokens": 61635274.0, + "step": 3000 + }, + { + "entropy": 0.8902086555957794, + "epoch": 5.543278084714549, + "grad_norm": 0.7676647305488586, + "learning_rate": 4.5195755563851336e-05, + "loss": 0.01673731654882431, + "mean_token_accuracy": 0.9938134133815766, + "num_tokens": 61840778.0, + "step": 3010 + }, + { + "entropy": 0.8941606819629669, + "epoch": 5.5616942909760585, + "grad_norm": 0.7026392221450806, + "learning_rate": 4.489271966309634e-05, + "loss": 0.01694796681404114, + "mean_token_accuracy": 0.9936233103275299, + "num_tokens": 62046355.0, + "step": 3020 + }, + { + "entropy": 0.90918750166893, + "epoch": 5.580110497237569, + "grad_norm": 0.7146924734115601, + "learning_rate": 4.4589873225783806e-05, + "loss": 0.01852080672979355, + "mean_token_accuracy": 0.9928994178771973, + "num_tokens": 62251709.0, + "step": 3030 + }, + { + "entropy": 0.8946544349193573, + "epoch": 5.598526703499079, + "grad_norm": 0.607246458530426, + "learning_rate": 4.428722748652881e-05, + "loss": 0.016636812686920167, + "mean_token_accuracy": 0.9939334273338318, + "num_tokens": 62456680.0, + "step": 3040 + }, + { + "entropy": 0.8854653835296631, + "epoch": 5.616942909760589, + "grad_norm": 0.7457882165908813, + "learning_rate": 4.3984793672501124e-05, + "loss": 0.016008296608924867, + "mean_token_accuracy": 0.9940589666366577, + "num_tokens": 62662038.0, + "step": 3050 + }, + { + "entropy": 0.8921085000038147, + "epoch": 5.6353591160221, + "grad_norm": 0.7707350254058838, + "learning_rate": 4.368258300300888e-05, + "loss": 0.016655120253562927, + "mean_token_accuracy": 0.993935889005661, + "num_tokens": 62867272.0, + "step": 3060 + }, + { + "entropy": 0.8768653869628906, + "epoch": 5.653775322283609, + "grad_norm": 0.6994554996490479, + "learning_rate": 4.3380606689082166e-05, + "loss": 0.015841150283813478, + "mean_token_accuracy": 0.9944550096988678, + "num_tokens": 63072403.0, + "step": 3070 + }, + { + "entropy": 0.8767679035663605, + "epoch": 5.672191528545119, + "grad_norm": 0.8327192068099976, + "learning_rate": 4.307887593305733e-05, + "loss": 0.015826576948165895, + "mean_token_accuracy": 0.9941202461719513, + "num_tokens": 63277635.0, + "step": 3080 + }, + { + "entropy": 0.8763292253017425, + "epoch": 5.69060773480663, + "grad_norm": 0.7224747538566589, + "learning_rate": 4.277740192816127e-05, + "loss": 0.015298140048980714, + "mean_token_accuracy": 0.9943080008029938, + "num_tokens": 63483196.0, + "step": 3090 + }, + { + "entropy": 0.8834661841392517, + "epoch": 5.70902394106814, + "grad_norm": 0.9508277773857117, + "learning_rate": 4.247619585809627e-05, + "loss": 0.01658404469490051, + "mean_token_accuracy": 0.9934300124645233, + "num_tokens": 63688721.0, + "step": 3100 + }, + { + "entropy": 0.899389523267746, + "epoch": 5.72744014732965, + "grad_norm": 0.7170981168746948, + "learning_rate": 4.217526889662512e-05, + "loss": 0.015803813934326172, + "mean_token_accuracy": 0.9940325975418091, + "num_tokens": 63894220.0, + "step": 3110 + }, + { + "entropy": 0.8968011736869812, + "epoch": 5.74585635359116, + "grad_norm": 0.6686251163482666, + "learning_rate": 4.187463220715659e-05, + "loss": 0.015874400734901428, + "mean_token_accuracy": 0.9940970957279205, + "num_tokens": 64099768.0, + "step": 3120 + }, + { + "entropy": 0.8900792479515076, + "epoch": 5.76427255985267, + "grad_norm": 0.5979828238487244, + "learning_rate": 4.157429694233128e-05, + "loss": 0.01613767147064209, + "mean_token_accuracy": 0.9942961037158966, + "num_tokens": 64305055.0, + "step": 3130 + }, + { + "entropy": 0.8899810135364532, + "epoch": 5.78268876611418, + "grad_norm": 0.7330048084259033, + "learning_rate": 4.127427424360794e-05, + "loss": 0.016168563067913054, + "mean_token_accuracy": 0.9941077649593353, + "num_tokens": 64510002.0, + "step": 3140 + }, + { + "entropy": 0.8805335581302642, + "epoch": 5.801104972375691, + "grad_norm": 0.5978623032569885, + "learning_rate": 4.09745752408501e-05, + "loss": 0.01524556577205658, + "mean_token_accuracy": 0.994326776266098, + "num_tokens": 64715431.0, + "step": 3150 + }, + { + "entropy": 0.878781646490097, + "epoch": 5.819521178637201, + "grad_norm": 0.6749313473701477, + "learning_rate": 4.067521105191331e-05, + "loss": 0.015209287405014038, + "mean_token_accuracy": 0.9942974805831909, + "num_tokens": 64921579.0, + "step": 3160 + }, + { + "entropy": 0.8844729900360108, + "epoch": 5.83793738489871, + "grad_norm": 0.6887196898460388, + "learning_rate": 4.037619278223255e-05, + "loss": 0.01619938760995865, + "mean_token_accuracy": 0.9937683045864105, + "num_tokens": 65127007.0, + "step": 3170 + }, + { + "entropy": 0.8780498623847961, + "epoch": 5.856353591160221, + "grad_norm": 0.6962174773216248, + "learning_rate": 4.0077531524410304e-05, + "loss": 0.015934592485427855, + "mean_token_accuracy": 0.9935103774070739, + "num_tokens": 65332418.0, + "step": 3180 + }, + { + "entropy": 0.8889612555503845, + "epoch": 5.874769797421731, + "grad_norm": 0.6049854159355164, + "learning_rate": 3.977923835780517e-05, + "loss": 0.01600206792354584, + "mean_token_accuracy": 0.9937360048294067, + "num_tokens": 65537845.0, + "step": 3190 + }, + { + "entropy": 0.8960810244083405, + "epoch": 5.893186003683241, + "grad_norm": 0.6341013312339783, + "learning_rate": 3.948132434812065e-05, + "loss": 0.0143389493227005, + "mean_token_accuracy": 0.9948007702827454, + "num_tokens": 65743412.0, + "step": 3200 + }, + { + "entropy": 0.887304550409317, + "epoch": 5.911602209944752, + "grad_norm": 0.7564852237701416, + "learning_rate": 3.9183800546994886e-05, + "loss": 0.016044440865516662, + "mean_token_accuracy": 0.9939335525035858, + "num_tokens": 65948884.0, + "step": 3210 + }, + { + "entropy": 0.8823239862918854, + "epoch": 5.930018416206261, + "grad_norm": 0.6525556445121765, + "learning_rate": 3.8886677991590435e-05, + "loss": 0.016112390160560607, + "mean_token_accuracy": 0.9938134670257568, + "num_tokens": 66153768.0, + "step": 3220 + }, + { + "entropy": 0.8712829887866974, + "epoch": 5.948434622467771, + "grad_norm": 0.676167368888855, + "learning_rate": 3.858996770418504e-05, + "loss": 0.015146306157112122, + "mean_token_accuracy": 0.9944733619689942, + "num_tokens": 66359661.0, + "step": 3230 + }, + { + "entropy": 0.8734102070331573, + "epoch": 5.966850828729282, + "grad_norm": 0.6284340023994446, + "learning_rate": 3.829368069176257e-05, + "loss": 0.017269474267959595, + "mean_token_accuracy": 0.9938443183898926, + "num_tokens": 66565228.0, + "step": 3240 + }, + { + "entropy": 0.8757335782051087, + "epoch": 5.985267034990792, + "grad_norm": 0.722522497177124, + "learning_rate": 3.799782794560484e-05, + "loss": 0.015032704174518585, + "mean_token_accuracy": 0.9942249894142151, + "num_tokens": 66770718.0, + "step": 3250 + }, + { + "epoch": 6.0, + "eval_entropy": 0.8780099873957427, + "eval_loss": 0.06740746647119522, + "eval_mean_token_accuracy": 0.9795082377350849, + "eval_num_tokens": 66935435.0, + "eval_runtime": 10.0955, + "eval_samples_per_second": 362.34, + "eval_steps_per_second": 11.391, + "step": 3258 + }, + { + "entropy": 0.8773481965065002, + "epoch": 6.003683241252302, + "grad_norm": 0.49184396862983704, + "learning_rate": 3.770242044088375e-05, + "loss": 0.013721099495887757, + "mean_token_accuracy": 0.9951768457889557, + "num_tokens": 66976478.0, + "step": 3260 + }, + { + "entropy": 0.8643155217170715, + "epoch": 6.0220994475138125, + "grad_norm": 0.4903622567653656, + "learning_rate": 3.7407469136254234e-05, + "loss": 0.009165047109127045, + "mean_token_accuracy": 0.9969388306140899, + "num_tokens": 67182251.0, + "step": 3270 + }, + { + "entropy": 0.8560326337814331, + "epoch": 6.040515653775322, + "grad_norm": 0.6360073685646057, + "learning_rate": 3.711298497344766e-05, + "loss": 0.010200753808021545, + "mean_token_accuracy": 0.9964211463928223, + "num_tokens": 67387493.0, + "step": 3280 + }, + { + "entropy": 0.8391405165195465, + "epoch": 6.058931860036832, + "grad_norm": 0.519554853439331, + "learning_rate": 3.6818978876865984e-05, + "loss": 0.008906974643468856, + "mean_token_accuracy": 0.9966452360153198, + "num_tokens": 67593314.0, + "step": 3290 + }, + { + "entropy": 0.8300552070140839, + "epoch": 6.077348066298343, + "grad_norm": 0.5294632911682129, + "learning_rate": 3.6525461753176426e-05, + "loss": 0.008088209480047227, + "mean_token_accuracy": 0.9971098065376282, + "num_tokens": 67798634.0, + "step": 3300 + }, + { + "entropy": 0.822588461637497, + "epoch": 6.095764272559853, + "grad_norm": 0.46423637866973877, + "learning_rate": 3.623244449090697e-05, + "loss": 0.008058926463127137, + "mean_token_accuracy": 0.9970856845378876, + "num_tokens": 68003683.0, + "step": 3310 + }, + { + "entropy": 0.8130167067050934, + "epoch": 6.114180478821363, + "grad_norm": 0.497258722782135, + "learning_rate": 3.5939937960042314e-05, + "loss": 0.008712668716907502, + "mean_token_accuracy": 0.996820193529129, + "num_tokens": 68208795.0, + "step": 3320 + }, + { + "entropy": 0.8035802125930787, + "epoch": 6.132596685082873, + "grad_norm": 0.43152952194213867, + "learning_rate": 3.5647953011620716e-05, + "loss": 0.008366625010967254, + "mean_token_accuracy": 0.9969616234302521, + "num_tokens": 68414414.0, + "step": 3330 + }, + { + "entropy": 0.8028200149536133, + "epoch": 6.151012891344383, + "grad_norm": 0.6057612299919128, + "learning_rate": 3.535650047733141e-05, + "loss": 0.00967741459608078, + "mean_token_accuracy": 0.9963694036006927, + "num_tokens": 68619806.0, + "step": 3340 + }, + { + "entropy": 0.7982640087604522, + "epoch": 6.169429097605893, + "grad_norm": 0.4966030716896057, + "learning_rate": 3.5065591169112785e-05, + "loss": 0.00934397652745247, + "mean_token_accuracy": 0.9969527781009674, + "num_tokens": 68825254.0, + "step": 3350 + }, + { + "entropy": 0.7859498977661132, + "epoch": 6.1878453038674035, + "grad_norm": 0.6962474584579468, + "learning_rate": 3.477523587875139e-05, + "loss": 0.010414297878742217, + "mean_token_accuracy": 0.996203750371933, + "num_tokens": 69031043.0, + "step": 3360 + }, + { + "entropy": 0.7869667530059814, + "epoch": 6.206261510128914, + "grad_norm": 0.6078894734382629, + "learning_rate": 3.448544537748143e-05, + "loss": 0.008547455072402954, + "mean_token_accuracy": 0.9968406975269317, + "num_tokens": 69236559.0, + "step": 3370 + }, + { + "entropy": 0.8036401033401489, + "epoch": 6.224677716390423, + "grad_norm": 0.5990306735038757, + "learning_rate": 3.4196230415585337e-05, + "loss": 0.00924447700381279, + "mean_token_accuracy": 0.9967190623283386, + "num_tokens": 69441764.0, + "step": 3380 + }, + { + "entropy": 0.8085561394691467, + "epoch": 6.2430939226519335, + "grad_norm": 0.46029484272003174, + "learning_rate": 3.390760172199486e-05, + "loss": 0.008379801362752914, + "mean_token_accuracy": 0.9970395743846894, + "num_tokens": 69647075.0, + "step": 3390 + }, + { + "entropy": 0.8132422208786011, + "epoch": 6.261510128913444, + "grad_norm": 0.5699496865272522, + "learning_rate": 3.361957000389315e-05, + "loss": 0.009426499903202056, + "mean_token_accuracy": 0.996586662530899, + "num_tokens": 69852809.0, + "step": 3400 + }, + { + "entropy": 0.8065890491008758, + "epoch": 6.279926335174954, + "grad_norm": 0.6212234497070312, + "learning_rate": 3.33321459463175e-05, + "loss": 0.009811153262853622, + "mean_token_accuracy": 0.9965905249118805, + "num_tokens": 70057955.0, + "step": 3410 + }, + { + "entropy": 0.7904254853725433, + "epoch": 6.298342541436464, + "grad_norm": 0.8000790476799011, + "learning_rate": 3.304534021176299e-05, + "loss": 0.00957801640033722, + "mean_token_accuracy": 0.9964518308639526, + "num_tokens": 70263517.0, + "step": 3420 + }, + { + "entropy": 0.7971100151538849, + "epoch": 6.316758747697974, + "grad_norm": 0.6359512209892273, + "learning_rate": 3.275916343978689e-05, + "loss": 0.009681916236877442, + "mean_token_accuracy": 0.9967545390129089, + "num_tokens": 70468400.0, + "step": 3430 + }, + { + "entropy": 0.7987187504768372, + "epoch": 6.335174953959484, + "grad_norm": 0.5094901919364929, + "learning_rate": 3.247362624661406e-05, + "loss": 0.009966370463371278, + "mean_token_accuracy": 0.9964035987854004, + "num_tokens": 70673648.0, + "step": 3440 + }, + { + "entropy": 0.7850228011608124, + "epoch": 6.3535911602209945, + "grad_norm": 0.5554385185241699, + "learning_rate": 3.218873922474303e-05, + "loss": 0.009521079063415528, + "mean_token_accuracy": 0.9966651916503906, + "num_tokens": 70879452.0, + "step": 3450 + }, + { + "entropy": 0.7885844230651855, + "epoch": 6.372007366482505, + "grad_norm": 0.5217951536178589, + "learning_rate": 3.190451294255314e-05, + "loss": 0.00949474424123764, + "mean_token_accuracy": 0.9966598808765411, + "num_tokens": 71085217.0, + "step": 3460 + }, + { + "entropy": 0.797072297334671, + "epoch": 6.390423572744015, + "grad_norm": 0.5385560393333435, + "learning_rate": 3.162095794391241e-05, + "loss": 0.009810312837362289, + "mean_token_accuracy": 0.9965846955776214, + "num_tokens": 71290955.0, + "step": 3470 + }, + { + "entropy": 0.8024774849414825, + "epoch": 6.4088397790055245, + "grad_norm": 0.5419294238090515, + "learning_rate": 3.1338084747786456e-05, + "loss": 0.009127366542816161, + "mean_token_accuracy": 0.9968222141265869, + "num_tokens": 71496654.0, + "step": 3480 + }, + { + "entropy": 0.8082470417022705, + "epoch": 6.427255985267035, + "grad_norm": 0.7315362095832825, + "learning_rate": 3.105590384784821e-05, + "loss": 0.008642691373825073, + "mean_token_accuracy": 0.9970867097377777, + "num_tokens": 71701725.0, + "step": 3490 + }, + { + "entropy": 0.804630172252655, + "epoch": 6.445672191528545, + "grad_norm": 0.6668549180030823, + "learning_rate": 3.0774425712088676e-05, + "loss": 0.008679335564374923, + "mean_token_accuracy": 0.9969714701175689, + "num_tokens": 71907003.0, + "step": 3500 + }, + { + "entropy": 0.7939219176769257, + "epoch": 6.464088397790055, + "grad_norm": 0.8388434648513794, + "learning_rate": 3.049366078242864e-05, + "loss": 0.009249264001846313, + "mean_token_accuracy": 0.99674671292305, + "num_tokens": 72112532.0, + "step": 3510 + }, + { + "entropy": 0.78477823138237, + "epoch": 6.482504604051566, + "grad_norm": 0.4963231682777405, + "learning_rate": 3.021361947433125e-05, + "loss": 0.009192919731140137, + "mean_token_accuracy": 0.9965968191623688, + "num_tokens": 72318518.0, + "step": 3520 + }, + { + "entropy": 0.7841647148132325, + "epoch": 6.500920810313076, + "grad_norm": 0.5681823492050171, + "learning_rate": 2.9934312176415636e-05, + "loss": 0.008821797370910645, + "mean_token_accuracy": 0.9968703150749206, + "num_tokens": 72524548.0, + "step": 3530 + }, + { + "entropy": 0.7848304688930512, + "epoch": 6.519337016574585, + "grad_norm": 0.7126080393791199, + "learning_rate": 2.965574925007154e-05, + "loss": 0.009742744266986847, + "mean_token_accuracy": 0.9964317202568054, + "num_tokens": 72729659.0, + "step": 3540 + }, + { + "entropy": 0.7899512410163879, + "epoch": 6.537753222836096, + "grad_norm": 0.7015056014060974, + "learning_rate": 2.9377941029074986e-05, + "loss": 0.008977667987346649, + "mean_token_accuracy": 0.9968570172786713, + "num_tokens": 72934865.0, + "step": 3550 + }, + { + "entropy": 0.797937935590744, + "epoch": 6.556169429097606, + "grad_norm": 0.700501024723053, + "learning_rate": 2.910089781920486e-05, + "loss": 0.00973074734210968, + "mean_token_accuracy": 0.996515303850174, + "num_tokens": 73139684.0, + "step": 3560 + }, + { + "entropy": 0.7993333518505097, + "epoch": 6.574585635359116, + "grad_norm": 0.44471475481987, + "learning_rate": 2.882462989786061e-05, + "loss": 0.008206719905138016, + "mean_token_accuracy": 0.9968972980976105, + "num_tokens": 73345547.0, + "step": 3570 + }, + { + "entropy": 0.7961922466754914, + "epoch": 6.593001841620627, + "grad_norm": 0.5011329054832458, + "learning_rate": 2.854914751368109e-05, + "loss": 0.009073075652122498, + "mean_token_accuracy": 0.9968676805496216, + "num_tokens": 73550822.0, + "step": 3580 + }, + { + "entropy": 0.8064342319965363, + "epoch": 6.611418047882136, + "grad_norm": 0.6375740766525269, + "learning_rate": 2.82744608861642e-05, + "loss": 0.009309899061918259, + "mean_token_accuracy": 0.9967096745967865, + "num_tokens": 73756564.0, + "step": 3590 + }, + { + "entropy": 0.8025432348251342, + "epoch": 6.629834254143646, + "grad_norm": 0.8449372053146362, + "learning_rate": 2.8000580205287874e-05, + "loss": 0.009333166480064391, + "mean_token_accuracy": 0.9967890501022338, + "num_tokens": 73961849.0, + "step": 3600 + }, + { + "entropy": 0.8010810256004334, + "epoch": 6.648250460405157, + "grad_norm": 0.9473148584365845, + "learning_rate": 2.772751563113213e-05, + "loss": 0.00938543900847435, + "mean_token_accuracy": 0.996571558713913, + "num_tokens": 74167074.0, + "step": 3610 + }, + { + "entropy": 0.7954266011714936, + "epoch": 6.666666666666667, + "grad_norm": 0.4197849631309509, + "learning_rate": 2.7455277293502007e-05, + "loss": 0.008846811950206757, + "mean_token_accuracy": 0.9970432996749878, + "num_tokens": 74372119.0, + "step": 3620 + }, + { + "entropy": 0.7859483778476715, + "epoch": 6.685082872928177, + "grad_norm": 0.5353069305419922, + "learning_rate": 2.7183875291551892e-05, + "loss": 0.008807064592838287, + "mean_token_accuracy": 0.9969651758670807, + "num_tokens": 74577516.0, + "step": 3630 + }, + { + "entropy": 0.7940182387828827, + "epoch": 6.703499079189687, + "grad_norm": 0.5789965391159058, + "learning_rate": 2.6913319693410828e-05, + "loss": 0.008173662424087524, + "mean_token_accuracy": 0.9970715939998627, + "num_tokens": 74783031.0, + "step": 3640 + }, + { + "entropy": 0.7871349632740021, + "epoch": 6.721915285451197, + "grad_norm": 0.5887596011161804, + "learning_rate": 2.6643620535809076e-05, + "loss": 0.008517104387283325, + "mean_token_accuracy": 0.9969267845153809, + "num_tokens": 74988646.0, + "step": 3650 + }, + { + "entropy": 0.783170485496521, + "epoch": 6.740331491712707, + "grad_norm": 0.6228395104408264, + "learning_rate": 2.637478782370574e-05, + "loss": 0.008941689878702164, + "mean_token_accuracy": 0.9967794418334961, + "num_tokens": 75193938.0, + "step": 3660 + }, + { + "entropy": 0.7779926240444184, + "epoch": 6.758747697974218, + "grad_norm": 0.7367292642593384, + "learning_rate": 2.61068315299176e-05, + "loss": 0.009662539511919022, + "mean_token_accuracy": 0.9965554535388946, + "num_tokens": 75399816.0, + "step": 3670 + }, + { + "entropy": 0.7756146490573883, + "epoch": 6.777163904235728, + "grad_norm": 0.7478228807449341, + "learning_rate": 2.5839761594749167e-05, + "loss": 0.008691602945327758, + "mean_token_accuracy": 0.996806287765503, + "num_tokens": 75605531.0, + "step": 3680 + }, + { + "entropy": 0.7793804049491883, + "epoch": 6.795580110497237, + "grad_norm": 0.580205500125885, + "learning_rate": 2.5573587925623964e-05, + "loss": 0.00922732800245285, + "mean_token_accuracy": 0.9966219186782836, + "num_tokens": 75811143.0, + "step": 3690 + }, + { + "entropy": 0.7817609786987305, + "epoch": 6.813996316758748, + "grad_norm": 0.3849862813949585, + "learning_rate": 2.530832039671694e-05, + "loss": 0.00812167227268219, + "mean_token_accuracy": 0.9970280706882477, + "num_tokens": 76016366.0, + "step": 3700 + }, + { + "entropy": 0.7793294489383698, + "epoch": 6.832412523020258, + "grad_norm": 0.4873282313346863, + "learning_rate": 2.504396884858825e-05, + "loss": 0.008183138072490692, + "mean_token_accuracy": 0.9973145961761475, + "num_tokens": 76221692.0, + "step": 3710 + }, + { + "entropy": 0.7805068492889404, + "epoch": 6.850828729281768, + "grad_norm": 0.6652786135673523, + "learning_rate": 2.478054308781807e-05, + "loss": 0.009141853451728821, + "mean_token_accuracy": 0.9968416154384613, + "num_tokens": 76427170.0, + "step": 3720 + }, + { + "entropy": 0.7799863159656525, + "epoch": 6.8692449355432785, + "grad_norm": 0.5895428657531738, + "learning_rate": 2.451805288664298e-05, + "loss": 0.009343943744897842, + "mean_token_accuracy": 0.9968909084796905, + "num_tokens": 76632450.0, + "step": 3730 + }, + { + "entropy": 0.7819362223148346, + "epoch": 6.887661141804788, + "grad_norm": 0.6007734537124634, + "learning_rate": 2.425650798259327e-05, + "loss": 0.008081933856010437, + "mean_token_accuracy": 0.9972956955432892, + "num_tokens": 76837993.0, + "step": 3740 + }, + { + "entropy": 0.7859819054603576, + "epoch": 6.906077348066298, + "grad_norm": 0.5510725975036621, + "learning_rate": 2.39959180781318e-05, + "loss": 0.008848348259925842, + "mean_token_accuracy": 0.9968287885189057, + "num_tokens": 77043697.0, + "step": 3750 + }, + { + "entropy": 0.7813855290412903, + "epoch": 6.9244935543278086, + "grad_norm": 0.5004434585571289, + "learning_rate": 2.3736292840294122e-05, + "loss": 0.00795777291059494, + "mean_token_accuracy": 0.9973017990589141, + "num_tokens": 77248720.0, + "step": 3760 + }, + { + "entropy": 0.774254196882248, + "epoch": 6.942909760589319, + "grad_norm": 0.7068622708320618, + "learning_rate": 2.347764190032974e-05, + "loss": 0.007790238410234451, + "mean_token_accuracy": 0.997188663482666, + "num_tokens": 77454096.0, + "step": 3770 + }, + { + "entropy": 0.7675817251205445, + "epoch": 6.961325966850829, + "grad_norm": 0.5110977292060852, + "learning_rate": 2.3219974853344905e-05, + "loss": 0.008631375432014466, + "mean_token_accuracy": 0.9967362582683563, + "num_tokens": 77659811.0, + "step": 3780 + }, + { + "entropy": 0.7719516515731811, + "epoch": 6.979742173112339, + "grad_norm": 0.6288211941719055, + "learning_rate": 2.2963301257946622e-05, + "loss": 0.00804171860218048, + "mean_token_accuracy": 0.9971263229846954, + "num_tokens": 77865539.0, + "step": 3790 + }, + { + "entropy": 0.7786632418632508, + "epoch": 6.998158379373849, + "grad_norm": 0.5279833078384399, + "learning_rate": 2.270763063588814e-05, + "loss": 0.007490953803062439, + "mean_token_accuracy": 0.9974353730678558, + "num_tokens": 78070767.0, + "step": 3800 + }, + { + "epoch": 7.0, + "eval_entropy": 0.7808213239130767, + "eval_loss": 0.07382760941982269, + "eval_mean_token_accuracy": 0.9800234224485315, + "eval_num_tokens": 78091327.0, + "eval_runtime": 10.072, + "eval_samples_per_second": 363.186, + "eval_steps_per_second": 11.418, + "step": 3801 + }, + { + "entropy": 0.7750412881374359, + "epoch": 7.016574585635359, + "grad_norm": 0.3850567936897278, + "learning_rate": 2.2452972471715644e-05, + "loss": 0.005539501458406449, + "mean_token_accuracy": 0.9983771502971649, + "num_tokens": 78276132.0, + "step": 3810 + }, + { + "entropy": 0.7673899948596954, + "epoch": 7.0349907918968695, + "grad_norm": 0.4390123188495636, + "learning_rate": 2.2199336212416406e-05, + "loss": 0.0051019065082073215, + "mean_token_accuracy": 0.9984941363334656, + "num_tokens": 78481842.0, + "step": 3820 + }, + { + "entropy": 0.7712743639945984, + "epoch": 7.05340699815838, + "grad_norm": 0.48151132464408875, + "learning_rate": 2.1946731267068386e-05, + "loss": 0.005401181802153587, + "mean_token_accuracy": 0.9984619855880738, + "num_tokens": 78687831.0, + "step": 3830 + }, + { + "entropy": 0.772344833612442, + "epoch": 7.071823204419889, + "grad_norm": 0.3234920799732208, + "learning_rate": 2.169516700649115e-05, + "loss": 0.004806514084339142, + "mean_token_accuracy": 0.9984551191329956, + "num_tokens": 78893336.0, + "step": 3840 + }, + { + "entropy": 0.7736243844032288, + "epoch": 7.0902394106813995, + "grad_norm": 0.4605523645877838, + "learning_rate": 2.1444652762898242e-05, + "loss": 0.0041438989341259, + "mean_token_accuracy": 0.9988476693630218, + "num_tokens": 79098566.0, + "step": 3850 + }, + { + "entropy": 0.7667625486850739, + "epoch": 7.10865561694291, + "grad_norm": 0.43270638585090637, + "learning_rate": 2.119519782955105e-05, + "loss": 0.004775972291827202, + "mean_token_accuracy": 0.9984218835830688, + "num_tokens": 79303871.0, + "step": 3860 + }, + { + "entropy": 0.7631282329559326, + "epoch": 7.12707182320442, + "grad_norm": 0.35699328780174255, + "learning_rate": 2.094681146041394e-05, + "loss": 0.00421409159898758, + "mean_token_accuracy": 0.9988139629364013, + "num_tokens": 79509090.0, + "step": 3870 + }, + { + "entropy": 0.7578658938407898, + "epoch": 7.14548802946593, + "grad_norm": 0.6432686448097229, + "learning_rate": 2.06995028698111e-05, + "loss": 0.004374136403203011, + "mean_token_accuracy": 0.998746919631958, + "num_tokens": 79714834.0, + "step": 3880 + }, + { + "entropy": 0.7558880388736725, + "epoch": 7.16390423572744, + "grad_norm": 0.7402953505516052, + "learning_rate": 2.0453281232084586e-05, + "loss": 0.004856631904840469, + "mean_token_accuracy": 0.9985869526863098, + "num_tokens": 79920226.0, + "step": 3890 + }, + { + "entropy": 0.7542947113513947, + "epoch": 7.18232044198895, + "grad_norm": 0.3336258828639984, + "learning_rate": 2.0208155681254076e-05, + "loss": 0.0044605318456888195, + "mean_token_accuracy": 0.9986851871013641, + "num_tokens": 80125096.0, + "step": 3900 + }, + { + "entropy": 0.7547785460948944, + "epoch": 7.2007366482504604, + "grad_norm": 0.48785507678985596, + "learning_rate": 1.9964135310678017e-05, + "loss": 0.004243453219532967, + "mean_token_accuracy": 0.99877148270607, + "num_tokens": 80330544.0, + "step": 3910 + }, + { + "entropy": 0.7504124104976654, + "epoch": 7.219152854511971, + "grad_norm": 0.462425172328949, + "learning_rate": 1.9721229172716245e-05, + "loss": 0.004574070125818253, + "mean_token_accuracy": 0.9984130024909973, + "num_tokens": 80536072.0, + "step": 3920 + }, + { + "entropy": 0.7527936816215515, + "epoch": 7.237569060773481, + "grad_norm": 0.38035058975219727, + "learning_rate": 1.9479446278394208e-05, + "loss": 0.004055039957165718, + "mean_token_accuracy": 0.9987563371658326, + "num_tokens": 80741305.0, + "step": 3930 + }, + { + "entropy": 0.7502566337585449, + "epoch": 7.2559852670349905, + "grad_norm": 0.3040870428085327, + "learning_rate": 1.9238795597068665e-05, + "loss": 0.0041418131440877914, + "mean_token_accuracy": 0.998740965127945, + "num_tokens": 80946776.0, + "step": 3940 + }, + { + "entropy": 0.7453009426593781, + "epoch": 7.274401473296501, + "grad_norm": 0.39008331298828125, + "learning_rate": 1.8999286056095e-05, + "loss": 0.003949865326285362, + "mean_token_accuracy": 0.9989946007728576, + "num_tokens": 81151930.0, + "step": 3950 + }, + { + "entropy": 0.7451439797878265, + "epoch": 7.292817679558011, + "grad_norm": 0.44502392411231995, + "learning_rate": 1.8760926540496006e-05, + "loss": 0.0047814734280109406, + "mean_token_accuracy": 0.9984888076782227, + "num_tokens": 81357559.0, + "step": 3960 + }, + { + "entropy": 0.749438214302063, + "epoch": 7.311233885819521, + "grad_norm": 0.5195235013961792, + "learning_rate": 1.8523725892632253e-05, + "loss": 0.004281774908304214, + "mean_token_accuracy": 0.9987141609191894, + "num_tokens": 81563294.0, + "step": 3970 + }, + { + "entropy": 0.7536393761634826, + "epoch": 7.329650092081032, + "grad_norm": 0.34366822242736816, + "learning_rate": 1.828769291187413e-05, + "loss": 0.0038790594786405562, + "mean_token_accuracy": 0.9988636136054992, + "num_tokens": 81768219.0, + "step": 3980 + }, + { + "entropy": 0.750888729095459, + "epoch": 7.348066298342541, + "grad_norm": 0.3888038098812103, + "learning_rate": 1.8052836354275355e-05, + "loss": 0.0046471841633319855, + "mean_token_accuracy": 0.9985671877861023, + "num_tokens": 81973285.0, + "step": 3990 + }, + { + "entropy": 0.7512096881866455, + "epoch": 7.366482504604051, + "grad_norm": 0.4684146046638489, + "learning_rate": 1.7819164932248194e-05, + "loss": 0.0049116648733615875, + "mean_token_accuracy": 0.9983943462371826, + "num_tokens": 82178663.0, + "step": 4000 + }, + { + "entropy": 0.7514511585235596, + "epoch": 7.384898710865562, + "grad_norm": 0.571757435798645, + "learning_rate": 1.7586687314240296e-05, + "loss": 0.005085925757884979, + "mean_token_accuracy": 0.9984491765499115, + "num_tokens": 82384703.0, + "step": 4010 + }, + { + "entropy": 0.7534020125865937, + "epoch": 7.403314917127072, + "grad_norm": 0.3526608943939209, + "learning_rate": 1.7355412124412988e-05, + "loss": 0.004772019758820534, + "mean_token_accuracy": 0.9984830975532532, + "num_tokens": 82590547.0, + "step": 4020 + }, + { + "entropy": 0.7495484054088593, + "epoch": 7.421731123388582, + "grad_norm": 0.389273464679718, + "learning_rate": 1.7125347942321523e-05, + "loss": 0.004463380947709083, + "mean_token_accuracy": 0.9985935151576996, + "num_tokens": 82795801.0, + "step": 4030 + }, + { + "entropy": 0.7503954172134399, + "epoch": 7.440147329650092, + "grad_norm": 0.44278526306152344, + "learning_rate": 1.689650330259665e-05, + "loss": 0.004496005177497864, + "mean_token_accuracy": 0.9985817015171051, + "num_tokens": 83001332.0, + "step": 4040 + }, + { + "entropy": 0.7515525698661805, + "epoch": 7.458563535911602, + "grad_norm": 0.418070524930954, + "learning_rate": 1.666888669462809e-05, + "loss": 0.004161220416426659, + "mean_token_accuracy": 0.9988651812076569, + "num_tokens": 83206211.0, + "step": 4050 + }, + { + "entropy": 0.747660368680954, + "epoch": 7.476979742173112, + "grad_norm": 0.52656090259552, + "learning_rate": 1.6442506562249622e-05, + "loss": 0.0042838241904973985, + "mean_token_accuracy": 0.9985848963260651, + "num_tokens": 83411773.0, + "step": 4060 + }, + { + "entropy": 0.7506191194057464, + "epoch": 7.495395948434623, + "grad_norm": 0.29261597990989685, + "learning_rate": 1.621737130342578e-05, + "loss": 0.003957664594054222, + "mean_token_accuracy": 0.9986599206924438, + "num_tokens": 83617231.0, + "step": 4070 + }, + { + "entropy": 0.7549885094165802, + "epoch": 7.513812154696133, + "grad_norm": 0.35437923669815063, + "learning_rate": 1.599348926994036e-05, + "loss": 0.0034299422055482864, + "mean_token_accuracy": 0.9990102112293243, + "num_tokens": 83822893.0, + "step": 4080 + }, + { + "entropy": 0.7578713536262512, + "epoch": 7.532228360957642, + "grad_norm": 0.38587674498558044, + "learning_rate": 1.5770868767086567e-05, + "loss": 0.003320001810789108, + "mean_token_accuracy": 0.9990382492542267, + "num_tokens": 84028376.0, + "step": 4090 + }, + { + "entropy": 0.754696124792099, + "epoch": 7.550644567219153, + "grad_norm": 0.39879584312438965, + "learning_rate": 1.554951805335897e-05, + "loss": 0.004190019145607948, + "mean_token_accuracy": 0.9987861573696136, + "num_tokens": 84233768.0, + "step": 4100 + }, + { + "entropy": 0.7515742480754852, + "epoch": 7.569060773480663, + "grad_norm": 0.47624126076698303, + "learning_rate": 1.5329445340147096e-05, + "loss": 0.00403064489364624, + "mean_token_accuracy": 0.9986754775047302, + "num_tokens": 84439923.0, + "step": 4110 + }, + { + "entropy": 0.7527350902557373, + "epoch": 7.587476979742173, + "grad_norm": 0.4721614718437195, + "learning_rate": 1.5110658791430804e-05, + "loss": 0.004505171626806259, + "mean_token_accuracy": 0.9985378623008728, + "num_tokens": 84645432.0, + "step": 4120 + }, + { + "entropy": 0.747485089302063, + "epoch": 7.605893186003684, + "grad_norm": 0.49411219358444214, + "learning_rate": 1.4893166523477448e-05, + "loss": 0.0038516007363796232, + "mean_token_accuracy": 0.9987127304077148, + "num_tokens": 84850968.0, + "step": 4130 + }, + { + "entropy": 0.7463041722774506, + "epoch": 7.624309392265193, + "grad_norm": 0.4478297531604767, + "learning_rate": 1.4676976604540787e-05, + "loss": 0.00429936945438385, + "mean_token_accuracy": 0.9987892746925354, + "num_tokens": 85056363.0, + "step": 4140 + }, + { + "entropy": 0.7416360318660736, + "epoch": 7.642725598526703, + "grad_norm": 0.4913847744464874, + "learning_rate": 1.4462097054561675e-05, + "loss": 0.0036755587905645372, + "mean_token_accuracy": 0.9989015281200408, + "num_tokens": 85262252.0, + "step": 4150 + }, + { + "entropy": 0.7403277635574341, + "epoch": 7.661141804788214, + "grad_norm": 0.49693024158477783, + "learning_rate": 1.4248535844870586e-05, + "loss": 0.0037889480590820312, + "mean_token_accuracy": 0.99878990650177, + "num_tokens": 85468067.0, + "step": 4160 + }, + { + "entropy": 0.7400458335876465, + "epoch": 7.679558011049724, + "grad_norm": 0.4609115421772003, + "learning_rate": 1.4036300897891819e-05, + "loss": 0.004160438477993011, + "mean_token_accuracy": 0.9985541105270386, + "num_tokens": 85673442.0, + "step": 4170 + }, + { + "entropy": 0.7405486226081848, + "epoch": 7.697974217311234, + "grad_norm": 0.4713679254055023, + "learning_rate": 1.3825400086849693e-05, + "loss": 0.004131061211228371, + "mean_token_accuracy": 0.9986487686634063, + "num_tokens": 85878837.0, + "step": 4180 + }, + { + "entropy": 0.7426068425178528, + "epoch": 7.716390423572744, + "grad_norm": 0.36043041944503784, + "learning_rate": 1.3615841235476423e-05, + "loss": 0.004306273162364959, + "mean_token_accuracy": 0.9986724078655242, + "num_tokens": 86083884.0, + "step": 4190 + }, + { + "entropy": 0.7389791548252106, + "epoch": 7.734806629834254, + "grad_norm": 0.4564935863018036, + "learning_rate": 1.3407632117721858e-05, + "loss": 0.003909315168857575, + "mean_token_accuracy": 0.998731005191803, + "num_tokens": 86289698.0, + "step": 4200 + }, + { + "entropy": 0.7440706253051758, + "epoch": 7.753222836095764, + "grad_norm": 0.442862331867218, + "learning_rate": 1.3200780457465211e-05, + "loss": 0.0041195075958967205, + "mean_token_accuracy": 0.9987683832645416, + "num_tokens": 86494738.0, + "step": 4210 + }, + { + "entropy": 0.738617730140686, + "epoch": 7.7716390423572745, + "grad_norm": 0.4486972391605377, + "learning_rate": 1.2995293928228385e-05, + "loss": 0.003850420191884041, + "mean_token_accuracy": 0.998667311668396, + "num_tokens": 86700730.0, + "step": 4220 + }, + { + "entropy": 0.7406526923179626, + "epoch": 7.790055248618785, + "grad_norm": 0.5896158218383789, + "learning_rate": 1.2791180152891396e-05, + "loss": 0.004078804701566696, + "mean_token_accuracy": 0.9987831771373749, + "num_tokens": 86905871.0, + "step": 4230 + }, + { + "entropy": 0.7394271969795227, + "epoch": 7.808471454880294, + "grad_norm": 0.5551350116729736, + "learning_rate": 1.2588446703409552e-05, + "loss": 0.004226792231202125, + "mean_token_accuracy": 0.9985223591327668, + "num_tokens": 87111128.0, + "step": 4240 + }, + { + "entropy": 0.7396899223327636, + "epoch": 7.826887661141805, + "grad_norm": 0.48465287685394287, + "learning_rate": 1.23871011005326e-05, + "loss": 0.004565985128283501, + "mean_token_accuracy": 0.9984864890575409, + "num_tokens": 87316264.0, + "step": 4250 + }, + { + "entropy": 0.7409733414649964, + "epoch": 7.845303867403315, + "grad_norm": 0.44551214575767517, + "learning_rate": 1.218715081352571e-05, + "loss": 0.004014456272125244, + "mean_token_accuracy": 0.9988965094089508, + "num_tokens": 87521547.0, + "step": 4260 + }, + { + "entropy": 0.7397344529628753, + "epoch": 7.863720073664825, + "grad_norm": 0.5410996079444885, + "learning_rate": 1.198860325989235e-05, + "loss": 0.0038732051849365234, + "mean_token_accuracy": 0.9988058865070343, + "num_tokens": 87727075.0, + "step": 4270 + }, + { + "entropy": 0.7383821964263916, + "epoch": 7.8821362799263355, + "grad_norm": 0.3724612891674042, + "learning_rate": 1.1791465805099183e-05, + "loss": 0.0038180787116289137, + "mean_token_accuracy": 0.9988650500774383, + "num_tokens": 87932399.0, + "step": 4280 + }, + { + "entropy": 0.73704674243927, + "epoch": 7.900552486187845, + "grad_norm": 0.44809216260910034, + "learning_rate": 1.1595745762302779e-05, + "loss": 0.0037666790187358854, + "mean_token_accuracy": 0.9988551497459411, + "num_tokens": 88138197.0, + "step": 4290 + }, + { + "entropy": 0.7348978996276856, + "epoch": 7.918968692449355, + "grad_norm": 0.3487635850906372, + "learning_rate": 1.140145039207836e-05, + "loss": 0.0034891828894615174, + "mean_token_accuracy": 0.9990123450756073, + "num_tokens": 88343771.0, + "step": 4300 + }, + { + "entropy": 0.7349947333335877, + "epoch": 7.9373848987108655, + "grad_norm": 0.45597800612449646, + "learning_rate": 1.1208586902150458e-05, + "loss": 0.0037573061883449553, + "mean_token_accuracy": 0.9988206088542938, + "num_tokens": 88549078.0, + "step": 4310 + }, + { + "entropy": 0.7384503066539765, + "epoch": 7.955801104972376, + "grad_norm": 0.4761682450771332, + "learning_rate": 1.1017162447125484e-05, + "loss": 0.004058422148227691, + "mean_token_accuracy": 0.9988016963005066, + "num_tokens": 88754354.0, + "step": 4320 + }, + { + "entropy": 0.7428180873394012, + "epoch": 7.974217311233886, + "grad_norm": 0.4685237407684326, + "learning_rate": 1.0827184128226392e-05, + "loss": 0.003867045044898987, + "mean_token_accuracy": 0.998711907863617, + "num_tokens": 88959534.0, + "step": 4330 + }, + { + "entropy": 0.7457234025001526, + "epoch": 7.9926335174953955, + "grad_norm": 0.35596850514411926, + "learning_rate": 1.0638658993029154e-05, + "loss": 0.0037776529788970947, + "mean_token_accuracy": 0.9989305913448334, + "num_tokens": 89164857.0, + "step": 4340 + }, + { + "epoch": 8.0, + "eval_entropy": 0.7504938332930855, + "eval_loss": 0.08306439220905304, + "eval_mean_token_accuracy": 0.980349570253621, + "eval_num_tokens": 89247188.0, + "eval_runtime": 10.0529, + "eval_samples_per_second": 363.875, + "eval_steps_per_second": 11.439, + "step": 4344 + }, + { + "entropy": 0.7459707975387573, + "epoch": 8.011049723756907, + "grad_norm": 0.2889060974121094, + "learning_rate": 1.0451594035201378e-05, + "loss": 0.003062780387699604, + "mean_token_accuracy": 0.9991089224815368, + "num_tokens": 89370847.0, + "step": 4350 + }, + { + "entropy": 0.7482452511787414, + "epoch": 8.029465930018416, + "grad_norm": 0.2335795760154724, + "learning_rate": 1.0265996194242888e-05, + "loss": 0.0023574704304337502, + "mean_token_accuracy": 0.9994185745716095, + "num_tokens": 89576321.0, + "step": 4360 + }, + { + "entropy": 0.7470004737377167, + "epoch": 8.047882136279926, + "grad_norm": 0.2035285383462906, + "learning_rate": 1.0081872355228228e-05, + "loss": 0.0021786754950881004, + "mean_token_accuracy": 0.9996390819549561, + "num_tokens": 89781865.0, + "step": 4370 + }, + { + "entropy": 0.7470916926860809, + "epoch": 8.066298342541437, + "grad_norm": 0.2530227303504944, + "learning_rate": 9.899229348551275e-06, + "loss": 0.0022982701659202574, + "mean_token_accuracy": 0.9995273351669312, + "num_tokens": 89987004.0, + "step": 4380 + }, + { + "entropy": 0.7481065988540649, + "epoch": 8.084714548802946, + "grad_norm": 0.269809752702713, + "learning_rate": 9.718073949671857e-06, + "loss": 0.0022342003881931304, + "mean_token_accuracy": 0.9994472205638886, + "num_tokens": 90191825.0, + "step": 4390 + }, + { + "entropy": 0.745439088344574, + "epoch": 8.103130755064457, + "grad_norm": 0.29525282979011536, + "learning_rate": 9.538412878864423e-06, + "loss": 0.002189977839589119, + "mean_token_accuracy": 0.9995133578777313, + "num_tokens": 90397588.0, + "step": 4400 + }, + { + "entropy": 0.745042335987091, + "epoch": 8.121546961325967, + "grad_norm": 0.26536691188812256, + "learning_rate": 9.360252800968717e-06, + "loss": 0.0021448172628879547, + "mean_token_accuracy": 0.9994488894939423, + "num_tokens": 90602602.0, + "step": 4410 + }, + { + "entropy": 0.7413490653038025, + "epoch": 8.139963167587476, + "grad_norm": 0.5203945636749268, + "learning_rate": 9.183600325142538e-06, + "loss": 0.002386796101927757, + "mean_token_accuracy": 0.999308317899704, + "num_tokens": 90808417.0, + "step": 4420 + }, + { + "entropy": 0.7447272837162018, + "epoch": 8.158379373848987, + "grad_norm": 0.23244412243366241, + "learning_rate": 9.008462004616558e-06, + "loss": 0.0021626869216561317, + "mean_token_accuracy": 0.9995910286903381, + "num_tokens": 91013911.0, + "step": 4430 + }, + { + "entropy": 0.7487411558628082, + "epoch": 8.176795580110497, + "grad_norm": 0.2531150281429291, + "learning_rate": 8.834844336451237e-06, + "loss": 0.0023509185761213303, + "mean_token_accuracy": 0.9994486331939697, + "num_tokens": 91219200.0, + "step": 4440 + }, + { + "entropy": 0.7508959770202637, + "epoch": 8.195211786372008, + "grad_norm": 0.3283107876777649, + "learning_rate": 8.662753761295772e-06, + "loss": 0.0020494431257247923, + "mean_token_accuracy": 0.9996370017528534, + "num_tokens": 91424124.0, + "step": 4450 + }, + { + "entropy": 0.7488435864448547, + "epoch": 8.213627992633517, + "grad_norm": 0.394175261259079, + "learning_rate": 8.492196663149232e-06, + "loss": 0.0024197638034820558, + "mean_token_accuracy": 0.999306058883667, + "num_tokens": 91629365.0, + "step": 4460 + }, + { + "entropy": 0.7459348142147064, + "epoch": 8.232044198895027, + "grad_norm": 0.36789506673812866, + "learning_rate": 8.32317936912364e-06, + "loss": 0.0022338634356856347, + "mean_token_accuracy": 0.9994658648967742, + "num_tokens": 91834941.0, + "step": 4470 + }, + { + "entropy": 0.7455970704555511, + "epoch": 8.250460405156538, + "grad_norm": 0.2969403564929962, + "learning_rate": 8.155708149209362e-06, + "loss": 0.0021990347653627396, + "mean_token_accuracy": 0.9994347035884857, + "num_tokens": 92040366.0, + "step": 4480 + }, + { + "entropy": 0.7435900688171386, + "epoch": 8.268876611418047, + "grad_norm": 0.35975271463394165, + "learning_rate": 7.989789216042415e-06, + "loss": 0.002438249811530113, + "mean_token_accuracy": 0.9993879854679107, + "num_tokens": 92246051.0, + "step": 4490 + }, + { + "entropy": 0.742461520433426, + "epoch": 8.287292817679559, + "grad_norm": 0.33160826563835144, + "learning_rate": 7.825428724674043e-06, + "loss": 0.0023146603256464005, + "mean_token_accuracy": 0.9994353473186492, + "num_tokens": 92451596.0, + "step": 4500 + }, + { + "entropy": 0.7435722470283508, + "epoch": 8.305709023941068, + "grad_norm": 0.3440259099006653, + "learning_rate": 7.662632772342415e-06, + "loss": 0.0021770250052213667, + "mean_token_accuracy": 0.9994809687137604, + "num_tokens": 92657419.0, + "step": 4510 + }, + { + "entropy": 0.7460585415363312, + "epoch": 8.324125230202577, + "grad_norm": 0.24974678456783295, + "learning_rate": 7.501407398246369e-06, + "loss": 0.0020514041185379027, + "mean_token_accuracy": 0.9994971275329589, + "num_tokens": 92863210.0, + "step": 4520 + }, + { + "entropy": 0.7475444614887238, + "epoch": 8.342541436464089, + "grad_norm": 0.31347015500068665, + "learning_rate": 7.3417585833214346e-06, + "loss": 0.0022453794255852698, + "mean_token_accuracy": 0.9995283901691436, + "num_tokens": 93068502.0, + "step": 4530 + }, + { + "entropy": 0.7466561555862427, + "epoch": 8.360957642725598, + "grad_norm": 0.2352103888988495, + "learning_rate": 7.183692250017915e-06, + "loss": 0.0020875211805105208, + "mean_token_accuracy": 0.999623715877533, + "num_tokens": 93274067.0, + "step": 4540 + }, + { + "entropy": 0.7491689443588256, + "epoch": 8.37937384898711, + "grad_norm": 0.2557368576526642, + "learning_rate": 7.027214262081239e-06, + "loss": 0.002048984169960022, + "mean_token_accuracy": 0.999556976556778, + "num_tokens": 93479279.0, + "step": 4550 + }, + { + "entropy": 0.744943630695343, + "epoch": 8.397790055248619, + "grad_norm": 0.2464229315519333, + "learning_rate": 6.872330424334395e-06, + "loss": 0.002088337019085884, + "mean_token_accuracy": 0.9994827687740326, + "num_tokens": 93684930.0, + "step": 4560 + }, + { + "entropy": 0.7461286425590515, + "epoch": 8.416206261510128, + "grad_norm": 0.2721276879310608, + "learning_rate": 6.719046482462571e-06, + "loss": 0.0020654335618019103, + "mean_token_accuracy": 0.9994162619113922, + "num_tokens": 93889972.0, + "step": 4570 + }, + { + "entropy": 0.7432287812232972, + "epoch": 8.43462246777164, + "grad_norm": 0.2500509023666382, + "learning_rate": 6.567368122800072e-06, + "loss": 0.002229658514261246, + "mean_token_accuracy": 0.9993383109569549, + "num_tokens": 94095488.0, + "step": 4580 + }, + { + "entropy": 0.742755651473999, + "epoch": 8.453038674033149, + "grad_norm": 0.17173363268375397, + "learning_rate": 6.4173009721193115e-06, + "loss": 0.0021433889865875243, + "mean_token_accuracy": 0.9994635343551636, + "num_tokens": 94300651.0, + "step": 4590 + }, + { + "entropy": 0.7388956308364868, + "epoch": 8.47145488029466, + "grad_norm": 0.274360716342926, + "learning_rate": 6.26885059742211e-06, + "loss": 0.0022360695526003837, + "mean_token_accuracy": 0.9994634568691254, + "num_tokens": 94506048.0, + "step": 4600 + }, + { + "entropy": 0.7357347309589386, + "epoch": 8.48987108655617, + "grad_norm": 0.24777138233184814, + "learning_rate": 6.122022505733205e-06, + "loss": 0.0022206470370292664, + "mean_token_accuracy": 0.9994794666767121, + "num_tokens": 94711514.0, + "step": 4610 + }, + { + "entropy": 0.735917067527771, + "epoch": 8.50828729281768, + "grad_norm": 0.3161139488220215, + "learning_rate": 5.976822143895872e-06, + "loss": 0.002088923379778862, + "mean_token_accuracy": 0.9994975507259369, + "num_tokens": 94917150.0, + "step": 4620 + }, + { + "entropy": 0.7372638165950776, + "epoch": 8.52670349907919, + "grad_norm": 0.20879895985126495, + "learning_rate": 5.833254898369972e-06, + "loss": 0.0024028895422816277, + "mean_token_accuracy": 0.9993885040283204, + "num_tokens": 95122554.0, + "step": 4630 + }, + { + "entropy": 0.7353293180465699, + "epoch": 8.5451197053407, + "grad_norm": 0.21544156968593597, + "learning_rate": 5.69132609503204e-06, + "loss": 0.0022050481289625167, + "mean_token_accuracy": 0.9993851661682129, + "num_tokens": 95327539.0, + "step": 4640 + }, + { + "entropy": 0.7328085541725159, + "epoch": 8.56353591160221, + "grad_norm": 0.2792287766933441, + "learning_rate": 5.551040998977747e-06, + "loss": 0.0022569041699171065, + "mean_token_accuracy": 0.999373483657837, + "num_tokens": 95533608.0, + "step": 4650 + }, + { + "entropy": 0.7355918467044831, + "epoch": 8.58195211786372, + "grad_norm": 0.35147684812545776, + "learning_rate": 5.412404814326633e-06, + "loss": 0.001975206658244133, + "mean_token_accuracy": 0.9996055364608765, + "num_tokens": 95738975.0, + "step": 4660 + }, + { + "entropy": 0.7370502591133118, + "epoch": 8.600368324125231, + "grad_norm": 0.2416532039642334, + "learning_rate": 5.2754226840289415e-06, + "loss": 0.002513406053185463, + "mean_token_accuracy": 0.9993549644947052, + "num_tokens": 95944370.0, + "step": 4670 + }, + { + "entropy": 0.737632417678833, + "epoch": 8.61878453038674, + "grad_norm": 0.3952357769012451, + "learning_rate": 5.140099689674926e-06, + "loss": 0.001944526843726635, + "mean_token_accuracy": 0.9995273172855377, + "num_tokens": 96149509.0, + "step": 4680 + }, + { + "entropy": 0.7384664714336395, + "epoch": 8.63720073664825, + "grad_norm": 0.40997347235679626, + "learning_rate": 5.006440851306315e-06, + "loss": 0.00224909633398056, + "mean_token_accuracy": 0.9993718564510345, + "num_tokens": 96354896.0, + "step": 4690 + }, + { + "entropy": 0.7366916120052338, + "epoch": 8.655616942909761, + "grad_norm": 0.3221156597137451, + "learning_rate": 4.874451127230057e-06, + "loss": 0.001974274218082428, + "mean_token_accuracy": 0.9994946360588074, + "num_tokens": 96559981.0, + "step": 4700 + }, + { + "entropy": 0.7354292273521423, + "epoch": 8.67403314917127, + "grad_norm": 0.25105226039886475, + "learning_rate": 4.744135413834427e-06, + "loss": 0.002092510275542736, + "mean_token_accuracy": 0.9995289027690888, + "num_tokens": 96765568.0, + "step": 4710 + }, + { + "entropy": 0.7376088976860047, + "epoch": 8.692449355432782, + "grad_norm": 0.2917911410331726, + "learning_rate": 4.615498545407343e-06, + "loss": 0.0022462595254182814, + "mean_token_accuracy": 0.9993582189083099, + "num_tokens": 96971275.0, + "step": 4720 + }, + { + "entropy": 0.74048610329628, + "epoch": 8.710865561694291, + "grad_norm": 0.27348214387893677, + "learning_rate": 4.4885452939570585e-06, + "loss": 0.0022259410470724106, + "mean_token_accuracy": 0.9994804978370666, + "num_tokens": 97176487.0, + "step": 4730 + }, + { + "entropy": 0.7381387889385224, + "epoch": 8.7292817679558, + "grad_norm": 0.43880945444107056, + "learning_rate": 4.363280369035128e-06, + "loss": 0.002389534562826157, + "mean_token_accuracy": 0.9994222104549408, + "num_tokens": 97382263.0, + "step": 4740 + }, + { + "entropy": 0.7372830450534821, + "epoch": 8.747697974217312, + "grad_norm": 0.23599931597709656, + "learning_rate": 4.2397084175616885e-06, + "loss": 0.0020194988697767257, + "mean_token_accuracy": 0.9995136618614197, + "num_tokens": 97588269.0, + "step": 4750 + }, + { + "entropy": 0.7386971414089203, + "epoch": 8.766114180478821, + "grad_norm": 0.42252203822135925, + "learning_rate": 4.117834023653117e-06, + "loss": 0.0021715080365538597, + "mean_token_accuracy": 0.9993864893913269, + "num_tokens": 97793832.0, + "step": 4760 + }, + { + "entropy": 0.7398471057415008, + "epoch": 8.784530386740332, + "grad_norm": 0.2709618806838989, + "learning_rate": 3.9976617084519e-06, + "loss": 0.0023746009916067123, + "mean_token_accuracy": 0.9993088185787201, + "num_tokens": 97999276.0, + "step": 4770 + }, + { + "entropy": 0.7391557276248932, + "epoch": 8.802946593001842, + "grad_norm": 0.30807623267173767, + "learning_rate": 3.8791959299589895e-06, + "loss": 0.001992644742131233, + "mean_token_accuracy": 0.9994801640510559, + "num_tokens": 98204933.0, + "step": 4780 + }, + { + "entropy": 0.7370819032192231, + "epoch": 8.821362799263351, + "grad_norm": 0.25062650442123413, + "learning_rate": 3.762441082868373e-06, + "loss": 0.002125708945095539, + "mean_token_accuracy": 0.999417644739151, + "num_tokens": 98410574.0, + "step": 4790 + }, + { + "entropy": 0.7399656236171722, + "epoch": 8.839779005524862, + "grad_norm": 0.32383784651756287, + "learning_rate": 3.647401498404052e-06, + "loss": 0.0019155235961079598, + "mean_token_accuracy": 0.9995088458061219, + "num_tokens": 98615509.0, + "step": 4800 + }, + { + "entropy": 0.7371088266372681, + "epoch": 8.858195211786372, + "grad_norm": 0.2771705985069275, + "learning_rate": 3.5340814441594207e-06, + "loss": 0.0020927552133798598, + "mean_token_accuracy": 0.9995276153087616, + "num_tokens": 98820673.0, + "step": 4810 + }, + { + "entropy": 0.7376474678516388, + "epoch": 8.876611418047883, + "grad_norm": 0.37182608246803284, + "learning_rate": 3.422485123938862e-06, + "loss": 0.0021230582147836684, + "mean_token_accuracy": 0.9994983315467835, + "num_tokens": 99026388.0, + "step": 4820 + }, + { + "entropy": 0.7386386632919312, + "epoch": 8.895027624309392, + "grad_norm": 0.29156213998794556, + "learning_rate": 3.3126166776018763e-06, + "loss": 0.0022009313106536865, + "mean_token_accuracy": 0.9994311630725861, + "num_tokens": 99231572.0, + "step": 4830 + }, + { + "entropy": 0.7364113330841064, + "epoch": 8.913443830570902, + "grad_norm": 0.32613781094551086, + "learning_rate": 3.2044801809094805e-06, + "loss": 0.0021279999986290933, + "mean_token_accuracy": 0.999450010061264, + "num_tokens": 99437158.0, + "step": 4840 + }, + { + "entropy": 0.7359442055225373, + "epoch": 8.931860036832413, + "grad_norm": 0.34959548711776733, + "learning_rate": 3.098079645372992e-06, + "loss": 0.002106292359530926, + "mean_token_accuracy": 0.9993851602077484, + "num_tokens": 99642599.0, + "step": 4850 + }, + { + "entropy": 0.7357041001319885, + "epoch": 8.950276243093922, + "grad_norm": 0.3114016056060791, + "learning_rate": 2.993419018105248e-06, + "loss": 0.002017174661159515, + "mean_token_accuracy": 0.9994815111160278, + "num_tokens": 99848146.0, + "step": 4860 + }, + { + "entropy": 0.7360438585281373, + "epoch": 8.968692449355434, + "grad_norm": 0.1850976198911667, + "learning_rate": 2.890502181674154e-06, + "loss": 0.0017870433628559112, + "mean_token_accuracy": 0.9996066927909851, + "num_tokens": 100053727.0, + "step": 4870 + }, + { + "entropy": 0.7401692926883697, + "epoch": 8.987108655616943, + "grad_norm": 0.22801977396011353, + "learning_rate": 2.7893329539586678e-06, + "loss": 0.0018131747841835023, + "mean_token_accuracy": 0.9995904862880707, + "num_tokens": 100259212.0, + "step": 4880 + }, + { + "epoch": 9.0, + "eval_entropy": 0.7392174223194952, + "eval_loss": 0.08729223161935806, + "eval_mean_token_accuracy": 0.9803872994754625, + "eval_num_tokens": 100403043.0, + "eval_runtime": 10.0788, + "eval_samples_per_second": 362.941, + "eval_steps_per_second": 11.41, + "step": 4887 + }, + { + "entropy": 0.7386398851871491, + "epoch": 9.005524861878452, + "grad_norm": 0.28971442580223083, + "learning_rate": 2.689915088007161e-06, + "loss": 0.0019765887409448624, + "mean_token_accuracy": 0.9995899498462677, + "num_tokens": 100464463.0, + "step": 4890 + }, + { + "entropy": 0.7406698048114777, + "epoch": 9.023941068139964, + "grad_norm": 0.35152414441108704, + "learning_rate": 2.5922522718981822e-06, + "loss": 0.0015977894887328147, + "mean_token_accuracy": 0.9996071636676789, + "num_tokens": 100669624.0, + "step": 4900 + }, + { + "entropy": 0.739100044965744, + "epoch": 9.042357274401473, + "grad_norm": 0.1938054859638214, + "learning_rate": 2.496348128603693e-06, + "loss": 0.0015546448528766632, + "mean_token_accuracy": 0.9997159540653229, + "num_tokens": 100875003.0, + "step": 4910 + }, + { + "entropy": 0.7400683701038361, + "epoch": 9.060773480662984, + "grad_norm": 0.16299064457416534, + "learning_rate": 2.402206215854591e-06, + "loss": 0.0015749253332614898, + "mean_token_accuracy": 0.9997485220432282, + "num_tokens": 101080504.0, + "step": 4920 + }, + { + "entropy": 0.7379888653755188, + "epoch": 9.079189686924494, + "grad_norm": 0.2756253480911255, + "learning_rate": 2.309830026008791e-06, + "loss": 0.0015607805922627448, + "mean_token_accuracy": 0.9997011303901673, + "num_tokens": 101286289.0, + "step": 4930 + }, + { + "entropy": 0.7385160863399506, + "epoch": 9.097605893186003, + "grad_norm": 0.2883203327655792, + "learning_rate": 2.219222985921637e-06, + "loss": 0.0017442166805267333, + "mean_token_accuracy": 0.9997642278671265, + "num_tokens": 101491893.0, + "step": 4940 + }, + { + "entropy": 0.7401205420494079, + "epoch": 9.116022099447514, + "grad_norm": 0.13606344163417816, + "learning_rate": 2.1303884568187882e-06, + "loss": 0.0015275017358362675, + "mean_token_accuracy": 0.99971724152565, + "num_tokens": 101697507.0, + "step": 4950 + }, + { + "entropy": 0.7398943901062012, + "epoch": 9.134438305709024, + "grad_norm": 0.20152442157268524, + "learning_rate": 2.043329734171534e-06, + "loss": 0.0014851298183202744, + "mean_token_accuracy": 0.9997945368289948, + "num_tokens": 101902695.0, + "step": 4960 + }, + { + "entropy": 0.7379605591297149, + "epoch": 9.152854511970535, + "grad_norm": 0.24019353091716766, + "learning_rate": 1.9580500475745332e-06, + "loss": 0.00150354765355587, + "mean_token_accuracy": 0.9997339248657227, + "num_tokens": 102108345.0, + "step": 4970 + }, + { + "entropy": 0.7367536067962647, + "epoch": 9.171270718232044, + "grad_norm": 0.15596628189086914, + "learning_rate": 1.8745525606259972e-06, + "loss": 0.0015196558088064194, + "mean_token_accuracy": 0.9997179448604584, + "num_tokens": 102314461.0, + "step": 4980 + }, + { + "entropy": 0.7380897223949432, + "epoch": 9.189686924493554, + "grad_norm": 0.27068376541137695, + "learning_rate": 1.792840370810367e-06, + "loss": 0.0017002832144498825, + "mean_token_accuracy": 0.9996692836284637, + "num_tokens": 102519944.0, + "step": 4990 + }, + { + "entropy": 0.738553661108017, + "epoch": 9.208103130755065, + "grad_norm": 0.2920447289943695, + "learning_rate": 1.712916509383361e-06, + "loss": 0.001522800512611866, + "mean_token_accuracy": 0.9997173726558686, + "num_tokens": 102725507.0, + "step": 5000 + }, + { + "entropy": 0.7379853069782257, + "epoch": 9.226519337016574, + "grad_norm": 0.21655166149139404, + "learning_rate": 1.6347839412595655e-06, + "loss": 0.0014866959303617478, + "mean_token_accuracy": 0.9997638881206512, + "num_tokens": 102931093.0, + "step": 5010 + }, + { + "entropy": 0.7364933252334595, + "epoch": 9.244935543278086, + "grad_norm": 0.2438272386789322, + "learning_rate": 1.558445564902411e-06, + "loss": 0.0016015460714697838, + "mean_token_accuracy": 0.9996855020523071, + "num_tokens": 103136804.0, + "step": 5020 + }, + { + "entropy": 0.7388014018535614, + "epoch": 9.263351749539595, + "grad_norm": 0.19854310154914856, + "learning_rate": 1.4839042122166803e-06, + "loss": 0.0015659671276807784, + "mean_token_accuracy": 0.9997456073760986, + "num_tokens": 103341751.0, + "step": 5030 + }, + { + "entropy": 0.7381871581077576, + "epoch": 9.281767955801104, + "grad_norm": 0.19087830185890198, + "learning_rate": 1.4111626484434315e-06, + "loss": 0.0017243871465325356, + "mean_token_accuracy": 0.9996393024921417, + "num_tokens": 103547225.0, + "step": 5040 + }, + { + "entropy": 0.7398294270038605, + "epoch": 9.300184162062616, + "grad_norm": 0.20569592714309692, + "learning_rate": 1.340223572057414e-06, + "loss": 0.001436135545372963, + "mean_token_accuracy": 0.9998414158821106, + "num_tokens": 103752302.0, + "step": 5050 + }, + { + "entropy": 0.7395297527313233, + "epoch": 9.318600368324125, + "grad_norm": 0.2108682245016098, + "learning_rate": 1.2710896146669927e-06, + "loss": 0.0013779642060399056, + "mean_token_accuracy": 0.9997463762760163, + "num_tokens": 103957465.0, + "step": 5060 + }, + { + "entropy": 0.7404512584209442, + "epoch": 9.337016574585636, + "grad_norm": 0.20543107390403748, + "learning_rate": 1.2037633409165028e-06, + "loss": 0.0013825546018779277, + "mean_token_accuracy": 0.999826842546463, + "num_tokens": 104162599.0, + "step": 5070 + }, + { + "entropy": 0.7397530496120452, + "epoch": 9.355432780847146, + "grad_norm": 0.23482060432434082, + "learning_rate": 1.1382472483911e-06, + "loss": 0.001441533863544464, + "mean_token_accuracy": 0.9997948348522187, + "num_tokens": 104367879.0, + "step": 5080 + }, + { + "entropy": 0.739465343952179, + "epoch": 9.373848987108655, + "grad_norm": 0.23690862953662872, + "learning_rate": 1.0745437675241333e-06, + "loss": 0.0018115714192390442, + "mean_token_accuracy": 0.9996071577072143, + "num_tokens": 104573222.0, + "step": 5090 + }, + { + "entropy": 0.7405745625495911, + "epoch": 9.392265193370166, + "grad_norm": 0.1851053088903427, + "learning_rate": 1.0126552615069618e-06, + "loss": 0.0016087744385004043, + "mean_token_accuracy": 0.9997322618961334, + "num_tokens": 104778417.0, + "step": 5100 + }, + { + "entropy": 0.7388820052146912, + "epoch": 9.410681399631676, + "grad_norm": 0.3480280041694641, + "learning_rate": 9.525840262013086e-07, + "loss": 0.0015109233558177949, + "mean_token_accuracy": 0.9997011184692383, + "num_tokens": 104983881.0, + "step": 5110 + }, + { + "entropy": 0.7403645634651184, + "epoch": 9.429097605893187, + "grad_norm": 0.199057474732399, + "learning_rate": 8.943322900540619e-07, + "loss": 0.0014625045470893383, + "mean_token_accuracy": 0.9997631847858429, + "num_tokens": 105189174.0, + "step": 5120 + }, + { + "entropy": 0.7373388469219208, + "epoch": 9.447513812154696, + "grad_norm": 0.15488000214099884, + "learning_rate": 8.379022140146464e-07, + "loss": 0.0017985764890909195, + "mean_token_accuracy": 0.9995609581470489, + "num_tokens": 105394991.0, + "step": 5130 + }, + { + "entropy": 0.7373765110969543, + "epoch": 9.465930018416206, + "grad_norm": 0.18474414944648743, + "learning_rate": 7.832958914548328e-07, + "loss": 0.0015545262023806572, + "mean_token_accuracy": 0.9996864259243011, + "num_tokens": 105600647.0, + "step": 5140 + }, + { + "entropy": 0.7392567217350006, + "epoch": 9.484346224677717, + "grad_norm": 0.1750747412443161, + "learning_rate": 7.305153480910709e-07, + "loss": 0.0012717658653855323, + "mean_token_accuracy": 0.9997801303863525, + "num_tokens": 105806087.0, + "step": 5150 + }, + { + "entropy": 0.7382550954818725, + "epoch": 9.502762430939226, + "grad_norm": 0.20807236433029175, + "learning_rate": 6.795625419093787e-07, + "loss": 0.0014687830582261086, + "mean_token_accuracy": 0.9997487604618073, + "num_tokens": 106011606.0, + "step": 5160 + }, + { + "entropy": 0.7380134582519531, + "epoch": 9.521178637200737, + "grad_norm": 0.22601647675037384, + "learning_rate": 6.30439363092672e-07, + "loss": 0.001499163918197155, + "mean_token_accuracy": 0.9996694803237915, + "num_tokens": 106217292.0, + "step": 5170 + }, + { + "entropy": 0.7392662584781646, + "epoch": 9.539594843462247, + "grad_norm": 0.18030250072479248, + "learning_rate": 5.831476339506703e-07, + "loss": 0.0014147022739052773, + "mean_token_accuracy": 0.9997945427894592, + "num_tokens": 106422508.0, + "step": 5180 + }, + { + "entropy": 0.7376449704170227, + "epoch": 9.558011049723756, + "grad_norm": 0.18657395243644714, + "learning_rate": 5.376891088522684e-07, + "loss": 0.001451531518250704, + "mean_token_accuracy": 0.9997494041919708, + "num_tokens": 106628357.0, + "step": 5190 + }, + { + "entropy": 0.7384322106838226, + "epoch": 9.576427255985267, + "grad_norm": 0.18637168407440186, + "learning_rate": 4.940654741604822e-07, + "loss": 0.0015604430809617043, + "mean_token_accuracy": 0.9997960567474365, + "num_tokens": 106833931.0, + "step": 5200 + }, + { + "entropy": 0.7389702200889587, + "epoch": 9.594843462246777, + "grad_norm": 0.19192856550216675, + "learning_rate": 4.522783481698767e-07, + "loss": 0.0016801396384835243, + "mean_token_accuracy": 0.9997003376483917, + "num_tokens": 107039353.0, + "step": 5210 + }, + { + "entropy": 0.7389976680278778, + "epoch": 9.613259668508288, + "grad_norm": 0.1852940022945404, + "learning_rate": 4.1232928104653067e-07, + "loss": 0.0013099458068609239, + "mean_token_accuracy": 0.9997794568538666, + "num_tokens": 107244677.0, + "step": 5220 + }, + { + "entropy": 0.739892303943634, + "epoch": 9.631675874769797, + "grad_norm": 0.304728627204895, + "learning_rate": 3.742197547705384e-07, + "loss": 0.0015266045928001403, + "mean_token_accuracy": 0.9997328281402588, + "num_tokens": 107450312.0, + "step": 5230 + }, + { + "entropy": 0.7386184096336365, + "epoch": 9.650092081031307, + "grad_norm": 0.23618076741695404, + "learning_rate": 3.3795118308102e-07, + "loss": 0.0014404524117708206, + "mean_token_accuracy": 0.9997794568538666, + "num_tokens": 107655688.0, + "step": 5240 + }, + { + "entropy": 0.738778418302536, + "epoch": 9.668508287292818, + "grad_norm": 0.12527571618556976, + "learning_rate": 3.035249114236915e-07, + "loss": 0.0015100114047527312, + "mean_token_accuracy": 0.9997797787189484, + "num_tokens": 107861360.0, + "step": 5250 + }, + { + "entropy": 0.7354449093341827, + "epoch": 9.686924493554327, + "grad_norm": 0.22283077239990234, + "learning_rate": 2.70942216900949e-07, + "loss": 0.0015022851526737212, + "mean_token_accuracy": 0.999655795097351, + "num_tokens": 108067536.0, + "step": 5260 + }, + { + "entropy": 0.7414606928825378, + "epoch": 9.705340699815839, + "grad_norm": 0.21263206005096436, + "learning_rate": 2.402043082244898e-07, + "loss": 0.001539918314665556, + "mean_token_accuracy": 0.9998098492622376, + "num_tokens": 108272273.0, + "step": 5270 + }, + { + "entropy": 0.7401190042495728, + "epoch": 9.723756906077348, + "grad_norm": 0.15394610166549683, + "learning_rate": 2.1131232567046522e-07, + "loss": 0.0014080126769840718, + "mean_token_accuracy": 0.9997639358043671, + "num_tokens": 108477453.0, + "step": 5280 + }, + { + "entropy": 0.7389390110969544, + "epoch": 9.742173112338858, + "grad_norm": 0.2818741202354431, + "learning_rate": 1.84267341037192e-07, + "loss": 0.0017805691808462144, + "mean_token_accuracy": 0.9997964143753052, + "num_tokens": 108682971.0, + "step": 5290 + }, + { + "entropy": 0.7389884531497956, + "epoch": 9.760589318600369, + "grad_norm": 0.31309419870376587, + "learning_rate": 1.5907035760539534e-07, + "loss": 0.0017935600131750106, + "mean_token_accuracy": 0.9997329533100128, + "num_tokens": 108888381.0, + "step": 5300 + }, + { + "entropy": 0.7414484560489655, + "epoch": 9.779005524861878, + "grad_norm": 0.2706625163555145, + "learning_rate": 1.3572231010097193e-07, + "loss": 0.0014929558150470258, + "mean_token_accuracy": 0.9996992349624634, + "num_tokens": 109093131.0, + "step": 5310 + }, + { + "entropy": 0.7389127910137177, + "epoch": 9.79742173112339, + "grad_norm": 0.25215238332748413, + "learning_rate": 1.1422406466033431e-07, + "loss": 0.0015146594494581222, + "mean_token_accuracy": 0.999780786037445, + "num_tokens": 109298763.0, + "step": 5320 + }, + { + "entropy": 0.7388081133365632, + "epoch": 9.815837937384899, + "grad_norm": 0.18103563785552979, + "learning_rate": 9.457641879827006e-08, + "loss": 0.0014452625066041947, + "mean_token_accuracy": 0.9997159004211426, + "num_tokens": 109504011.0, + "step": 5330 + }, + { + "entropy": 0.7377239882946014, + "epoch": 9.834254143646408, + "grad_norm": 0.1665307879447937, + "learning_rate": 7.678010137835422e-08, + "loss": 0.0014774853363633156, + "mean_token_accuracy": 0.999732518196106, + "num_tokens": 109709558.0, + "step": 5340 + }, + { + "entropy": 0.7402674973011016, + "epoch": 9.85267034990792, + "grad_norm": 0.278103232383728, + "learning_rate": 6.083577258591544e-08, + "loss": 0.0013820343650877477, + "mean_token_accuracy": 0.999780023097992, + "num_tokens": 109914850.0, + "step": 5350 + }, + { + "entropy": 0.7407928586006165, + "epoch": 9.871086556169429, + "grad_norm": 0.20344048738479614, + "learning_rate": 4.674402390355548e-08, + "loss": 0.001439080573618412, + "mean_token_accuracy": 0.9997944951057434, + "num_tokens": 110120014.0, + "step": 5360 + }, + { + "entropy": 0.7369158685207366, + "epoch": 9.88950276243094, + "grad_norm": 0.20637933909893036, + "learning_rate": 3.450537808918908e-08, + "loss": 0.001433834433555603, + "mean_token_accuracy": 0.9997311711311341, + "num_tokens": 110325492.0, + "step": 5370 + }, + { + "entropy": 0.7403705477714538, + "epoch": 9.90791896869245, + "grad_norm": 0.1724194586277008, + "learning_rate": 2.4120289156653876e-08, + "loss": 0.0016179192811250688, + "mean_token_accuracy": 0.9997326374053955, + "num_tokens": 110530951.0, + "step": 5380 + }, + { + "entropy": 0.7363588571548462, + "epoch": 9.926335174953959, + "grad_norm": 0.16912443935871124, + "learning_rate": 1.5589142358873878e-08, + "loss": 0.0016466960310935973, + "mean_token_accuracy": 0.9997175872325897, + "num_tokens": 110736950.0, + "step": 5390 + }, + { + "entropy": 0.7375564873218536, + "epoch": 9.94475138121547, + "grad_norm": 0.1804341822862625, + "learning_rate": 8.912254173570889e-09, + "loss": 0.0014190340414643287, + "mean_token_accuracy": 0.9997477352619171, + "num_tokens": 110942519.0, + "step": 5400 + }, + { + "entropy": 0.7391844034194947, + "epoch": 9.96316758747698, + "grad_norm": 0.20264621078968048, + "learning_rate": 4.0898722915239104e-09, + "loss": 0.001664016954600811, + "mean_token_accuracy": 0.9996694087982178, + "num_tokens": 111147891.0, + "step": 5410 + }, + { + "entropy": 0.7379689931869506, + "epoch": 9.98158379373849, + "grad_norm": 0.23066288232803345, + "learning_rate": 1.1221756073709345e-09, + "loss": 0.0015935502946376801, + "mean_token_accuracy": 0.9997010469436646, + "num_tokens": 111353305.0, + "step": 5420 + }, + { + "entropy": 0.7381729364395142, + "epoch": 10.0, + "grad_norm": 0.25462397933006287, + "learning_rate": 9.274212975363128e-12, + "loss": 0.0016142765060067176, + "mean_token_accuracy": 0.9997016966342926, + "num_tokens": 111558957.0, + "step": 5430 + }, + { + "epoch": 10.0, + "eval_entropy": 0.7430633855902631, + "eval_loss": 0.08843734115362167, + "eval_mean_token_accuracy": 0.9804784380871316, + "eval_num_tokens": 111558957.0, + "eval_runtime": 10.0419, + "eval_samples_per_second": 364.275, + "eval_steps_per_second": 11.452, + "step": 5430 + } + ], + "logging_steps": 10, + "max_steps": 5430, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.318202559054217e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5430/training_args.bin b/checkpoint-5430/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/checkpoint-5430/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e182f9a244f56249cd86c9db6563a54fbeed7bd --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21325c9bdff5ed34f0cc34837ee67ed216c9301ab4d9b2e26f048b563564bd75 +size 5777