diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..417595f5ada8dc8a6dfa2721b701d7578fdd4fb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-1878/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2191/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2504/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2817/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3130/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 7b95401dc46245ac339fc25059d4a56d90b4cde5..8ed23a191760512ab6c043027d7cdd13e4a720e5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,62 @@ ---- -license: apache-2.0 ---- +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +model_name: sft-Llama-3.1-8B-Instruct-tiger-perturbed-both +tags: +- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct +- lora +- sft +- transformers +- trl +license: llama3.1 +pipeline_tag: text-generation +--- + +# Model Card for sft-Llama-3.1-8B-Instruct-tiger-perturbed-both + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 
+generator = pipeline("text-generation", model="<namespace>/sft-Llama-3.1-8B-Instruct-tiger-perturbed-both", device="cuda")  # replace <namespace> with the Hub account that hosts this adapter +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/robusteval/huggingface/runs/3oacqfu7) + + +This model was trained with SFT. + +### Framework versions + +- PEFT 0.17.1 +- TRL: 0.23.0 +- Transformers: 4.56.1 +- Pytorch: 2.8.0 +- Datasets: 4.0.0 +- Tokenizers: 0.22.0 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8cd4cdaf30ceb587991efae70006ef463605c378 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "o_proj", + "k_proj", + "gate_proj", + "v_proj", + "up_proj" + ], + 
"target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..859e9fc4591cabb8a4d2c0188e37fb97f6082fda --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c12804c41316890b8e14ea34914bc845a667e4232adadde80c1ee814cc01eeb +size 335604696 diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. 
To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1878/README.md b/checkpoint-1878/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..96b9f5618833a1728fbecbefb87f08b279b6b2ed --- /dev/null +++ b/checkpoint-1878/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/checkpoint-1878/adapter_config.json b/checkpoint-1878/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..8cd4cdaf30ceb587991efae70006ef463605c378 --- /dev/null +++ b/checkpoint-1878/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "o_proj", + "k_proj", + "gate_proj", + "v_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1878/adapter_model.safetensors b/checkpoint-1878/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f98531415e3c5f94ba6331cf5c84f165f2f76e75 --- /dev/null +++ b/checkpoint-1878/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af28b7a1d623dbf2f3242ced316cb10babe6027a76c12d3e6fd8b6b311a8ac80 +size 335604696 diff --git a/checkpoint-1878/chat_template.jinja b/checkpoint-1878/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-1878/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true 
%} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} 
+ {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-1878/optimizer.pt b/checkpoint-1878/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b39d2122226ff75b3fe6ecd08845ed4d55830edc --- /dev/null +++ b/checkpoint-1878/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5813c7935eedbc3dde95dc7ff59078459e1f2b91179f84fc746466de3490011f +size 671473443 diff --git a/checkpoint-1878/rng_state.pth b/checkpoint-1878/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6dbb7b30aa496ed3d10fe53398337160db16c297 --- /dev/null +++ b/checkpoint-1878/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faad39c48ec928dfe53df7300f3a7d6585f1f7ae590cdae863d5cd9933b7ff47 +size 14645 diff --git a/checkpoint-1878/scheduler.pt b/checkpoint-1878/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e52a7f8e215bce419e442ec8c0eac933c8a618bc --- /dev/null +++ b/checkpoint-1878/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31ee5619a0d5008a4c7d9efbe00ce6260807337e0a7b72020c25f086c43a4b71 +size 1465 diff --git 
a/checkpoint-1878/special_tokens_map.json b/checkpoint-1878/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..02ee80b6196926a5ad790a004d9efd6ab1ba6542 --- /dev/null +++ b/checkpoint-1878/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1878/tokenizer.json b/checkpoint-1878/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-1878/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1878/tokenizer_config.json b/checkpoint-1878/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8b0c7c141373ca36e5e819a28f60e146ccef652f --- /dev/null +++ b/checkpoint-1878/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": 
"<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": 
"<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": 
"<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1878/trainer_state.json b/checkpoint-1878/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..83c586cc7f65243b3e615e91814b2b35c55b8d3d --- /dev/null +++ b/checkpoint-1878/trainer_state.json @@ -0,0 +1,1904 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 1878, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.098961293697357, + "epoch": 0.032, + "grad_norm": 1.891703486442566, + "learning_rate": 1.9148936170212767e-06, + "loss": 2.0828, + "mean_token_accuracy": 0.530680388212204, + "num_tokens": 72723.0, + "step": 10 + }, + { + "entropy": 2.119775766134262, + "epoch": 0.064, + "grad_norm": 1.2044862508773804, + "learning_rate": 4.042553191489362e-06, + "loss": 2.0093, + "mean_token_accuracy": 0.5355814293026924, + "num_tokens": 146392.0, + "step": 20 + }, + { + "entropy": 2.220579963922501, + "epoch": 0.096, + "grad_norm": 0.9982365369796753, + "learning_rate": 6.170212765957447e-06, + "loss": 1.8939, + "mean_token_accuracy": 0.5451944440603256, + "num_tokens": 223711.0, + "step": 30 + }, + { + "entropy": 
2.382017892599106, + "epoch": 0.128, + "grad_norm": 0.7386544346809387, + "learning_rate": 8.297872340425532e-06, + "loss": 1.9066, + "mean_token_accuracy": 0.5411656655371189, + "num_tokens": 300889.0, + "step": 40 + }, + { + "entropy": 2.274736815690994, + "epoch": 0.16, + "grad_norm": 0.6412256956100464, + "learning_rate": 1.0425531914893619e-05, + "loss": 1.7387, + "mean_token_accuracy": 0.5679451540112496, + "num_tokens": 377362.0, + "step": 50 + }, + { + "entropy": 2.3663365960121157, + "epoch": 0.192, + "grad_norm": 0.6228290796279907, + "learning_rate": 1.2553191489361702e-05, + "loss": 1.7492, + "mean_token_accuracy": 0.5746532663702965, + "num_tokens": 449594.0, + "step": 60 + }, + { + "entropy": 2.315044218301773, + "epoch": 0.224, + "grad_norm": 0.6034156680107117, + "learning_rate": 1.4680851063829789e-05, + "loss": 1.7111, + "mean_token_accuracy": 0.5675176709890366, + "num_tokens": 523439.0, + "step": 70 + }, + { + "entropy": 2.288265961408615, + "epoch": 0.256, + "grad_norm": 0.45914268493652344, + "learning_rate": 1.6808510638297873e-05, + "loss": 1.6931, + "mean_token_accuracy": 0.5713589735329151, + "num_tokens": 599650.0, + "step": 80 + }, + { + "entropy": 2.2693382859230042, + "epoch": 0.288, + "grad_norm": 0.6197793483734131, + "learning_rate": 1.893617021276596e-05, + "loss": 1.6542, + "mean_token_accuracy": 0.578165066242218, + "num_tokens": 675377.0, + "step": 90 + }, + { + "entropy": 2.293796479701996, + "epoch": 0.32, + "grad_norm": 0.5502006411552429, + "learning_rate": 1.9999866154043656e-05, + "loss": 1.7108, + "mean_token_accuracy": 0.5681634023785591, + "num_tokens": 751838.0, + "step": 100 + }, + { + "entropy": 2.2658903509378434, + "epoch": 0.352, + "grad_norm": 0.5713317394256592, + "learning_rate": 1.9998795407890486e-05, + "loss": 1.6168, + "mean_token_accuracy": 0.5843982398509979, + "num_tokens": 825539.0, + "step": 110 + }, + { + "entropy": 2.270280033349991, + "epoch": 0.384, + "grad_norm": 0.5967482924461365, + 
"learning_rate": 1.999665403023542e-05, + "loss": 1.6194, + "mean_token_accuracy": 0.5839526921510696, + "num_tokens": 897258.0, + "step": 120 + }, + { + "entropy": 2.2349284648895265, + "epoch": 0.416, + "grad_norm": 0.4899630844593048, + "learning_rate": 1.9993442250368708e-05, + "loss": 1.6313, + "mean_token_accuracy": 0.5815729826688767, + "num_tokens": 973142.0, + "step": 130 + }, + { + "entropy": 2.245553806424141, + "epoch": 0.448, + "grad_norm": 0.6546034812927246, + "learning_rate": 1.9989160412195047e-05, + "loss": 1.6395, + "mean_token_accuracy": 0.5780692532658577, + "num_tokens": 1046762.0, + "step": 140 + }, + { + "entropy": 2.288555932044983, + "epoch": 0.48, + "grad_norm": 0.5528404116630554, + "learning_rate": 1.9983808974196752e-05, + "loss": 1.7118, + "mean_token_accuracy": 0.5686657652258873, + "num_tokens": 1125167.0, + "step": 150 + }, + { + "entropy": 2.232080355286598, + "epoch": 0.512, + "grad_norm": 0.5887461304664612, + "learning_rate": 1.9977388509384656e-05, + "loss": 1.6339, + "mean_token_accuracy": 0.5838325396180153, + "num_tokens": 1199589.0, + "step": 160 + }, + { + "entropy": 2.2232475757598875, + "epoch": 0.544, + "grad_norm": 0.5764511823654175, + "learning_rate": 1.9969899705236763e-05, + "loss": 1.6173, + "mean_token_accuracy": 0.5848860442638397, + "num_tokens": 1276431.0, + "step": 170 + }, + { + "entropy": 2.244092071056366, + "epoch": 0.576, + "grad_norm": 0.6295827627182007, + "learning_rate": 1.9961343363624626e-05, + "loss": 1.6017, + "mean_token_accuracy": 0.5818701103329659, + "num_tokens": 1350012.0, + "step": 180 + }, + { + "entropy": 2.237305074930191, + "epoch": 0.608, + "grad_norm": 0.5939638018608093, + "learning_rate": 1.9951720400727495e-05, + "loss": 1.6704, + "mean_token_accuracy": 0.5779796853661537, + "num_tokens": 1423391.0, + "step": 190 + }, + { + "entropy": 2.211505854129791, + "epoch": 0.64, + "grad_norm": 0.6119778156280518, + "learning_rate": 1.9941031846934213e-05, + "loss": 1.6223, + 
"mean_token_accuracy": 0.5848233133554459, + "num_tokens": 1499124.0, + "step": 200 + }, + { + "entropy": 2.2195493161678312, + "epoch": 0.672, + "grad_norm": 0.6129831671714783, + "learning_rate": 1.9929278846732883e-05, + "loss": 1.5886, + "mean_token_accuracy": 0.5897421136498451, + "num_tokens": 1573541.0, + "step": 210 + }, + { + "entropy": 2.2096123576164244, + "epoch": 0.704, + "grad_norm": 0.6091306209564209, + "learning_rate": 1.9916462658588328e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5894487425684929, + "num_tokens": 1649546.0, + "step": 220 + }, + { + "entropy": 2.249841979146004, + "epoch": 0.736, + "grad_norm": 0.570816695690155, + "learning_rate": 1.9902584654807325e-05, + "loss": 1.5876, + "mean_token_accuracy": 0.5911228567361831, + "num_tokens": 1722199.0, + "step": 230 + }, + { + "entropy": 2.1915894985198974, + "epoch": 0.768, + "grad_norm": 0.5748864412307739, + "learning_rate": 1.988764632139168e-05, + "loss": 1.5963, + "mean_token_accuracy": 0.5891387596726417, + "num_tokens": 1797304.0, + "step": 240 + }, + { + "entropy": 2.2358563423156737, + "epoch": 0.8, + "grad_norm": 0.6511492729187012, + "learning_rate": 1.9871649257879115e-05, + "loss": 1.6453, + "mean_token_accuracy": 0.5792816638946533, + "num_tokens": 1870113.0, + "step": 250 + }, + { + "entropy": 2.2169984579086304, + "epoch": 0.832, + "grad_norm": 0.5317641496658325, + "learning_rate": 1.9854595177171968e-05, + "loss": 1.6594, + "mean_token_accuracy": 0.577045065164566, + "num_tokens": 1947405.0, + "step": 260 + }, + { + "entropy": 2.2434292674064635, + "epoch": 0.864, + "grad_norm": 0.5399971604347229, + "learning_rate": 1.9836485905353823e-05, + "loss": 1.7158, + "mean_token_accuracy": 0.5683416239917278, + "num_tokens": 2026284.0, + "step": 270 + }, + { + "entropy": 2.227828550338745, + "epoch": 0.896, + "grad_norm": 0.5378643870353699, + "learning_rate": 1.9817323381493933e-05, + "loss": 1.6714, + "mean_token_accuracy": 0.5818367518484593, + "num_tokens": 2103986.0, 
+ "step": 280 + }, + { + "entropy": 2.2110894501209257, + "epoch": 0.928, + "grad_norm": 0.5195969343185425, + "learning_rate": 1.979710965743964e-05, + "loss": 1.6239, + "mean_token_accuracy": 0.5819958478212357, + "num_tokens": 2177010.0, + "step": 290 + }, + { + "entropy": 2.1666628658771514, + "epoch": 0.96, + "grad_norm": 0.5663164258003235, + "learning_rate": 1.977584689759664e-05, + "loss": 1.6024, + "mean_token_accuracy": 0.5876665830612182, + "num_tokens": 2251285.0, + "step": 300 + }, + { + "entropy": 2.214203083515167, + "epoch": 0.992, + "grad_norm": 0.6764860153198242, + "learning_rate": 1.9753537378697237e-05, + "loss": 1.6446, + "mean_token_accuracy": 0.5818003416061401, + "num_tokens": 2325752.0, + "step": 310 + }, + { + "entropy": 2.16783396821273, + "epoch": 1.0224, + "grad_norm": 0.5795008540153503, + "learning_rate": 1.9730183489556563e-05, + "loss": 1.594, + "mean_token_accuracy": 0.5867547392845154, + "num_tokens": 2396254.0, + "step": 320 + }, + { + "entropy": 2.172953352332115, + "epoch": 1.0544, + "grad_norm": 0.6686444282531738, + "learning_rate": 1.9705787730816776e-05, + "loss": 1.613, + "mean_token_accuracy": 0.5906373374164104, + "num_tokens": 2470123.0, + "step": 330 + }, + { + "entropy": 2.2217346757650374, + "epoch": 1.0864, + "grad_norm": 0.6389091610908508, + "learning_rate": 1.9680352714679324e-05, + "loss": 1.7053, + "mean_token_accuracy": 0.577599074691534, + "num_tokens": 2545749.0, + "step": 340 + }, + { + "entropy": 2.138428696990013, + "epoch": 1.1184, + "grad_norm": 0.7369883060455322, + "learning_rate": 1.9653881164625234e-05, + "loss": 1.5599, + "mean_token_accuracy": 0.5946489304304123, + "num_tokens": 2623270.0, + "step": 350 + }, + { + "entropy": 2.147254040837288, + "epoch": 1.1504, + "grad_norm": 0.6707085967063904, + "learning_rate": 1.9626375915123473e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5965728983283043, + "num_tokens": 2697616.0, + "step": 360 + }, + { + "entropy": 2.1412769109010696, + "epoch": 
1.1824, + "grad_norm": 0.7201400995254517, + "learning_rate": 1.9597839911327475e-05, + "loss": 1.58, + "mean_token_accuracy": 0.5957784004509449, + "num_tokens": 2771426.0, + "step": 370 + }, + { + "entropy": 2.164059528708458, + "epoch": 1.2144, + "grad_norm": 0.7561144232749939, + "learning_rate": 1.9568276208759772e-05, + "loss": 1.5872, + "mean_token_accuracy": 0.5874110117554665, + "num_tokens": 2846711.0, + "step": 380 + }, + { + "entropy": 2.205427420139313, + "epoch": 1.2464, + "grad_norm": 0.691585898399353, + "learning_rate": 1.9537687972984804e-05, + "loss": 1.625, + "mean_token_accuracy": 0.5892911069095135, + "num_tokens": 2920916.0, + "step": 390 + }, + { + "entropy": 2.1242104679346085, + "epoch": 1.2784, + "grad_norm": 0.6999676823616028, + "learning_rate": 1.950607847926999e-05, + "loss": 1.5606, + "mean_token_accuracy": 0.5917269751429558, + "num_tokens": 2996056.0, + "step": 400 + }, + { + "entropy": 2.114223065972328, + "epoch": 1.3104, + "grad_norm": 0.7616406679153442, + "learning_rate": 1.947345111223502e-05, + "loss": 1.5296, + "mean_token_accuracy": 0.5938275754451752, + "num_tokens": 3072912.0, + "step": 410 + }, + { + "entropy": 2.1418962299823763, + "epoch": 1.3424, + "grad_norm": 0.7253025770187378, + "learning_rate": 1.943980936548942e-05, + "loss": 1.575, + "mean_token_accuracy": 0.5945621818304062, + "num_tokens": 3148498.0, + "step": 420 + }, + { + "entropy": 2.109667718410492, + "epoch": 1.3744, + "grad_norm": 0.8988682627677917, + "learning_rate": 1.9405156841258498e-05, + "loss": 1.5796, + "mean_token_accuracy": 0.5901263400912284, + "num_tokens": 3224741.0, + "step": 430 + }, + { + "entropy": 2.179358023405075, + "epoch": 1.4064, + "grad_norm": 0.741558849811554, + "learning_rate": 1.936949724999762e-05, + "loss": 1.6507, + "mean_token_accuracy": 0.581992793083191, + "num_tokens": 3299366.0, + "step": 440 + }, + { + "entropy": 2.1574251472949983, + "epoch": 1.4384000000000001, + "grad_norm": 0.7538727521896362, + 
"learning_rate": 1.9332834409994906e-05, + "loss": 1.5771, + "mean_token_accuracy": 0.5888051658868789, + "num_tokens": 3374162.0, + "step": 450 + }, + { + "entropy": 2.1186763852834702, + "epoch": 1.4704, + "grad_norm": 0.7905173301696777, + "learning_rate": 1.929517224696239e-05, + "loss": 1.6138, + "mean_token_accuracy": 0.584889967739582, + "num_tokens": 3452582.0, + "step": 460 + }, + { + "entropy": 2.1135365635156633, + "epoch": 1.5024, + "grad_norm": 0.7416484951972961, + "learning_rate": 1.9256514793615674e-05, + "loss": 1.5623, + "mean_token_accuracy": 0.5928735345602035, + "num_tokens": 3527694.0, + "step": 470 + }, + { + "entropy": 2.146635016798973, + "epoch": 1.5344, + "grad_norm": 0.731999397277832, + "learning_rate": 1.9216866189242095e-05, + "loss": 1.5634, + "mean_token_accuracy": 0.5988615363836288, + "num_tokens": 3600277.0, + "step": 480 + }, + { + "entropy": 2.1472962319850923, + "epoch": 1.5664, + "grad_norm": 0.7493702173233032, + "learning_rate": 1.9176230679257547e-05, + "loss": 1.5891, + "mean_token_accuracy": 0.5858126983046532, + "num_tokens": 3674781.0, + "step": 490 + }, + { + "entropy": 2.1530486762523653, + "epoch": 1.5984, + "grad_norm": 0.8006687164306641, + "learning_rate": 1.9134612614751865e-05, + "loss": 1.5674, + "mean_token_accuracy": 0.5904534175992012, + "num_tokens": 3748434.0, + "step": 500 + }, + { + "entropy": 2.169738906621933, + "epoch": 1.6303999999999998, + "grad_norm": 0.9293455481529236, + "learning_rate": 1.909201645202294e-05, + "loss": 1.6104, + "mean_token_accuracy": 0.5860036969184875, + "num_tokens": 3823982.0, + "step": 510 + }, + { + "entropy": 2.178475347161293, + "epoch": 1.6623999999999999, + "grad_norm": 0.7716575860977173, + "learning_rate": 1.904844675209956e-05, + "loss": 1.6432, + "mean_token_accuracy": 0.5838924221694469, + "num_tokens": 3900064.0, + "step": 520 + }, + { + "entropy": 2.1585603266954423, + "epoch": 1.6944, + "grad_norm": 0.8225084543228149, + "learning_rate": 
1.9003908180253027e-05, + "loss": 1.5957, + "mean_token_accuracy": 0.5880116850137711, + "num_tokens": 3974029.0, + "step": 530 + }, + { + "entropy": 2.111869788169861, + "epoch": 1.7264, + "grad_norm": 0.7035638093948364, + "learning_rate": 1.8958405505497613e-05, + "loss": 1.579, + "mean_token_accuracy": 0.5890362292528153, + "num_tokens": 4049974.0, + "step": 540 + }, + { + "entropy": 2.144411253929138, + "epoch": 1.7584, + "grad_norm": 0.7046850919723511, + "learning_rate": 1.8911943600079934e-05, + "loss": 1.5926, + "mean_token_accuracy": 0.5874261602759361, + "num_tokens": 4125206.0, + "step": 550 + }, + { + "entropy": 2.1093025386333464, + "epoch": 1.7904, + "grad_norm": 0.807727575302124, + "learning_rate": 1.8864527438957223e-05, + "loss": 1.5367, + "mean_token_accuracy": 0.5988967984914779, + "num_tokens": 4199365.0, + "step": 560 + }, + { + "entropy": 2.097169244289398, + "epoch": 1.8224, + "grad_norm": 0.7856780886650085, + "learning_rate": 1.881616209926465e-05, + "loss": 1.561, + "mean_token_accuracy": 0.5948230788111687, + "num_tokens": 4275889.0, + "step": 570 + }, + { + "entropy": 2.088553088903427, + "epoch": 1.8544, + "grad_norm": 0.8993458151817322, + "learning_rate": 1.876685275977167e-05, + "loss": 1.5557, + "mean_token_accuracy": 0.5941933646798134, + "num_tokens": 4350502.0, + "step": 580 + }, + { + "entropy": 2.132419008016586, + "epoch": 1.8864, + "grad_norm": 0.7769711017608643, + "learning_rate": 1.8716604700327516e-05, + "loss": 1.6105, + "mean_token_accuracy": 0.5815305605530738, + "num_tokens": 4426429.0, + "step": 590 + }, + { + "entropy": 2.1076891005039213, + "epoch": 1.9184, + "grad_norm": 0.9261249899864197, + "learning_rate": 1.866542330129583e-05, + "loss": 1.5307, + "mean_token_accuracy": 0.5964644759893417, + "num_tokens": 4500147.0, + "step": 600 + }, + { + "entropy": 2.114642283320427, + "epoch": 1.9504000000000001, + "grad_norm": 0.806425929069519, + "learning_rate": 1.8613314042978576e-05, + "loss": 1.5809, + 
"mean_token_accuracy": 0.5901800125837326, + "num_tokens": 4573438.0, + "step": 610 + }, + { + "entropy": 2.1167576968669892, + "epoch": 1.9824000000000002, + "grad_norm": 0.8191499710083008, + "learning_rate": 1.856028250502923e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5843381330370903, + "num_tokens": 4648156.0, + "step": 620 + }, + { + "entropy": 2.0566249019221257, + "epoch": 2.0128, + "grad_norm": 0.7406135201454163, + "learning_rate": 1.8506334365855315e-05, + "loss": 1.5187, + "mean_token_accuracy": 0.6027438483740154, + "num_tokens": 4719492.0, + "step": 630 + }, + { + "entropy": 2.0126763731241226, + "epoch": 2.0448, + "grad_norm": 0.8845784068107605, + "learning_rate": 1.8451475402010405e-05, + "loss": 1.4841, + "mean_token_accuracy": 0.6069207280874253, + "num_tokens": 4796271.0, + "step": 640 + }, + { + "entropy": 2.0516900300979612, + "epoch": 2.0768, + "grad_norm": 0.9927017092704773, + "learning_rate": 1.8395711487575564e-05, + "loss": 1.512, + "mean_token_accuracy": 0.6031922519207, + "num_tokens": 4870202.0, + "step": 650 + }, + { + "entropy": 2.0824343889951704, + "epoch": 2.1088, + "grad_norm": 0.927236795425415, + "learning_rate": 1.8339048593530406e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5952437989413738, + "num_tokens": 4945568.0, + "step": 660 + }, + { + "entropy": 2.0304481953382494, + "epoch": 2.1408, + "grad_norm": 0.874019205570221, + "learning_rate": 1.8281492787113707e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.5992600306868553, + "num_tokens": 5020723.0, + "step": 670 + }, + { + "entropy": 2.0402441143989565, + "epoch": 2.1728, + "grad_norm": 0.8746942281723022, + "learning_rate": 1.8223050231173802e-05, + "loss": 1.5119, + "mean_token_accuracy": 0.5994458049535751, + "num_tokens": 5095780.0, + "step": 680 + }, + { + "entropy": 2.018441066145897, + "epoch": 2.2048, + "grad_norm": 1.063180923461914, + "learning_rate": 1.816372718350864e-05, + "loss": 1.4923, + "mean_token_accuracy": 0.6064845189452172, + 
"num_tokens": 5169733.0, + "step": 690 + }, + { + "entropy": 2.0563316702842713, + "epoch": 2.2368, + "grad_norm": 1.0281789302825928, + "learning_rate": 1.810352999619574e-05, + "loss": 1.5505, + "mean_token_accuracy": 0.602813882380724, + "num_tokens": 5246393.0, + "step": 700 + }, + { + "entropy": 2.0298285841941834, + "epoch": 2.2688, + "grad_norm": 1.070520281791687, + "learning_rate": 1.804246511491206e-05, + "loss": 1.5159, + "mean_token_accuracy": 0.6006126523017883, + "num_tokens": 5322244.0, + "step": 710 + }, + { + "entropy": 2.0195819228887557, + "epoch": 2.3008, + "grad_norm": 0.9672983884811401, + "learning_rate": 1.7980539078243783e-05, + "loss": 1.5166, + "mean_token_accuracy": 0.6054230839014053, + "num_tokens": 5399317.0, + "step": 720 + }, + { + "entropy": 2.045917159318924, + "epoch": 2.3327999999999998, + "grad_norm": 1.1228744983673096, + "learning_rate": 1.791775851698622e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.6015639662742615, + "num_tokens": 5473195.0, + "step": 730 + }, + { + "entropy": 2.0935415983200074, + "epoch": 2.3648, + "grad_norm": 1.149794578552246, + "learning_rate": 1.7854130153433785e-05, + "loss": 1.5583, + "mean_token_accuracy": 0.5921522840857506, + "num_tokens": 5548357.0, + "step": 740 + }, + { + "entropy": 2.044076007604599, + "epoch": 2.3968, + "grad_norm": 1.063625693321228, + "learning_rate": 1.7789660800660222e-05, + "loss": 1.5013, + "mean_token_accuracy": 0.5974589124321937, + "num_tokens": 5620915.0, + "step": 750 + }, + { + "entropy": 2.092478734254837, + "epoch": 2.4288, + "grad_norm": 1.1822012662887573, + "learning_rate": 1.7724357361789075e-05, + "loss": 1.5552, + "mean_token_accuracy": 0.5929681301116944, + "num_tokens": 5693406.0, + "step": 760 + }, + { + "entropy": 2.0430804908275606, + "epoch": 2.4608, + "grad_norm": 0.9921984076499939, + "learning_rate": 1.765822682925453e-05, + "loss": 1.4944, + "mean_token_accuracy": 0.6029774472117424, + "num_tokens": 5770143.0, + "step": 770 + }, + { + 
"entropy": 2.049290281534195, + "epoch": 2.4928, + "grad_norm": 1.0144131183624268, + "learning_rate": 1.7591276284052695e-05, + "loss": 1.5437, + "mean_token_accuracy": 0.5986773043870925, + "num_tokens": 5844022.0, + "step": 780 + }, + { + "entropy": 2.033898201584816, + "epoch": 2.5248, + "grad_norm": 1.1700315475463867, + "learning_rate": 1.7523512894983396e-05, + "loss": 1.5197, + "mean_token_accuracy": 0.5972102269530296, + "num_tokens": 5919099.0, + "step": 790 + }, + { + "entropy": 2.03344586789608, + "epoch": 2.5568, + "grad_norm": 1.0503427982330322, + "learning_rate": 1.745494391788257e-05, + "loss": 1.5456, + "mean_token_accuracy": 0.6011263683438302, + "num_tokens": 5997797.0, + "step": 800 + }, + { + "entropy": 2.0796399265527725, + "epoch": 2.5888, + "grad_norm": 1.0316176414489746, + "learning_rate": 1.7385576694845324e-05, + "loss": 1.608, + "mean_token_accuracy": 0.6024919278919697, + "num_tokens": 6075434.0, + "step": 810 + }, + { + "entropy": 2.0257797837257385, + "epoch": 2.6208, + "grad_norm": 1.048309087753296, + "learning_rate": 1.7315418653439802e-05, + "loss": 1.4876, + "mean_token_accuracy": 0.6070949509739876, + "num_tokens": 6149232.0, + "step": 820 + }, + { + "entropy": 2.024846690893173, + "epoch": 2.6528, + "grad_norm": 1.186710000038147, + "learning_rate": 1.7244477305911845e-05, + "loss": 1.499, + "mean_token_accuracy": 0.6022308841347694, + "num_tokens": 6222180.0, + "step": 830 + }, + { + "entropy": 1.9938248336315154, + "epoch": 2.6848, + "grad_norm": 1.1091604232788086, + "learning_rate": 1.717276024838062e-05, + "loss": 1.4795, + "mean_token_accuracy": 0.6044012248516083, + "num_tokens": 6296902.0, + "step": 840 + }, + { + "entropy": 1.9988998174667358, + "epoch": 2.7168, + "grad_norm": 1.0359690189361572, + "learning_rate": 1.710027516002526e-05, + "loss": 1.5173, + "mean_token_accuracy": 0.6025070771574974, + "num_tokens": 6373494.0, + "step": 850 + }, + { + "entropy": 2.02343093752861, + "epoch": 2.7488, + "grad_norm": 
1.1783568859100342, + "learning_rate": 1.7027029802262598e-05, + "loss": 1.5146, + "mean_token_accuracy": 0.6033479735255242, + "num_tokens": 6449229.0, + "step": 860 + }, + { + "entropy": 2.0429257422685625, + "epoch": 2.7808, + "grad_norm": 0.9909568428993225, + "learning_rate": 1.6953032017916115e-05, + "loss": 1.5473, + "mean_token_accuracy": 0.5932901218533516, + "num_tokens": 6525728.0, + "step": 870 + }, + { + "entropy": 2.0058376491069794, + "epoch": 2.8128, + "grad_norm": 1.0904430150985718, + "learning_rate": 1.687828973037615e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6120153024792672, + "num_tokens": 6599335.0, + "step": 880 + }, + { + "entropy": 2.005480855703354, + "epoch": 2.8448, + "grad_norm": 1.1638548374176025, + "learning_rate": 1.6802810942751514e-05, + "loss": 1.4887, + "mean_token_accuracy": 0.6060751393437386, + "num_tokens": 6672722.0, + "step": 890 + }, + { + "entropy": 2.0311779022216796, + "epoch": 2.8768000000000002, + "grad_norm": 1.1404571533203125, + "learning_rate": 1.6726603737012527e-05, + "loss": 1.5238, + "mean_token_accuracy": 0.6015868663787842, + "num_tokens": 6748069.0, + "step": 900 + }, + { + "entropy": 2.0126856863498688, + "epoch": 2.9088000000000003, + "grad_norm": 1.0942543745040894, + "learning_rate": 1.6649676273125647e-05, + "loss": 1.4984, + "mean_token_accuracy": 0.6019899815320968, + "num_tokens": 6820935.0, + "step": 910 + }, + { + "entropy": 1.9961138010025024, + "epoch": 2.9408, + "grad_norm": 1.0870610475540161, + "learning_rate": 1.6572036788179728e-05, + "loss": 1.4962, + "mean_token_accuracy": 0.6030571654438972, + "num_tokens": 6896286.0, + "step": 920 + }, + { + "entropy": 2.035824549198151, + "epoch": 2.9728, + "grad_norm": 1.0822633504867554, + "learning_rate": 1.6493693595504022e-05, + "loss": 1.5354, + "mean_token_accuracy": 0.5986709952354431, + "num_tokens": 6971854.0, + "step": 930 + }, + { + "entropy": 2.0243908260997974, + "epoch": 3.0032, + "grad_norm": 1.0899602174758911, + 
"learning_rate": 1.6414655083778027e-05, + "loss": 1.5032, + "mean_token_accuracy": 0.5983682243447555, + "num_tokens": 7041122.0, + "step": 940 + }, + { + "entropy": 1.9538823068141937, + "epoch": 3.0352, + "grad_norm": 1.3042237758636475, + "learning_rate": 1.633492971613326e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6146818101406097, + "num_tokens": 7116032.0, + "step": 950 + }, + { + "entropy": 1.9383916020393372, + "epoch": 3.0672, + "grad_norm": 1.397078037261963, + "learning_rate": 1.6254526029247048e-05, + "loss": 1.4019, + "mean_token_accuracy": 0.6210932344198227, + "num_tokens": 7189009.0, + "step": 960 + }, + { + "entropy": 1.9460978150367736, + "epoch": 3.0992, + "grad_norm": 1.2756887674331665, + "learning_rate": 1.617345263242847e-05, + "loss": 1.4623, + "mean_token_accuracy": 0.6121616646647453, + "num_tokens": 7263068.0, + "step": 970 + }, + { + "entropy": 1.9156711965799331, + "epoch": 3.1312, + "grad_norm": 1.1937649250030518, + "learning_rate": 1.609171820669649e-05, + "loss": 1.4301, + "mean_token_accuracy": 0.6136599197983742, + "num_tokens": 7338652.0, + "step": 980 + }, + { + "entropy": 1.9247682303190232, + "epoch": 3.1632, + "grad_norm": 1.3291118144989014, + "learning_rate": 1.6009331503850448e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6153608947992325, + "num_tokens": 7414529.0, + "step": 990 + }, + { + "entropy": 1.9066543668508529, + "epoch": 3.1952, + "grad_norm": 1.4356389045715332, + "learning_rate": 1.5926301345532925e-05, + "loss": 1.4413, + "mean_token_accuracy": 0.612147618830204, + "num_tokens": 7489106.0, + "step": 1000 + }, + { + "entropy": 1.895160937309265, + "epoch": 3.2272, + "grad_norm": 1.4345523118972778, + "learning_rate": 1.5842636622285187e-05, + "loss": 1.4207, + "mean_token_accuracy": 0.6138400137424469, + "num_tokens": 7564304.0, + "step": 1010 + }, + { + "entropy": 1.9546802312135696, + "epoch": 3.2592, + "grad_norm": 1.5242680311203003, + "learning_rate": 1.575834629259519e-05, + "loss": 1.4435, + 
"mean_token_accuracy": 0.6153354361653328, + "num_tokens": 7637409.0, + "step": 1020 + }, + { + "entropy": 1.912938117980957, + "epoch": 3.2912, + "grad_norm": 1.529726505279541, + "learning_rate": 1.5673439381938365e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.6191004544496537, + "num_tokens": 7711595.0, + "step": 1030 + }, + { + "entropy": 1.8989770442247391, + "epoch": 3.3232, + "grad_norm": 1.3367948532104492, + "learning_rate": 1.5587924981811196e-05, + "loss": 1.394, + "mean_token_accuracy": 0.624155393242836, + "num_tokens": 7785750.0, + "step": 1040 + }, + { + "entropy": 1.932333904504776, + "epoch": 3.3552, + "grad_norm": 1.4732215404510498, + "learning_rate": 1.5501812248757734e-05, + "loss": 1.3959, + "mean_token_accuracy": 0.6221834555268287, + "num_tokens": 7859036.0, + "step": 1050 + }, + { + "entropy": 1.9492982983589173, + "epoch": 3.3872, + "grad_norm": 1.4499313831329346, + "learning_rate": 1.5415110403389166e-05, + "loss": 1.4633, + "mean_token_accuracy": 0.6100246667861938, + "num_tokens": 7933165.0, + "step": 1060 + }, + { + "entropy": 1.9063653618097305, + "epoch": 3.4192, + "grad_norm": 1.4364317655563354, + "learning_rate": 1.5327828729396482e-05, + "loss": 1.4216, + "mean_token_accuracy": 0.6210869938135147, + "num_tokens": 8009376.0, + "step": 1070 + }, + { + "entropy": 1.9919361650943757, + "epoch": 3.4512, + "grad_norm": 1.5573089122772217, + "learning_rate": 1.5239976572556438e-05, + "loss": 1.5899, + "mean_token_accuracy": 0.5991086520254612, + "num_tokens": 8086825.0, + "step": 1080 + }, + { + "entropy": 1.922476476430893, + "epoch": 3.4832, + "grad_norm": 1.3339344263076782, + "learning_rate": 1.5151563339730849e-05, + "loss": 1.4162, + "mean_token_accuracy": 0.6182018965482712, + "num_tokens": 8161726.0, + "step": 1090 + }, + { + "entropy": 1.9143129527568816, + "epoch": 3.5152, + "grad_norm": 1.4425708055496216, + "learning_rate": 1.506259849785931e-05, + "loss": 1.4076, + "mean_token_accuracy": 0.6197950705885887, + 
"num_tokens": 8237046.0, + "step": 1100 + }, + { + "entropy": 1.9093267023563385, + "epoch": 3.5472, + "grad_norm": 1.5437992811203003, + "learning_rate": 1.497309157294555e-05, + "loss": 1.4339, + "mean_token_accuracy": 0.6177847877144813, + "num_tokens": 8315350.0, + "step": 1110 + }, + { + "entropy": 1.9121424347162246, + "epoch": 3.5792, + "grad_norm": 1.3761622905731201, + "learning_rate": 1.4883052149037395e-05, + "loss": 1.4175, + "mean_token_accuracy": 0.6171463698148727, + "num_tokens": 8390383.0, + "step": 1120 + }, + { + "entropy": 1.883551675081253, + "epoch": 3.6112, + "grad_norm": 1.36739182472229, + "learning_rate": 1.479248986720057e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6186214044690133, + "num_tokens": 8468414.0, + "step": 1130 + }, + { + "entropy": 1.988349151611328, + "epoch": 3.6432, + "grad_norm": 1.4566738605499268, + "learning_rate": 1.4701414424486353e-05, + "loss": 1.5366, + "mean_token_accuracy": 0.6110676810145378, + "num_tokens": 8541715.0, + "step": 1140 + }, + { + "entropy": 1.9057112097740174, + "epoch": 3.6752000000000002, + "grad_norm": 1.499079704284668, + "learning_rate": 1.4609835572893266e-05, + "loss": 1.3991, + "mean_token_accuracy": 0.6208718970417977, + "num_tokens": 8615694.0, + "step": 1150 + }, + { + "entropy": 1.9219326049089431, + "epoch": 3.7072000000000003, + "grad_norm": 1.3865621089935303, + "learning_rate": 1.4517763118322861e-05, + "loss": 1.431, + "mean_token_accuracy": 0.6143050745129586, + "num_tokens": 8692473.0, + "step": 1160 + }, + { + "entropy": 1.9036399960517882, + "epoch": 3.7392, + "grad_norm": 1.5362603664398193, + "learning_rate": 1.4425206919529747e-05, + "loss": 1.4156, + "mean_token_accuracy": 0.6199175521731377, + "num_tokens": 8767618.0, + "step": 1170 + }, + { + "entropy": 1.9499989479780198, + "epoch": 3.7712, + "grad_norm": 1.663404941558838, + "learning_rate": 1.4332176887065955e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.605186915397644, + "num_tokens": 8843100.0, + 
"step": 1180 + }, + { + "entropy": 1.9545456051826477, + "epoch": 3.8032, + "grad_norm": 1.6169345378875732, + "learning_rate": 1.4238682982219753e-05, + "loss": 1.4241, + "mean_token_accuracy": 0.6206902250647545, + "num_tokens": 8914604.0, + "step": 1190 + }, + { + "entropy": 1.9130536198616028, + "epoch": 3.8352, + "grad_norm": 1.472740650177002, + "learning_rate": 1.4144735215949028e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6139126420021057, + "num_tokens": 8989305.0, + "step": 1200 + }, + { + "entropy": 1.938635140657425, + "epoch": 3.8672, + "grad_norm": 1.4194226264953613, + "learning_rate": 1.4050343647809354e-05, + "loss": 1.4538, + "mean_token_accuracy": 0.6131341770291329, + "num_tokens": 9065589.0, + "step": 1210 + }, + { + "entropy": 1.9123675346374511, + "epoch": 3.8992, + "grad_norm": 1.5208053588867188, + "learning_rate": 1.3955518384876863e-05, + "loss": 1.4309, + "mean_token_accuracy": 0.6139545351266861, + "num_tokens": 9140150.0, + "step": 1220 + }, + { + "entropy": 1.9148090302944183, + "epoch": 3.9312, + "grad_norm": 1.6418218612670898, + "learning_rate": 1.3860269580666004e-05, + "loss": 1.4269, + "mean_token_accuracy": 0.6169310078024864, + "num_tokens": 9215796.0, + "step": 1230 + }, + { + "entropy": 1.9157740741968154, + "epoch": 3.9632, + "grad_norm": 1.4638084173202515, + "learning_rate": 1.3764607434042353e-05, + "loss": 1.4509, + "mean_token_accuracy": 0.6164968460798264, + "num_tokens": 9291010.0, + "step": 1240 + }, + { + "entropy": 1.9184510678052902, + "epoch": 3.9952, + "grad_norm": 1.5152716636657715, + "learning_rate": 1.3668542188130567e-05, + "loss": 1.4453, + "mean_token_accuracy": 0.6112410992383956, + "num_tokens": 9367186.0, + "step": 1250 + }, + { + "entropy": 1.9016748384425515, + "epoch": 4.0256, + "grad_norm": 1.490628719329834, + "learning_rate": 1.3572084129217566e-05, + "loss": 1.382, + "mean_token_accuracy": 0.623968276538347, + "num_tokens": 9439028.0, + "step": 1260 + }, + { + "entropy": 
1.8026290327310561, + "epoch": 4.0576, + "grad_norm": 1.8969308137893677, + "learning_rate": 1.347524358565115e-05, + "loss": 1.3128, + "mean_token_accuracy": 0.6386646836996078, + "num_tokens": 9513855.0, + "step": 1270 + }, + { + "entropy": 1.8283424764871596, + "epoch": 4.0896, + "grad_norm": 1.5952194929122925, + "learning_rate": 1.3378030926734052e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6328515768051147, + "num_tokens": 9589080.0, + "step": 1280 + }, + { + "entropy": 1.8405955344438554, + "epoch": 4.1216, + "grad_norm": 1.6057584285736084, + "learning_rate": 1.3280456561613653e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6261398203670978, + "num_tokens": 9666808.0, + "step": 1290 + }, + { + "entropy": 1.8390818655490875, + "epoch": 4.1536, + "grad_norm": 1.8149824142456055, + "learning_rate": 1.3182530938167409e-05, + "loss": 1.3455, + "mean_token_accuracy": 0.6318597674369812, + "num_tokens": 9740267.0, + "step": 1300 + }, + { + "entropy": 1.8203887075185776, + "epoch": 4.1856, + "grad_norm": 1.6102676391601562, + "learning_rate": 1.3084264541884118e-05, + "loss": 1.3255, + "mean_token_accuracy": 0.6316933467984199, + "num_tokens": 9816400.0, + "step": 1310 + }, + { + "entropy": 1.8592366576194763, + "epoch": 4.2176, + "grad_norm": 1.9501773118972778, + "learning_rate": 1.2985667894741197e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6301594719290733, + "num_tokens": 9889311.0, + "step": 1320 + }, + { + "entropy": 1.8420085966587068, + "epoch": 4.2496, + "grad_norm": 1.6526106595993042, + "learning_rate": 1.2886751554078015e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6302071824669838, + "num_tokens": 9965339.0, + "step": 1330 + }, + { + "entropy": 1.8313881188631058, + "epoch": 4.2816, + "grad_norm": 1.6269904375076294, + "learning_rate": 1.2787526111465453e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6328388035297394, + "num_tokens": 10039668.0, + "step": 1340 + }, + { + "entropy": 1.858151137828827, + "epoch": 4.3136, + 
"grad_norm": 1.9028024673461914, + "learning_rate": 1.2688002191571829e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6276688367128372, + "num_tokens": 10115387.0, + "step": 1350 + }, + { + "entropy": 1.8273844957351684, + "epoch": 4.3456, + "grad_norm": 1.7530555725097656, + "learning_rate": 1.2588190451025209e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.6345869660377502, + "num_tokens": 10191506.0, + "step": 1360 + }, + { + "entropy": 1.8732422679662704, + "epoch": 4.3776, + "grad_norm": 1.7372691631317139, + "learning_rate": 1.248810157727236e-05, + "loss": 1.4132, + "mean_token_accuracy": 0.6252246856689453, + "num_tokens": 10268756.0, + "step": 1370 + }, + { + "entropy": 1.8583054572343827, + "epoch": 4.4096, + "grad_norm": 1.6993470191955566, + "learning_rate": 1.2387746287434385e-05, + "loss": 1.3638, + "mean_token_accuracy": 0.6286717876791954, + "num_tokens": 10341779.0, + "step": 1380 + }, + { + "entropy": 1.8324467271566391, + "epoch": 4.4416, + "grad_norm": 1.7818169593811035, + "learning_rate": 1.2287135327159165e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6361263945698739, + "num_tokens": 10414642.0, + "step": 1390 + }, + { + "entropy": 1.8514392852783204, + "epoch": 4.4736, + "grad_norm": 1.7585517168045044, + "learning_rate": 1.2186279469470757e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.628801380097866, + "num_tokens": 10489517.0, + "step": 1400 + }, + { + "entropy": 1.8218136370182036, + "epoch": 4.5056, + "grad_norm": 1.9843116998672485, + "learning_rate": 1.2085189513615872e-05, + "loss": 1.3628, + "mean_token_accuracy": 0.6295172199606895, + "num_tokens": 10565467.0, + "step": 1410 + }, + { + "entropy": 1.8919565021991729, + "epoch": 4.5376, + "grad_norm": 1.9309132099151611, + "learning_rate": 1.1983876283907522e-05, + "loss": 1.4467, + "mean_token_accuracy": 0.6263746194541454, + "num_tokens": 10641283.0, + "step": 1420 + }, + { + "entropy": 1.8356508910655975, + "epoch": 4.5696, + "grad_norm": 1.7685068845748901, + 
"learning_rate": 1.1882350628566008e-05, + "loss": 1.3631, + "mean_token_accuracy": 0.624418406188488, + "num_tokens": 10716701.0, + "step": 1430 + }, + { + "entropy": 1.8288098931312562, + "epoch": 4.6016, + "grad_norm": 1.8276050090789795, + "learning_rate": 1.178062341855732e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6286922857165337, + "num_tokens": 10791427.0, + "step": 1440 + }, + { + "entropy": 1.8557640790939331, + "epoch": 4.6336, + "grad_norm": 1.7773240804672241, + "learning_rate": 1.1678705546429132e-05, + "loss": 1.383, + "mean_token_accuracy": 0.6216814562678337, + "num_tokens": 10866356.0, + "step": 1450 + }, + { + "entropy": 1.8483826667070389, + "epoch": 4.6655999999999995, + "grad_norm": 1.831931710243225, + "learning_rate": 1.1576607925144456e-05, + "loss": 1.355, + "mean_token_accuracy": 0.6278511002659798, + "num_tokens": 10940772.0, + "step": 1460 + }, + { + "entropy": 1.8824394553899766, + "epoch": 4.6975999999999996, + "grad_norm": 1.9213542938232422, + "learning_rate": 1.1474341486913146e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.6256057649850846, + "num_tokens": 11016144.0, + "step": 1470 + }, + { + "entropy": 1.8709469974040984, + "epoch": 4.7296, + "grad_norm": 1.8768925666809082, + "learning_rate": 1.1371917182021297e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6317574754357338, + "num_tokens": 11089939.0, + "step": 1480 + }, + { + "entropy": 1.8673742085695266, + "epoch": 4.7616, + "grad_norm": 1.796302318572998, + "learning_rate": 1.1269345977658747e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6238353416323662, + "num_tokens": 11166087.0, + "step": 1490 + }, + { + "entropy": 1.8310889720916748, + "epoch": 4.7936, + "grad_norm": 1.8969939947128296, + "learning_rate": 1.1166638856744747e-05, + "loss": 1.3373, + "mean_token_accuracy": 0.6348015949130058, + "num_tokens": 11240732.0, + "step": 1500 + }, + { + "entropy": 1.8809226244688033, + "epoch": 4.8256, + "grad_norm": 1.642104983329773, + "learning_rate": 
1.1063806816751957e-05, + "loss": 1.3792, + "mean_token_accuracy": 0.6250617265701294, + "num_tokens": 11316878.0, + "step": 1510 + }, + { + "entropy": 1.8715822875499726, + "epoch": 4.8576, + "grad_norm": 1.962158441543579, + "learning_rate": 1.0960860868528872e-05, + "loss": 1.3711, + "mean_token_accuracy": 0.6293752744793892, + "num_tokens": 11389042.0, + "step": 1520 + }, + { + "entropy": 1.8657191127538681, + "epoch": 4.8896, + "grad_norm": 1.9577444791793823, + "learning_rate": 1.0857812035120845e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6259156972169876, + "num_tokens": 11464215.0, + "step": 1530 + }, + { + "entropy": 1.8811951220035552, + "epoch": 4.9216, + "grad_norm": 2.015150785446167, + "learning_rate": 1.0754671350589752e-05, + "loss": 1.4155, + "mean_token_accuracy": 0.626779156178236, + "num_tokens": 11539122.0, + "step": 1540 + }, + { + "entropy": 1.863905319571495, + "epoch": 4.9536, + "grad_norm": 1.8474093675613403, + "learning_rate": 1.065144985883253e-05, + "loss": 1.3409, + "mean_token_accuracy": 0.6319419264793396, + "num_tokens": 11613016.0, + "step": 1550 + }, + { + "entropy": 1.836970153450966, + "epoch": 4.9856, + "grad_norm": 1.8822177648544312, + "learning_rate": 1.054815861239864e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6292115703225136, + "num_tokens": 11688143.0, + "step": 1560 + }, + { + "entropy": 1.8377465950815302, + "epoch": 5.016, + "grad_norm": 1.8221346139907837, + "learning_rate": 1.0444808671306588e-05, + "loss": 1.3028, + "mean_token_accuracy": 0.6413120329380035, + "num_tokens": 11758768.0, + "step": 1570 + }, + { + "entropy": 1.7883025139570237, + "epoch": 5.048, + "grad_norm": 2.1959595680236816, + "learning_rate": 1.034141110185968e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6479741290211678, + "num_tokens": 11832210.0, + "step": 1580 + }, + { + "entropy": 1.7955584406852723, + "epoch": 5.08, + "grad_norm": 2.106905698776245, + "learning_rate": 1.0237976975461074e-05, + "loss": 1.2881, + 
"mean_token_accuracy": 0.6424632370471954, + "num_tokens": 11906115.0, + "step": 1590 + }, + { + "entropy": 1.7998322755098344, + "epoch": 5.112, + "grad_norm": 2.327314615249634, + "learning_rate": 1.0134517367428309e-05, + "loss": 1.3324, + "mean_token_accuracy": 0.6460248224437237, + "num_tokens": 11981328.0, + "step": 1600 + }, + { + "entropy": 1.7885828018188477, + "epoch": 5.144, + "grad_norm": 2.1001713275909424, + "learning_rate": 1.0031043355807386e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.63900758177042, + "num_tokens": 12056453.0, + "step": 1610 + }, + { + "entropy": 1.769435602426529, + "epoch": 5.176, + "grad_norm": 2.1210567951202393, + "learning_rate": 9.927566020186592e-06, + "loss": 1.2892, + "mean_token_accuracy": 0.6432970002293587, + "num_tokens": 12133433.0, + "step": 1620 + }, + { + "entropy": 1.7907766073942184, + "epoch": 5.208, + "grad_norm": 2.1842658519744873, + "learning_rate": 9.82409644051013e-06, + "loss": 1.2856, + "mean_token_accuracy": 0.6423615619540215, + "num_tokens": 12207150.0, + "step": 1630 + }, + { + "entropy": 1.7834827870130538, + "epoch": 5.24, + "grad_norm": 2.2503459453582764, + "learning_rate": 9.720645695891733e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6417693704366684, + "num_tokens": 12282584.0, + "step": 1640 + }, + { + "entropy": 1.763256973028183, + "epoch": 5.272, + "grad_norm": 1.9505388736724854, + "learning_rate": 9.617224863428346e-06, + "loss": 1.2951, + "mean_token_accuracy": 0.6429389595985413, + "num_tokens": 12359793.0, + "step": 1650 + }, + { + "entropy": 1.8142763644456863, + "epoch": 5.304, + "grad_norm": 1.9957698583602905, + "learning_rate": 9.513845017014048e-06, + "loss": 1.33, + "mean_token_accuracy": 0.6413653999567032, + "num_tokens": 12434251.0, + "step": 1660 + }, + { + "entropy": 1.797221601009369, + "epoch": 5.336, + "grad_norm": 2.5095462799072266, + "learning_rate": 9.410517226154276e-06, + "loss": 1.2978, + "mean_token_accuracy": 0.6389835774898529, + "num_tokens": 
12508416.0, + "step": 1670 + }, + { + "entropy": 1.8157870292663574, + "epoch": 5.368, + "grad_norm": 2.1890602111816406, + "learning_rate": 9.30725255478058e-06, + "loss": 1.3183, + "mean_token_accuracy": 0.6448161751031876, + "num_tokens": 12582896.0, + "step": 1680 + }, + { + "entropy": 1.7990054041147232, + "epoch": 5.4, + "grad_norm": 2.3904025554656982, + "learning_rate": 9.204062060065915e-06, + "loss": 1.3318, + "mean_token_accuracy": 0.636146479845047, + "num_tokens": 12656802.0, + "step": 1690 + }, + { + "entropy": 1.8003453463315964, + "epoch": 5.432, + "grad_norm": 1.9204304218292236, + "learning_rate": 9.100956791240699e-06, + "loss": 1.3186, + "mean_token_accuracy": 0.6372130662202835, + "num_tokens": 12733283.0, + "step": 1700 + }, + { + "entropy": 1.8101116061210631, + "epoch": 5.464, + "grad_norm": 2.009500026702881, + "learning_rate": 8.997947788409696e-06, + "loss": 1.3247, + "mean_token_accuracy": 0.6406339526176452, + "num_tokens": 12810272.0, + "step": 1710 + }, + { + "entropy": 1.764935952425003, + "epoch": 5.496, + "grad_norm": 2.2038798332214355, + "learning_rate": 8.89504608136989e-06, + "loss": 1.2792, + "mean_token_accuracy": 0.6445836886763573, + "num_tokens": 12885633.0, + "step": 1720 + }, + { + "entropy": 1.7950240582227708, + "epoch": 5.5280000000000005, + "grad_norm": 2.0160531997680664, + "learning_rate": 8.792262688429445e-06, + "loss": 1.2934, + "mean_token_accuracy": 0.6469692558050155, + "num_tokens": 12961131.0, + "step": 1730 + }, + { + "entropy": 1.7804677098989488, + "epoch": 5.5600000000000005, + "grad_norm": 2.1956582069396973, + "learning_rate": 8.689608615227933e-06, + "loss": 1.2969, + "mean_token_accuracy": 0.6438389763236045, + "num_tokens": 13036481.0, + "step": 1740 + }, + { + "entropy": 1.7932062089443206, + "epoch": 5.592, + "grad_norm": 2.2215394973754883, + "learning_rate": 8.587094853557877e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6460438340902328, + "num_tokens": 13111001.0, + "step": 1750 + }, + { 
+ "entropy": 1.8026408910751344, + "epoch": 5.624, + "grad_norm": 2.3881425857543945, + "learning_rate": 8.484732380187785e-06, + "loss": 1.3049, + "mean_token_accuracy": 0.6414234206080437, + "num_tokens": 13186347.0, + "step": 1760 + }, + { + "entropy": 1.8440747499465941, + "epoch": 5.656, + "grad_norm": 2.2154159545898438, + "learning_rate": 8.382532155686825e-06, + "loss": 1.3797, + "mean_token_accuracy": 0.6365857936441899, + "num_tokens": 13261455.0, + "step": 1770 + }, + { + "entropy": 1.7975190997123718, + "epoch": 5.688, + "grad_norm": 2.1991233825683594, + "learning_rate": 8.280505123251183e-06, + "loss": 1.3191, + "mean_token_accuracy": 0.6393151715397835, + "num_tokens": 13338064.0, + "step": 1780 + }, + { + "entropy": 1.8396487146615983, + "epoch": 5.72, + "grad_norm": 2.0190858840942383, + "learning_rate": 8.178662207532343e-06, + "loss": 1.4052, + "mean_token_accuracy": 0.629064130038023, + "num_tokens": 13414806.0, + "step": 1790 + }, + { + "entropy": 1.7840806126594544, + "epoch": 5.752, + "grad_norm": 2.3335204124450684, + "learning_rate": 8.077014313467274e-06, + "loss": 1.2701, + "mean_token_accuracy": 0.6464540064334869, + "num_tokens": 13489075.0, + "step": 1800 + }, + { + "entropy": 1.7840022534132003, + "epoch": 5.784, + "grad_norm": 2.2151618003845215, + "learning_rate": 7.975572325110819e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6358998969197274, + "num_tokens": 13565636.0, + "step": 1810 + }, + { + "entropy": 1.7677135676145554, + "epoch": 5.816, + "grad_norm": 2.11505389213562, + "learning_rate": 7.874347104470234e-06, + "loss": 1.2765, + "mean_token_accuracy": 0.6448719501495361, + "num_tokens": 13641112.0, + "step": 1820 + }, + { + "entropy": 1.7586044907569884, + "epoch": 5.848, + "grad_norm": 2.178250789642334, + "learning_rate": 7.773349490342157e-06, + "loss": 1.2846, + "mean_token_accuracy": 0.6450280979275703, + "num_tokens": 13715158.0, + "step": 1830 + }, + { + "entropy": 1.8128920108079911, + "epoch": 5.88, + 
"grad_norm": 2.2499353885650635, + "learning_rate": 7.672590297152013e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6343795835971833, + "num_tokens": 13791086.0, + "step": 1840 + }, + { + "entropy": 1.7873643577098846, + "epoch": 5.912, + "grad_norm": 2.1989104747772217, + "learning_rate": 7.572080313796064e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6413815975189209, + "num_tokens": 13865700.0, + "step": 1850 + }, + { + "entropy": 1.790488451719284, + "epoch": 5.944, + "grad_norm": 2.2605504989624023, + "learning_rate": 7.471830302486151e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6424889475107193, + "num_tokens": 13938540.0, + "step": 1860 + }, + { + "entropy": 1.7985246628522873, + "epoch": 5.976, + "grad_norm": 2.3228533267974854, + "learning_rate": 7.371850997597355e-06, + "loss": 1.2854, + "mean_token_accuracy": 0.6457341402769089, + "num_tokens": 14011087.0, + "step": 1870 + } + ], + "logging_steps": 10, + "max_steps": 3130, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.366506102804185e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1878/training_args.bin b/checkpoint-1878/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ca9563f4064abfd2f36668559093a5f0763d7c85 --- /dev/null +++ b/checkpoint-1878/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88fbd415d023bc35b9e36c515e374642c961f2c3a428f0d0bba13a27d8c151a7 +size 6289 diff --git a/checkpoint-2191/README.md b/checkpoint-2191/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96b9f5618833a1728fbecbefb87f08b279b6b2ed --- /dev/null +++ b/checkpoint-2191/README.md 
@@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/checkpoint-2191/adapter_config.json b/checkpoint-2191/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..8cd4cdaf30ceb587991efae70006ef463605c378 --- /dev/null +++ b/checkpoint-2191/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "o_proj", + "k_proj", + "gate_proj", + "v_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2191/adapter_model.safetensors b/checkpoint-2191/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1daa9cdefa34c0a8279e1504760d134d0adf8284 --- /dev/null +++ b/checkpoint-2191/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e2354c7fa06f8c9edc7564f44dc50bf844a43cb99bc721fe13bbddf52f16045 +size 335604696 diff --git a/checkpoint-2191/chat_template.jinja b/checkpoint-2191/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-2191/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true 
%} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} 
+ {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-2191/optimizer.pt b/checkpoint-2191/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f32aa57d72be218123f1d23670dd26950442f9f5 --- /dev/null +++ b/checkpoint-2191/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d836de1d9f8c8f7da0e887661c326cb923086d497764c1fb42037da5a07bd0d9 +size 671473443 diff --git a/checkpoint-2191/rng_state.pth b/checkpoint-2191/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f94f693327737e32b988557eb925f20487d132a --- /dev/null +++ b/checkpoint-2191/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb90ea4ea7267de7b0555c5e0e2eed66fa477f18850eb66d72beb1a8243b4aac +size 14645 diff --git a/checkpoint-2191/scheduler.pt b/checkpoint-2191/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed40dd56f2607ce20440e5631e23333e449ac0fb --- /dev/null +++ b/checkpoint-2191/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ebab70eadf5095544b80af8b44b5737a796bb6042159dd43e49db03fffab3ae +size 1465 diff --git 
a/checkpoint-2191/special_tokens_map.json b/checkpoint-2191/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..02ee80b6196926a5ad790a004d9efd6ab1ba6542 --- /dev/null +++ b/checkpoint-2191/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2191/tokenizer.json b/checkpoint-2191/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-2191/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-2191/tokenizer_config.json b/checkpoint-2191/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8b0c7c141373ca36e5e819a28f60e146ccef652f --- /dev/null +++ b/checkpoint-2191/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": 
"<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": 
"<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": 
"<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-2191/trainer_state.json b/checkpoint-2191/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a27b46346cdeb1636b939483b1492bfa1ffe3dbc --- /dev/null +++ b/checkpoint-2191/trainer_state.json @@ -0,0 +1,2224 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 2191, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.098961293697357, + "epoch": 0.032, + "grad_norm": 1.891703486442566, + "learning_rate": 1.9148936170212767e-06, + "loss": 2.0828, + "mean_token_accuracy": 0.530680388212204, + "num_tokens": 72723.0, + "step": 10 + }, + { + "entropy": 2.119775766134262, + "epoch": 0.064, + "grad_norm": 1.2044862508773804, + "learning_rate": 4.042553191489362e-06, + "loss": 2.0093, + "mean_token_accuracy": 0.5355814293026924, + "num_tokens": 146392.0, + "step": 20 + }, + { + "entropy": 2.220579963922501, + "epoch": 0.096, + "grad_norm": 0.9982365369796753, + "learning_rate": 6.170212765957447e-06, + "loss": 1.8939, + "mean_token_accuracy": 0.5451944440603256, + "num_tokens": 223711.0, + "step": 30 + }, + { + "entropy": 
2.382017892599106, + "epoch": 0.128, + "grad_norm": 0.7386544346809387, + "learning_rate": 8.297872340425532e-06, + "loss": 1.9066, + "mean_token_accuracy": 0.5411656655371189, + "num_tokens": 300889.0, + "step": 40 + }, + { + "entropy": 2.274736815690994, + "epoch": 0.16, + "grad_norm": 0.6412256956100464, + "learning_rate": 1.0425531914893619e-05, + "loss": 1.7387, + "mean_token_accuracy": 0.5679451540112496, + "num_tokens": 377362.0, + "step": 50 + }, + { + "entropy": 2.3663365960121157, + "epoch": 0.192, + "grad_norm": 0.6228290796279907, + "learning_rate": 1.2553191489361702e-05, + "loss": 1.7492, + "mean_token_accuracy": 0.5746532663702965, + "num_tokens": 449594.0, + "step": 60 + }, + { + "entropy": 2.315044218301773, + "epoch": 0.224, + "grad_norm": 0.6034156680107117, + "learning_rate": 1.4680851063829789e-05, + "loss": 1.7111, + "mean_token_accuracy": 0.5675176709890366, + "num_tokens": 523439.0, + "step": 70 + }, + { + "entropy": 2.288265961408615, + "epoch": 0.256, + "grad_norm": 0.45914268493652344, + "learning_rate": 1.6808510638297873e-05, + "loss": 1.6931, + "mean_token_accuracy": 0.5713589735329151, + "num_tokens": 599650.0, + "step": 80 + }, + { + "entropy": 2.2693382859230042, + "epoch": 0.288, + "grad_norm": 0.6197793483734131, + "learning_rate": 1.893617021276596e-05, + "loss": 1.6542, + "mean_token_accuracy": 0.578165066242218, + "num_tokens": 675377.0, + "step": 90 + }, + { + "entropy": 2.293796479701996, + "epoch": 0.32, + "grad_norm": 0.5502006411552429, + "learning_rate": 1.9999866154043656e-05, + "loss": 1.7108, + "mean_token_accuracy": 0.5681634023785591, + "num_tokens": 751838.0, + "step": 100 + }, + { + "entropy": 2.2658903509378434, + "epoch": 0.352, + "grad_norm": 0.5713317394256592, + "learning_rate": 1.9998795407890486e-05, + "loss": 1.6168, + "mean_token_accuracy": 0.5843982398509979, + "num_tokens": 825539.0, + "step": 110 + }, + { + "entropy": 2.270280033349991, + "epoch": 0.384, + "grad_norm": 0.5967482924461365, + 
"learning_rate": 1.999665403023542e-05, + "loss": 1.6194, + "mean_token_accuracy": 0.5839526921510696, + "num_tokens": 897258.0, + "step": 120 + }, + { + "entropy": 2.2349284648895265, + "epoch": 0.416, + "grad_norm": 0.4899630844593048, + "learning_rate": 1.9993442250368708e-05, + "loss": 1.6313, + "mean_token_accuracy": 0.5815729826688767, + "num_tokens": 973142.0, + "step": 130 + }, + { + "entropy": 2.245553806424141, + "epoch": 0.448, + "grad_norm": 0.6546034812927246, + "learning_rate": 1.9989160412195047e-05, + "loss": 1.6395, + "mean_token_accuracy": 0.5780692532658577, + "num_tokens": 1046762.0, + "step": 140 + }, + { + "entropy": 2.288555932044983, + "epoch": 0.48, + "grad_norm": 0.5528404116630554, + "learning_rate": 1.9983808974196752e-05, + "loss": 1.7118, + "mean_token_accuracy": 0.5686657652258873, + "num_tokens": 1125167.0, + "step": 150 + }, + { + "entropy": 2.232080355286598, + "epoch": 0.512, + "grad_norm": 0.5887461304664612, + "learning_rate": 1.9977388509384656e-05, + "loss": 1.6339, + "mean_token_accuracy": 0.5838325396180153, + "num_tokens": 1199589.0, + "step": 160 + }, + { + "entropy": 2.2232475757598875, + "epoch": 0.544, + "grad_norm": 0.5764511823654175, + "learning_rate": 1.9969899705236763e-05, + "loss": 1.6173, + "mean_token_accuracy": 0.5848860442638397, + "num_tokens": 1276431.0, + "step": 170 + }, + { + "entropy": 2.244092071056366, + "epoch": 0.576, + "grad_norm": 0.6295827627182007, + "learning_rate": 1.9961343363624626e-05, + "loss": 1.6017, + "mean_token_accuracy": 0.5818701103329659, + "num_tokens": 1350012.0, + "step": 180 + }, + { + "entropy": 2.237305074930191, + "epoch": 0.608, + "grad_norm": 0.5939638018608093, + "learning_rate": 1.9951720400727495e-05, + "loss": 1.6704, + "mean_token_accuracy": 0.5779796853661537, + "num_tokens": 1423391.0, + "step": 190 + }, + { + "entropy": 2.211505854129791, + "epoch": 0.64, + "grad_norm": 0.6119778156280518, + "learning_rate": 1.9941031846934213e-05, + "loss": 1.6223, + 
"mean_token_accuracy": 0.5848233133554459, + "num_tokens": 1499124.0, + "step": 200 + }, + { + "entropy": 2.2195493161678312, + "epoch": 0.672, + "grad_norm": 0.6129831671714783, + "learning_rate": 1.9929278846732883e-05, + "loss": 1.5886, + "mean_token_accuracy": 0.5897421136498451, + "num_tokens": 1573541.0, + "step": 210 + }, + { + "entropy": 2.2096123576164244, + "epoch": 0.704, + "grad_norm": 0.6091306209564209, + "learning_rate": 1.9916462658588328e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5894487425684929, + "num_tokens": 1649546.0, + "step": 220 + }, + { + "entropy": 2.249841979146004, + "epoch": 0.736, + "grad_norm": 0.570816695690155, + "learning_rate": 1.9902584654807325e-05, + "loss": 1.5876, + "mean_token_accuracy": 0.5911228567361831, + "num_tokens": 1722199.0, + "step": 230 + }, + { + "entropy": 2.1915894985198974, + "epoch": 0.768, + "grad_norm": 0.5748864412307739, + "learning_rate": 1.988764632139168e-05, + "loss": 1.5963, + "mean_token_accuracy": 0.5891387596726417, + "num_tokens": 1797304.0, + "step": 240 + }, + { + "entropy": 2.2358563423156737, + "epoch": 0.8, + "grad_norm": 0.6511492729187012, + "learning_rate": 1.9871649257879115e-05, + "loss": 1.6453, + "mean_token_accuracy": 0.5792816638946533, + "num_tokens": 1870113.0, + "step": 250 + }, + { + "entropy": 2.2169984579086304, + "epoch": 0.832, + "grad_norm": 0.5317641496658325, + "learning_rate": 1.9854595177171968e-05, + "loss": 1.6594, + "mean_token_accuracy": 0.577045065164566, + "num_tokens": 1947405.0, + "step": 260 + }, + { + "entropy": 2.2434292674064635, + "epoch": 0.864, + "grad_norm": 0.5399971604347229, + "learning_rate": 1.9836485905353823e-05, + "loss": 1.7158, + "mean_token_accuracy": 0.5683416239917278, + "num_tokens": 2026284.0, + "step": 270 + }, + { + "entropy": 2.227828550338745, + "epoch": 0.896, + "grad_norm": 0.5378643870353699, + "learning_rate": 1.9817323381493933e-05, + "loss": 1.6714, + "mean_token_accuracy": 0.5818367518484593, + "num_tokens": 2103986.0, 
+ "step": 280 + }, + { + "entropy": 2.2110894501209257, + "epoch": 0.928, + "grad_norm": 0.5195969343185425, + "learning_rate": 1.979710965743964e-05, + "loss": 1.6239, + "mean_token_accuracy": 0.5819958478212357, + "num_tokens": 2177010.0, + "step": 290 + }, + { + "entropy": 2.1666628658771514, + "epoch": 0.96, + "grad_norm": 0.5663164258003235, + "learning_rate": 1.977584689759664e-05, + "loss": 1.6024, + "mean_token_accuracy": 0.5876665830612182, + "num_tokens": 2251285.0, + "step": 300 + }, + { + "entropy": 2.214203083515167, + "epoch": 0.992, + "grad_norm": 0.6764860153198242, + "learning_rate": 1.9753537378697237e-05, + "loss": 1.6446, + "mean_token_accuracy": 0.5818003416061401, + "num_tokens": 2325752.0, + "step": 310 + }, + { + "entropy": 2.16783396821273, + "epoch": 1.0224, + "grad_norm": 0.5795008540153503, + "learning_rate": 1.9730183489556563e-05, + "loss": 1.594, + "mean_token_accuracy": 0.5867547392845154, + "num_tokens": 2396254.0, + "step": 320 + }, + { + "entropy": 2.172953352332115, + "epoch": 1.0544, + "grad_norm": 0.6686444282531738, + "learning_rate": 1.9705787730816776e-05, + "loss": 1.613, + "mean_token_accuracy": 0.5906373374164104, + "num_tokens": 2470123.0, + "step": 330 + }, + { + "entropy": 2.2217346757650374, + "epoch": 1.0864, + "grad_norm": 0.6389091610908508, + "learning_rate": 1.9680352714679324e-05, + "loss": 1.7053, + "mean_token_accuracy": 0.577599074691534, + "num_tokens": 2545749.0, + "step": 340 + }, + { + "entropy": 2.138428696990013, + "epoch": 1.1184, + "grad_norm": 0.7369883060455322, + "learning_rate": 1.9653881164625234e-05, + "loss": 1.5599, + "mean_token_accuracy": 0.5946489304304123, + "num_tokens": 2623270.0, + "step": 350 + }, + { + "entropy": 2.147254040837288, + "epoch": 1.1504, + "grad_norm": 0.6707085967063904, + "learning_rate": 1.9626375915123473e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5965728983283043, + "num_tokens": 2697616.0, + "step": 360 + }, + { + "entropy": 2.1412769109010696, + "epoch": 
1.1824, + "grad_norm": 0.7201400995254517, + "learning_rate": 1.9597839911327475e-05, + "loss": 1.58, + "mean_token_accuracy": 0.5957784004509449, + "num_tokens": 2771426.0, + "step": 370 + }, + { + "entropy": 2.164059528708458, + "epoch": 1.2144, + "grad_norm": 0.7561144232749939, + "learning_rate": 1.9568276208759772e-05, + "loss": 1.5872, + "mean_token_accuracy": 0.5874110117554665, + "num_tokens": 2846711.0, + "step": 380 + }, + { + "entropy": 2.205427420139313, + "epoch": 1.2464, + "grad_norm": 0.691585898399353, + "learning_rate": 1.9537687972984804e-05, + "loss": 1.625, + "mean_token_accuracy": 0.5892911069095135, + "num_tokens": 2920916.0, + "step": 390 + }, + { + "entropy": 2.1242104679346085, + "epoch": 1.2784, + "grad_norm": 0.6999676823616028, + "learning_rate": 1.950607847926999e-05, + "loss": 1.5606, + "mean_token_accuracy": 0.5917269751429558, + "num_tokens": 2996056.0, + "step": 400 + }, + { + "entropy": 2.114223065972328, + "epoch": 1.3104, + "grad_norm": 0.7616406679153442, + "learning_rate": 1.947345111223502e-05, + "loss": 1.5296, + "mean_token_accuracy": 0.5938275754451752, + "num_tokens": 3072912.0, + "step": 410 + }, + { + "entropy": 2.1418962299823763, + "epoch": 1.3424, + "grad_norm": 0.7253025770187378, + "learning_rate": 1.943980936548942e-05, + "loss": 1.575, + "mean_token_accuracy": 0.5945621818304062, + "num_tokens": 3148498.0, + "step": 420 + }, + { + "entropy": 2.109667718410492, + "epoch": 1.3744, + "grad_norm": 0.8988682627677917, + "learning_rate": 1.9405156841258498e-05, + "loss": 1.5796, + "mean_token_accuracy": 0.5901263400912284, + "num_tokens": 3224741.0, + "step": 430 + }, + { + "entropy": 2.179358023405075, + "epoch": 1.4064, + "grad_norm": 0.741558849811554, + "learning_rate": 1.936949724999762e-05, + "loss": 1.6507, + "mean_token_accuracy": 0.581992793083191, + "num_tokens": 3299366.0, + "step": 440 + }, + { + "entropy": 2.1574251472949983, + "epoch": 1.4384000000000001, + "grad_norm": 0.7538727521896362, + 
"learning_rate": 1.9332834409994906e-05, + "loss": 1.5771, + "mean_token_accuracy": 0.5888051658868789, + "num_tokens": 3374162.0, + "step": 450 + }, + { + "entropy": 2.1186763852834702, + "epoch": 1.4704, + "grad_norm": 0.7905173301696777, + "learning_rate": 1.929517224696239e-05, + "loss": 1.6138, + "mean_token_accuracy": 0.584889967739582, + "num_tokens": 3452582.0, + "step": 460 + }, + { + "entropy": 2.1135365635156633, + "epoch": 1.5024, + "grad_norm": 0.7416484951972961, + "learning_rate": 1.9256514793615674e-05, + "loss": 1.5623, + "mean_token_accuracy": 0.5928735345602035, + "num_tokens": 3527694.0, + "step": 470 + }, + { + "entropy": 2.146635016798973, + "epoch": 1.5344, + "grad_norm": 0.731999397277832, + "learning_rate": 1.9216866189242095e-05, + "loss": 1.5634, + "mean_token_accuracy": 0.5988615363836288, + "num_tokens": 3600277.0, + "step": 480 + }, + { + "entropy": 2.1472962319850923, + "epoch": 1.5664, + "grad_norm": 0.7493702173233032, + "learning_rate": 1.9176230679257547e-05, + "loss": 1.5891, + "mean_token_accuracy": 0.5858126983046532, + "num_tokens": 3674781.0, + "step": 490 + }, + { + "entropy": 2.1530486762523653, + "epoch": 1.5984, + "grad_norm": 0.8006687164306641, + "learning_rate": 1.9134612614751865e-05, + "loss": 1.5674, + "mean_token_accuracy": 0.5904534175992012, + "num_tokens": 3748434.0, + "step": 500 + }, + { + "entropy": 2.169738906621933, + "epoch": 1.6303999999999998, + "grad_norm": 0.9293455481529236, + "learning_rate": 1.909201645202294e-05, + "loss": 1.6104, + "mean_token_accuracy": 0.5860036969184875, + "num_tokens": 3823982.0, + "step": 510 + }, + { + "entropy": 2.178475347161293, + "epoch": 1.6623999999999999, + "grad_norm": 0.7716575860977173, + "learning_rate": 1.904844675209956e-05, + "loss": 1.6432, + "mean_token_accuracy": 0.5838924221694469, + "num_tokens": 3900064.0, + "step": 520 + }, + { + "entropy": 2.1585603266954423, + "epoch": 1.6944, + "grad_norm": 0.8225084543228149, + "learning_rate": 
1.9003908180253027e-05, + "loss": 1.5957, + "mean_token_accuracy": 0.5880116850137711, + "num_tokens": 3974029.0, + "step": 530 + }, + { + "entropy": 2.111869788169861, + "epoch": 1.7264, + "grad_norm": 0.7035638093948364, + "learning_rate": 1.8958405505497613e-05, + "loss": 1.579, + "mean_token_accuracy": 0.5890362292528153, + "num_tokens": 4049974.0, + "step": 540 + }, + { + "entropy": 2.144411253929138, + "epoch": 1.7584, + "grad_norm": 0.7046850919723511, + "learning_rate": 1.8911943600079934e-05, + "loss": 1.5926, + "mean_token_accuracy": 0.5874261602759361, + "num_tokens": 4125206.0, + "step": 550 + }, + { + "entropy": 2.1093025386333464, + "epoch": 1.7904, + "grad_norm": 0.807727575302124, + "learning_rate": 1.8864527438957223e-05, + "loss": 1.5367, + "mean_token_accuracy": 0.5988967984914779, + "num_tokens": 4199365.0, + "step": 560 + }, + { + "entropy": 2.097169244289398, + "epoch": 1.8224, + "grad_norm": 0.7856780886650085, + "learning_rate": 1.881616209926465e-05, + "loss": 1.561, + "mean_token_accuracy": 0.5948230788111687, + "num_tokens": 4275889.0, + "step": 570 + }, + { + "entropy": 2.088553088903427, + "epoch": 1.8544, + "grad_norm": 0.8993458151817322, + "learning_rate": 1.876685275977167e-05, + "loss": 1.5557, + "mean_token_accuracy": 0.5941933646798134, + "num_tokens": 4350502.0, + "step": 580 + }, + { + "entropy": 2.132419008016586, + "epoch": 1.8864, + "grad_norm": 0.7769711017608643, + "learning_rate": 1.8716604700327516e-05, + "loss": 1.6105, + "mean_token_accuracy": 0.5815305605530738, + "num_tokens": 4426429.0, + "step": 590 + }, + { + "entropy": 2.1076891005039213, + "epoch": 1.9184, + "grad_norm": 0.9261249899864197, + "learning_rate": 1.866542330129583e-05, + "loss": 1.5307, + "mean_token_accuracy": 0.5964644759893417, + "num_tokens": 4500147.0, + "step": 600 + }, + { + "entropy": 2.114642283320427, + "epoch": 1.9504000000000001, + "grad_norm": 0.806425929069519, + "learning_rate": 1.8613314042978576e-05, + "loss": 1.5809, + 
"mean_token_accuracy": 0.5901800125837326, + "num_tokens": 4573438.0, + "step": 610 + }, + { + "entropy": 2.1167576968669892, + "epoch": 1.9824000000000002, + "grad_norm": 0.8191499710083008, + "learning_rate": 1.856028250502923e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5843381330370903, + "num_tokens": 4648156.0, + "step": 620 + }, + { + "entropy": 2.0566249019221257, + "epoch": 2.0128, + "grad_norm": 0.7406135201454163, + "learning_rate": 1.8506334365855315e-05, + "loss": 1.5187, + "mean_token_accuracy": 0.6027438483740154, + "num_tokens": 4719492.0, + "step": 630 + }, + { + "entropy": 2.0126763731241226, + "epoch": 2.0448, + "grad_norm": 0.8845784068107605, + "learning_rate": 1.8451475402010405e-05, + "loss": 1.4841, + "mean_token_accuracy": 0.6069207280874253, + "num_tokens": 4796271.0, + "step": 640 + }, + { + "entropy": 2.0516900300979612, + "epoch": 2.0768, + "grad_norm": 0.9927017092704773, + "learning_rate": 1.8395711487575564e-05, + "loss": 1.512, + "mean_token_accuracy": 0.6031922519207, + "num_tokens": 4870202.0, + "step": 650 + }, + { + "entropy": 2.0824343889951704, + "epoch": 2.1088, + "grad_norm": 0.927236795425415, + "learning_rate": 1.8339048593530406e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5952437989413738, + "num_tokens": 4945568.0, + "step": 660 + }, + { + "entropy": 2.0304481953382494, + "epoch": 2.1408, + "grad_norm": 0.874019205570221, + "learning_rate": 1.8281492787113707e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.5992600306868553, + "num_tokens": 5020723.0, + "step": 670 + }, + { + "entropy": 2.0402441143989565, + "epoch": 2.1728, + "grad_norm": 0.8746942281723022, + "learning_rate": 1.8223050231173802e-05, + "loss": 1.5119, + "mean_token_accuracy": 0.5994458049535751, + "num_tokens": 5095780.0, + "step": 680 + }, + { + "entropy": 2.018441066145897, + "epoch": 2.2048, + "grad_norm": 1.063180923461914, + "learning_rate": 1.816372718350864e-05, + "loss": 1.4923, + "mean_token_accuracy": 0.6064845189452172, + 
"num_tokens": 5169733.0, + "step": 690 + }, + { + "entropy": 2.0563316702842713, + "epoch": 2.2368, + "grad_norm": 1.0281789302825928, + "learning_rate": 1.810352999619574e-05, + "loss": 1.5505, + "mean_token_accuracy": 0.602813882380724, + "num_tokens": 5246393.0, + "step": 700 + }, + { + "entropy": 2.0298285841941834, + "epoch": 2.2688, + "grad_norm": 1.070520281791687, + "learning_rate": 1.804246511491206e-05, + "loss": 1.5159, + "mean_token_accuracy": 0.6006126523017883, + "num_tokens": 5322244.0, + "step": 710 + }, + { + "entropy": 2.0195819228887557, + "epoch": 2.3008, + "grad_norm": 0.9672983884811401, + "learning_rate": 1.7980539078243783e-05, + "loss": 1.5166, + "mean_token_accuracy": 0.6054230839014053, + "num_tokens": 5399317.0, + "step": 720 + }, + { + "entropy": 2.045917159318924, + "epoch": 2.3327999999999998, + "grad_norm": 1.1228744983673096, + "learning_rate": 1.791775851698622e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.6015639662742615, + "num_tokens": 5473195.0, + "step": 730 + }, + { + "entropy": 2.0935415983200074, + "epoch": 2.3648, + "grad_norm": 1.149794578552246, + "learning_rate": 1.7854130153433785e-05, + "loss": 1.5583, + "mean_token_accuracy": 0.5921522840857506, + "num_tokens": 5548357.0, + "step": 740 + }, + { + "entropy": 2.044076007604599, + "epoch": 2.3968, + "grad_norm": 1.063625693321228, + "learning_rate": 1.7789660800660222e-05, + "loss": 1.5013, + "mean_token_accuracy": 0.5974589124321937, + "num_tokens": 5620915.0, + "step": 750 + }, + { + "entropy": 2.092478734254837, + "epoch": 2.4288, + "grad_norm": 1.1822012662887573, + "learning_rate": 1.7724357361789075e-05, + "loss": 1.5552, + "mean_token_accuracy": 0.5929681301116944, + "num_tokens": 5693406.0, + "step": 760 + }, + { + "entropy": 2.0430804908275606, + "epoch": 2.4608, + "grad_norm": 0.9921984076499939, + "learning_rate": 1.765822682925453e-05, + "loss": 1.4944, + "mean_token_accuracy": 0.6029774472117424, + "num_tokens": 5770143.0, + "step": 770 + }, + { + 
"entropy": 2.049290281534195, + "epoch": 2.4928, + "grad_norm": 1.0144131183624268, + "learning_rate": 1.7591276284052695e-05, + "loss": 1.5437, + "mean_token_accuracy": 0.5986773043870925, + "num_tokens": 5844022.0, + "step": 780 + }, + { + "entropy": 2.033898201584816, + "epoch": 2.5248, + "grad_norm": 1.1700315475463867, + "learning_rate": 1.7523512894983396e-05, + "loss": 1.5197, + "mean_token_accuracy": 0.5972102269530296, + "num_tokens": 5919099.0, + "step": 790 + }, + { + "entropy": 2.03344586789608, + "epoch": 2.5568, + "grad_norm": 1.0503427982330322, + "learning_rate": 1.745494391788257e-05, + "loss": 1.5456, + "mean_token_accuracy": 0.6011263683438302, + "num_tokens": 5997797.0, + "step": 800 + }, + { + "entropy": 2.0796399265527725, + "epoch": 2.5888, + "grad_norm": 1.0316176414489746, + "learning_rate": 1.7385576694845324e-05, + "loss": 1.608, + "mean_token_accuracy": 0.6024919278919697, + "num_tokens": 6075434.0, + "step": 810 + }, + { + "entropy": 2.0257797837257385, + "epoch": 2.6208, + "grad_norm": 1.048309087753296, + "learning_rate": 1.7315418653439802e-05, + "loss": 1.4876, + "mean_token_accuracy": 0.6070949509739876, + "num_tokens": 6149232.0, + "step": 820 + }, + { + "entropy": 2.024846690893173, + "epoch": 2.6528, + "grad_norm": 1.186710000038147, + "learning_rate": 1.7244477305911845e-05, + "loss": 1.499, + "mean_token_accuracy": 0.6022308841347694, + "num_tokens": 6222180.0, + "step": 830 + }, + { + "entropy": 1.9938248336315154, + "epoch": 2.6848, + "grad_norm": 1.1091604232788086, + "learning_rate": 1.717276024838062e-05, + "loss": 1.4795, + "mean_token_accuracy": 0.6044012248516083, + "num_tokens": 6296902.0, + "step": 840 + }, + { + "entropy": 1.9988998174667358, + "epoch": 2.7168, + "grad_norm": 1.0359690189361572, + "learning_rate": 1.710027516002526e-05, + "loss": 1.5173, + "mean_token_accuracy": 0.6025070771574974, + "num_tokens": 6373494.0, + "step": 850 + }, + { + "entropy": 2.02343093752861, + "epoch": 2.7488, + "grad_norm": 
1.1783568859100342, + "learning_rate": 1.7027029802262598e-05, + "loss": 1.5146, + "mean_token_accuracy": 0.6033479735255242, + "num_tokens": 6449229.0, + "step": 860 + }, + { + "entropy": 2.0429257422685625, + "epoch": 2.7808, + "grad_norm": 0.9909568428993225, + "learning_rate": 1.6953032017916115e-05, + "loss": 1.5473, + "mean_token_accuracy": 0.5932901218533516, + "num_tokens": 6525728.0, + "step": 870 + }, + { + "entropy": 2.0058376491069794, + "epoch": 2.8128, + "grad_norm": 1.0904430150985718, + "learning_rate": 1.687828973037615e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6120153024792672, + "num_tokens": 6599335.0, + "step": 880 + }, + { + "entropy": 2.005480855703354, + "epoch": 2.8448, + "grad_norm": 1.1638548374176025, + "learning_rate": 1.6802810942751514e-05, + "loss": 1.4887, + "mean_token_accuracy": 0.6060751393437386, + "num_tokens": 6672722.0, + "step": 890 + }, + { + "entropy": 2.0311779022216796, + "epoch": 2.8768000000000002, + "grad_norm": 1.1404571533203125, + "learning_rate": 1.6726603737012527e-05, + "loss": 1.5238, + "mean_token_accuracy": 0.6015868663787842, + "num_tokens": 6748069.0, + "step": 900 + }, + { + "entropy": 2.0126856863498688, + "epoch": 2.9088000000000003, + "grad_norm": 1.0942543745040894, + "learning_rate": 1.6649676273125647e-05, + "loss": 1.4984, + "mean_token_accuracy": 0.6019899815320968, + "num_tokens": 6820935.0, + "step": 910 + }, + { + "entropy": 1.9961138010025024, + "epoch": 2.9408, + "grad_norm": 1.0870610475540161, + "learning_rate": 1.6572036788179728e-05, + "loss": 1.4962, + "mean_token_accuracy": 0.6030571654438972, + "num_tokens": 6896286.0, + "step": 920 + }, + { + "entropy": 2.035824549198151, + "epoch": 2.9728, + "grad_norm": 1.0822633504867554, + "learning_rate": 1.6493693595504022e-05, + "loss": 1.5354, + "mean_token_accuracy": 0.5986709952354431, + "num_tokens": 6971854.0, + "step": 930 + }, + { + "entropy": 2.0243908260997974, + "epoch": 3.0032, + "grad_norm": 1.0899602174758911, + 
"learning_rate": 1.6414655083778027e-05, + "loss": 1.5032, + "mean_token_accuracy": 0.5983682243447555, + "num_tokens": 7041122.0, + "step": 940 + }, + { + "entropy": 1.9538823068141937, + "epoch": 3.0352, + "grad_norm": 1.3042237758636475, + "learning_rate": 1.633492971613326e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6146818101406097, + "num_tokens": 7116032.0, + "step": 950 + }, + { + "entropy": 1.9383916020393372, + "epoch": 3.0672, + "grad_norm": 1.397078037261963, + "learning_rate": 1.6254526029247048e-05, + "loss": 1.4019, + "mean_token_accuracy": 0.6210932344198227, + "num_tokens": 7189009.0, + "step": 960 + }, + { + "entropy": 1.9460978150367736, + "epoch": 3.0992, + "grad_norm": 1.2756887674331665, + "learning_rate": 1.617345263242847e-05, + "loss": 1.4623, + "mean_token_accuracy": 0.6121616646647453, + "num_tokens": 7263068.0, + "step": 970 + }, + { + "entropy": 1.9156711965799331, + "epoch": 3.1312, + "grad_norm": 1.1937649250030518, + "learning_rate": 1.609171820669649e-05, + "loss": 1.4301, + "mean_token_accuracy": 0.6136599197983742, + "num_tokens": 7338652.0, + "step": 980 + }, + { + "entropy": 1.9247682303190232, + "epoch": 3.1632, + "grad_norm": 1.3291118144989014, + "learning_rate": 1.6009331503850448e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6153608947992325, + "num_tokens": 7414529.0, + "step": 990 + }, + { + "entropy": 1.9066543668508529, + "epoch": 3.1952, + "grad_norm": 1.4356389045715332, + "learning_rate": 1.5926301345532925e-05, + "loss": 1.4413, + "mean_token_accuracy": 0.612147618830204, + "num_tokens": 7489106.0, + "step": 1000 + }, + { + "entropy": 1.895160937309265, + "epoch": 3.2272, + "grad_norm": 1.4345523118972778, + "learning_rate": 1.5842636622285187e-05, + "loss": 1.4207, + "mean_token_accuracy": 0.6138400137424469, + "num_tokens": 7564304.0, + "step": 1010 + }, + { + "entropy": 1.9546802312135696, + "epoch": 3.2592, + "grad_norm": 1.5242680311203003, + "learning_rate": 1.575834629259519e-05, + "loss": 1.4435, + 
"mean_token_accuracy": 0.6153354361653328, + "num_tokens": 7637409.0, + "step": 1020 + }, + { + "entropy": 1.912938117980957, + "epoch": 3.2912, + "grad_norm": 1.529726505279541, + "learning_rate": 1.5673439381938365e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.6191004544496537, + "num_tokens": 7711595.0, + "step": 1030 + }, + { + "entropy": 1.8989770442247391, + "epoch": 3.3232, + "grad_norm": 1.3367948532104492, + "learning_rate": 1.5587924981811196e-05, + "loss": 1.394, + "mean_token_accuracy": 0.624155393242836, + "num_tokens": 7785750.0, + "step": 1040 + }, + { + "entropy": 1.932333904504776, + "epoch": 3.3552, + "grad_norm": 1.4732215404510498, + "learning_rate": 1.5501812248757734e-05, + "loss": 1.3959, + "mean_token_accuracy": 0.6221834555268287, + "num_tokens": 7859036.0, + "step": 1050 + }, + { + "entropy": 1.9492982983589173, + "epoch": 3.3872, + "grad_norm": 1.4499313831329346, + "learning_rate": 1.5415110403389166e-05, + "loss": 1.4633, + "mean_token_accuracy": 0.6100246667861938, + "num_tokens": 7933165.0, + "step": 1060 + }, + { + "entropy": 1.9063653618097305, + "epoch": 3.4192, + "grad_norm": 1.4364317655563354, + "learning_rate": 1.5327828729396482e-05, + "loss": 1.4216, + "mean_token_accuracy": 0.6210869938135147, + "num_tokens": 8009376.0, + "step": 1070 + }, + { + "entropy": 1.9919361650943757, + "epoch": 3.4512, + "grad_norm": 1.5573089122772217, + "learning_rate": 1.5239976572556438e-05, + "loss": 1.5899, + "mean_token_accuracy": 0.5991086520254612, + "num_tokens": 8086825.0, + "step": 1080 + }, + { + "entropy": 1.922476476430893, + "epoch": 3.4832, + "grad_norm": 1.3339344263076782, + "learning_rate": 1.5151563339730849e-05, + "loss": 1.4162, + "mean_token_accuracy": 0.6182018965482712, + "num_tokens": 8161726.0, + "step": 1090 + }, + { + "entropy": 1.9143129527568816, + "epoch": 3.5152, + "grad_norm": 1.4425708055496216, + "learning_rate": 1.506259849785931e-05, + "loss": 1.4076, + "mean_token_accuracy": 0.6197950705885887, + 
"num_tokens": 8237046.0, + "step": 1100 + }, + { + "entropy": 1.9093267023563385, + "epoch": 3.5472, + "grad_norm": 1.5437992811203003, + "learning_rate": 1.497309157294555e-05, + "loss": 1.4339, + "mean_token_accuracy": 0.6177847877144813, + "num_tokens": 8315350.0, + "step": 1110 + }, + { + "entropy": 1.9121424347162246, + "epoch": 3.5792, + "grad_norm": 1.3761622905731201, + "learning_rate": 1.4883052149037395e-05, + "loss": 1.4175, + "mean_token_accuracy": 0.6171463698148727, + "num_tokens": 8390383.0, + "step": 1120 + }, + { + "entropy": 1.883551675081253, + "epoch": 3.6112, + "grad_norm": 1.36739182472229, + "learning_rate": 1.479248986720057e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6186214044690133, + "num_tokens": 8468414.0, + "step": 1130 + }, + { + "entropy": 1.988349151611328, + "epoch": 3.6432, + "grad_norm": 1.4566738605499268, + "learning_rate": 1.4701414424486353e-05, + "loss": 1.5366, + "mean_token_accuracy": 0.6110676810145378, + "num_tokens": 8541715.0, + "step": 1140 + }, + { + "entropy": 1.9057112097740174, + "epoch": 3.6752000000000002, + "grad_norm": 1.499079704284668, + "learning_rate": 1.4609835572893266e-05, + "loss": 1.3991, + "mean_token_accuracy": 0.6208718970417977, + "num_tokens": 8615694.0, + "step": 1150 + }, + { + "entropy": 1.9219326049089431, + "epoch": 3.7072000000000003, + "grad_norm": 1.3865621089935303, + "learning_rate": 1.4517763118322861e-05, + "loss": 1.431, + "mean_token_accuracy": 0.6143050745129586, + "num_tokens": 8692473.0, + "step": 1160 + }, + { + "entropy": 1.9036399960517882, + "epoch": 3.7392, + "grad_norm": 1.5362603664398193, + "learning_rate": 1.4425206919529747e-05, + "loss": 1.4156, + "mean_token_accuracy": 0.6199175521731377, + "num_tokens": 8767618.0, + "step": 1170 + }, + { + "entropy": 1.9499989479780198, + "epoch": 3.7712, + "grad_norm": 1.663404941558838, + "learning_rate": 1.4332176887065955e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.605186915397644, + "num_tokens": 8843100.0, + 
"step": 1180 + }, + { + "entropy": 1.9545456051826477, + "epoch": 3.8032, + "grad_norm": 1.6169345378875732, + "learning_rate": 1.4238682982219753e-05, + "loss": 1.4241, + "mean_token_accuracy": 0.6206902250647545, + "num_tokens": 8914604.0, + "step": 1190 + }, + { + "entropy": 1.9130536198616028, + "epoch": 3.8352, + "grad_norm": 1.472740650177002, + "learning_rate": 1.4144735215949028e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6139126420021057, + "num_tokens": 8989305.0, + "step": 1200 + }, + { + "entropy": 1.938635140657425, + "epoch": 3.8672, + "grad_norm": 1.4194226264953613, + "learning_rate": 1.4050343647809354e-05, + "loss": 1.4538, + "mean_token_accuracy": 0.6131341770291329, + "num_tokens": 9065589.0, + "step": 1210 + }, + { + "entropy": 1.9123675346374511, + "epoch": 3.8992, + "grad_norm": 1.5208053588867188, + "learning_rate": 1.3955518384876863e-05, + "loss": 1.4309, + "mean_token_accuracy": 0.6139545351266861, + "num_tokens": 9140150.0, + "step": 1220 + }, + { + "entropy": 1.9148090302944183, + "epoch": 3.9312, + "grad_norm": 1.6418218612670898, + "learning_rate": 1.3860269580666004e-05, + "loss": 1.4269, + "mean_token_accuracy": 0.6169310078024864, + "num_tokens": 9215796.0, + "step": 1230 + }, + { + "entropy": 1.9157740741968154, + "epoch": 3.9632, + "grad_norm": 1.4638084173202515, + "learning_rate": 1.3764607434042353e-05, + "loss": 1.4509, + "mean_token_accuracy": 0.6164968460798264, + "num_tokens": 9291010.0, + "step": 1240 + }, + { + "entropy": 1.9184510678052902, + "epoch": 3.9952, + "grad_norm": 1.5152716636657715, + "learning_rate": 1.3668542188130567e-05, + "loss": 1.4453, + "mean_token_accuracy": 0.6112410992383956, + "num_tokens": 9367186.0, + "step": 1250 + }, + { + "entropy": 1.9016748384425515, + "epoch": 4.0256, + "grad_norm": 1.490628719329834, + "learning_rate": 1.3572084129217566e-05, + "loss": 1.382, + "mean_token_accuracy": 0.623968276538347, + "num_tokens": 9439028.0, + "step": 1260 + }, + { + "entropy": 
1.8026290327310561, + "epoch": 4.0576, + "grad_norm": 1.8969308137893677, + "learning_rate": 1.347524358565115e-05, + "loss": 1.3128, + "mean_token_accuracy": 0.6386646836996078, + "num_tokens": 9513855.0, + "step": 1270 + }, + { + "entropy": 1.8283424764871596, + "epoch": 4.0896, + "grad_norm": 1.5952194929122925, + "learning_rate": 1.3378030926734052e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6328515768051147, + "num_tokens": 9589080.0, + "step": 1280 + }, + { + "entropy": 1.8405955344438554, + "epoch": 4.1216, + "grad_norm": 1.6057584285736084, + "learning_rate": 1.3280456561613653e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6261398203670978, + "num_tokens": 9666808.0, + "step": 1290 + }, + { + "entropy": 1.8390818655490875, + "epoch": 4.1536, + "grad_norm": 1.8149824142456055, + "learning_rate": 1.3182530938167409e-05, + "loss": 1.3455, + "mean_token_accuracy": 0.6318597674369812, + "num_tokens": 9740267.0, + "step": 1300 + }, + { + "entropy": 1.8203887075185776, + "epoch": 4.1856, + "grad_norm": 1.6102676391601562, + "learning_rate": 1.3084264541884118e-05, + "loss": 1.3255, + "mean_token_accuracy": 0.6316933467984199, + "num_tokens": 9816400.0, + "step": 1310 + }, + { + "entropy": 1.8592366576194763, + "epoch": 4.2176, + "grad_norm": 1.9501773118972778, + "learning_rate": 1.2985667894741197e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6301594719290733, + "num_tokens": 9889311.0, + "step": 1320 + }, + { + "entropy": 1.8420085966587068, + "epoch": 4.2496, + "grad_norm": 1.6526106595993042, + "learning_rate": 1.2886751554078015e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6302071824669838, + "num_tokens": 9965339.0, + "step": 1330 + }, + { + "entropy": 1.8313881188631058, + "epoch": 4.2816, + "grad_norm": 1.6269904375076294, + "learning_rate": 1.2787526111465453e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6328388035297394, + "num_tokens": 10039668.0, + "step": 1340 + }, + { + "entropy": 1.858151137828827, + "epoch": 4.3136, + 
"grad_norm": 1.9028024673461914, + "learning_rate": 1.2688002191571829e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6276688367128372, + "num_tokens": 10115387.0, + "step": 1350 + }, + { + "entropy": 1.8273844957351684, + "epoch": 4.3456, + "grad_norm": 1.7530555725097656, + "learning_rate": 1.2588190451025209e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.6345869660377502, + "num_tokens": 10191506.0, + "step": 1360 + }, + { + "entropy": 1.8732422679662704, + "epoch": 4.3776, + "grad_norm": 1.7372691631317139, + "learning_rate": 1.248810157727236e-05, + "loss": 1.4132, + "mean_token_accuracy": 0.6252246856689453, + "num_tokens": 10268756.0, + "step": 1370 + }, + { + "entropy": 1.8583054572343827, + "epoch": 4.4096, + "grad_norm": 1.6993470191955566, + "learning_rate": 1.2387746287434385e-05, + "loss": 1.3638, + "mean_token_accuracy": 0.6286717876791954, + "num_tokens": 10341779.0, + "step": 1380 + }, + { + "entropy": 1.8324467271566391, + "epoch": 4.4416, + "grad_norm": 1.7818169593811035, + "learning_rate": 1.2287135327159165e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6361263945698739, + "num_tokens": 10414642.0, + "step": 1390 + }, + { + "entropy": 1.8514392852783204, + "epoch": 4.4736, + "grad_norm": 1.7585517168045044, + "learning_rate": 1.2186279469470757e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.628801380097866, + "num_tokens": 10489517.0, + "step": 1400 + }, + { + "entropy": 1.8218136370182036, + "epoch": 4.5056, + "grad_norm": 1.9843116998672485, + "learning_rate": 1.2085189513615872e-05, + "loss": 1.3628, + "mean_token_accuracy": 0.6295172199606895, + "num_tokens": 10565467.0, + "step": 1410 + }, + { + "entropy": 1.8919565021991729, + "epoch": 4.5376, + "grad_norm": 1.9309132099151611, + "learning_rate": 1.1983876283907522e-05, + "loss": 1.4467, + "mean_token_accuracy": 0.6263746194541454, + "num_tokens": 10641283.0, + "step": 1420 + }, + { + "entropy": 1.8356508910655975, + "epoch": 4.5696, + "grad_norm": 1.7685068845748901, + 
"learning_rate": 1.1882350628566008e-05, + "loss": 1.3631, + "mean_token_accuracy": 0.624418406188488, + "num_tokens": 10716701.0, + "step": 1430 + }, + { + "entropy": 1.8288098931312562, + "epoch": 4.6016, + "grad_norm": 1.8276050090789795, + "learning_rate": 1.178062341855732e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6286922857165337, + "num_tokens": 10791427.0, + "step": 1440 + }, + { + "entropy": 1.8557640790939331, + "epoch": 4.6336, + "grad_norm": 1.7773240804672241, + "learning_rate": 1.1678705546429132e-05, + "loss": 1.383, + "mean_token_accuracy": 0.6216814562678337, + "num_tokens": 10866356.0, + "step": 1450 + }, + { + "entropy": 1.8483826667070389, + "epoch": 4.6655999999999995, + "grad_norm": 1.831931710243225, + "learning_rate": 1.1576607925144456e-05, + "loss": 1.355, + "mean_token_accuracy": 0.6278511002659798, + "num_tokens": 10940772.0, + "step": 1460 + }, + { + "entropy": 1.8824394553899766, + "epoch": 4.6975999999999996, + "grad_norm": 1.9213542938232422, + "learning_rate": 1.1474341486913146e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.6256057649850846, + "num_tokens": 11016144.0, + "step": 1470 + }, + { + "entropy": 1.8709469974040984, + "epoch": 4.7296, + "grad_norm": 1.8768925666809082, + "learning_rate": 1.1371917182021297e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6317574754357338, + "num_tokens": 11089939.0, + "step": 1480 + }, + { + "entropy": 1.8673742085695266, + "epoch": 4.7616, + "grad_norm": 1.796302318572998, + "learning_rate": 1.1269345977658747e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6238353416323662, + "num_tokens": 11166087.0, + "step": 1490 + }, + { + "entropy": 1.8310889720916748, + "epoch": 4.7936, + "grad_norm": 1.8969939947128296, + "learning_rate": 1.1166638856744747e-05, + "loss": 1.3373, + "mean_token_accuracy": 0.6348015949130058, + "num_tokens": 11240732.0, + "step": 1500 + }, + { + "entropy": 1.8809226244688033, + "epoch": 4.8256, + "grad_norm": 1.642104983329773, + "learning_rate": 
1.1063806816751957e-05, + "loss": 1.3792, + "mean_token_accuracy": 0.6250617265701294, + "num_tokens": 11316878.0, + "step": 1510 + }, + { + "entropy": 1.8715822875499726, + "epoch": 4.8576, + "grad_norm": 1.962158441543579, + "learning_rate": 1.0960860868528872e-05, + "loss": 1.3711, + "mean_token_accuracy": 0.6293752744793892, + "num_tokens": 11389042.0, + "step": 1520 + }, + { + "entropy": 1.8657191127538681, + "epoch": 4.8896, + "grad_norm": 1.9577444791793823, + "learning_rate": 1.0857812035120845e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6259156972169876, + "num_tokens": 11464215.0, + "step": 1530 + }, + { + "entropy": 1.8811951220035552, + "epoch": 4.9216, + "grad_norm": 2.015150785446167, + "learning_rate": 1.0754671350589752e-05, + "loss": 1.4155, + "mean_token_accuracy": 0.626779156178236, + "num_tokens": 11539122.0, + "step": 1540 + }, + { + "entropy": 1.863905319571495, + "epoch": 4.9536, + "grad_norm": 1.8474093675613403, + "learning_rate": 1.065144985883253e-05, + "loss": 1.3409, + "mean_token_accuracy": 0.6319419264793396, + "num_tokens": 11613016.0, + "step": 1550 + }, + { + "entropy": 1.836970153450966, + "epoch": 4.9856, + "grad_norm": 1.8822177648544312, + "learning_rate": 1.054815861239864e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6292115703225136, + "num_tokens": 11688143.0, + "step": 1560 + }, + { + "entropy": 1.8377465950815302, + "epoch": 5.016, + "grad_norm": 1.8221346139907837, + "learning_rate": 1.0444808671306588e-05, + "loss": 1.3028, + "mean_token_accuracy": 0.6413120329380035, + "num_tokens": 11758768.0, + "step": 1570 + }, + { + "entropy": 1.7883025139570237, + "epoch": 5.048, + "grad_norm": 2.1959595680236816, + "learning_rate": 1.034141110185968e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6479741290211678, + "num_tokens": 11832210.0, + "step": 1580 + }, + { + "entropy": 1.7955584406852723, + "epoch": 5.08, + "grad_norm": 2.106905698776245, + "learning_rate": 1.0237976975461074e-05, + "loss": 1.2881, + 
"mean_token_accuracy": 0.6424632370471954, + "num_tokens": 11906115.0, + "step": 1590 + }, + { + "entropy": 1.7998322755098344, + "epoch": 5.112, + "grad_norm": 2.327314615249634, + "learning_rate": 1.0134517367428309e-05, + "loss": 1.3324, + "mean_token_accuracy": 0.6460248224437237, + "num_tokens": 11981328.0, + "step": 1600 + }, + { + "entropy": 1.7885828018188477, + "epoch": 5.144, + "grad_norm": 2.1001713275909424, + "learning_rate": 1.0031043355807386e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.63900758177042, + "num_tokens": 12056453.0, + "step": 1610 + }, + { + "entropy": 1.769435602426529, + "epoch": 5.176, + "grad_norm": 2.1210567951202393, + "learning_rate": 9.927566020186592e-06, + "loss": 1.2892, + "mean_token_accuracy": 0.6432970002293587, + "num_tokens": 12133433.0, + "step": 1620 + }, + { + "entropy": 1.7907766073942184, + "epoch": 5.208, + "grad_norm": 2.1842658519744873, + "learning_rate": 9.82409644051013e-06, + "loss": 1.2856, + "mean_token_accuracy": 0.6423615619540215, + "num_tokens": 12207150.0, + "step": 1630 + }, + { + "entropy": 1.7834827870130538, + "epoch": 5.24, + "grad_norm": 2.2503459453582764, + "learning_rate": 9.720645695891733e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6417693704366684, + "num_tokens": 12282584.0, + "step": 1640 + }, + { + "entropy": 1.763256973028183, + "epoch": 5.272, + "grad_norm": 1.9505388736724854, + "learning_rate": 9.617224863428346e-06, + "loss": 1.2951, + "mean_token_accuracy": 0.6429389595985413, + "num_tokens": 12359793.0, + "step": 1650 + }, + { + "entropy": 1.8142763644456863, + "epoch": 5.304, + "grad_norm": 1.9957698583602905, + "learning_rate": 9.513845017014048e-06, + "loss": 1.33, + "mean_token_accuracy": 0.6413653999567032, + "num_tokens": 12434251.0, + "step": 1660 + }, + { + "entropy": 1.797221601009369, + "epoch": 5.336, + "grad_norm": 2.5095462799072266, + "learning_rate": 9.410517226154276e-06, + "loss": 1.2978, + "mean_token_accuracy": 0.6389835774898529, + "num_tokens": 
12508416.0, + "step": 1670 + }, + { + "entropy": 1.8157870292663574, + "epoch": 5.368, + "grad_norm": 2.1890602111816406, + "learning_rate": 9.30725255478058e-06, + "loss": 1.3183, + "mean_token_accuracy": 0.6448161751031876, + "num_tokens": 12582896.0, + "step": 1680 + }, + { + "entropy": 1.7990054041147232, + "epoch": 5.4, + "grad_norm": 2.3904025554656982, + "learning_rate": 9.204062060065915e-06, + "loss": 1.3318, + "mean_token_accuracy": 0.636146479845047, + "num_tokens": 12656802.0, + "step": 1690 + }, + { + "entropy": 1.8003453463315964, + "epoch": 5.432, + "grad_norm": 1.9204304218292236, + "learning_rate": 9.100956791240699e-06, + "loss": 1.3186, + "mean_token_accuracy": 0.6372130662202835, + "num_tokens": 12733283.0, + "step": 1700 + }, + { + "entropy": 1.8101116061210631, + "epoch": 5.464, + "grad_norm": 2.009500026702881, + "learning_rate": 8.997947788409696e-06, + "loss": 1.3247, + "mean_token_accuracy": 0.6406339526176452, + "num_tokens": 12810272.0, + "step": 1710 + }, + { + "entropy": 1.764935952425003, + "epoch": 5.496, + "grad_norm": 2.2038798332214355, + "learning_rate": 8.89504608136989e-06, + "loss": 1.2792, + "mean_token_accuracy": 0.6445836886763573, + "num_tokens": 12885633.0, + "step": 1720 + }, + { + "entropy": 1.7950240582227708, + "epoch": 5.5280000000000005, + "grad_norm": 2.0160531997680664, + "learning_rate": 8.792262688429445e-06, + "loss": 1.2934, + "mean_token_accuracy": 0.6469692558050155, + "num_tokens": 12961131.0, + "step": 1730 + }, + { + "entropy": 1.7804677098989488, + "epoch": 5.5600000000000005, + "grad_norm": 2.1956582069396973, + "learning_rate": 8.689608615227933e-06, + "loss": 1.2969, + "mean_token_accuracy": 0.6438389763236045, + "num_tokens": 13036481.0, + "step": 1740 + }, + { + "entropy": 1.7932062089443206, + "epoch": 5.592, + "grad_norm": 2.2215394973754883, + "learning_rate": 8.587094853557877e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6460438340902328, + "num_tokens": 13111001.0, + "step": 1750 + }, + { 
+ "entropy": 1.8026408910751344, + "epoch": 5.624, + "grad_norm": 2.3881425857543945, + "learning_rate": 8.484732380187785e-06, + "loss": 1.3049, + "mean_token_accuracy": 0.6414234206080437, + "num_tokens": 13186347.0, + "step": 1760 + }, + { + "entropy": 1.8440747499465941, + "epoch": 5.656, + "grad_norm": 2.2154159545898438, + "learning_rate": 8.382532155686825e-06, + "loss": 1.3797, + "mean_token_accuracy": 0.6365857936441899, + "num_tokens": 13261455.0, + "step": 1770 + }, + { + "entropy": 1.7975190997123718, + "epoch": 5.688, + "grad_norm": 2.1991233825683594, + "learning_rate": 8.280505123251183e-06, + "loss": 1.3191, + "mean_token_accuracy": 0.6393151715397835, + "num_tokens": 13338064.0, + "step": 1780 + }, + { + "entropy": 1.8396487146615983, + "epoch": 5.72, + "grad_norm": 2.0190858840942383, + "learning_rate": 8.178662207532343e-06, + "loss": 1.4052, + "mean_token_accuracy": 0.629064130038023, + "num_tokens": 13414806.0, + "step": 1790 + }, + { + "entropy": 1.7840806126594544, + "epoch": 5.752, + "grad_norm": 2.3335204124450684, + "learning_rate": 8.077014313467274e-06, + "loss": 1.2701, + "mean_token_accuracy": 0.6464540064334869, + "num_tokens": 13489075.0, + "step": 1800 + }, + { + "entropy": 1.7840022534132003, + "epoch": 5.784, + "grad_norm": 2.2151618003845215, + "learning_rate": 7.975572325110819e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6358998969197274, + "num_tokens": 13565636.0, + "step": 1810 + }, + { + "entropy": 1.7677135676145554, + "epoch": 5.816, + "grad_norm": 2.11505389213562, + "learning_rate": 7.874347104470234e-06, + "loss": 1.2765, + "mean_token_accuracy": 0.6448719501495361, + "num_tokens": 13641112.0, + "step": 1820 + }, + { + "entropy": 1.7586044907569884, + "epoch": 5.848, + "grad_norm": 2.178250789642334, + "learning_rate": 7.773349490342157e-06, + "loss": 1.2846, + "mean_token_accuracy": 0.6450280979275703, + "num_tokens": 13715158.0, + "step": 1830 + }, + { + "entropy": 1.8128920108079911, + "epoch": 5.88, + 
"grad_norm": 2.2499353885650635, + "learning_rate": 7.672590297152013e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6343795835971833, + "num_tokens": 13791086.0, + "step": 1840 + }, + { + "entropy": 1.7873643577098846, + "epoch": 5.912, + "grad_norm": 2.1989104747772217, + "learning_rate": 7.572080313796064e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6413815975189209, + "num_tokens": 13865700.0, + "step": 1850 + }, + { + "entropy": 1.790488451719284, + "epoch": 5.944, + "grad_norm": 2.2605504989624023, + "learning_rate": 7.471830302486151e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6424889475107193, + "num_tokens": 13938540.0, + "step": 1860 + }, + { + "entropy": 1.7985246628522873, + "epoch": 5.976, + "grad_norm": 2.3228533267974854, + "learning_rate": 7.371850997597355e-06, + "loss": 1.2854, + "mean_token_accuracy": 0.6457341402769089, + "num_tokens": 14011087.0, + "step": 1870 + }, + { + "entropy": 1.7890221727521796, + "epoch": 6.0064, + "grad_norm": 2.192910671234131, + "learning_rate": 7.272153104518567e-06, + "loss": 1.2796, + "mean_token_accuracy": 0.6482133128141102, + "num_tokens": 14082075.0, + "step": 1880 + }, + { + "entropy": 1.7633086562156677, + "epoch": 6.0384, + "grad_norm": 2.368185043334961, + "learning_rate": 7.172747298506224e-06, + "loss": 1.2703, + "mean_token_accuracy": 0.6497290328145027, + "num_tokens": 14156298.0, + "step": 1890 + }, + { + "entropy": 1.750128635764122, + "epoch": 6.0704, + "grad_norm": 2.36487078666687, + "learning_rate": 7.073644223541227e-06, + "loss": 1.2521, + "mean_token_accuracy": 0.6534707516431808, + "num_tokens": 14232528.0, + "step": 1900 + }, + { + "entropy": 1.7416553497314453, + "epoch": 6.1024, + "grad_norm": 2.3927595615386963, + "learning_rate": 6.974854491189243e-06, + "loss": 1.217, + "mean_token_accuracy": 0.6588135868310928, + "num_tokens": 14307073.0, + "step": 1910 + }, + { + "entropy": 1.7359241485595702, + "epoch": 6.1344, + "grad_norm": 2.1107988357543945, + "learning_rate": 
6.876388679464437e-06, + "loss": 1.2763, + "mean_token_accuracy": 0.6550255373120308, + "num_tokens": 14383819.0, + "step": 1920 + }, + { + "entropy": 1.7380403220653533, + "epoch": 6.1664, + "grad_norm": 2.4158387184143066, + "learning_rate": 6.7782573316968424e-06, + "loss": 1.2359, + "mean_token_accuracy": 0.656632873415947, + "num_tokens": 14460092.0, + "step": 1930 + }, + { + "entropy": 1.7227638810873032, + "epoch": 6.1984, + "grad_norm": 2.3467485904693604, + "learning_rate": 6.6804709554034075e-06, + "loss": 1.2311, + "mean_token_accuracy": 0.654091839492321, + "num_tokens": 14534160.0, + "step": 1940 + }, + { + "entropy": 1.7244948148727417, + "epoch": 6.2304, + "grad_norm": 2.760057210922241, + "learning_rate": 6.583040021162905e-06, + "loss": 1.2189, + "mean_token_accuracy": 0.6611428812146187, + "num_tokens": 14608592.0, + "step": 1950 + }, + { + "entropy": 1.7471657902002335, + "epoch": 6.2624, + "grad_norm": 2.3923745155334473, + "learning_rate": 6.485974961494772e-06, + "loss": 1.2631, + "mean_token_accuracy": 0.6524021357297898, + "num_tokens": 14683538.0, + "step": 1960 + }, + { + "entropy": 1.7506494253873826, + "epoch": 6.2943999999999996, + "grad_norm": 2.4149715900421143, + "learning_rate": 6.389286169742048e-06, + "loss": 1.2468, + "mean_token_accuracy": 0.6567713841795921, + "num_tokens": 14755778.0, + "step": 1970 + }, + { + "entropy": 1.7104488879442215, + "epoch": 6.3264, + "grad_norm": 2.632632255554199, + "learning_rate": 6.292983998958478e-06, + "loss": 1.2267, + "mean_token_accuracy": 0.6561126798391342, + "num_tokens": 14831036.0, + "step": 1980 + }, + { + "entropy": 1.7591658294200898, + "epoch": 6.3584, + "grad_norm": 2.4012722969055176, + "learning_rate": 6.1970787607999815e-06, + "loss": 1.2693, + "mean_token_accuracy": 0.6490694522857666, + "num_tokens": 14906610.0, + "step": 1990 + }, + { + "entropy": 1.7317969173192977, + "epoch": 6.3904, + "grad_norm": 2.8288748264312744, + "learning_rate": 6.101580724420478e-06, + "loss": 
1.235, + "mean_token_accuracy": 0.6564200609922409, + "num_tokens": 14980134.0, + "step": 2000 + }, + { + "entropy": 1.7617577254772185, + "epoch": 6.4224, + "grad_norm": 2.4008944034576416, + "learning_rate": 6.00650011537235e-06, + "loss": 1.2773, + "mean_token_accuracy": 0.6494350075721741, + "num_tokens": 15054969.0, + "step": 2010 + }, + { + "entropy": 1.749829688668251, + "epoch": 6.4544, + "grad_norm": 2.5665228366851807, + "learning_rate": 5.911847114511497e-06, + "loss": 1.2512, + "mean_token_accuracy": 0.6512764275074006, + "num_tokens": 15129421.0, + "step": 2020 + }, + { + "entropy": 1.7387797951698303, + "epoch": 6.4864, + "grad_norm": 2.6020922660827637, + "learning_rate": 5.817631856907233e-06, + "loss": 1.2477, + "mean_token_accuracy": 0.6530226185917855, + "num_tokens": 15203465.0, + "step": 2030 + }, + { + "entropy": 1.7363551884889603, + "epoch": 6.5184, + "grad_norm": 2.161478281021118, + "learning_rate": 5.723864430757047e-06, + "loss": 1.2692, + "mean_token_accuracy": 0.6527093783020973, + "num_tokens": 15279761.0, + "step": 2040 + }, + { + "entropy": 1.7563295543193818, + "epoch": 6.5504, + "grad_norm": 2.5587289333343506, + "learning_rate": 5.630554876306407e-06, + "loss": 1.2211, + "mean_token_accuracy": 0.6574550330638885, + "num_tokens": 15351301.0, + "step": 2050 + }, + { + "entropy": 1.7521151036024094, + "epoch": 6.5824, + "grad_norm": 2.4042234420776367, + "learning_rate": 5.537713184773686e-06, + "loss": 1.271, + "mean_token_accuracy": 0.6478641331195831, + "num_tokens": 15427936.0, + "step": 2060 + }, + { + "entropy": 1.7270145863294601, + "epoch": 6.6144, + "grad_norm": 2.3122522830963135, + "learning_rate": 5.44534929728036e-06, + "loss": 1.224, + "mean_token_accuracy": 0.6566437393426895, + "num_tokens": 15502561.0, + "step": 2070 + }, + { + "entropy": 1.7568089962005615, + "epoch": 6.6464, + "grad_norm": 2.461474895477295, + "learning_rate": 5.353473103786511e-06, + "loss": 1.2864, + "mean_token_accuracy": 0.6471308276057244, + 
"num_tokens": 15578053.0, + "step": 2080 + }, + { + "entropy": 1.7190027862787247, + "epoch": 6.6784, + "grad_norm": 2.4412550926208496, + "learning_rate": 5.262094442031901e-06, + "loss": 1.2092, + "mean_token_accuracy": 0.6601713746786118, + "num_tokens": 15653342.0, + "step": 2090 + }, + { + "entropy": 1.717634916305542, + "epoch": 6.7104, + "grad_norm": 2.276007890701294, + "learning_rate": 5.171223096482533e-06, + "loss": 1.2271, + "mean_token_accuracy": 0.6595920532941818, + "num_tokens": 15730387.0, + "step": 2100 + }, + { + "entropy": 1.7230647921562194, + "epoch": 6.7424, + "grad_norm": 2.480471134185791, + "learning_rate": 5.080868797283019e-06, + "loss": 1.229, + "mean_token_accuracy": 0.6568982020020485, + "num_tokens": 15804405.0, + "step": 2110 + }, + { + "entropy": 1.7535502433776855, + "epoch": 6.7744, + "grad_norm": 2.448997974395752, + "learning_rate": 4.9910412192146795e-06, + "loss": 1.2584, + "mean_token_accuracy": 0.648795773088932, + "num_tokens": 15878537.0, + "step": 2120 + }, + { + "entropy": 1.786664029955864, + "epoch": 6.8064, + "grad_norm": 2.430039405822754, + "learning_rate": 4.901749980659617e-06, + "loss": 1.3358, + "mean_token_accuracy": 0.6427689291536808, + "num_tokens": 15952964.0, + "step": 2130 + }, + { + "entropy": 1.7594995677471161, + "epoch": 6.8384, + "grad_norm": 2.469172716140747, + "learning_rate": 4.813004642570822e-06, + "loss": 1.2844, + "mean_token_accuracy": 0.6534359715878963, + "num_tokens": 16028086.0, + "step": 2140 + }, + { + "entropy": 1.7347292125225067, + "epoch": 6.8704, + "grad_norm": 2.6162445545196533, + "learning_rate": 4.724814707448418e-06, + "loss": 1.2707, + "mean_token_accuracy": 0.6447671175003051, + "num_tokens": 16103263.0, + "step": 2150 + }, + { + "entropy": 1.7325938045978546, + "epoch": 6.9024, + "grad_norm": 2.416431188583374, + "learning_rate": 4.637189618322173e-06, + "loss": 1.2794, + "mean_token_accuracy": 0.6439008563756943, + "num_tokens": 16182360.0, + "step": 2160 + }, + { + 
"entropy": 1.7763712674379348, + "epoch": 6.9344, + "grad_norm": 2.3447437286376953, + "learning_rate": 4.550138757740381e-06, + "loss": 1.3043, + "mean_token_accuracy": 0.650251479446888, + "num_tokens": 16256272.0, + "step": 2170 + }, + { + "entropy": 1.739478302001953, + "epoch": 6.9664, + "grad_norm": 2.650451183319092, + "learning_rate": 4.463671446765206e-06, + "loss": 1.259, + "mean_token_accuracy": 0.6514677822589874, + "num_tokens": 16330984.0, + "step": 2180 + }, + { + "entropy": 1.7292406976222991, + "epoch": 6.9984, + "grad_norm": 2.5442306995391846, + "learning_rate": 4.377796943974641e-06, + "loss": 1.2554, + "mean_token_accuracy": 0.6506337657570839, + "num_tokens": 16406982.0, + "step": 2190 + } + ], + "logging_steps": 10, + "max_steps": 3130, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5937216534515548e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2191/training_args.bin b/checkpoint-2191/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ca9563f4064abfd2f36668559093a5f0763d7c85 --- /dev/null +++ b/checkpoint-2191/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88fbd415d023bc35b9e36c515e374642c961f2c3a428f0d0bba13a27d8c151a7 +size 6289 diff --git a/checkpoint-2504/README.md b/checkpoint-2504/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96b9f5618833a1728fbecbefb87f08b279b6b2ed --- /dev/null +++ b/checkpoint-2504/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct +- lora 
+- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/checkpoint-2504/adapter_config.json b/checkpoint-2504/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..8cd4cdaf30ceb587991efae70006ef463605c378 --- /dev/null +++ b/checkpoint-2504/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "o_proj", + "k_proj", + "gate_proj", + "v_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2504/adapter_model.safetensors b/checkpoint-2504/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b2ed67e8627f503c815948705803b7764ff9be24 --- /dev/null +++ b/checkpoint-2504/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f02fd010544b8d6d118c08e1068045561e1bff8b4931a3e2ca34c8898d0046 +size 335604696 diff --git a/checkpoint-2504/chat_template.jinja b/checkpoint-2504/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-2504/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true 
%} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} 
+ {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-2504/optimizer.pt b/checkpoint-2504/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e52b9d1967a2528cc566964c02cb814eb271faf --- /dev/null +++ b/checkpoint-2504/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2057571ee63912faf553fddbde34fa405fca6408269fd2d10bec9f7c1fa12d9 +size 671473443 diff --git a/checkpoint-2504/rng_state.pth b/checkpoint-2504/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..53d611e1b0d6ad32d804e9ef94e4c6ba692dad47 --- /dev/null +++ b/checkpoint-2504/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f19b56fbf124f4086c6c42eda10544f7171a3f7ed57ebcc06da305b5308c310 +size 14645 diff --git a/checkpoint-2504/scheduler.pt b/checkpoint-2504/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7d04dd16acee088ca8b12342697cfccebb26feb --- /dev/null +++ b/checkpoint-2504/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1babff2af9cdaf77d30c7b8148ba50e8efe7e269d4ac4f53d5fd679540c2ae8 +size 1465 diff --git 
a/checkpoint-2504/special_tokens_map.json b/checkpoint-2504/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..02ee80b6196926a5ad790a004d9efd6ab1ba6542 --- /dev/null +++ b/checkpoint-2504/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2504/tokenizer.json b/checkpoint-2504/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-2504/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-2504/tokenizer_config.json b/checkpoint-2504/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8b0c7c141373ca36e5e819a28f60e146ccef652f --- /dev/null +++ b/checkpoint-2504/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": 
"<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": 
"<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": 
"<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-2504/trainer_state.json b/checkpoint-2504/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..838718c45759b8d4be731e36b00c557af763c549 --- /dev/null +++ b/checkpoint-2504/trainer_state.json @@ -0,0 +1,2534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 2504, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.098961293697357, + "epoch": 0.032, + "grad_norm": 1.891703486442566, + "learning_rate": 1.9148936170212767e-06, + "loss": 2.0828, + "mean_token_accuracy": 0.530680388212204, + "num_tokens": 72723.0, + "step": 10 + }, + { + "entropy": 2.119775766134262, + "epoch": 0.064, + "grad_norm": 1.2044862508773804, + "learning_rate": 4.042553191489362e-06, + "loss": 2.0093, + "mean_token_accuracy": 0.5355814293026924, + "num_tokens": 146392.0, + "step": 20 + }, + { + "entropy": 2.220579963922501, + "epoch": 0.096, + "grad_norm": 0.9982365369796753, + "learning_rate": 6.170212765957447e-06, + "loss": 1.8939, + "mean_token_accuracy": 0.5451944440603256, + "num_tokens": 223711.0, + "step": 30 + }, + { + "entropy": 
2.382017892599106, + "epoch": 0.128, + "grad_norm": 0.7386544346809387, + "learning_rate": 8.297872340425532e-06, + "loss": 1.9066, + "mean_token_accuracy": 0.5411656655371189, + "num_tokens": 300889.0, + "step": 40 + }, + { + "entropy": 2.274736815690994, + "epoch": 0.16, + "grad_norm": 0.6412256956100464, + "learning_rate": 1.0425531914893619e-05, + "loss": 1.7387, + "mean_token_accuracy": 0.5679451540112496, + "num_tokens": 377362.0, + "step": 50 + }, + { + "entropy": 2.3663365960121157, + "epoch": 0.192, + "grad_norm": 0.6228290796279907, + "learning_rate": 1.2553191489361702e-05, + "loss": 1.7492, + "mean_token_accuracy": 0.5746532663702965, + "num_tokens": 449594.0, + "step": 60 + }, + { + "entropy": 2.315044218301773, + "epoch": 0.224, + "grad_norm": 0.6034156680107117, + "learning_rate": 1.4680851063829789e-05, + "loss": 1.7111, + "mean_token_accuracy": 0.5675176709890366, + "num_tokens": 523439.0, + "step": 70 + }, + { + "entropy": 2.288265961408615, + "epoch": 0.256, + "grad_norm": 0.45914268493652344, + "learning_rate": 1.6808510638297873e-05, + "loss": 1.6931, + "mean_token_accuracy": 0.5713589735329151, + "num_tokens": 599650.0, + "step": 80 + }, + { + "entropy": 2.2693382859230042, + "epoch": 0.288, + "grad_norm": 0.6197793483734131, + "learning_rate": 1.893617021276596e-05, + "loss": 1.6542, + "mean_token_accuracy": 0.578165066242218, + "num_tokens": 675377.0, + "step": 90 + }, + { + "entropy": 2.293796479701996, + "epoch": 0.32, + "grad_norm": 0.5502006411552429, + "learning_rate": 1.9999866154043656e-05, + "loss": 1.7108, + "mean_token_accuracy": 0.5681634023785591, + "num_tokens": 751838.0, + "step": 100 + }, + { + "entropy": 2.2658903509378434, + "epoch": 0.352, + "grad_norm": 0.5713317394256592, + "learning_rate": 1.9998795407890486e-05, + "loss": 1.6168, + "mean_token_accuracy": 0.5843982398509979, + "num_tokens": 825539.0, + "step": 110 + }, + { + "entropy": 2.270280033349991, + "epoch": 0.384, + "grad_norm": 0.5967482924461365, + 
"learning_rate": 1.999665403023542e-05, + "loss": 1.6194, + "mean_token_accuracy": 0.5839526921510696, + "num_tokens": 897258.0, + "step": 120 + }, + { + "entropy": 2.2349284648895265, + "epoch": 0.416, + "grad_norm": 0.4899630844593048, + "learning_rate": 1.9993442250368708e-05, + "loss": 1.6313, + "mean_token_accuracy": 0.5815729826688767, + "num_tokens": 973142.0, + "step": 130 + }, + { + "entropy": 2.245553806424141, + "epoch": 0.448, + "grad_norm": 0.6546034812927246, + "learning_rate": 1.9989160412195047e-05, + "loss": 1.6395, + "mean_token_accuracy": 0.5780692532658577, + "num_tokens": 1046762.0, + "step": 140 + }, + { + "entropy": 2.288555932044983, + "epoch": 0.48, + "grad_norm": 0.5528404116630554, + "learning_rate": 1.9983808974196752e-05, + "loss": 1.7118, + "mean_token_accuracy": 0.5686657652258873, + "num_tokens": 1125167.0, + "step": 150 + }, + { + "entropy": 2.232080355286598, + "epoch": 0.512, + "grad_norm": 0.5887461304664612, + "learning_rate": 1.9977388509384656e-05, + "loss": 1.6339, + "mean_token_accuracy": 0.5838325396180153, + "num_tokens": 1199589.0, + "step": 160 + }, + { + "entropy": 2.2232475757598875, + "epoch": 0.544, + "grad_norm": 0.5764511823654175, + "learning_rate": 1.9969899705236763e-05, + "loss": 1.6173, + "mean_token_accuracy": 0.5848860442638397, + "num_tokens": 1276431.0, + "step": 170 + }, + { + "entropy": 2.244092071056366, + "epoch": 0.576, + "grad_norm": 0.6295827627182007, + "learning_rate": 1.9961343363624626e-05, + "loss": 1.6017, + "mean_token_accuracy": 0.5818701103329659, + "num_tokens": 1350012.0, + "step": 180 + }, + { + "entropy": 2.237305074930191, + "epoch": 0.608, + "grad_norm": 0.5939638018608093, + "learning_rate": 1.9951720400727495e-05, + "loss": 1.6704, + "mean_token_accuracy": 0.5779796853661537, + "num_tokens": 1423391.0, + "step": 190 + }, + { + "entropy": 2.211505854129791, + "epoch": 0.64, + "grad_norm": 0.6119778156280518, + "learning_rate": 1.9941031846934213e-05, + "loss": 1.6223, + 
"mean_token_accuracy": 0.5848233133554459, + "num_tokens": 1499124.0, + "step": 200 + }, + { + "entropy": 2.2195493161678312, + "epoch": 0.672, + "grad_norm": 0.6129831671714783, + "learning_rate": 1.9929278846732883e-05, + "loss": 1.5886, + "mean_token_accuracy": 0.5897421136498451, + "num_tokens": 1573541.0, + "step": 210 + }, + { + "entropy": 2.2096123576164244, + "epoch": 0.704, + "grad_norm": 0.6091306209564209, + "learning_rate": 1.9916462658588328e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5894487425684929, + "num_tokens": 1649546.0, + "step": 220 + }, + { + "entropy": 2.249841979146004, + "epoch": 0.736, + "grad_norm": 0.570816695690155, + "learning_rate": 1.9902584654807325e-05, + "loss": 1.5876, + "mean_token_accuracy": 0.5911228567361831, + "num_tokens": 1722199.0, + "step": 230 + }, + { + "entropy": 2.1915894985198974, + "epoch": 0.768, + "grad_norm": 0.5748864412307739, + "learning_rate": 1.988764632139168e-05, + "loss": 1.5963, + "mean_token_accuracy": 0.5891387596726417, + "num_tokens": 1797304.0, + "step": 240 + }, + { + "entropy": 2.2358563423156737, + "epoch": 0.8, + "grad_norm": 0.6511492729187012, + "learning_rate": 1.9871649257879115e-05, + "loss": 1.6453, + "mean_token_accuracy": 0.5792816638946533, + "num_tokens": 1870113.0, + "step": 250 + }, + { + "entropy": 2.2169984579086304, + "epoch": 0.832, + "grad_norm": 0.5317641496658325, + "learning_rate": 1.9854595177171968e-05, + "loss": 1.6594, + "mean_token_accuracy": 0.577045065164566, + "num_tokens": 1947405.0, + "step": 260 + }, + { + "entropy": 2.2434292674064635, + "epoch": 0.864, + "grad_norm": 0.5399971604347229, + "learning_rate": 1.9836485905353823e-05, + "loss": 1.7158, + "mean_token_accuracy": 0.5683416239917278, + "num_tokens": 2026284.0, + "step": 270 + }, + { + "entropy": 2.227828550338745, + "epoch": 0.896, + "grad_norm": 0.5378643870353699, + "learning_rate": 1.9817323381493933e-05, + "loss": 1.6714, + "mean_token_accuracy": 0.5818367518484593, + "num_tokens": 2103986.0, 
+ "step": 280 + }, + { + "entropy": 2.2110894501209257, + "epoch": 0.928, + "grad_norm": 0.5195969343185425, + "learning_rate": 1.979710965743964e-05, + "loss": 1.6239, + "mean_token_accuracy": 0.5819958478212357, + "num_tokens": 2177010.0, + "step": 290 + }, + { + "entropy": 2.1666628658771514, + "epoch": 0.96, + "grad_norm": 0.5663164258003235, + "learning_rate": 1.977584689759664e-05, + "loss": 1.6024, + "mean_token_accuracy": 0.5876665830612182, + "num_tokens": 2251285.0, + "step": 300 + }, + { + "entropy": 2.214203083515167, + "epoch": 0.992, + "grad_norm": 0.6764860153198242, + "learning_rate": 1.9753537378697237e-05, + "loss": 1.6446, + "mean_token_accuracy": 0.5818003416061401, + "num_tokens": 2325752.0, + "step": 310 + }, + { + "entropy": 2.16783396821273, + "epoch": 1.0224, + "grad_norm": 0.5795008540153503, + "learning_rate": 1.9730183489556563e-05, + "loss": 1.594, + "mean_token_accuracy": 0.5867547392845154, + "num_tokens": 2396254.0, + "step": 320 + }, + { + "entropy": 2.172953352332115, + "epoch": 1.0544, + "grad_norm": 0.6686444282531738, + "learning_rate": 1.9705787730816776e-05, + "loss": 1.613, + "mean_token_accuracy": 0.5906373374164104, + "num_tokens": 2470123.0, + "step": 330 + }, + { + "entropy": 2.2217346757650374, + "epoch": 1.0864, + "grad_norm": 0.6389091610908508, + "learning_rate": 1.9680352714679324e-05, + "loss": 1.7053, + "mean_token_accuracy": 0.577599074691534, + "num_tokens": 2545749.0, + "step": 340 + }, + { + "entropy": 2.138428696990013, + "epoch": 1.1184, + "grad_norm": 0.7369883060455322, + "learning_rate": 1.9653881164625234e-05, + "loss": 1.5599, + "mean_token_accuracy": 0.5946489304304123, + "num_tokens": 2623270.0, + "step": 350 + }, + { + "entropy": 2.147254040837288, + "epoch": 1.1504, + "grad_norm": 0.6707085967063904, + "learning_rate": 1.9626375915123473e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5965728983283043, + "num_tokens": 2697616.0, + "step": 360 + }, + { + "entropy": 2.1412769109010696, + "epoch": 
1.1824, + "grad_norm": 0.7201400995254517, + "learning_rate": 1.9597839911327475e-05, + "loss": 1.58, + "mean_token_accuracy": 0.5957784004509449, + "num_tokens": 2771426.0, + "step": 370 + }, + { + "entropy": 2.164059528708458, + "epoch": 1.2144, + "grad_norm": 0.7561144232749939, + "learning_rate": 1.9568276208759772e-05, + "loss": 1.5872, + "mean_token_accuracy": 0.5874110117554665, + "num_tokens": 2846711.0, + "step": 380 + }, + { + "entropy": 2.205427420139313, + "epoch": 1.2464, + "grad_norm": 0.691585898399353, + "learning_rate": 1.9537687972984804e-05, + "loss": 1.625, + "mean_token_accuracy": 0.5892911069095135, + "num_tokens": 2920916.0, + "step": 390 + }, + { + "entropy": 2.1242104679346085, + "epoch": 1.2784, + "grad_norm": 0.6999676823616028, + "learning_rate": 1.950607847926999e-05, + "loss": 1.5606, + "mean_token_accuracy": 0.5917269751429558, + "num_tokens": 2996056.0, + "step": 400 + }, + { + "entropy": 2.114223065972328, + "epoch": 1.3104, + "grad_norm": 0.7616406679153442, + "learning_rate": 1.947345111223502e-05, + "loss": 1.5296, + "mean_token_accuracy": 0.5938275754451752, + "num_tokens": 3072912.0, + "step": 410 + }, + { + "entropy": 2.1418962299823763, + "epoch": 1.3424, + "grad_norm": 0.7253025770187378, + "learning_rate": 1.943980936548942e-05, + "loss": 1.575, + "mean_token_accuracy": 0.5945621818304062, + "num_tokens": 3148498.0, + "step": 420 + }, + { + "entropy": 2.109667718410492, + "epoch": 1.3744, + "grad_norm": 0.8988682627677917, + "learning_rate": 1.9405156841258498e-05, + "loss": 1.5796, + "mean_token_accuracy": 0.5901263400912284, + "num_tokens": 3224741.0, + "step": 430 + }, + { + "entropy": 2.179358023405075, + "epoch": 1.4064, + "grad_norm": 0.741558849811554, + "learning_rate": 1.936949724999762e-05, + "loss": 1.6507, + "mean_token_accuracy": 0.581992793083191, + "num_tokens": 3299366.0, + "step": 440 + }, + { + "entropy": 2.1574251472949983, + "epoch": 1.4384000000000001, + "grad_norm": 0.7538727521896362, + 
"learning_rate": 1.9332834409994906e-05, + "loss": 1.5771, + "mean_token_accuracy": 0.5888051658868789, + "num_tokens": 3374162.0, + "step": 450 + }, + { + "entropy": 2.1186763852834702, + "epoch": 1.4704, + "grad_norm": 0.7905173301696777, + "learning_rate": 1.929517224696239e-05, + "loss": 1.6138, + "mean_token_accuracy": 0.584889967739582, + "num_tokens": 3452582.0, + "step": 460 + }, + { + "entropy": 2.1135365635156633, + "epoch": 1.5024, + "grad_norm": 0.7416484951972961, + "learning_rate": 1.9256514793615674e-05, + "loss": 1.5623, + "mean_token_accuracy": 0.5928735345602035, + "num_tokens": 3527694.0, + "step": 470 + }, + { + "entropy": 2.146635016798973, + "epoch": 1.5344, + "grad_norm": 0.731999397277832, + "learning_rate": 1.9216866189242095e-05, + "loss": 1.5634, + "mean_token_accuracy": 0.5988615363836288, + "num_tokens": 3600277.0, + "step": 480 + }, + { + "entropy": 2.1472962319850923, + "epoch": 1.5664, + "grad_norm": 0.7493702173233032, + "learning_rate": 1.9176230679257547e-05, + "loss": 1.5891, + "mean_token_accuracy": 0.5858126983046532, + "num_tokens": 3674781.0, + "step": 490 + }, + { + "entropy": 2.1530486762523653, + "epoch": 1.5984, + "grad_norm": 0.8006687164306641, + "learning_rate": 1.9134612614751865e-05, + "loss": 1.5674, + "mean_token_accuracy": 0.5904534175992012, + "num_tokens": 3748434.0, + "step": 500 + }, + { + "entropy": 2.169738906621933, + "epoch": 1.6303999999999998, + "grad_norm": 0.9293455481529236, + "learning_rate": 1.909201645202294e-05, + "loss": 1.6104, + "mean_token_accuracy": 0.5860036969184875, + "num_tokens": 3823982.0, + "step": 510 + }, + { + "entropy": 2.178475347161293, + "epoch": 1.6623999999999999, + "grad_norm": 0.7716575860977173, + "learning_rate": 1.904844675209956e-05, + "loss": 1.6432, + "mean_token_accuracy": 0.5838924221694469, + "num_tokens": 3900064.0, + "step": 520 + }, + { + "entropy": 2.1585603266954423, + "epoch": 1.6944, + "grad_norm": 0.8225084543228149, + "learning_rate": 
1.9003908180253027e-05, + "loss": 1.5957, + "mean_token_accuracy": 0.5880116850137711, + "num_tokens": 3974029.0, + "step": 530 + }, + { + "entropy": 2.111869788169861, + "epoch": 1.7264, + "grad_norm": 0.7035638093948364, + "learning_rate": 1.8958405505497613e-05, + "loss": 1.579, + "mean_token_accuracy": 0.5890362292528153, + "num_tokens": 4049974.0, + "step": 540 + }, + { + "entropy": 2.144411253929138, + "epoch": 1.7584, + "grad_norm": 0.7046850919723511, + "learning_rate": 1.8911943600079934e-05, + "loss": 1.5926, + "mean_token_accuracy": 0.5874261602759361, + "num_tokens": 4125206.0, + "step": 550 + }, + { + "entropy": 2.1093025386333464, + "epoch": 1.7904, + "grad_norm": 0.807727575302124, + "learning_rate": 1.8864527438957223e-05, + "loss": 1.5367, + "mean_token_accuracy": 0.5988967984914779, + "num_tokens": 4199365.0, + "step": 560 + }, + { + "entropy": 2.097169244289398, + "epoch": 1.8224, + "grad_norm": 0.7856780886650085, + "learning_rate": 1.881616209926465e-05, + "loss": 1.561, + "mean_token_accuracy": 0.5948230788111687, + "num_tokens": 4275889.0, + "step": 570 + }, + { + "entropy": 2.088553088903427, + "epoch": 1.8544, + "grad_norm": 0.8993458151817322, + "learning_rate": 1.876685275977167e-05, + "loss": 1.5557, + "mean_token_accuracy": 0.5941933646798134, + "num_tokens": 4350502.0, + "step": 580 + }, + { + "entropy": 2.132419008016586, + "epoch": 1.8864, + "grad_norm": 0.7769711017608643, + "learning_rate": 1.8716604700327516e-05, + "loss": 1.6105, + "mean_token_accuracy": 0.5815305605530738, + "num_tokens": 4426429.0, + "step": 590 + }, + { + "entropy": 2.1076891005039213, + "epoch": 1.9184, + "grad_norm": 0.9261249899864197, + "learning_rate": 1.866542330129583e-05, + "loss": 1.5307, + "mean_token_accuracy": 0.5964644759893417, + "num_tokens": 4500147.0, + "step": 600 + }, + { + "entropy": 2.114642283320427, + "epoch": 1.9504000000000001, + "grad_norm": 0.806425929069519, + "learning_rate": 1.8613314042978576e-05, + "loss": 1.5809, + 
"mean_token_accuracy": 0.5901800125837326, + "num_tokens": 4573438.0, + "step": 610 + }, + { + "entropy": 2.1167576968669892, + "epoch": 1.9824000000000002, + "grad_norm": 0.8191499710083008, + "learning_rate": 1.856028250502923e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5843381330370903, + "num_tokens": 4648156.0, + "step": 620 + }, + { + "entropy": 2.0566249019221257, + "epoch": 2.0128, + "grad_norm": 0.7406135201454163, + "learning_rate": 1.8506334365855315e-05, + "loss": 1.5187, + "mean_token_accuracy": 0.6027438483740154, + "num_tokens": 4719492.0, + "step": 630 + }, + { + "entropy": 2.0126763731241226, + "epoch": 2.0448, + "grad_norm": 0.8845784068107605, + "learning_rate": 1.8451475402010405e-05, + "loss": 1.4841, + "mean_token_accuracy": 0.6069207280874253, + "num_tokens": 4796271.0, + "step": 640 + }, + { + "entropy": 2.0516900300979612, + "epoch": 2.0768, + "grad_norm": 0.9927017092704773, + "learning_rate": 1.8395711487575564e-05, + "loss": 1.512, + "mean_token_accuracy": 0.6031922519207, + "num_tokens": 4870202.0, + "step": 650 + }, + { + "entropy": 2.0824343889951704, + "epoch": 2.1088, + "grad_norm": 0.927236795425415, + "learning_rate": 1.8339048593530406e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5952437989413738, + "num_tokens": 4945568.0, + "step": 660 + }, + { + "entropy": 2.0304481953382494, + "epoch": 2.1408, + "grad_norm": 0.874019205570221, + "learning_rate": 1.8281492787113707e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.5992600306868553, + "num_tokens": 5020723.0, + "step": 670 + }, + { + "entropy": 2.0402441143989565, + "epoch": 2.1728, + "grad_norm": 0.8746942281723022, + "learning_rate": 1.8223050231173802e-05, + "loss": 1.5119, + "mean_token_accuracy": 0.5994458049535751, + "num_tokens": 5095780.0, + "step": 680 + }, + { + "entropy": 2.018441066145897, + "epoch": 2.2048, + "grad_norm": 1.063180923461914, + "learning_rate": 1.816372718350864e-05, + "loss": 1.4923, + "mean_token_accuracy": 0.6064845189452172, + 
"num_tokens": 5169733.0, + "step": 690 + }, + { + "entropy": 2.0563316702842713, + "epoch": 2.2368, + "grad_norm": 1.0281789302825928, + "learning_rate": 1.810352999619574e-05, + "loss": 1.5505, + "mean_token_accuracy": 0.602813882380724, + "num_tokens": 5246393.0, + "step": 700 + }, + { + "entropy": 2.0298285841941834, + "epoch": 2.2688, + "grad_norm": 1.070520281791687, + "learning_rate": 1.804246511491206e-05, + "loss": 1.5159, + "mean_token_accuracy": 0.6006126523017883, + "num_tokens": 5322244.0, + "step": 710 + }, + { + "entropy": 2.0195819228887557, + "epoch": 2.3008, + "grad_norm": 0.9672983884811401, + "learning_rate": 1.7980539078243783e-05, + "loss": 1.5166, + "mean_token_accuracy": 0.6054230839014053, + "num_tokens": 5399317.0, + "step": 720 + }, + { + "entropy": 2.045917159318924, + "epoch": 2.3327999999999998, + "grad_norm": 1.1228744983673096, + "learning_rate": 1.791775851698622e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.6015639662742615, + "num_tokens": 5473195.0, + "step": 730 + }, + { + "entropy": 2.0935415983200074, + "epoch": 2.3648, + "grad_norm": 1.149794578552246, + "learning_rate": 1.7854130153433785e-05, + "loss": 1.5583, + "mean_token_accuracy": 0.5921522840857506, + "num_tokens": 5548357.0, + "step": 740 + }, + { + "entropy": 2.044076007604599, + "epoch": 2.3968, + "grad_norm": 1.063625693321228, + "learning_rate": 1.7789660800660222e-05, + "loss": 1.5013, + "mean_token_accuracy": 0.5974589124321937, + "num_tokens": 5620915.0, + "step": 750 + }, + { + "entropy": 2.092478734254837, + "epoch": 2.4288, + "grad_norm": 1.1822012662887573, + "learning_rate": 1.7724357361789075e-05, + "loss": 1.5552, + "mean_token_accuracy": 0.5929681301116944, + "num_tokens": 5693406.0, + "step": 760 + }, + { + "entropy": 2.0430804908275606, + "epoch": 2.4608, + "grad_norm": 0.9921984076499939, + "learning_rate": 1.765822682925453e-05, + "loss": 1.4944, + "mean_token_accuracy": 0.6029774472117424, + "num_tokens": 5770143.0, + "step": 770 + }, + { + 
"entropy": 2.049290281534195, + "epoch": 2.4928, + "grad_norm": 1.0144131183624268, + "learning_rate": 1.7591276284052695e-05, + "loss": 1.5437, + "mean_token_accuracy": 0.5986773043870925, + "num_tokens": 5844022.0, + "step": 780 + }, + { + "entropy": 2.033898201584816, + "epoch": 2.5248, + "grad_norm": 1.1700315475463867, + "learning_rate": 1.7523512894983396e-05, + "loss": 1.5197, + "mean_token_accuracy": 0.5972102269530296, + "num_tokens": 5919099.0, + "step": 790 + }, + { + "entropy": 2.03344586789608, + "epoch": 2.5568, + "grad_norm": 1.0503427982330322, + "learning_rate": 1.745494391788257e-05, + "loss": 1.5456, + "mean_token_accuracy": 0.6011263683438302, + "num_tokens": 5997797.0, + "step": 800 + }, + { + "entropy": 2.0796399265527725, + "epoch": 2.5888, + "grad_norm": 1.0316176414489746, + "learning_rate": 1.7385576694845324e-05, + "loss": 1.608, + "mean_token_accuracy": 0.6024919278919697, + "num_tokens": 6075434.0, + "step": 810 + }, + { + "entropy": 2.0257797837257385, + "epoch": 2.6208, + "grad_norm": 1.048309087753296, + "learning_rate": 1.7315418653439802e-05, + "loss": 1.4876, + "mean_token_accuracy": 0.6070949509739876, + "num_tokens": 6149232.0, + "step": 820 + }, + { + "entropy": 2.024846690893173, + "epoch": 2.6528, + "grad_norm": 1.186710000038147, + "learning_rate": 1.7244477305911845e-05, + "loss": 1.499, + "mean_token_accuracy": 0.6022308841347694, + "num_tokens": 6222180.0, + "step": 830 + }, + { + "entropy": 1.9938248336315154, + "epoch": 2.6848, + "grad_norm": 1.1091604232788086, + "learning_rate": 1.717276024838062e-05, + "loss": 1.4795, + "mean_token_accuracy": 0.6044012248516083, + "num_tokens": 6296902.0, + "step": 840 + }, + { + "entropy": 1.9988998174667358, + "epoch": 2.7168, + "grad_norm": 1.0359690189361572, + "learning_rate": 1.710027516002526e-05, + "loss": 1.5173, + "mean_token_accuracy": 0.6025070771574974, + "num_tokens": 6373494.0, + "step": 850 + }, + { + "entropy": 2.02343093752861, + "epoch": 2.7488, + "grad_norm": 
1.1783568859100342, + "learning_rate": 1.7027029802262598e-05, + "loss": 1.5146, + "mean_token_accuracy": 0.6033479735255242, + "num_tokens": 6449229.0, + "step": 860 + }, + { + "entropy": 2.0429257422685625, + "epoch": 2.7808, + "grad_norm": 0.9909568428993225, + "learning_rate": 1.6953032017916115e-05, + "loss": 1.5473, + "mean_token_accuracy": 0.5932901218533516, + "num_tokens": 6525728.0, + "step": 870 + }, + { + "entropy": 2.0058376491069794, + "epoch": 2.8128, + "grad_norm": 1.0904430150985718, + "learning_rate": 1.687828973037615e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6120153024792672, + "num_tokens": 6599335.0, + "step": 880 + }, + { + "entropy": 2.005480855703354, + "epoch": 2.8448, + "grad_norm": 1.1638548374176025, + "learning_rate": 1.6802810942751514e-05, + "loss": 1.4887, + "mean_token_accuracy": 0.6060751393437386, + "num_tokens": 6672722.0, + "step": 890 + }, + { + "entropy": 2.0311779022216796, + "epoch": 2.8768000000000002, + "grad_norm": 1.1404571533203125, + "learning_rate": 1.6726603737012527e-05, + "loss": 1.5238, + "mean_token_accuracy": 0.6015868663787842, + "num_tokens": 6748069.0, + "step": 900 + }, + { + "entropy": 2.0126856863498688, + "epoch": 2.9088000000000003, + "grad_norm": 1.0942543745040894, + "learning_rate": 1.6649676273125647e-05, + "loss": 1.4984, + "mean_token_accuracy": 0.6019899815320968, + "num_tokens": 6820935.0, + "step": 910 + }, + { + "entropy": 1.9961138010025024, + "epoch": 2.9408, + "grad_norm": 1.0870610475540161, + "learning_rate": 1.6572036788179728e-05, + "loss": 1.4962, + "mean_token_accuracy": 0.6030571654438972, + "num_tokens": 6896286.0, + "step": 920 + }, + { + "entropy": 2.035824549198151, + "epoch": 2.9728, + "grad_norm": 1.0822633504867554, + "learning_rate": 1.6493693595504022e-05, + "loss": 1.5354, + "mean_token_accuracy": 0.5986709952354431, + "num_tokens": 6971854.0, + "step": 930 + }, + { + "entropy": 2.0243908260997974, + "epoch": 3.0032, + "grad_norm": 1.0899602174758911, + 
"learning_rate": 1.6414655083778027e-05, + "loss": 1.5032, + "mean_token_accuracy": 0.5983682243447555, + "num_tokens": 7041122.0, + "step": 940 + }, + { + "entropy": 1.9538823068141937, + "epoch": 3.0352, + "grad_norm": 1.3042237758636475, + "learning_rate": 1.633492971613326e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6146818101406097, + "num_tokens": 7116032.0, + "step": 950 + }, + { + "entropy": 1.9383916020393372, + "epoch": 3.0672, + "grad_norm": 1.397078037261963, + "learning_rate": 1.6254526029247048e-05, + "loss": 1.4019, + "mean_token_accuracy": 0.6210932344198227, + "num_tokens": 7189009.0, + "step": 960 + }, + { + "entropy": 1.9460978150367736, + "epoch": 3.0992, + "grad_norm": 1.2756887674331665, + "learning_rate": 1.617345263242847e-05, + "loss": 1.4623, + "mean_token_accuracy": 0.6121616646647453, + "num_tokens": 7263068.0, + "step": 970 + }, + { + "entropy": 1.9156711965799331, + "epoch": 3.1312, + "grad_norm": 1.1937649250030518, + "learning_rate": 1.609171820669649e-05, + "loss": 1.4301, + "mean_token_accuracy": 0.6136599197983742, + "num_tokens": 7338652.0, + "step": 980 + }, + { + "entropy": 1.9247682303190232, + "epoch": 3.1632, + "grad_norm": 1.3291118144989014, + "learning_rate": 1.6009331503850448e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6153608947992325, + "num_tokens": 7414529.0, + "step": 990 + }, + { + "entropy": 1.9066543668508529, + "epoch": 3.1952, + "grad_norm": 1.4356389045715332, + "learning_rate": 1.5926301345532925e-05, + "loss": 1.4413, + "mean_token_accuracy": 0.612147618830204, + "num_tokens": 7489106.0, + "step": 1000 + }, + { + "entropy": 1.895160937309265, + "epoch": 3.2272, + "grad_norm": 1.4345523118972778, + "learning_rate": 1.5842636622285187e-05, + "loss": 1.4207, + "mean_token_accuracy": 0.6138400137424469, + "num_tokens": 7564304.0, + "step": 1010 + }, + { + "entropy": 1.9546802312135696, + "epoch": 3.2592, + "grad_norm": 1.5242680311203003, + "learning_rate": 1.575834629259519e-05, + "loss": 1.4435, + 
"mean_token_accuracy": 0.6153354361653328, + "num_tokens": 7637409.0, + "step": 1020 + }, + { + "entropy": 1.912938117980957, + "epoch": 3.2912, + "grad_norm": 1.529726505279541, + "learning_rate": 1.5673439381938365e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.6191004544496537, + "num_tokens": 7711595.0, + "step": 1030 + }, + { + "entropy": 1.8989770442247391, + "epoch": 3.3232, + "grad_norm": 1.3367948532104492, + "learning_rate": 1.5587924981811196e-05, + "loss": 1.394, + "mean_token_accuracy": 0.624155393242836, + "num_tokens": 7785750.0, + "step": 1040 + }, + { + "entropy": 1.932333904504776, + "epoch": 3.3552, + "grad_norm": 1.4732215404510498, + "learning_rate": 1.5501812248757734e-05, + "loss": 1.3959, + "mean_token_accuracy": 0.6221834555268287, + "num_tokens": 7859036.0, + "step": 1050 + }, + { + "entropy": 1.9492982983589173, + "epoch": 3.3872, + "grad_norm": 1.4499313831329346, + "learning_rate": 1.5415110403389166e-05, + "loss": 1.4633, + "mean_token_accuracy": 0.6100246667861938, + "num_tokens": 7933165.0, + "step": 1060 + }, + { + "entropy": 1.9063653618097305, + "epoch": 3.4192, + "grad_norm": 1.4364317655563354, + "learning_rate": 1.5327828729396482e-05, + "loss": 1.4216, + "mean_token_accuracy": 0.6210869938135147, + "num_tokens": 8009376.0, + "step": 1070 + }, + { + "entropy": 1.9919361650943757, + "epoch": 3.4512, + "grad_norm": 1.5573089122772217, + "learning_rate": 1.5239976572556438e-05, + "loss": 1.5899, + "mean_token_accuracy": 0.5991086520254612, + "num_tokens": 8086825.0, + "step": 1080 + }, + { + "entropy": 1.922476476430893, + "epoch": 3.4832, + "grad_norm": 1.3339344263076782, + "learning_rate": 1.5151563339730849e-05, + "loss": 1.4162, + "mean_token_accuracy": 0.6182018965482712, + "num_tokens": 8161726.0, + "step": 1090 + }, + { + "entropy": 1.9143129527568816, + "epoch": 3.5152, + "grad_norm": 1.4425708055496216, + "learning_rate": 1.506259849785931e-05, + "loss": 1.4076, + "mean_token_accuracy": 0.6197950705885887, + 
"num_tokens": 8237046.0, + "step": 1100 + }, + { + "entropy": 1.9093267023563385, + "epoch": 3.5472, + "grad_norm": 1.5437992811203003, + "learning_rate": 1.497309157294555e-05, + "loss": 1.4339, + "mean_token_accuracy": 0.6177847877144813, + "num_tokens": 8315350.0, + "step": 1110 + }, + { + "entropy": 1.9121424347162246, + "epoch": 3.5792, + "grad_norm": 1.3761622905731201, + "learning_rate": 1.4883052149037395e-05, + "loss": 1.4175, + "mean_token_accuracy": 0.6171463698148727, + "num_tokens": 8390383.0, + "step": 1120 + }, + { + "entropy": 1.883551675081253, + "epoch": 3.6112, + "grad_norm": 1.36739182472229, + "learning_rate": 1.479248986720057e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6186214044690133, + "num_tokens": 8468414.0, + "step": 1130 + }, + { + "entropy": 1.988349151611328, + "epoch": 3.6432, + "grad_norm": 1.4566738605499268, + "learning_rate": 1.4701414424486353e-05, + "loss": 1.5366, + "mean_token_accuracy": 0.6110676810145378, + "num_tokens": 8541715.0, + "step": 1140 + }, + { + "entropy": 1.9057112097740174, + "epoch": 3.6752000000000002, + "grad_norm": 1.499079704284668, + "learning_rate": 1.4609835572893266e-05, + "loss": 1.3991, + "mean_token_accuracy": 0.6208718970417977, + "num_tokens": 8615694.0, + "step": 1150 + }, + { + "entropy": 1.9219326049089431, + "epoch": 3.7072000000000003, + "grad_norm": 1.3865621089935303, + "learning_rate": 1.4517763118322861e-05, + "loss": 1.431, + "mean_token_accuracy": 0.6143050745129586, + "num_tokens": 8692473.0, + "step": 1160 + }, + { + "entropy": 1.9036399960517882, + "epoch": 3.7392, + "grad_norm": 1.5362603664398193, + "learning_rate": 1.4425206919529747e-05, + "loss": 1.4156, + "mean_token_accuracy": 0.6199175521731377, + "num_tokens": 8767618.0, + "step": 1170 + }, + { + "entropy": 1.9499989479780198, + "epoch": 3.7712, + "grad_norm": 1.663404941558838, + "learning_rate": 1.4332176887065955e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.605186915397644, + "num_tokens": 8843100.0, + 
"step": 1180 + }, + { + "entropy": 1.9545456051826477, + "epoch": 3.8032, + "grad_norm": 1.6169345378875732, + "learning_rate": 1.4238682982219753e-05, + "loss": 1.4241, + "mean_token_accuracy": 0.6206902250647545, + "num_tokens": 8914604.0, + "step": 1190 + }, + { + "entropy": 1.9130536198616028, + "epoch": 3.8352, + "grad_norm": 1.472740650177002, + "learning_rate": 1.4144735215949028e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6139126420021057, + "num_tokens": 8989305.0, + "step": 1200 + }, + { + "entropy": 1.938635140657425, + "epoch": 3.8672, + "grad_norm": 1.4194226264953613, + "learning_rate": 1.4050343647809354e-05, + "loss": 1.4538, + "mean_token_accuracy": 0.6131341770291329, + "num_tokens": 9065589.0, + "step": 1210 + }, + { + "entropy": 1.9123675346374511, + "epoch": 3.8992, + "grad_norm": 1.5208053588867188, + "learning_rate": 1.3955518384876863e-05, + "loss": 1.4309, + "mean_token_accuracy": 0.6139545351266861, + "num_tokens": 9140150.0, + "step": 1220 + }, + { + "entropy": 1.9148090302944183, + "epoch": 3.9312, + "grad_norm": 1.6418218612670898, + "learning_rate": 1.3860269580666004e-05, + "loss": 1.4269, + "mean_token_accuracy": 0.6169310078024864, + "num_tokens": 9215796.0, + "step": 1230 + }, + { + "entropy": 1.9157740741968154, + "epoch": 3.9632, + "grad_norm": 1.4638084173202515, + "learning_rate": 1.3764607434042353e-05, + "loss": 1.4509, + "mean_token_accuracy": 0.6164968460798264, + "num_tokens": 9291010.0, + "step": 1240 + }, + { + "entropy": 1.9184510678052902, + "epoch": 3.9952, + "grad_norm": 1.5152716636657715, + "learning_rate": 1.3668542188130567e-05, + "loss": 1.4453, + "mean_token_accuracy": 0.6112410992383956, + "num_tokens": 9367186.0, + "step": 1250 + }, + { + "entropy": 1.9016748384425515, + "epoch": 4.0256, + "grad_norm": 1.490628719329834, + "learning_rate": 1.3572084129217566e-05, + "loss": 1.382, + "mean_token_accuracy": 0.623968276538347, + "num_tokens": 9439028.0, + "step": 1260 + }, + { + "entropy": 
1.8026290327310561, + "epoch": 4.0576, + "grad_norm": 1.8969308137893677, + "learning_rate": 1.347524358565115e-05, + "loss": 1.3128, + "mean_token_accuracy": 0.6386646836996078, + "num_tokens": 9513855.0, + "step": 1270 + }, + { + "entropy": 1.8283424764871596, + "epoch": 4.0896, + "grad_norm": 1.5952194929122925, + "learning_rate": 1.3378030926734052e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6328515768051147, + "num_tokens": 9589080.0, + "step": 1280 + }, + { + "entropy": 1.8405955344438554, + "epoch": 4.1216, + "grad_norm": 1.6057584285736084, + "learning_rate": 1.3280456561613653e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6261398203670978, + "num_tokens": 9666808.0, + "step": 1290 + }, + { + "entropy": 1.8390818655490875, + "epoch": 4.1536, + "grad_norm": 1.8149824142456055, + "learning_rate": 1.3182530938167409e-05, + "loss": 1.3455, + "mean_token_accuracy": 0.6318597674369812, + "num_tokens": 9740267.0, + "step": 1300 + }, + { + "entropy": 1.8203887075185776, + "epoch": 4.1856, + "grad_norm": 1.6102676391601562, + "learning_rate": 1.3084264541884118e-05, + "loss": 1.3255, + "mean_token_accuracy": 0.6316933467984199, + "num_tokens": 9816400.0, + "step": 1310 + }, + { + "entropy": 1.8592366576194763, + "epoch": 4.2176, + "grad_norm": 1.9501773118972778, + "learning_rate": 1.2985667894741197e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6301594719290733, + "num_tokens": 9889311.0, + "step": 1320 + }, + { + "entropy": 1.8420085966587068, + "epoch": 4.2496, + "grad_norm": 1.6526106595993042, + "learning_rate": 1.2886751554078015e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6302071824669838, + "num_tokens": 9965339.0, + "step": 1330 + }, + { + "entropy": 1.8313881188631058, + "epoch": 4.2816, + "grad_norm": 1.6269904375076294, + "learning_rate": 1.2787526111465453e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6328388035297394, + "num_tokens": 10039668.0, + "step": 1340 + }, + { + "entropy": 1.858151137828827, + "epoch": 4.3136, + 
"grad_norm": 1.9028024673461914, + "learning_rate": 1.2688002191571829e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6276688367128372, + "num_tokens": 10115387.0, + "step": 1350 + }, + { + "entropy": 1.8273844957351684, + "epoch": 4.3456, + "grad_norm": 1.7530555725097656, + "learning_rate": 1.2588190451025209e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.6345869660377502, + "num_tokens": 10191506.0, + "step": 1360 + }, + { + "entropy": 1.8732422679662704, + "epoch": 4.3776, + "grad_norm": 1.7372691631317139, + "learning_rate": 1.248810157727236e-05, + "loss": 1.4132, + "mean_token_accuracy": 0.6252246856689453, + "num_tokens": 10268756.0, + "step": 1370 + }, + { + "entropy": 1.8583054572343827, + "epoch": 4.4096, + "grad_norm": 1.6993470191955566, + "learning_rate": 1.2387746287434385e-05, + "loss": 1.3638, + "mean_token_accuracy": 0.6286717876791954, + "num_tokens": 10341779.0, + "step": 1380 + }, + { + "entropy": 1.8324467271566391, + "epoch": 4.4416, + "grad_norm": 1.7818169593811035, + "learning_rate": 1.2287135327159165e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6361263945698739, + "num_tokens": 10414642.0, + "step": 1390 + }, + { + "entropy": 1.8514392852783204, + "epoch": 4.4736, + "grad_norm": 1.7585517168045044, + "learning_rate": 1.2186279469470757e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.628801380097866, + "num_tokens": 10489517.0, + "step": 1400 + }, + { + "entropy": 1.8218136370182036, + "epoch": 4.5056, + "grad_norm": 1.9843116998672485, + "learning_rate": 1.2085189513615872e-05, + "loss": 1.3628, + "mean_token_accuracy": 0.6295172199606895, + "num_tokens": 10565467.0, + "step": 1410 + }, + { + "entropy": 1.8919565021991729, + "epoch": 4.5376, + "grad_norm": 1.9309132099151611, + "learning_rate": 1.1983876283907522e-05, + "loss": 1.4467, + "mean_token_accuracy": 0.6263746194541454, + "num_tokens": 10641283.0, + "step": 1420 + }, + { + "entropy": 1.8356508910655975, + "epoch": 4.5696, + "grad_norm": 1.7685068845748901, + 
"learning_rate": 1.1882350628566008e-05, + "loss": 1.3631, + "mean_token_accuracy": 0.624418406188488, + "num_tokens": 10716701.0, + "step": 1430 + }, + { + "entropy": 1.8288098931312562, + "epoch": 4.6016, + "grad_norm": 1.8276050090789795, + "learning_rate": 1.178062341855732e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6286922857165337, + "num_tokens": 10791427.0, + "step": 1440 + }, + { + "entropy": 1.8557640790939331, + "epoch": 4.6336, + "grad_norm": 1.7773240804672241, + "learning_rate": 1.1678705546429132e-05, + "loss": 1.383, + "mean_token_accuracy": 0.6216814562678337, + "num_tokens": 10866356.0, + "step": 1450 + }, + { + "entropy": 1.8483826667070389, + "epoch": 4.6655999999999995, + "grad_norm": 1.831931710243225, + "learning_rate": 1.1576607925144456e-05, + "loss": 1.355, + "mean_token_accuracy": 0.6278511002659798, + "num_tokens": 10940772.0, + "step": 1460 + }, + { + "entropy": 1.8824394553899766, + "epoch": 4.6975999999999996, + "grad_norm": 1.9213542938232422, + "learning_rate": 1.1474341486913146e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.6256057649850846, + "num_tokens": 11016144.0, + "step": 1470 + }, + { + "entropy": 1.8709469974040984, + "epoch": 4.7296, + "grad_norm": 1.8768925666809082, + "learning_rate": 1.1371917182021297e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6317574754357338, + "num_tokens": 11089939.0, + "step": 1480 + }, + { + "entropy": 1.8673742085695266, + "epoch": 4.7616, + "grad_norm": 1.796302318572998, + "learning_rate": 1.1269345977658747e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6238353416323662, + "num_tokens": 11166087.0, + "step": 1490 + }, + { + "entropy": 1.8310889720916748, + "epoch": 4.7936, + "grad_norm": 1.8969939947128296, + "learning_rate": 1.1166638856744747e-05, + "loss": 1.3373, + "mean_token_accuracy": 0.6348015949130058, + "num_tokens": 11240732.0, + "step": 1500 + }, + { + "entropy": 1.8809226244688033, + "epoch": 4.8256, + "grad_norm": 1.642104983329773, + "learning_rate": 
1.1063806816751957e-05, + "loss": 1.3792, + "mean_token_accuracy": 0.6250617265701294, + "num_tokens": 11316878.0, + "step": 1510 + }, + { + "entropy": 1.8715822875499726, + "epoch": 4.8576, + "grad_norm": 1.962158441543579, + "learning_rate": 1.0960860868528872e-05, + "loss": 1.3711, + "mean_token_accuracy": 0.6293752744793892, + "num_tokens": 11389042.0, + "step": 1520 + }, + { + "entropy": 1.8657191127538681, + "epoch": 4.8896, + "grad_norm": 1.9577444791793823, + "learning_rate": 1.0857812035120845e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6259156972169876, + "num_tokens": 11464215.0, + "step": 1530 + }, + { + "entropy": 1.8811951220035552, + "epoch": 4.9216, + "grad_norm": 2.015150785446167, + "learning_rate": 1.0754671350589752e-05, + "loss": 1.4155, + "mean_token_accuracy": 0.626779156178236, + "num_tokens": 11539122.0, + "step": 1540 + }, + { + "entropy": 1.863905319571495, + "epoch": 4.9536, + "grad_norm": 1.8474093675613403, + "learning_rate": 1.065144985883253e-05, + "loss": 1.3409, + "mean_token_accuracy": 0.6319419264793396, + "num_tokens": 11613016.0, + "step": 1550 + }, + { + "entropy": 1.836970153450966, + "epoch": 4.9856, + "grad_norm": 1.8822177648544312, + "learning_rate": 1.054815861239864e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6292115703225136, + "num_tokens": 11688143.0, + "step": 1560 + }, + { + "entropy": 1.8377465950815302, + "epoch": 5.016, + "grad_norm": 1.8221346139907837, + "learning_rate": 1.0444808671306588e-05, + "loss": 1.3028, + "mean_token_accuracy": 0.6413120329380035, + "num_tokens": 11758768.0, + "step": 1570 + }, + { + "entropy": 1.7883025139570237, + "epoch": 5.048, + "grad_norm": 2.1959595680236816, + "learning_rate": 1.034141110185968e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6479741290211678, + "num_tokens": 11832210.0, + "step": 1580 + }, + { + "entropy": 1.7955584406852723, + "epoch": 5.08, + "grad_norm": 2.106905698776245, + "learning_rate": 1.0237976975461074e-05, + "loss": 1.2881, + 
"mean_token_accuracy": 0.6424632370471954, + "num_tokens": 11906115.0, + "step": 1590 + }, + { + "entropy": 1.7998322755098344, + "epoch": 5.112, + "grad_norm": 2.327314615249634, + "learning_rate": 1.0134517367428309e-05, + "loss": 1.3324, + "mean_token_accuracy": 0.6460248224437237, + "num_tokens": 11981328.0, + "step": 1600 + }, + { + "entropy": 1.7885828018188477, + "epoch": 5.144, + "grad_norm": 2.1001713275909424, + "learning_rate": 1.0031043355807386e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.63900758177042, + "num_tokens": 12056453.0, + "step": 1610 + }, + { + "entropy": 1.769435602426529, + "epoch": 5.176, + "grad_norm": 2.1210567951202393, + "learning_rate": 9.927566020186592e-06, + "loss": 1.2892, + "mean_token_accuracy": 0.6432970002293587, + "num_tokens": 12133433.0, + "step": 1620 + }, + { + "entropy": 1.7907766073942184, + "epoch": 5.208, + "grad_norm": 2.1842658519744873, + "learning_rate": 9.82409644051013e-06, + "loss": 1.2856, + "mean_token_accuracy": 0.6423615619540215, + "num_tokens": 12207150.0, + "step": 1630 + }, + { + "entropy": 1.7834827870130538, + "epoch": 5.24, + "grad_norm": 2.2503459453582764, + "learning_rate": 9.720645695891733e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6417693704366684, + "num_tokens": 12282584.0, + "step": 1640 + }, + { + "entropy": 1.763256973028183, + "epoch": 5.272, + "grad_norm": 1.9505388736724854, + "learning_rate": 9.617224863428346e-06, + "loss": 1.2951, + "mean_token_accuracy": 0.6429389595985413, + "num_tokens": 12359793.0, + "step": 1650 + }, + { + "entropy": 1.8142763644456863, + "epoch": 5.304, + "grad_norm": 1.9957698583602905, + "learning_rate": 9.513845017014048e-06, + "loss": 1.33, + "mean_token_accuracy": 0.6413653999567032, + "num_tokens": 12434251.0, + "step": 1660 + }, + { + "entropy": 1.797221601009369, + "epoch": 5.336, + "grad_norm": 2.5095462799072266, + "learning_rate": 9.410517226154276e-06, + "loss": 1.2978, + "mean_token_accuracy": 0.6389835774898529, + "num_tokens": 
12508416.0, + "step": 1670 + }, + { + "entropy": 1.8157870292663574, + "epoch": 5.368, + "grad_norm": 2.1890602111816406, + "learning_rate": 9.30725255478058e-06, + "loss": 1.3183, + "mean_token_accuracy": 0.6448161751031876, + "num_tokens": 12582896.0, + "step": 1680 + }, + { + "entropy": 1.7990054041147232, + "epoch": 5.4, + "grad_norm": 2.3904025554656982, + "learning_rate": 9.204062060065915e-06, + "loss": 1.3318, + "mean_token_accuracy": 0.636146479845047, + "num_tokens": 12656802.0, + "step": 1690 + }, + { + "entropy": 1.8003453463315964, + "epoch": 5.432, + "grad_norm": 1.9204304218292236, + "learning_rate": 9.100956791240699e-06, + "loss": 1.3186, + "mean_token_accuracy": 0.6372130662202835, + "num_tokens": 12733283.0, + "step": 1700 + }, + { + "entropy": 1.8101116061210631, + "epoch": 5.464, + "grad_norm": 2.009500026702881, + "learning_rate": 8.997947788409696e-06, + "loss": 1.3247, + "mean_token_accuracy": 0.6406339526176452, + "num_tokens": 12810272.0, + "step": 1710 + }, + { + "entropy": 1.764935952425003, + "epoch": 5.496, + "grad_norm": 2.2038798332214355, + "learning_rate": 8.89504608136989e-06, + "loss": 1.2792, + "mean_token_accuracy": 0.6445836886763573, + "num_tokens": 12885633.0, + "step": 1720 + }, + { + "entropy": 1.7950240582227708, + "epoch": 5.5280000000000005, + "grad_norm": 2.0160531997680664, + "learning_rate": 8.792262688429445e-06, + "loss": 1.2934, + "mean_token_accuracy": 0.6469692558050155, + "num_tokens": 12961131.0, + "step": 1730 + }, + { + "entropy": 1.7804677098989488, + "epoch": 5.5600000000000005, + "grad_norm": 2.1956582069396973, + "learning_rate": 8.689608615227933e-06, + "loss": 1.2969, + "mean_token_accuracy": 0.6438389763236045, + "num_tokens": 13036481.0, + "step": 1740 + }, + { + "entropy": 1.7932062089443206, + "epoch": 5.592, + "grad_norm": 2.2215394973754883, + "learning_rate": 8.587094853557877e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6460438340902328, + "num_tokens": 13111001.0, + "step": 1750 + }, + { 
+ "entropy": 1.8026408910751344, + "epoch": 5.624, + "grad_norm": 2.3881425857543945, + "learning_rate": 8.484732380187785e-06, + "loss": 1.3049, + "mean_token_accuracy": 0.6414234206080437, + "num_tokens": 13186347.0, + "step": 1760 + }, + { + "entropy": 1.8440747499465941, + "epoch": 5.656, + "grad_norm": 2.2154159545898438, + "learning_rate": 8.382532155686825e-06, + "loss": 1.3797, + "mean_token_accuracy": 0.6365857936441899, + "num_tokens": 13261455.0, + "step": 1770 + }, + { + "entropy": 1.7975190997123718, + "epoch": 5.688, + "grad_norm": 2.1991233825683594, + "learning_rate": 8.280505123251183e-06, + "loss": 1.3191, + "mean_token_accuracy": 0.6393151715397835, + "num_tokens": 13338064.0, + "step": 1780 + }, + { + "entropy": 1.8396487146615983, + "epoch": 5.72, + "grad_norm": 2.0190858840942383, + "learning_rate": 8.178662207532343e-06, + "loss": 1.4052, + "mean_token_accuracy": 0.629064130038023, + "num_tokens": 13414806.0, + "step": 1790 + }, + { + "entropy": 1.7840806126594544, + "epoch": 5.752, + "grad_norm": 2.3335204124450684, + "learning_rate": 8.077014313467274e-06, + "loss": 1.2701, + "mean_token_accuracy": 0.6464540064334869, + "num_tokens": 13489075.0, + "step": 1800 + }, + { + "entropy": 1.7840022534132003, + "epoch": 5.784, + "grad_norm": 2.2151618003845215, + "learning_rate": 7.975572325110819e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6358998969197274, + "num_tokens": 13565636.0, + "step": 1810 + }, + { + "entropy": 1.7677135676145554, + "epoch": 5.816, + "grad_norm": 2.11505389213562, + "learning_rate": 7.874347104470234e-06, + "loss": 1.2765, + "mean_token_accuracy": 0.6448719501495361, + "num_tokens": 13641112.0, + "step": 1820 + }, + { + "entropy": 1.7586044907569884, + "epoch": 5.848, + "grad_norm": 2.178250789642334, + "learning_rate": 7.773349490342157e-06, + "loss": 1.2846, + "mean_token_accuracy": 0.6450280979275703, + "num_tokens": 13715158.0, + "step": 1830 + }, + { + "entropy": 1.8128920108079911, + "epoch": 5.88, + 
"grad_norm": 2.2499353885650635, + "learning_rate": 7.672590297152013e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6343795835971833, + "num_tokens": 13791086.0, + "step": 1840 + }, + { + "entropy": 1.7873643577098846, + "epoch": 5.912, + "grad_norm": 2.1989104747772217, + "learning_rate": 7.572080313796064e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6413815975189209, + "num_tokens": 13865700.0, + "step": 1850 + }, + { + "entropy": 1.790488451719284, + "epoch": 5.944, + "grad_norm": 2.2605504989624023, + "learning_rate": 7.471830302486151e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6424889475107193, + "num_tokens": 13938540.0, + "step": 1860 + }, + { + "entropy": 1.7985246628522873, + "epoch": 5.976, + "grad_norm": 2.3228533267974854, + "learning_rate": 7.371850997597355e-06, + "loss": 1.2854, + "mean_token_accuracy": 0.6457341402769089, + "num_tokens": 14011087.0, + "step": 1870 + }, + { + "entropy": 1.7890221727521796, + "epoch": 6.0064, + "grad_norm": 2.192910671234131, + "learning_rate": 7.272153104518567e-06, + "loss": 1.2796, + "mean_token_accuracy": 0.6482133128141102, + "num_tokens": 14082075.0, + "step": 1880 + }, + { + "entropy": 1.7633086562156677, + "epoch": 6.0384, + "grad_norm": 2.368185043334961, + "learning_rate": 7.172747298506224e-06, + "loss": 1.2703, + "mean_token_accuracy": 0.6497290328145027, + "num_tokens": 14156298.0, + "step": 1890 + }, + { + "entropy": 1.750128635764122, + "epoch": 6.0704, + "grad_norm": 2.36487078666687, + "learning_rate": 7.073644223541227e-06, + "loss": 1.2521, + "mean_token_accuracy": 0.6534707516431808, + "num_tokens": 14232528.0, + "step": 1900 + }, + { + "entropy": 1.7416553497314453, + "epoch": 6.1024, + "grad_norm": 2.3927595615386963, + "learning_rate": 6.974854491189243e-06, + "loss": 1.217, + "mean_token_accuracy": 0.6588135868310928, + "num_tokens": 14307073.0, + "step": 1910 + }, + { + "entropy": 1.7359241485595702, + "epoch": 6.1344, + "grad_norm": 2.1107988357543945, + "learning_rate": 
6.876388679464437e-06, + "loss": 1.2763, + "mean_token_accuracy": 0.6550255373120308, + "num_tokens": 14383819.0, + "step": 1920 + }, + { + "entropy": 1.7380403220653533, + "epoch": 6.1664, + "grad_norm": 2.4158387184143066, + "learning_rate": 6.7782573316968424e-06, + "loss": 1.2359, + "mean_token_accuracy": 0.656632873415947, + "num_tokens": 14460092.0, + "step": 1930 + }, + { + "entropy": 1.7227638810873032, + "epoch": 6.1984, + "grad_norm": 2.3467485904693604, + "learning_rate": 6.6804709554034075e-06, + "loss": 1.2311, + "mean_token_accuracy": 0.654091839492321, + "num_tokens": 14534160.0, + "step": 1940 + }, + { + "entropy": 1.7244948148727417, + "epoch": 6.2304, + "grad_norm": 2.760057210922241, + "learning_rate": 6.583040021162905e-06, + "loss": 1.2189, + "mean_token_accuracy": 0.6611428812146187, + "num_tokens": 14608592.0, + "step": 1950 + }, + { + "entropy": 1.7471657902002335, + "epoch": 6.2624, + "grad_norm": 2.3923745155334473, + "learning_rate": 6.485974961494772e-06, + "loss": 1.2631, + "mean_token_accuracy": 0.6524021357297898, + "num_tokens": 14683538.0, + "step": 1960 + }, + { + "entropy": 1.7506494253873826, + "epoch": 6.2943999999999996, + "grad_norm": 2.4149715900421143, + "learning_rate": 6.389286169742048e-06, + "loss": 1.2468, + "mean_token_accuracy": 0.6567713841795921, + "num_tokens": 14755778.0, + "step": 1970 + }, + { + "entropy": 1.7104488879442215, + "epoch": 6.3264, + "grad_norm": 2.632632255554199, + "learning_rate": 6.292983998958478e-06, + "loss": 1.2267, + "mean_token_accuracy": 0.6561126798391342, + "num_tokens": 14831036.0, + "step": 1980 + }, + { + "entropy": 1.7591658294200898, + "epoch": 6.3584, + "grad_norm": 2.4012722969055176, + "learning_rate": 6.1970787607999815e-06, + "loss": 1.2693, + "mean_token_accuracy": 0.6490694522857666, + "num_tokens": 14906610.0, + "step": 1990 + }, + { + "entropy": 1.7317969173192977, + "epoch": 6.3904, + "grad_norm": 2.8288748264312744, + "learning_rate": 6.101580724420478e-06, + "loss": 
1.235, + "mean_token_accuracy": 0.6564200609922409, + "num_tokens": 14980134.0, + "step": 2000 + }, + { + "entropy": 1.7617577254772185, + "epoch": 6.4224, + "grad_norm": 2.4008944034576416, + "learning_rate": 6.00650011537235e-06, + "loss": 1.2773, + "mean_token_accuracy": 0.6494350075721741, + "num_tokens": 15054969.0, + "step": 2010 + }, + { + "entropy": 1.749829688668251, + "epoch": 6.4544, + "grad_norm": 2.5665228366851807, + "learning_rate": 5.911847114511497e-06, + "loss": 1.2512, + "mean_token_accuracy": 0.6512764275074006, + "num_tokens": 15129421.0, + "step": 2020 + }, + { + "entropy": 1.7387797951698303, + "epoch": 6.4864, + "grad_norm": 2.6020922660827637, + "learning_rate": 5.817631856907233e-06, + "loss": 1.2477, + "mean_token_accuracy": 0.6530226185917855, + "num_tokens": 15203465.0, + "step": 2030 + }, + { + "entropy": 1.7363551884889603, + "epoch": 6.5184, + "grad_norm": 2.161478281021118, + "learning_rate": 5.723864430757047e-06, + "loss": 1.2692, + "mean_token_accuracy": 0.6527093783020973, + "num_tokens": 15279761.0, + "step": 2040 + }, + { + "entropy": 1.7563295543193818, + "epoch": 6.5504, + "grad_norm": 2.5587289333343506, + "learning_rate": 5.630554876306407e-06, + "loss": 1.2211, + "mean_token_accuracy": 0.6574550330638885, + "num_tokens": 15351301.0, + "step": 2050 + }, + { + "entropy": 1.7521151036024094, + "epoch": 6.5824, + "grad_norm": 2.4042234420776367, + "learning_rate": 5.537713184773686e-06, + "loss": 1.271, + "mean_token_accuracy": 0.6478641331195831, + "num_tokens": 15427936.0, + "step": 2060 + }, + { + "entropy": 1.7270145863294601, + "epoch": 6.6144, + "grad_norm": 2.3122522830963135, + "learning_rate": 5.44534929728036e-06, + "loss": 1.224, + "mean_token_accuracy": 0.6566437393426895, + "num_tokens": 15502561.0, + "step": 2070 + }, + { + "entropy": 1.7568089962005615, + "epoch": 6.6464, + "grad_norm": 2.461474895477295, + "learning_rate": 5.353473103786511e-06, + "loss": 1.2864, + "mean_token_accuracy": 0.6471308276057244, + 
"num_tokens": 15578053.0, + "step": 2080 + }, + { + "entropy": 1.7190027862787247, + "epoch": 6.6784, + "grad_norm": 2.4412550926208496, + "learning_rate": 5.262094442031901e-06, + "loss": 1.2092, + "mean_token_accuracy": 0.6601713746786118, + "num_tokens": 15653342.0, + "step": 2090 + }, + { + "entropy": 1.717634916305542, + "epoch": 6.7104, + "grad_norm": 2.276007890701294, + "learning_rate": 5.171223096482533e-06, + "loss": 1.2271, + "mean_token_accuracy": 0.6595920532941818, + "num_tokens": 15730387.0, + "step": 2100 + }, + { + "entropy": 1.7230647921562194, + "epoch": 6.7424, + "grad_norm": 2.480471134185791, + "learning_rate": 5.080868797283019e-06, + "loss": 1.229, + "mean_token_accuracy": 0.6568982020020485, + "num_tokens": 15804405.0, + "step": 2110 + }, + { + "entropy": 1.7535502433776855, + "epoch": 6.7744, + "grad_norm": 2.448997974395752, + "learning_rate": 4.9910412192146795e-06, + "loss": 1.2584, + "mean_token_accuracy": 0.648795773088932, + "num_tokens": 15878537.0, + "step": 2120 + }, + { + "entropy": 1.786664029955864, + "epoch": 6.8064, + "grad_norm": 2.430039405822754, + "learning_rate": 4.901749980659617e-06, + "loss": 1.3358, + "mean_token_accuracy": 0.6427689291536808, + "num_tokens": 15952964.0, + "step": 2130 + }, + { + "entropy": 1.7594995677471161, + "epoch": 6.8384, + "grad_norm": 2.469172716140747, + "learning_rate": 4.813004642570822e-06, + "loss": 1.2844, + "mean_token_accuracy": 0.6534359715878963, + "num_tokens": 16028086.0, + "step": 2140 + }, + { + "entropy": 1.7347292125225067, + "epoch": 6.8704, + "grad_norm": 2.6162445545196533, + "learning_rate": 4.724814707448418e-06, + "loss": 1.2707, + "mean_token_accuracy": 0.6447671175003051, + "num_tokens": 16103263.0, + "step": 2150 + }, + { + "entropy": 1.7325938045978546, + "epoch": 6.9024, + "grad_norm": 2.416431188583374, + "learning_rate": 4.637189618322173e-06, + "loss": 1.2794, + "mean_token_accuracy": 0.6439008563756943, + "num_tokens": 16182360.0, + "step": 2160 + }, + { + 
"entropy": 1.7763712674379348, + "epoch": 6.9344, + "grad_norm": 2.3447437286376953, + "learning_rate": 4.550138757740381e-06, + "loss": 1.3043, + "mean_token_accuracy": 0.650251479446888, + "num_tokens": 16256272.0, + "step": 2170 + }, + { + "entropy": 1.739478302001953, + "epoch": 6.9664, + "grad_norm": 2.650451183319092, + "learning_rate": 4.463671446765206e-06, + "loss": 1.259, + "mean_token_accuracy": 0.6514677822589874, + "num_tokens": 16330984.0, + "step": 2180 + }, + { + "entropy": 1.7292406976222991, + "epoch": 6.9984, + "grad_norm": 2.5442306995391846, + "learning_rate": 4.377796943974641e-06, + "loss": 1.2554, + "mean_token_accuracy": 0.6506337657570839, + "num_tokens": 16406982.0, + "step": 2190 + }, + { + "entropy": 1.7582613606201976, + "epoch": 7.0288, + "grad_norm": 2.315408706665039, + "learning_rate": 4.292524444471097e-06, + "loss": 1.2766, + "mean_token_accuracy": 0.6549999933493765, + "num_tokens": 16479870.0, + "step": 2200 + }, + { + "entropy": 1.6856683611869812, + "epoch": 7.0608, + "grad_norm": 2.6291606426239014, + "learning_rate": 4.2078630788968775e-06, + "loss": 1.2051, + "mean_token_accuracy": 0.6612728327512741, + "num_tokens": 16557045.0, + "step": 2210 + }, + { + "entropy": 1.7316391229629517, + "epoch": 7.0928, + "grad_norm": 2.767998695373535, + "learning_rate": 4.123821912456457e-06, + "loss": 1.1981, + "mean_token_accuracy": 0.6648431628942489, + "num_tokens": 16629418.0, + "step": 2220 + }, + { + "entropy": 1.6988701403141022, + "epoch": 7.1248, + "grad_norm": 2.752492904663086, + "learning_rate": 4.040409943945856e-06, + "loss": 1.2277, + "mean_token_accuracy": 0.6629953101277352, + "num_tokens": 16706000.0, + "step": 2230 + }, + { + "entropy": 1.6829528212547302, + "epoch": 7.1568, + "grad_norm": 2.5362343788146973, + "learning_rate": 3.957636104789056e-06, + "loss": 1.1526, + "mean_token_accuracy": 0.6730572417378425, + "num_tokens": 16781728.0, + "step": 2240 + }, + { + "entropy": 1.7018825829029083, + "epoch": 7.1888, + 
"grad_norm": 2.772650718688965, + "learning_rate": 3.875509258081671e-06, + "loss": 1.1978, + "mean_token_accuracy": 0.6671716704964638, + "num_tokens": 16856290.0, + "step": 2250 + }, + { + "entropy": 1.7013772219419478, + "epoch": 7.2208, + "grad_norm": 2.548017740249634, + "learning_rate": 3.794038197641924e-06, + "loss": 1.2166, + "mean_token_accuracy": 0.6607363104820252, + "num_tokens": 16933284.0, + "step": 2260 + }, + { + "entropy": 1.7119423121213913, + "epoch": 7.2528, + "grad_norm": 3.085073947906494, + "learning_rate": 3.713231647069031e-06, + "loss": 1.1811, + "mean_token_accuracy": 0.6690418004989624, + "num_tokens": 17006677.0, + "step": 2270 + }, + { + "entropy": 1.6958551973104476, + "epoch": 7.2848, + "grad_norm": 2.9240498542785645, + "learning_rate": 3.633098258809119e-06, + "loss": 1.1998, + "mean_token_accuracy": 0.6630783304572105, + "num_tokens": 17081748.0, + "step": 2280 + }, + { + "entropy": 1.7125141024589539, + "epoch": 7.3168, + "grad_norm": 2.5372555255889893, + "learning_rate": 3.5536466132287562e-06, + "loss": 1.1941, + "mean_token_accuracy": 0.6668895781040192, + "num_tokens": 17155159.0, + "step": 2290 + }, + { + "entropy": 1.699021890759468, + "epoch": 7.3488, + "grad_norm": 2.555891752243042, + "learning_rate": 3.4748852176961912e-06, + "loss": 1.1871, + "mean_token_accuracy": 0.6658635303378105, + "num_tokens": 17231062.0, + "step": 2300 + }, + { + "entropy": 1.7193275570869446, + "epoch": 7.3808, + "grad_norm": 2.6699490547180176, + "learning_rate": 3.3968225056704427e-06, + "loss": 1.2396, + "mean_token_accuracy": 0.6561257526278496, + "num_tokens": 17306280.0, + "step": 2310 + }, + { + "entropy": 1.714812269806862, + "epoch": 7.4128, + "grad_norm": 2.8019561767578125, + "learning_rate": 3.319466835798235e-06, + "loss": 1.2006, + "mean_token_accuracy": 0.6625583916902542, + "num_tokens": 17379705.0, + "step": 2320 + }, + { + "entropy": 1.711946851015091, + "epoch": 7.4448, + "grad_norm": 2.758375644683838, + "learning_rate": 
3.2428264910190398e-06, + "loss": 1.2234, + "mean_token_accuracy": 0.6590609803795815, + "num_tokens": 17453837.0, + "step": 2330 + }, + { + "entropy": 1.7215852111577987, + "epoch": 7.4768, + "grad_norm": 2.609473466873169, + "learning_rate": 3.166909677678116e-06, + "loss": 1.2182, + "mean_token_accuracy": 0.6575865730643272, + "num_tokens": 17528089.0, + "step": 2340 + }, + { + "entropy": 1.6932504892349243, + "epoch": 7.5088, + "grad_norm": 2.3272244930267334, + "learning_rate": 3.091724524647861e-06, + "loss": 1.1907, + "mean_token_accuracy": 0.665678508579731, + "num_tokens": 17603632.0, + "step": 2350 + }, + { + "entropy": 1.6796110332012177, + "epoch": 7.5408, + "grad_norm": 2.619145631790161, + "learning_rate": 3.0172790824573627e-06, + "loss": 1.1693, + "mean_token_accuracy": 0.6696520581841469, + "num_tokens": 17678354.0, + "step": 2360 + }, + { + "entropy": 1.7049964010715484, + "epoch": 7.5728, + "grad_norm": 2.4621567726135254, + "learning_rate": 2.943581322430399e-06, + "loss": 1.2376, + "mean_token_accuracy": 0.6554866015911103, + "num_tokens": 17755529.0, + "step": 2370 + }, + { + "entropy": 1.734333610534668, + "epoch": 7.6048, + "grad_norm": 2.7798454761505127, + "learning_rate": 2.8706391358318942e-06, + "loss": 1.2638, + "mean_token_accuracy": 0.6574688166379928, + "num_tokens": 17829887.0, + "step": 2380 + }, + { + "entropy": 1.7209048956632613, + "epoch": 7.6368, + "grad_norm": 2.579355239868164, + "learning_rate": 2.798460333022959e-06, + "loss": 1.2515, + "mean_token_accuracy": 0.6567336574196816, + "num_tokens": 17905006.0, + "step": 2390 + }, + { + "entropy": 1.7461798429489135, + "epoch": 7.6688, + "grad_norm": 2.3670754432678223, + "learning_rate": 2.72705264262458e-06, + "loss": 1.2845, + "mean_token_accuracy": 0.6506199359893798, + "num_tokens": 17980732.0, + "step": 2400 + }, + { + "entropy": 1.6880094558000565, + "epoch": 7.7008, + "grad_norm": 2.6313459873199463, + "learning_rate": 2.6564237106900815e-06, + "loss": 1.188, + 
"mean_token_accuracy": 0.6656670674681664, + "num_tokens": 18055716.0, + "step": 2410 + }, + { + "entropy": 1.695495843887329, + "epoch": 7.7328, + "grad_norm": 2.5075557231903076, + "learning_rate": 2.586581099886396e-06, + "loss": 1.2249, + "mean_token_accuracy": 0.6589935123920441, + "num_tokens": 18132898.0, + "step": 2420 + }, + { + "entropy": 1.7270244836807251, + "epoch": 7.7648, + "grad_norm": 2.8843488693237305, + "learning_rate": 2.5175322886843156e-06, + "loss": 1.2071, + "mean_token_accuracy": 0.6606879189610482, + "num_tokens": 18206187.0, + "step": 2430 + }, + { + "entropy": 1.7067583829164505, + "epoch": 7.7968, + "grad_norm": 2.5878074169158936, + "learning_rate": 2.4492846705576845e-06, + "loss": 1.2108, + "mean_token_accuracy": 0.662336565554142, + "num_tokens": 18280919.0, + "step": 2440 + }, + { + "entropy": 1.7000836163759232, + "epoch": 7.8288, + "grad_norm": 2.6365489959716797, + "learning_rate": 2.381845553191783e-06, + "loss": 1.2129, + "mean_token_accuracy": 0.661854301393032, + "num_tokens": 18356501.0, + "step": 2450 + }, + { + "entropy": 1.7049589693546294, + "epoch": 7.8608, + "grad_norm": 2.82125186920166, + "learning_rate": 2.315222157700797e-06, + "loss": 1.1956, + "mean_token_accuracy": 0.6644625499844551, + "num_tokens": 18429752.0, + "step": 2460 + }, + { + "entropy": 1.7245618909597398, + "epoch": 7.8928, + "grad_norm": 2.726285934448242, + "learning_rate": 2.2494216178546647e-06, + "loss": 1.1784, + "mean_token_accuracy": 0.6688286542892456, + "num_tokens": 18500518.0, + "step": 2470 + }, + { + "entropy": 1.710511189699173, + "epoch": 7.9248, + "grad_norm": 2.741217613220215, + "learning_rate": 2.184450979315177e-06, + "loss": 1.2731, + "mean_token_accuracy": 0.6543245255947113, + "num_tokens": 18579758.0, + "step": 2480 + }, + { + "entropy": 1.7591080367565155, + "epoch": 7.9568, + "grad_norm": 2.852081537246704, + "learning_rate": 2.1203171988815764e-06, + "loss": 1.3156, + "mean_token_accuracy": 0.6505647979676723, + 
"num_tokens": 18655339.0, + "step": 2490 + }, + { + "entropy": 1.7133348882198334, + "epoch": 7.9888, + "grad_norm": 2.7279410362243652, + "learning_rate": 2.057027143745646e-06, + "loss": 1.2112, + "mean_token_accuracy": 0.6649632036685944, + "num_tokens": 18728669.0, + "step": 2500 + } + ], + "logging_steps": 10, + "max_steps": 3130, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8225647308080415e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2504/training_args.bin b/checkpoint-2504/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ca9563f4064abfd2f36668559093a5f0763d7c85 --- /dev/null +++ b/checkpoint-2504/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88fbd415d023bc35b9e36c515e374642c961f2c3a428f0d0bba13a27d8c151a7 +size 6289 diff --git a/checkpoint-2817/README.md b/checkpoint-2817/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96b9f5618833a1728fbecbefb87f08b279b6b2ed --- /dev/null +++ b/checkpoint-2817/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- 
**Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/checkpoint-2817/adapter_config.json b/checkpoint-2817/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8cd4cdaf30ceb587991efae70006ef463605c378 --- /dev/null +++ b/checkpoint-2817/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "o_proj", + "k_proj", + "gate_proj", + "v_proj", + "up_proj" + ], + "target_parameters": null, 
+ "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2817/adapter_model.safetensors b/checkpoint-2817/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5993f268a9a817701c27eaa368b4b27ebfaf66a3 --- /dev/null +++ b/checkpoint-2817/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b062415a8259b60e0d3353df51f24ded45336234a992af393a1465aab075e7ea +size 335604696 diff --git a/checkpoint-2817/chat_template.jinja b/checkpoint-2817/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-2817/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. 
#} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-2817/optimizer.pt b/checkpoint-2817/optimizer.pt new file mode 100644 index 
0000000000000000000000000000000000000000..45a7fec7955dd7dae116fe62a8c75c420b970a3f --- /dev/null +++ b/checkpoint-2817/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74b15b38eaca078c7e4cdcb8d0ae07e396ed7f26be66ad345a3d10f264e619dd +size 671473443 diff --git a/checkpoint-2817/rng_state.pth b/checkpoint-2817/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e8059f43021a83cdec7e0469454da8df905f738a --- /dev/null +++ b/checkpoint-2817/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7210e748120614b866d9c52d75382e07c40588ec931def3893dbdb1a31868b18 +size 14645 diff --git a/checkpoint-2817/scheduler.pt b/checkpoint-2817/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..50534634e54cf509ead568ec2e093c475e00383d --- /dev/null +++ b/checkpoint-2817/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaee8643a0a0fe50d2c69a8ac91d8b342dd52ab100f90729053b3c39fa8854df +size 1465 diff --git a/checkpoint-2817/special_tokens_map.json b/checkpoint-2817/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..02ee80b6196926a5ad790a004d9efd6ab1ba6542 --- /dev/null +++ b/checkpoint-2817/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2817/tokenizer.json b/checkpoint-2817/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-2817/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 
diff --git a/checkpoint-2817/tokenizer_config.json b/checkpoint-2817/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8b0c7c141373ca36e5e819a28f60e146ccef652f --- /dev/null +++ b/checkpoint-2817/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": 
"<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": 
"<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": 
"<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": 
"<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-2817/trainer_state.json b/checkpoint-2817/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ac79652a44bdcfdd85e94bfc1df4505289897695 --- /dev/null +++ 
b/checkpoint-2817/trainer_state.json @@ -0,0 +1,2844 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 2817, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.098961293697357, + "epoch": 0.032, + "grad_norm": 1.891703486442566, + "learning_rate": 1.9148936170212767e-06, + "loss": 2.0828, + "mean_token_accuracy": 0.530680388212204, + "num_tokens": 72723.0, + "step": 10 + }, + { + "entropy": 2.119775766134262, + "epoch": 0.064, + "grad_norm": 1.2044862508773804, + "learning_rate": 4.042553191489362e-06, + "loss": 2.0093, + "mean_token_accuracy": 0.5355814293026924, + "num_tokens": 146392.0, + "step": 20 + }, + { + "entropy": 2.220579963922501, + "epoch": 0.096, + "grad_norm": 0.9982365369796753, + "learning_rate": 6.170212765957447e-06, + "loss": 1.8939, + "mean_token_accuracy": 0.5451944440603256, + "num_tokens": 223711.0, + "step": 30 + }, + { + "entropy": 2.382017892599106, + "epoch": 0.128, + "grad_norm": 0.7386544346809387, + "learning_rate": 8.297872340425532e-06, + "loss": 1.9066, + "mean_token_accuracy": 0.5411656655371189, + "num_tokens": 300889.0, + "step": 40 + }, + { + "entropy": 2.274736815690994, + "epoch": 0.16, + "grad_norm": 0.6412256956100464, + "learning_rate": 1.0425531914893619e-05, + "loss": 1.7387, + "mean_token_accuracy": 0.5679451540112496, + "num_tokens": 377362.0, + "step": 50 + }, + { + "entropy": 2.3663365960121157, + "epoch": 0.192, + "grad_norm": 0.6228290796279907, + "learning_rate": 1.2553191489361702e-05, + "loss": 1.7492, + "mean_token_accuracy": 0.5746532663702965, + "num_tokens": 449594.0, + "step": 60 + }, + { + "entropy": 2.315044218301773, + "epoch": 0.224, + "grad_norm": 0.6034156680107117, + "learning_rate": 1.4680851063829789e-05, + "loss": 1.7111, + "mean_token_accuracy": 0.5675176709890366, + "num_tokens": 523439.0, + "step": 70 + }, + { + 
"entropy": 2.288265961408615, + "epoch": 0.256, + "grad_norm": 0.45914268493652344, + "learning_rate": 1.6808510638297873e-05, + "loss": 1.6931, + "mean_token_accuracy": 0.5713589735329151, + "num_tokens": 599650.0, + "step": 80 + }, + { + "entropy": 2.2693382859230042, + "epoch": 0.288, + "grad_norm": 0.6197793483734131, + "learning_rate": 1.893617021276596e-05, + "loss": 1.6542, + "mean_token_accuracy": 0.578165066242218, + "num_tokens": 675377.0, + "step": 90 + }, + { + "entropy": 2.293796479701996, + "epoch": 0.32, + "grad_norm": 0.5502006411552429, + "learning_rate": 1.9999866154043656e-05, + "loss": 1.7108, + "mean_token_accuracy": 0.5681634023785591, + "num_tokens": 751838.0, + "step": 100 + }, + { + "entropy": 2.2658903509378434, + "epoch": 0.352, + "grad_norm": 0.5713317394256592, + "learning_rate": 1.9998795407890486e-05, + "loss": 1.6168, + "mean_token_accuracy": 0.5843982398509979, + "num_tokens": 825539.0, + "step": 110 + }, + { + "entropy": 2.270280033349991, + "epoch": 0.384, + "grad_norm": 0.5967482924461365, + "learning_rate": 1.999665403023542e-05, + "loss": 1.6194, + "mean_token_accuracy": 0.5839526921510696, + "num_tokens": 897258.0, + "step": 120 + }, + { + "entropy": 2.2349284648895265, + "epoch": 0.416, + "grad_norm": 0.4899630844593048, + "learning_rate": 1.9993442250368708e-05, + "loss": 1.6313, + "mean_token_accuracy": 0.5815729826688767, + "num_tokens": 973142.0, + "step": 130 + }, + { + "entropy": 2.245553806424141, + "epoch": 0.448, + "grad_norm": 0.6546034812927246, + "learning_rate": 1.9989160412195047e-05, + "loss": 1.6395, + "mean_token_accuracy": 0.5780692532658577, + "num_tokens": 1046762.0, + "step": 140 + }, + { + "entropy": 2.288555932044983, + "epoch": 0.48, + "grad_norm": 0.5528404116630554, + "learning_rate": 1.9983808974196752e-05, + "loss": 1.7118, + "mean_token_accuracy": 0.5686657652258873, + "num_tokens": 1125167.0, + "step": 150 + }, + { + "entropy": 2.232080355286598, + "epoch": 0.512, + "grad_norm": 
0.5887461304664612, + "learning_rate": 1.9977388509384656e-05, + "loss": 1.6339, + "mean_token_accuracy": 0.5838325396180153, + "num_tokens": 1199589.0, + "step": 160 + }, + { + "entropy": 2.2232475757598875, + "epoch": 0.544, + "grad_norm": 0.5764511823654175, + "learning_rate": 1.9969899705236763e-05, + "loss": 1.6173, + "mean_token_accuracy": 0.5848860442638397, + "num_tokens": 1276431.0, + "step": 170 + }, + { + "entropy": 2.244092071056366, + "epoch": 0.576, + "grad_norm": 0.6295827627182007, + "learning_rate": 1.9961343363624626e-05, + "loss": 1.6017, + "mean_token_accuracy": 0.5818701103329659, + "num_tokens": 1350012.0, + "step": 180 + }, + { + "entropy": 2.237305074930191, + "epoch": 0.608, + "grad_norm": 0.5939638018608093, + "learning_rate": 1.9951720400727495e-05, + "loss": 1.6704, + "mean_token_accuracy": 0.5779796853661537, + "num_tokens": 1423391.0, + "step": 190 + }, + { + "entropy": 2.211505854129791, + "epoch": 0.64, + "grad_norm": 0.6119778156280518, + "learning_rate": 1.9941031846934213e-05, + "loss": 1.6223, + "mean_token_accuracy": 0.5848233133554459, + "num_tokens": 1499124.0, + "step": 200 + }, + { + "entropy": 2.2195493161678312, + "epoch": 0.672, + "grad_norm": 0.6129831671714783, + "learning_rate": 1.9929278846732883e-05, + "loss": 1.5886, + "mean_token_accuracy": 0.5897421136498451, + "num_tokens": 1573541.0, + "step": 210 + }, + { + "entropy": 2.2096123576164244, + "epoch": 0.704, + "grad_norm": 0.6091306209564209, + "learning_rate": 1.9916462658588328e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5894487425684929, + "num_tokens": 1649546.0, + "step": 220 + }, + { + "entropy": 2.249841979146004, + "epoch": 0.736, + "grad_norm": 0.570816695690155, + "learning_rate": 1.9902584654807325e-05, + "loss": 1.5876, + "mean_token_accuracy": 0.5911228567361831, + "num_tokens": 1722199.0, + "step": 230 + }, + { + "entropy": 2.1915894985198974, + "epoch": 0.768, + "grad_norm": 0.5748864412307739, + "learning_rate": 1.988764632139168e-05, + 
"loss": 1.5963, + "mean_token_accuracy": 0.5891387596726417, + "num_tokens": 1797304.0, + "step": 240 + }, + { + "entropy": 2.2358563423156737, + "epoch": 0.8, + "grad_norm": 0.6511492729187012, + "learning_rate": 1.9871649257879115e-05, + "loss": 1.6453, + "mean_token_accuracy": 0.5792816638946533, + "num_tokens": 1870113.0, + "step": 250 + }, + { + "entropy": 2.2169984579086304, + "epoch": 0.832, + "grad_norm": 0.5317641496658325, + "learning_rate": 1.9854595177171968e-05, + "loss": 1.6594, + "mean_token_accuracy": 0.577045065164566, + "num_tokens": 1947405.0, + "step": 260 + }, + { + "entropy": 2.2434292674064635, + "epoch": 0.864, + "grad_norm": 0.5399971604347229, + "learning_rate": 1.9836485905353823e-05, + "loss": 1.7158, + "mean_token_accuracy": 0.5683416239917278, + "num_tokens": 2026284.0, + "step": 270 + }, + { + "entropy": 2.227828550338745, + "epoch": 0.896, + "grad_norm": 0.5378643870353699, + "learning_rate": 1.9817323381493933e-05, + "loss": 1.6714, + "mean_token_accuracy": 0.5818367518484593, + "num_tokens": 2103986.0, + "step": 280 + }, + { + "entropy": 2.2110894501209257, + "epoch": 0.928, + "grad_norm": 0.5195969343185425, + "learning_rate": 1.979710965743964e-05, + "loss": 1.6239, + "mean_token_accuracy": 0.5819958478212357, + "num_tokens": 2177010.0, + "step": 290 + }, + { + "entropy": 2.1666628658771514, + "epoch": 0.96, + "grad_norm": 0.5663164258003235, + "learning_rate": 1.977584689759664e-05, + "loss": 1.6024, + "mean_token_accuracy": 0.5876665830612182, + "num_tokens": 2251285.0, + "step": 300 + }, + { + "entropy": 2.214203083515167, + "epoch": 0.992, + "grad_norm": 0.6764860153198242, + "learning_rate": 1.9753537378697237e-05, + "loss": 1.6446, + "mean_token_accuracy": 0.5818003416061401, + "num_tokens": 2325752.0, + "step": 310 + }, + { + "entropy": 2.16783396821273, + "epoch": 1.0224, + "grad_norm": 0.5795008540153503, + "learning_rate": 1.9730183489556563e-05, + "loss": 1.594, + "mean_token_accuracy": 0.5867547392845154, + 
"num_tokens": 2396254.0, + "step": 320 + }, + { + "entropy": 2.172953352332115, + "epoch": 1.0544, + "grad_norm": 0.6686444282531738, + "learning_rate": 1.9705787730816776e-05, + "loss": 1.613, + "mean_token_accuracy": 0.5906373374164104, + "num_tokens": 2470123.0, + "step": 330 + }, + { + "entropy": 2.2217346757650374, + "epoch": 1.0864, + "grad_norm": 0.6389091610908508, + "learning_rate": 1.9680352714679324e-05, + "loss": 1.7053, + "mean_token_accuracy": 0.577599074691534, + "num_tokens": 2545749.0, + "step": 340 + }, + { + "entropy": 2.138428696990013, + "epoch": 1.1184, + "grad_norm": 0.7369883060455322, + "learning_rate": 1.9653881164625234e-05, + "loss": 1.5599, + "mean_token_accuracy": 0.5946489304304123, + "num_tokens": 2623270.0, + "step": 350 + }, + { + "entropy": 2.147254040837288, + "epoch": 1.1504, + "grad_norm": 0.6707085967063904, + "learning_rate": 1.9626375915123473e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5965728983283043, + "num_tokens": 2697616.0, + "step": 360 + }, + { + "entropy": 2.1412769109010696, + "epoch": 1.1824, + "grad_norm": 0.7201400995254517, + "learning_rate": 1.9597839911327475e-05, + "loss": 1.58, + "mean_token_accuracy": 0.5957784004509449, + "num_tokens": 2771426.0, + "step": 370 + }, + { + "entropy": 2.164059528708458, + "epoch": 1.2144, + "grad_norm": 0.7561144232749939, + "learning_rate": 1.9568276208759772e-05, + "loss": 1.5872, + "mean_token_accuracy": 0.5874110117554665, + "num_tokens": 2846711.0, + "step": 380 + }, + { + "entropy": 2.205427420139313, + "epoch": 1.2464, + "grad_norm": 0.691585898399353, + "learning_rate": 1.9537687972984804e-05, + "loss": 1.625, + "mean_token_accuracy": 0.5892911069095135, + "num_tokens": 2920916.0, + "step": 390 + }, + { + "entropy": 2.1242104679346085, + "epoch": 1.2784, + "grad_norm": 0.6999676823616028, + "learning_rate": 1.950607847926999e-05, + "loss": 1.5606, + "mean_token_accuracy": 0.5917269751429558, + "num_tokens": 2996056.0, + "step": 400 + }, + { + "entropy": 
2.114223065972328, + "epoch": 1.3104, + "grad_norm": 0.7616406679153442, + "learning_rate": 1.947345111223502e-05, + "loss": 1.5296, + "mean_token_accuracy": 0.5938275754451752, + "num_tokens": 3072912.0, + "step": 410 + }, + { + "entropy": 2.1418962299823763, + "epoch": 1.3424, + "grad_norm": 0.7253025770187378, + "learning_rate": 1.943980936548942e-05, + "loss": 1.575, + "mean_token_accuracy": 0.5945621818304062, + "num_tokens": 3148498.0, + "step": 420 + }, + { + "entropy": 2.109667718410492, + "epoch": 1.3744, + "grad_norm": 0.8988682627677917, + "learning_rate": 1.9405156841258498e-05, + "loss": 1.5796, + "mean_token_accuracy": 0.5901263400912284, + "num_tokens": 3224741.0, + "step": 430 + }, + { + "entropy": 2.179358023405075, + "epoch": 1.4064, + "grad_norm": 0.741558849811554, + "learning_rate": 1.936949724999762e-05, + "loss": 1.6507, + "mean_token_accuracy": 0.581992793083191, + "num_tokens": 3299366.0, + "step": 440 + }, + { + "entropy": 2.1574251472949983, + "epoch": 1.4384000000000001, + "grad_norm": 0.7538727521896362, + "learning_rate": 1.9332834409994906e-05, + "loss": 1.5771, + "mean_token_accuracy": 0.5888051658868789, + "num_tokens": 3374162.0, + "step": 450 + }, + { + "entropy": 2.1186763852834702, + "epoch": 1.4704, + "grad_norm": 0.7905173301696777, + "learning_rate": 1.929517224696239e-05, + "loss": 1.6138, + "mean_token_accuracy": 0.584889967739582, + "num_tokens": 3452582.0, + "step": 460 + }, + { + "entropy": 2.1135365635156633, + "epoch": 1.5024, + "grad_norm": 0.7416484951972961, + "learning_rate": 1.9256514793615674e-05, + "loss": 1.5623, + "mean_token_accuracy": 0.5928735345602035, + "num_tokens": 3527694.0, + "step": 470 + }, + { + "entropy": 2.146635016798973, + "epoch": 1.5344, + "grad_norm": 0.731999397277832, + "learning_rate": 1.9216866189242095e-05, + "loss": 1.5634, + "mean_token_accuracy": 0.5988615363836288, + "num_tokens": 3600277.0, + "step": 480 + }, + { + "entropy": 2.1472962319850923, + "epoch": 1.5664, + "grad_norm": 
0.7493702173233032, + "learning_rate": 1.9176230679257547e-05, + "loss": 1.5891, + "mean_token_accuracy": 0.5858126983046532, + "num_tokens": 3674781.0, + "step": 490 + }, + { + "entropy": 2.1530486762523653, + "epoch": 1.5984, + "grad_norm": 0.8006687164306641, + "learning_rate": 1.9134612614751865e-05, + "loss": 1.5674, + "mean_token_accuracy": 0.5904534175992012, + "num_tokens": 3748434.0, + "step": 500 + }, + { + "entropy": 2.169738906621933, + "epoch": 1.6303999999999998, + "grad_norm": 0.9293455481529236, + "learning_rate": 1.909201645202294e-05, + "loss": 1.6104, + "mean_token_accuracy": 0.5860036969184875, + "num_tokens": 3823982.0, + "step": 510 + }, + { + "entropy": 2.178475347161293, + "epoch": 1.6623999999999999, + "grad_norm": 0.7716575860977173, + "learning_rate": 1.904844675209956e-05, + "loss": 1.6432, + "mean_token_accuracy": 0.5838924221694469, + "num_tokens": 3900064.0, + "step": 520 + }, + { + "entropy": 2.1585603266954423, + "epoch": 1.6944, + "grad_norm": 0.8225084543228149, + "learning_rate": 1.9003908180253027e-05, + "loss": 1.5957, + "mean_token_accuracy": 0.5880116850137711, + "num_tokens": 3974029.0, + "step": 530 + }, + { + "entropy": 2.111869788169861, + "epoch": 1.7264, + "grad_norm": 0.7035638093948364, + "learning_rate": 1.8958405505497613e-05, + "loss": 1.579, + "mean_token_accuracy": 0.5890362292528153, + "num_tokens": 4049974.0, + "step": 540 + }, + { + "entropy": 2.144411253929138, + "epoch": 1.7584, + "grad_norm": 0.7046850919723511, + "learning_rate": 1.8911943600079934e-05, + "loss": 1.5926, + "mean_token_accuracy": 0.5874261602759361, + "num_tokens": 4125206.0, + "step": 550 + }, + { + "entropy": 2.1093025386333464, + "epoch": 1.7904, + "grad_norm": 0.807727575302124, + "learning_rate": 1.8864527438957223e-05, + "loss": 1.5367, + "mean_token_accuracy": 0.5988967984914779, + "num_tokens": 4199365.0, + "step": 560 + }, + { + "entropy": 2.097169244289398, + "epoch": 1.8224, + "grad_norm": 0.7856780886650085, + "learning_rate": 
1.881616209926465e-05, + "loss": 1.561, + "mean_token_accuracy": 0.5948230788111687, + "num_tokens": 4275889.0, + "step": 570 + }, + { + "entropy": 2.088553088903427, + "epoch": 1.8544, + "grad_norm": 0.8993458151817322, + "learning_rate": 1.876685275977167e-05, + "loss": 1.5557, + "mean_token_accuracy": 0.5941933646798134, + "num_tokens": 4350502.0, + "step": 580 + }, + { + "entropy": 2.132419008016586, + "epoch": 1.8864, + "grad_norm": 0.7769711017608643, + "learning_rate": 1.8716604700327516e-05, + "loss": 1.6105, + "mean_token_accuracy": 0.5815305605530738, + "num_tokens": 4426429.0, + "step": 590 + }, + { + "entropy": 2.1076891005039213, + "epoch": 1.9184, + "grad_norm": 0.9261249899864197, + "learning_rate": 1.866542330129583e-05, + "loss": 1.5307, + "mean_token_accuracy": 0.5964644759893417, + "num_tokens": 4500147.0, + "step": 600 + }, + { + "entropy": 2.114642283320427, + "epoch": 1.9504000000000001, + "grad_norm": 0.806425929069519, + "learning_rate": 1.8613314042978576e-05, + "loss": 1.5809, + "mean_token_accuracy": 0.5901800125837326, + "num_tokens": 4573438.0, + "step": 610 + }, + { + "entropy": 2.1167576968669892, + "epoch": 1.9824000000000002, + "grad_norm": 0.8191499710083008, + "learning_rate": 1.856028250502923e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5843381330370903, + "num_tokens": 4648156.0, + "step": 620 + }, + { + "entropy": 2.0566249019221257, + "epoch": 2.0128, + "grad_norm": 0.7406135201454163, + "learning_rate": 1.8506334365855315e-05, + "loss": 1.5187, + "mean_token_accuracy": 0.6027438483740154, + "num_tokens": 4719492.0, + "step": 630 + }, + { + "entropy": 2.0126763731241226, + "epoch": 2.0448, + "grad_norm": 0.8845784068107605, + "learning_rate": 1.8451475402010405e-05, + "loss": 1.4841, + "mean_token_accuracy": 0.6069207280874253, + "num_tokens": 4796271.0, + "step": 640 + }, + { + "entropy": 2.0516900300979612, + "epoch": 2.0768, + "grad_norm": 0.9927017092704773, + "learning_rate": 1.8395711487575564e-05, + "loss": 1.512, 
+ "mean_token_accuracy": 0.6031922519207, + "num_tokens": 4870202.0, + "step": 650 + }, + { + "entropy": 2.0824343889951704, + "epoch": 2.1088, + "grad_norm": 0.927236795425415, + "learning_rate": 1.8339048593530406e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5952437989413738, + "num_tokens": 4945568.0, + "step": 660 + }, + { + "entropy": 2.0304481953382494, + "epoch": 2.1408, + "grad_norm": 0.874019205570221, + "learning_rate": 1.8281492787113707e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.5992600306868553, + "num_tokens": 5020723.0, + "step": 670 + }, + { + "entropy": 2.0402441143989565, + "epoch": 2.1728, + "grad_norm": 0.8746942281723022, + "learning_rate": 1.8223050231173802e-05, + "loss": 1.5119, + "mean_token_accuracy": 0.5994458049535751, + "num_tokens": 5095780.0, + "step": 680 + }, + { + "entropy": 2.018441066145897, + "epoch": 2.2048, + "grad_norm": 1.063180923461914, + "learning_rate": 1.816372718350864e-05, + "loss": 1.4923, + "mean_token_accuracy": 0.6064845189452172, + "num_tokens": 5169733.0, + "step": 690 + }, + { + "entropy": 2.0563316702842713, + "epoch": 2.2368, + "grad_norm": 1.0281789302825928, + "learning_rate": 1.810352999619574e-05, + "loss": 1.5505, + "mean_token_accuracy": 0.602813882380724, + "num_tokens": 5246393.0, + "step": 700 + }, + { + "entropy": 2.0298285841941834, + "epoch": 2.2688, + "grad_norm": 1.070520281791687, + "learning_rate": 1.804246511491206e-05, + "loss": 1.5159, + "mean_token_accuracy": 0.6006126523017883, + "num_tokens": 5322244.0, + "step": 710 + }, + { + "entropy": 2.0195819228887557, + "epoch": 2.3008, + "grad_norm": 0.9672983884811401, + "learning_rate": 1.7980539078243783e-05, + "loss": 1.5166, + "mean_token_accuracy": 0.6054230839014053, + "num_tokens": 5399317.0, + "step": 720 + }, + { + "entropy": 2.045917159318924, + "epoch": 2.3327999999999998, + "grad_norm": 1.1228744983673096, + "learning_rate": 1.791775851698622e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.6015639662742615, + 
"num_tokens": 5473195.0, + "step": 730 + }, + { + "entropy": 2.0935415983200074, + "epoch": 2.3648, + "grad_norm": 1.149794578552246, + "learning_rate": 1.7854130153433785e-05, + "loss": 1.5583, + "mean_token_accuracy": 0.5921522840857506, + "num_tokens": 5548357.0, + "step": 740 + }, + { + "entropy": 2.044076007604599, + "epoch": 2.3968, + "grad_norm": 1.063625693321228, + "learning_rate": 1.7789660800660222e-05, + "loss": 1.5013, + "mean_token_accuracy": 0.5974589124321937, + "num_tokens": 5620915.0, + "step": 750 + }, + { + "entropy": 2.092478734254837, + "epoch": 2.4288, + "grad_norm": 1.1822012662887573, + "learning_rate": 1.7724357361789075e-05, + "loss": 1.5552, + "mean_token_accuracy": 0.5929681301116944, + "num_tokens": 5693406.0, + "step": 760 + }, + { + "entropy": 2.0430804908275606, + "epoch": 2.4608, + "grad_norm": 0.9921984076499939, + "learning_rate": 1.765822682925453e-05, + "loss": 1.4944, + "mean_token_accuracy": 0.6029774472117424, + "num_tokens": 5770143.0, + "step": 770 + }, + { + "entropy": 2.049290281534195, + "epoch": 2.4928, + "grad_norm": 1.0144131183624268, + "learning_rate": 1.7591276284052695e-05, + "loss": 1.5437, + "mean_token_accuracy": 0.5986773043870925, + "num_tokens": 5844022.0, + "step": 780 + }, + { + "entropy": 2.033898201584816, + "epoch": 2.5248, + "grad_norm": 1.1700315475463867, + "learning_rate": 1.7523512894983396e-05, + "loss": 1.5197, + "mean_token_accuracy": 0.5972102269530296, + "num_tokens": 5919099.0, + "step": 790 + }, + { + "entropy": 2.03344586789608, + "epoch": 2.5568, + "grad_norm": 1.0503427982330322, + "learning_rate": 1.745494391788257e-05, + "loss": 1.5456, + "mean_token_accuracy": 0.6011263683438302, + "num_tokens": 5997797.0, + "step": 800 + }, + { + "entropy": 2.0796399265527725, + "epoch": 2.5888, + "grad_norm": 1.0316176414489746, + "learning_rate": 1.7385576694845324e-05, + "loss": 1.608, + "mean_token_accuracy": 0.6024919278919697, + "num_tokens": 6075434.0, + "step": 810 + }, + { + "entropy": 
2.0257797837257385, + "epoch": 2.6208, + "grad_norm": 1.048309087753296, + "learning_rate": 1.7315418653439802e-05, + "loss": 1.4876, + "mean_token_accuracy": 0.6070949509739876, + "num_tokens": 6149232.0, + "step": 820 + }, + { + "entropy": 2.024846690893173, + "epoch": 2.6528, + "grad_norm": 1.186710000038147, + "learning_rate": 1.7244477305911845e-05, + "loss": 1.499, + "mean_token_accuracy": 0.6022308841347694, + "num_tokens": 6222180.0, + "step": 830 + }, + { + "entropy": 1.9938248336315154, + "epoch": 2.6848, + "grad_norm": 1.1091604232788086, + "learning_rate": 1.717276024838062e-05, + "loss": 1.4795, + "mean_token_accuracy": 0.6044012248516083, + "num_tokens": 6296902.0, + "step": 840 + }, + { + "entropy": 1.9988998174667358, + "epoch": 2.7168, + "grad_norm": 1.0359690189361572, + "learning_rate": 1.710027516002526e-05, + "loss": 1.5173, + "mean_token_accuracy": 0.6025070771574974, + "num_tokens": 6373494.0, + "step": 850 + }, + { + "entropy": 2.02343093752861, + "epoch": 2.7488, + "grad_norm": 1.1783568859100342, + "learning_rate": 1.7027029802262598e-05, + "loss": 1.5146, + "mean_token_accuracy": 0.6033479735255242, + "num_tokens": 6449229.0, + "step": 860 + }, + { + "entropy": 2.0429257422685625, + "epoch": 2.7808, + "grad_norm": 0.9909568428993225, + "learning_rate": 1.6953032017916115e-05, + "loss": 1.5473, + "mean_token_accuracy": 0.5932901218533516, + "num_tokens": 6525728.0, + "step": 870 + }, + { + "entropy": 2.0058376491069794, + "epoch": 2.8128, + "grad_norm": 1.0904430150985718, + "learning_rate": 1.687828973037615e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6120153024792672, + "num_tokens": 6599335.0, + "step": 880 + }, + { + "entropy": 2.005480855703354, + "epoch": 2.8448, + "grad_norm": 1.1638548374176025, + "learning_rate": 1.6802810942751514e-05, + "loss": 1.4887, + "mean_token_accuracy": 0.6060751393437386, + "num_tokens": 6672722.0, + "step": 890 + }, + { + "entropy": 2.0311779022216796, + "epoch": 2.8768000000000002, + "grad_norm": 
1.1404571533203125, + "learning_rate": 1.6726603737012527e-05, + "loss": 1.5238, + "mean_token_accuracy": 0.6015868663787842, + "num_tokens": 6748069.0, + "step": 900 + }, + { + "entropy": 2.0126856863498688, + "epoch": 2.9088000000000003, + "grad_norm": 1.0942543745040894, + "learning_rate": 1.6649676273125647e-05, + "loss": 1.4984, + "mean_token_accuracy": 0.6019899815320968, + "num_tokens": 6820935.0, + "step": 910 + }, + { + "entropy": 1.9961138010025024, + "epoch": 2.9408, + "grad_norm": 1.0870610475540161, + "learning_rate": 1.6572036788179728e-05, + "loss": 1.4962, + "mean_token_accuracy": 0.6030571654438972, + "num_tokens": 6896286.0, + "step": 920 + }, + { + "entropy": 2.035824549198151, + "epoch": 2.9728, + "grad_norm": 1.0822633504867554, + "learning_rate": 1.6493693595504022e-05, + "loss": 1.5354, + "mean_token_accuracy": 0.5986709952354431, + "num_tokens": 6971854.0, + "step": 930 + }, + { + "entropy": 2.0243908260997974, + "epoch": 3.0032, + "grad_norm": 1.0899602174758911, + "learning_rate": 1.6414655083778027e-05, + "loss": 1.5032, + "mean_token_accuracy": 0.5983682243447555, + "num_tokens": 7041122.0, + "step": 940 + }, + { + "entropy": 1.9538823068141937, + "epoch": 3.0352, + "grad_norm": 1.3042237758636475, + "learning_rate": 1.633492971613326e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6146818101406097, + "num_tokens": 7116032.0, + "step": 950 + }, + { + "entropy": 1.9383916020393372, + "epoch": 3.0672, + "grad_norm": 1.397078037261963, + "learning_rate": 1.6254526029247048e-05, + "loss": 1.4019, + "mean_token_accuracy": 0.6210932344198227, + "num_tokens": 7189009.0, + "step": 960 + }, + { + "entropy": 1.9460978150367736, + "epoch": 3.0992, + "grad_norm": 1.2756887674331665, + "learning_rate": 1.617345263242847e-05, + "loss": 1.4623, + "mean_token_accuracy": 0.6121616646647453, + "num_tokens": 7263068.0, + "step": 970 + }, + { + "entropy": 1.9156711965799331, + "epoch": 3.1312, + "grad_norm": 1.1937649250030518, + "learning_rate": 
1.609171820669649e-05, + "loss": 1.4301, + "mean_token_accuracy": 0.6136599197983742, + "num_tokens": 7338652.0, + "step": 980 + }, + { + "entropy": 1.9247682303190232, + "epoch": 3.1632, + "grad_norm": 1.3291118144989014, + "learning_rate": 1.6009331503850448e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6153608947992325, + "num_tokens": 7414529.0, + "step": 990 + }, + { + "entropy": 1.9066543668508529, + "epoch": 3.1952, + "grad_norm": 1.4356389045715332, + "learning_rate": 1.5926301345532925e-05, + "loss": 1.4413, + "mean_token_accuracy": 0.612147618830204, + "num_tokens": 7489106.0, + "step": 1000 + }, + { + "entropy": 1.895160937309265, + "epoch": 3.2272, + "grad_norm": 1.4345523118972778, + "learning_rate": 1.5842636622285187e-05, + "loss": 1.4207, + "mean_token_accuracy": 0.6138400137424469, + "num_tokens": 7564304.0, + "step": 1010 + }, + { + "entropy": 1.9546802312135696, + "epoch": 3.2592, + "grad_norm": 1.5242680311203003, + "learning_rate": 1.575834629259519e-05, + "loss": 1.4435, + "mean_token_accuracy": 0.6153354361653328, + "num_tokens": 7637409.0, + "step": 1020 + }, + { + "entropy": 1.912938117980957, + "epoch": 3.2912, + "grad_norm": 1.529726505279541, + "learning_rate": 1.5673439381938365e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.6191004544496537, + "num_tokens": 7711595.0, + "step": 1030 + }, + { + "entropy": 1.8989770442247391, + "epoch": 3.3232, + "grad_norm": 1.3367948532104492, + "learning_rate": 1.5587924981811196e-05, + "loss": 1.394, + "mean_token_accuracy": 0.624155393242836, + "num_tokens": 7785750.0, + "step": 1040 + }, + { + "entropy": 1.932333904504776, + "epoch": 3.3552, + "grad_norm": 1.4732215404510498, + "learning_rate": 1.5501812248757734e-05, + "loss": 1.3959, + "mean_token_accuracy": 0.6221834555268287, + "num_tokens": 7859036.0, + "step": 1050 + }, + { + "entropy": 1.9492982983589173, + "epoch": 3.3872, + "grad_norm": 1.4499313831329346, + "learning_rate": 1.5415110403389166e-05, + "loss": 1.4633, + 
"mean_token_accuracy": 0.6100246667861938, + "num_tokens": 7933165.0, + "step": 1060 + }, + { + "entropy": 1.9063653618097305, + "epoch": 3.4192, + "grad_norm": 1.4364317655563354, + "learning_rate": 1.5327828729396482e-05, + "loss": 1.4216, + "mean_token_accuracy": 0.6210869938135147, + "num_tokens": 8009376.0, + "step": 1070 + }, + { + "entropy": 1.9919361650943757, + "epoch": 3.4512, + "grad_norm": 1.5573089122772217, + "learning_rate": 1.5239976572556438e-05, + "loss": 1.5899, + "mean_token_accuracy": 0.5991086520254612, + "num_tokens": 8086825.0, + "step": 1080 + }, + { + "entropy": 1.922476476430893, + "epoch": 3.4832, + "grad_norm": 1.3339344263076782, + "learning_rate": 1.5151563339730849e-05, + "loss": 1.4162, + "mean_token_accuracy": 0.6182018965482712, + "num_tokens": 8161726.0, + "step": 1090 + }, + { + "entropy": 1.9143129527568816, + "epoch": 3.5152, + "grad_norm": 1.4425708055496216, + "learning_rate": 1.506259849785931e-05, + "loss": 1.4076, + "mean_token_accuracy": 0.6197950705885887, + "num_tokens": 8237046.0, + "step": 1100 + }, + { + "entropy": 1.9093267023563385, + "epoch": 3.5472, + "grad_norm": 1.5437992811203003, + "learning_rate": 1.497309157294555e-05, + "loss": 1.4339, + "mean_token_accuracy": 0.6177847877144813, + "num_tokens": 8315350.0, + "step": 1110 + }, + { + "entropy": 1.9121424347162246, + "epoch": 3.5792, + "grad_norm": 1.3761622905731201, + "learning_rate": 1.4883052149037395e-05, + "loss": 1.4175, + "mean_token_accuracy": 0.6171463698148727, + "num_tokens": 8390383.0, + "step": 1120 + }, + { + "entropy": 1.883551675081253, + "epoch": 3.6112, + "grad_norm": 1.36739182472229, + "learning_rate": 1.479248986720057e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6186214044690133, + "num_tokens": 8468414.0, + "step": 1130 + }, + { + "entropy": 1.988349151611328, + "epoch": 3.6432, + "grad_norm": 1.4566738605499268, + "learning_rate": 1.4701414424486353e-05, + "loss": 1.5366, + "mean_token_accuracy": 0.6110676810145378, + 
"num_tokens": 8541715.0, + "step": 1140 + }, + { + "entropy": 1.9057112097740174, + "epoch": 3.6752000000000002, + "grad_norm": 1.499079704284668, + "learning_rate": 1.4609835572893266e-05, + "loss": 1.3991, + "mean_token_accuracy": 0.6208718970417977, + "num_tokens": 8615694.0, + "step": 1150 + }, + { + "entropy": 1.9219326049089431, + "epoch": 3.7072000000000003, + "grad_norm": 1.3865621089935303, + "learning_rate": 1.4517763118322861e-05, + "loss": 1.431, + "mean_token_accuracy": 0.6143050745129586, + "num_tokens": 8692473.0, + "step": 1160 + }, + { + "entropy": 1.9036399960517882, + "epoch": 3.7392, + "grad_norm": 1.5362603664398193, + "learning_rate": 1.4425206919529747e-05, + "loss": 1.4156, + "mean_token_accuracy": 0.6199175521731377, + "num_tokens": 8767618.0, + "step": 1170 + }, + { + "entropy": 1.9499989479780198, + "epoch": 3.7712, + "grad_norm": 1.663404941558838, + "learning_rate": 1.4332176887065955e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.605186915397644, + "num_tokens": 8843100.0, + "step": 1180 + }, + { + "entropy": 1.9545456051826477, + "epoch": 3.8032, + "grad_norm": 1.6169345378875732, + "learning_rate": 1.4238682982219753e-05, + "loss": 1.4241, + "mean_token_accuracy": 0.6206902250647545, + "num_tokens": 8914604.0, + "step": 1190 + }, + { + "entropy": 1.9130536198616028, + "epoch": 3.8352, + "grad_norm": 1.472740650177002, + "learning_rate": 1.4144735215949028e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6139126420021057, + "num_tokens": 8989305.0, + "step": 1200 + }, + { + "entropy": 1.938635140657425, + "epoch": 3.8672, + "grad_norm": 1.4194226264953613, + "learning_rate": 1.4050343647809354e-05, + "loss": 1.4538, + "mean_token_accuracy": 0.6131341770291329, + "num_tokens": 9065589.0, + "step": 1210 + }, + { + "entropy": 1.9123675346374511, + "epoch": 3.8992, + "grad_norm": 1.5208053588867188, + "learning_rate": 1.3955518384876863e-05, + "loss": 1.4309, + "mean_token_accuracy": 0.6139545351266861, + "num_tokens": 9140150.0, + 
"step": 1220 + }, + { + "entropy": 1.9148090302944183, + "epoch": 3.9312, + "grad_norm": 1.6418218612670898, + "learning_rate": 1.3860269580666004e-05, + "loss": 1.4269, + "mean_token_accuracy": 0.6169310078024864, + "num_tokens": 9215796.0, + "step": 1230 + }, + { + "entropy": 1.9157740741968154, + "epoch": 3.9632, + "grad_norm": 1.4638084173202515, + "learning_rate": 1.3764607434042353e-05, + "loss": 1.4509, + "mean_token_accuracy": 0.6164968460798264, + "num_tokens": 9291010.0, + "step": 1240 + }, + { + "entropy": 1.9184510678052902, + "epoch": 3.9952, + "grad_norm": 1.5152716636657715, + "learning_rate": 1.3668542188130567e-05, + "loss": 1.4453, + "mean_token_accuracy": 0.6112410992383956, + "num_tokens": 9367186.0, + "step": 1250 + }, + { + "entropy": 1.9016748384425515, + "epoch": 4.0256, + "grad_norm": 1.490628719329834, + "learning_rate": 1.3572084129217566e-05, + "loss": 1.382, + "mean_token_accuracy": 0.623968276538347, + "num_tokens": 9439028.0, + "step": 1260 + }, + { + "entropy": 1.8026290327310561, + "epoch": 4.0576, + "grad_norm": 1.8969308137893677, + "learning_rate": 1.347524358565115e-05, + "loss": 1.3128, + "mean_token_accuracy": 0.6386646836996078, + "num_tokens": 9513855.0, + "step": 1270 + }, + { + "entropy": 1.8283424764871596, + "epoch": 4.0896, + "grad_norm": 1.5952194929122925, + "learning_rate": 1.3378030926734052e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6328515768051147, + "num_tokens": 9589080.0, + "step": 1280 + }, + { + "entropy": 1.8405955344438554, + "epoch": 4.1216, + "grad_norm": 1.6057584285736084, + "learning_rate": 1.3280456561613653e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6261398203670978, + "num_tokens": 9666808.0, + "step": 1290 + }, + { + "entropy": 1.8390818655490875, + "epoch": 4.1536, + "grad_norm": 1.8149824142456055, + "learning_rate": 1.3182530938167409e-05, + "loss": 1.3455, + "mean_token_accuracy": 0.6318597674369812, + "num_tokens": 9740267.0, + "step": 1300 + }, + { + "entropy": 
1.8203887075185776, + "epoch": 4.1856, + "grad_norm": 1.6102676391601562, + "learning_rate": 1.3084264541884118e-05, + "loss": 1.3255, + "mean_token_accuracy": 0.6316933467984199, + "num_tokens": 9816400.0, + "step": 1310 + }, + { + "entropy": 1.8592366576194763, + "epoch": 4.2176, + "grad_norm": 1.9501773118972778, + "learning_rate": 1.2985667894741197e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6301594719290733, + "num_tokens": 9889311.0, + "step": 1320 + }, + { + "entropy": 1.8420085966587068, + "epoch": 4.2496, + "grad_norm": 1.6526106595993042, + "learning_rate": 1.2886751554078015e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6302071824669838, + "num_tokens": 9965339.0, + "step": 1330 + }, + { + "entropy": 1.8313881188631058, + "epoch": 4.2816, + "grad_norm": 1.6269904375076294, + "learning_rate": 1.2787526111465453e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6328388035297394, + "num_tokens": 10039668.0, + "step": 1340 + }, + { + "entropy": 1.858151137828827, + "epoch": 4.3136, + "grad_norm": 1.9028024673461914, + "learning_rate": 1.2688002191571829e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6276688367128372, + "num_tokens": 10115387.0, + "step": 1350 + }, + { + "entropy": 1.8273844957351684, + "epoch": 4.3456, + "grad_norm": 1.7530555725097656, + "learning_rate": 1.2588190451025209e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.6345869660377502, + "num_tokens": 10191506.0, + "step": 1360 + }, + { + "entropy": 1.8732422679662704, + "epoch": 4.3776, + "grad_norm": 1.7372691631317139, + "learning_rate": 1.248810157727236e-05, + "loss": 1.4132, + "mean_token_accuracy": 0.6252246856689453, + "num_tokens": 10268756.0, + "step": 1370 + }, + { + "entropy": 1.8583054572343827, + "epoch": 4.4096, + "grad_norm": 1.6993470191955566, + "learning_rate": 1.2387746287434385e-05, + "loss": 1.3638, + "mean_token_accuracy": 0.6286717876791954, + "num_tokens": 10341779.0, + "step": 1380 + }, + { + "entropy": 1.8324467271566391, + "epoch": 4.4416, + 
"grad_norm": 1.7818169593811035, + "learning_rate": 1.2287135327159165e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6361263945698739, + "num_tokens": 10414642.0, + "step": 1390 + }, + { + "entropy": 1.8514392852783204, + "epoch": 4.4736, + "grad_norm": 1.7585517168045044, + "learning_rate": 1.2186279469470757e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.628801380097866, + "num_tokens": 10489517.0, + "step": 1400 + }, + { + "entropy": 1.8218136370182036, + "epoch": 4.5056, + "grad_norm": 1.9843116998672485, + "learning_rate": 1.2085189513615872e-05, + "loss": 1.3628, + "mean_token_accuracy": 0.6295172199606895, + "num_tokens": 10565467.0, + "step": 1410 + }, + { + "entropy": 1.8919565021991729, + "epoch": 4.5376, + "grad_norm": 1.9309132099151611, + "learning_rate": 1.1983876283907522e-05, + "loss": 1.4467, + "mean_token_accuracy": 0.6263746194541454, + "num_tokens": 10641283.0, + "step": 1420 + }, + { + "entropy": 1.8356508910655975, + "epoch": 4.5696, + "grad_norm": 1.7685068845748901, + "learning_rate": 1.1882350628566008e-05, + "loss": 1.3631, + "mean_token_accuracy": 0.624418406188488, + "num_tokens": 10716701.0, + "step": 1430 + }, + { + "entropy": 1.8288098931312562, + "epoch": 4.6016, + "grad_norm": 1.8276050090789795, + "learning_rate": 1.178062341855732e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6286922857165337, + "num_tokens": 10791427.0, + "step": 1440 + }, + { + "entropy": 1.8557640790939331, + "epoch": 4.6336, + "grad_norm": 1.7773240804672241, + "learning_rate": 1.1678705546429132e-05, + "loss": 1.383, + "mean_token_accuracy": 0.6216814562678337, + "num_tokens": 10866356.0, + "step": 1450 + }, + { + "entropy": 1.8483826667070389, + "epoch": 4.6655999999999995, + "grad_norm": 1.831931710243225, + "learning_rate": 1.1576607925144456e-05, + "loss": 1.355, + "mean_token_accuracy": 0.6278511002659798, + "num_tokens": 10940772.0, + "step": 1460 + }, + { + "entropy": 1.8824394553899766, + "epoch": 4.6975999999999996, + "grad_norm": 
1.9213542938232422, + "learning_rate": 1.1474341486913146e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.6256057649850846, + "num_tokens": 11016144.0, + "step": 1470 + }, + { + "entropy": 1.8709469974040984, + "epoch": 4.7296, + "grad_norm": 1.8768925666809082, + "learning_rate": 1.1371917182021297e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6317574754357338, + "num_tokens": 11089939.0, + "step": 1480 + }, + { + "entropy": 1.8673742085695266, + "epoch": 4.7616, + "grad_norm": 1.796302318572998, + "learning_rate": 1.1269345977658747e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6238353416323662, + "num_tokens": 11166087.0, + "step": 1490 + }, + { + "entropy": 1.8310889720916748, + "epoch": 4.7936, + "grad_norm": 1.8969939947128296, + "learning_rate": 1.1166638856744747e-05, + "loss": 1.3373, + "mean_token_accuracy": 0.6348015949130058, + "num_tokens": 11240732.0, + "step": 1500 + }, + { + "entropy": 1.8809226244688033, + "epoch": 4.8256, + "grad_norm": 1.642104983329773, + "learning_rate": 1.1063806816751957e-05, + "loss": 1.3792, + "mean_token_accuracy": 0.6250617265701294, + "num_tokens": 11316878.0, + "step": 1510 + }, + { + "entropy": 1.8715822875499726, + "epoch": 4.8576, + "grad_norm": 1.962158441543579, + "learning_rate": 1.0960860868528872e-05, + "loss": 1.3711, + "mean_token_accuracy": 0.6293752744793892, + "num_tokens": 11389042.0, + "step": 1520 + }, + { + "entropy": 1.8657191127538681, + "epoch": 4.8896, + "grad_norm": 1.9577444791793823, + "learning_rate": 1.0857812035120845e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6259156972169876, + "num_tokens": 11464215.0, + "step": 1530 + }, + { + "entropy": 1.8811951220035552, + "epoch": 4.9216, + "grad_norm": 2.015150785446167, + "learning_rate": 1.0754671350589752e-05, + "loss": 1.4155, + "mean_token_accuracy": 0.626779156178236, + "num_tokens": 11539122.0, + "step": 1540 + }, + { + "entropy": 1.863905319571495, + "epoch": 4.9536, + "grad_norm": 1.8474093675613403, + "learning_rate": 
1.065144985883253e-05, + "loss": 1.3409, + "mean_token_accuracy": 0.6319419264793396, + "num_tokens": 11613016.0, + "step": 1550 + }, + { + "entropy": 1.836970153450966, + "epoch": 4.9856, + "grad_norm": 1.8822177648544312, + "learning_rate": 1.054815861239864e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6292115703225136, + "num_tokens": 11688143.0, + "step": 1560 + }, + { + "entropy": 1.8377465950815302, + "epoch": 5.016, + "grad_norm": 1.8221346139907837, + "learning_rate": 1.0444808671306588e-05, + "loss": 1.3028, + "mean_token_accuracy": 0.6413120329380035, + "num_tokens": 11758768.0, + "step": 1570 + }, + { + "entropy": 1.7883025139570237, + "epoch": 5.048, + "grad_norm": 2.1959595680236816, + "learning_rate": 1.034141110185968e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6479741290211678, + "num_tokens": 11832210.0, + "step": 1580 + }, + { + "entropy": 1.7955584406852723, + "epoch": 5.08, + "grad_norm": 2.106905698776245, + "learning_rate": 1.0237976975461074e-05, + "loss": 1.2881, + "mean_token_accuracy": 0.6424632370471954, + "num_tokens": 11906115.0, + "step": 1590 + }, + { + "entropy": 1.7998322755098344, + "epoch": 5.112, + "grad_norm": 2.327314615249634, + "learning_rate": 1.0134517367428309e-05, + "loss": 1.3324, + "mean_token_accuracy": 0.6460248224437237, + "num_tokens": 11981328.0, + "step": 1600 + }, + { + "entropy": 1.7885828018188477, + "epoch": 5.144, + "grad_norm": 2.1001713275909424, + "learning_rate": 1.0031043355807386e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.63900758177042, + "num_tokens": 12056453.0, + "step": 1610 + }, + { + "entropy": 1.769435602426529, + "epoch": 5.176, + "grad_norm": 2.1210567951202393, + "learning_rate": 9.927566020186592e-06, + "loss": 1.2892, + "mean_token_accuracy": 0.6432970002293587, + "num_tokens": 12133433.0, + "step": 1620 + }, + { + "entropy": 1.7907766073942184, + "epoch": 5.208, + "grad_norm": 2.1842658519744873, + "learning_rate": 9.82409644051013e-06, + "loss": 1.2856, + 
"mean_token_accuracy": 0.6423615619540215, + "num_tokens": 12207150.0, + "step": 1630 + }, + { + "entropy": 1.7834827870130538, + "epoch": 5.24, + "grad_norm": 2.2503459453582764, + "learning_rate": 9.720645695891733e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6417693704366684, + "num_tokens": 12282584.0, + "step": 1640 + }, + { + "entropy": 1.763256973028183, + "epoch": 5.272, + "grad_norm": 1.9505388736724854, + "learning_rate": 9.617224863428346e-06, + "loss": 1.2951, + "mean_token_accuracy": 0.6429389595985413, + "num_tokens": 12359793.0, + "step": 1650 + }, + { + "entropy": 1.8142763644456863, + "epoch": 5.304, + "grad_norm": 1.9957698583602905, + "learning_rate": 9.513845017014048e-06, + "loss": 1.33, + "mean_token_accuracy": 0.6413653999567032, + "num_tokens": 12434251.0, + "step": 1660 + }, + { + "entropy": 1.797221601009369, + "epoch": 5.336, + "grad_norm": 2.5095462799072266, + "learning_rate": 9.410517226154276e-06, + "loss": 1.2978, + "mean_token_accuracy": 0.6389835774898529, + "num_tokens": 12508416.0, + "step": 1670 + }, + { + "entropy": 1.8157870292663574, + "epoch": 5.368, + "grad_norm": 2.1890602111816406, + "learning_rate": 9.30725255478058e-06, + "loss": 1.3183, + "mean_token_accuracy": 0.6448161751031876, + "num_tokens": 12582896.0, + "step": 1680 + }, + { + "entropy": 1.7990054041147232, + "epoch": 5.4, + "grad_norm": 2.3904025554656982, + "learning_rate": 9.204062060065915e-06, + "loss": 1.3318, + "mean_token_accuracy": 0.636146479845047, + "num_tokens": 12656802.0, + "step": 1690 + }, + { + "entropy": 1.8003453463315964, + "epoch": 5.432, + "grad_norm": 1.9204304218292236, + "learning_rate": 9.100956791240699e-06, + "loss": 1.3186, + "mean_token_accuracy": 0.6372130662202835, + "num_tokens": 12733283.0, + "step": 1700 + }, + { + "entropy": 1.8101116061210631, + "epoch": 5.464, + "grad_norm": 2.009500026702881, + "learning_rate": 8.997947788409696e-06, + "loss": 1.3247, + "mean_token_accuracy": 0.6406339526176452, + "num_tokens": 
12810272.0, + "step": 1710 + }, + { + "entropy": 1.764935952425003, + "epoch": 5.496, + "grad_norm": 2.2038798332214355, + "learning_rate": 8.89504608136989e-06, + "loss": 1.2792, + "mean_token_accuracy": 0.6445836886763573, + "num_tokens": 12885633.0, + "step": 1720 + }, + { + "entropy": 1.7950240582227708, + "epoch": 5.5280000000000005, + "grad_norm": 2.0160531997680664, + "learning_rate": 8.792262688429445e-06, + "loss": 1.2934, + "mean_token_accuracy": 0.6469692558050155, + "num_tokens": 12961131.0, + "step": 1730 + }, + { + "entropy": 1.7804677098989488, + "epoch": 5.5600000000000005, + "grad_norm": 2.1956582069396973, + "learning_rate": 8.689608615227933e-06, + "loss": 1.2969, + "mean_token_accuracy": 0.6438389763236045, + "num_tokens": 13036481.0, + "step": 1740 + }, + { + "entropy": 1.7932062089443206, + "epoch": 5.592, + "grad_norm": 2.2215394973754883, + "learning_rate": 8.587094853557877e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6460438340902328, + "num_tokens": 13111001.0, + "step": 1750 + }, + { + "entropy": 1.8026408910751344, + "epoch": 5.624, + "grad_norm": 2.3881425857543945, + "learning_rate": 8.484732380187785e-06, + "loss": 1.3049, + "mean_token_accuracy": 0.6414234206080437, + "num_tokens": 13186347.0, + "step": 1760 + }, + { + "entropy": 1.8440747499465941, + "epoch": 5.656, + "grad_norm": 2.2154159545898438, + "learning_rate": 8.382532155686825e-06, + "loss": 1.3797, + "mean_token_accuracy": 0.6365857936441899, + "num_tokens": 13261455.0, + "step": 1770 + }, + { + "entropy": 1.7975190997123718, + "epoch": 5.688, + "grad_norm": 2.1991233825683594, + "learning_rate": 8.280505123251183e-06, + "loss": 1.3191, + "mean_token_accuracy": 0.6393151715397835, + "num_tokens": 13338064.0, + "step": 1780 + }, + { + "entropy": 1.8396487146615983, + "epoch": 5.72, + "grad_norm": 2.0190858840942383, + "learning_rate": 8.178662207532343e-06, + "loss": 1.4052, + "mean_token_accuracy": 0.629064130038023, + "num_tokens": 13414806.0, + "step": 1790 + }, + 
{ + "entropy": 1.7840806126594544, + "epoch": 5.752, + "grad_norm": 2.3335204124450684, + "learning_rate": 8.077014313467274e-06, + "loss": 1.2701, + "mean_token_accuracy": 0.6464540064334869, + "num_tokens": 13489075.0, + "step": 1800 + }, + { + "entropy": 1.7840022534132003, + "epoch": 5.784, + "grad_norm": 2.2151618003845215, + "learning_rate": 7.975572325110819e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6358998969197274, + "num_tokens": 13565636.0, + "step": 1810 + }, + { + "entropy": 1.7677135676145554, + "epoch": 5.816, + "grad_norm": 2.11505389213562, + "learning_rate": 7.874347104470234e-06, + "loss": 1.2765, + "mean_token_accuracy": 0.6448719501495361, + "num_tokens": 13641112.0, + "step": 1820 + }, + { + "entropy": 1.7586044907569884, + "epoch": 5.848, + "grad_norm": 2.178250789642334, + "learning_rate": 7.773349490342157e-06, + "loss": 1.2846, + "mean_token_accuracy": 0.6450280979275703, + "num_tokens": 13715158.0, + "step": 1830 + }, + { + "entropy": 1.8128920108079911, + "epoch": 5.88, + "grad_norm": 2.2499353885650635, + "learning_rate": 7.672590297152013e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6343795835971833, + "num_tokens": 13791086.0, + "step": 1840 + }, + { + "entropy": 1.7873643577098846, + "epoch": 5.912, + "grad_norm": 2.1989104747772217, + "learning_rate": 7.572080313796064e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6413815975189209, + "num_tokens": 13865700.0, + "step": 1850 + }, + { + "entropy": 1.790488451719284, + "epoch": 5.944, + "grad_norm": 2.2605504989624023, + "learning_rate": 7.471830302486151e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6424889475107193, + "num_tokens": 13938540.0, + "step": 1860 + }, + { + "entropy": 1.7985246628522873, + "epoch": 5.976, + "grad_norm": 2.3228533267974854, + "learning_rate": 7.371850997597355e-06, + "loss": 1.2854, + "mean_token_accuracy": 0.6457341402769089, + "num_tokens": 14011087.0, + "step": 1870 + }, + { + "entropy": 1.7890221727521796, + "epoch": 6.0064, + 
"grad_norm": 2.192910671234131, + "learning_rate": 7.272153104518567e-06, + "loss": 1.2796, + "mean_token_accuracy": 0.6482133128141102, + "num_tokens": 14082075.0, + "step": 1880 + }, + { + "entropy": 1.7633086562156677, + "epoch": 6.0384, + "grad_norm": 2.368185043334961, + "learning_rate": 7.172747298506224e-06, + "loss": 1.2703, + "mean_token_accuracy": 0.6497290328145027, + "num_tokens": 14156298.0, + "step": 1890 + }, + { + "entropy": 1.750128635764122, + "epoch": 6.0704, + "grad_norm": 2.36487078666687, + "learning_rate": 7.073644223541227e-06, + "loss": 1.2521, + "mean_token_accuracy": 0.6534707516431808, + "num_tokens": 14232528.0, + "step": 1900 + }, + { + "entropy": 1.7416553497314453, + "epoch": 6.1024, + "grad_norm": 2.3927595615386963, + "learning_rate": 6.974854491189243e-06, + "loss": 1.217, + "mean_token_accuracy": 0.6588135868310928, + "num_tokens": 14307073.0, + "step": 1910 + }, + { + "entropy": 1.7359241485595702, + "epoch": 6.1344, + "grad_norm": 2.1107988357543945, + "learning_rate": 6.876388679464437e-06, + "loss": 1.2763, + "mean_token_accuracy": 0.6550255373120308, + "num_tokens": 14383819.0, + "step": 1920 + }, + { + "entropy": 1.7380403220653533, + "epoch": 6.1664, + "grad_norm": 2.4158387184143066, + "learning_rate": 6.7782573316968424e-06, + "loss": 1.2359, + "mean_token_accuracy": 0.656632873415947, + "num_tokens": 14460092.0, + "step": 1930 + }, + { + "entropy": 1.7227638810873032, + "epoch": 6.1984, + "grad_norm": 2.3467485904693604, + "learning_rate": 6.6804709554034075e-06, + "loss": 1.2311, + "mean_token_accuracy": 0.654091839492321, + "num_tokens": 14534160.0, + "step": 1940 + }, + { + "entropy": 1.7244948148727417, + "epoch": 6.2304, + "grad_norm": 2.760057210922241, + "learning_rate": 6.583040021162905e-06, + "loss": 1.2189, + "mean_token_accuracy": 0.6611428812146187, + "num_tokens": 14608592.0, + "step": 1950 + }, + { + "entropy": 1.7471657902002335, + "epoch": 6.2624, + "grad_norm": 2.3923745155334473, + "learning_rate": 
6.485974961494772e-06, + "loss": 1.2631, + "mean_token_accuracy": 0.6524021357297898, + "num_tokens": 14683538.0, + "step": 1960 + }, + { + "entropy": 1.7506494253873826, + "epoch": 6.2943999999999996, + "grad_norm": 2.4149715900421143, + "learning_rate": 6.389286169742048e-06, + "loss": 1.2468, + "mean_token_accuracy": 0.6567713841795921, + "num_tokens": 14755778.0, + "step": 1970 + }, + { + "entropy": 1.7104488879442215, + "epoch": 6.3264, + "grad_norm": 2.632632255554199, + "learning_rate": 6.292983998958478e-06, + "loss": 1.2267, + "mean_token_accuracy": 0.6561126798391342, + "num_tokens": 14831036.0, + "step": 1980 + }, + { + "entropy": 1.7591658294200898, + "epoch": 6.3584, + "grad_norm": 2.4012722969055176, + "learning_rate": 6.1970787607999815e-06, + "loss": 1.2693, + "mean_token_accuracy": 0.6490694522857666, + "num_tokens": 14906610.0, + "step": 1990 + }, + { + "entropy": 1.7317969173192977, + "epoch": 6.3904, + "grad_norm": 2.8288748264312744, + "learning_rate": 6.101580724420478e-06, + "loss": 1.235, + "mean_token_accuracy": 0.6564200609922409, + "num_tokens": 14980134.0, + "step": 2000 + }, + { + "entropy": 1.7617577254772185, + "epoch": 6.4224, + "grad_norm": 2.4008944034576416, + "learning_rate": 6.00650011537235e-06, + "loss": 1.2773, + "mean_token_accuracy": 0.6494350075721741, + "num_tokens": 15054969.0, + "step": 2010 + }, + { + "entropy": 1.749829688668251, + "epoch": 6.4544, + "grad_norm": 2.5665228366851807, + "learning_rate": 5.911847114511497e-06, + "loss": 1.2512, + "mean_token_accuracy": 0.6512764275074006, + "num_tokens": 15129421.0, + "step": 2020 + }, + { + "entropy": 1.7387797951698303, + "epoch": 6.4864, + "grad_norm": 2.6020922660827637, + "learning_rate": 5.817631856907233e-06, + "loss": 1.2477, + "mean_token_accuracy": 0.6530226185917855, + "num_tokens": 15203465.0, + "step": 2030 + }, + { + "entropy": 1.7363551884889603, + "epoch": 6.5184, + "grad_norm": 2.161478281021118, + "learning_rate": 5.723864430757047e-06, + "loss": 
1.2692, + "mean_token_accuracy": 0.6527093783020973, + "num_tokens": 15279761.0, + "step": 2040 + }, + { + "entropy": 1.7563295543193818, + "epoch": 6.5504, + "grad_norm": 2.5587289333343506, + "learning_rate": 5.630554876306407e-06, + "loss": 1.2211, + "mean_token_accuracy": 0.6574550330638885, + "num_tokens": 15351301.0, + "step": 2050 + }, + { + "entropy": 1.7521151036024094, + "epoch": 6.5824, + "grad_norm": 2.4042234420776367, + "learning_rate": 5.537713184773686e-06, + "loss": 1.271, + "mean_token_accuracy": 0.6478641331195831, + "num_tokens": 15427936.0, + "step": 2060 + }, + { + "entropy": 1.7270145863294601, + "epoch": 6.6144, + "grad_norm": 2.3122522830963135, + "learning_rate": 5.44534929728036e-06, + "loss": 1.224, + "mean_token_accuracy": 0.6566437393426895, + "num_tokens": 15502561.0, + "step": 2070 + }, + { + "entropy": 1.7568089962005615, + "epoch": 6.6464, + "grad_norm": 2.461474895477295, + "learning_rate": 5.353473103786511e-06, + "loss": 1.2864, + "mean_token_accuracy": 0.6471308276057244, + "num_tokens": 15578053.0, + "step": 2080 + }, + { + "entropy": 1.7190027862787247, + "epoch": 6.6784, + "grad_norm": 2.4412550926208496, + "learning_rate": 5.262094442031901e-06, + "loss": 1.2092, + "mean_token_accuracy": 0.6601713746786118, + "num_tokens": 15653342.0, + "step": 2090 + }, + { + "entropy": 1.717634916305542, + "epoch": 6.7104, + "grad_norm": 2.276007890701294, + "learning_rate": 5.171223096482533e-06, + "loss": 1.2271, + "mean_token_accuracy": 0.6595920532941818, + "num_tokens": 15730387.0, + "step": 2100 + }, + { + "entropy": 1.7230647921562194, + "epoch": 6.7424, + "grad_norm": 2.480471134185791, + "learning_rate": 5.080868797283019e-06, + "loss": 1.229, + "mean_token_accuracy": 0.6568982020020485, + "num_tokens": 15804405.0, + "step": 2110 + }, + { + "entropy": 1.7535502433776855, + "epoch": 6.7744, + "grad_norm": 2.448997974395752, + "learning_rate": 4.9910412192146795e-06, + "loss": 1.2584, + "mean_token_accuracy": 0.648795773088932, + 
"num_tokens": 15878537.0, + "step": 2120 + }, + { + "entropy": 1.786664029955864, + "epoch": 6.8064, + "grad_norm": 2.430039405822754, + "learning_rate": 4.901749980659617e-06, + "loss": 1.3358, + "mean_token_accuracy": 0.6427689291536808, + "num_tokens": 15952964.0, + "step": 2130 + }, + { + "entropy": 1.7594995677471161, + "epoch": 6.8384, + "grad_norm": 2.469172716140747, + "learning_rate": 4.813004642570822e-06, + "loss": 1.2844, + "mean_token_accuracy": 0.6534359715878963, + "num_tokens": 16028086.0, + "step": 2140 + }, + { + "entropy": 1.7347292125225067, + "epoch": 6.8704, + "grad_norm": 2.6162445545196533, + "learning_rate": 4.724814707448418e-06, + "loss": 1.2707, + "mean_token_accuracy": 0.6447671175003051, + "num_tokens": 16103263.0, + "step": 2150 + }, + { + "entropy": 1.7325938045978546, + "epoch": 6.9024, + "grad_norm": 2.416431188583374, + "learning_rate": 4.637189618322173e-06, + "loss": 1.2794, + "mean_token_accuracy": 0.6439008563756943, + "num_tokens": 16182360.0, + "step": 2160 + }, + { + "entropy": 1.7763712674379348, + "epoch": 6.9344, + "grad_norm": 2.3447437286376953, + "learning_rate": 4.550138757740381e-06, + "loss": 1.3043, + "mean_token_accuracy": 0.650251479446888, + "num_tokens": 16256272.0, + "step": 2170 + }, + { + "entropy": 1.739478302001953, + "epoch": 6.9664, + "grad_norm": 2.650451183319092, + "learning_rate": 4.463671446765206e-06, + "loss": 1.259, + "mean_token_accuracy": 0.6514677822589874, + "num_tokens": 16330984.0, + "step": 2180 + }, + { + "entropy": 1.7292406976222991, + "epoch": 6.9984, + "grad_norm": 2.5442306995391846, + "learning_rate": 4.377796943974641e-06, + "loss": 1.2554, + "mean_token_accuracy": 0.6506337657570839, + "num_tokens": 16406982.0, + "step": 2190 + }, + { + "entropy": 1.7582613606201976, + "epoch": 7.0288, + "grad_norm": 2.315408706665039, + "learning_rate": 4.292524444471097e-06, + "loss": 1.2766, + "mean_token_accuracy": 0.6549999933493765, + "num_tokens": 16479870.0, + "step": 2200 + }, + { + 
"entropy": 1.6856683611869812, + "epoch": 7.0608, + "grad_norm": 2.6291606426239014, + "learning_rate": 4.2078630788968775e-06, + "loss": 1.2051, + "mean_token_accuracy": 0.6612728327512741, + "num_tokens": 16557045.0, + "step": 2210 + }, + { + "entropy": 1.7316391229629517, + "epoch": 7.0928, + "grad_norm": 2.767998695373535, + "learning_rate": 4.123821912456457e-06, + "loss": 1.1981, + "mean_token_accuracy": 0.6648431628942489, + "num_tokens": 16629418.0, + "step": 2220 + }, + { + "entropy": 1.6988701403141022, + "epoch": 7.1248, + "grad_norm": 2.752492904663086, + "learning_rate": 4.040409943945856e-06, + "loss": 1.2277, + "mean_token_accuracy": 0.6629953101277352, + "num_tokens": 16706000.0, + "step": 2230 + }, + { + "entropy": 1.6829528212547302, + "epoch": 7.1568, + "grad_norm": 2.5362343788146973, + "learning_rate": 3.957636104789056e-06, + "loss": 1.1526, + "mean_token_accuracy": 0.6730572417378425, + "num_tokens": 16781728.0, + "step": 2240 + }, + { + "entropy": 1.7018825829029083, + "epoch": 7.1888, + "grad_norm": 2.772650718688965, + "learning_rate": 3.875509258081671e-06, + "loss": 1.1978, + "mean_token_accuracy": 0.6671716704964638, + "num_tokens": 16856290.0, + "step": 2250 + }, + { + "entropy": 1.7013772219419478, + "epoch": 7.2208, + "grad_norm": 2.548017740249634, + "learning_rate": 3.794038197641924e-06, + "loss": 1.2166, + "mean_token_accuracy": 0.6607363104820252, + "num_tokens": 16933284.0, + "step": 2260 + }, + { + "entropy": 1.7119423121213913, + "epoch": 7.2528, + "grad_norm": 3.085073947906494, + "learning_rate": 3.713231647069031e-06, + "loss": 1.1811, + "mean_token_accuracy": 0.6690418004989624, + "num_tokens": 17006677.0, + "step": 2270 + }, + { + "entropy": 1.6958551973104476, + "epoch": 7.2848, + "grad_norm": 2.9240498542785645, + "learning_rate": 3.633098258809119e-06, + "loss": 1.1998, + "mean_token_accuracy": 0.6630783304572105, + "num_tokens": 17081748.0, + "step": 2280 + }, + { + "entropy": 1.7125141024589539, + "epoch": 7.3168, + 
"grad_norm": 2.5372555255889893, + "learning_rate": 3.5536466132287562e-06, + "loss": 1.1941, + "mean_token_accuracy": 0.6668895781040192, + "num_tokens": 17155159.0, + "step": 2290 + }, + { + "entropy": 1.699021890759468, + "epoch": 7.3488, + "grad_norm": 2.555891752243042, + "learning_rate": 3.4748852176961912e-06, + "loss": 1.1871, + "mean_token_accuracy": 0.6658635303378105, + "num_tokens": 17231062.0, + "step": 2300 + }, + { + "entropy": 1.7193275570869446, + "epoch": 7.3808, + "grad_norm": 2.6699490547180176, + "learning_rate": 3.3968225056704427e-06, + "loss": 1.2396, + "mean_token_accuracy": 0.6561257526278496, + "num_tokens": 17306280.0, + "step": 2310 + }, + { + "entropy": 1.714812269806862, + "epoch": 7.4128, + "grad_norm": 2.8019561767578125, + "learning_rate": 3.319466835798235e-06, + "loss": 1.2006, + "mean_token_accuracy": 0.6625583916902542, + "num_tokens": 17379705.0, + "step": 2320 + }, + { + "entropy": 1.711946851015091, + "epoch": 7.4448, + "grad_norm": 2.758375644683838, + "learning_rate": 3.2428264910190398e-06, + "loss": 1.2234, + "mean_token_accuracy": 0.6590609803795815, + "num_tokens": 17453837.0, + "step": 2330 + }, + { + "entropy": 1.7215852111577987, + "epoch": 7.4768, + "grad_norm": 2.609473466873169, + "learning_rate": 3.166909677678116e-06, + "loss": 1.2182, + "mean_token_accuracy": 0.6575865730643272, + "num_tokens": 17528089.0, + "step": 2340 + }, + { + "entropy": 1.6932504892349243, + "epoch": 7.5088, + "grad_norm": 2.3272244930267334, + "learning_rate": 3.091724524647861e-06, + "loss": 1.1907, + "mean_token_accuracy": 0.665678508579731, + "num_tokens": 17603632.0, + "step": 2350 + }, + { + "entropy": 1.6796110332012177, + "epoch": 7.5408, + "grad_norm": 2.619145631790161, + "learning_rate": 3.0172790824573627e-06, + "loss": 1.1693, + "mean_token_accuracy": 0.6696520581841469, + "num_tokens": 17678354.0, + "step": 2360 + }, + { + "entropy": 1.7049964010715484, + "epoch": 7.5728, + "grad_norm": 2.4621567726135254, + 
"learning_rate": 2.943581322430399e-06, + "loss": 1.2376, + "mean_token_accuracy": 0.6554866015911103, + "num_tokens": 17755529.0, + "step": 2370 + }, + { + "entropy": 1.734333610534668, + "epoch": 7.6048, + "grad_norm": 2.7798454761505127, + "learning_rate": 2.8706391358318942e-06, + "loss": 1.2638, + "mean_token_accuracy": 0.6574688166379928, + "num_tokens": 17829887.0, + "step": 2380 + }, + { + "entropy": 1.7209048956632613, + "epoch": 7.6368, + "grad_norm": 2.579355239868164, + "learning_rate": 2.798460333022959e-06, + "loss": 1.2515, + "mean_token_accuracy": 0.6567336574196816, + "num_tokens": 17905006.0, + "step": 2390 + }, + { + "entropy": 1.7461798429489135, + "epoch": 7.6688, + "grad_norm": 2.3670754432678223, + "learning_rate": 2.72705264262458e-06, + "loss": 1.2845, + "mean_token_accuracy": 0.6506199359893798, + "num_tokens": 17980732.0, + "step": 2400 + }, + { + "entropy": 1.6880094558000565, + "epoch": 7.7008, + "grad_norm": 2.6313459873199463, + "learning_rate": 2.6564237106900815e-06, + "loss": 1.188, + "mean_token_accuracy": 0.6656670674681664, + "num_tokens": 18055716.0, + "step": 2410 + }, + { + "entropy": 1.695495843887329, + "epoch": 7.7328, + "grad_norm": 2.5075557231903076, + "learning_rate": 2.586581099886396e-06, + "loss": 1.2249, + "mean_token_accuracy": 0.6589935123920441, + "num_tokens": 18132898.0, + "step": 2420 + }, + { + "entropy": 1.7270244836807251, + "epoch": 7.7648, + "grad_norm": 2.8843488693237305, + "learning_rate": 2.5175322886843156e-06, + "loss": 1.2071, + "mean_token_accuracy": 0.6606879189610482, + "num_tokens": 18206187.0, + "step": 2430 + }, + { + "entropy": 1.7067583829164505, + "epoch": 7.7968, + "grad_norm": 2.5878074169158936, + "learning_rate": 2.4492846705576845e-06, + "loss": 1.2108, + "mean_token_accuracy": 0.662336565554142, + "num_tokens": 18280919.0, + "step": 2440 + }, + { + "entropy": 1.7000836163759232, + "epoch": 7.8288, + "grad_norm": 2.6365489959716797, + "learning_rate": 2.381845553191783e-06, + "loss": 
1.2129, + "mean_token_accuracy": 0.661854301393032, + "num_tokens": 18356501.0, + "step": 2450 + }, + { + "entropy": 1.7049589693546294, + "epoch": 7.8608, + "grad_norm": 2.82125186920166, + "learning_rate": 2.315222157700797e-06, + "loss": 1.1956, + "mean_token_accuracy": 0.6644625499844551, + "num_tokens": 18429752.0, + "step": 2460 + }, + { + "entropy": 1.7245618909597398, + "epoch": 7.8928, + "grad_norm": 2.726285934448242, + "learning_rate": 2.2494216178546647e-06, + "loss": 1.1784, + "mean_token_accuracy": 0.6688286542892456, + "num_tokens": 18500518.0, + "step": 2470 + }, + { + "entropy": 1.710511189699173, + "epoch": 7.9248, + "grad_norm": 2.741217613220215, + "learning_rate": 2.184450979315177e-06, + "loss": 1.2731, + "mean_token_accuracy": 0.6543245255947113, + "num_tokens": 18579758.0, + "step": 2480 + }, + { + "entropy": 1.7591080367565155, + "epoch": 7.9568, + "grad_norm": 2.852081537246704, + "learning_rate": 2.1203171988815764e-06, + "loss": 1.3156, + "mean_token_accuracy": 0.6505647979676723, + "num_tokens": 18655339.0, + "step": 2490 + }, + { + "entropy": 1.7133348882198334, + "epoch": 7.9888, + "grad_norm": 2.7279410362243652, + "learning_rate": 2.057027143745646e-06, + "loss": 1.2112, + "mean_token_accuracy": 0.6649632036685944, + "num_tokens": 18728669.0, + "step": 2500 + }, + { + "entropy": 1.7192017655623586, + "epoch": 8.0192, + "grad_norm": 2.6821248531341553, + "learning_rate": 1.994587590756397e-06, + "loss": 1.2187, + "mean_token_accuracy": 0.6636982513101477, + "num_tokens": 18799987.0, + "step": 2510 + }, + { + "entropy": 1.6869423031806945, + "epoch": 8.0512, + "grad_norm": 2.9373373985290527, + "learning_rate": 1.9330052256944355e-06, + "loss": 1.1855, + "mean_token_accuracy": 0.667529807984829, + "num_tokens": 18876438.0, + "step": 2520 + }, + { + "entropy": 1.6881215393543243, + "epoch": 8.0832, + "grad_norm": 2.6245837211608887, + "learning_rate": 1.872286642556066e-06, + "loss": 1.2082, + "mean_token_accuracy": 0.6658473521471023, 
+ "num_tokens": 18951429.0, + "step": 2530 + }, + { + "entropy": 1.7176503151655198, + "epoch": 8.1152, + "grad_norm": 2.9030368328094482, + "learning_rate": 1.8124383428472402e-06, + "loss": 1.2514, + "mean_token_accuracy": 0.6588339149951935, + "num_tokens": 19026806.0, + "step": 2540 + }, + { + "entropy": 1.727318498492241, + "epoch": 8.1472, + "grad_norm": 2.9318246841430664, + "learning_rate": 1.7534667348874068e-06, + "loss": 1.2355, + "mean_token_accuracy": 0.6663900375366211, + "num_tokens": 19100428.0, + "step": 2550 + }, + { + "entropy": 1.6834076642990112, + "epoch": 8.1792, + "grad_norm": 2.59753680229187, + "learning_rate": 1.6953781331233078e-06, + "loss": 1.1813, + "mean_token_accuracy": 0.6736618399620056, + "num_tokens": 19177248.0, + "step": 2560 + }, + { + "entropy": 1.6987117886543275, + "epoch": 8.2112, + "grad_norm": 2.7193281650543213, + "learning_rate": 1.638178757452894e-06, + "loss": 1.2408, + "mean_token_accuracy": 0.6610291600227356, + "num_tokens": 19252481.0, + "step": 2570 + }, + { + "entropy": 1.6835868597030639, + "epoch": 8.2432, + "grad_norm": 2.6238672733306885, + "learning_rate": 1.5818747325592765e-06, + "loss": 1.2186, + "mean_token_accuracy": 0.6627017930150032, + "num_tokens": 19329624.0, + "step": 2580 + }, + { + "entropy": 1.689078041911125, + "epoch": 8.2752, + "grad_norm": 2.643185615539551, + "learning_rate": 1.5264720872549622e-06, + "loss": 1.185, + "mean_token_accuracy": 0.6670305237174035, + "num_tokens": 19404703.0, + "step": 2590 + }, + { + "entropy": 1.6852527409791946, + "epoch": 8.3072, + "grad_norm": 2.3309333324432373, + "learning_rate": 1.471976753836285e-06, + "loss": 1.1771, + "mean_token_accuracy": 0.6721793606877327, + "num_tokens": 19479389.0, + "step": 2600 + }, + { + "entropy": 1.6828288197517396, + "epoch": 8.3392, + "grad_norm": 3.0214831829071045, + "learning_rate": 1.418394567448207e-06, + "loss": 1.1345, + "mean_token_accuracy": 0.6779033228754997, + "num_tokens": 19551761.0, + "step": 2610 + }, 
+ { + "entropy": 1.679307323694229, + "epoch": 8.3712, + "grad_norm": 2.65228009223938, + "learning_rate": 1.3657312654595168e-06, + "loss": 1.1723, + "mean_token_accuracy": 0.6679997086524964, + "num_tokens": 19625882.0, + "step": 2620 + }, + { + "entropy": 1.6989664137363434, + "epoch": 8.4032, + "grad_norm": 2.541346788406372, + "learning_rate": 1.31399248684849e-06, + "loss": 1.197, + "mean_token_accuracy": 0.6655030563473702, + "num_tokens": 19699807.0, + "step": 2630 + }, + { + "entropy": 1.7121862709522246, + "epoch": 8.4352, + "grad_norm": 2.8018271923065186, + "learning_rate": 1.2631837715990957e-06, + "loss": 1.1807, + "mean_token_accuracy": 0.6682800635695457, + "num_tokens": 19772329.0, + "step": 2640 + }, + { + "entropy": 1.6779522567987442, + "epoch": 8.4672, + "grad_norm": 2.786879777908325, + "learning_rate": 1.213310560107791e-06, + "loss": 1.172, + "mean_token_accuracy": 0.6687621667981147, + "num_tokens": 19847678.0, + "step": 2650 + }, + { + "entropy": 1.6814669162034988, + "epoch": 8.4992, + "grad_norm": 2.6873278617858887, + "learning_rate": 1.1643781926009846e-06, + "loss": 1.1662, + "mean_token_accuracy": 0.6698434934020042, + "num_tokens": 19922014.0, + "step": 2660 + }, + { + "entropy": 1.6885319381952286, + "epoch": 8.5312, + "grad_norm": 2.8372082710266113, + "learning_rate": 1.116391908563239e-06, + "loss": 1.1679, + "mean_token_accuracy": 0.6713936537504196, + "num_tokens": 19995173.0, + "step": 2670 + }, + { + "entropy": 1.6706371814012528, + "epoch": 8.5632, + "grad_norm": 2.465330123901367, + "learning_rate": 1.0693568461762238e-06, + "loss": 1.1611, + "mean_token_accuracy": 0.6726542502641678, + "num_tokens": 20071888.0, + "step": 2680 + }, + { + "entropy": 1.6876189917325974, + "epoch": 8.5952, + "grad_norm": 2.66048264503479, + "learning_rate": 1.023278041768565e-06, + "loss": 1.1801, + "mean_token_accuracy": 0.6687627866864204, + "num_tokens": 20147771.0, + "step": 2690 + }, + { + "entropy": 1.6499578058719635, + "epoch": 
8.6272, + "grad_norm": 2.6129636764526367, + "learning_rate": 9.781604292765524e-07, + "loss": 1.1687, + "mean_token_accuracy": 0.6690827563405037, + "num_tokens": 20225986.0, + "step": 2700 + }, + { + "entropy": 1.7168281257152558, + "epoch": 8.6592, + "grad_norm": 2.7212672233581543, + "learning_rate": 9.34008839715852e-07, + "loss": 1.2203, + "mean_token_accuracy": 0.6617053344845771, + "num_tokens": 20299960.0, + "step": 2710 + }, + { + "entropy": 1.6980592608451843, + "epoch": 8.6912, + "grad_norm": 2.9652392864227295, + "learning_rate": 8.90828000664209e-07, + "loss": 1.2021, + "mean_token_accuracy": 0.6637651309370994, + "num_tokens": 20375041.0, + "step": 2720 + }, + { + "entropy": 1.6723056137561798, + "epoch": 8.7232, + "grad_norm": 2.5188965797424316, + "learning_rate": 8.486225357552369e-07, + "loss": 1.1326, + "mean_token_accuracy": 0.6736432477831841, + "num_tokens": 20448565.0, + "step": 2730 + }, + { + "entropy": 1.6988836616277694, + "epoch": 8.7552, + "grad_norm": 3.4039053916931152, + "learning_rate": 8.073969641833446e-07, + "loss": 1.2387, + "mean_token_accuracy": 0.6662507936358452, + "num_tokens": 20524681.0, + "step": 2740 + }, + { + "entropy": 1.679963505268097, + "epoch": 8.7872, + "grad_norm": 2.585479497909546, + "learning_rate": 7.671557002198316e-07, + "loss": 1.1417, + "mean_token_accuracy": 0.6760099261999131, + "num_tokens": 20598013.0, + "step": 2750 + }, + { + "entropy": 1.6847810536623, + "epoch": 8.8192, + "grad_norm": 2.4690332412719727, + "learning_rate": 7.279030527402297e-07, + "loss": 1.1946, + "mean_token_accuracy": 0.6612684234976769, + "num_tokens": 20672943.0, + "step": 2760 + }, + { + "entropy": 1.6944990634918213, + "epoch": 8.8512, + "grad_norm": 2.9186036586761475, + "learning_rate": 6.896432247629237e-07, + "loss": 1.1897, + "mean_token_accuracy": 0.6619520500302315, + "num_tokens": 20748074.0, + "step": 2770 + }, + { + "entropy": 1.6818810850381851, + "epoch": 8.8832, + "grad_norm": 2.770056962966919, + 
"learning_rate": 6.52380312999108e-07, + "loss": 1.1955, + "mean_token_accuracy": 0.6678687691688537, + "num_tokens": 20825520.0, + "step": 2780 + }, + { + "entropy": 1.7140829920768739, + "epoch": 8.9152, + "grad_norm": 2.797840118408203, + "learning_rate": 6.161183074141319e-07, + "loss": 1.249, + "mean_token_accuracy": 0.6633088305592537, + "num_tokens": 20899714.0, + "step": 2790 + }, + { + "entropy": 1.6798905462026597, + "epoch": 8.9472, + "grad_norm": 2.660682201385498, + "learning_rate": 5.808610908002599e-07, + "loss": 1.1739, + "mean_token_accuracy": 0.6718563959002495, + "num_tokens": 20975614.0, + "step": 2800 + }, + { + "entropy": 1.6790486425161362, + "epoch": 8.9792, + "grad_norm": 2.887852191925049, + "learning_rate": 5.466124383609317e-07, + "loss": 1.2012, + "mean_token_accuracy": 0.6658951610326767, + "num_tokens": 21052188.0, + "step": 2810 + } + ], + "logging_steps": 10, + "max_steps": 3130, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.050060763900805e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2817/training_args.bin b/checkpoint-2817/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ca9563f4064abfd2f36668559093a5f0763d7c85 --- /dev/null +++ b/checkpoint-2817/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88fbd415d023bc35b9e36c515e374642c961f2c3a428f0d0bba13a27d8c151a7 +size 6289 diff --git a/checkpoint-3130/README.md b/checkpoint-3130/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96b9f5618833a1728fbecbefb87f08b279b6b2ed --- /dev/null +++ b/checkpoint-3130/README.md @@ -0,0 +1,209 @@ +--- +base_model: 
meta-llama/Llama-3.1-8B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/checkpoint-3130/adapter_config.json b/checkpoint-3130/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..8cd4cdaf30ceb587991efae70006ef463605c378 --- /dev/null +++ b/checkpoint-3130/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "q_proj", + "o_proj", + "k_proj", + "gate_proj", + "v_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3130/adapter_model.safetensors b/checkpoint-3130/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..859e9fc4591cabb8a4d2c0188e37fb97f6082fda --- /dev/null +++ b/checkpoint-3130/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c12804c41316890b8e14ea34914bc845a667e4232adadde80c1ee814cc01eeb +size 335604696 diff --git a/checkpoint-3130/chat_template.jinja b/checkpoint-3130/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/checkpoint-3130/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true 
%} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} 
+ {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/checkpoint-3130/optimizer.pt b/checkpoint-3130/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..44ab8aebd009fdee5b39a4ee5f0ff9a2ebb2705c --- /dev/null +++ b/checkpoint-3130/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a72bcc765fad8ad7f91467fc79d958362bfc93aedec5167fb74c04212c399f5 +size 671473443 diff --git a/checkpoint-3130/rng_state.pth b/checkpoint-3130/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..44f96243d17946b0455f408cfd22df34945a163c --- /dev/null +++ b/checkpoint-3130/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1962cf68222666d68cf7a1b9df919efed7fcb764622130ac9e2f4f1e428ad257 +size 14645 diff --git a/checkpoint-3130/scheduler.pt b/checkpoint-3130/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..295dfcedf4c892fea17fdfc377a130b2d29a35ae --- /dev/null +++ b/checkpoint-3130/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:750cd4a85782ec06a4add241453c8d49668d9daf764c30b0837a87518192a114 +size 1465 diff --git 
a/checkpoint-3130/special_tokens_map.json b/checkpoint-3130/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..02ee80b6196926a5ad790a004d9efd6ab1ba6542 --- /dev/null +++ b/checkpoint-3130/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-3130/tokenizer.json b/checkpoint-3130/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-3130/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-3130/tokenizer_config.json b/checkpoint-3130/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8b0c7c141373ca36e5e819a28f60e146ccef652f --- /dev/null +++ b/checkpoint-3130/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": 
"<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": 
"<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": 
"<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-3130/trainer_state.json b/checkpoint-3130/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f1be704f1a0aca9e72de1a41dcc95ad1baf3be88 --- /dev/null +++ b/checkpoint-3130/trainer_state.json @@ -0,0 +1,3164 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.098961293697357, + "epoch": 0.032, + "grad_norm": 1.891703486442566, + "learning_rate": 1.9148936170212767e-06, + "loss": 2.0828, + "mean_token_accuracy": 0.530680388212204, + "num_tokens": 72723.0, + "step": 10 + }, + { + "entropy": 2.119775766134262, + "epoch": 0.064, + "grad_norm": 1.2044862508773804, + "learning_rate": 4.042553191489362e-06, + "loss": 2.0093, + "mean_token_accuracy": 0.5355814293026924, + "num_tokens": 146392.0, + "step": 20 + }, + { + "entropy": 2.220579963922501, + "epoch": 0.096, + "grad_norm": 0.9982365369796753, + "learning_rate": 6.170212765957447e-06, + "loss": 1.8939, + "mean_token_accuracy": 0.5451944440603256, + "num_tokens": 223711.0, + "step": 30 + }, + { + "entropy": 
2.382017892599106, + "epoch": 0.128, + "grad_norm": 0.7386544346809387, + "learning_rate": 8.297872340425532e-06, + "loss": 1.9066, + "mean_token_accuracy": 0.5411656655371189, + "num_tokens": 300889.0, + "step": 40 + }, + { + "entropy": 2.274736815690994, + "epoch": 0.16, + "grad_norm": 0.6412256956100464, + "learning_rate": 1.0425531914893619e-05, + "loss": 1.7387, + "mean_token_accuracy": 0.5679451540112496, + "num_tokens": 377362.0, + "step": 50 + }, + { + "entropy": 2.3663365960121157, + "epoch": 0.192, + "grad_norm": 0.6228290796279907, + "learning_rate": 1.2553191489361702e-05, + "loss": 1.7492, + "mean_token_accuracy": 0.5746532663702965, + "num_tokens": 449594.0, + "step": 60 + }, + { + "entropy": 2.315044218301773, + "epoch": 0.224, + "grad_norm": 0.6034156680107117, + "learning_rate": 1.4680851063829789e-05, + "loss": 1.7111, + "mean_token_accuracy": 0.5675176709890366, + "num_tokens": 523439.0, + "step": 70 + }, + { + "entropy": 2.288265961408615, + "epoch": 0.256, + "grad_norm": 0.45914268493652344, + "learning_rate": 1.6808510638297873e-05, + "loss": 1.6931, + "mean_token_accuracy": 0.5713589735329151, + "num_tokens": 599650.0, + "step": 80 + }, + { + "entropy": 2.2693382859230042, + "epoch": 0.288, + "grad_norm": 0.6197793483734131, + "learning_rate": 1.893617021276596e-05, + "loss": 1.6542, + "mean_token_accuracy": 0.578165066242218, + "num_tokens": 675377.0, + "step": 90 + }, + { + "entropy": 2.293796479701996, + "epoch": 0.32, + "grad_norm": 0.5502006411552429, + "learning_rate": 1.9999866154043656e-05, + "loss": 1.7108, + "mean_token_accuracy": 0.5681634023785591, + "num_tokens": 751838.0, + "step": 100 + }, + { + "entropy": 2.2658903509378434, + "epoch": 0.352, + "grad_norm": 0.5713317394256592, + "learning_rate": 1.9998795407890486e-05, + "loss": 1.6168, + "mean_token_accuracy": 0.5843982398509979, + "num_tokens": 825539.0, + "step": 110 + }, + { + "entropy": 2.270280033349991, + "epoch": 0.384, + "grad_norm": 0.5967482924461365, + 
"learning_rate": 1.999665403023542e-05, + "loss": 1.6194, + "mean_token_accuracy": 0.5839526921510696, + "num_tokens": 897258.0, + "step": 120 + }, + { + "entropy": 2.2349284648895265, + "epoch": 0.416, + "grad_norm": 0.4899630844593048, + "learning_rate": 1.9993442250368708e-05, + "loss": 1.6313, + "mean_token_accuracy": 0.5815729826688767, + "num_tokens": 973142.0, + "step": 130 + }, + { + "entropy": 2.245553806424141, + "epoch": 0.448, + "grad_norm": 0.6546034812927246, + "learning_rate": 1.9989160412195047e-05, + "loss": 1.6395, + "mean_token_accuracy": 0.5780692532658577, + "num_tokens": 1046762.0, + "step": 140 + }, + { + "entropy": 2.288555932044983, + "epoch": 0.48, + "grad_norm": 0.5528404116630554, + "learning_rate": 1.9983808974196752e-05, + "loss": 1.7118, + "mean_token_accuracy": 0.5686657652258873, + "num_tokens": 1125167.0, + "step": 150 + }, + { + "entropy": 2.232080355286598, + "epoch": 0.512, + "grad_norm": 0.5887461304664612, + "learning_rate": 1.9977388509384656e-05, + "loss": 1.6339, + "mean_token_accuracy": 0.5838325396180153, + "num_tokens": 1199589.0, + "step": 160 + }, + { + "entropy": 2.2232475757598875, + "epoch": 0.544, + "grad_norm": 0.5764511823654175, + "learning_rate": 1.9969899705236763e-05, + "loss": 1.6173, + "mean_token_accuracy": 0.5848860442638397, + "num_tokens": 1276431.0, + "step": 170 + }, + { + "entropy": 2.244092071056366, + "epoch": 0.576, + "grad_norm": 0.6295827627182007, + "learning_rate": 1.9961343363624626e-05, + "loss": 1.6017, + "mean_token_accuracy": 0.5818701103329659, + "num_tokens": 1350012.0, + "step": 180 + }, + { + "entropy": 2.237305074930191, + "epoch": 0.608, + "grad_norm": 0.5939638018608093, + "learning_rate": 1.9951720400727495e-05, + "loss": 1.6704, + "mean_token_accuracy": 0.5779796853661537, + "num_tokens": 1423391.0, + "step": 190 + }, + { + "entropy": 2.211505854129791, + "epoch": 0.64, + "grad_norm": 0.6119778156280518, + "learning_rate": 1.9941031846934213e-05, + "loss": 1.6223, + 
"mean_token_accuracy": 0.5848233133554459, + "num_tokens": 1499124.0, + "step": 200 + }, + { + "entropy": 2.2195493161678312, + "epoch": 0.672, + "grad_norm": 0.6129831671714783, + "learning_rate": 1.9929278846732883e-05, + "loss": 1.5886, + "mean_token_accuracy": 0.5897421136498451, + "num_tokens": 1573541.0, + "step": 210 + }, + { + "entropy": 2.2096123576164244, + "epoch": 0.704, + "grad_norm": 0.6091306209564209, + "learning_rate": 1.9916462658588328e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5894487425684929, + "num_tokens": 1649546.0, + "step": 220 + }, + { + "entropy": 2.249841979146004, + "epoch": 0.736, + "grad_norm": 0.570816695690155, + "learning_rate": 1.9902584654807325e-05, + "loss": 1.5876, + "mean_token_accuracy": 0.5911228567361831, + "num_tokens": 1722199.0, + "step": 230 + }, + { + "entropy": 2.1915894985198974, + "epoch": 0.768, + "grad_norm": 0.5748864412307739, + "learning_rate": 1.988764632139168e-05, + "loss": 1.5963, + "mean_token_accuracy": 0.5891387596726417, + "num_tokens": 1797304.0, + "step": 240 + }, + { + "entropy": 2.2358563423156737, + "epoch": 0.8, + "grad_norm": 0.6511492729187012, + "learning_rate": 1.9871649257879115e-05, + "loss": 1.6453, + "mean_token_accuracy": 0.5792816638946533, + "num_tokens": 1870113.0, + "step": 250 + }, + { + "entropy": 2.2169984579086304, + "epoch": 0.832, + "grad_norm": 0.5317641496658325, + "learning_rate": 1.9854595177171968e-05, + "loss": 1.6594, + "mean_token_accuracy": 0.577045065164566, + "num_tokens": 1947405.0, + "step": 260 + }, + { + "entropy": 2.2434292674064635, + "epoch": 0.864, + "grad_norm": 0.5399971604347229, + "learning_rate": 1.9836485905353823e-05, + "loss": 1.7158, + "mean_token_accuracy": 0.5683416239917278, + "num_tokens": 2026284.0, + "step": 270 + }, + { + "entropy": 2.227828550338745, + "epoch": 0.896, + "grad_norm": 0.5378643870353699, + "learning_rate": 1.9817323381493933e-05, + "loss": 1.6714, + "mean_token_accuracy": 0.5818367518484593, + "num_tokens": 2103986.0, 
+ "step": 280 + }, + { + "entropy": 2.2110894501209257, + "epoch": 0.928, + "grad_norm": 0.5195969343185425, + "learning_rate": 1.979710965743964e-05, + "loss": 1.6239, + "mean_token_accuracy": 0.5819958478212357, + "num_tokens": 2177010.0, + "step": 290 + }, + { + "entropy": 2.1666628658771514, + "epoch": 0.96, + "grad_norm": 0.5663164258003235, + "learning_rate": 1.977584689759664e-05, + "loss": 1.6024, + "mean_token_accuracy": 0.5876665830612182, + "num_tokens": 2251285.0, + "step": 300 + }, + { + "entropy": 2.214203083515167, + "epoch": 0.992, + "grad_norm": 0.6764860153198242, + "learning_rate": 1.9753537378697237e-05, + "loss": 1.6446, + "mean_token_accuracy": 0.5818003416061401, + "num_tokens": 2325752.0, + "step": 310 + }, + { + "entropy": 2.16783396821273, + "epoch": 1.0224, + "grad_norm": 0.5795008540153503, + "learning_rate": 1.9730183489556563e-05, + "loss": 1.594, + "mean_token_accuracy": 0.5867547392845154, + "num_tokens": 2396254.0, + "step": 320 + }, + { + "entropy": 2.172953352332115, + "epoch": 1.0544, + "grad_norm": 0.6686444282531738, + "learning_rate": 1.9705787730816776e-05, + "loss": 1.613, + "mean_token_accuracy": 0.5906373374164104, + "num_tokens": 2470123.0, + "step": 330 + }, + { + "entropy": 2.2217346757650374, + "epoch": 1.0864, + "grad_norm": 0.6389091610908508, + "learning_rate": 1.9680352714679324e-05, + "loss": 1.7053, + "mean_token_accuracy": 0.577599074691534, + "num_tokens": 2545749.0, + "step": 340 + }, + { + "entropy": 2.138428696990013, + "epoch": 1.1184, + "grad_norm": 0.7369883060455322, + "learning_rate": 1.9653881164625234e-05, + "loss": 1.5599, + "mean_token_accuracy": 0.5946489304304123, + "num_tokens": 2623270.0, + "step": 350 + }, + { + "entropy": 2.147254040837288, + "epoch": 1.1504, + "grad_norm": 0.6707085967063904, + "learning_rate": 1.9626375915123473e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5965728983283043, + "num_tokens": 2697616.0, + "step": 360 + }, + { + "entropy": 2.1412769109010696, + "epoch": 
1.1824, + "grad_norm": 0.7201400995254517, + "learning_rate": 1.9597839911327475e-05, + "loss": 1.58, + "mean_token_accuracy": 0.5957784004509449, + "num_tokens": 2771426.0, + "step": 370 + }, + { + "entropy": 2.164059528708458, + "epoch": 1.2144, + "grad_norm": 0.7561144232749939, + "learning_rate": 1.9568276208759772e-05, + "loss": 1.5872, + "mean_token_accuracy": 0.5874110117554665, + "num_tokens": 2846711.0, + "step": 380 + }, + { + "entropy": 2.205427420139313, + "epoch": 1.2464, + "grad_norm": 0.691585898399353, + "learning_rate": 1.9537687972984804e-05, + "loss": 1.625, + "mean_token_accuracy": 0.5892911069095135, + "num_tokens": 2920916.0, + "step": 390 + }, + { + "entropy": 2.1242104679346085, + "epoch": 1.2784, + "grad_norm": 0.6999676823616028, + "learning_rate": 1.950607847926999e-05, + "loss": 1.5606, + "mean_token_accuracy": 0.5917269751429558, + "num_tokens": 2996056.0, + "step": 400 + }, + { + "entropy": 2.114223065972328, + "epoch": 1.3104, + "grad_norm": 0.7616406679153442, + "learning_rate": 1.947345111223502e-05, + "loss": 1.5296, + "mean_token_accuracy": 0.5938275754451752, + "num_tokens": 3072912.0, + "step": 410 + }, + { + "entropy": 2.1418962299823763, + "epoch": 1.3424, + "grad_norm": 0.7253025770187378, + "learning_rate": 1.943980936548942e-05, + "loss": 1.575, + "mean_token_accuracy": 0.5945621818304062, + "num_tokens": 3148498.0, + "step": 420 + }, + { + "entropy": 2.109667718410492, + "epoch": 1.3744, + "grad_norm": 0.8988682627677917, + "learning_rate": 1.9405156841258498e-05, + "loss": 1.5796, + "mean_token_accuracy": 0.5901263400912284, + "num_tokens": 3224741.0, + "step": 430 + }, + { + "entropy": 2.179358023405075, + "epoch": 1.4064, + "grad_norm": 0.741558849811554, + "learning_rate": 1.936949724999762e-05, + "loss": 1.6507, + "mean_token_accuracy": 0.581992793083191, + "num_tokens": 3299366.0, + "step": 440 + }, + { + "entropy": 2.1574251472949983, + "epoch": 1.4384000000000001, + "grad_norm": 0.7538727521896362, + 
"learning_rate": 1.9332834409994906e-05, + "loss": 1.5771, + "mean_token_accuracy": 0.5888051658868789, + "num_tokens": 3374162.0, + "step": 450 + }, + { + "entropy": 2.1186763852834702, + "epoch": 1.4704, + "grad_norm": 0.7905173301696777, + "learning_rate": 1.929517224696239e-05, + "loss": 1.6138, + "mean_token_accuracy": 0.584889967739582, + "num_tokens": 3452582.0, + "step": 460 + }, + { + "entropy": 2.1135365635156633, + "epoch": 1.5024, + "grad_norm": 0.7416484951972961, + "learning_rate": 1.9256514793615674e-05, + "loss": 1.5623, + "mean_token_accuracy": 0.5928735345602035, + "num_tokens": 3527694.0, + "step": 470 + }, + { + "entropy": 2.146635016798973, + "epoch": 1.5344, + "grad_norm": 0.731999397277832, + "learning_rate": 1.9216866189242095e-05, + "loss": 1.5634, + "mean_token_accuracy": 0.5988615363836288, + "num_tokens": 3600277.0, + "step": 480 + }, + { + "entropy": 2.1472962319850923, + "epoch": 1.5664, + "grad_norm": 0.7493702173233032, + "learning_rate": 1.9176230679257547e-05, + "loss": 1.5891, + "mean_token_accuracy": 0.5858126983046532, + "num_tokens": 3674781.0, + "step": 490 + }, + { + "entropy": 2.1530486762523653, + "epoch": 1.5984, + "grad_norm": 0.8006687164306641, + "learning_rate": 1.9134612614751865e-05, + "loss": 1.5674, + "mean_token_accuracy": 0.5904534175992012, + "num_tokens": 3748434.0, + "step": 500 + }, + { + "entropy": 2.169738906621933, + "epoch": 1.6303999999999998, + "grad_norm": 0.9293455481529236, + "learning_rate": 1.909201645202294e-05, + "loss": 1.6104, + "mean_token_accuracy": 0.5860036969184875, + "num_tokens": 3823982.0, + "step": 510 + }, + { + "entropy": 2.178475347161293, + "epoch": 1.6623999999999999, + "grad_norm": 0.7716575860977173, + "learning_rate": 1.904844675209956e-05, + "loss": 1.6432, + "mean_token_accuracy": 0.5838924221694469, + "num_tokens": 3900064.0, + "step": 520 + }, + { + "entropy": 2.1585603266954423, + "epoch": 1.6944, + "grad_norm": 0.8225084543228149, + "learning_rate": 
1.9003908180253027e-05, + "loss": 1.5957, + "mean_token_accuracy": 0.5880116850137711, + "num_tokens": 3974029.0, + "step": 530 + }, + { + "entropy": 2.111869788169861, + "epoch": 1.7264, + "grad_norm": 0.7035638093948364, + "learning_rate": 1.8958405505497613e-05, + "loss": 1.579, + "mean_token_accuracy": 0.5890362292528153, + "num_tokens": 4049974.0, + "step": 540 + }, + { + "entropy": 2.144411253929138, + "epoch": 1.7584, + "grad_norm": 0.7046850919723511, + "learning_rate": 1.8911943600079934e-05, + "loss": 1.5926, + "mean_token_accuracy": 0.5874261602759361, + "num_tokens": 4125206.0, + "step": 550 + }, + { + "entropy": 2.1093025386333464, + "epoch": 1.7904, + "grad_norm": 0.807727575302124, + "learning_rate": 1.8864527438957223e-05, + "loss": 1.5367, + "mean_token_accuracy": 0.5988967984914779, + "num_tokens": 4199365.0, + "step": 560 + }, + { + "entropy": 2.097169244289398, + "epoch": 1.8224, + "grad_norm": 0.7856780886650085, + "learning_rate": 1.881616209926465e-05, + "loss": 1.561, + "mean_token_accuracy": 0.5948230788111687, + "num_tokens": 4275889.0, + "step": 570 + }, + { + "entropy": 2.088553088903427, + "epoch": 1.8544, + "grad_norm": 0.8993458151817322, + "learning_rate": 1.876685275977167e-05, + "loss": 1.5557, + "mean_token_accuracy": 0.5941933646798134, + "num_tokens": 4350502.0, + "step": 580 + }, + { + "entropy": 2.132419008016586, + "epoch": 1.8864, + "grad_norm": 0.7769711017608643, + "learning_rate": 1.8716604700327516e-05, + "loss": 1.6105, + "mean_token_accuracy": 0.5815305605530738, + "num_tokens": 4426429.0, + "step": 590 + }, + { + "entropy": 2.1076891005039213, + "epoch": 1.9184, + "grad_norm": 0.9261249899864197, + "learning_rate": 1.866542330129583e-05, + "loss": 1.5307, + "mean_token_accuracy": 0.5964644759893417, + "num_tokens": 4500147.0, + "step": 600 + }, + { + "entropy": 2.114642283320427, + "epoch": 1.9504000000000001, + "grad_norm": 0.806425929069519, + "learning_rate": 1.8613314042978576e-05, + "loss": 1.5809, + 
"mean_token_accuracy": 0.5901800125837326, + "num_tokens": 4573438.0, + "step": 610 + }, + { + "entropy": 2.1167576968669892, + "epoch": 1.9824000000000002, + "grad_norm": 0.8191499710083008, + "learning_rate": 1.856028250502923e-05, + "loss": 1.6031, + "mean_token_accuracy": 0.5843381330370903, + "num_tokens": 4648156.0, + "step": 620 + }, + { + "entropy": 2.0566249019221257, + "epoch": 2.0128, + "grad_norm": 0.7406135201454163, + "learning_rate": 1.8506334365855315e-05, + "loss": 1.5187, + "mean_token_accuracy": 0.6027438483740154, + "num_tokens": 4719492.0, + "step": 630 + }, + { + "entropy": 2.0126763731241226, + "epoch": 2.0448, + "grad_norm": 0.8845784068107605, + "learning_rate": 1.8451475402010405e-05, + "loss": 1.4841, + "mean_token_accuracy": 0.6069207280874253, + "num_tokens": 4796271.0, + "step": 640 + }, + { + "entropy": 2.0516900300979612, + "epoch": 2.0768, + "grad_norm": 0.9927017092704773, + "learning_rate": 1.8395711487575564e-05, + "loss": 1.512, + "mean_token_accuracy": 0.6031922519207, + "num_tokens": 4870202.0, + "step": 650 + }, + { + "entropy": 2.0824343889951704, + "epoch": 2.1088, + "grad_norm": 0.927236795425415, + "learning_rate": 1.8339048593530406e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.5952437989413738, + "num_tokens": 4945568.0, + "step": 660 + }, + { + "entropy": 2.0304481953382494, + "epoch": 2.1408, + "grad_norm": 0.874019205570221, + "learning_rate": 1.8281492787113707e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.5992600306868553, + "num_tokens": 5020723.0, + "step": 670 + }, + { + "entropy": 2.0402441143989565, + "epoch": 2.1728, + "grad_norm": 0.8746942281723022, + "learning_rate": 1.8223050231173802e-05, + "loss": 1.5119, + "mean_token_accuracy": 0.5994458049535751, + "num_tokens": 5095780.0, + "step": 680 + }, + { + "entropy": 2.018441066145897, + "epoch": 2.2048, + "grad_norm": 1.063180923461914, + "learning_rate": 1.816372718350864e-05, + "loss": 1.4923, + "mean_token_accuracy": 0.6064845189452172, + 
"num_tokens": 5169733.0, + "step": 690 + }, + { + "entropy": 2.0563316702842713, + "epoch": 2.2368, + "grad_norm": 1.0281789302825928, + "learning_rate": 1.810352999619574e-05, + "loss": 1.5505, + "mean_token_accuracy": 0.602813882380724, + "num_tokens": 5246393.0, + "step": 700 + }, + { + "entropy": 2.0298285841941834, + "epoch": 2.2688, + "grad_norm": 1.070520281791687, + "learning_rate": 1.804246511491206e-05, + "loss": 1.5159, + "mean_token_accuracy": 0.6006126523017883, + "num_tokens": 5322244.0, + "step": 710 + }, + { + "entropy": 2.0195819228887557, + "epoch": 2.3008, + "grad_norm": 0.9672983884811401, + "learning_rate": 1.7980539078243783e-05, + "loss": 1.5166, + "mean_token_accuracy": 0.6054230839014053, + "num_tokens": 5399317.0, + "step": 720 + }, + { + "entropy": 2.045917159318924, + "epoch": 2.3327999999999998, + "grad_norm": 1.1228744983673096, + "learning_rate": 1.791775851698622e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.6015639662742615, + "num_tokens": 5473195.0, + "step": 730 + }, + { + "entropy": 2.0935415983200074, + "epoch": 2.3648, + "grad_norm": 1.149794578552246, + "learning_rate": 1.7854130153433785e-05, + "loss": 1.5583, + "mean_token_accuracy": 0.5921522840857506, + "num_tokens": 5548357.0, + "step": 740 + }, + { + "entropy": 2.044076007604599, + "epoch": 2.3968, + "grad_norm": 1.063625693321228, + "learning_rate": 1.7789660800660222e-05, + "loss": 1.5013, + "mean_token_accuracy": 0.5974589124321937, + "num_tokens": 5620915.0, + "step": 750 + }, + { + "entropy": 2.092478734254837, + "epoch": 2.4288, + "grad_norm": 1.1822012662887573, + "learning_rate": 1.7724357361789075e-05, + "loss": 1.5552, + "mean_token_accuracy": 0.5929681301116944, + "num_tokens": 5693406.0, + "step": 760 + }, + { + "entropy": 2.0430804908275606, + "epoch": 2.4608, + "grad_norm": 0.9921984076499939, + "learning_rate": 1.765822682925453e-05, + "loss": 1.4944, + "mean_token_accuracy": 0.6029774472117424, + "num_tokens": 5770143.0, + "step": 770 + }, + { + 
"entropy": 2.049290281534195, + "epoch": 2.4928, + "grad_norm": 1.0144131183624268, + "learning_rate": 1.7591276284052695e-05, + "loss": 1.5437, + "mean_token_accuracy": 0.5986773043870925, + "num_tokens": 5844022.0, + "step": 780 + }, + { + "entropy": 2.033898201584816, + "epoch": 2.5248, + "grad_norm": 1.1700315475463867, + "learning_rate": 1.7523512894983396e-05, + "loss": 1.5197, + "mean_token_accuracy": 0.5972102269530296, + "num_tokens": 5919099.0, + "step": 790 + }, + { + "entropy": 2.03344586789608, + "epoch": 2.5568, + "grad_norm": 1.0503427982330322, + "learning_rate": 1.745494391788257e-05, + "loss": 1.5456, + "mean_token_accuracy": 0.6011263683438302, + "num_tokens": 5997797.0, + "step": 800 + }, + { + "entropy": 2.0796399265527725, + "epoch": 2.5888, + "grad_norm": 1.0316176414489746, + "learning_rate": 1.7385576694845324e-05, + "loss": 1.608, + "mean_token_accuracy": 0.6024919278919697, + "num_tokens": 6075434.0, + "step": 810 + }, + { + "entropy": 2.0257797837257385, + "epoch": 2.6208, + "grad_norm": 1.048309087753296, + "learning_rate": 1.7315418653439802e-05, + "loss": 1.4876, + "mean_token_accuracy": 0.6070949509739876, + "num_tokens": 6149232.0, + "step": 820 + }, + { + "entropy": 2.024846690893173, + "epoch": 2.6528, + "grad_norm": 1.186710000038147, + "learning_rate": 1.7244477305911845e-05, + "loss": 1.499, + "mean_token_accuracy": 0.6022308841347694, + "num_tokens": 6222180.0, + "step": 830 + }, + { + "entropy": 1.9938248336315154, + "epoch": 2.6848, + "grad_norm": 1.1091604232788086, + "learning_rate": 1.717276024838062e-05, + "loss": 1.4795, + "mean_token_accuracy": 0.6044012248516083, + "num_tokens": 6296902.0, + "step": 840 + }, + { + "entropy": 1.9988998174667358, + "epoch": 2.7168, + "grad_norm": 1.0359690189361572, + "learning_rate": 1.710027516002526e-05, + "loss": 1.5173, + "mean_token_accuracy": 0.6025070771574974, + "num_tokens": 6373494.0, + "step": 850 + }, + { + "entropy": 2.02343093752861, + "epoch": 2.7488, + "grad_norm": 
1.1783568859100342, + "learning_rate": 1.7027029802262598e-05, + "loss": 1.5146, + "mean_token_accuracy": 0.6033479735255242, + "num_tokens": 6449229.0, + "step": 860 + }, + { + "entropy": 2.0429257422685625, + "epoch": 2.7808, + "grad_norm": 0.9909568428993225, + "learning_rate": 1.6953032017916115e-05, + "loss": 1.5473, + "mean_token_accuracy": 0.5932901218533516, + "num_tokens": 6525728.0, + "step": 870 + }, + { + "entropy": 2.0058376491069794, + "epoch": 2.8128, + "grad_norm": 1.0904430150985718, + "learning_rate": 1.687828973037615e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6120153024792672, + "num_tokens": 6599335.0, + "step": 880 + }, + { + "entropy": 2.005480855703354, + "epoch": 2.8448, + "grad_norm": 1.1638548374176025, + "learning_rate": 1.6802810942751514e-05, + "loss": 1.4887, + "mean_token_accuracy": 0.6060751393437386, + "num_tokens": 6672722.0, + "step": 890 + }, + { + "entropy": 2.0311779022216796, + "epoch": 2.8768000000000002, + "grad_norm": 1.1404571533203125, + "learning_rate": 1.6726603737012527e-05, + "loss": 1.5238, + "mean_token_accuracy": 0.6015868663787842, + "num_tokens": 6748069.0, + "step": 900 + }, + { + "entropy": 2.0126856863498688, + "epoch": 2.9088000000000003, + "grad_norm": 1.0942543745040894, + "learning_rate": 1.6649676273125647e-05, + "loss": 1.4984, + "mean_token_accuracy": 0.6019899815320968, + "num_tokens": 6820935.0, + "step": 910 + }, + { + "entropy": 1.9961138010025024, + "epoch": 2.9408, + "grad_norm": 1.0870610475540161, + "learning_rate": 1.6572036788179728e-05, + "loss": 1.4962, + "mean_token_accuracy": 0.6030571654438972, + "num_tokens": 6896286.0, + "step": 920 + }, + { + "entropy": 2.035824549198151, + "epoch": 2.9728, + "grad_norm": 1.0822633504867554, + "learning_rate": 1.6493693595504022e-05, + "loss": 1.5354, + "mean_token_accuracy": 0.5986709952354431, + "num_tokens": 6971854.0, + "step": 930 + }, + { + "entropy": 2.0243908260997974, + "epoch": 3.0032, + "grad_norm": 1.0899602174758911, + 
"learning_rate": 1.6414655083778027e-05, + "loss": 1.5032, + "mean_token_accuracy": 0.5983682243447555, + "num_tokens": 7041122.0, + "step": 940 + }, + { + "entropy": 1.9538823068141937, + "epoch": 3.0352, + "grad_norm": 1.3042237758636475, + "learning_rate": 1.633492971613326e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6146818101406097, + "num_tokens": 7116032.0, + "step": 950 + }, + { + "entropy": 1.9383916020393372, + "epoch": 3.0672, + "grad_norm": 1.397078037261963, + "learning_rate": 1.6254526029247048e-05, + "loss": 1.4019, + "mean_token_accuracy": 0.6210932344198227, + "num_tokens": 7189009.0, + "step": 960 + }, + { + "entropy": 1.9460978150367736, + "epoch": 3.0992, + "grad_norm": 1.2756887674331665, + "learning_rate": 1.617345263242847e-05, + "loss": 1.4623, + "mean_token_accuracy": 0.6121616646647453, + "num_tokens": 7263068.0, + "step": 970 + }, + { + "entropy": 1.9156711965799331, + "epoch": 3.1312, + "grad_norm": 1.1937649250030518, + "learning_rate": 1.609171820669649e-05, + "loss": 1.4301, + "mean_token_accuracy": 0.6136599197983742, + "num_tokens": 7338652.0, + "step": 980 + }, + { + "entropy": 1.9247682303190232, + "epoch": 3.1632, + "grad_norm": 1.3291118144989014, + "learning_rate": 1.6009331503850448e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6153608947992325, + "num_tokens": 7414529.0, + "step": 990 + }, + { + "entropy": 1.9066543668508529, + "epoch": 3.1952, + "grad_norm": 1.4356389045715332, + "learning_rate": 1.5926301345532925e-05, + "loss": 1.4413, + "mean_token_accuracy": 0.612147618830204, + "num_tokens": 7489106.0, + "step": 1000 + }, + { + "entropy": 1.895160937309265, + "epoch": 3.2272, + "grad_norm": 1.4345523118972778, + "learning_rate": 1.5842636622285187e-05, + "loss": 1.4207, + "mean_token_accuracy": 0.6138400137424469, + "num_tokens": 7564304.0, + "step": 1010 + }, + { + "entropy": 1.9546802312135696, + "epoch": 3.2592, + "grad_norm": 1.5242680311203003, + "learning_rate": 1.575834629259519e-05, + "loss": 1.4435, + 
"mean_token_accuracy": 0.6153354361653328, + "num_tokens": 7637409.0, + "step": 1020 + }, + { + "entropy": 1.912938117980957, + "epoch": 3.2912, + "grad_norm": 1.529726505279541, + "learning_rate": 1.5673439381938365e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.6191004544496537, + "num_tokens": 7711595.0, + "step": 1030 + }, + { + "entropy": 1.8989770442247391, + "epoch": 3.3232, + "grad_norm": 1.3367948532104492, + "learning_rate": 1.5587924981811196e-05, + "loss": 1.394, + "mean_token_accuracy": 0.624155393242836, + "num_tokens": 7785750.0, + "step": 1040 + }, + { + "entropy": 1.932333904504776, + "epoch": 3.3552, + "grad_norm": 1.4732215404510498, + "learning_rate": 1.5501812248757734e-05, + "loss": 1.3959, + "mean_token_accuracy": 0.6221834555268287, + "num_tokens": 7859036.0, + "step": 1050 + }, + { + "entropy": 1.9492982983589173, + "epoch": 3.3872, + "grad_norm": 1.4499313831329346, + "learning_rate": 1.5415110403389166e-05, + "loss": 1.4633, + "mean_token_accuracy": 0.6100246667861938, + "num_tokens": 7933165.0, + "step": 1060 + }, + { + "entropy": 1.9063653618097305, + "epoch": 3.4192, + "grad_norm": 1.4364317655563354, + "learning_rate": 1.5327828729396482e-05, + "loss": 1.4216, + "mean_token_accuracy": 0.6210869938135147, + "num_tokens": 8009376.0, + "step": 1070 + }, + { + "entropy": 1.9919361650943757, + "epoch": 3.4512, + "grad_norm": 1.5573089122772217, + "learning_rate": 1.5239976572556438e-05, + "loss": 1.5899, + "mean_token_accuracy": 0.5991086520254612, + "num_tokens": 8086825.0, + "step": 1080 + }, + { + "entropy": 1.922476476430893, + "epoch": 3.4832, + "grad_norm": 1.3339344263076782, + "learning_rate": 1.5151563339730849e-05, + "loss": 1.4162, + "mean_token_accuracy": 0.6182018965482712, + "num_tokens": 8161726.0, + "step": 1090 + }, + { + "entropy": 1.9143129527568816, + "epoch": 3.5152, + "grad_norm": 1.4425708055496216, + "learning_rate": 1.506259849785931e-05, + "loss": 1.4076, + "mean_token_accuracy": 0.6197950705885887, + 
"num_tokens": 8237046.0, + "step": 1100 + }, + { + "entropy": 1.9093267023563385, + "epoch": 3.5472, + "grad_norm": 1.5437992811203003, + "learning_rate": 1.497309157294555e-05, + "loss": 1.4339, + "mean_token_accuracy": 0.6177847877144813, + "num_tokens": 8315350.0, + "step": 1110 + }, + { + "entropy": 1.9121424347162246, + "epoch": 3.5792, + "grad_norm": 1.3761622905731201, + "learning_rate": 1.4883052149037395e-05, + "loss": 1.4175, + "mean_token_accuracy": 0.6171463698148727, + "num_tokens": 8390383.0, + "step": 1120 + }, + { + "entropy": 1.883551675081253, + "epoch": 3.6112, + "grad_norm": 1.36739182472229, + "learning_rate": 1.479248986720057e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6186214044690133, + "num_tokens": 8468414.0, + "step": 1130 + }, + { + "entropy": 1.988349151611328, + "epoch": 3.6432, + "grad_norm": 1.4566738605499268, + "learning_rate": 1.4701414424486353e-05, + "loss": 1.5366, + "mean_token_accuracy": 0.6110676810145378, + "num_tokens": 8541715.0, + "step": 1140 + }, + { + "entropy": 1.9057112097740174, + "epoch": 3.6752000000000002, + "grad_norm": 1.499079704284668, + "learning_rate": 1.4609835572893266e-05, + "loss": 1.3991, + "mean_token_accuracy": 0.6208718970417977, + "num_tokens": 8615694.0, + "step": 1150 + }, + { + "entropy": 1.9219326049089431, + "epoch": 3.7072000000000003, + "grad_norm": 1.3865621089935303, + "learning_rate": 1.4517763118322861e-05, + "loss": 1.431, + "mean_token_accuracy": 0.6143050745129586, + "num_tokens": 8692473.0, + "step": 1160 + }, + { + "entropy": 1.9036399960517882, + "epoch": 3.7392, + "grad_norm": 1.5362603664398193, + "learning_rate": 1.4425206919529747e-05, + "loss": 1.4156, + "mean_token_accuracy": 0.6199175521731377, + "num_tokens": 8767618.0, + "step": 1170 + }, + { + "entropy": 1.9499989479780198, + "epoch": 3.7712, + "grad_norm": 1.663404941558838, + "learning_rate": 1.4332176887065955e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.605186915397644, + "num_tokens": 8843100.0, + 
"step": 1180 + }, + { + "entropy": 1.9545456051826477, + "epoch": 3.8032, + "grad_norm": 1.6169345378875732, + "learning_rate": 1.4238682982219753e-05, + "loss": 1.4241, + "mean_token_accuracy": 0.6206902250647545, + "num_tokens": 8914604.0, + "step": 1190 + }, + { + "entropy": 1.9130536198616028, + "epoch": 3.8352, + "grad_norm": 1.472740650177002, + "learning_rate": 1.4144735215949028e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6139126420021057, + "num_tokens": 8989305.0, + "step": 1200 + }, + { + "entropy": 1.938635140657425, + "epoch": 3.8672, + "grad_norm": 1.4194226264953613, + "learning_rate": 1.4050343647809354e-05, + "loss": 1.4538, + "mean_token_accuracy": 0.6131341770291329, + "num_tokens": 9065589.0, + "step": 1210 + }, + { + "entropy": 1.9123675346374511, + "epoch": 3.8992, + "grad_norm": 1.5208053588867188, + "learning_rate": 1.3955518384876863e-05, + "loss": 1.4309, + "mean_token_accuracy": 0.6139545351266861, + "num_tokens": 9140150.0, + "step": 1220 + }, + { + "entropy": 1.9148090302944183, + "epoch": 3.9312, + "grad_norm": 1.6418218612670898, + "learning_rate": 1.3860269580666004e-05, + "loss": 1.4269, + "mean_token_accuracy": 0.6169310078024864, + "num_tokens": 9215796.0, + "step": 1230 + }, + { + "entropy": 1.9157740741968154, + "epoch": 3.9632, + "grad_norm": 1.4638084173202515, + "learning_rate": 1.3764607434042353e-05, + "loss": 1.4509, + "mean_token_accuracy": 0.6164968460798264, + "num_tokens": 9291010.0, + "step": 1240 + }, + { + "entropy": 1.9184510678052902, + "epoch": 3.9952, + "grad_norm": 1.5152716636657715, + "learning_rate": 1.3668542188130567e-05, + "loss": 1.4453, + "mean_token_accuracy": 0.6112410992383956, + "num_tokens": 9367186.0, + "step": 1250 + }, + { + "entropy": 1.9016748384425515, + "epoch": 4.0256, + "grad_norm": 1.490628719329834, + "learning_rate": 1.3572084129217566e-05, + "loss": 1.382, + "mean_token_accuracy": 0.623968276538347, + "num_tokens": 9439028.0, + "step": 1260 + }, + { + "entropy": 
1.8026290327310561, + "epoch": 4.0576, + "grad_norm": 1.8969308137893677, + "learning_rate": 1.347524358565115e-05, + "loss": 1.3128, + "mean_token_accuracy": 0.6386646836996078, + "num_tokens": 9513855.0, + "step": 1270 + }, + { + "entropy": 1.8283424764871596, + "epoch": 4.0896, + "grad_norm": 1.5952194929122925, + "learning_rate": 1.3378030926734052e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6328515768051147, + "num_tokens": 9589080.0, + "step": 1280 + }, + { + "entropy": 1.8405955344438554, + "epoch": 4.1216, + "grad_norm": 1.6057584285736084, + "learning_rate": 1.3280456561613653e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6261398203670978, + "num_tokens": 9666808.0, + "step": 1290 + }, + { + "entropy": 1.8390818655490875, + "epoch": 4.1536, + "grad_norm": 1.8149824142456055, + "learning_rate": 1.3182530938167409e-05, + "loss": 1.3455, + "mean_token_accuracy": 0.6318597674369812, + "num_tokens": 9740267.0, + "step": 1300 + }, + { + "entropy": 1.8203887075185776, + "epoch": 4.1856, + "grad_norm": 1.6102676391601562, + "learning_rate": 1.3084264541884118e-05, + "loss": 1.3255, + "mean_token_accuracy": 0.6316933467984199, + "num_tokens": 9816400.0, + "step": 1310 + }, + { + "entropy": 1.8592366576194763, + "epoch": 4.2176, + "grad_norm": 1.9501773118972778, + "learning_rate": 1.2985667894741197e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6301594719290733, + "num_tokens": 9889311.0, + "step": 1320 + }, + { + "entropy": 1.8420085966587068, + "epoch": 4.2496, + "grad_norm": 1.6526106595993042, + "learning_rate": 1.2886751554078015e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6302071824669838, + "num_tokens": 9965339.0, + "step": 1330 + }, + { + "entropy": 1.8313881188631058, + "epoch": 4.2816, + "grad_norm": 1.6269904375076294, + "learning_rate": 1.2787526111465453e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6328388035297394, + "num_tokens": 10039668.0, + "step": 1340 + }, + { + "entropy": 1.858151137828827, + "epoch": 4.3136, + 
"grad_norm": 1.9028024673461914, + "learning_rate": 1.2688002191571829e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6276688367128372, + "num_tokens": 10115387.0, + "step": 1350 + }, + { + "entropy": 1.8273844957351684, + "epoch": 4.3456, + "grad_norm": 1.7530555725097656, + "learning_rate": 1.2588190451025209e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.6345869660377502, + "num_tokens": 10191506.0, + "step": 1360 + }, + { + "entropy": 1.8732422679662704, + "epoch": 4.3776, + "grad_norm": 1.7372691631317139, + "learning_rate": 1.248810157727236e-05, + "loss": 1.4132, + "mean_token_accuracy": 0.6252246856689453, + "num_tokens": 10268756.0, + "step": 1370 + }, + { + "entropy": 1.8583054572343827, + "epoch": 4.4096, + "grad_norm": 1.6993470191955566, + "learning_rate": 1.2387746287434385e-05, + "loss": 1.3638, + "mean_token_accuracy": 0.6286717876791954, + "num_tokens": 10341779.0, + "step": 1380 + }, + { + "entropy": 1.8324467271566391, + "epoch": 4.4416, + "grad_norm": 1.7818169593811035, + "learning_rate": 1.2287135327159165e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6361263945698739, + "num_tokens": 10414642.0, + "step": 1390 + }, + { + "entropy": 1.8514392852783204, + "epoch": 4.4736, + "grad_norm": 1.7585517168045044, + "learning_rate": 1.2186279469470757e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.628801380097866, + "num_tokens": 10489517.0, + "step": 1400 + }, + { + "entropy": 1.8218136370182036, + "epoch": 4.5056, + "grad_norm": 1.9843116998672485, + "learning_rate": 1.2085189513615872e-05, + "loss": 1.3628, + "mean_token_accuracy": 0.6295172199606895, + "num_tokens": 10565467.0, + "step": 1410 + }, + { + "entropy": 1.8919565021991729, + "epoch": 4.5376, + "grad_norm": 1.9309132099151611, + "learning_rate": 1.1983876283907522e-05, + "loss": 1.4467, + "mean_token_accuracy": 0.6263746194541454, + "num_tokens": 10641283.0, + "step": 1420 + }, + { + "entropy": 1.8356508910655975, + "epoch": 4.5696, + "grad_norm": 1.7685068845748901, + 
"learning_rate": 1.1882350628566008e-05, + "loss": 1.3631, + "mean_token_accuracy": 0.624418406188488, + "num_tokens": 10716701.0, + "step": 1430 + }, + { + "entropy": 1.8288098931312562, + "epoch": 4.6016, + "grad_norm": 1.8276050090789795, + "learning_rate": 1.178062341855732e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6286922857165337, + "num_tokens": 10791427.0, + "step": 1440 + }, + { + "entropy": 1.8557640790939331, + "epoch": 4.6336, + "grad_norm": 1.7773240804672241, + "learning_rate": 1.1678705546429132e-05, + "loss": 1.383, + "mean_token_accuracy": 0.6216814562678337, + "num_tokens": 10866356.0, + "step": 1450 + }, + { + "entropy": 1.8483826667070389, + "epoch": 4.6655999999999995, + "grad_norm": 1.831931710243225, + "learning_rate": 1.1576607925144456e-05, + "loss": 1.355, + "mean_token_accuracy": 0.6278511002659798, + "num_tokens": 10940772.0, + "step": 1460 + }, + { + "entropy": 1.8824394553899766, + "epoch": 4.6975999999999996, + "grad_norm": 1.9213542938232422, + "learning_rate": 1.1474341486913146e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.6256057649850846, + "num_tokens": 11016144.0, + "step": 1470 + }, + { + "entropy": 1.8709469974040984, + "epoch": 4.7296, + "grad_norm": 1.8768925666809082, + "learning_rate": 1.1371917182021297e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6317574754357338, + "num_tokens": 11089939.0, + "step": 1480 + }, + { + "entropy": 1.8673742085695266, + "epoch": 4.7616, + "grad_norm": 1.796302318572998, + "learning_rate": 1.1269345977658747e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6238353416323662, + "num_tokens": 11166087.0, + "step": 1490 + }, + { + "entropy": 1.8310889720916748, + "epoch": 4.7936, + "grad_norm": 1.8969939947128296, + "learning_rate": 1.1166638856744747e-05, + "loss": 1.3373, + "mean_token_accuracy": 0.6348015949130058, + "num_tokens": 11240732.0, + "step": 1500 + }, + { + "entropy": 1.8809226244688033, + "epoch": 4.8256, + "grad_norm": 1.642104983329773, + "learning_rate": 
1.1063806816751957e-05, + "loss": 1.3792, + "mean_token_accuracy": 0.6250617265701294, + "num_tokens": 11316878.0, + "step": 1510 + }, + { + "entropy": 1.8715822875499726, + "epoch": 4.8576, + "grad_norm": 1.962158441543579, + "learning_rate": 1.0960860868528872e-05, + "loss": 1.3711, + "mean_token_accuracy": 0.6293752744793892, + "num_tokens": 11389042.0, + "step": 1520 + }, + { + "entropy": 1.8657191127538681, + "epoch": 4.8896, + "grad_norm": 1.9577444791793823, + "learning_rate": 1.0857812035120845e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6259156972169876, + "num_tokens": 11464215.0, + "step": 1530 + }, + { + "entropy": 1.8811951220035552, + "epoch": 4.9216, + "grad_norm": 2.015150785446167, + "learning_rate": 1.0754671350589752e-05, + "loss": 1.4155, + "mean_token_accuracy": 0.626779156178236, + "num_tokens": 11539122.0, + "step": 1540 + }, + { + "entropy": 1.863905319571495, + "epoch": 4.9536, + "grad_norm": 1.8474093675613403, + "learning_rate": 1.065144985883253e-05, + "loss": 1.3409, + "mean_token_accuracy": 0.6319419264793396, + "num_tokens": 11613016.0, + "step": 1550 + }, + { + "entropy": 1.836970153450966, + "epoch": 4.9856, + "grad_norm": 1.8822177648544312, + "learning_rate": 1.054815861239864e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6292115703225136, + "num_tokens": 11688143.0, + "step": 1560 + }, + { + "entropy": 1.8377465950815302, + "epoch": 5.016, + "grad_norm": 1.8221346139907837, + "learning_rate": 1.0444808671306588e-05, + "loss": 1.3028, + "mean_token_accuracy": 0.6413120329380035, + "num_tokens": 11758768.0, + "step": 1570 + }, + { + "entropy": 1.7883025139570237, + "epoch": 5.048, + "grad_norm": 2.1959595680236816, + "learning_rate": 1.034141110185968e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6479741290211678, + "num_tokens": 11832210.0, + "step": 1580 + }, + { + "entropy": 1.7955584406852723, + "epoch": 5.08, + "grad_norm": 2.106905698776245, + "learning_rate": 1.0237976975461074e-05, + "loss": 1.2881, + 
"mean_token_accuracy": 0.6424632370471954, + "num_tokens": 11906115.0, + "step": 1590 + }, + { + "entropy": 1.7998322755098344, + "epoch": 5.112, + "grad_norm": 2.327314615249634, + "learning_rate": 1.0134517367428309e-05, + "loss": 1.3324, + "mean_token_accuracy": 0.6460248224437237, + "num_tokens": 11981328.0, + "step": 1600 + }, + { + "entropy": 1.7885828018188477, + "epoch": 5.144, + "grad_norm": 2.1001713275909424, + "learning_rate": 1.0031043355807386e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.63900758177042, + "num_tokens": 12056453.0, + "step": 1610 + }, + { + "entropy": 1.769435602426529, + "epoch": 5.176, + "grad_norm": 2.1210567951202393, + "learning_rate": 9.927566020186592e-06, + "loss": 1.2892, + "mean_token_accuracy": 0.6432970002293587, + "num_tokens": 12133433.0, + "step": 1620 + }, + { + "entropy": 1.7907766073942184, + "epoch": 5.208, + "grad_norm": 2.1842658519744873, + "learning_rate": 9.82409644051013e-06, + "loss": 1.2856, + "mean_token_accuracy": 0.6423615619540215, + "num_tokens": 12207150.0, + "step": 1630 + }, + { + "entropy": 1.7834827870130538, + "epoch": 5.24, + "grad_norm": 2.2503459453582764, + "learning_rate": 9.720645695891733e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6417693704366684, + "num_tokens": 12282584.0, + "step": 1640 + }, + { + "entropy": 1.763256973028183, + "epoch": 5.272, + "grad_norm": 1.9505388736724854, + "learning_rate": 9.617224863428346e-06, + "loss": 1.2951, + "mean_token_accuracy": 0.6429389595985413, + "num_tokens": 12359793.0, + "step": 1650 + }, + { + "entropy": 1.8142763644456863, + "epoch": 5.304, + "grad_norm": 1.9957698583602905, + "learning_rate": 9.513845017014048e-06, + "loss": 1.33, + "mean_token_accuracy": 0.6413653999567032, + "num_tokens": 12434251.0, + "step": 1660 + }, + { + "entropy": 1.797221601009369, + "epoch": 5.336, + "grad_norm": 2.5095462799072266, + "learning_rate": 9.410517226154276e-06, + "loss": 1.2978, + "mean_token_accuracy": 0.6389835774898529, + "num_tokens": 
12508416.0, + "step": 1670 + }, + { + "entropy": 1.8157870292663574, + "epoch": 5.368, + "grad_norm": 2.1890602111816406, + "learning_rate": 9.30725255478058e-06, + "loss": 1.3183, + "mean_token_accuracy": 0.6448161751031876, + "num_tokens": 12582896.0, + "step": 1680 + }, + { + "entropy": 1.7990054041147232, + "epoch": 5.4, + "grad_norm": 2.3904025554656982, + "learning_rate": 9.204062060065915e-06, + "loss": 1.3318, + "mean_token_accuracy": 0.636146479845047, + "num_tokens": 12656802.0, + "step": 1690 + }, + { + "entropy": 1.8003453463315964, + "epoch": 5.432, + "grad_norm": 1.9204304218292236, + "learning_rate": 9.100956791240699e-06, + "loss": 1.3186, + "mean_token_accuracy": 0.6372130662202835, + "num_tokens": 12733283.0, + "step": 1700 + }, + { + "entropy": 1.8101116061210631, + "epoch": 5.464, + "grad_norm": 2.009500026702881, + "learning_rate": 8.997947788409696e-06, + "loss": 1.3247, + "mean_token_accuracy": 0.6406339526176452, + "num_tokens": 12810272.0, + "step": 1710 + }, + { + "entropy": 1.764935952425003, + "epoch": 5.496, + "grad_norm": 2.2038798332214355, + "learning_rate": 8.89504608136989e-06, + "loss": 1.2792, + "mean_token_accuracy": 0.6445836886763573, + "num_tokens": 12885633.0, + "step": 1720 + }, + { + "entropy": 1.7950240582227708, + "epoch": 5.5280000000000005, + "grad_norm": 2.0160531997680664, + "learning_rate": 8.792262688429445e-06, + "loss": 1.2934, + "mean_token_accuracy": 0.6469692558050155, + "num_tokens": 12961131.0, + "step": 1730 + }, + { + "entropy": 1.7804677098989488, + "epoch": 5.5600000000000005, + "grad_norm": 2.1956582069396973, + "learning_rate": 8.689608615227933e-06, + "loss": 1.2969, + "mean_token_accuracy": 0.6438389763236045, + "num_tokens": 13036481.0, + "step": 1740 + }, + { + "entropy": 1.7932062089443206, + "epoch": 5.592, + "grad_norm": 2.2215394973754883, + "learning_rate": 8.587094853557877e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6460438340902328, + "num_tokens": 13111001.0, + "step": 1750 + }, + { 
+ "entropy": 1.8026408910751344, + "epoch": 5.624, + "grad_norm": 2.3881425857543945, + "learning_rate": 8.484732380187785e-06, + "loss": 1.3049, + "mean_token_accuracy": 0.6414234206080437, + "num_tokens": 13186347.0, + "step": 1760 + }, + { + "entropy": 1.8440747499465941, + "epoch": 5.656, + "grad_norm": 2.2154159545898438, + "learning_rate": 8.382532155686825e-06, + "loss": 1.3797, + "mean_token_accuracy": 0.6365857936441899, + "num_tokens": 13261455.0, + "step": 1770 + }, + { + "entropy": 1.7975190997123718, + "epoch": 5.688, + "grad_norm": 2.1991233825683594, + "learning_rate": 8.280505123251183e-06, + "loss": 1.3191, + "mean_token_accuracy": 0.6393151715397835, + "num_tokens": 13338064.0, + "step": 1780 + }, + { + "entropy": 1.8396487146615983, + "epoch": 5.72, + "grad_norm": 2.0190858840942383, + "learning_rate": 8.178662207532343e-06, + "loss": 1.4052, + "mean_token_accuracy": 0.629064130038023, + "num_tokens": 13414806.0, + "step": 1790 + }, + { + "entropy": 1.7840806126594544, + "epoch": 5.752, + "grad_norm": 2.3335204124450684, + "learning_rate": 8.077014313467274e-06, + "loss": 1.2701, + "mean_token_accuracy": 0.6464540064334869, + "num_tokens": 13489075.0, + "step": 1800 + }, + { + "entropy": 1.7840022534132003, + "epoch": 5.784, + "grad_norm": 2.2151618003845215, + "learning_rate": 7.975572325110819e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6358998969197274, + "num_tokens": 13565636.0, + "step": 1810 + }, + { + "entropy": 1.7677135676145554, + "epoch": 5.816, + "grad_norm": 2.11505389213562, + "learning_rate": 7.874347104470234e-06, + "loss": 1.2765, + "mean_token_accuracy": 0.6448719501495361, + "num_tokens": 13641112.0, + "step": 1820 + }, + { + "entropy": 1.7586044907569884, + "epoch": 5.848, + "grad_norm": 2.178250789642334, + "learning_rate": 7.773349490342157e-06, + "loss": 1.2846, + "mean_token_accuracy": 0.6450280979275703, + "num_tokens": 13715158.0, + "step": 1830 + }, + { + "entropy": 1.8128920108079911, + "epoch": 5.88, + 
"grad_norm": 2.2499353885650635, + "learning_rate": 7.672590297152013e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6343795835971833, + "num_tokens": 13791086.0, + "step": 1840 + }, + { + "entropy": 1.7873643577098846, + "epoch": 5.912, + "grad_norm": 2.1989104747772217, + "learning_rate": 7.572080313796064e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6413815975189209, + "num_tokens": 13865700.0, + "step": 1850 + }, + { + "entropy": 1.790488451719284, + "epoch": 5.944, + "grad_norm": 2.2605504989624023, + "learning_rate": 7.471830302486151e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6424889475107193, + "num_tokens": 13938540.0, + "step": 1860 + }, + { + "entropy": 1.7985246628522873, + "epoch": 5.976, + "grad_norm": 2.3228533267974854, + "learning_rate": 7.371850997597355e-06, + "loss": 1.2854, + "mean_token_accuracy": 0.6457341402769089, + "num_tokens": 14011087.0, + "step": 1870 + }, + { + "entropy": 1.7890221727521796, + "epoch": 6.0064, + "grad_norm": 2.192910671234131, + "learning_rate": 7.272153104518567e-06, + "loss": 1.2796, + "mean_token_accuracy": 0.6482133128141102, + "num_tokens": 14082075.0, + "step": 1880 + }, + { + "entropy": 1.7633086562156677, + "epoch": 6.0384, + "grad_norm": 2.368185043334961, + "learning_rate": 7.172747298506224e-06, + "loss": 1.2703, + "mean_token_accuracy": 0.6497290328145027, + "num_tokens": 14156298.0, + "step": 1890 + }, + { + "entropy": 1.750128635764122, + "epoch": 6.0704, + "grad_norm": 2.36487078666687, + "learning_rate": 7.073644223541227e-06, + "loss": 1.2521, + "mean_token_accuracy": 0.6534707516431808, + "num_tokens": 14232528.0, + "step": 1900 + }, + { + "entropy": 1.7416553497314453, + "epoch": 6.1024, + "grad_norm": 2.3927595615386963, + "learning_rate": 6.974854491189243e-06, + "loss": 1.217, + "mean_token_accuracy": 0.6588135868310928, + "num_tokens": 14307073.0, + "step": 1910 + }, + { + "entropy": 1.7359241485595702, + "epoch": 6.1344, + "grad_norm": 2.1107988357543945, + "learning_rate": 
6.876388679464437e-06, + "loss": 1.2763, + "mean_token_accuracy": 0.6550255373120308, + "num_tokens": 14383819.0, + "step": 1920 + }, + { + "entropy": 1.7380403220653533, + "epoch": 6.1664, + "grad_norm": 2.4158387184143066, + "learning_rate": 6.7782573316968424e-06, + "loss": 1.2359, + "mean_token_accuracy": 0.656632873415947, + "num_tokens": 14460092.0, + "step": 1930 + }, + { + "entropy": 1.7227638810873032, + "epoch": 6.1984, + "grad_norm": 2.3467485904693604, + "learning_rate": 6.6804709554034075e-06, + "loss": 1.2311, + "mean_token_accuracy": 0.654091839492321, + "num_tokens": 14534160.0, + "step": 1940 + }, + { + "entropy": 1.7244948148727417, + "epoch": 6.2304, + "grad_norm": 2.760057210922241, + "learning_rate": 6.583040021162905e-06, + "loss": 1.2189, + "mean_token_accuracy": 0.6611428812146187, + "num_tokens": 14608592.0, + "step": 1950 + }, + { + "entropy": 1.7471657902002335, + "epoch": 6.2624, + "grad_norm": 2.3923745155334473, + "learning_rate": 6.485974961494772e-06, + "loss": 1.2631, + "mean_token_accuracy": 0.6524021357297898, + "num_tokens": 14683538.0, + "step": 1960 + }, + { + "entropy": 1.7506494253873826, + "epoch": 6.2943999999999996, + "grad_norm": 2.4149715900421143, + "learning_rate": 6.389286169742048e-06, + "loss": 1.2468, + "mean_token_accuracy": 0.6567713841795921, + "num_tokens": 14755778.0, + "step": 1970 + }, + { + "entropy": 1.7104488879442215, + "epoch": 6.3264, + "grad_norm": 2.632632255554199, + "learning_rate": 6.292983998958478e-06, + "loss": 1.2267, + "mean_token_accuracy": 0.6561126798391342, + "num_tokens": 14831036.0, + "step": 1980 + }, + { + "entropy": 1.7591658294200898, + "epoch": 6.3584, + "grad_norm": 2.4012722969055176, + "learning_rate": 6.1970787607999815e-06, + "loss": 1.2693, + "mean_token_accuracy": 0.6490694522857666, + "num_tokens": 14906610.0, + "step": 1990 + }, + { + "entropy": 1.7317969173192977, + "epoch": 6.3904, + "grad_norm": 2.8288748264312744, + "learning_rate": 6.101580724420478e-06, + "loss": 
1.235, + "mean_token_accuracy": 0.6564200609922409, + "num_tokens": 14980134.0, + "step": 2000 + }, + { + "entropy": 1.7617577254772185, + "epoch": 6.4224, + "grad_norm": 2.4008944034576416, + "learning_rate": 6.00650011537235e-06, + "loss": 1.2773, + "mean_token_accuracy": 0.6494350075721741, + "num_tokens": 15054969.0, + "step": 2010 + }, + { + "entropy": 1.749829688668251, + "epoch": 6.4544, + "grad_norm": 2.5665228366851807, + "learning_rate": 5.911847114511497e-06, + "loss": 1.2512, + "mean_token_accuracy": 0.6512764275074006, + "num_tokens": 15129421.0, + "step": 2020 + }, + { + "entropy": 1.7387797951698303, + "epoch": 6.4864, + "grad_norm": 2.6020922660827637, + "learning_rate": 5.817631856907233e-06, + "loss": 1.2477, + "mean_token_accuracy": 0.6530226185917855, + "num_tokens": 15203465.0, + "step": 2030 + }, + { + "entropy": 1.7363551884889603, + "epoch": 6.5184, + "grad_norm": 2.161478281021118, + "learning_rate": 5.723864430757047e-06, + "loss": 1.2692, + "mean_token_accuracy": 0.6527093783020973, + "num_tokens": 15279761.0, + "step": 2040 + }, + { + "entropy": 1.7563295543193818, + "epoch": 6.5504, + "grad_norm": 2.5587289333343506, + "learning_rate": 5.630554876306407e-06, + "loss": 1.2211, + "mean_token_accuracy": 0.6574550330638885, + "num_tokens": 15351301.0, + "step": 2050 + }, + { + "entropy": 1.7521151036024094, + "epoch": 6.5824, + "grad_norm": 2.4042234420776367, + "learning_rate": 5.537713184773686e-06, + "loss": 1.271, + "mean_token_accuracy": 0.6478641331195831, + "num_tokens": 15427936.0, + "step": 2060 + }, + { + "entropy": 1.7270145863294601, + "epoch": 6.6144, + "grad_norm": 2.3122522830963135, + "learning_rate": 5.44534929728036e-06, + "loss": 1.224, + "mean_token_accuracy": 0.6566437393426895, + "num_tokens": 15502561.0, + "step": 2070 + }, + { + "entropy": 1.7568089962005615, + "epoch": 6.6464, + "grad_norm": 2.461474895477295, + "learning_rate": 5.353473103786511e-06, + "loss": 1.2864, + "mean_token_accuracy": 0.6471308276057244, + 
"num_tokens": 15578053.0, + "step": 2080 + }, + { + "entropy": 1.7190027862787247, + "epoch": 6.6784, + "grad_norm": 2.4412550926208496, + "learning_rate": 5.262094442031901e-06, + "loss": 1.2092, + "mean_token_accuracy": 0.6601713746786118, + "num_tokens": 15653342.0, + "step": 2090 + }, + { + "entropy": 1.717634916305542, + "epoch": 6.7104, + "grad_norm": 2.276007890701294, + "learning_rate": 5.171223096482533e-06, + "loss": 1.2271, + "mean_token_accuracy": 0.6595920532941818, + "num_tokens": 15730387.0, + "step": 2100 + }, + { + "entropy": 1.7230647921562194, + "epoch": 6.7424, + "grad_norm": 2.480471134185791, + "learning_rate": 5.080868797283019e-06, + "loss": 1.229, + "mean_token_accuracy": 0.6568982020020485, + "num_tokens": 15804405.0, + "step": 2110 + }, + { + "entropy": 1.7535502433776855, + "epoch": 6.7744, + "grad_norm": 2.448997974395752, + "learning_rate": 4.9910412192146795e-06, + "loss": 1.2584, + "mean_token_accuracy": 0.648795773088932, + "num_tokens": 15878537.0, + "step": 2120 + }, + { + "entropy": 1.786664029955864, + "epoch": 6.8064, + "grad_norm": 2.430039405822754, + "learning_rate": 4.901749980659617e-06, + "loss": 1.3358, + "mean_token_accuracy": 0.6427689291536808, + "num_tokens": 15952964.0, + "step": 2130 + }, + { + "entropy": 1.7594995677471161, + "epoch": 6.8384, + "grad_norm": 2.469172716140747, + "learning_rate": 4.813004642570822e-06, + "loss": 1.2844, + "mean_token_accuracy": 0.6534359715878963, + "num_tokens": 16028086.0, + "step": 2140 + }, + { + "entropy": 1.7347292125225067, + "epoch": 6.8704, + "grad_norm": 2.6162445545196533, + "learning_rate": 4.724814707448418e-06, + "loss": 1.2707, + "mean_token_accuracy": 0.6447671175003051, + "num_tokens": 16103263.0, + "step": 2150 + }, + { + "entropy": 1.7325938045978546, + "epoch": 6.9024, + "grad_norm": 2.416431188583374, + "learning_rate": 4.637189618322173e-06, + "loss": 1.2794, + "mean_token_accuracy": 0.6439008563756943, + "num_tokens": 16182360.0, + "step": 2160 + }, + { + 
"entropy": 1.7763712674379348, + "epoch": 6.9344, + "grad_norm": 2.3447437286376953, + "learning_rate": 4.550138757740381e-06, + "loss": 1.3043, + "mean_token_accuracy": 0.650251479446888, + "num_tokens": 16256272.0, + "step": 2170 + }, + { + "entropy": 1.739478302001953, + "epoch": 6.9664, + "grad_norm": 2.650451183319092, + "learning_rate": 4.463671446765206e-06, + "loss": 1.259, + "mean_token_accuracy": 0.6514677822589874, + "num_tokens": 16330984.0, + "step": 2180 + }, + { + "entropy": 1.7292406976222991, + "epoch": 6.9984, + "grad_norm": 2.5442306995391846, + "learning_rate": 4.377796943974641e-06, + "loss": 1.2554, + "mean_token_accuracy": 0.6506337657570839, + "num_tokens": 16406982.0, + "step": 2190 + }, + { + "entropy": 1.7582613606201976, + "epoch": 7.0288, + "grad_norm": 2.315408706665039, + "learning_rate": 4.292524444471097e-06, + "loss": 1.2766, + "mean_token_accuracy": 0.6549999933493765, + "num_tokens": 16479870.0, + "step": 2200 + }, + { + "entropy": 1.6856683611869812, + "epoch": 7.0608, + "grad_norm": 2.6291606426239014, + "learning_rate": 4.2078630788968775e-06, + "loss": 1.2051, + "mean_token_accuracy": 0.6612728327512741, + "num_tokens": 16557045.0, + "step": 2210 + }, + { + "entropy": 1.7316391229629517, + "epoch": 7.0928, + "grad_norm": 2.767998695373535, + "learning_rate": 4.123821912456457e-06, + "loss": 1.1981, + "mean_token_accuracy": 0.6648431628942489, + "num_tokens": 16629418.0, + "step": 2220 + }, + { + "entropy": 1.6988701403141022, + "epoch": 7.1248, + "grad_norm": 2.752492904663086, + "learning_rate": 4.040409943945856e-06, + "loss": 1.2277, + "mean_token_accuracy": 0.6629953101277352, + "num_tokens": 16706000.0, + "step": 2230 + }, + { + "entropy": 1.6829528212547302, + "epoch": 7.1568, + "grad_norm": 2.5362343788146973, + "learning_rate": 3.957636104789056e-06, + "loss": 1.1526, + "mean_token_accuracy": 0.6730572417378425, + "num_tokens": 16781728.0, + "step": 2240 + }, + { + "entropy": 1.7018825829029083, + "epoch": 7.1888, + 
"grad_norm": 2.772650718688965, + "learning_rate": 3.875509258081671e-06, + "loss": 1.1978, + "mean_token_accuracy": 0.6671716704964638, + "num_tokens": 16856290.0, + "step": 2250 + }, + { + "entropy": 1.7013772219419478, + "epoch": 7.2208, + "grad_norm": 2.548017740249634, + "learning_rate": 3.794038197641924e-06, + "loss": 1.2166, + "mean_token_accuracy": 0.6607363104820252, + "num_tokens": 16933284.0, + "step": 2260 + }, + { + "entropy": 1.7119423121213913, + "epoch": 7.2528, + "grad_norm": 3.085073947906494, + "learning_rate": 3.713231647069031e-06, + "loss": 1.1811, + "mean_token_accuracy": 0.6690418004989624, + "num_tokens": 17006677.0, + "step": 2270 + }, + { + "entropy": 1.6958551973104476, + "epoch": 7.2848, + "grad_norm": 2.9240498542785645, + "learning_rate": 3.633098258809119e-06, + "loss": 1.1998, + "mean_token_accuracy": 0.6630783304572105, + "num_tokens": 17081748.0, + "step": 2280 + }, + { + "entropy": 1.7125141024589539, + "epoch": 7.3168, + "grad_norm": 2.5372555255889893, + "learning_rate": 3.5536466132287562e-06, + "loss": 1.1941, + "mean_token_accuracy": 0.6668895781040192, + "num_tokens": 17155159.0, + "step": 2290 + }, + { + "entropy": 1.699021890759468, + "epoch": 7.3488, + "grad_norm": 2.555891752243042, + "learning_rate": 3.4748852176961912e-06, + "loss": 1.1871, + "mean_token_accuracy": 0.6658635303378105, + "num_tokens": 17231062.0, + "step": 2300 + }, + { + "entropy": 1.7193275570869446, + "epoch": 7.3808, + "grad_norm": 2.6699490547180176, + "learning_rate": 3.3968225056704427e-06, + "loss": 1.2396, + "mean_token_accuracy": 0.6561257526278496, + "num_tokens": 17306280.0, + "step": 2310 + }, + { + "entropy": 1.714812269806862, + "epoch": 7.4128, + "grad_norm": 2.8019561767578125, + "learning_rate": 3.319466835798235e-06, + "loss": 1.2006, + "mean_token_accuracy": 0.6625583916902542, + "num_tokens": 17379705.0, + "step": 2320 + }, + { + "entropy": 1.711946851015091, + "epoch": 7.4448, + "grad_norm": 2.758375644683838, + "learning_rate": 
3.2428264910190398e-06, + "loss": 1.2234, + "mean_token_accuracy": 0.6590609803795815, + "num_tokens": 17453837.0, + "step": 2330 + }, + { + "entropy": 1.7215852111577987, + "epoch": 7.4768, + "grad_norm": 2.609473466873169, + "learning_rate": 3.166909677678116e-06, + "loss": 1.2182, + "mean_token_accuracy": 0.6575865730643272, + "num_tokens": 17528089.0, + "step": 2340 + }, + { + "entropy": 1.6932504892349243, + "epoch": 7.5088, + "grad_norm": 2.3272244930267334, + "learning_rate": 3.091724524647861e-06, + "loss": 1.1907, + "mean_token_accuracy": 0.665678508579731, + "num_tokens": 17603632.0, + "step": 2350 + }, + { + "entropy": 1.6796110332012177, + "epoch": 7.5408, + "grad_norm": 2.619145631790161, + "learning_rate": 3.0172790824573627e-06, + "loss": 1.1693, + "mean_token_accuracy": 0.6696520581841469, + "num_tokens": 17678354.0, + "step": 2360 + }, + { + "entropy": 1.7049964010715484, + "epoch": 7.5728, + "grad_norm": 2.4621567726135254, + "learning_rate": 2.943581322430399e-06, + "loss": 1.2376, + "mean_token_accuracy": 0.6554866015911103, + "num_tokens": 17755529.0, + "step": 2370 + }, + { + "entropy": 1.734333610534668, + "epoch": 7.6048, + "grad_norm": 2.7798454761505127, + "learning_rate": 2.8706391358318942e-06, + "loss": 1.2638, + "mean_token_accuracy": 0.6574688166379928, + "num_tokens": 17829887.0, + "step": 2380 + }, + { + "entropy": 1.7209048956632613, + "epoch": 7.6368, + "grad_norm": 2.579355239868164, + "learning_rate": 2.798460333022959e-06, + "loss": 1.2515, + "mean_token_accuracy": 0.6567336574196816, + "num_tokens": 17905006.0, + "step": 2390 + }, + { + "entropy": 1.7461798429489135, + "epoch": 7.6688, + "grad_norm": 2.3670754432678223, + "learning_rate": 2.72705264262458e-06, + "loss": 1.2845, + "mean_token_accuracy": 0.6506199359893798, + "num_tokens": 17980732.0, + "step": 2400 + }, + { + "entropy": 1.6880094558000565, + "epoch": 7.7008, + "grad_norm": 2.6313459873199463, + "learning_rate": 2.6564237106900815e-06, + "loss": 1.188, + 
"mean_token_accuracy": 0.6656670674681664, + "num_tokens": 18055716.0, + "step": 2410 + }, + { + "entropy": 1.695495843887329, + "epoch": 7.7328, + "grad_norm": 2.5075557231903076, + "learning_rate": 2.586581099886396e-06, + "loss": 1.2249, + "mean_token_accuracy": 0.6589935123920441, + "num_tokens": 18132898.0, + "step": 2420 + }, + { + "entropy": 1.7270244836807251, + "epoch": 7.7648, + "grad_norm": 2.8843488693237305, + "learning_rate": 2.5175322886843156e-06, + "loss": 1.2071, + "mean_token_accuracy": 0.6606879189610482, + "num_tokens": 18206187.0, + "step": 2430 + }, + { + "entropy": 1.7067583829164505, + "epoch": 7.7968, + "grad_norm": 2.5878074169158936, + "learning_rate": 2.4492846705576845e-06, + "loss": 1.2108, + "mean_token_accuracy": 0.662336565554142, + "num_tokens": 18280919.0, + "step": 2440 + }, + { + "entropy": 1.7000836163759232, + "epoch": 7.8288, + "grad_norm": 2.6365489959716797, + "learning_rate": 2.381845553191783e-06, + "loss": 1.2129, + "mean_token_accuracy": 0.661854301393032, + "num_tokens": 18356501.0, + "step": 2450 + }, + { + "entropy": 1.7049589693546294, + "epoch": 7.8608, + "grad_norm": 2.82125186920166, + "learning_rate": 2.315222157700797e-06, + "loss": 1.1956, + "mean_token_accuracy": 0.6644625499844551, + "num_tokens": 18429752.0, + "step": 2460 + }, + { + "entropy": 1.7245618909597398, + "epoch": 7.8928, + "grad_norm": 2.726285934448242, + "learning_rate": 2.2494216178546647e-06, + "loss": 1.1784, + "mean_token_accuracy": 0.6688286542892456, + "num_tokens": 18500518.0, + "step": 2470 + }, + { + "entropy": 1.710511189699173, + "epoch": 7.9248, + "grad_norm": 2.741217613220215, + "learning_rate": 2.184450979315177e-06, + "loss": 1.2731, + "mean_token_accuracy": 0.6543245255947113, + "num_tokens": 18579758.0, + "step": 2480 + }, + { + "entropy": 1.7591080367565155, + "epoch": 7.9568, + "grad_norm": 2.852081537246704, + "learning_rate": 2.1203171988815764e-06, + "loss": 1.3156, + "mean_token_accuracy": 0.6505647979676723, + 
"num_tokens": 18655339.0, + "step": 2490 + }, + { + "entropy": 1.7133348882198334, + "epoch": 7.9888, + "grad_norm": 2.7279410362243652, + "learning_rate": 2.057027143745646e-06, + "loss": 1.2112, + "mean_token_accuracy": 0.6649632036685944, + "num_tokens": 18728669.0, + "step": 2500 + }, + { + "entropy": 1.7192017655623586, + "epoch": 8.0192, + "grad_norm": 2.6821248531341553, + "learning_rate": 1.994587590756397e-06, + "loss": 1.2187, + "mean_token_accuracy": 0.6636982513101477, + "num_tokens": 18799987.0, + "step": 2510 + }, + { + "entropy": 1.6869423031806945, + "epoch": 8.0512, + "grad_norm": 2.9373373985290527, + "learning_rate": 1.9330052256944355e-06, + "loss": 1.1855, + "mean_token_accuracy": 0.667529807984829, + "num_tokens": 18876438.0, + "step": 2520 + }, + { + "entropy": 1.6881215393543243, + "epoch": 8.0832, + "grad_norm": 2.6245837211608887, + "learning_rate": 1.872286642556066e-06, + "loss": 1.2082, + "mean_token_accuracy": 0.6658473521471023, + "num_tokens": 18951429.0, + "step": 2530 + }, + { + "entropy": 1.7176503151655198, + "epoch": 8.1152, + "grad_norm": 2.9030368328094482, + "learning_rate": 1.8124383428472402e-06, + "loss": 1.2514, + "mean_token_accuracy": 0.6588339149951935, + "num_tokens": 19026806.0, + "step": 2540 + }, + { + "entropy": 1.727318498492241, + "epoch": 8.1472, + "grad_norm": 2.9318246841430664, + "learning_rate": 1.7534667348874068e-06, + "loss": 1.2355, + "mean_token_accuracy": 0.6663900375366211, + "num_tokens": 19100428.0, + "step": 2550 + }, + { + "entropy": 1.6834076642990112, + "epoch": 8.1792, + "grad_norm": 2.59753680229187, + "learning_rate": 1.6953781331233078e-06, + "loss": 1.1813, + "mean_token_accuracy": 0.6736618399620056, + "num_tokens": 19177248.0, + "step": 2560 + }, + { + "entropy": 1.6987117886543275, + "epoch": 8.2112, + "grad_norm": 2.7193281650543213, + "learning_rate": 1.638178757452894e-06, + "loss": 1.2408, + "mean_token_accuracy": 0.6610291600227356, + "num_tokens": 19252481.0, + "step": 2570 + }, + 
{ + "entropy": 1.6835868597030639, + "epoch": 8.2432, + "grad_norm": 2.6238672733306885, + "learning_rate": 1.5818747325592765e-06, + "loss": 1.2186, + "mean_token_accuracy": 0.6627017930150032, + "num_tokens": 19329624.0, + "step": 2580 + }, + { + "entropy": 1.689078041911125, + "epoch": 8.2752, + "grad_norm": 2.643185615539551, + "learning_rate": 1.5264720872549622e-06, + "loss": 1.185, + "mean_token_accuracy": 0.6670305237174035, + "num_tokens": 19404703.0, + "step": 2590 + }, + { + "entropy": 1.6852527409791946, + "epoch": 8.3072, + "grad_norm": 2.3309333324432373, + "learning_rate": 1.471976753836285e-06, + "loss": 1.1771, + "mean_token_accuracy": 0.6721793606877327, + "num_tokens": 19479389.0, + "step": 2600 + }, + { + "entropy": 1.6828288197517396, + "epoch": 8.3392, + "grad_norm": 3.0214831829071045, + "learning_rate": 1.418394567448207e-06, + "loss": 1.1345, + "mean_token_accuracy": 0.6779033228754997, + "num_tokens": 19551761.0, + "step": 2610 + }, + { + "entropy": 1.679307323694229, + "epoch": 8.3712, + "grad_norm": 2.65228009223938, + "learning_rate": 1.3657312654595168e-06, + "loss": 1.1723, + "mean_token_accuracy": 0.6679997086524964, + "num_tokens": 19625882.0, + "step": 2620 + }, + { + "entropy": 1.6989664137363434, + "epoch": 8.4032, + "grad_norm": 2.541346788406372, + "learning_rate": 1.31399248684849e-06, + "loss": 1.197, + "mean_token_accuracy": 0.6655030563473702, + "num_tokens": 19699807.0, + "step": 2630 + }, + { + "entropy": 1.7121862709522246, + "epoch": 8.4352, + "grad_norm": 2.8018271923065186, + "learning_rate": 1.2631837715990957e-06, + "loss": 1.1807, + "mean_token_accuracy": 0.6682800635695457, + "num_tokens": 19772329.0, + "step": 2640 + }, + { + "entropy": 1.6779522567987442, + "epoch": 8.4672, + "grad_norm": 2.786879777908325, + "learning_rate": 1.213310560107791e-06, + "loss": 1.172, + "mean_token_accuracy": 0.6687621667981147, + "num_tokens": 19847678.0, + "step": 2650 + }, + { + "entropy": 1.6814669162034988, + "epoch": 8.4992, 
+ "grad_norm": 2.6873278617858887, + "learning_rate": 1.1643781926009846e-06, + "loss": 1.1662, + "mean_token_accuracy": 0.6698434934020042, + "num_tokens": 19922014.0, + "step": 2660 + }, + { + "entropy": 1.6885319381952286, + "epoch": 8.5312, + "grad_norm": 2.8372082710266113, + "learning_rate": 1.116391908563239e-06, + "loss": 1.1679, + "mean_token_accuracy": 0.6713936537504196, + "num_tokens": 19995173.0, + "step": 2670 + }, + { + "entropy": 1.6706371814012528, + "epoch": 8.5632, + "grad_norm": 2.465330123901367, + "learning_rate": 1.0693568461762238e-06, + "loss": 1.1611, + "mean_token_accuracy": 0.6726542502641678, + "num_tokens": 20071888.0, + "step": 2680 + }, + { + "entropy": 1.6876189917325974, + "epoch": 8.5952, + "grad_norm": 2.66048264503479, + "learning_rate": 1.023278041768565e-06, + "loss": 1.1801, + "mean_token_accuracy": 0.6687627866864204, + "num_tokens": 20147771.0, + "step": 2690 + }, + { + "entropy": 1.6499578058719635, + "epoch": 8.6272, + "grad_norm": 2.6129636764526367, + "learning_rate": 9.781604292765524e-07, + "loss": 1.1687, + "mean_token_accuracy": 0.6690827563405037, + "num_tokens": 20225986.0, + "step": 2700 + }, + { + "entropy": 1.7168281257152558, + "epoch": 8.6592, + "grad_norm": 2.7212672233581543, + "learning_rate": 9.34008839715852e-07, + "loss": 1.2203, + "mean_token_accuracy": 0.6617053344845771, + "num_tokens": 20299960.0, + "step": 2710 + }, + { + "entropy": 1.6980592608451843, + "epoch": 8.6912, + "grad_norm": 2.9652392864227295, + "learning_rate": 8.90828000664209e-07, + "loss": 1.2021, + "mean_token_accuracy": 0.6637651309370994, + "num_tokens": 20375041.0, + "step": 2720 + }, + { + "entropy": 1.6723056137561798, + "epoch": 8.7232, + "grad_norm": 2.5188965797424316, + "learning_rate": 8.486225357552369e-07, + "loss": 1.1326, + "mean_token_accuracy": 0.6736432477831841, + "num_tokens": 20448565.0, + "step": 2730 + }, + { + "entropy": 1.6988836616277694, + "epoch": 8.7552, + "grad_norm": 3.4039053916931152, + 
"learning_rate": 8.073969641833446e-07, + "loss": 1.2387, + "mean_token_accuracy": 0.6662507936358452, + "num_tokens": 20524681.0, + "step": 2740 + }, + { + "entropy": 1.679963505268097, + "epoch": 8.7872, + "grad_norm": 2.585479497909546, + "learning_rate": 7.671557002198316e-07, + "loss": 1.1417, + "mean_token_accuracy": 0.6760099261999131, + "num_tokens": 20598013.0, + "step": 2750 + }, + { + "entropy": 1.6847810536623, + "epoch": 8.8192, + "grad_norm": 2.4690332412719727, + "learning_rate": 7.279030527402297e-07, + "loss": 1.1946, + "mean_token_accuracy": 0.6612684234976769, + "num_tokens": 20672943.0, + "step": 2760 + }, + { + "entropy": 1.6944990634918213, + "epoch": 8.8512, + "grad_norm": 2.9186036586761475, + "learning_rate": 6.896432247629237e-07, + "loss": 1.1897, + "mean_token_accuracy": 0.6619520500302315, + "num_tokens": 20748074.0, + "step": 2770 + }, + { + "entropy": 1.6818810850381851, + "epoch": 8.8832, + "grad_norm": 2.770056962966919, + "learning_rate": 6.52380312999108e-07, + "loss": 1.1955, + "mean_token_accuracy": 0.6678687691688537, + "num_tokens": 20825520.0, + "step": 2780 + }, + { + "entropy": 1.7140829920768739, + "epoch": 8.9152, + "grad_norm": 2.797840118408203, + "learning_rate": 6.161183074141319e-07, + "loss": 1.249, + "mean_token_accuracy": 0.6633088305592537, + "num_tokens": 20899714.0, + "step": 2790 + }, + { + "entropy": 1.6798905462026597, + "epoch": 8.9472, + "grad_norm": 2.660682201385498, + "learning_rate": 5.808610908002599e-07, + "loss": 1.1739, + "mean_token_accuracy": 0.6718563959002495, + "num_tokens": 20975614.0, + "step": 2800 + }, + { + "entropy": 1.6790486425161362, + "epoch": 8.9792, + "grad_norm": 2.887852191925049, + "learning_rate": 5.466124383609317e-07, + "loss": 1.2012, + "mean_token_accuracy": 0.6658951610326767, + "num_tokens": 21052188.0, + "step": 2810 + }, + { + "entropy": 1.7020568722172786, + "epoch": 9.0096, + "grad_norm": 2.808269500732422, + "learning_rate": 5.133760173065139e-07, + "loss": 1.202, + 
"mean_token_accuracy": 0.6611303969433433, + "num_tokens": 21122753.0, + "step": 2820 + }, + { + "entropy": 1.687294954061508, + "epoch": 9.0416, + "grad_norm": 2.8108749389648438, + "learning_rate": 4.811553864616392e-07, + "loss": 1.1919, + "mean_token_accuracy": 0.6673015937209129, + "num_tokens": 21199039.0, + "step": 2830 + }, + { + "entropy": 1.6716860055923461, + "epoch": 9.0736, + "grad_norm": 2.86171555519104, + "learning_rate": 4.4995399588413925e-07, + "loss": 1.1499, + "mean_token_accuracy": 0.675554732978344, + "num_tokens": 21273603.0, + "step": 2840 + }, + { + "entropy": 1.6763453155755996, + "epoch": 9.1056, + "grad_norm": 2.9186508655548096, + "learning_rate": 4.197751864956234e-07, + "loss": 1.158, + "mean_token_accuracy": 0.6813155248761177, + "num_tokens": 21348222.0, + "step": 2850 + }, + { + "entropy": 1.6480779707431794, + "epoch": 9.1376, + "grad_norm": 2.8819360733032227, + "learning_rate": 3.9062218972374946e-07, + "loss": 1.1622, + "mean_token_accuracy": 0.6734258145093918, + "num_tokens": 21426613.0, + "step": 2860 + }, + { + "entropy": 1.7158046215772629, + "epoch": 9.1696, + "grad_norm": 3.0548996925354004, + "learning_rate": 3.6249812715621114e-07, + "loss": 1.1986, + "mean_token_accuracy": 0.6687729433178902, + "num_tokens": 21499642.0, + "step": 2870 + }, + { + "entropy": 1.6646199077367783, + "epoch": 9.2016, + "grad_norm": 3.0521023273468018, + "learning_rate": 3.3540601020650043e-07, + "loss": 1.178, + "mean_token_accuracy": 0.6709826931357383, + "num_tokens": 21575774.0, + "step": 2880 + }, + { + "entropy": 1.6796242028474808, + "epoch": 9.2336, + "grad_norm": 3.1181037425994873, + "learning_rate": 3.093487397914408e-07, + "loss": 1.171, + "mean_token_accuracy": 0.6708515658974648, + "num_tokens": 21649453.0, + "step": 2890 + }, + { + "entropy": 1.6794216215610505, + "epoch": 9.2656, + "grad_norm": 2.6174721717834473, + "learning_rate": 2.843291060205855e-07, + "loss": 1.1964, + "mean_token_accuracy": 0.6628055095672607, + 
"num_tokens": 21725452.0, + "step": 2900 + }, + { + "entropy": 1.664293149113655, + "epoch": 9.2976, + "grad_norm": 3.111161470413208, + "learning_rate": 2.6034978789745147e-07, + "loss": 1.1669, + "mean_token_accuracy": 0.6700958028435707, + "num_tokens": 21800644.0, + "step": 2910 + }, + { + "entropy": 1.6728513896465302, + "epoch": 9.3296, + "grad_norm": 2.8739449977874756, + "learning_rate": 2.3741335303266945e-07, + "loss": 1.1571, + "mean_token_accuracy": 0.6733614057302475, + "num_tokens": 21874239.0, + "step": 2920 + }, + { + "entropy": 1.6704490303993225, + "epoch": 9.3616, + "grad_norm": 2.9254367351531982, + "learning_rate": 2.1552225736905074e-07, + "loss": 1.1635, + "mean_token_accuracy": 0.6746389493346214, + "num_tokens": 21949654.0, + "step": 2930 + }, + { + "entropy": 1.681338819861412, + "epoch": 9.3936, + "grad_norm": 3.1011123657226562, + "learning_rate": 1.9467884491861656e-07, + "loss": 1.1865, + "mean_token_accuracy": 0.6682163000106811, + "num_tokens": 22024666.0, + "step": 2940 + }, + { + "entropy": 1.6906850814819336, + "epoch": 9.4256, + "grad_norm": 2.701781749725342, + "learning_rate": 1.7488534751160968e-07, + "loss": 1.2131, + "mean_token_accuracy": 0.6646158531308174, + "num_tokens": 22101712.0, + "step": 2950 + }, + { + "entropy": 1.6738087445497514, + "epoch": 9.4576, + "grad_norm": 2.8893275260925293, + "learning_rate": 1.5614388455751695e-07, + "loss": 1.1704, + "mean_token_accuracy": 0.6673101767897606, + "num_tokens": 22177007.0, + "step": 2960 + }, + { + "entropy": 1.6411693811416626, + "epoch": 9.4896, + "grad_norm": 2.499668598175049, + "learning_rate": 1.3845646281813508e-07, + "loss": 1.1654, + "mean_token_accuracy": 0.6725494012236595, + "num_tokens": 22254895.0, + "step": 2970 + }, + { + "entropy": 1.6842692226171494, + "epoch": 9.5216, + "grad_norm": 2.824566602706909, + "learning_rate": 1.218249761926904e-07, + "loss": 1.2049, + "mean_token_accuracy": 0.6695105731487274, + "num_tokens": 22331936.0, + "step": 2980 + }, 
+ { + "entropy": 1.6937810093164445, + "epoch": 9.5536, + "grad_norm": 2.895420551300049, + "learning_rate": 1.0625120551505219e-07, + "loss": 1.2132, + "mean_token_accuracy": 0.6676653727889061, + "num_tokens": 22406574.0, + "step": 2990 + }, + { + "entropy": 1.6784279078245163, + "epoch": 9.5856, + "grad_norm": 3.2500829696655273, + "learning_rate": 9.173681836304737e-08, + "loss": 1.1551, + "mean_token_accuracy": 0.6703512877225876, + "num_tokens": 22479427.0, + "step": 3000 + }, + { + "entropy": 1.6994036912918091, + "epoch": 9.6176, + "grad_norm": 2.849135398864746, + "learning_rate": 7.82833688798934e-08, + "loss": 1.2222, + "mean_token_accuracy": 0.6700764432549476, + "num_tokens": 22553585.0, + "step": 3010 + }, + { + "entropy": 1.6797764748334885, + "epoch": 9.6496, + "grad_norm": 2.81838059425354, + "learning_rate": 6.589229760780358e-08, + "loss": 1.1758, + "mean_token_accuracy": 0.6680518165230751, + "num_tokens": 22626334.0, + "step": 3020 + }, + { + "entropy": 1.689881592988968, + "epoch": 9.6816, + "grad_norm": 2.747126579284668, + "learning_rate": 5.456493133372265e-08, + "loss": 1.1908, + "mean_token_accuracy": 0.6759619385004043, + "num_tokens": 22700226.0, + "step": 3030 + }, + { + "entropy": 1.6853878051042557, + "epoch": 9.7136, + "grad_norm": 2.680530548095703, + "learning_rate": 4.4302482947273794e-08, + "loss": 1.1966, + "mean_token_accuracy": 0.6655573427677155, + "num_tokens": 22776278.0, + "step": 3040 + }, + { + "entropy": 1.668430432677269, + "epoch": 9.7456, + "grad_norm": 2.925072193145752, + "learning_rate": 3.5106051310876963e-08, + "loss": 1.1737, + "mean_token_accuracy": 0.6721765831112861, + "num_tokens": 22852169.0, + "step": 3050 + }, + { + "entropy": 1.6752154231071472, + "epoch": 9.7776, + "grad_norm": 3.129913806915283, + "learning_rate": 2.6976621142092985e-08, + "loss": 1.1714, + "mean_token_accuracy": 0.6710604816675186, + "num_tokens": 22927712.0, + "step": 3060 + }, + { + "entropy": 1.7014556497335434, + "epoch": 
9.8096, + "grad_norm": 3.160234212875366, + "learning_rate": 1.9915062908179018e-08, + "loss": 1.2091, + "mean_token_accuracy": 0.6720787703990936, + "num_tokens": 23001114.0, + "step": 3070 + }, + { + "entropy": 1.6812238603830338, + "epoch": 9.8416, + "grad_norm": 2.8287084102630615, + "learning_rate": 1.3922132732888671e-08, + "loss": 1.1776, + "mean_token_accuracy": 0.6745603963732719, + "num_tokens": 23076453.0, + "step": 3080 + }, + { + "entropy": 1.6627614200115204, + "epoch": 9.8736, + "grad_norm": 2.616438627243042, + "learning_rate": 8.998472315502326e-09, + "loss": 1.1742, + "mean_token_accuracy": 0.6687036663293838, + "num_tokens": 23153953.0, + "step": 3090 + }, + { + "entropy": 1.68206068277359, + "epoch": 9.9056, + "grad_norm": 3.164825201034546, + "learning_rate": 5.144608862122091e-09, + "loss": 1.1592, + "mean_token_accuracy": 0.6742370665073395, + "num_tokens": 23228258.0, + "step": 3100 + }, + { + "entropy": 1.6850847274065017, + "epoch": 9.9376, + "grad_norm": 2.848604917526245, + "learning_rate": 2.3609550292180704e-09, + "loss": 1.1626, + "mean_token_accuracy": 0.6706350147724152, + "num_tokens": 23301338.0, + "step": 3110 + }, + { + "entropy": 1.6835005402565002, + "epoch": 9.9696, + "grad_norm": 3.0547077655792236, + "learning_rate": 6.478088794448223e-10, + "loss": 1.1727, + "mean_token_accuracy": 0.6699156507849693, + "num_tokens": 23376057.0, + "step": 3120 + }, + { + "entropy": 1.6769425210199858, + "epoch": 10.0, + "grad_norm": 4.373083591461182, + "learning_rate": 5.353849719114123e-12, + "loss": 1.1583, + "mean_token_accuracy": 0.6756294855945989, + "num_tokens": 23444400.0, + "step": 3130 + } + ], + "logging_steps": 10, + "max_steps": 3130, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + 
} + }, + "total_flos": 2.2785264648762163e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3130/training_args.bin b/checkpoint-3130/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ca9563f4064abfd2f36668559093a5f0763d7c85 --- /dev/null +++ b/checkpoint-3130/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88fbd415d023bc35b9e36c515e374642c961f2c3a428f0d0bba13a27d8c151a7 +size 6289 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..02ee80b6196926a5ad790a004d9efd6ab1ba6542 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8b0c7c141373ca36e5e819a28f60e146ccef652f --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", 
+ "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": 
"<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": 
"<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": 
"<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": 
"<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..ca9563f4064abfd2f36668559093a5f0763d7c85 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88fbd415d023bc35b9e36c515e374642c961f2c3a428f0d0bba13a27d8c151a7 +size 6289