diff --git a/.gitattributes b/.gitattributes index 52373fe24473b1aa44333d318f578ae6bf04b49b..26c416c25f9a1d72ed2556d2e4c9796a2c0df9d7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -34,3 +34,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1108/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1385/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1662/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-277/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-554/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-831/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 58a4061707bcc32db3b543936f6b650c01f3dccb..84338ed4fcff7c00153c3915ec07a24afd9e1bc7 100644 --- a/README.md +++ b/README.md @@ -1,208 +1,61 @@ --- base_model: openai/gpt-oss-20b library_name: peft +model_name: foamGPT tags: - base_model:adapter:openai/gpt-oss-20b - lora - sft - transformers - trl +licence: license --- -# Model Card for Model ID +# Model Card for foamGPT - +This model is a fine-tuned version of [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b). +It has been trained using [TRL](https://github.com/huggingface/trl). +## Quick start +```python +from transformers import pipeline -## Model Details +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` -### Model Description +## Training procedure - + +This model was trained with SFT. -- **Developed by:** [More Information Needed] -- **Funded by [optional]:** [More Information Needed] -- **Shared by [optional]:** [More Information Needed] -- **Model type:** [More Information Needed] -- **Language(s) (NLP):** [More Information Needed] -- **License:** [More Information Needed] -- **Finetuned from model [optional]:** [More Information Needed] - -### Model Sources [optional] - - - -- **Repository:** [More Information Needed] -- **Paper [optional]:** [More Information Needed] -- **Demo [optional]:** [More Information Needed] - -## Uses - - - -### Direct Use - - - -[More Information Needed] - -### Downstream Use [optional] - - - -[More Information Needed] - -### Out-of-Scope Use - - - -[More Information Needed] - -## Bias, Risks, and Limitations - - - -[More Information Needed] - -### Recommendations - - - -Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. - -## How to Get Started with the Model - -Use the code below to get started with the model. - -[More Information Needed] - -## Training Details - -### Training Data - - - -[More Information Needed] - -### Training Procedure - - - -#### Preprocessing [optional] - -[More Information Needed] - - -#### Training Hyperparameters - -- **Training regime:** [More Information Needed] - -#### Speeds, Sizes, Times [optional] - - - -[More Information Needed] - -## Evaluation - - - -### Testing Data, Factors & Metrics - -#### Testing Data - - - -[More Information Needed] - -#### Factors - - - -[More Information Needed] - -#### Metrics - - - -[More Information Needed] - -### Results - -[More Information Needed] - -#### Summary - - - -## Model Examination [optional] - - - -[More Information Needed] - -## Environmental Impact - - - -Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). - -- **Hardware Type:** [More Information Needed] -- **Hours used:** [More Information Needed] -- **Cloud Provider:** [More Information Needed] -- **Compute Region:** [More Information Needed] -- **Carbon Emitted:** [More Information Needed] - -## Technical Specifications [optional] - -### Model Architecture and Objective - -[More Information Needed] - -### Compute Infrastructure - -[More Information Needed] - -#### Hardware - -[More Information Needed] - -#### Software - -[More Information Needed] - -## Citation [optional] - - - -**BibTeX:** - -[More Information Needed] - -**APA:** - -[More Information Needed] - -## Glossary [optional] - - - -[More Information Needed] - -## More Information [optional] - -[More Information Needed] - -## Model Card Authors [optional] - -[More Information Needed] - -## Model Card Contact - -[More Information Needed] ### Framework versions -- PEFT 0.18.0 \ No newline at end of file +- PEFT 0.18.0 +- TRL: 0.25.1 +- Transformers: 4.57.3 +- Pytorch: 2.9.1+cu128 +- Datasets: 4.4.1 +- Tokenizers: 0.22.1 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json index f51c324b05bed2627ace076076f0c5b61b46e814..076480eaf349cc658de2eb00b26c7360a85f8f56 100644 --- a/adapter_config.json +++ b/adapter_config.json @@ -32,10 +32,10 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "o_proj", "k_proj", - "q_proj", - "v_proj" + "v_proj", + "o_proj", + "q_proj" ], "target_parameters": [ "7.mlp.experts.gate_up_proj", diff --git a/adapter_model.safetensors b/adapter_model.safetensors index 215c88d22704ae79acca1a487db41607742d4ef1..441ba7da699c3189ce9a3b4c7b9665bb477011ee 100644 --- a/adapter_model.safetensors +++ b/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cf684cd7e0aa4938a29e6a58379151433ed24442753e3c025b0b09cb31ef199 +oid sha256:cbb7e0563f99e663dc1295f65f2fd5fa33a2cf7adec8ea455a3bee433b491f61 size 60189176 diff --git a/checkpoint-1108/README.md b/checkpoint-1108/README.md new file mode 100644 index 0000000000000000000000000000000000000000..58a4061707bcc32db3b543936f6b650c01f3dccb --- /dev/null +++ b/checkpoint-1108/README.md @@ -0,0 +1,208 @@ +--- +base_model: openai/gpt-oss-20b +library_name: peft +tags: +- base_model:adapter:openai/gpt-oss-20b +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-1108/adapter_config.json b/checkpoint-1108/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..076480eaf349cc658de2eb00b26c7360a85f8f56 --- /dev/null +++ b/checkpoint-1108/adapter_config.json @@ -0,0 +1,53 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "openai/gpt-oss-20b", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "o_proj", + "q_proj" + ], + "target_parameters": [ + "7.mlp.experts.gate_up_proj", + "7.mlp.experts.down_proj", + "15.mlp.experts.gate_up_proj", + "15.mlp.experts.down_proj", + "23.mlp.experts.gate_up_proj", + "23.mlp.experts.down_proj" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1108/adapter_model.safetensors b/checkpoint-1108/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..945775ab6c9602626bb7a4f1e17452e62fa7a984 --- /dev/null +++ b/checkpoint-1108/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dafd5d56dcc3e532cf783a9a9664ca1ddbbab8889a87f246a6048baf6f49978 +size 60189176 diff --git a/checkpoint-1108/chat_template.jinja b/checkpoint-1108/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc7bb11927d29f653ba2740f2db2c688fd77592f --- /dev/null +++ b/checkpoint-1108/chat_template.jinja @@ -0,0 +1,331 @@ +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". + #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties %} + {{- "(_: {\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {%- if param_spec.description %} + {{- "// " + param_spec.description + "\n" }} + {%- endif %} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- ",\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %} + {%- endif %} + {{- model_identity + "\n" }} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {{- "\n\n" }} + {%- endif %} + {%- if tools -%} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {#- Checks to ensure the messages are being passed in the format we expect #} + {%- if "content" in message %} + {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "thinking" in message %} + {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "tool_calls" in message %} + {#- We need very careful handling here - we want to drop the tool call analysis message if the model #} + {#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #} + {#- when we render CoT/analysis messages in inference. #} + {%- set future_final_message = namespace(found=false) %} + {%- for future_message in loop_messages[loop.index:] %} + {%- if future_message.role == 'assistant' and "tool_calls" not in future_message %} + {%- set future_final_message.found = true %} + {%- endif %} + {%- endfor %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content and message.thinking %} + {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }} + {%- elif message.content and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- elif message.thinking and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." + tool_call.name + "<|channel|>commentary " }} + {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {%- if "thinking" in message %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- else %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." + last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- elif message.role == 'user' -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1108/optimizer.pt b/checkpoint-1108/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d6416f794f6f5205ff5f7d6855e12cfa119fec4 --- /dev/null +++ b/checkpoint-1108/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eab27e99f571469d30bdceb6bd34ea43d8f36041d31edfd559cc710be6b9311c +size 120498699 diff --git a/checkpoint-1108/rng_state.pth b/checkpoint-1108/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1fb544a66454d577cf2a3cb06c26493972f57952 --- /dev/null +++ b/checkpoint-1108/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdc37bbd2e979f041dfbbb004a5c74bab6cdda159cb18116df728588515a9ef6 +size 14645 diff --git a/checkpoint-1108/scheduler.pt b/checkpoint-1108/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..39f1f78d97abf88306085808b37142bfd45c7308 --- /dev/null +++ b/checkpoint-1108/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82b6948da532ab03ff90841030410ccf318e649fef964e20f9682e93245a316b +size 1465 diff --git a/checkpoint-1108/special_tokens_map.json b/checkpoint-1108/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c47e282982a9c6856832947a72ded329fad2e8c --- /dev/null +++ b/checkpoint-1108/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|return|>" +} diff --git a/checkpoint-1108/tokenizer.json b/checkpoint-1108/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/checkpoint-1108/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/checkpoint-1108/tokenizer_config.json b/checkpoint-1108/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e86f6faa71de0fc3afe47ea8984da9e6138c031c --- /dev/null +++ b/checkpoint-1108/tokenizer_config.json @@ -0,0 +1,183 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|return|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1108/trainer_state.json b/checkpoint-1108/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d7091b6264f635e335e3ed46cf619556aee31f3f --- /dev/null +++ b/checkpoint-1108/trainer_state.json @@ -0,0 +1,11158 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1108, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.358862280845642, + "epoch": 0.0036199095022624436, + "grad_norm": 2.292628288269043, + "learning_rate": 0.0, + "loss": 0.7311, + "mean_token_accuracy": 0.8534883409738541, + "num_tokens": 9316.0, + "step": 1 + }, + { + "entropy": 2.674945294857025, + "epoch": 0.007239819004524887, + "grad_norm": 3.8950836658477783, + "learning_rate": 1.0219999999999999e-05, + "loss": 1.0621, + "mean_token_accuracy": 0.8183160275220871, + "num_tokens": 17707.0, + "step": 2 + }, + { + "entropy": 2.4915525913238525, + "epoch": 0.01085972850678733, + "grad_norm": 2.792142868041992, + "learning_rate": 2.0439999999999997e-05, + "loss": 0.8448, + "mean_token_accuracy": 0.8489587754011154, + "num_tokens": 26783.0, + "step": 3 + }, + { + "entropy": 2.525622010231018, + "epoch": 0.014479638009049774, + "grad_norm": 2.7071900367736816, + "learning_rate": 3.0659999999999994e-05, + "loss": 0.8847, + "mean_token_accuracy": 0.8486668318510056, + "num_tokens": 35947.0, + "step": 4 + }, + { + "entropy": 2.588509976863861, + "epoch": 0.01809954751131222, + "grad_norm": 2.981574773788452, + "learning_rate": 4.0879999999999995e-05, + "loss": 1.0783, + "mean_token_accuracy": 0.8135111033916473, + "num_tokens": 44505.0, + "step": 5 + }, + { + "entropy": 2.662865400314331, + "epoch": 0.02171945701357466, + "grad_norm": 2.629283905029297, + "learning_rate": 5.1099999999999995e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.8152717798948288, + "num_tokens": 53140.0, + "step": 6 + }, + { + "entropy": 2.6662243604660034, + "epoch": 0.025339366515837104, + "grad_norm": 2.730058431625366, + "learning_rate": 6.131999999999999e-05, + "loss": 0.6982, + "mean_token_accuracy": 0.8552135527133942, + "num_tokens": 61932.0, + "step": 7 + }, + { + "entropy": 2.661384105682373, + "epoch": 0.02895927601809955, + "grad_norm": 2.562839984893799, + "learning_rate": 7.154e-05, + "loss": 0.7296, + "mean_token_accuracy": 0.8579540699720383, + "num_tokens": 70973.0, + "step": 8 + }, + { + "entropy": 2.7889368534088135, + "epoch": 0.03257918552036199, + "grad_norm": 2.8640544414520264, + "learning_rate": 8.175999999999999e-05, + "loss": 0.5965, + "mean_token_accuracy": 0.8638457208871841, + "num_tokens": 79977.0, + "step": 9 + }, + { + "entropy": 2.811532199382782, + "epoch": 0.03619909502262444, + "grad_norm": 2.6199426651000977, + "learning_rate": 9.197999999999998e-05, + "loss": 0.4819, + "mean_token_accuracy": 0.8786454051733017, + "num_tokens": 88915.0, + "step": 10 + }, + { + "entropy": 2.941167712211609, + "epoch": 0.039819004524886875, + "grad_norm": 1.2497272491455078, + "learning_rate": 0.00010219999999999999, + "loss": 0.7192, + "mean_token_accuracy": 0.841494083404541, + "num_tokens": 97749.0, + "step": 11 + }, + { + "entropy": 3.0547962188720703, + "epoch": 0.04343891402714932, + "grad_norm": 1.436136245727539, + "learning_rate": 0.00011241999999999998, + "loss": 0.5908, + "mean_token_accuracy": 0.8657624870538712, + "num_tokens": 106048.0, + "step": 12 + }, + { + "entropy": 2.9914053082466125, + "epoch": 0.047058823529411764, + "grad_norm": 0.9903654456138611, + "learning_rate": 0.00012263999999999998, + "loss": 0.4008, + "mean_token_accuracy": 0.8985499292612076, + "num_tokens": 115216.0, + "step": 13 + }, + { + "entropy": 3.1867465376853943, + "epoch": 0.05067873303167421, + "grad_norm": 1.019572377204895, + "learning_rate": 0.00013286, + "loss": 0.5062, + "mean_token_accuracy": 0.8893097043037415, + "num_tokens": 124040.0, + "step": 14 + }, + { + "entropy": 3.2431325912475586, + "epoch": 0.05429864253393665, + "grad_norm": 1.2394084930419922, + "learning_rate": 0.00014308, + "loss": 0.361, + "mean_token_accuracy": 0.9009967148303986, + "num_tokens": 132447.0, + "step": 15 + }, + { + "entropy": 3.1858643889427185, + "epoch": 0.0579185520361991, + "grad_norm": 0.9859603643417358, + "learning_rate": 0.00015329999999999999, + "loss": 0.4498, + "mean_token_accuracy": 0.887280747294426, + "num_tokens": 141228.0, + "step": 16 + }, + { + "entropy": 3.5029141902923584, + "epoch": 0.06153846153846154, + "grad_norm": 1.453957438468933, + "learning_rate": 0.00016351999999999998, + "loss": 0.4949, + "mean_token_accuracy": 0.888081505894661, + "num_tokens": 149789.0, + "step": 17 + }, + { + "entropy": 3.4572895765304565, + "epoch": 0.06515837104072399, + "grad_norm": 1.390377402305603, + "learning_rate": 0.00017374, + "loss": 0.5449, + "mean_token_accuracy": 0.8745045810937881, + "num_tokens": 157813.0, + "step": 18 + }, + { + "entropy": 3.3081750869750977, + "epoch": 0.06877828054298643, + "grad_norm": 1.1171791553497314, + "learning_rate": 0.00018395999999999997, + "loss": 0.4786, + "mean_token_accuracy": 0.8893420845270157, + "num_tokens": 166315.0, + "step": 19 + }, + { + "entropy": 3.3776715993881226, + "epoch": 0.07239819004524888, + "grad_norm": 1.5567998886108398, + "learning_rate": 0.00019418, + "loss": 0.3669, + "mean_token_accuracy": 0.9146632701158524, + "num_tokens": 175207.0, + "step": 20 + }, + { + "entropy": 3.2677870988845825, + "epoch": 0.0760180995475113, + "grad_norm": 1.7404611110687256, + "learning_rate": 0.00020439999999999998, + "loss": 0.5287, + "mean_token_accuracy": 0.8777483552694321, + "num_tokens": 183833.0, + "step": 21 + }, + { + "entropy": 3.313201069831848, + "epoch": 0.07963800904977375, + "grad_norm": 1.0836979150772095, + "learning_rate": 0.00021461999999999997, + "loss": 0.3014, + "mean_token_accuracy": 0.9215261936187744, + "num_tokens": 192591.0, + "step": 22 + }, + { + "entropy": 3.208672881126404, + "epoch": 0.0832579185520362, + "grad_norm": 1.2197301387786865, + "learning_rate": 0.00022483999999999997, + "loss": 0.4401, + "mean_token_accuracy": 0.9031257778406143, + "num_tokens": 201372.0, + "step": 23 + }, + { + "entropy": 3.1830995082855225, + "epoch": 0.08687782805429864, + "grad_norm": 1.2422229051589966, + "learning_rate": 0.00023506, + "loss": 0.5144, + "mean_token_accuracy": 0.8915928155183792, + "num_tokens": 210348.0, + "step": 24 + }, + { + "entropy": 3.085207223892212, + "epoch": 0.09049773755656108, + "grad_norm": 0.8987624049186707, + "learning_rate": 0.00024527999999999996, + "loss": 0.3253, + "mean_token_accuracy": 0.9221627116203308, + "num_tokens": 219131.0, + "step": 25 + }, + { + "entropy": 3.026031017303467, + "epoch": 0.09411764705882353, + "grad_norm": 1.0273475646972656, + "learning_rate": 0.0002555, + "loss": 0.3495, + "mean_token_accuracy": 0.9147634357213974, + "num_tokens": 228292.0, + "step": 26 + }, + { + "entropy": 3.0420032739639282, + "epoch": 0.09773755656108597, + "grad_norm": 1.0590945482254028, + "learning_rate": 0.00026572, + "loss": 0.4495, + "mean_token_accuracy": 0.9019353687763214, + "num_tokens": 236942.0, + "step": 27 + }, + { + "entropy": 3.0469263792037964, + "epoch": 0.10135746606334842, + "grad_norm": 0.9584959745407104, + "learning_rate": 0.00027594, + "loss": 0.405, + "mean_token_accuracy": 0.9216890782117844, + "num_tokens": 245543.0, + "step": 28 + }, + { + "entropy": 2.92683744430542, + "epoch": 0.10497737556561086, + "grad_norm": 0.8826628923416138, + "learning_rate": 0.00028616, + "loss": 0.4004, + "mean_token_accuracy": 0.9173285663127899, + "num_tokens": 254264.0, + "step": 29 + }, + { + "entropy": 3.0086968541145325, + "epoch": 0.1085972850678733, + "grad_norm": 0.8521863222122192, + "learning_rate": 0.00029637999999999995, + "loss": 0.2876, + "mean_token_accuracy": 0.9335231184959412, + "num_tokens": 263143.0, + "step": 30 + }, + { + "entropy": 2.9086623191833496, + "epoch": 0.11221719457013575, + "grad_norm": 0.7830919623374939, + "learning_rate": 0.00030659999999999997, + "loss": 0.548, + "mean_token_accuracy": 0.8831343650817871, + "num_tokens": 272055.0, + "step": 31 + }, + { + "entropy": 2.9730575680732727, + "epoch": 0.1158371040723982, + "grad_norm": 0.7217472195625305, + "learning_rate": 0.00031682, + "loss": 0.3564, + "mean_token_accuracy": 0.9119151830673218, + "num_tokens": 280971.0, + "step": 32 + }, + { + "entropy": 3.081720530986786, + "epoch": 0.11945701357466064, + "grad_norm": 0.8697704076766968, + "learning_rate": 0.00032703999999999996, + "loss": 0.334, + "mean_token_accuracy": 0.9234935492277145, + "num_tokens": 289449.0, + "step": 33 + }, + { + "entropy": 3.1043431162834167, + "epoch": 0.12307692307692308, + "grad_norm": 0.7962514758110046, + "learning_rate": 0.00033726, + "loss": 0.1602, + "mean_token_accuracy": 0.9554370939731598, + "num_tokens": 297804.0, + "step": 34 + }, + { + "entropy": 3.0275490283966064, + "epoch": 0.12669683257918551, + "grad_norm": 0.5887104272842407, + "learning_rate": 0.00034748, + "loss": 0.2254, + "mean_token_accuracy": 0.9491932094097137, + "num_tokens": 306589.0, + "step": 35 + }, + { + "entropy": 3.099652886390686, + "epoch": 0.13031674208144797, + "grad_norm": 0.894397497177124, + "learning_rate": 0.00035769999999999997, + "loss": 0.6397, + "mean_token_accuracy": 0.8802188038825989, + "num_tokens": 315534.0, + "step": 36 + }, + { + "entropy": 3.0312134623527527, + "epoch": 0.1339366515837104, + "grad_norm": 0.6374682188034058, + "learning_rate": 0.00036791999999999993, + "loss": 0.2183, + "mean_token_accuracy": 0.9478497952222824, + "num_tokens": 324492.0, + "step": 37 + }, + { + "entropy": 3.28497713804245, + "epoch": 0.13755656108597286, + "grad_norm": 0.6740968823432922, + "learning_rate": 0.00037813999999999995, + "loss": 0.3619, + "mean_token_accuracy": 0.9288723170757294, + "num_tokens": 333195.0, + "step": 38 + }, + { + "entropy": 3.1478323340415955, + "epoch": 0.1411764705882353, + "grad_norm": 0.7235494256019592, + "learning_rate": 0.00038836, + "loss": 0.324, + "mean_token_accuracy": 0.9179254025220871, + "num_tokens": 342028.0, + "step": 39 + }, + { + "entropy": 3.279879152774811, + "epoch": 0.14479638009049775, + "grad_norm": 0.7512595653533936, + "learning_rate": 0.00039858, + "loss": 0.4804, + "mean_token_accuracy": 0.889826312661171, + "num_tokens": 350902.0, + "step": 40 + }, + { + "entropy": 3.173546612262726, + "epoch": 0.14841628959276018, + "grad_norm": 0.6978861689567566, + "learning_rate": 0.00040879999999999996, + "loss": 0.3442, + "mean_token_accuracy": 0.9205169230699539, + "num_tokens": 359787.0, + "step": 41 + }, + { + "entropy": 3.2385765314102173, + "epoch": 0.1520361990950226, + "grad_norm": 0.8108944892883301, + "learning_rate": 0.00041901999999999993, + "loss": 0.4223, + "mean_token_accuracy": 0.8979178965091705, + "num_tokens": 368426.0, + "step": 42 + }, + { + "entropy": 3.146568477153778, + "epoch": 0.15565610859728507, + "grad_norm": 0.5847787261009216, + "learning_rate": 0.00042923999999999995, + "loss": 0.1953, + "mean_token_accuracy": 0.9556037336587906, + "num_tokens": 377349.0, + "step": 43 + }, + { + "entropy": 3.066233277320862, + "epoch": 0.1592760180995475, + "grad_norm": 0.7887329459190369, + "learning_rate": 0.00043945999999999997, + "loss": 0.6815, + "mean_token_accuracy": 0.8654293268918991, + "num_tokens": 386603.0, + "step": 44 + }, + { + "entropy": 3.1745981574058533, + "epoch": 0.16289592760180996, + "grad_norm": 0.7280165553092957, + "learning_rate": 0.00044967999999999994, + "loss": 0.1932, + "mean_token_accuracy": 0.9479279220104218, + "num_tokens": 395070.0, + "step": 45 + }, + { + "entropy": 3.1094446182250977, + "epoch": 0.1665158371040724, + "grad_norm": 0.6453448534011841, + "learning_rate": 0.00045989999999999996, + "loss": 0.2608, + "mean_token_accuracy": 0.9249396026134491, + "num_tokens": 403651.0, + "step": 46 + }, + { + "entropy": 2.9050925970077515, + "epoch": 0.17013574660633485, + "grad_norm": 0.6689278483390808, + "learning_rate": 0.00047012, + "loss": 0.4489, + "mean_token_accuracy": 0.898686870932579, + "num_tokens": 412898.0, + "step": 47 + }, + { + "entropy": 3.2239145040512085, + "epoch": 0.17375565610859728, + "grad_norm": 1.0014020204544067, + "learning_rate": 0.00048033999999999994, + "loss": 0.3234, + "mean_token_accuracy": 0.9231891483068466, + "num_tokens": 421420.0, + "step": 48 + }, + { + "entropy": 3.035899817943573, + "epoch": 0.17737556561085974, + "grad_norm": 0.6415768265724182, + "learning_rate": 0.0004905599999999999, + "loss": 0.2259, + "mean_token_accuracy": 0.9447792917490005, + "num_tokens": 430258.0, + "step": 49 + }, + { + "entropy": 3.057477653026581, + "epoch": 0.18099547511312217, + "grad_norm": 0.6042271256446838, + "learning_rate": 0.0005007799999999999, + "loss": 0.2228, + "mean_token_accuracy": 0.9473378211259842, + "num_tokens": 439593.0, + "step": 50 + }, + { + "entropy": 2.8375911116600037, + "epoch": 0.18461538461538463, + "grad_norm": 0.739811897277832, + "learning_rate": 0.000511, + "loss": 0.3623, + "mean_token_accuracy": 0.9050924181938171, + "num_tokens": 449056.0, + "step": 51 + }, + { + "entropy": 2.9926682114601135, + "epoch": 0.18823529411764706, + "grad_norm": 0.6637321710586548, + "learning_rate": 0.0005109995633102972, + "loss": 0.2924, + "mean_token_accuracy": 0.9397273659706116, + "num_tokens": 457677.0, + "step": 52 + }, + { + "entropy": 2.7932987809181213, + "epoch": 0.19185520361990951, + "grad_norm": 0.5666584372520447, + "learning_rate": 0.0005109982532428477, + "loss": 0.2055, + "mean_token_accuracy": 0.9385408014059067, + "num_tokens": 466969.0, + "step": 53 + }, + { + "entropy": 2.765812337398529, + "epoch": 0.19547511312217195, + "grad_norm": 0.7875120639801025, + "learning_rate": 0.0005109960698026271, + "loss": 0.4549, + "mean_token_accuracy": 0.9052814990282059, + "num_tokens": 476285.0, + "step": 54 + }, + { + "entropy": 2.884207248687744, + "epoch": 0.19909502262443438, + "grad_norm": 0.7538661956787109, + "learning_rate": 0.0005109930129979285, + "loss": 0.3751, + "mean_token_accuracy": 0.9210246652364731, + "num_tokens": 484668.0, + "step": 55 + }, + { + "entropy": 2.779718518257141, + "epoch": 0.20271493212669683, + "grad_norm": 0.8069296479225159, + "learning_rate": 0.0005109890828403621, + "loss": 0.3664, + "mean_token_accuracy": 0.9219843596220016, + "num_tokens": 493292.0, + "step": 56 + }, + { + "entropy": 2.841543674468994, + "epoch": 0.20633484162895926, + "grad_norm": 0.5545904636383057, + "learning_rate": 0.0005109842793448548, + "loss": 0.1973, + "mean_token_accuracy": 0.9547395706176758, + "num_tokens": 501973.0, + "step": 57 + }, + { + "entropy": 2.8180030584335327, + "epoch": 0.20995475113122172, + "grad_norm": 1.015456199645996, + "learning_rate": 0.0005109786025296513, + "loss": 0.6019, + "mean_token_accuracy": 0.88613361120224, + "num_tokens": 510840.0, + "step": 58 + }, + { + "entropy": 2.7450912594795227, + "epoch": 0.21357466063348415, + "grad_norm": 0.6784740686416626, + "learning_rate": 0.0005109720524163127, + "loss": 0.2868, + "mean_token_accuracy": 0.9295425117015839, + "num_tokens": 519656.0, + "step": 59 + }, + { + "entropy": 2.822400987148285, + "epoch": 0.2171945701357466, + "grad_norm": 0.8780149817466736, + "learning_rate": 0.000510964629029717, + "loss": 0.4371, + "mean_token_accuracy": 0.9089596569538116, + "num_tokens": 528105.0, + "step": 60 + }, + { + "entropy": 2.522100865840912, + "epoch": 0.22081447963800904, + "grad_norm": 0.51394122838974, + "learning_rate": 0.0005109563323980594, + "loss": 0.2509, + "mean_token_accuracy": 0.941976860165596, + "num_tokens": 537707.0, + "step": 61 + }, + { + "entropy": 2.6596657633781433, + "epoch": 0.2244343891402715, + "grad_norm": 0.6359816789627075, + "learning_rate": 0.0005109471625528516, + "loss": 0.3685, + "mean_token_accuracy": 0.9191890209913254, + "num_tokens": 546517.0, + "step": 62 + }, + { + "entropy": 2.800311803817749, + "epoch": 0.22805429864253393, + "grad_norm": 0.6862941980361938, + "learning_rate": 0.0005109371195289215, + "loss": 0.2457, + "mean_token_accuracy": 0.9330879002809525, + "num_tokens": 555493.0, + "step": 63 + }, + { + "entropy": 2.7235344648361206, + "epoch": 0.2316742081447964, + "grad_norm": 1.0464682579040527, + "learning_rate": 0.0005109262033644142, + "loss": 0.4417, + "mean_token_accuracy": 0.8957678377628326, + "num_tokens": 564255.0, + "step": 64 + }, + { + "entropy": 2.6643534302711487, + "epoch": 0.23529411764705882, + "grad_norm": 1.0790019035339355, + "learning_rate": 0.0005109144141007903, + "loss": 0.4947, + "mean_token_accuracy": 0.8889007717370987, + "num_tokens": 573401.0, + "step": 65 + }, + { + "entropy": 2.760925054550171, + "epoch": 0.23891402714932128, + "grad_norm": 0.7957189679145813, + "learning_rate": 0.0005109017517828273, + "loss": 0.2259, + "mean_token_accuracy": 0.944578230381012, + "num_tokens": 581905.0, + "step": 66 + }, + { + "entropy": 2.7048792839050293, + "epoch": 0.2425339366515837, + "grad_norm": 0.9530714750289917, + "learning_rate": 0.0005108882164586181, + "loss": 0.3122, + "mean_token_accuracy": 0.9257418513298035, + "num_tokens": 590802.0, + "step": 67 + }, + { + "entropy": 2.6733291149139404, + "epoch": 0.24615384615384617, + "grad_norm": 0.8295993208885193, + "learning_rate": 0.0005108738081795716, + "loss": 0.3701, + "mean_token_accuracy": 0.898589238524437, + "num_tokens": 599279.0, + "step": 68 + }, + { + "entropy": 2.5613606572151184, + "epoch": 0.2497737556561086, + "grad_norm": 0.6205935478210449, + "learning_rate": 0.0005108585270004123, + "loss": 0.4372, + "mean_token_accuracy": 0.9116007685661316, + "num_tokens": 608107.0, + "step": 69 + }, + { + "entropy": 2.458296835422516, + "epoch": 0.25339366515837103, + "grad_norm": 0.7629838585853577, + "learning_rate": 0.0005108423729791799, + "loss": 0.2307, + "mean_token_accuracy": 0.9386163502931595, + "num_tokens": 616881.0, + "step": 70 + }, + { + "entropy": 2.4176695346832275, + "epoch": 0.25701357466063346, + "grad_norm": 0.902400016784668, + "learning_rate": 0.0005108253461772298, + "loss": 0.2853, + "mean_token_accuracy": 0.9237343072891235, + "num_tokens": 625323.0, + "step": 71 + }, + { + "entropy": 2.2265281677246094, + "epoch": 0.26063348416289595, + "grad_norm": 0.7744383811950684, + "learning_rate": 0.0005108074466592316, + "loss": 0.2435, + "mean_token_accuracy": 0.9508260935544968, + "num_tokens": 634260.0, + "step": 72 + }, + { + "entropy": 2.1855952441692352, + "epoch": 0.2642533936651584, + "grad_norm": 0.8615190386772156, + "learning_rate": 0.0005107886744931702, + "loss": 0.3323, + "mean_token_accuracy": 0.9276078194379807, + "num_tokens": 643235.0, + "step": 73 + }, + { + "entropy": 2.179121494293213, + "epoch": 0.2678733031674208, + "grad_norm": 0.8953279256820679, + "learning_rate": 0.0005107690297503444, + "loss": 0.2384, + "mean_token_accuracy": 0.9425230622291565, + "num_tokens": 652032.0, + "step": 74 + }, + { + "entropy": 2.1565526127815247, + "epoch": 0.27149321266968324, + "grad_norm": 0.6830486059188843, + "learning_rate": 0.0005107485125053678, + "loss": 0.2759, + "mean_token_accuracy": 0.9360661953687668, + "num_tokens": 660978.0, + "step": 75 + }, + { + "entropy": 2.0900665521621704, + "epoch": 0.2751131221719457, + "grad_norm": 0.786665141582489, + "learning_rate": 0.0005107271228361672, + "loss": 0.4061, + "mean_token_accuracy": 0.910009115934372, + "num_tokens": 669817.0, + "step": 76 + }, + { + "entropy": 2.1311859488487244, + "epoch": 0.27873303167420815, + "grad_norm": 0.6399909853935242, + "learning_rate": 0.0005107048608239836, + "loss": 0.272, + "mean_token_accuracy": 0.9424714297056198, + "num_tokens": 678469.0, + "step": 77 + }, + { + "entropy": 2.059997320175171, + "epoch": 0.2823529411764706, + "grad_norm": 0.8114754557609558, + "learning_rate": 0.0005106817265533706, + "loss": 0.4029, + "mean_token_accuracy": 0.9037660360336304, + "num_tokens": 687261.0, + "step": 78 + }, + { + "entropy": 1.9725019037723541, + "epoch": 0.285972850678733, + "grad_norm": 0.9420941472053528, + "learning_rate": 0.0005106577201121952, + "loss": 0.535, + "mean_token_accuracy": 0.8996377140283585, + "num_tokens": 695941.0, + "step": 79 + }, + { + "entropy": 1.9951164424419403, + "epoch": 0.2895927601809955, + "grad_norm": 0.6476142406463623, + "learning_rate": 0.0005106328415916372, + "loss": 0.2242, + "mean_token_accuracy": 0.941379725933075, + "num_tokens": 704643.0, + "step": 80 + }, + { + "entropy": 1.8962564170360565, + "epoch": 0.29321266968325793, + "grad_norm": 0.5974630117416382, + "learning_rate": 0.0005106070910861881, + "loss": 0.2934, + "mean_token_accuracy": 0.9217697530984879, + "num_tokens": 713605.0, + "step": 81 + }, + { + "entropy": 1.9781515896320343, + "epoch": 0.29683257918552036, + "grad_norm": 0.8755478262901306, + "learning_rate": 0.0005105804686936518, + "loss": 0.4551, + "mean_token_accuracy": 0.9051328897476196, + "num_tokens": 722385.0, + "step": 82 + }, + { + "entropy": 1.9892418384552002, + "epoch": 0.3004524886877828, + "grad_norm": 0.6887345314025879, + "learning_rate": 0.0005105529745151433, + "loss": 0.244, + "mean_token_accuracy": 0.9261117279529572, + "num_tokens": 730962.0, + "step": 83 + }, + { + "entropy": 2.0053181648254395, + "epoch": 0.3040723981900452, + "grad_norm": 0.6930885910987854, + "learning_rate": 0.0005105246086550893, + "loss": 0.3155, + "mean_token_accuracy": 0.9206147193908691, + "num_tokens": 739499.0, + "step": 84 + }, + { + "entropy": 1.9716475903987885, + "epoch": 0.3076923076923077, + "grad_norm": 0.5049461722373962, + "learning_rate": 0.0005104953712212266, + "loss": 0.2215, + "mean_token_accuracy": 0.9608763605356216, + "num_tokens": 748604.0, + "step": 85 + }, + { + "entropy": 1.9186978042125702, + "epoch": 0.31131221719457014, + "grad_norm": 0.5756685733795166, + "learning_rate": 0.000510465262324603, + "loss": 0.2658, + "mean_token_accuracy": 0.9372887462377548, + "num_tokens": 757919.0, + "step": 86 + }, + { + "entropy": 1.9738290905952454, + "epoch": 0.31493212669683257, + "grad_norm": 0.6163789629936218, + "learning_rate": 0.0005104342820795758, + "loss": 0.2472, + "mean_token_accuracy": 0.9430449157953262, + "num_tokens": 766708.0, + "step": 87 + }, + { + "entropy": 2.1927571892738342, + "epoch": 0.318552036199095, + "grad_norm": 0.7953162789344788, + "learning_rate": 0.0005104024306038119, + "loss": 0.261, + "mean_token_accuracy": 0.9425829648971558, + "num_tokens": 774601.0, + "step": 88 + }, + { + "entropy": 2.043731451034546, + "epoch": 0.3221719457013575, + "grad_norm": 0.8098088502883911, + "learning_rate": 0.0005103697080182872, + "loss": 0.3126, + "mean_token_accuracy": 0.9158089309930801, + "num_tokens": 783170.0, + "step": 89 + }, + { + "entropy": 1.9801572561264038, + "epoch": 0.3257918552036199, + "grad_norm": 0.5227240920066833, + "learning_rate": 0.0005103361144472864, + "loss": 0.1291, + "mean_token_accuracy": 0.9666071832180023, + "num_tokens": 791769.0, + "step": 90 + }, + { + "entropy": 1.9553790986537933, + "epoch": 0.32941176470588235, + "grad_norm": 0.7819464206695557, + "learning_rate": 0.0005103016500184022, + "loss": 0.531, + "mean_token_accuracy": 0.8817111849784851, + "num_tokens": 800824.0, + "step": 91 + }, + { + "entropy": 1.9291303753852844, + "epoch": 0.3330316742081448, + "grad_norm": 0.7178757190704346, + "learning_rate": 0.0005102663148625347, + "loss": 0.3301, + "mean_token_accuracy": 0.9357631802558899, + "num_tokens": 809347.0, + "step": 92 + }, + { + "entropy": 1.9846041798591614, + "epoch": 0.33665158371040727, + "grad_norm": 1.316636085510254, + "learning_rate": 0.0005102301091138916, + "loss": 0.4241, + "mean_token_accuracy": 0.8993304669857025, + "num_tokens": 817174.0, + "step": 93 + }, + { + "entropy": 1.814637303352356, + "epoch": 0.3402714932126697, + "grad_norm": 0.5486414432525635, + "learning_rate": 0.0005101930329099865, + "loss": 0.116, + "mean_token_accuracy": 0.9674727618694305, + "num_tokens": 826177.0, + "step": 94 + }, + { + "entropy": 1.9128066003322601, + "epoch": 0.3438914027149321, + "grad_norm": 0.620303750038147, + "learning_rate": 0.00051015508639164, + "loss": 0.1833, + "mean_token_accuracy": 0.9569521993398666, + "num_tokens": 835409.0, + "step": 95 + }, + { + "entropy": 1.7541870176792145, + "epoch": 0.34751131221719456, + "grad_norm": 0.8337438702583313, + "learning_rate": 0.0005101162697029776, + "loss": 0.3327, + "mean_token_accuracy": 0.9193180054426193, + "num_tokens": 844692.0, + "step": 96 + }, + { + "entropy": 1.8255240619182587, + "epoch": 0.351131221719457, + "grad_norm": 0.877780556678772, + "learning_rate": 0.00051007658299143, + "loss": 0.2106, + "mean_token_accuracy": 0.9527023881673813, + "num_tokens": 853309.0, + "step": 97 + }, + { + "entropy": 1.8611579239368439, + "epoch": 0.3547511312217195, + "grad_norm": 1.0667716264724731, + "learning_rate": 0.0005100360264077325, + "loss": 0.3196, + "mean_token_accuracy": 0.9195879399776459, + "num_tokens": 861859.0, + "step": 98 + }, + { + "entropy": 1.821915864944458, + "epoch": 0.3583710407239819, + "grad_norm": 0.8400309681892395, + "learning_rate": 0.0005099946001059241, + "loss": 0.4036, + "mean_token_accuracy": 0.8951036781072617, + "num_tokens": 871060.0, + "step": 99 + }, + { + "entropy": 1.7648265063762665, + "epoch": 0.36199095022624433, + "grad_norm": 1.1391404867172241, + "learning_rate": 0.0005099523042433472, + "loss": 0.389, + "mean_token_accuracy": 0.901309460401535, + "num_tokens": 880593.0, + "step": 100 + }, + { + "entropy": 1.8506875336170197, + "epoch": 0.36561085972850677, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.000509909138980647, + "loss": 0.2504, + "mean_token_accuracy": 0.9384842216968536, + "num_tokens": 889739.0, + "step": 101 + }, + { + "entropy": 1.9311015605926514, + "epoch": 0.36923076923076925, + "grad_norm": 0.9677391052246094, + "learning_rate": 0.0005098651044817704, + "loss": 0.6953, + "mean_token_accuracy": 0.8752655684947968, + "num_tokens": 898992.0, + "step": 102 + }, + { + "entropy": 1.9590983986854553, + "epoch": 0.3728506787330317, + "grad_norm": 0.6364567279815674, + "learning_rate": 0.0005098202009139663, + "loss": 0.4318, + "mean_token_accuracy": 0.9056479930877686, + "num_tokens": 908225.0, + "step": 103 + }, + { + "entropy": 1.9455370008945465, + "epoch": 0.3764705882352941, + "grad_norm": 0.6747863292694092, + "learning_rate": 0.0005097744284477839, + "loss": 0.244, + "mean_token_accuracy": 0.9428392052650452, + "num_tokens": 917134.0, + "step": 104 + }, + { + "entropy": 1.8632825911045074, + "epoch": 0.38009049773755654, + "grad_norm": 0.5705651044845581, + "learning_rate": 0.0005097277872570731, + "loss": 0.2508, + "mean_token_accuracy": 0.9325222969055176, + "num_tokens": 926573.0, + "step": 105 + }, + { + "entropy": 1.9370323717594147, + "epoch": 0.38371040723981903, + "grad_norm": 0.6298627853393555, + "learning_rate": 0.000509680277518983, + "loss": 0.2481, + "mean_token_accuracy": 0.9281332045793533, + "num_tokens": 935853.0, + "step": 106 + }, + { + "entropy": 2.0217572450637817, + "epoch": 0.38733031674208146, + "grad_norm": 0.5434353947639465, + "learning_rate": 0.0005096318994139617, + "loss": 0.1809, + "mean_token_accuracy": 0.9592084139585495, + "num_tokens": 944279.0, + "step": 107 + }, + { + "entropy": 1.9619770646095276, + "epoch": 0.3909502262443439, + "grad_norm": 0.6959638595581055, + "learning_rate": 0.0005095826531257552, + "loss": 0.1376, + "mean_token_accuracy": 0.9608310014009476, + "num_tokens": 953336.0, + "step": 108 + }, + { + "entropy": 2.12511146068573, + "epoch": 0.3945701357466063, + "grad_norm": 1.0152848958969116, + "learning_rate": 0.0005095325388414074, + "loss": 0.4382, + "mean_token_accuracy": 0.915201798081398, + "num_tokens": 962002.0, + "step": 109 + }, + { + "entropy": 2.0171878039836884, + "epoch": 0.39819004524886875, + "grad_norm": 0.8337467312812805, + "learning_rate": 0.0005094815567512587, + "loss": 0.2672, + "mean_token_accuracy": 0.9313560128211975, + "num_tokens": 970954.0, + "step": 110 + }, + { + "entropy": 2.1024146378040314, + "epoch": 0.40180995475113124, + "grad_norm": 0.8214333057403564, + "learning_rate": 0.0005094297070489455, + "loss": 0.3146, + "mean_token_accuracy": 0.9289091974496841, + "num_tokens": 979929.0, + "step": 111 + }, + { + "entropy": 2.260519325733185, + "epoch": 0.40542986425339367, + "grad_norm": 1.1298810243606567, + "learning_rate": 0.0005093769899313996, + "loss": 0.3055, + "mean_token_accuracy": 0.9213490188121796, + "num_tokens": 988477.0, + "step": 112 + }, + { + "entropy": 2.2228699326515198, + "epoch": 0.4090497737556561, + "grad_norm": 0.8601953983306885, + "learning_rate": 0.0005093234055988475, + "loss": 0.2738, + "mean_token_accuracy": 0.920888364315033, + "num_tokens": 997091.0, + "step": 113 + }, + { + "entropy": 2.2165185809135437, + "epoch": 0.41266968325791853, + "grad_norm": 0.6331561803817749, + "learning_rate": 0.0005092689542548091, + "loss": 0.2241, + "mean_token_accuracy": 0.9408514499664307, + "num_tokens": 1005866.0, + "step": 114 + }, + { + "entropy": 2.324040472507477, + "epoch": 0.416289592760181, + "grad_norm": 0.680496096611023, + "learning_rate": 0.0005092136361060975, + "loss": 0.2454, + "mean_token_accuracy": 0.9433349967002869, + "num_tokens": 1014277.0, + "step": 115 + }, + { + "entropy": 2.413789749145508, + "epoch": 0.41990950226244345, + "grad_norm": 0.7489557862281799, + "learning_rate": 0.0005091574513628183, + "loss": 0.2856, + "mean_token_accuracy": 0.934124082326889, + "num_tokens": 1023032.0, + "step": 116 + }, + { + "entropy": 2.4693005681037903, + "epoch": 0.4235294117647059, + "grad_norm": 0.6842612624168396, + "learning_rate": 0.0005091004002383682, + "loss": 0.2778, + "mean_token_accuracy": 0.9386793673038483, + "num_tokens": 1031883.0, + "step": 117 + }, + { + "entropy": 2.4351969361305237, + "epoch": 0.4271493212669683, + "grad_norm": 0.9150674343109131, + "learning_rate": 0.0005090424829494347, + "loss": 0.3151, + "mean_token_accuracy": 0.9177709072828293, + "num_tokens": 1040985.0, + "step": 118 + }, + { + "entropy": 2.5141562819480896, + "epoch": 0.4307692307692308, + "grad_norm": 1.0200655460357666, + "learning_rate": 0.000508983699715995, + "loss": 0.5134, + "mean_token_accuracy": 0.8835459351539612, + "num_tokens": 1049949.0, + "step": 119 + }, + { + "entropy": 2.479240596294403, + "epoch": 0.4343891402714932, + "grad_norm": 0.783278226852417, + "learning_rate": 0.0005089240507613151, + "loss": 0.2745, + "mean_token_accuracy": 0.9389322698116302, + "num_tokens": 1058953.0, + "step": 120 + }, + { + "entropy": 2.457803785800934, + "epoch": 0.43800904977375565, + "grad_norm": 0.7620834112167358, + "learning_rate": 0.0005088635363119497, + "loss": 0.3394, + "mean_token_accuracy": 0.9145695865154266, + "num_tokens": 1068624.0, + "step": 121 + }, + { + "entropy": 2.4909247756004333, + "epoch": 0.4416289592760181, + "grad_norm": 0.5868712067604065, + "learning_rate": 0.0005088021565977403, + "loss": 0.1726, + "mean_token_accuracy": 0.9567564129829407, + "num_tokens": 1077686.0, + "step": 122 + }, + { + "entropy": 2.5540462732315063, + "epoch": 0.4452488687782805, + "grad_norm": 1.1467291116714478, + "learning_rate": 0.0005087399118518148, + "loss": 0.2617, + "mean_token_accuracy": 0.9329706132411957, + "num_tokens": 1086230.0, + "step": 123 + }, + { + "entropy": 2.377680242061615, + "epoch": 0.448868778280543, + "grad_norm": 0.7021825909614563, + "learning_rate": 0.0005086768023105866, + "loss": 0.4124, + "mean_token_accuracy": 0.9093360006809235, + "num_tokens": 1095867.0, + "step": 124 + }, + { + "entropy": 2.55239599943161, + "epoch": 0.45248868778280543, + "grad_norm": 0.5947801470756531, + "learning_rate": 0.0005086128282137538, + "loss": 0.2752, + "mean_token_accuracy": 0.9248816668987274, + "num_tokens": 1105003.0, + "step": 125 + }, + { + "entropy": 2.4695483446121216, + "epoch": 0.45610859728506786, + "grad_norm": 1.345604658126831, + "learning_rate": 0.0005085479898042985, + "loss": 0.2577, + "mean_token_accuracy": 0.9318550229072571, + "num_tokens": 1114162.0, + "step": 126 + }, + { + "entropy": 2.4898732900619507, + "epoch": 0.4597285067873303, + "grad_norm": 0.8534179329872131, + "learning_rate": 0.0005084822873284848, + "loss": 0.3013, + "mean_token_accuracy": 0.9195661097764969, + "num_tokens": 1123457.0, + "step": 127 + }, + { + "entropy": 2.5951223969459534, + "epoch": 0.4633484162895928, + "grad_norm": 1.1677368879318237, + "learning_rate": 0.0005084157210358592, + "loss": 0.1612, + "mean_token_accuracy": 0.9599333852529526, + "num_tokens": 1131774.0, + "step": 128 + }, + { + "entropy": 2.7315847873687744, + "epoch": 0.4669683257918552, + "grad_norm": 0.7633224129676819, + "learning_rate": 0.0005083482911792492, + "loss": 0.2437, + "mean_token_accuracy": 0.9487509876489639, + "num_tokens": 1140301.0, + "step": 129 + }, + { + "entropy": 2.6348633766174316, + "epoch": 0.47058823529411764, + "grad_norm": 0.7573317885398865, + "learning_rate": 0.0005082799980147617, + "loss": 0.2426, + "mean_token_accuracy": 0.947308748960495, + "num_tokens": 1148929.0, + "step": 130 + }, + { + "entropy": 2.60002738237381, + "epoch": 0.47420814479638007, + "grad_norm": 1.8195319175720215, + "learning_rate": 0.0005082108418017829, + "loss": 0.1792, + "mean_token_accuracy": 0.9512491375207901, + "num_tokens": 1157682.0, + "step": 131 + }, + { + "entropy": 2.5319923162460327, + "epoch": 0.47782805429864256, + "grad_norm": 0.6342993378639221, + "learning_rate": 0.0005081408228029771, + "loss": 0.1843, + "mean_token_accuracy": 0.9440758228302002, + "num_tokens": 1166687.0, + "step": 132 + }, + { + "entropy": 2.5666881799697876, + "epoch": 0.481447963800905, + "grad_norm": 0.8979415893554688, + "learning_rate": 0.0005080699412842852, + "loss": 0.4824, + "mean_token_accuracy": 0.8837443292140961, + "num_tokens": 1175746.0, + "step": 133 + }, + { + "entropy": 2.6854636669158936, + "epoch": 0.4850678733031674, + "grad_norm": 0.8302125334739685, + "learning_rate": 0.0005079981975149243, + "loss": 0.267, + "mean_token_accuracy": 0.9279022663831711, + "num_tokens": 1184196.0, + "step": 134 + }, + { + "entropy": 2.564552128314972, + "epoch": 0.48868778280542985, + "grad_norm": 0.6785959005355835, + "learning_rate": 0.0005079255917673863, + "loss": 0.2031, + "mean_token_accuracy": 0.9463823586702347, + "num_tokens": 1192982.0, + "step": 135 + }, + { + "entropy": 2.673682928085327, + "epoch": 0.49230769230769234, + "grad_norm": 1.4760410785675049, + "learning_rate": 0.0005078521243174371, + "loss": 0.4791, + "mean_token_accuracy": 0.8969505727291107, + "num_tokens": 1201454.0, + "step": 136 + }, + { + "entropy": 2.6232714653015137, + "epoch": 0.49592760180995477, + "grad_norm": 0.7845668792724609, + "learning_rate": 0.0005077777954441157, + "loss": 0.2472, + "mean_token_accuracy": 0.9404618591070175, + "num_tokens": 1210182.0, + "step": 137 + }, + { + "entropy": 2.5614060163497925, + "epoch": 0.4995475113122172, + "grad_norm": 0.725419819355011, + "learning_rate": 0.0005077026054297322, + "loss": 0.3643, + "mean_token_accuracy": 0.9193316847085953, + "num_tokens": 1219487.0, + "step": 138 + }, + { + "entropy": 2.5907246470451355, + "epoch": 0.5031674208144796, + "grad_norm": 0.7741782665252686, + "learning_rate": 0.0005076265545598682, + "loss": 0.276, + "mean_token_accuracy": 0.9447730481624603, + "num_tokens": 1228066.0, + "step": 139 + }, + { + "entropy": 2.531104028224945, + "epoch": 0.5067873303167421, + "grad_norm": 0.680992603302002, + "learning_rate": 0.0005075496431233745, + "loss": 0.2004, + "mean_token_accuracy": 0.9470729678869247, + "num_tokens": 1236980.0, + "step": 140 + }, + { + "entropy": 2.590231478214264, + "epoch": 0.5104072398190045, + "grad_norm": 0.8260406255722046, + "learning_rate": 0.0005074718714123704, + "loss": 0.2756, + "mean_token_accuracy": 0.9301882535219193, + "num_tokens": 1245565.0, + "step": 141 + }, + { + "entropy": 2.4858668446540833, + "epoch": 0.5140271493212669, + "grad_norm": 0.8085922598838806, + "learning_rate": 0.0005073932397222429, + "loss": 0.2314, + "mean_token_accuracy": 0.9449103325605392, + "num_tokens": 1254366.0, + "step": 142 + }, + { + "entropy": 2.5374304056167603, + "epoch": 0.5176470588235295, + "grad_norm": 0.7858129143714905, + "learning_rate": 0.0005073137483516452, + "loss": 0.1622, + "mean_token_accuracy": 0.9510673582553864, + "num_tokens": 1263197.0, + "step": 143 + }, + { + "entropy": 2.608425199985504, + "epoch": 0.5212669683257919, + "grad_norm": 1.2698506116867065, + "learning_rate": 0.0005072333976024957, + "loss": 0.1729, + "mean_token_accuracy": 0.9509973376989365, + "num_tokens": 1271725.0, + "step": 144 + }, + { + "entropy": 2.437038242816925, + "epoch": 0.5248868778280543, + "grad_norm": 1.0788538455963135, + "learning_rate": 0.0005071521877799765, + "loss": 0.3344, + "mean_token_accuracy": 0.9166721999645233, + "num_tokens": 1280963.0, + "step": 145 + }, + { + "entropy": 2.589951515197754, + "epoch": 0.5285067873303168, + "grad_norm": 0.9228294491767883, + "learning_rate": 0.0005070701191925332, + "loss": 0.3095, + "mean_token_accuracy": 0.9239777624607086, + "num_tokens": 1289683.0, + "step": 146 + }, + { + "entropy": 2.575794994831085, + "epoch": 0.5321266968325792, + "grad_norm": 1.359767198562622, + "learning_rate": 0.0005069871921518726, + "loss": 0.2447, + "mean_token_accuracy": 0.9374738186597824, + "num_tokens": 1298397.0, + "step": 147 + }, + { + "entropy": 2.5628358721733093, + "epoch": 0.5357466063348416, + "grad_norm": 0.9870713353157043, + "learning_rate": 0.000506903406972962, + "loss": 0.4824, + "mean_token_accuracy": 0.9027767181396484, + "num_tokens": 1307191.0, + "step": 148 + }, + { + "entropy": 2.5513240098953247, + "epoch": 0.539366515837104, + "grad_norm": 0.7921387553215027, + "learning_rate": 0.0005068187639740286, + "loss": 0.3278, + "mean_token_accuracy": 0.9161934554576874, + "num_tokens": 1315878.0, + "step": 149 + }, + { + "entropy": 2.526439070701599, + "epoch": 0.5429864253393665, + "grad_norm": 0.6320391297340393, + "learning_rate": 0.000506733263476557, + "loss": 0.1701, + "mean_token_accuracy": 0.9575318098068237, + "num_tokens": 1324786.0, + "step": 150 + }, + { + "entropy": 2.4837265014648438, + "epoch": 0.5466063348416289, + "grad_norm": 0.5369354486465454, + "learning_rate": 0.000506646905805289, + "loss": 0.1328, + "mean_token_accuracy": 0.9636050164699554, + "num_tokens": 1333766.0, + "step": 151 + }, + { + "entropy": 2.5264737010002136, + "epoch": 0.5502262443438914, + "grad_norm": 0.7346852421760559, + "learning_rate": 0.0005065596912882222, + "loss": 0.2012, + "mean_token_accuracy": 0.9448132663965225, + "num_tokens": 1343004.0, + "step": 152 + }, + { + "entropy": 2.569309651851654, + "epoch": 0.5538461538461539, + "grad_norm": 0.9926508069038391, + "learning_rate": 0.0005064716202566082, + "loss": 0.2831, + "mean_token_accuracy": 0.9332023113965988, + "num_tokens": 1351561.0, + "step": 153 + }, + { + "entropy": 2.3148274421691895, + "epoch": 0.5574660633484163, + "grad_norm": 0.6301954984664917, + "learning_rate": 0.0005063826930449523, + "loss": 0.3622, + "mean_token_accuracy": 0.9349419325590134, + "num_tokens": 1360997.0, + "step": 154 + }, + { + "entropy": 2.497675657272339, + "epoch": 0.5610859728506787, + "grad_norm": 0.8846175670623779, + "learning_rate": 0.000506292909991011, + "loss": 0.2314, + "mean_token_accuracy": 0.9468862265348434, + "num_tokens": 1369600.0, + "step": 155 + }, + { + "entropy": 2.313987612724304, + "epoch": 0.5647058823529412, + "grad_norm": 0.5701894164085388, + "learning_rate": 0.0005062022714357922, + "loss": 0.2154, + "mean_token_accuracy": 0.945093959569931, + "num_tokens": 1379125.0, + "step": 156 + }, + { + "entropy": 2.4019755125045776, + "epoch": 0.5683257918552036, + "grad_norm": 0.8769335746765137, + "learning_rate": 0.0005061107777235524, + "loss": 0.3565, + "mean_token_accuracy": 0.9133864492177963, + "num_tokens": 1388111.0, + "step": 157 + }, + { + "entropy": 2.3127577900886536, + "epoch": 0.571945701357466, + "grad_norm": 1.1026453971862793, + "learning_rate": 0.0005060184292017965, + "loss": 0.2897, + "mean_token_accuracy": 0.899736076593399, + "num_tokens": 1397528.0, + "step": 158 + }, + { + "entropy": 2.2682697772979736, + "epoch": 0.5755656108597285, + "grad_norm": 0.5426591038703918, + "learning_rate": 0.000505925226221276, + "loss": 0.167, + "mean_token_accuracy": 0.9609879851341248, + "num_tokens": 1406809.0, + "step": 159 + }, + { + "entropy": 2.4639336466789246, + "epoch": 0.579185520361991, + "grad_norm": 0.6552363038063049, + "learning_rate": 0.0005058311691359875, + "loss": 0.2511, + "mean_token_accuracy": 0.9355164766311646, + "num_tokens": 1415498.0, + "step": 160 + }, + { + "entropy": 2.467900663614273, + "epoch": 0.5828054298642534, + "grad_norm": 0.7168154120445251, + "learning_rate": 0.000505736258303172, + "loss": 0.234, + "mean_token_accuracy": 0.9450509995222092, + "num_tokens": 1424524.0, + "step": 161 + }, + { + "entropy": 2.3683157563209534, + "epoch": 0.5864253393665159, + "grad_norm": 0.6433501839637756, + "learning_rate": 0.0005056404940833128, + "loss": 0.3441, + "mean_token_accuracy": 0.9261108189821243, + "num_tokens": 1434194.0, + "step": 162 + }, + { + "entropy": 2.4686295986175537, + "epoch": 0.5900452488687783, + "grad_norm": 0.9615177512168884, + "learning_rate": 0.0005055438768401348, + "loss": 0.1492, + "mean_token_accuracy": 0.966903567314148, + "num_tokens": 1442972.0, + "step": 163 + }, + { + "entropy": 2.5551892518997192, + "epoch": 0.5936651583710407, + "grad_norm": 0.4957484006881714, + "learning_rate": 0.0005054464069406023, + "loss": 0.1242, + "mean_token_accuracy": 0.969713419675827, + "num_tokens": 1451324.0, + "step": 164 + }, + { + "entropy": 2.554121434688568, + "epoch": 0.5972850678733032, + "grad_norm": 0.7399498224258423, + "learning_rate": 0.0005053480847549187, + "loss": 0.206, + "mean_token_accuracy": 0.9498797357082367, + "num_tokens": 1459698.0, + "step": 165 + }, + { + "entropy": 2.5181015729904175, + "epoch": 0.6009049773755656, + "grad_norm": 0.7433251142501831, + "learning_rate": 0.0005052489106565241, + "loss": 0.2883, + "mean_token_accuracy": 0.9419967085123062, + "num_tokens": 1468460.0, + "step": 166 + }, + { + "entropy": 2.3073930144309998, + "epoch": 0.604524886877828, + "grad_norm": 0.5920398831367493, + "learning_rate": 0.0005051488850220941, + "loss": 0.197, + "mean_token_accuracy": 0.952111005783081, + "num_tokens": 1477579.0, + "step": 167 + }, + { + "entropy": 2.532376289367676, + "epoch": 0.6081447963800904, + "grad_norm": 0.7033098936080933, + "learning_rate": 0.0005050480082315392, + "loss": 0.2122, + "mean_token_accuracy": 0.9488633275032043, + "num_tokens": 1486307.0, + "step": 168 + }, + { + "entropy": 2.397290349006653, + "epoch": 0.611764705882353, + "grad_norm": 0.8026869893074036, + "learning_rate": 0.0005049462806680021, + "loss": 0.2541, + "mean_token_accuracy": 0.9427233040332794, + "num_tokens": 1495152.0, + "step": 169 + }, + { + "entropy": 2.464823842048645, + "epoch": 0.6153846153846154, + "grad_norm": 0.6508225798606873, + "learning_rate": 0.0005048437027178571, + "loss": 0.2639, + "mean_token_accuracy": 0.9391255974769592, + "num_tokens": 1503903.0, + "step": 170 + }, + { + "entropy": 2.520734131336212, + "epoch": 0.6190045248868778, + "grad_norm": 0.8373616337776184, + "learning_rate": 0.0005047402747707084, + "loss": 0.3078, + "mean_token_accuracy": 0.9302930980920792, + "num_tokens": 1512588.0, + "step": 171 + }, + { + "entropy": 2.388108015060425, + "epoch": 0.6226244343891403, + "grad_norm": 0.6334089636802673, + "learning_rate": 0.0005046359972193884, + "loss": 0.1372, + "mean_token_accuracy": 0.9666119515895844, + "num_tokens": 1522011.0, + "step": 172 + }, + { + "entropy": 2.537126660346985, + "epoch": 0.6262443438914027, + "grad_norm": 0.7665116190910339, + "learning_rate": 0.0005045308704599566, + "loss": 0.2603, + "mean_token_accuracy": 0.9350012242794037, + "num_tokens": 1530767.0, + "step": 173 + }, + { + "entropy": 2.567205488681793, + "epoch": 0.6298642533936651, + "grad_norm": 0.8043875098228455, + "learning_rate": 0.0005044248948916977, + "loss": 0.2497, + "mean_token_accuracy": 0.9400482773780823, + "num_tokens": 1539971.0, + "step": 174 + }, + { + "entropy": 2.585887610912323, + "epoch": 0.6334841628959276, + "grad_norm": 0.5282150506973267, + "learning_rate": 0.0005043180709171206, + "loss": 0.1126, + "mean_token_accuracy": 0.9680279046297073, + "num_tokens": 1548971.0, + "step": 175 + }, + { + "entropy": 2.4289392232894897, + "epoch": 0.63710407239819, + "grad_norm": 0.6838382482528687, + "learning_rate": 0.0005042103989419563, + "loss": 0.2076, + "mean_token_accuracy": 0.9468046277761459, + "num_tokens": 1558403.0, + "step": 176 + }, + { + "entropy": 2.6080575585365295, + "epoch": 0.6407239819004525, + "grad_norm": 0.9058650732040405, + "learning_rate": 0.0005041018793751566, + "loss": 0.1781, + "mean_token_accuracy": 0.9432647377252579, + "num_tokens": 1567209.0, + "step": 177 + }, + { + "entropy": 2.5212480425834656, + "epoch": 0.644343891402715, + "grad_norm": 0.796381950378418, + "learning_rate": 0.0005039925126288929, + "loss": 0.2286, + "mean_token_accuracy": 0.9305787235498428, + "num_tokens": 1576255.0, + "step": 178 + }, + { + "entropy": 2.588195264339447, + "epoch": 0.6479638009049774, + "grad_norm": 0.6489388942718506, + "learning_rate": 0.0005038822991185536, + "loss": 0.1717, + "mean_token_accuracy": 0.9572225511074066, + "num_tokens": 1585335.0, + "step": 179 + }, + { + "entropy": 2.609215259552002, + "epoch": 0.6515837104072398, + "grad_norm": 0.8551130294799805, + "learning_rate": 0.0005037712392627441, + "loss": 0.2358, + "mean_token_accuracy": 0.9529621452093124, + "num_tokens": 1594354.0, + "step": 180 + }, + { + "entropy": 2.4199504256248474, + "epoch": 0.6552036199095023, + "grad_norm": 0.5775637030601501, + "learning_rate": 0.0005036593334832836, + "loss": 0.2402, + "mean_token_accuracy": 0.9437069743871689, + "num_tokens": 1603750.0, + "step": 181 + }, + { + "entropy": 2.516424596309662, + "epoch": 0.6588235294117647, + "grad_norm": 0.6967942118644714, + "learning_rate": 0.0005035465822052047, + "loss": 0.1624, + "mean_token_accuracy": 0.9518167823553085, + "num_tokens": 1612474.0, + "step": 182 + }, + { + "entropy": 2.463354170322418, + "epoch": 0.6624434389140271, + "grad_norm": 0.49672600626945496, + "learning_rate": 0.000503432985856751, + "loss": 0.1654, + "mean_token_accuracy": 0.9564716964960098, + "num_tokens": 1621563.0, + "step": 183 + }, + { + "entropy": 2.4456416964530945, + "epoch": 0.6660633484162896, + "grad_norm": 0.6207183003425598, + "learning_rate": 0.000503318544869376, + "loss": 0.1918, + "mean_token_accuracy": 0.9476529806852341, + "num_tokens": 1630801.0, + "step": 184 + }, + { + "entropy": 2.641440451145172, + "epoch": 0.669683257918552, + "grad_norm": 1.220821499824524, + "learning_rate": 0.000503203259677741, + "loss": 0.4019, + "mean_token_accuracy": 0.9172120243310928, + "num_tokens": 1639522.0, + "step": 185 + }, + { + "entropy": 2.6447275280952454, + "epoch": 0.6733031674208145, + "grad_norm": 0.7546490430831909, + "learning_rate": 0.000503087130719714, + "loss": 0.2484, + "mean_token_accuracy": 0.9387800246477127, + "num_tokens": 1647964.0, + "step": 186 + }, + { + "entropy": 2.4657886028289795, + "epoch": 0.676923076923077, + "grad_norm": 0.7679230570793152, + "learning_rate": 0.0005029701584363675, + "loss": 0.2659, + "mean_token_accuracy": 0.930300235748291, + "num_tokens": 1657181.0, + "step": 187 + }, + { + "entropy": 2.37973552942276, + "epoch": 0.6805429864253394, + "grad_norm": 0.7473414540290833, + "learning_rate": 0.0005028523432719772, + "loss": 0.32, + "mean_token_accuracy": 0.9233052879571915, + "num_tokens": 1666477.0, + "step": 188 + }, + { + "entropy": 2.5238219499588013, + "epoch": 0.6841628959276018, + "grad_norm": 0.5573673248291016, + "learning_rate": 0.0005027336856740201, + "loss": 0.1846, + "mean_token_accuracy": 0.9445535093545914, + "num_tokens": 1675002.0, + "step": 189 + }, + { + "entropy": 2.456815242767334, + "epoch": 0.6877828054298643, + "grad_norm": 0.47237634658813477, + "learning_rate": 0.0005026141860931728, + "loss": 0.1065, + "mean_token_accuracy": 0.964375838637352, + "num_tokens": 1683623.0, + "step": 190 + }, + { + "entropy": 2.548456132411957, + "epoch": 0.6914027149321267, + "grad_norm": 0.7699162364006042, + "learning_rate": 0.00050249384498331, + "loss": 0.1985, + "mean_token_accuracy": 0.9438774734735489, + "num_tokens": 1691718.0, + "step": 191 + }, + { + "entropy": 2.4514941573143005, + "epoch": 0.6950226244343891, + "grad_norm": 1.4113538265228271, + "learning_rate": 0.0005023726628015027, + "loss": 0.4541, + "mean_token_accuracy": 0.9207872897386551, + "num_tokens": 1699824.0, + "step": 192 + }, + { + "entropy": 2.2560824751853943, + "epoch": 0.6986425339366515, + "grad_norm": 0.6007948517799377, + "learning_rate": 0.0005022506400080161, + "loss": 0.1871, + "mean_token_accuracy": 0.9502484053373337, + "num_tokens": 1708722.0, + "step": 193 + }, + { + "entropy": 2.1833614110946655, + "epoch": 0.702262443438914, + "grad_norm": 0.7005489468574524, + "learning_rate": 0.0005021277770663082, + "loss": 0.2222, + "mean_token_accuracy": 0.9386974722146988, + "num_tokens": 1717592.0, + "step": 194 + }, + { + "entropy": 2.2031923830509186, + "epoch": 0.7058823529411765, + "grad_norm": 0.5830584764480591, + "learning_rate": 0.0005020040744430284, + "loss": 0.1106, + "mean_token_accuracy": 0.9719562232494354, + "num_tokens": 1726149.0, + "step": 195 + }, + { + "entropy": 2.199785351753235, + "epoch": 0.709502262443439, + "grad_norm": 0.7465847134590149, + "learning_rate": 0.0005018795326080149, + "loss": 0.1935, + "mean_token_accuracy": 0.9497270882129669, + "num_tokens": 1734541.0, + "step": 196 + }, + { + "entropy": 2.1103186309337616, + "epoch": 0.7131221719457014, + "grad_norm": 1.0782264471054077, + "learning_rate": 0.0005017541520342934, + "loss": 0.2895, + "mean_token_accuracy": 0.9274258464574814, + "num_tokens": 1743722.0, + "step": 197 + }, + { + "entropy": 2.2248528599739075, + "epoch": 0.7167420814479638, + "grad_norm": 0.6409780979156494, + "learning_rate": 0.0005016279331980754, + "loss": 0.1425, + "mean_token_accuracy": 0.96550352871418, + "num_tokens": 1752156.0, + "step": 198 + }, + { + "entropy": 2.19924658536911, + "epoch": 0.7203619909502262, + "grad_norm": 0.7019934058189392, + "learning_rate": 0.0005015008765787561, + "loss": 0.1969, + "mean_token_accuracy": 0.9429282248020172, + "num_tokens": 1760978.0, + "step": 199 + }, + { + "entropy": 2.297484815120697, + "epoch": 0.7239819004524887, + "grad_norm": 0.7826490998268127, + "learning_rate": 0.0005013729826589127, + "loss": 0.2399, + "mean_token_accuracy": 0.9416657984256744, + "num_tokens": 1769533.0, + "step": 200 + }, + { + "entropy": 2.2471498548984528, + "epoch": 0.7276018099547511, + "grad_norm": 0.621566891670227, + "learning_rate": 0.0005012442519243027, + "loss": 0.1876, + "mean_token_accuracy": 0.9460793286561966, + "num_tokens": 1778286.0, + "step": 201 + }, + { + "entropy": 2.2212815284729004, + "epoch": 0.7312217194570135, + "grad_norm": 0.622283935546875, + "learning_rate": 0.0005011146848638616, + "loss": 0.1617, + "mean_token_accuracy": 0.9482609927654266, + "num_tokens": 1787392.0, + "step": 202 + }, + { + "entropy": 2.308752655982971, + "epoch": 0.7348416289592761, + "grad_norm": 0.7263973355293274, + "learning_rate": 0.0005009842819697018, + "loss": 0.2043, + "mean_token_accuracy": 0.9378403723239899, + "num_tokens": 1796133.0, + "step": 203 + }, + { + "entropy": 2.3376497626304626, + "epoch": 0.7384615384615385, + "grad_norm": 0.5493630766868591, + "learning_rate": 0.0005008530437371101, + "loss": 0.1145, + "mean_token_accuracy": 0.970586434006691, + "num_tokens": 1804769.0, + "step": 204 + }, + { + "entropy": 2.373005509376526, + "epoch": 0.7420814479638009, + "grad_norm": 0.6313483119010925, + "learning_rate": 0.0005007209706645461, + "loss": 0.2183, + "mean_token_accuracy": 0.9472708404064178, + "num_tokens": 1813364.0, + "step": 205 + }, + { + "entropy": 2.468949854373932, + "epoch": 0.7457013574660634, + "grad_norm": 1.0125588178634644, + "learning_rate": 0.00050058806325364, + "loss": 0.2225, + "mean_token_accuracy": 0.9351322948932648, + "num_tokens": 1822149.0, + "step": 206 + }, + { + "entropy": 2.2420623898506165, + "epoch": 0.7493212669683258, + "grad_norm": 0.913761556148529, + "learning_rate": 0.0005004543220091911, + "loss": 0.2386, + "mean_token_accuracy": 0.9453927427530289, + "num_tokens": 1831533.0, + "step": 207 + }, + { + "entropy": 2.2966006994247437, + "epoch": 0.7529411764705882, + "grad_norm": 0.7386876940727234, + "learning_rate": 0.0005003197474391658, + "loss": 0.1768, + "mean_token_accuracy": 0.949826255440712, + "num_tokens": 1840157.0, + "step": 208 + }, + { + "entropy": 2.306001305580139, + "epoch": 0.7565610859728507, + "grad_norm": 0.8900741338729858, + "learning_rate": 0.0005001843400546955, + "loss": 0.2899, + "mean_token_accuracy": 0.9241485595703125, + "num_tokens": 1848898.0, + "step": 209 + }, + { + "entropy": 2.117514967918396, + "epoch": 0.7601809954751131, + "grad_norm": 0.644622802734375, + "learning_rate": 0.0005000481003700746, + "loss": 0.2714, + "mean_token_accuracy": 0.9299416691064835, + "num_tokens": 1858330.0, + "step": 210 + }, + { + "entropy": 2.3768392205238342, + "epoch": 0.7638009049773755, + "grad_norm": 0.9724471569061279, + "learning_rate": 0.0004999110289027587, + "loss": 0.1633, + "mean_token_accuracy": 0.9550061523914337, + "num_tokens": 1866806.0, + "step": 211 + }, + { + "entropy": 2.090679556131363, + "epoch": 0.7674208144796381, + "grad_norm": 0.5419518351554871, + "learning_rate": 0.0004997731261733628, + "loss": 0.1369, + "mean_token_accuracy": 0.9619670957326889, + "num_tokens": 1875937.0, + "step": 212 + }, + { + "entropy": 2.099909245967865, + "epoch": 0.7710407239819005, + "grad_norm": 0.6858121752738953, + "learning_rate": 0.0004996343927056592, + "loss": 0.1633, + "mean_token_accuracy": 0.9528832882642746, + "num_tokens": 1885145.0, + "step": 213 + }, + { + "entropy": 2.130059242248535, + "epoch": 0.7746606334841629, + "grad_norm": 0.7691065073013306, + "learning_rate": 0.000499494829026575, + "loss": 0.348, + "mean_token_accuracy": 0.9162366837263107, + "num_tokens": 1894255.0, + "step": 214 + }, + { + "entropy": 2.191373586654663, + "epoch": 0.7782805429864253, + "grad_norm": 0.7427324652671814, + "learning_rate": 0.000499354435666191, + "loss": 0.3373, + "mean_token_accuracy": 0.9311849176883698, + "num_tokens": 1902981.0, + "step": 215 + }, + { + "entropy": 2.1425398886203766, + "epoch": 0.7819004524886878, + "grad_norm": 0.6410383582115173, + "learning_rate": 0.0004992132131577392, + "loss": 0.2079, + "mean_token_accuracy": 0.949742391705513, + "num_tokens": 1912253.0, + "step": 216 + }, + { + "entropy": 2.1396586298942566, + "epoch": 0.7855203619909502, + "grad_norm": 0.5689850449562073, + "learning_rate": 0.0004990711620376003, + "loss": 0.1999, + "mean_token_accuracy": 0.946034774184227, + "num_tokens": 1921409.0, + "step": 217 + }, + { + "entropy": 2.2237865328788757, + "epoch": 0.7891402714932126, + "grad_norm": 0.6408923864364624, + "learning_rate": 0.0004989282828453029, + "loss": 0.2452, + "mean_token_accuracy": 0.9510752111673355, + "num_tokens": 1930397.0, + "step": 218 + }, + { + "entropy": 2.234771251678467, + "epoch": 0.7927601809954751, + "grad_norm": 0.751447856426239, + "learning_rate": 0.0004987845761235203, + "loss": 0.3057, + "mean_token_accuracy": 0.9217256307601929, + "num_tokens": 1939172.0, + "step": 219 + }, + { + "entropy": 2.2653815746307373, + "epoch": 0.7963800904977375, + "grad_norm": 0.751455545425415, + "learning_rate": 0.0004986400424180688, + "loss": 0.3245, + "mean_token_accuracy": 0.9256318956613541, + "num_tokens": 1947979.0, + "step": 220 + }, + { + "entropy": 2.3123483061790466, + "epoch": 0.8, + "grad_norm": 0.5939492583274841, + "learning_rate": 0.0004984946822779061, + "loss": 0.2429, + "mean_token_accuracy": 0.9333402067422867, + "num_tokens": 1956814.0, + "step": 221 + }, + { + "entropy": 2.3289234042167664, + "epoch": 0.8036199095022625, + "grad_norm": 0.5591994524002075, + "learning_rate": 0.0004983484962551284, + "loss": 0.1507, + "mean_token_accuracy": 0.96376833319664, + "num_tokens": 1965641.0, + "step": 222 + }, + { + "entropy": 2.4314023852348328, + "epoch": 0.8072398190045249, + "grad_norm": 0.5805783271789551, + "learning_rate": 0.0004982014849049687, + "loss": 0.2049, + "mean_token_accuracy": 0.9586948156356812, + "num_tokens": 1974180.0, + "step": 223 + }, + { + "entropy": 2.3639765977859497, + "epoch": 0.8108597285067873, + "grad_norm": 0.6924490332603455, + "learning_rate": 0.0004980536487857951, + "loss": 0.2137, + "mean_token_accuracy": 0.9441423565149307, + "num_tokens": 1982744.0, + "step": 224 + }, + { + "entropy": 2.3361759781837463, + "epoch": 0.8144796380090498, + "grad_norm": 0.4579620361328125, + "learning_rate": 0.0004979049884591077, + "loss": 0.1041, + "mean_token_accuracy": 0.9753208309412003, + "num_tokens": 1991583.0, + "step": 225 + }, + { + "entropy": 2.286989688873291, + "epoch": 0.8180995475113122, + "grad_norm": 0.6489312052726746, + "learning_rate": 0.0004977555044895377, + "loss": 0.2131, + "mean_token_accuracy": 0.9520440250635147, + "num_tokens": 2000193.0, + "step": 226 + }, + { + "entropy": 2.288672834634781, + "epoch": 0.8217194570135746, + "grad_norm": 0.7738961577415466, + "learning_rate": 0.0004976051974448441, + "loss": 0.325, + "mean_token_accuracy": 0.9060750156641006, + "num_tokens": 2009233.0, + "step": 227 + }, + { + "entropy": 2.288076102733612, + "epoch": 0.8253393665158371, + "grad_norm": 0.7042292356491089, + "learning_rate": 0.0004974540678959123, + "loss": 0.2206, + "mean_token_accuracy": 0.94980289041996, + "num_tokens": 2018417.0, + "step": 228 + }, + { + "entropy": 2.217707335948944, + "epoch": 0.8289592760180996, + "grad_norm": 0.6834208369255066, + "learning_rate": 0.0004973021164167515, + "loss": 0.2907, + "mean_token_accuracy": 0.951058641076088, + "num_tokens": 2027822.0, + "step": 229 + }, + { + "entropy": 2.1610691249370575, + "epoch": 0.832579185520362, + "grad_norm": 0.665044903755188, + "learning_rate": 0.0004971493435844928, + "loss": 0.2387, + "mean_token_accuracy": 0.9506549835205078, + "num_tokens": 2036983.0, + "step": 230 + }, + { + "entropy": 2.321135401725769, + "epoch": 0.8361990950226245, + "grad_norm": 0.8208273649215698, + "learning_rate": 0.0004969957499793869, + "loss": 0.2399, + "mean_token_accuracy": 0.9435176253318787, + "num_tokens": 2045574.0, + "step": 231 + }, + { + "entropy": 2.1943611800670624, + "epoch": 0.8398190045248869, + "grad_norm": 0.6293840408325195, + "learning_rate": 0.0004968413361848019, + "loss": 0.1784, + "mean_token_accuracy": 0.9559669345617294, + "num_tokens": 2054336.0, + "step": 232 + }, + { + "entropy": 2.2722273468971252, + "epoch": 0.8434389140271493, + "grad_norm": 0.6535817980766296, + "learning_rate": 0.0004966861027872211, + "loss": 0.1675, + "mean_token_accuracy": 0.9532535970211029, + "num_tokens": 2063225.0, + "step": 233 + }, + { + "entropy": 2.3278334736824036, + "epoch": 0.8470588235294118, + "grad_norm": 1.1610206365585327, + "learning_rate": 0.0004965300503762406, + "loss": 0.1588, + "mean_token_accuracy": 0.9641145765781403, + "num_tokens": 2071738.0, + "step": 234 + }, + { + "entropy": 2.202972888946533, + "epoch": 0.8506787330316742, + "grad_norm": 0.4811885356903076, + "learning_rate": 0.0004963731795445675, + "loss": 0.0813, + "mean_token_accuracy": 0.9766911715269089, + "num_tokens": 2080375.0, + "step": 235 + }, + { + "entropy": 2.2433705925941467, + "epoch": 0.8542986425339366, + "grad_norm": 0.8113318681716919, + "learning_rate": 0.0004962154908880171, + "loss": 0.2965, + "mean_token_accuracy": 0.9290606826543808, + "num_tokens": 2089522.0, + "step": 236 + }, + { + "entropy": 2.2168884873390198, + "epoch": 0.857918552036199, + "grad_norm": 0.6128959655761719, + "learning_rate": 0.0004960569850055111, + "loss": 0.1724, + "mean_token_accuracy": 0.9603384286165237, + "num_tokens": 2098162.0, + "step": 237 + }, + { + "entropy": 2.2738255858421326, + "epoch": 0.8615384615384616, + "grad_norm": 0.8557195663452148, + "learning_rate": 0.0004958976624990749, + "loss": 0.2596, + "mean_token_accuracy": 0.9487071484327316, + "num_tokens": 2106984.0, + "step": 238 + }, + { + "entropy": 2.2031425833702087, + "epoch": 0.865158371040724, + "grad_norm": 0.6621816158294678, + "learning_rate": 0.0004957375239738359, + "loss": 0.232, + "mean_token_accuracy": 0.9525040090084076, + "num_tokens": 2116040.0, + "step": 239 + }, + { + "entropy": 2.374737858772278, + "epoch": 0.8687782805429864, + "grad_norm": 0.8481062054634094, + "learning_rate": 0.0004955765700380204, + "loss": 0.2516, + "mean_token_accuracy": 0.9396061599254608, + "num_tokens": 2124862.0, + "step": 240 + }, + { + "entropy": 2.266704559326172, + "epoch": 0.8723981900452489, + "grad_norm": 0.6284282803535461, + "learning_rate": 0.0004954148013029521, + "loss": 0.3244, + "mean_token_accuracy": 0.9381244331598282, + "num_tokens": 2134018.0, + "step": 241 + }, + { + "entropy": 2.3935859203338623, + "epoch": 0.8760180995475113, + "grad_norm": 1.1564176082611084, + "learning_rate": 0.0004952522183830493, + "loss": 0.2706, + "mean_token_accuracy": 0.9297053664922714, + "num_tokens": 2142745.0, + "step": 242 + }, + { + "entropy": 2.281618118286133, + "epoch": 0.8796380090497737, + "grad_norm": 0.5324040055274963, + "learning_rate": 0.0004950888218958225, + "loss": 0.1573, + "mean_token_accuracy": 0.9568462073802948, + "num_tokens": 2151607.0, + "step": 243 + }, + { + "entropy": 2.230749189853668, + "epoch": 0.8832579185520362, + "grad_norm": 0.680780291557312, + "learning_rate": 0.0004949246124618726, + "loss": 0.1956, + "mean_token_accuracy": 0.9479999989271164, + "num_tokens": 2160904.0, + "step": 244 + }, + { + "entropy": 2.21382600069046, + "epoch": 0.8868778280542986, + "grad_norm": 0.6321626305580139, + "learning_rate": 0.0004947595907048877, + "loss": 0.2444, + "mean_token_accuracy": 0.9376699328422546, + "num_tokens": 2170021.0, + "step": 245 + }, + { + "entropy": 2.3659472465515137, + "epoch": 0.890497737556561, + "grad_norm": 0.9778954982757568, + "learning_rate": 0.0004945937572516417, + "loss": 0.3783, + "mean_token_accuracy": 0.9104805737733841, + "num_tokens": 2178995.0, + "step": 246 + }, + { + "entropy": 2.3233078718185425, + "epoch": 0.8941176470588236, + "grad_norm": 0.53229820728302, + "learning_rate": 0.0004944271127319909, + "loss": 0.0759, + "mean_token_accuracy": 0.9791453778743744, + "num_tokens": 2187823.0, + "step": 247 + }, + { + "entropy": 2.2469444274902344, + "epoch": 0.897737556561086, + "grad_norm": 0.6367197632789612, + "learning_rate": 0.0004942596577788728, + "loss": 0.2677, + "mean_token_accuracy": 0.9392691254615784, + "num_tokens": 2196923.0, + "step": 248 + }, + { + "entropy": 2.4508965611457825, + "epoch": 0.9013574660633484, + "grad_norm": 0.6042234897613525, + "learning_rate": 0.0004940913930283024, + "loss": 0.1102, + "mean_token_accuracy": 0.9762090593576431, + "num_tokens": 2205400.0, + "step": 249 + }, + { + "entropy": 2.365670144557953, + "epoch": 0.9049773755656109, + "grad_norm": 0.6490639448165894, + "learning_rate": 0.0004939223191193707, + "loss": 0.1532, + "mean_token_accuracy": 0.9489114433526993, + "num_tokens": 2214201.0, + "step": 250 + }, + { + "entropy": 2.4013625383377075, + "epoch": 0.9085972850678733, + "grad_norm": 0.5969854593276978, + "learning_rate": 0.0004937524366942419, + "loss": 0.1273, + "mean_token_accuracy": 0.9682519882917404, + "num_tokens": 2222979.0, + "step": 251 + }, + { + "entropy": 2.4402357935905457, + "epoch": 0.9122171945701357, + "grad_norm": 0.7559595704078674, + "learning_rate": 0.0004935817463981513, + "loss": 0.1979, + "mean_token_accuracy": 0.9483373910188675, + "num_tokens": 2231169.0, + "step": 252 + }, + { + "entropy": 2.4673256874084473, + "epoch": 0.9158371040723982, + "grad_norm": 0.8663308620452881, + "learning_rate": 0.0004934102488794023, + "loss": 0.2453, + "mean_token_accuracy": 0.9408974200487137, + "num_tokens": 2240099.0, + "step": 253 + }, + { + "entropy": 2.426262080669403, + "epoch": 0.9194570135746606, + "grad_norm": 0.7920467257499695, + "learning_rate": 0.0004932379447893643, + "loss": 0.2828, + "mean_token_accuracy": 0.9319239109754562, + "num_tokens": 2249088.0, + "step": 254 + }, + { + "entropy": 2.5018852949142456, + "epoch": 0.9230769230769231, + "grad_norm": 0.7216617465019226, + "learning_rate": 0.0004930648347824701, + "loss": 0.1647, + "mean_token_accuracy": 0.9551804810762405, + "num_tokens": 2257710.0, + "step": 255 + }, + { + "entropy": 2.43031644821167, + "epoch": 0.9266968325791856, + "grad_norm": 0.646794319152832, + "learning_rate": 0.0004928909195162138, + "loss": 0.1328, + "mean_token_accuracy": 0.9663553237915039, + "num_tokens": 2266883.0, + "step": 256 + }, + { + "entropy": 2.5406370759010315, + "epoch": 0.930316742081448, + "grad_norm": 0.5482825040817261, + "learning_rate": 0.0004927161996511474, + "loss": 0.1872, + "mean_token_accuracy": 0.9557004272937775, + "num_tokens": 2275728.0, + "step": 257 + }, + { + "entropy": 2.636320471763611, + "epoch": 0.9339366515837104, + "grad_norm": 0.7454632520675659, + "learning_rate": 0.0004925406758508797, + "loss": 0.1461, + "mean_token_accuracy": 0.9578974395990372, + "num_tokens": 2284319.0, + "step": 258 + }, + { + "entropy": 2.6067575812339783, + "epoch": 0.9375565610859729, + "grad_norm": 0.8695769309997559, + "learning_rate": 0.000492364348782072, + "loss": 0.1712, + "mean_token_accuracy": 0.9652896523475647, + "num_tokens": 2293035.0, + "step": 259 + }, + { + "entropy": 2.5837162137031555, + "epoch": 0.9411764705882353, + "grad_norm": 0.5752995014190674, + "learning_rate": 0.0004921872191144371, + "loss": 0.1398, + "mean_token_accuracy": 0.9553333520889282, + "num_tokens": 2301802.0, + "step": 260 + }, + { + "entropy": 2.713033616542816, + "epoch": 0.9447963800904977, + "grad_norm": 0.85626620054245, + "learning_rate": 0.0004920092875207363, + "loss": 0.2207, + "mean_token_accuracy": 0.9468346834182739, + "num_tokens": 2309981.0, + "step": 261 + }, + { + "entropy": 2.400112509727478, + "epoch": 0.9484162895927601, + "grad_norm": 0.6766608953475952, + "learning_rate": 0.0004918305546767764, + "loss": 0.1644, + "mean_token_accuracy": 0.9502440094947815, + "num_tokens": 2319212.0, + "step": 262 + }, + { + "entropy": 2.503827154636383, + "epoch": 0.9520361990950226, + "grad_norm": 0.789470911026001, + "learning_rate": 0.0004916510212614072, + "loss": 0.2117, + "mean_token_accuracy": 0.9454390555620193, + "num_tokens": 2328234.0, + "step": 263 + }, + { + "entropy": 2.669040560722351, + "epoch": 0.9556561085972851, + "grad_norm": 0.9579212069511414, + "learning_rate": 0.0004914706879565197, + "loss": 0.2193, + "mean_token_accuracy": 0.9321542829275131, + "num_tokens": 2336543.0, + "step": 264 + }, + { + "entropy": 2.507073998451233, + "epoch": 0.9592760180995475, + "grad_norm": 0.5315744876861572, + "learning_rate": 0.000491289555447043, + "loss": 0.0851, + "mean_token_accuracy": 0.9771326780319214, + "num_tokens": 2345292.0, + "step": 265 + }, + { + "entropy": 2.4205283522605896, + "epoch": 0.96289592760181, + "grad_norm": 0.5441373586654663, + "learning_rate": 0.000491107624420941, + "loss": 0.1323, + "mean_token_accuracy": 0.9541790336370468, + "num_tokens": 2354242.0, + "step": 266 + }, + { + "entropy": 2.3817258477211, + "epoch": 0.9665158371040724, + "grad_norm": 0.5946238040924072, + "learning_rate": 0.0004909248955692111, + "loss": 0.1708, + "mean_token_accuracy": 0.947738841176033, + "num_tokens": 2363183.0, + "step": 267 + }, + { + "entropy": 2.5073485374450684, + "epoch": 0.9701357466063348, + "grad_norm": 0.6979324817657471, + "learning_rate": 0.0004907413695858812, + "loss": 0.2099, + "mean_token_accuracy": 0.9423733651638031, + "num_tokens": 2371885.0, + "step": 268 + }, + { + "entropy": 2.5705007910728455, + "epoch": 0.9737556561085973, + "grad_norm": 0.8203943967819214, + "learning_rate": 0.0004905570471680057, + "loss": 0.217, + "mean_token_accuracy": 0.9511639326810837, + "num_tokens": 2380316.0, + "step": 269 + }, + { + "entropy": 2.2677993774414062, + "epoch": 0.9773755656108597, + "grad_norm": 0.5840432047843933, + "learning_rate": 0.0004903719290156649, + "loss": 0.2364, + "mean_token_accuracy": 0.9407180696725845, + "num_tokens": 2389723.0, + "step": 270 + }, + { + "entropy": 2.477886915206909, + "epoch": 0.9809954751131221, + "grad_norm": 0.818929135799408, + "learning_rate": 0.0004901860158319612, + "loss": 0.1707, + "mean_token_accuracy": 0.9579566866159439, + "num_tokens": 2398388.0, + "step": 271 + }, + { + "entropy": 2.549662232398987, + "epoch": 0.9846153846153847, + "grad_norm": 0.7804781198501587, + "learning_rate": 0.0004899993083230166, + "loss": 0.2944, + "mean_token_accuracy": 0.9381812512874603, + "num_tokens": 2406929.0, + "step": 272 + }, + { + "entropy": 2.4465304017066956, + "epoch": 0.9882352941176471, + "grad_norm": 0.5218799114227295, + "learning_rate": 0.0004898118071979699, + "loss": 0.1661, + "mean_token_accuracy": 0.9500218778848648, + "num_tokens": 2415631.0, + "step": 273 + }, + { + "entropy": 2.5852283239364624, + "epoch": 0.9918552036199095, + "grad_norm": 0.591163158416748, + "learning_rate": 0.0004896235131689743, + "loss": 0.2005, + "mean_token_accuracy": 0.9455285370349884, + "num_tokens": 2424091.0, + "step": 274 + }, + { + "entropy": 2.478701651096344, + "epoch": 0.995475113122172, + "grad_norm": 1.0615383386611938, + "learning_rate": 0.0004894344269511945, + "loss": 0.2864, + "mean_token_accuracy": 0.9306265562772751, + "num_tokens": 2432705.0, + "step": 275 + }, + { + "entropy": 2.600062847137451, + "epoch": 0.9990950226244344, + "grad_norm": 0.7011683583259583, + "learning_rate": 0.0004892445492628043, + "loss": 0.1664, + "mean_token_accuracy": 0.9547821134328842, + "num_tokens": 2440992.0, + "step": 276 + }, + { + "entropy": 2.3411240577697754, + "epoch": 1.0, + "grad_norm": 0.4944029450416565, + "learning_rate": 0.000489053880824983, + "loss": 0.022, + "mean_token_accuracy": 0.9929078221321106, + "num_tokens": 2441725.0, + "step": 277 + }, + { + "epoch": 1.0, + "eval_entropy": 2.5467925265552553, + "eval_loss": 0.21274714171886444, + "eval_mean_token_accuracy": 0.9444630068492114, + "eval_num_tokens": 2441725.0, + "eval_runtime": 116.0434, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 1.06, + "step": 277 + }, + { + "entropy": 2.609170138835907, + "epoch": 1.0036199095022624, + "grad_norm": 1.0785081386566162, + "learning_rate": 0.0004888624223619136, + "loss": 0.3167, + "mean_token_accuracy": 0.9296800643205643, + "num_tokens": 2450193.0, + "step": 278 + }, + { + "entropy": 2.497025430202484, + "epoch": 1.0072398190045249, + "grad_norm": 0.5221985578536987, + "learning_rate": 0.0004886701746007801, + "loss": 0.0854, + "mean_token_accuracy": 0.9753399342298508, + "num_tokens": 2459309.0, + "step": 279 + }, + { + "entropy": 2.5487362146377563, + "epoch": 1.0108597285067873, + "grad_norm": 0.5161958336830139, + "learning_rate": 0.0004884771382717638, + "loss": 0.0819, + "mean_token_accuracy": 0.9748431146144867, + "num_tokens": 2467844.0, + "step": 280 + }, + { + "entropy": 2.5276209115982056, + "epoch": 1.0144796380090497, + "grad_norm": 0.5731730461120605, + "learning_rate": 0.0004882833141080412, + "loss": 0.1541, + "mean_token_accuracy": 0.9567564427852631, + "num_tokens": 2476894.0, + "step": 281 + }, + { + "entropy": 2.4442760348320007, + "epoch": 1.0180995475113122, + "grad_norm": 0.7120366096496582, + "learning_rate": 0.0004880887028457813, + "loss": 0.1945, + "mean_token_accuracy": 0.9465379565954208, + "num_tokens": 2485971.0, + "step": 282 + }, + { + "entropy": 2.4069360494613647, + "epoch": 1.0217194570135746, + "grad_norm": 0.7468647360801697, + "learning_rate": 0.00048789330522414244, + "loss": 0.2345, + "mean_token_accuracy": 0.9446765780448914, + "num_tokens": 2495043.0, + "step": 283 + }, + { + "entropy": 2.468382716178894, + "epoch": 1.025339366515837, + "grad_norm": 0.666231632232666, + "learning_rate": 0.0004876971219852697, + "loss": 0.1779, + "mean_token_accuracy": 0.9534575343132019, + "num_tokens": 2503672.0, + "step": 284 + }, + { + "entropy": 2.4362316727638245, + "epoch": 1.0289592760180994, + "grad_norm": 0.8445858955383301, + "learning_rate": 0.000487500153874292, + "loss": 0.1698, + "mean_token_accuracy": 0.953661322593689, + "num_tokens": 2512322.0, + "step": 285 + }, + { + "entropy": 2.364333391189575, + "epoch": 1.032579185520362, + "grad_norm": 0.4805246591567993, + "learning_rate": 0.0004873024016393193, + "loss": 0.0778, + "mean_token_accuracy": 0.9824571758508682, + "num_tokens": 2520791.0, + "step": 286 + }, + { + "entropy": 2.223461151123047, + "epoch": 1.0361990950226245, + "grad_norm": 0.648465096950531, + "learning_rate": 0.0004871038660314399, + "loss": 0.2593, + "mean_token_accuracy": 0.9419913589954376, + "num_tokens": 2530082.0, + "step": 287 + }, + { + "entropy": 2.3313387036323547, + "epoch": 1.039819004524887, + "grad_norm": 0.6912294626235962, + "learning_rate": 0.00048690454780471725, + "loss": 0.1354, + "mean_token_accuracy": 0.9561934620141983, + "num_tokens": 2538728.0, + "step": 288 + }, + { + "entropy": 2.191806375980377, + "epoch": 1.0434389140271494, + "grad_norm": 0.8620694279670715, + "learning_rate": 0.0004867044477161874, + "loss": 0.1103, + "mean_token_accuracy": 0.968692272901535, + "num_tokens": 2547219.0, + "step": 289 + }, + { + "entropy": 2.167125165462494, + "epoch": 1.0470588235294118, + "grad_norm": 0.6192149519920349, + "learning_rate": 0.0004865035665258559, + "loss": 0.1288, + "mean_token_accuracy": 0.9643534421920776, + "num_tokens": 2555940.0, + "step": 290 + }, + { + "entropy": 2.2750985622406006, + "epoch": 1.0506787330316743, + "grad_norm": 1.7459602355957031, + "learning_rate": 0.0004863019049966953, + "loss": 0.393, + "mean_token_accuracy": 0.9146681725978851, + "num_tokens": 2564362.0, + "step": 291 + }, + { + "entropy": 2.236129105091095, + "epoch": 1.0542986425339367, + "grad_norm": 0.6311184167861938, + "learning_rate": 0.0004860994638946416, + "loss": 0.1536, + "mean_token_accuracy": 0.9636097103357315, + "num_tokens": 2573316.0, + "step": 292 + }, + { + "entropy": 2.2642418146133423, + "epoch": 1.0579185520361991, + "grad_norm": 0.6023411154747009, + "learning_rate": 0.000485896243988592, + "loss": 0.191, + "mean_token_accuracy": 0.9476015418767929, + "num_tokens": 2581835.0, + "step": 293 + }, + { + "entropy": 2.3589024543762207, + "epoch": 1.0615384615384615, + "grad_norm": 0.48049232363700867, + "learning_rate": 0.0004856922460504016, + "loss": 0.1017, + "mean_token_accuracy": 0.9713075459003448, + "num_tokens": 2590317.0, + "step": 294 + }, + { + "entropy": 2.4141315817832947, + "epoch": 1.065158371040724, + "grad_norm": 0.8456616997718811, + "learning_rate": 0.0004854874708548806, + "loss": 0.1422, + "mean_token_accuracy": 0.9622762501239777, + "num_tokens": 2598538.0, + "step": 295 + }, + { + "entropy": 2.069903999567032, + "epoch": 1.0687782805429864, + "grad_norm": 0.7641116380691528, + "learning_rate": 0.0004852819191797912, + "loss": 0.2185, + "mean_token_accuracy": 0.9464851468801498, + "num_tokens": 2608219.0, + "step": 296 + }, + { + "entropy": 2.163217008113861, + "epoch": 1.0723981900452488, + "grad_norm": 0.546085000038147, + "learning_rate": 0.0004850755918058449, + "loss": 0.1035, + "mean_token_accuracy": 0.9708487540483475, + "num_tokens": 2617261.0, + "step": 297 + }, + { + "entropy": 2.2678662836551666, + "epoch": 1.0760180995475113, + "grad_norm": 0.8699386119842529, + "learning_rate": 0.0004848684895166994, + "loss": 0.2384, + "mean_token_accuracy": 0.9486480504274368, + "num_tokens": 2626144.0, + "step": 298 + }, + { + "entropy": 2.13065105676651, + "epoch": 1.0796380090497737, + "grad_norm": 0.44323107600212097, + "learning_rate": 0.00048466061309895554, + "loss": 0.0818, + "mean_token_accuracy": 0.9722468554973602, + "num_tokens": 2635626.0, + "step": 299 + }, + { + "entropy": 2.184772551059723, + "epoch": 1.0832579185520361, + "grad_norm": 0.7928256988525391, + "learning_rate": 0.0004844519633421545, + "loss": 0.2378, + "mean_token_accuracy": 0.9477885961532593, + "num_tokens": 2644674.0, + "step": 300 + }, + { + "entropy": 2.1669145822525024, + "epoch": 1.0868778280542986, + "grad_norm": 0.5570158362388611, + "learning_rate": 0.00048424254103877456, + "loss": 0.1434, + "mean_token_accuracy": 0.9587411731481552, + "num_tokens": 2653658.0, + "step": 301 + }, + { + "entropy": 2.3057579398155212, + "epoch": 1.090497737556561, + "grad_norm": 0.9084392189979553, + "learning_rate": 0.00048403234698422837, + "loss": 0.3831, + "mean_token_accuracy": 0.8896283358335495, + "num_tokens": 2662350.0, + "step": 302 + }, + { + "entropy": 2.1741657853126526, + "epoch": 1.0941176470588236, + "grad_norm": 0.6791238784790039, + "learning_rate": 0.0004838213819768597, + "loss": 0.1648, + "mean_token_accuracy": 0.9576362520456314, + "num_tokens": 2671450.0, + "step": 303 + }, + { + "entropy": 2.089864045381546, + "epoch": 1.097737556561086, + "grad_norm": 0.5696312189102173, + "learning_rate": 0.0004836096468179406, + "loss": 0.1269, + "mean_token_accuracy": 0.9658148884773254, + "num_tokens": 2680581.0, + "step": 304 + }, + { + "entropy": 2.2657605409622192, + "epoch": 1.1013574660633485, + "grad_norm": 1.605503797531128, + "learning_rate": 0.0004833971423116682, + "loss": 0.1027, + "mean_token_accuracy": 0.9762597978115082, + "num_tokens": 2689001.0, + "step": 305 + }, + { + "entropy": 2.079287111759186, + "epoch": 1.104977375565611, + "grad_norm": 0.5804780721664429, + "learning_rate": 0.00048318386926516157, + "loss": 0.1137, + "mean_token_accuracy": 0.9633719325065613, + "num_tokens": 2698050.0, + "step": 306 + }, + { + "entropy": 2.201345145702362, + "epoch": 1.1085972850678734, + "grad_norm": 0.8606241941452026, + "learning_rate": 0.000482969828488459, + "loss": 0.2124, + "mean_token_accuracy": 0.9472681730985641, + "num_tokens": 2706704.0, + "step": 307 + }, + { + "entropy": 2.095236599445343, + "epoch": 1.1122171945701358, + "grad_norm": 0.7078782320022583, + "learning_rate": 0.0004827550207945147, + "loss": 0.1957, + "mean_token_accuracy": 0.9564679116010666, + "num_tokens": 2715745.0, + "step": 308 + }, + { + "entropy": 2.186302363872528, + "epoch": 1.1158371040723982, + "grad_norm": 0.7166503667831421, + "learning_rate": 0.0004825394469991956, + "loss": 0.1539, + "mean_token_accuracy": 0.9662427455186844, + "num_tokens": 2724296.0, + "step": 309 + }, + { + "entropy": 2.052559405565262, + "epoch": 1.1194570135746607, + "grad_norm": 0.6510501503944397, + "learning_rate": 0.00048232310792127846, + "loss": 0.1831, + "mean_token_accuracy": 0.9533994495868683, + "num_tokens": 2733482.0, + "step": 310 + }, + { + "entropy": 2.093154102563858, + "epoch": 1.123076923076923, + "grad_norm": 0.711121678352356, + "learning_rate": 0.0004821060043824466, + "loss": 0.2315, + "mean_token_accuracy": 0.9381555914878845, + "num_tokens": 2742912.0, + "step": 311 + }, + { + "entropy": 2.188497006893158, + "epoch": 1.1266968325791855, + "grad_norm": 0.6782490015029907, + "learning_rate": 0.00048188813720728707, + "loss": 0.2, + "mean_token_accuracy": 0.9501812607049942, + "num_tokens": 2751808.0, + "step": 312 + }, + { + "entropy": 2.0495824217796326, + "epoch": 1.130316742081448, + "grad_norm": 0.7644634246826172, + "learning_rate": 0.00048166950722328697, + "loss": 0.2152, + "mean_token_accuracy": 0.9440928995609283, + "num_tokens": 2761066.0, + "step": 313 + }, + { + "entropy": 2.1707025468349457, + "epoch": 1.1339366515837104, + "grad_norm": 0.655131459236145, + "learning_rate": 0.00048145011526083106, + "loss": 0.1637, + "mean_token_accuracy": 0.9500558227300644, + "num_tokens": 2769870.0, + "step": 314 + }, + { + "entropy": 2.1047372221946716, + "epoch": 1.1375565610859728, + "grad_norm": 0.5353516936302185, + "learning_rate": 0.0004812299621531979, + "loss": 0.1705, + "mean_token_accuracy": 0.9455999433994293, + "num_tokens": 2779383.0, + "step": 315 + }, + { + "entropy": 2.1921610236167908, + "epoch": 1.1411764705882352, + "grad_norm": 0.8998016119003296, + "learning_rate": 0.00048100904873655696, + "loss": 0.3918, + "mean_token_accuracy": 0.9382697492837906, + "num_tokens": 2788386.0, + "step": 316 + }, + { + "entropy": 2.0850723683834076, + "epoch": 1.1447963800904977, + "grad_norm": 0.867432713508606, + "learning_rate": 0.0004807873758499656, + "loss": 0.2196, + "mean_token_accuracy": 0.9498324394226074, + "num_tokens": 2797496.0, + "step": 317 + }, + { + "entropy": 2.1980925798416138, + "epoch": 1.14841628959276, + "grad_norm": 0.6076980233192444, + "learning_rate": 0.00048056494433536577, + "loss": 0.1086, + "mean_token_accuracy": 0.9642161130905151, + "num_tokens": 2805836.0, + "step": 318 + }, + { + "entropy": 2.15611070394516, + "epoch": 1.1520361990950225, + "grad_norm": 0.6276211738586426, + "learning_rate": 0.0004803417550375806, + "loss": 0.1463, + "mean_token_accuracy": 0.9622830748558044, + "num_tokens": 2814404.0, + "step": 319 + }, + { + "entropy": 2.0017230808734894, + "epoch": 1.155656108597285, + "grad_norm": 0.5840948820114136, + "learning_rate": 0.0004801178088043115, + "loss": 0.1869, + "mean_token_accuracy": 0.9506777077913284, + "num_tokens": 2823786.0, + "step": 320 + }, + { + "entropy": 2.1539418697357178, + "epoch": 1.1592760180995474, + "grad_norm": 1.074331283569336, + "learning_rate": 0.0004798931064861349, + "loss": 0.2797, + "mean_token_accuracy": 0.9271649420261383, + "num_tokens": 2832374.0, + "step": 321 + }, + { + "entropy": 1.930726408958435, + "epoch": 1.16289592760181, + "grad_norm": 0.5121958255767822, + "learning_rate": 0.0004796676489364988, + "loss": 0.1579, + "mean_token_accuracy": 0.9582571685314178, + "num_tokens": 2841561.0, + "step": 322 + }, + { + "entropy": 2.0205810368061066, + "epoch": 1.1665158371040725, + "grad_norm": 0.6360969543457031, + "learning_rate": 0.00047944143701171966, + "loss": 0.1582, + "mean_token_accuracy": 0.9620308429002762, + "num_tokens": 2850171.0, + "step": 323 + }, + { + "entropy": 1.9655758142471313, + "epoch": 1.170135746606335, + "grad_norm": 0.6647385358810425, + "learning_rate": 0.0004792144715709792, + "loss": 0.1594, + "mean_token_accuracy": 0.954497441649437, + "num_tokens": 2858905.0, + "step": 324 + }, + { + "entropy": 1.9725223183631897, + "epoch": 1.1737556561085973, + "grad_norm": 0.6429229974746704, + "learning_rate": 0.0004789867534763211, + "loss": 0.1407, + "mean_token_accuracy": 0.9645214527845383, + "num_tokens": 2867533.0, + "step": 325 + }, + { + "entropy": 1.9473685026168823, + "epoch": 1.1773755656108598, + "grad_norm": 0.811651349067688, + "learning_rate": 0.0004787582835926477, + "loss": 0.1608, + "mean_token_accuracy": 0.9479968994855881, + "num_tokens": 2876286.0, + "step": 326 + }, + { + "entropy": 1.8863109350204468, + "epoch": 1.1809954751131222, + "grad_norm": 0.5587059855461121, + "learning_rate": 0.00047852906278771686, + "loss": 0.131, + "mean_token_accuracy": 0.9684520065784454, + "num_tokens": 2885667.0, + "step": 327 + }, + { + "entropy": 1.8288891315460205, + "epoch": 1.1846153846153846, + "grad_norm": 0.8450536131858826, + "learning_rate": 0.0004782990919321383, + "loss": 0.2224, + "mean_token_accuracy": 0.9377491921186447, + "num_tokens": 2894765.0, + "step": 328 + }, + { + "entropy": 1.9347718358039856, + "epoch": 1.188235294117647, + "grad_norm": 0.7665867209434509, + "learning_rate": 0.0004780683718993705, + "loss": 0.167, + "mean_token_accuracy": 0.9583602845668793, + "num_tokens": 2903551.0, + "step": 329 + }, + { + "entropy": 1.9097798764705658, + "epoch": 1.1918552036199095, + "grad_norm": 0.7705667018890381, + "learning_rate": 0.00047783690356571784, + "loss": 0.2115, + "mean_token_accuracy": 0.9526428133249283, + "num_tokens": 2912197.0, + "step": 330 + }, + { + "entropy": 1.9174850285053253, + "epoch": 1.195475113122172, + "grad_norm": 0.5695499181747437, + "learning_rate": 0.00047760468781032634, + "loss": 0.1033, + "mean_token_accuracy": 0.969958484172821, + "num_tokens": 2920579.0, + "step": 331 + }, + { + "entropy": 1.8578442931175232, + "epoch": 1.1990950226244343, + "grad_norm": 0.7843735814094543, + "learning_rate": 0.000477371725515181, + "loss": 0.1664, + "mean_token_accuracy": 0.9545005410909653, + "num_tokens": 2929352.0, + "step": 332 + }, + { + "entropy": 1.8509328961372375, + "epoch": 1.2027149321266968, + "grad_norm": 0.5951048135757446, + "learning_rate": 0.0004771380175651026, + "loss": 0.1566, + "mean_token_accuracy": 0.9551403075456619, + "num_tokens": 2938387.0, + "step": 333 + }, + { + "entropy": 1.8236390948295593, + "epoch": 1.2063348416289592, + "grad_norm": 0.4988223910331726, + "learning_rate": 0.0004769035648477434, + "loss": 0.1242, + "mean_token_accuracy": 0.966319814324379, + "num_tokens": 2947741.0, + "step": 334 + }, + { + "entropy": 1.9594822525978088, + "epoch": 1.2099547511312216, + "grad_norm": 0.7550755143165588, + "learning_rate": 0.00047666836825358477, + "loss": 0.1591, + "mean_token_accuracy": 0.9666347652673721, + "num_tokens": 2956313.0, + "step": 335 + }, + { + "entropy": 1.9148444533348083, + "epoch": 1.213574660633484, + "grad_norm": 0.5889077186584473, + "learning_rate": 0.00047643242867593345, + "loss": 0.1343, + "mean_token_accuracy": 0.9611433297395706, + "num_tokens": 2964928.0, + "step": 336 + }, + { + "entropy": 1.8126957714557648, + "epoch": 1.2171945701357467, + "grad_norm": 0.5447750091552734, + "learning_rate": 0.0004761957470109179, + "loss": 0.1659, + "mean_token_accuracy": 0.9552300125360489, + "num_tokens": 2974160.0, + "step": 337 + }, + { + "entropy": 1.7981431782245636, + "epoch": 1.2208144796380092, + "grad_norm": 0.5400761365890503, + "learning_rate": 0.0004759583241574854, + "loss": 0.1339, + "mean_token_accuracy": 0.9620136916637421, + "num_tokens": 2982900.0, + "step": 338 + }, + { + "entropy": 1.8613979518413544, + "epoch": 1.2244343891402716, + "grad_norm": 0.7452914714813232, + "learning_rate": 0.0004757201610173981, + "loss": 0.4, + "mean_token_accuracy": 0.9068266004323959, + "num_tokens": 2991783.0, + "step": 339 + }, + { + "entropy": 1.8654026687145233, + "epoch": 1.228054298642534, + "grad_norm": 1.7142685651779175, + "learning_rate": 0.00047548125849523, + "loss": 0.3168, + "mean_token_accuracy": 0.9308896362781525, + "num_tokens": 3000530.0, + "step": 340 + }, + { + "entropy": 1.7702704071998596, + "epoch": 1.2316742081447964, + "grad_norm": 0.6687431931495667, + "learning_rate": 0.0004752416174983633, + "loss": 0.1697, + "mean_token_accuracy": 0.9530515670776367, + "num_tokens": 3009355.0, + "step": 341 + }, + { + "entropy": 1.735857516527176, + "epoch": 1.2352941176470589, + "grad_norm": 0.6127599477767944, + "learning_rate": 0.00047500123893698507, + "loss": 0.1706, + "mean_token_accuracy": 0.9593266248703003, + "num_tokens": 3018518.0, + "step": 342 + }, + { + "entropy": 1.7076368927955627, + "epoch": 1.2389140271493213, + "grad_norm": 0.6973987817764282, + "learning_rate": 0.0004747601237240836, + "loss": 0.1615, + "mean_token_accuracy": 0.9539438933134079, + "num_tokens": 3027752.0, + "step": 343 + }, + { + "entropy": 1.7353227138519287, + "epoch": 1.2425339366515837, + "grad_norm": 0.8406392335891724, + "learning_rate": 0.00047451827277544546, + "loss": 0.2063, + "mean_token_accuracy": 0.9488435834646225, + "num_tokens": 3036383.0, + "step": 344 + }, + { + "entropy": 1.6597246527671814, + "epoch": 1.2461538461538462, + "grad_norm": 0.5971431732177734, + "learning_rate": 0.00047427568700965107, + "loss": 0.1013, + "mean_token_accuracy": 0.9721864312887192, + "num_tokens": 3045375.0, + "step": 345 + }, + { + "entropy": 1.7100033462047577, + "epoch": 1.2497737556561086, + "grad_norm": 0.5883470773696899, + "learning_rate": 0.00047403236734807225, + "loss": 0.1164, + "mean_token_accuracy": 0.9664830714464188, + "num_tokens": 3054084.0, + "step": 346 + }, + { + "entropy": 1.7402609288692474, + "epoch": 1.253393665158371, + "grad_norm": 0.7355862855911255, + "learning_rate": 0.00047378831471486815, + "loss": 0.2007, + "mean_token_accuracy": 0.9560511559247971, + "num_tokens": 3062727.0, + "step": 347 + }, + { + "entropy": 1.79518261551857, + "epoch": 1.2570135746606335, + "grad_norm": 0.6006518006324768, + "learning_rate": 0.00047354353003698163, + "loss": 0.1085, + "mean_token_accuracy": 0.9598321914672852, + "num_tokens": 3071178.0, + "step": 348 + }, + { + "entropy": 1.7328391373157501, + "epoch": 1.260633484162896, + "grad_norm": 0.560342013835907, + "learning_rate": 0.0004732980142441362, + "loss": 0.1593, + "mean_token_accuracy": 0.9579409211874008, + "num_tokens": 3079927.0, + "step": 349 + }, + { + "entropy": 1.7356511652469635, + "epoch": 1.2642533936651583, + "grad_norm": 0.9149975776672363, + "learning_rate": 0.00047305176826883206, + "loss": 0.4064, + "mean_token_accuracy": 0.9265118837356567, + "num_tokens": 3089314.0, + "step": 350 + }, + { + "entropy": 1.8573569357395172, + "epoch": 1.2678733031674208, + "grad_norm": 0.8300670981407166, + "learning_rate": 0.0004728047930463428, + "loss": 0.195, + "mean_token_accuracy": 0.9453776180744171, + "num_tokens": 3097702.0, + "step": 351 + }, + { + "entropy": 1.7906217575073242, + "epoch": 1.2714932126696832, + "grad_norm": 0.5668906569480896, + "learning_rate": 0.0004725570895147118, + "loss": 0.1572, + "mean_token_accuracy": 0.962067037820816, + "num_tokens": 3106379.0, + "step": 352 + }, + { + "entropy": 1.6957395374774933, + "epoch": 1.2751131221719456, + "grad_norm": 0.4048328399658203, + "learning_rate": 0.0004723086586147487, + "loss": 0.0944, + "mean_token_accuracy": 0.9716819673776627, + "num_tokens": 3115622.0, + "step": 353 + }, + { + "entropy": 1.8158144056797028, + "epoch": 1.278733031674208, + "grad_norm": 0.6396092772483826, + "learning_rate": 0.00047205950129002564, + "loss": 0.1011, + "mean_token_accuracy": 0.9698463827371597, + "num_tokens": 3124016.0, + "step": 354 + }, + { + "entropy": 1.730194479227066, + "epoch": 1.2823529411764705, + "grad_norm": 0.662876307964325, + "learning_rate": 0.000471809618486874, + "loss": 0.1641, + "mean_token_accuracy": 0.9520179778337479, + "num_tokens": 3132712.0, + "step": 355 + }, + { + "entropy": 1.6776110529899597, + "epoch": 1.285972850678733, + "grad_norm": 0.868507981300354, + "learning_rate": 0.0004715590111543804, + "loss": 0.3374, + "mean_token_accuracy": 0.9303739666938782, + "num_tokens": 3142103.0, + "step": 356 + }, + { + "entropy": 1.6501678824424744, + "epoch": 1.2895927601809956, + "grad_norm": 0.5433686971664429, + "learning_rate": 0.0004713076802443834, + "loss": 0.1237, + "mean_token_accuracy": 0.9653612226247787, + "num_tokens": 3151192.0, + "step": 357 + }, + { + "entropy": 1.6524465382099152, + "epoch": 1.293212669683258, + "grad_norm": 0.6145523190498352, + "learning_rate": 0.00047105562671147, + "loss": 0.1204, + "mean_token_accuracy": 0.9690534323453903, + "num_tokens": 3159839.0, + "step": 358 + }, + { + "entropy": 1.5339214205741882, + "epoch": 1.2968325791855204, + "grad_norm": 0.500477135181427, + "learning_rate": 0.00047080285151297144, + "loss": 0.1295, + "mean_token_accuracy": 0.9571033865213394, + "num_tokens": 3169047.0, + "step": 359 + }, + { + "entropy": 1.6765435338020325, + "epoch": 1.3004524886877828, + "grad_norm": 0.6697553396224976, + "learning_rate": 0.00047054935560896026, + "loss": 0.135, + "mean_token_accuracy": 0.9672541171312332, + "num_tokens": 3177062.0, + "step": 360 + }, + { + "entropy": 1.5932062566280365, + "epoch": 1.3040723981900453, + "grad_norm": 0.706957221031189, + "learning_rate": 0.0004702951399622462, + "loss": 0.1229, + "mean_token_accuracy": 0.9634416699409485, + "num_tokens": 3185829.0, + "step": 361 + }, + { + "entropy": 1.5623145997524261, + "epoch": 1.3076923076923077, + "grad_norm": 0.6199461221694946, + "learning_rate": 0.00047004020553837275, + "loss": 0.1449, + "mean_token_accuracy": 0.9620065689086914, + "num_tokens": 3194426.0, + "step": 362 + }, + { + "entropy": 1.5226828753948212, + "epoch": 1.3113122171945701, + "grad_norm": 0.8962509036064148, + "learning_rate": 0.0004697845533056132, + "loss": 0.2207, + "mean_token_accuracy": 0.9403344839811325, + "num_tokens": 3203655.0, + "step": 363 + }, + { + "entropy": 1.5395641326904297, + "epoch": 1.3149321266968326, + "grad_norm": 0.5993619561195374, + "learning_rate": 0.00046952818423496727, + "loss": 0.1486, + "mean_token_accuracy": 0.9614185988903046, + "num_tokens": 3212069.0, + "step": 364 + }, + { + "entropy": 1.5738630294799805, + "epoch": 1.318552036199095, + "grad_norm": 0.7393983602523804, + "learning_rate": 0.00046927109930015756, + "loss": 0.1812, + "mean_token_accuracy": 0.9535021334886551, + "num_tokens": 3220482.0, + "step": 365 + }, + { + "entropy": 1.5462632775306702, + "epoch": 1.3221719457013574, + "grad_norm": 0.7453555464744568, + "learning_rate": 0.0004690132994776253, + "loss": 0.164, + "mean_token_accuracy": 0.9585814625024796, + "num_tokens": 3229505.0, + "step": 366 + }, + { + "entropy": 1.5241961777210236, + "epoch": 1.3257918552036199, + "grad_norm": 0.7553415298461914, + "learning_rate": 0.00046875478574652713, + "loss": 0.1445, + "mean_token_accuracy": 0.9682841598987579, + "num_tokens": 3238326.0, + "step": 367 + }, + { + "entropy": 1.5344699025154114, + "epoch": 1.3294117647058823, + "grad_norm": 0.8565949201583862, + "learning_rate": 0.0004684955590887311, + "loss": 0.2521, + "mean_token_accuracy": 0.920401468873024, + "num_tokens": 3247482.0, + "step": 368 + }, + { + "entropy": 1.5109277665615082, + "epoch": 1.3330316742081447, + "grad_norm": 0.5170580148696899, + "learning_rate": 0.00046823562048881295, + "loss": 0.1393, + "mean_token_accuracy": 0.9584086239337921, + "num_tokens": 3256464.0, + "step": 369 + }, + { + "entropy": 1.4666939079761505, + "epoch": 1.3366515837104074, + "grad_norm": 0.6995373368263245, + "learning_rate": 0.0004679749709340529, + "loss": 0.1726, + "mean_token_accuracy": 0.9477890431880951, + "num_tokens": 3265853.0, + "step": 370 + }, + { + "entropy": 1.4208430051803589, + "epoch": 1.3402714932126698, + "grad_norm": 1.1363991498947144, + "learning_rate": 0.000467713611414431, + "loss": 0.196, + "mean_token_accuracy": 0.9495431333780289, + "num_tokens": 3275367.0, + "step": 371 + }, + { + "entropy": 1.5009459853172302, + "epoch": 1.3438914027149322, + "grad_norm": 0.7883325219154358, + "learning_rate": 0.00046745154292262414, + "loss": 0.2526, + "mean_token_accuracy": 0.9334618002176285, + "num_tokens": 3284772.0, + "step": 372 + }, + { + "entropy": 1.5485479533672333, + "epoch": 1.3475113122171947, + "grad_norm": 0.6516429781913757, + "learning_rate": 0.00046718876645400156, + "loss": 0.2057, + "mean_token_accuracy": 0.9546459317207336, + "num_tokens": 3293493.0, + "step": 373 + }, + { + "entropy": 1.6237249970436096, + "epoch": 1.351131221719457, + "grad_norm": 0.8916263580322266, + "learning_rate": 0.00046692528300662213, + "loss": 0.2123, + "mean_token_accuracy": 0.9456845372915268, + "num_tokens": 3302063.0, + "step": 374 + }, + { + "entropy": 1.561572015285492, + "epoch": 1.3547511312217195, + "grad_norm": 0.7527791857719421, + "learning_rate": 0.00046666109358122935, + "loss": 0.2113, + "mean_token_accuracy": 0.9537477940320969, + "num_tokens": 3311037.0, + "step": 375 + }, + { + "entropy": 1.5594256818294525, + "epoch": 1.358371040723982, + "grad_norm": 1.25638747215271, + "learning_rate": 0.0004663961991812485, + "loss": 0.1629, + "mean_token_accuracy": 0.9508458077907562, + "num_tokens": 3319635.0, + "step": 376 + }, + { + "entropy": 1.6909976303577423, + "epoch": 1.3619909502262444, + "grad_norm": 0.7627813220024109, + "learning_rate": 0.00046613060081278194, + "loss": 0.2303, + "mean_token_accuracy": 0.9425801336765289, + "num_tokens": 3328043.0, + "step": 377 + }, + { + "entropy": 1.6074829697608948, + "epoch": 1.3656108597285068, + "grad_norm": 0.6584346294403076, + "learning_rate": 0.00046586429948460646, + "loss": 0.1815, + "mean_token_accuracy": 0.9536214470863342, + "num_tokens": 3337143.0, + "step": 378 + }, + { + "entropy": 1.7382183969020844, + "epoch": 1.3692307692307693, + "grad_norm": 1.37154221534729, + "learning_rate": 0.0004655972962081684, + "loss": 0.1849, + "mean_token_accuracy": 0.948440819978714, + "num_tokens": 3346033.0, + "step": 379 + }, + { + "entropy": 1.7148900926113129, + "epoch": 1.3728506787330317, + "grad_norm": 0.9487980604171753, + "learning_rate": 0.00046532959199758, + "loss": 0.2521, + "mean_token_accuracy": 0.9344504028558731, + "num_tokens": 3354849.0, + "step": 380 + }, + { + "entropy": 1.7164019346237183, + "epoch": 1.3764705882352941, + "grad_norm": 0.5609025359153748, + "learning_rate": 0.00046506118786961614, + "loss": 0.1425, + "mean_token_accuracy": 0.9571309834718704, + "num_tokens": 3363674.0, + "step": 381 + }, + { + "entropy": 1.894619107246399, + "epoch": 1.3800904977375565, + "grad_norm": 0.9811336994171143, + "learning_rate": 0.00046479208484370997, + "loss": 0.2522, + "mean_token_accuracy": 0.9424156546592712, + "num_tokens": 3372325.0, + "step": 382 + }, + { + "entropy": 1.78870290517807, + "epoch": 1.383710407239819, + "grad_norm": 0.5707085132598877, + "learning_rate": 0.00046452228394194893, + "loss": 0.1354, + "mean_token_accuracy": 0.9613165706396103, + "num_tokens": 3381270.0, + "step": 383 + }, + { + "entropy": 1.803922712802887, + "epoch": 1.3873303167420814, + "grad_norm": 0.5655364394187927, + "learning_rate": 0.0004642517861890713, + "loss": 0.0818, + "mean_token_accuracy": 0.9776160269975662, + "num_tokens": 3390363.0, + "step": 384 + }, + { + "entropy": 1.8172507882118225, + "epoch": 1.3909502262443438, + "grad_norm": 0.6950513124465942, + "learning_rate": 0.00046398059261246205, + "loss": 0.1145, + "mean_token_accuracy": 0.963288351893425, + "num_tokens": 3399176.0, + "step": 385 + }, + { + "entropy": 1.9182518422603607, + "epoch": 1.3945701357466063, + "grad_norm": 0.5900619029998779, + "learning_rate": 0.0004637087042421489, + "loss": 0.108, + "mean_token_accuracy": 0.9723307639360428, + "num_tokens": 3407978.0, + "step": 386 + }, + { + "entropy": 1.8558574616909027, + "epoch": 1.3981900452488687, + "grad_norm": 0.6279832124710083, + "learning_rate": 0.00046343612211079843, + "loss": 0.1471, + "mean_token_accuracy": 0.9603912532329559, + "num_tokens": 3416856.0, + "step": 387 + }, + { + "entropy": 1.8146779537200928, + "epoch": 1.4018099547511311, + "grad_norm": 0.6171274781227112, + "learning_rate": 0.0004631628472537125, + "loss": 0.1872, + "mean_token_accuracy": 0.9447146654129028, + "num_tokens": 3426044.0, + "step": 388 + }, + { + "entropy": 1.9342225790023804, + "epoch": 1.4054298642533936, + "grad_norm": 0.9947887659072876, + "learning_rate": 0.00046288888070882374, + "loss": 0.2966, + "mean_token_accuracy": 0.9279204607009888, + "num_tokens": 3435154.0, + "step": 389 + }, + { + "entropy": 1.9391801953315735, + "epoch": 1.409049773755656, + "grad_norm": 0.7155653834342957, + "learning_rate": 0.000462614223516692, + "loss": 0.1847, + "mean_token_accuracy": 0.9475171864032745, + "num_tokens": 3444563.0, + "step": 390 + }, + { + "entropy": 2.0716978013515472, + "epoch": 1.4126696832579184, + "grad_norm": 0.8198989629745483, + "learning_rate": 0.0004623388767205004, + "loss": 0.1317, + "mean_token_accuracy": 0.9608721435070038, + "num_tokens": 3453410.0, + "step": 391 + }, + { + "entropy": 2.1060431599617004, + "epoch": 1.416289592760181, + "grad_norm": 1.025406002998352, + "learning_rate": 0.00046206284136605106, + "loss": 0.2146, + "mean_token_accuracy": 0.9414294511079788, + "num_tokens": 3461958.0, + "step": 392 + }, + { + "entropy": 2.1459922194480896, + "epoch": 1.4199095022624435, + "grad_norm": 0.9209627509117126, + "learning_rate": 0.00046178611850176146, + "loss": 0.2137, + "mean_token_accuracy": 0.956874743103981, + "num_tokens": 3470547.0, + "step": 393 + }, + { + "entropy": 2.0233450531959534, + "epoch": 1.423529411764706, + "grad_norm": 0.5777944922447205, + "learning_rate": 0.00046150870917866025, + "loss": 0.122, + "mean_token_accuracy": 0.9672323018312454, + "num_tokens": 3479618.0, + "step": 394 + }, + { + "entropy": 2.035937190055847, + "epoch": 1.4271493212669684, + "grad_norm": 0.7945542931556702, + "learning_rate": 0.0004612306144503835, + "loss": 0.2879, + "mean_token_accuracy": 0.946587473154068, + "num_tokens": 3488533.0, + "step": 395 + }, + { + "entropy": 2.155315637588501, + "epoch": 1.4307692307692308, + "grad_norm": 0.6385292410850525, + "learning_rate": 0.00046095183537317035, + "loss": 0.1008, + "mean_token_accuracy": 0.9655124247074127, + "num_tokens": 3496686.0, + "step": 396 + }, + { + "entropy": 2.186827063560486, + "epoch": 1.4343891402714932, + "grad_norm": 0.4759826958179474, + "learning_rate": 0.0004606723730058593, + "loss": 0.0768, + "mean_token_accuracy": 0.9783597737550735, + "num_tokens": 3504958.0, + "step": 397 + }, + { + "entropy": 1.974392294883728, + "epoch": 1.4380090497737557, + "grad_norm": 0.6250292062759399, + "learning_rate": 0.00046039222840988406, + "loss": 0.1381, + "mean_token_accuracy": 0.9586146324872971, + "num_tokens": 3513694.0, + "step": 398 + }, + { + "entropy": 2.045738846063614, + "epoch": 1.441628959276018, + "grad_norm": 0.5517769455909729, + "learning_rate": 0.0004601114026492695, + "loss": 0.1312, + "mean_token_accuracy": 0.9682512134313583, + "num_tokens": 3522395.0, + "step": 399 + }, + { + "entropy": 2.105030357837677, + "epoch": 1.4452488687782805, + "grad_norm": 0.6748242974281311, + "learning_rate": 0.0004598298967906276, + "loss": 0.1056, + "mean_token_accuracy": 0.9701305478811264, + "num_tokens": 3530838.0, + "step": 400 + }, + { + "entropy": 2.024325281381607, + "epoch": 1.448868778280543, + "grad_norm": 0.6320233941078186, + "learning_rate": 0.00045954771190315344, + "loss": 0.1129, + "mean_token_accuracy": 0.9633017927408218, + "num_tokens": 3540184.0, + "step": 401 + }, + { + "entropy": 2.1561593413352966, + "epoch": 1.4524886877828054, + "grad_norm": 0.7380363941192627, + "learning_rate": 0.0004592648490586213, + "loss": 0.1304, + "mean_token_accuracy": 0.9599586874246597, + "num_tokens": 3548727.0, + "step": 402 + }, + { + "entropy": 2.2986454367637634, + "epoch": 1.4561085972850678, + "grad_norm": 0.669114351272583, + "learning_rate": 0.00045898130933138024, + "loss": 0.1005, + "mean_token_accuracy": 0.9724964797496796, + "num_tokens": 3556780.0, + "step": 403 + }, + { + "entropy": 2.103136509656906, + "epoch": 1.4597285067873302, + "grad_norm": 0.6677402853965759, + "learning_rate": 0.0004586970937983504, + "loss": 0.1177, + "mean_token_accuracy": 0.9597653448581696, + "num_tokens": 3565427.0, + "step": 404 + }, + { + "entropy": 2.112696200609207, + "epoch": 1.463348416289593, + "grad_norm": 0.4597342014312744, + "learning_rate": 0.0004584122035390185, + "loss": 0.0695, + "mean_token_accuracy": 0.9763098359107971, + "num_tokens": 3573902.0, + "step": 405 + }, + { + "entropy": 2.0472628474235535, + "epoch": 1.4669683257918553, + "grad_norm": 0.7842056751251221, + "learning_rate": 0.0004581266396354339, + "loss": 0.1981, + "mean_token_accuracy": 0.9521032422780991, + "num_tokens": 3582913.0, + "step": 406 + }, + { + "entropy": 2.236558735370636, + "epoch": 1.4705882352941178, + "grad_norm": 0.7634767293930054, + "learning_rate": 0.000457840403172205, + "loss": 0.1956, + "mean_token_accuracy": 0.9602932929992676, + "num_tokens": 3591197.0, + "step": 407 + }, + { + "entropy": 2.182949125766754, + "epoch": 1.4742081447963802, + "grad_norm": 0.7084661722183228, + "learning_rate": 0.00045755349523649415, + "loss": 0.2463, + "mean_token_accuracy": 0.9392582327127457, + "num_tokens": 3600134.0, + "step": 408 + }, + { + "entropy": 2.135133147239685, + "epoch": 1.4778280542986426, + "grad_norm": 0.8172940015792847, + "learning_rate": 0.00045726591691801433, + "loss": 0.2375, + "mean_token_accuracy": 0.9458330571651459, + "num_tokens": 3608945.0, + "step": 409 + }, + { + "entropy": 2.157473146915436, + "epoch": 1.481447963800905, + "grad_norm": 0.6165594458580017, + "learning_rate": 0.0004569776693090246, + "loss": 0.1628, + "mean_token_accuracy": 0.9586529731750488, + "num_tokens": 3617790.0, + "step": 410 + }, + { + "entropy": 2.15165376663208, + "epoch": 1.4850678733031675, + "grad_norm": 0.6619407534599304, + "learning_rate": 0.0004566887535043263, + "loss": 0.1866, + "mean_token_accuracy": 0.9545126557350159, + "num_tokens": 3626937.0, + "step": 411 + }, + { + "entropy": 2.271161735057831, + "epoch": 1.48868778280543, + "grad_norm": 0.5861835479736328, + "learning_rate": 0.0004563991706012582, + "loss": 0.1409, + "mean_token_accuracy": 0.9595955163240433, + "num_tokens": 3636025.0, + "step": 412 + }, + { + "entropy": 2.277799427509308, + "epoch": 1.4923076923076923, + "grad_norm": 0.6464956402778625, + "learning_rate": 0.00045610892169969323, + "loss": 0.0792, + "mean_token_accuracy": 0.9806316941976547, + "num_tokens": 3644746.0, + "step": 413 + }, + { + "entropy": 2.2143171429634094, + "epoch": 1.4959276018099548, + "grad_norm": 0.7531687021255493, + "learning_rate": 0.00045581800790203366, + "loss": 0.2584, + "mean_token_accuracy": 0.9225966930389404, + "num_tokens": 3654064.0, + "step": 414 + }, + { + "entropy": 2.231681764125824, + "epoch": 1.4995475113122172, + "grad_norm": 0.6902768015861511, + "learning_rate": 0.00045552643031320726, + "loss": 0.232, + "mean_token_accuracy": 0.9433842301368713, + "num_tokens": 3663130.0, + "step": 415 + }, + { + "entropy": 2.2672717571258545, + "epoch": 1.5031674208144796, + "grad_norm": 0.5134314894676208, + "learning_rate": 0.00045523419004066273, + "loss": 0.0874, + "mean_token_accuracy": 0.9708191752433777, + "num_tokens": 3671981.0, + "step": 416 + }, + { + "entropy": 2.3302834033966064, + "epoch": 1.506787330316742, + "grad_norm": 0.885969340801239, + "learning_rate": 0.0004549412881943659, + "loss": 0.0723, + "mean_token_accuracy": 0.9791463166475296, + "num_tokens": 3680525.0, + "step": 417 + }, + { + "entropy": 2.2693899869918823, + "epoch": 1.5104072398190045, + "grad_norm": 0.7424856424331665, + "learning_rate": 0.00045464772588679547, + "loss": 0.1509, + "mean_token_accuracy": 0.9600907415151596, + "num_tokens": 3689430.0, + "step": 418 + }, + { + "entropy": 2.4042725563049316, + "epoch": 1.514027149321267, + "grad_norm": 0.8968034982681274, + "learning_rate": 0.0004543535042329382, + "loss": 0.1984, + "mean_token_accuracy": 0.9488537162542343, + "num_tokens": 3697836.0, + "step": 419 + }, + { + "entropy": 2.2518428564071655, + "epoch": 1.5176470588235293, + "grad_norm": 0.5963534712791443, + "learning_rate": 0.0004540586243502858, + "loss": 0.1214, + "mean_token_accuracy": 0.9711381644010544, + "num_tokens": 3706675.0, + "step": 420 + }, + { + "entropy": 2.275522291660309, + "epoch": 1.5212669683257918, + "grad_norm": 1.0797090530395508, + "learning_rate": 0.0004537630873588293, + "loss": 0.2508, + "mean_token_accuracy": 0.9247037768363953, + "num_tokens": 3715631.0, + "step": 421 + }, + { + "entropy": 2.249617278575897, + "epoch": 1.5248868778280542, + "grad_norm": 0.7636313438415527, + "learning_rate": 0.000453466894381056, + "loss": 0.1112, + "mean_token_accuracy": 0.9681926071643829, + "num_tokens": 3724579.0, + "step": 422 + }, + { + "entropy": 2.280571699142456, + "epoch": 1.5285067873303166, + "grad_norm": 0.9915648698806763, + "learning_rate": 0.00045317004654194464, + "loss": 0.3532, + "mean_token_accuracy": 0.9360047876834869, + "num_tokens": 3733607.0, + "step": 423 + }, + { + "entropy": 2.241512656211853, + "epoch": 1.532126696832579, + "grad_norm": 0.924977719783783, + "learning_rate": 0.0004528725449689611, + "loss": 0.1997, + "mean_token_accuracy": 0.9475428760051727, + "num_tokens": 3742611.0, + "step": 424 + }, + { + "entropy": 2.201731503009796, + "epoch": 1.5357466063348415, + "grad_norm": 0.7018861770629883, + "learning_rate": 0.0004525743907920542, + "loss": 0.1683, + "mean_token_accuracy": 0.9465018659830093, + "num_tokens": 3751737.0, + "step": 425 + }, + { + "entropy": 2.28944593667984, + "epoch": 1.539366515837104, + "grad_norm": 0.5893452763557434, + "learning_rate": 0.00045227558514365166, + "loss": 0.0969, + "mean_token_accuracy": 0.9711766839027405, + "num_tokens": 3761245.0, + "step": 426 + }, + { + "entropy": 2.3497202396392822, + "epoch": 1.5429864253393664, + "grad_norm": 0.685279130935669, + "learning_rate": 0.0004519761291586551, + "loss": 0.106, + "mean_token_accuracy": 0.9663016647100449, + "num_tokens": 3769854.0, + "step": 427 + }, + { + "entropy": 2.308362066745758, + "epoch": 1.5466063348416288, + "grad_norm": 0.5116177797317505, + "learning_rate": 0.00045167602397443694, + "loss": 0.1132, + "mean_token_accuracy": 0.9700013697147369, + "num_tokens": 3778996.0, + "step": 428 + }, + { + "entropy": 2.238637685775757, + "epoch": 1.5502262443438914, + "grad_norm": 0.8374833464622498, + "learning_rate": 0.00045137527073083457, + "loss": 0.2539, + "mean_token_accuracy": 0.9407305717468262, + "num_tokens": 3787835.0, + "step": 429 + }, + { + "entropy": 2.3406758308410645, + "epoch": 1.5538461538461539, + "grad_norm": 0.5140913724899292, + "learning_rate": 0.0004510738705701473, + "loss": 0.1113, + "mean_token_accuracy": 0.9635641574859619, + "num_tokens": 3796498.0, + "step": 430 + }, + { + "entropy": 2.2642539143562317, + "epoch": 1.5574660633484163, + "grad_norm": 0.5750702023506165, + "learning_rate": 0.0004507718246371313, + "loss": 0.1127, + "mean_token_accuracy": 0.9660817235708237, + "num_tokens": 3805464.0, + "step": 431 + }, + { + "entropy": 2.2058264315128326, + "epoch": 1.5610859728506787, + "grad_norm": 0.6448659300804138, + "learning_rate": 0.0004504691340789955, + "loss": 0.0994, + "mean_token_accuracy": 0.96739861369133, + "num_tokens": 3814309.0, + "step": 432 + }, + { + "entropy": 2.330399215221405, + "epoch": 1.5647058823529412, + "grad_norm": 0.8432528376579285, + "learning_rate": 0.0004501658000453973, + "loss": 0.1999, + "mean_token_accuracy": 0.9510775059461594, + "num_tokens": 3823126.0, + "step": 433 + }, + { + "entropy": 2.4211326837539673, + "epoch": 1.5683257918552036, + "grad_norm": 0.8101194500923157, + "learning_rate": 0.00044986182368843806, + "loss": 0.144, + "mean_token_accuracy": 0.9656328558921814, + "num_tokens": 3831274.0, + "step": 434 + }, + { + "entropy": 2.2594956755638123, + "epoch": 1.571945701357466, + "grad_norm": 0.6753663420677185, + "learning_rate": 0.0004495572061626585, + "loss": 0.1433, + "mean_token_accuracy": 0.9572386592626572, + "num_tokens": 3840206.0, + "step": 435 + }, + { + "entropy": 2.1233682930469513, + "epoch": 1.5755656108597285, + "grad_norm": 0.48616713285446167, + "learning_rate": 0.000449251948625035, + "loss": 0.0934, + "mean_token_accuracy": 0.9740773588418961, + "num_tokens": 3849363.0, + "step": 436 + }, + { + "entropy": 2.325556695461273, + "epoch": 1.5791855203619911, + "grad_norm": 0.7744045853614807, + "learning_rate": 0.00044894605223497446, + "loss": 0.127, + "mean_token_accuracy": 0.9687052518129349, + "num_tokens": 3857733.0, + "step": 437 + }, + { + "entropy": 2.266542673110962, + "epoch": 1.5828054298642535, + "grad_norm": 2.373530387878418, + "learning_rate": 0.00044863951815431045, + "loss": 0.2404, + "mean_token_accuracy": 0.9437267184257507, + "num_tokens": 3866374.0, + "step": 438 + }, + { + "entropy": 2.1757248640060425, + "epoch": 1.586425339366516, + "grad_norm": 0.5588560700416565, + "learning_rate": 0.00044833234754729847, + "loss": 0.142, + "mean_token_accuracy": 0.9601300358772278, + "num_tokens": 3875520.0, + "step": 439 + }, + { + "entropy": 2.124377518892288, + "epoch": 1.5900452488687784, + "grad_norm": 0.5602438449859619, + "learning_rate": 0.0004480245415806116, + "loss": 0.1556, + "mean_token_accuracy": 0.9561446160078049, + "num_tokens": 3884345.0, + "step": 440 + }, + { + "entropy": 2.1571075320243835, + "epoch": 1.5936651583710408, + "grad_norm": 0.472598671913147, + "learning_rate": 0.0004477161014233361, + "loss": 0.0848, + "mean_token_accuracy": 0.9742853343486786, + "num_tokens": 3893129.0, + "step": 441 + }, + { + "entropy": 2.0434057414531708, + "epoch": 1.5972850678733033, + "grad_norm": 0.7104448676109314, + "learning_rate": 0.00044740702824696703, + "loss": 0.1524, + "mean_token_accuracy": 0.9542464315891266, + "num_tokens": 3902120.0, + "step": 442 + }, + { + "entropy": 2.1118403673171997, + "epoch": 1.6009049773755657, + "grad_norm": 0.6632394194602966, + "learning_rate": 0.0004470973232254037, + "loss": 0.3001, + "mean_token_accuracy": 0.928197592496872, + "num_tokens": 3910974.0, + "step": 443 + }, + { + "entropy": 2.0292475819587708, + "epoch": 1.6045248868778281, + "grad_norm": 1.050956130027771, + "learning_rate": 0.00044678698753494527, + "loss": 0.2226, + "mean_token_accuracy": 0.9448522627353668, + "num_tokens": 3920005.0, + "step": 444 + }, + { + "entropy": 1.991033524274826, + "epoch": 1.6081447963800906, + "grad_norm": 0.670244038105011, + "learning_rate": 0.00044647602235428624, + "loss": 0.2158, + "mean_token_accuracy": 0.9551118016242981, + "num_tokens": 3929334.0, + "step": 445 + }, + { + "entropy": 2.04949289560318, + "epoch": 1.611764705882353, + "grad_norm": 0.6321494579315186, + "learning_rate": 0.00044616442886451197, + "loss": 0.1743, + "mean_token_accuracy": 0.9494802355766296, + "num_tokens": 3938211.0, + "step": 446 + }, + { + "entropy": 2.1101951897144318, + "epoch": 1.6153846153846154, + "grad_norm": 0.6970012187957764, + "learning_rate": 0.0004458522082490943, + "loss": 0.1228, + "mean_token_accuracy": 0.9624926447868347, + "num_tokens": 3946534.0, + "step": 447 + }, + { + "entropy": 1.9337081909179688, + "epoch": 1.6190045248868778, + "grad_norm": 0.5971657633781433, + "learning_rate": 0.0004455393616938868, + "loss": 0.1431, + "mean_token_accuracy": 0.9635348320007324, + "num_tokens": 3955694.0, + "step": 448 + }, + { + "entropy": 1.9635128676891327, + "epoch": 1.6226244343891403, + "grad_norm": 0.8510827422142029, + "learning_rate": 0.00044522589038712074, + "loss": 0.2446, + "mean_token_accuracy": 0.9457641988992691, + "num_tokens": 3964907.0, + "step": 449 + }, + { + "entropy": 2.0336360335350037, + "epoch": 1.6262443438914027, + "grad_norm": 0.5803818106651306, + "learning_rate": 0.00044491179551939985, + "loss": 0.0872, + "mean_token_accuracy": 0.9734505414962769, + "num_tokens": 3973584.0, + "step": 450 + }, + { + "entropy": 2.0668878853321075, + "epoch": 1.6298642533936651, + "grad_norm": 0.6990496516227722, + "learning_rate": 0.0004445970782836967, + "loss": 0.1138, + "mean_token_accuracy": 0.9702571034431458, + "num_tokens": 3982632.0, + "step": 451 + }, + { + "entropy": 2.1481760144233704, + "epoch": 1.6334841628959276, + "grad_norm": 0.6156729459762573, + "learning_rate": 0.00044428173987534733, + "loss": 0.0936, + "mean_token_accuracy": 0.9739355593919754, + "num_tokens": 3991147.0, + "step": 452 + }, + { + "entropy": 2.0678701996803284, + "epoch": 1.63710407239819, + "grad_norm": 0.5441684126853943, + "learning_rate": 0.0004439657814920472, + "loss": 0.123, + "mean_token_accuracy": 0.9693446308374405, + "num_tokens": 3999990.0, + "step": 453 + }, + { + "entropy": 1.9867055118083954, + "epoch": 1.6407239819004524, + "grad_norm": 0.9218093156814575, + "learning_rate": 0.00044364920433384656, + "loss": 0.1997, + "mean_token_accuracy": 0.9564195573329926, + "num_tokens": 4009097.0, + "step": 454 + }, + { + "entropy": 2.145586997270584, + "epoch": 1.6443438914027149, + "grad_norm": 0.77643883228302, + "learning_rate": 0.0004433320096031458, + "loss": 0.1491, + "mean_token_accuracy": 0.9602408111095428, + "num_tokens": 4018059.0, + "step": 455 + }, + { + "entropy": 2.071108251810074, + "epoch": 1.6479638009049773, + "grad_norm": 0.5267088413238525, + "learning_rate": 0.0004430141985046909, + "loss": 0.0875, + "mean_token_accuracy": 0.9764399826526642, + "num_tokens": 4027089.0, + "step": 456 + }, + { + "entropy": 2.1659318804740906, + "epoch": 1.6515837104072397, + "grad_norm": 1.0642318725585938, + "learning_rate": 0.000442695772245569, + "loss": 0.2623, + "mean_token_accuracy": 0.9307756721973419, + "num_tokens": 4035719.0, + "step": 457 + }, + { + "entropy": 2.0232724249362946, + "epoch": 1.6552036199095022, + "grad_norm": 0.6213289499282837, + "learning_rate": 0.0004423767320352035, + "loss": 0.1597, + "mean_token_accuracy": 0.9599647223949432, + "num_tokens": 4045088.0, + "step": 458 + }, + { + "entropy": 2.047410547733307, + "epoch": 1.6588235294117646, + "grad_norm": 0.6346105933189392, + "learning_rate": 0.0004420570790853498, + "loss": 0.1422, + "mean_token_accuracy": 0.9649711549282074, + "num_tokens": 4054262.0, + "step": 459 + }, + { + "entropy": 2.0923012793064117, + "epoch": 1.662443438914027, + "grad_norm": 0.46477749943733215, + "learning_rate": 0.0004417368146100907, + "loss": 0.079, + "mean_token_accuracy": 0.9777993708848953, + "num_tokens": 4063107.0, + "step": 460 + }, + { + "entropy": 2.168913394212723, + "epoch": 1.6660633484162894, + "grad_norm": 0.5164734721183777, + "learning_rate": 0.0004414159398258312, + "loss": 0.0941, + "mean_token_accuracy": 0.9725133627653122, + "num_tokens": 4071656.0, + "step": 461 + }, + { + "entropy": 2.152670443058014, + "epoch": 1.6696832579185519, + "grad_norm": 0.8985757231712341, + "learning_rate": 0.00044109445595129495, + "loss": 0.2142, + "mean_token_accuracy": 0.9387252777814865, + "num_tokens": 4080023.0, + "step": 462 + }, + { + "entropy": 2.111784875392914, + "epoch": 1.6733031674208145, + "grad_norm": 0.47521084547042847, + "learning_rate": 0.0004407723642075184, + "loss": 0.0581, + "mean_token_accuracy": 0.9821985810995102, + "num_tokens": 4088469.0, + "step": 463 + }, + { + "entropy": 1.9784683287143707, + "epoch": 1.676923076923077, + "grad_norm": 0.5552536249160767, + "learning_rate": 0.0004404496658178472, + "loss": 0.1353, + "mean_token_accuracy": 0.9619844257831573, + "num_tokens": 4097737.0, + "step": 464 + }, + { + "entropy": 2.015674114227295, + "epoch": 1.6805429864253394, + "grad_norm": 0.6078305244445801, + "learning_rate": 0.0004401263620079309, + "loss": 0.1916, + "mean_token_accuracy": 0.9506707191467285, + "num_tokens": 4107156.0, + "step": 465 + }, + { + "entropy": 2.0832217931747437, + "epoch": 1.6841628959276018, + "grad_norm": 0.6618755459785461, + "learning_rate": 0.0004398024540057186, + "loss": 0.1671, + "mean_token_accuracy": 0.9617152661085129, + "num_tokens": 4116019.0, + "step": 466 + }, + { + "entropy": 2.0383114516735077, + "epoch": 1.6877828054298643, + "grad_norm": 0.5774693489074707, + "learning_rate": 0.0004394779430414541, + "loss": 0.2647, + "mean_token_accuracy": 0.9387127161026001, + "num_tokens": 4125001.0, + "step": 467 + }, + { + "entropy": 2.201409190893173, + "epoch": 1.6914027149321267, + "grad_norm": 0.7600311636924744, + "learning_rate": 0.0004391528303476715, + "loss": 0.073, + "mean_token_accuracy": 0.979825034737587, + "num_tokens": 4133467.0, + "step": 468 + }, + { + "entropy": 2.168666422367096, + "epoch": 1.6950226244343891, + "grad_norm": 0.7801902294158936, + "learning_rate": 0.00043882711715919015, + "loss": 0.2406, + "mean_token_accuracy": 0.9451306313276291, + "num_tokens": 4141765.0, + "step": 469 + }, + { + "entropy": 2.1429262161254883, + "epoch": 1.6986425339366515, + "grad_norm": 0.5192358493804932, + "learning_rate": 0.0004385008047131104, + "loss": 0.1052, + "mean_token_accuracy": 0.9749262481927872, + "num_tokens": 4150732.0, + "step": 470 + }, + { + "entropy": 2.1387495696544647, + "epoch": 1.702262443438914, + "grad_norm": 0.6219777464866638, + "learning_rate": 0.0004381738942488083, + "loss": 0.2127, + "mean_token_accuracy": 0.9398418068885803, + "num_tokens": 4159715.0, + "step": 471 + }, + { + "entropy": 2.1718398332595825, + "epoch": 1.7058823529411766, + "grad_norm": 0.5738123655319214, + "learning_rate": 0.0004378463870079316, + "loss": 0.1703, + "mean_token_accuracy": 0.9520847648382187, + "num_tokens": 4168526.0, + "step": 472 + }, + { + "entropy": 2.2768235206604004, + "epoch": 1.709502262443439, + "grad_norm": 0.662564754486084, + "learning_rate": 0.00043751828423439456, + "loss": 0.138, + "mean_token_accuracy": 0.9581841826438904, + "num_tokens": 4177189.0, + "step": 473 + }, + { + "entropy": 2.29143089056015, + "epoch": 1.7131221719457015, + "grad_norm": 0.8638074398040771, + "learning_rate": 0.00043718958717437324, + "loss": 0.1432, + "mean_token_accuracy": 0.9645630270242691, + "num_tokens": 4185367.0, + "step": 474 + }, + { + "entropy": 2.2810245156288147, + "epoch": 1.716742081447964, + "grad_norm": 0.6139346957206726, + "learning_rate": 0.00043686029707630097, + "loss": 0.173, + "mean_token_accuracy": 0.9592728316783905, + "num_tokens": 4194418.0, + "step": 475 + }, + { + "entropy": 2.1307725310325623, + "epoch": 1.7203619909502263, + "grad_norm": 0.5192779302597046, + "learning_rate": 0.00043653041519086354, + "loss": 0.1025, + "mean_token_accuracy": 0.970764696598053, + "num_tokens": 4203705.0, + "step": 476 + }, + { + "entropy": 2.160595118999481, + "epoch": 1.7239819004524888, + "grad_norm": 0.7398526668548584, + "learning_rate": 0.0004361999427709943, + "loss": 0.229, + "mean_token_accuracy": 0.9352773874998093, + "num_tokens": 4212648.0, + "step": 477 + }, + { + "entropy": 2.1865442991256714, + "epoch": 1.7276018099547512, + "grad_norm": 0.6227203011512756, + "learning_rate": 0.0004358688810718699, + "loss": 0.1118, + "mean_token_accuracy": 0.9689576476812363, + "num_tokens": 4221208.0, + "step": 478 + }, + { + "entropy": 2.086527943611145, + "epoch": 1.7312217194570136, + "grad_norm": 0.722144603729248, + "learning_rate": 0.00043553723135090447, + "loss": 0.1656, + "mean_token_accuracy": 0.9537550210952759, + "num_tokens": 4230810.0, + "step": 479 + }, + { + "entropy": 2.068355441093445, + "epoch": 1.734841628959276, + "grad_norm": 0.5781517028808594, + "learning_rate": 0.0004352049948677462, + "loss": 0.1497, + "mean_token_accuracy": 0.9600837379693985, + "num_tokens": 4240394.0, + "step": 480 + }, + { + "entropy": 2.185140371322632, + "epoch": 1.7384615384615385, + "grad_norm": 0.7261873483657837, + "learning_rate": 0.0004348721728842715, + "loss": 0.1582, + "mean_token_accuracy": 0.9584025889635086, + "num_tokens": 4249205.0, + "step": 481 + }, + { + "entropy": 2.21835720539093, + "epoch": 1.742081447963801, + "grad_norm": 0.5321667194366455, + "learning_rate": 0.0004345387666645807, + "loss": 0.1344, + "mean_token_accuracy": 0.9659005403518677, + "num_tokens": 4257808.0, + "step": 482 + }, + { + "entropy": 2.078131854534149, + "epoch": 1.7457013574660634, + "grad_norm": 0.5598498582839966, + "learning_rate": 0.00043420477747499307, + "loss": 0.1347, + "mean_token_accuracy": 0.9678008407354355, + "num_tokens": 4266728.0, + "step": 483 + }, + { + "entropy": 2.060504525899887, + "epoch": 1.7493212669683258, + "grad_norm": 0.5017166137695312, + "learning_rate": 0.0004338702065840422, + "loss": 0.0722, + "mean_token_accuracy": 0.9762782007455826, + "num_tokens": 4275514.0, + "step": 484 + }, + { + "entropy": 2.165244698524475, + "epoch": 1.7529411764705882, + "grad_norm": 0.4664002060890198, + "learning_rate": 0.00043353505526247084, + "loss": 0.1206, + "mean_token_accuracy": 0.9696767777204514, + "num_tokens": 4284013.0, + "step": 485 + }, + { + "entropy": 2.103049159049988, + "epoch": 1.7565610859728507, + "grad_norm": 0.6669000387191772, + "learning_rate": 0.0004331993247832265, + "loss": 0.1052, + "mean_token_accuracy": 0.9665459096431732, + "num_tokens": 4293011.0, + "step": 486 + }, + { + "entropy": 2.1286613941192627, + "epoch": 1.760180995475113, + "grad_norm": 0.7821269631385803, + "learning_rate": 0.00043286301642145634, + "loss": 0.3669, + "mean_token_accuracy": 0.9062697291374207, + "num_tokens": 4301965.0, + "step": 487 + }, + { + "entropy": 2.098009169101715, + "epoch": 1.7638009049773755, + "grad_norm": 0.5720731616020203, + "learning_rate": 0.0004325261314545024, + "loss": 0.1324, + "mean_token_accuracy": 0.9650943875312805, + "num_tokens": 4310914.0, + "step": 488 + }, + { + "entropy": 2.164614498615265, + "epoch": 1.767420814479638, + "grad_norm": 1.0500473976135254, + "learning_rate": 0.0004321886711618967, + "loss": 0.1182, + "mean_token_accuracy": 0.9720661342144012, + "num_tokens": 4319072.0, + "step": 489 + }, + { + "entropy": 2.2015402913093567, + "epoch": 1.7710407239819004, + "grad_norm": 0.5770253539085388, + "learning_rate": 0.00043185063682535634, + "loss": 0.1226, + "mean_token_accuracy": 0.9615659862756729, + "num_tokens": 4327539.0, + "step": 490 + }, + { + "entropy": 2.075456440448761, + "epoch": 1.7746606334841628, + "grad_norm": 0.6456925272941589, + "learning_rate": 0.0004315120297287789, + "loss": 0.1123, + "mean_token_accuracy": 0.9628709554672241, + "num_tokens": 4336523.0, + "step": 491 + }, + { + "entropy": 2.158169150352478, + "epoch": 1.7782805429864252, + "grad_norm": 0.8282069563865662, + "learning_rate": 0.00043117285115823733, + "loss": 0.2146, + "mean_token_accuracy": 0.9413971602916718, + "num_tokens": 4345294.0, + "step": 492 + }, + { + "entropy": 2.02735897898674, + "epoch": 1.7819004524886877, + "grad_norm": 0.783597469329834, + "learning_rate": 0.000430833102401975, + "loss": 0.1376, + "mean_token_accuracy": 0.964630737900734, + "num_tokens": 4354107.0, + "step": 493 + }, + { + "entropy": 2.138492166996002, + "epoch": 1.78552036199095, + "grad_norm": 0.6317175030708313, + "learning_rate": 0.000430492784750401, + "loss": 0.1005, + "mean_token_accuracy": 0.9734214246273041, + "num_tokens": 4362560.0, + "step": 494 + }, + { + "entropy": 2.0253217220306396, + "epoch": 1.7891402714932125, + "grad_norm": 0.5523395538330078, + "learning_rate": 0.000430151899496085, + "loss": 0.1633, + "mean_token_accuracy": 0.9558031558990479, + "num_tokens": 4371698.0, + "step": 495 + }, + { + "entropy": 2.160472810268402, + "epoch": 1.792760180995475, + "grad_norm": 0.6557935476303101, + "learning_rate": 0.00042981044793375295, + "loss": 0.1154, + "mean_token_accuracy": 0.9722230583429337, + "num_tokens": 4380612.0, + "step": 496 + }, + { + "entropy": 2.0284159183502197, + "epoch": 1.7963800904977374, + "grad_norm": 0.7357863187789917, + "learning_rate": 0.00042946843136028117, + "loss": 0.1166, + "mean_token_accuracy": 0.9629471153020859, + "num_tokens": 4389521.0, + "step": 497 + }, + { + "entropy": 2.1544791162014008, + "epoch": 1.8, + "grad_norm": 0.5604898929595947, + "learning_rate": 0.00042912585107469226, + "loss": 0.0834, + "mean_token_accuracy": 0.9783036410808563, + "num_tokens": 4398059.0, + "step": 498 + }, + { + "entropy": 2.1051094830036163, + "epoch": 1.8036199095022625, + "grad_norm": 0.4598539173603058, + "learning_rate": 0.0004287827083781497, + "loss": 0.0411, + "mean_token_accuracy": 0.9868490546941757, + "num_tokens": 4406453.0, + "step": 499 + }, + { + "entropy": 2.0219272077083588, + "epoch": 1.807239819004525, + "grad_norm": 0.8164628744125366, + "learning_rate": 0.00042843900457395343, + "loss": 0.1988, + "mean_token_accuracy": 0.9502352625131607, + "num_tokens": 4415440.0, + "step": 500 + }, + { + "entropy": 1.980013906955719, + "epoch": 1.8108597285067873, + "grad_norm": 0.572798490524292, + "learning_rate": 0.0004280947409675341, + "loss": 0.1148, + "mean_token_accuracy": 0.966580331325531, + "num_tokens": 4424532.0, + "step": 501 + }, + { + "entropy": 2.0646563172340393, + "epoch": 1.8144796380090498, + "grad_norm": 0.769386351108551, + "learning_rate": 0.00042774991886644875, + "loss": 0.1592, + "mean_token_accuracy": 0.9553463608026505, + "num_tokens": 4432913.0, + "step": 502 + }, + { + "entropy": 2.040877491235733, + "epoch": 1.8180995475113122, + "grad_norm": 0.7467371821403503, + "learning_rate": 0.0004274045395803758, + "loss": 0.2247, + "mean_token_accuracy": 0.9526964277029037, + "num_tokens": 4441425.0, + "step": 503 + }, + { + "entropy": 1.9934698939323425, + "epoch": 1.8217194570135746, + "grad_norm": 0.6602952480316162, + "learning_rate": 0.00042705860442110964, + "loss": 0.1681, + "mean_token_accuracy": 0.9594631940126419, + "num_tokens": 4450383.0, + "step": 504 + }, + { + "entropy": 2.0858289897441864, + "epoch": 1.825339366515837, + "grad_norm": 0.684380829334259, + "learning_rate": 0.0004267121147025562, + "loss": 0.1154, + "mean_token_accuracy": 0.9638111293315887, + "num_tokens": 4458862.0, + "step": 505 + }, + { + "entropy": 2.0886995792388916, + "epoch": 1.8289592760180997, + "grad_norm": 0.5784837007522583, + "learning_rate": 0.00042636507174072756, + "loss": 0.1026, + "mean_token_accuracy": 0.9676834791898727, + "num_tokens": 4467386.0, + "step": 506 + }, + { + "entropy": 2.0236063301563263, + "epoch": 1.8325791855203621, + "grad_norm": 0.5101180672645569, + "learning_rate": 0.00042601747685373716, + "loss": 0.1031, + "mean_token_accuracy": 0.9734093993902206, + "num_tokens": 4476054.0, + "step": 507 + }, + { + "entropy": 1.9801031053066254, + "epoch": 1.8361990950226246, + "grad_norm": 0.6581607460975647, + "learning_rate": 0.00042566933136179455, + "loss": 0.1548, + "mean_token_accuracy": 0.9581006914377213, + "num_tokens": 4484895.0, + "step": 508 + }, + { + "entropy": 2.0244787633419037, + "epoch": 1.839819004524887, + "grad_norm": 0.8100608587265015, + "learning_rate": 0.0004253206365872008, + "loss": 0.196, + "mean_token_accuracy": 0.9532899260520935, + "num_tokens": 4493737.0, + "step": 509 + }, + { + "entropy": 1.9108119010925293, + "epoch": 1.8434389140271494, + "grad_norm": 0.4903942048549652, + "learning_rate": 0.00042497139385434314, + "loss": 0.1313, + "mean_token_accuracy": 0.9667337089776993, + "num_tokens": 4502840.0, + "step": 510 + }, + { + "entropy": 2.009468197822571, + "epoch": 1.8470588235294119, + "grad_norm": 0.6010113954544067, + "learning_rate": 0.0004246216044896897, + "loss": 0.1013, + "mean_token_accuracy": 0.9692314714193344, + "num_tokens": 4511407.0, + "step": 511 + }, + { + "entropy": 2.0337170362472534, + "epoch": 1.8506787330316743, + "grad_norm": 0.7906802892684937, + "learning_rate": 0.00042427126982178546, + "loss": 0.1682, + "mean_token_accuracy": 0.9550099819898605, + "num_tokens": 4520018.0, + "step": 512 + }, + { + "entropy": 1.8813888728618622, + "epoch": 1.8542986425339367, + "grad_norm": 0.5353080034255981, + "learning_rate": 0.00042392039118124586, + "loss": 0.1228, + "mean_token_accuracy": 0.9624074995517731, + "num_tokens": 4529270.0, + "step": 513 + }, + { + "entropy": 2.012698233127594, + "epoch": 1.8579185520361992, + "grad_norm": 0.6713843941688538, + "learning_rate": 0.00042356896990075285, + "loss": 0.2225, + "mean_token_accuracy": 0.9417333751916885, + "num_tokens": 4538008.0, + "step": 514 + }, + { + "entropy": 1.880586564540863, + "epoch": 1.8615384615384616, + "grad_norm": 0.5821724534034729, + "learning_rate": 0.00042321700731504916, + "loss": 0.1144, + "mean_token_accuracy": 0.9677341282367706, + "num_tokens": 4546950.0, + "step": 515 + }, + { + "entropy": 2.0066279470920563, + "epoch": 1.865158371040724, + "grad_norm": 0.4095056354999542, + "learning_rate": 0.0004228645047609335, + "loss": 0.0424, + "mean_token_accuracy": 0.9854962974786758, + "num_tokens": 4555452.0, + "step": 516 + }, + { + "entropy": 2.042815536260605, + "epoch": 1.8687782805429864, + "grad_norm": 0.5398769974708557, + "learning_rate": 0.0004225114635772555, + "loss": 0.1343, + "mean_token_accuracy": 0.9615450948476791, + "num_tokens": 4564386.0, + "step": 517 + }, + { + "entropy": 2.0948933362960815, + "epoch": 1.8723981900452489, + "grad_norm": 0.6738974452018738, + "learning_rate": 0.0004221578851049107, + "loss": 0.1541, + "mean_token_accuracy": 0.9526563137769699, + "num_tokens": 4573041.0, + "step": 518 + }, + { + "entropy": 2.102545380592346, + "epoch": 1.8760180995475113, + "grad_norm": 0.7769943475723267, + "learning_rate": 0.00042180377068683504, + "loss": 0.2362, + "mean_token_accuracy": 0.9472651779651642, + "num_tokens": 4581666.0, + "step": 519 + }, + { + "entropy": 2.087820291519165, + "epoch": 1.8796380090497737, + "grad_norm": 0.5722424983978271, + "learning_rate": 0.0004214491216680004, + "loss": 0.1657, + "mean_token_accuracy": 0.9537082612514496, + "num_tokens": 4590238.0, + "step": 520 + }, + { + "entropy": 2.0093430876731873, + "epoch": 1.8832579185520362, + "grad_norm": 0.5844932198524475, + "learning_rate": 0.00042109393939540867, + "loss": 0.1485, + "mean_token_accuracy": 0.9624215811491013, + "num_tokens": 4599352.0, + "step": 521 + }, + { + "entropy": 1.9117147326469421, + "epoch": 1.8868778280542986, + "grad_norm": 0.46085676550865173, + "learning_rate": 0.0004207382252180876, + "loss": 0.0853, + "mean_token_accuracy": 0.9769327491521835, + "num_tokens": 4608571.0, + "step": 522 + }, + { + "entropy": 2.0205602943897247, + "epoch": 1.890497737556561, + "grad_norm": 0.5571608543395996, + "learning_rate": 0.000420381980487085, + "loss": 0.1517, + "mean_token_accuracy": 0.9646699875593185, + "num_tokens": 4617445.0, + "step": 523 + }, + { + "entropy": 1.9571953415870667, + "epoch": 1.8941176470588235, + "grad_norm": 0.470630943775177, + "learning_rate": 0.0004200252065554636, + "loss": 0.1005, + "mean_token_accuracy": 0.9750025719404221, + "num_tokens": 4626756.0, + "step": 524 + }, + { + "entropy": 2.063209116458893, + "epoch": 1.897737556561086, + "grad_norm": 0.6447069644927979, + "learning_rate": 0.00041966790477829637, + "loss": 0.113, + "mean_token_accuracy": 0.9695079624652863, + "num_tokens": 4635378.0, + "step": 525 + }, + { + "entropy": 1.9232109785079956, + "epoch": 1.9013574660633483, + "grad_norm": 0.5114295482635498, + "learning_rate": 0.000419310076512661, + "loss": 0.1492, + "mean_token_accuracy": 0.9653338938951492, + "num_tokens": 4644769.0, + "step": 526 + }, + { + "entropy": 2.1691197752952576, + "epoch": 1.9049773755656108, + "grad_norm": 0.7630137205123901, + "learning_rate": 0.00041895172311763476, + "loss": 0.212, + "mean_token_accuracy": 0.9533941894769669, + "num_tokens": 4652857.0, + "step": 527 + }, + { + "entropy": 2.04753240942955, + "epoch": 1.9085972850678732, + "grad_norm": 0.6423042416572571, + "learning_rate": 0.00041859284595428955, + "loss": 0.1455, + "mean_token_accuracy": 0.956505224108696, + "num_tokens": 4661591.0, + "step": 528 + }, + { + "entropy": 1.9440338611602783, + "epoch": 1.9122171945701356, + "grad_norm": 0.5011327266693115, + "learning_rate": 0.00041823344638568656, + "loss": 0.1255, + "mean_token_accuracy": 0.965131089091301, + "num_tokens": 4670594.0, + "step": 529 + }, + { + "entropy": 2.0554805397987366, + "epoch": 1.915837104072398, + "grad_norm": 0.5821590423583984, + "learning_rate": 0.0004178735257768713, + "loss": 0.0486, + "mean_token_accuracy": 0.9875282496213913, + "num_tokens": 4679344.0, + "step": 530 + }, + { + "entropy": 2.130349576473236, + "epoch": 1.9194570135746605, + "grad_norm": 0.5332052111625671, + "learning_rate": 0.0004175130854948679, + "loss": 0.0915, + "mean_token_accuracy": 0.9737034440040588, + "num_tokens": 4687922.0, + "step": 531 + }, + { + "entropy": 2.146788775920868, + "epoch": 1.9230769230769231, + "grad_norm": 0.5016877055168152, + "learning_rate": 0.00041715212690867455, + "loss": 0.1281, + "mean_token_accuracy": 0.9681432545185089, + "num_tokens": 4696593.0, + "step": 532 + }, + { + "entropy": 2.041268438100815, + "epoch": 1.9266968325791856, + "grad_norm": 0.5257729887962341, + "learning_rate": 0.00041679065138925807, + "loss": 0.1272, + "mean_token_accuracy": 0.9649266451597214, + "num_tokens": 4705792.0, + "step": 533 + }, + { + "entropy": 2.114819645881653, + "epoch": 1.930316742081448, + "grad_norm": 0.7085135579109192, + "learning_rate": 0.0004164286603095484, + "loss": 0.1545, + "mean_token_accuracy": 0.9581228941679001, + "num_tokens": 4714599.0, + "step": 534 + }, + { + "entropy": 2.022280514240265, + "epoch": 1.9339366515837104, + "grad_norm": 0.5309014320373535, + "learning_rate": 0.00041606615504443387, + "loss": 0.1933, + "mean_token_accuracy": 0.9562340676784515, + "num_tokens": 4724062.0, + "step": 535 + }, + { + "entropy": 2.0959260165691376, + "epoch": 1.9375565610859729, + "grad_norm": 0.6528061628341675, + "learning_rate": 0.0004157031369707557, + "loss": 0.1306, + "mean_token_accuracy": 0.9612343460321426, + "num_tokens": 4733077.0, + "step": 536 + }, + { + "entropy": 2.2772948145866394, + "epoch": 1.9411764705882353, + "grad_norm": 0.7351471185684204, + "learning_rate": 0.0004153396074673028, + "loss": 0.1494, + "mean_token_accuracy": 0.9608108699321747, + "num_tokens": 4741201.0, + "step": 537 + }, + { + "entropy": 2.0935052037239075, + "epoch": 1.9447963800904977, + "grad_norm": 0.5435840487480164, + "learning_rate": 0.0004149755679148065, + "loss": 0.0884, + "mean_token_accuracy": 0.9745689779520035, + "num_tokens": 4750306.0, + "step": 538 + }, + { + "entropy": 2.2082818746566772, + "epoch": 1.9484162895927601, + "grad_norm": 0.3780331611633301, + "learning_rate": 0.00041461101969593537, + "loss": 0.0739, + "mean_token_accuracy": 0.9777179658412933, + "num_tokens": 4758954.0, + "step": 539 + }, + { + "entropy": 2.1683040261268616, + "epoch": 1.9520361990950226, + "grad_norm": 0.4637961685657501, + "learning_rate": 0.00041424596419529017, + "loss": 0.0632, + "mean_token_accuracy": 0.9834533184766769, + "num_tokens": 4767615.0, + "step": 540 + }, + { + "entropy": 2.075555235147476, + "epoch": 1.9556561085972852, + "grad_norm": 0.7603118419647217, + "learning_rate": 0.00041388040279939804, + "loss": 0.2835, + "mean_token_accuracy": 0.9364205300807953, + "num_tokens": 4776714.0, + "step": 541 + }, + { + "entropy": 2.18926739692688, + "epoch": 1.9592760180995477, + "grad_norm": 0.8895708918571472, + "learning_rate": 0.0004135143368967079, + "loss": 0.2514, + "mean_token_accuracy": 0.9361050724983215, + "num_tokens": 4785402.0, + "step": 542 + }, + { + "entropy": 2.2387169003486633, + "epoch": 1.96289592760181, + "grad_norm": 0.6013544797897339, + "learning_rate": 0.00041314776787758454, + "loss": 0.1502, + "mean_token_accuracy": 0.9594238847494125, + "num_tokens": 4793928.0, + "step": 543 + }, + { + "entropy": 2.208383619785309, + "epoch": 1.9665158371040725, + "grad_norm": 0.6934756636619568, + "learning_rate": 0.00041278069713430386, + "loss": 0.1777, + "mean_token_accuracy": 0.9619583487510681, + "num_tokens": 4802612.0, + "step": 544 + }, + { + "entropy": 2.2621757984161377, + "epoch": 1.970135746606335, + "grad_norm": 0.6920077800750732, + "learning_rate": 0.00041241312606104743, + "loss": 0.1689, + "mean_token_accuracy": 0.9594835937023163, + "num_tokens": 4811332.0, + "step": 545 + }, + { + "entropy": 2.2654454112052917, + "epoch": 1.9737556561085974, + "grad_norm": 0.6259592771530151, + "learning_rate": 0.000412045056053897, + "loss": 0.142, + "mean_token_accuracy": 0.9648078680038452, + "num_tokens": 4820441.0, + "step": 546 + }, + { + "entropy": 2.218056857585907, + "epoch": 1.9773755656108598, + "grad_norm": 0.5390617847442627, + "learning_rate": 0.0004116764885108292, + "loss": 0.1737, + "mean_token_accuracy": 0.9595656991004944, + "num_tokens": 4829437.0, + "step": 547 + }, + { + "entropy": 2.2571592330932617, + "epoch": 1.9809954751131222, + "grad_norm": 0.3656528890132904, + "learning_rate": 0.0004113074248317108, + "loss": 0.0545, + "mean_token_accuracy": 0.9825418293476105, + "num_tokens": 4838118.0, + "step": 548 + }, + { + "entropy": 2.1890549659729004, + "epoch": 1.9846153846153847, + "grad_norm": 0.5716155767440796, + "learning_rate": 0.00041093786641829247, + "loss": 0.0997, + "mean_token_accuracy": 0.9715700745582581, + "num_tokens": 4847073.0, + "step": 549 + }, + { + "entropy": 2.2726192474365234, + "epoch": 1.988235294117647, + "grad_norm": 0.4709530770778656, + "learning_rate": 0.0004105678146742042, + "loss": 0.0746, + "mean_token_accuracy": 0.9799739569425583, + "num_tokens": 4855755.0, + "step": 550 + }, + { + "entropy": 2.2328362464904785, + "epoch": 1.9918552036199095, + "grad_norm": 0.6773779392242432, + "learning_rate": 0.0004101972710049498, + "loss": 0.1418, + "mean_token_accuracy": 0.9629421681165695, + "num_tokens": 4864601.0, + "step": 551 + }, + { + "entropy": 2.199812740087509, + "epoch": 1.995475113122172, + "grad_norm": 0.717012882232666, + "learning_rate": 0.00040982623681790113, + "loss": 0.2948, + "mean_token_accuracy": 0.9432803690433502, + "num_tokens": 4873630.0, + "step": 552 + }, + { + "entropy": 2.2102787494659424, + "epoch": 1.9990950226244344, + "grad_norm": 0.6925314664840698, + "learning_rate": 0.00040945471352229346, + "loss": 0.2579, + "mean_token_accuracy": 0.9435124397277832, + "num_tokens": 4882714.0, + "step": 553 + }, + { + "entropy": 2.3318979740142822, + "epoch": 2.0, + "grad_norm": 2.688188314437866, + "learning_rate": 0.0004090827025292197, + "loss": 0.0283, + "mean_token_accuracy": 0.9918032884597778, + "num_tokens": 4883450.0, + "step": 554 + }, + { + "epoch": 2.0, + "eval_entropy": 2.2165925522160723, + "eval_loss": 0.16817161440849304, + "eval_mean_token_accuracy": 0.9567220133494555, + "eval_num_tokens": 4883450.0, + "eval_runtime": 116.1556, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 1.059, + "step": 554 + }, + { + "entropy": 2.0389976799488068, + "epoch": 2.0036199095022624, + "grad_norm": 0.8596204519271851, + "learning_rate": 0.00040871020525162484, + "loss": 0.1341, + "mean_token_accuracy": 0.9626202881336212, + "num_tokens": 4893236.0, + "step": 555 + }, + { + "entropy": 2.245832860469818, + "epoch": 2.007239819004525, + "grad_norm": 0.39707237482070923, + "learning_rate": 0.00040833722310430114, + "loss": 0.0564, + "mean_token_accuracy": 0.9868980199098587, + "num_tokens": 4901819.0, + "step": 556 + }, + { + "entropy": 2.169717162847519, + "epoch": 2.0108597285067873, + "grad_norm": 0.46584129333496094, + "learning_rate": 0.0004079637575038822, + "loss": 0.0792, + "mean_token_accuracy": 0.9758767485618591, + "num_tokens": 4910892.0, + "step": 557 + }, + { + "entropy": 2.27083820104599, + "epoch": 2.0144796380090497, + "grad_norm": 0.8394352197647095, + "learning_rate": 0.0004075898098688381, + "loss": 0.0962, + "mean_token_accuracy": 0.9723308384418488, + "num_tokens": 4919510.0, + "step": 558 + }, + { + "entropy": 2.1067663431167603, + "epoch": 2.018099547511312, + "grad_norm": 0.4951268434524536, + "learning_rate": 0.0004072153816194696, + "loss": 0.1195, + "mean_token_accuracy": 0.9703402817249298, + "num_tokens": 4928439.0, + "step": 559 + }, + { + "entropy": 2.016420066356659, + "epoch": 2.0217194570135746, + "grad_norm": 0.5574740171432495, + "learning_rate": 0.00040684047417790273, + "loss": 0.1037, + "mean_token_accuracy": 0.9727325141429901, + "num_tokens": 4938061.0, + "step": 560 + }, + { + "entropy": 2.1843727231025696, + "epoch": 2.025339366515837, + "grad_norm": 0.786014199256897, + "learning_rate": 0.00040646508896808394, + "loss": 0.155, + "mean_token_accuracy": 0.9608975350856781, + "num_tokens": 4946619.0, + "step": 561 + }, + { + "entropy": 2.160427451133728, + "epoch": 2.0289592760180994, + "grad_norm": 0.5267161130905151, + "learning_rate": 0.000406089227415774, + "loss": 0.0632, + "mean_token_accuracy": 0.9791042655706406, + "num_tokens": 4955324.0, + "step": 562 + }, + { + "entropy": 2.0923200249671936, + "epoch": 2.032579185520362, + "grad_norm": 0.8306187987327576, + "learning_rate": 0.00040571289094854304, + "loss": 0.1976, + "mean_token_accuracy": 0.9538775235414505, + "num_tokens": 4964321.0, + "step": 563 + }, + { + "entropy": 2.0181354880332947, + "epoch": 2.0361990950226243, + "grad_norm": 0.6798867583274841, + "learning_rate": 0.0004053360809957649, + "loss": 0.1797, + "mean_token_accuracy": 0.9569422006607056, + "num_tokens": 4973937.0, + "step": 564 + }, + { + "entropy": 2.123030036687851, + "epoch": 2.0398190045248867, + "grad_norm": 0.4481683671474457, + "learning_rate": 0.00040495879898861173, + "loss": 0.0639, + "mean_token_accuracy": 0.9827965050935745, + "num_tokens": 4982779.0, + "step": 565 + }, + { + "entropy": 2.0797010362148285, + "epoch": 2.043438914027149, + "grad_norm": 0.7745859622955322, + "learning_rate": 0.00040458104636004877, + "loss": 0.1602, + "mean_token_accuracy": 0.9600242227315903, + "num_tokens": 4991793.0, + "step": 566 + }, + { + "entropy": 2.0320390164852142, + "epoch": 2.0470588235294116, + "grad_norm": 0.5792120695114136, + "learning_rate": 0.0004042028245448286, + "loss": 0.0816, + "mean_token_accuracy": 0.9757721722126007, + "num_tokens": 5000834.0, + "step": 567 + }, + { + "entropy": 2.1047743558883667, + "epoch": 2.050678733031674, + "grad_norm": 0.5770072937011719, + "learning_rate": 0.0004038241349794858, + "loss": 0.1367, + "mean_token_accuracy": 0.9598450362682343, + "num_tokens": 5010155.0, + "step": 568 + }, + { + "entropy": 2.022550255060196, + "epoch": 2.0542986425339365, + "grad_norm": 0.47085902094841003, + "learning_rate": 0.0004034449791023319, + "loss": 0.1005, + "mean_token_accuracy": 0.970214769244194, + "num_tokens": 5020010.0, + "step": 569 + }, + { + "entropy": 2.034317582845688, + "epoch": 2.057918552036199, + "grad_norm": 0.4816018044948578, + "learning_rate": 0.0004030653583534489, + "loss": 0.118, + "mean_token_accuracy": 0.9635649025440216, + "num_tokens": 5029205.0, + "step": 570 + }, + { + "entropy": 2.1142700910568237, + "epoch": 2.0615384615384613, + "grad_norm": 0.561765730381012, + "learning_rate": 0.0004026852741746849, + "loss": 0.0628, + "mean_token_accuracy": 0.9811093211174011, + "num_tokens": 5037830.0, + "step": 571 + }, + { + "entropy": 2.1506906747817993, + "epoch": 2.065158371040724, + "grad_norm": 0.9037840366363525, + "learning_rate": 0.0004023047280096482, + "loss": 0.1395, + "mean_token_accuracy": 0.9645196944475174, + "num_tokens": 5046618.0, + "step": 572 + }, + { + "entropy": 2.1811060309410095, + "epoch": 2.0687782805429866, + "grad_norm": 0.6224188208580017, + "learning_rate": 0.0004019237213037014, + "loss": 0.0766, + "mean_token_accuracy": 0.9752616137266159, + "num_tokens": 5055467.0, + "step": 573 + }, + { + "entropy": 2.0479070246219635, + "epoch": 2.072398190045249, + "grad_norm": 0.5052458643913269, + "learning_rate": 0.00040154225550395665, + "loss": 0.091, + "mean_token_accuracy": 0.9753529280424118, + "num_tokens": 5064518.0, + "step": 574 + }, + { + "entropy": 2.18623149394989, + "epoch": 2.0760180995475115, + "grad_norm": 0.49587905406951904, + "learning_rate": 0.00040116033205926964, + "loss": 0.0703, + "mean_token_accuracy": 0.979348823428154, + "num_tokens": 5072713.0, + "step": 575 + }, + { + "entropy": 2.131018817424774, + "epoch": 2.079638009049774, + "grad_norm": 0.607468843460083, + "learning_rate": 0.0004007779524202343, + "loss": 0.0988, + "mean_token_accuracy": 0.9756181836128235, + "num_tokens": 5081046.0, + "step": 576 + }, + { + "entropy": 2.0251292288303375, + "epoch": 2.0832579185520363, + "grad_norm": 0.867511510848999, + "learning_rate": 0.00040039511803917723, + "loss": 0.1672, + "mean_token_accuracy": 0.9638413190841675, + "num_tokens": 5089859.0, + "step": 577 + }, + { + "entropy": 2.0818732380867004, + "epoch": 2.086877828054299, + "grad_norm": 0.5915331840515137, + "learning_rate": 0.0004000118303701521, + "loss": 0.1103, + "mean_token_accuracy": 0.9715124219655991, + "num_tokens": 5098331.0, + "step": 578 + }, + { + "entropy": 1.9556698501110077, + "epoch": 2.090497737556561, + "grad_norm": 0.5216535329818726, + "learning_rate": 0.0003996280908689345, + "loss": 0.1481, + "mean_token_accuracy": 0.9601311087608337, + "num_tokens": 5107557.0, + "step": 579 + }, + { + "entropy": 2.015773117542267, + "epoch": 2.0941176470588236, + "grad_norm": 0.7138916254043579, + "learning_rate": 0.00039924390099301584, + "loss": 0.1173, + "mean_token_accuracy": 0.9670253992080688, + "num_tokens": 5116677.0, + "step": 580 + }, + { + "entropy": 2.0676984786987305, + "epoch": 2.097737556561086, + "grad_norm": 0.7776201963424683, + "learning_rate": 0.0003988592622015984, + "loss": 0.0668, + "mean_token_accuracy": 0.9766870141029358, + "num_tokens": 5125262.0, + "step": 581 + }, + { + "entropy": 2.0256679952144623, + "epoch": 2.1013574660633485, + "grad_norm": 0.5481430888175964, + "learning_rate": 0.00039847417595558903, + "loss": 0.0898, + "mean_token_accuracy": 0.9747780114412308, + "num_tokens": 5133848.0, + "step": 582 + }, + { + "entropy": 2.049301326274872, + "epoch": 2.104977375565611, + "grad_norm": 0.6634963154792786, + "learning_rate": 0.00039808864371759464, + "loss": 0.1012, + "mean_token_accuracy": 0.9695883542299271, + "num_tokens": 5142266.0, + "step": 583 + }, + { + "entropy": 1.8873322904109955, + "epoch": 2.1085972850678734, + "grad_norm": 0.6262965798377991, + "learning_rate": 0.0003977026669519156, + "loss": 0.1064, + "mean_token_accuracy": 0.9686857610940933, + "num_tokens": 5151297.0, + "step": 584 + }, + { + "entropy": 2.0208800733089447, + "epoch": 2.112217194570136, + "grad_norm": 0.6475429534912109, + "learning_rate": 0.0003973162471245411, + "loss": 0.126, + "mean_token_accuracy": 0.9671273976564407, + "num_tokens": 5159913.0, + "step": 585 + }, + { + "entropy": 2.0354510843753815, + "epoch": 2.1158371040723982, + "grad_norm": 0.6373077034950256, + "learning_rate": 0.0003969293857031426, + "loss": 0.1403, + "mean_token_accuracy": 0.9615094214677811, + "num_tokens": 5168392.0, + "step": 586 + }, + { + "entropy": 2.0489701330661774, + "epoch": 2.1194570135746607, + "grad_norm": 0.7459731698036194, + "learning_rate": 0.0003965420841570693, + "loss": 0.0847, + "mean_token_accuracy": 0.9742033332586288, + "num_tokens": 5176858.0, + "step": 587 + }, + { + "entropy": 2.0531455874443054, + "epoch": 2.123076923076923, + "grad_norm": 0.8357418179512024, + "learning_rate": 0.00039615434395734174, + "loss": 0.2558, + "mean_token_accuracy": 0.9348864704370499, + "num_tokens": 5185101.0, + "step": 588 + }, + { + "entropy": 1.9761857986450195, + "epoch": 2.1266968325791855, + "grad_norm": 0.4816463887691498, + "learning_rate": 0.00039576616657664666, + "loss": 0.0934, + "mean_token_accuracy": 0.9781179577112198, + "num_tokens": 5193987.0, + "step": 589 + }, + { + "entropy": 2.0150316655635834, + "epoch": 2.130316742081448, + "grad_norm": 0.7039950489997864, + "learning_rate": 0.0003953775534893311, + "loss": 0.1558, + "mean_token_accuracy": 0.9602096229791641, + "num_tokens": 5202598.0, + "step": 590 + }, + { + "entropy": 2.0542426705360413, + "epoch": 2.1339366515837104, + "grad_norm": 0.6318346858024597, + "learning_rate": 0.00039498850617139737, + "loss": 0.1277, + "mean_token_accuracy": 0.9658758789300919, + "num_tokens": 5211157.0, + "step": 591 + }, + { + "entropy": 2.0793416798114777, + "epoch": 2.137556561085973, + "grad_norm": 0.6513328552246094, + "learning_rate": 0.0003945990261004964, + "loss": 0.3452, + "mean_token_accuracy": 0.9376382231712341, + "num_tokens": 5220057.0, + "step": 592 + }, + { + "entropy": 1.834738850593567, + "epoch": 2.1411764705882352, + "grad_norm": 0.709550678730011, + "learning_rate": 0.0003942091147559234, + "loss": 0.1632, + "mean_token_accuracy": 0.9588025957345963, + "num_tokens": 5229649.0, + "step": 593 + }, + { + "entropy": 2.115740954875946, + "epoch": 2.1447963800904977, + "grad_norm": 0.6495632529258728, + "learning_rate": 0.00039381877361861127, + "loss": 0.0799, + "mean_token_accuracy": 0.9793208837509155, + "num_tokens": 5238060.0, + "step": 594 + }, + { + "entropy": 1.9325994551181793, + "epoch": 2.14841628959276, + "grad_norm": 0.3864371180534363, + "learning_rate": 0.0003934280041711253, + "loss": 0.0392, + "mean_token_accuracy": 0.9867032468318939, + "num_tokens": 5246715.0, + "step": 595 + }, + { + "entropy": 1.9573578834533691, + "epoch": 2.1520361990950225, + "grad_norm": 0.8978553414344788, + "learning_rate": 0.0003930368078976578, + "loss": 0.1043, + "mean_token_accuracy": 0.9700421690940857, + "num_tokens": 5255677.0, + "step": 596 + }, + { + "entropy": 2.017194092273712, + "epoch": 2.155656108597285, + "grad_norm": 0.8082290887832642, + "learning_rate": 0.0003926451862840221, + "loss": 0.193, + "mean_token_accuracy": 0.9494165182113647, + "num_tokens": 5264229.0, + "step": 597 + }, + { + "entropy": 1.8982190787792206, + "epoch": 2.1592760180995474, + "grad_norm": 0.7600063681602478, + "learning_rate": 0.00039225314081764673, + "loss": 0.2152, + "mean_token_accuracy": 0.9523166120052338, + "num_tokens": 5273397.0, + "step": 598 + }, + { + "entropy": 1.9896901845932007, + "epoch": 2.16289592760181, + "grad_norm": 0.45877528190612793, + "learning_rate": 0.0003918606729875706, + "loss": 0.0892, + "mean_token_accuracy": 0.9720247238874435, + "num_tokens": 5282376.0, + "step": 599 + }, + { + "entropy": 1.8235589861869812, + "epoch": 2.1665158371040723, + "grad_norm": 0.49329352378845215, + "learning_rate": 0.0003914677842844365, + "loss": 0.0803, + "mean_token_accuracy": 0.9721037000417709, + "num_tokens": 5291815.0, + "step": 600 + }, + { + "entropy": 1.9400377571582794, + "epoch": 2.1701357466063347, + "grad_norm": 0.5306346416473389, + "learning_rate": 0.0003910744762004857, + "loss": 0.0602, + "mean_token_accuracy": 0.9762802571058273, + "num_tokens": 5300394.0, + "step": 601 + }, + { + "entropy": 1.7808023691177368, + "epoch": 2.173755656108597, + "grad_norm": 0.5050559043884277, + "learning_rate": 0.00039068075022955255, + "loss": 0.0862, + "mean_token_accuracy": 0.9724314510822296, + "num_tokens": 5309685.0, + "step": 602 + }, + { + "entropy": 1.9939678311347961, + "epoch": 2.1773755656108595, + "grad_norm": 0.6879346966743469, + "learning_rate": 0.0003902866078670584, + "loss": 0.0936, + "mean_token_accuracy": 0.9765703976154327, + "num_tokens": 5318020.0, + "step": 603 + }, + { + "entropy": 1.9384137690067291, + "epoch": 2.180995475113122, + "grad_norm": 0.6881359219551086, + "learning_rate": 0.0003898920506100061, + "loss": 0.1303, + "mean_token_accuracy": 0.9615567773580551, + "num_tokens": 5326895.0, + "step": 604 + }, + { + "entropy": 1.9919665455818176, + "epoch": 2.184615384615385, + "grad_norm": 0.6181508302688599, + "learning_rate": 0.00038949707995697446, + "loss": 0.0745, + "mean_token_accuracy": 0.9808734804391861, + "num_tokens": 5335355.0, + "step": 605 + }, + { + "entropy": 1.9376583397388458, + "epoch": 2.1882352941176473, + "grad_norm": 0.46525871753692627, + "learning_rate": 0.0003891016974081125, + "loss": 0.0826, + "mean_token_accuracy": 0.9753947854042053, + "num_tokens": 5343879.0, + "step": 606 + }, + { + "entropy": 1.8252979516983032, + "epoch": 2.1918552036199097, + "grad_norm": 0.5332593321800232, + "learning_rate": 0.00038870590446513325, + "loss": 0.1218, + "mean_token_accuracy": 0.9644111543893814, + "num_tokens": 5352980.0, + "step": 607 + }, + { + "entropy": 1.8981524407863617, + "epoch": 2.195475113122172, + "grad_norm": 0.5849556922912598, + "learning_rate": 0.0003883097026313089, + "loss": 0.0854, + "mean_token_accuracy": 0.9766328930854797, + "num_tokens": 5361576.0, + "step": 608 + }, + { + "entropy": 1.9466857016086578, + "epoch": 2.1990950226244346, + "grad_norm": 1.0213185548782349, + "learning_rate": 0.00038791309341146453, + "loss": 0.1282, + "mean_token_accuracy": 0.975858062505722, + "num_tokens": 5369947.0, + "step": 609 + }, + { + "entropy": 1.9219308197498322, + "epoch": 2.202714932126697, + "grad_norm": 0.7259594798088074, + "learning_rate": 0.00038751607831197243, + "loss": 0.0986, + "mean_token_accuracy": 0.9709735363721848, + "num_tokens": 5378429.0, + "step": 610 + }, + { + "entropy": 1.934881567955017, + "epoch": 2.2063348416289594, + "grad_norm": 0.6190217137336731, + "learning_rate": 0.0003871186588407467, + "loss": 0.1259, + "mean_token_accuracy": 0.9606761038303375, + "num_tokens": 5386986.0, + "step": 611 + }, + { + "entropy": 1.9234256446361542, + "epoch": 2.209954751131222, + "grad_norm": 1.1731759309768677, + "learning_rate": 0.00038672083650723697, + "loss": 0.3705, + "mean_token_accuracy": 0.9448409974575043, + "num_tokens": 5395623.0, + "step": 612 + }, + { + "entropy": 1.9198957085609436, + "epoch": 2.2135746606334843, + "grad_norm": 0.38831791281700134, + "learning_rate": 0.00038632261282242316, + "loss": 0.0405, + "mean_token_accuracy": 0.9884084165096283, + "num_tokens": 5403964.0, + "step": 613 + }, + { + "entropy": 1.9401849210262299, + "epoch": 2.2171945701357467, + "grad_norm": 0.6391944885253906, + "learning_rate": 0.0003859239892988097, + "loss": 0.0803, + "mean_token_accuracy": 0.9763080179691315, + "num_tokens": 5412601.0, + "step": 614 + }, + { + "entropy": 1.906328171491623, + "epoch": 2.220814479638009, + "grad_norm": 0.5495765805244446, + "learning_rate": 0.00038552496745041935, + "loss": 0.0919, + "mean_token_accuracy": 0.9796502739191055, + "num_tokens": 5421112.0, + "step": 615 + }, + { + "entropy": 1.9130763709545135, + "epoch": 2.2244343891402716, + "grad_norm": 0.8233397006988525, + "learning_rate": 0.0003851255487927883, + "loss": 0.1246, + "mean_token_accuracy": 0.9621723592281342, + "num_tokens": 5429851.0, + "step": 616 + }, + { + "entropy": 1.8408336341381073, + "epoch": 2.228054298642534, + "grad_norm": 0.8857082724571228, + "learning_rate": 0.00038472573484295904, + "loss": 0.1061, + "mean_token_accuracy": 0.9664444029331207, + "num_tokens": 5438983.0, + "step": 617 + }, + { + "entropy": 1.8644142150878906, + "epoch": 2.2316742081447964, + "grad_norm": 0.6762974262237549, + "learning_rate": 0.0003843255271194762, + "loss": 0.1532, + "mean_token_accuracy": 0.952915757894516, + "num_tokens": 5447922.0, + "step": 618 + }, + { + "entropy": 1.7125722169876099, + "epoch": 2.235294117647059, + "grad_norm": 0.44111478328704834, + "learning_rate": 0.00038392492714237975, + "loss": 0.0819, + "mean_token_accuracy": 0.9738304615020752, + "num_tokens": 5457128.0, + "step": 619 + }, + { + "entropy": 1.7900195717811584, + "epoch": 2.2389140271493213, + "grad_norm": 0.5224407911300659, + "learning_rate": 0.0003835239364331993, + "loss": 0.1023, + "mean_token_accuracy": 0.975239485502243, + "num_tokens": 5465760.0, + "step": 620 + }, + { + "entropy": 1.715638667345047, + "epoch": 2.2425339366515837, + "grad_norm": 0.6327251195907593, + "learning_rate": 0.00038312255651494866, + "loss": 0.154, + "mean_token_accuracy": 0.9579339027404785, + "num_tokens": 5475190.0, + "step": 621 + }, + { + "entropy": 1.8499042093753815, + "epoch": 2.246153846153846, + "grad_norm": 0.6490166187286377, + "learning_rate": 0.00038272078891212017, + "loss": 0.1248, + "mean_token_accuracy": 0.9679877310991287, + "num_tokens": 5484011.0, + "step": 622 + }, + { + "entropy": 1.7533331513404846, + "epoch": 2.2497737556561086, + "grad_norm": 0.6320033073425293, + "learning_rate": 0.000382318635150678, + "loss": 0.1588, + "mean_token_accuracy": 0.9576389044523239, + "num_tokens": 5493123.0, + "step": 623 + }, + { + "entropy": 1.8554400503635406, + "epoch": 2.253393665158371, + "grad_norm": 0.7169481515884399, + "learning_rate": 0.0003819160967580536, + "loss": 0.1316, + "mean_token_accuracy": 0.966967299580574, + "num_tokens": 5501923.0, + "step": 624 + }, + { + "entropy": 1.9283805191516876, + "epoch": 2.2570135746606335, + "grad_norm": 0.599856436252594, + "learning_rate": 0.00038151317526313917, + "loss": 0.1326, + "mean_token_accuracy": 0.961080014705658, + "num_tokens": 5510356.0, + "step": 625 + }, + { + "entropy": 1.7921342253684998, + "epoch": 2.260633484162896, + "grad_norm": 0.7019768357276917, + "learning_rate": 0.0003811098721962818, + "loss": 0.0976, + "mean_token_accuracy": 0.970125287771225, + "num_tokens": 5519016.0, + "step": 626 + }, + { + "entropy": 1.7646876573562622, + "epoch": 2.2642533936651583, + "grad_norm": 0.7311795949935913, + "learning_rate": 0.00038070618908927784, + "loss": 0.0908, + "mean_token_accuracy": 0.9719386845827103, + "num_tokens": 5528139.0, + "step": 627 + }, + { + "entropy": 1.8233769237995148, + "epoch": 2.2678733031674208, + "grad_norm": 0.6742154955863953, + "learning_rate": 0.0003803021274753674, + "loss": 0.1348, + "mean_token_accuracy": 0.9619691967964172, + "num_tokens": 5537036.0, + "step": 628 + }, + { + "entropy": 1.7711736857891083, + "epoch": 2.271493212669683, + "grad_norm": 0.6000869274139404, + "learning_rate": 0.00037989768888922775, + "loss": 0.1086, + "mean_token_accuracy": 0.9672373533248901, + "num_tokens": 5545932.0, + "step": 629 + }, + { + "entropy": 1.8396382629871368, + "epoch": 2.2751131221719456, + "grad_norm": 0.541504979133606, + "learning_rate": 0.0003794928748669683, + "loss": 0.0775, + "mean_token_accuracy": 0.977355495095253, + "num_tokens": 5554403.0, + "step": 630 + }, + { + "entropy": 1.890054315328598, + "epoch": 2.278733031674208, + "grad_norm": 0.5629594326019287, + "learning_rate": 0.00037908768694612434, + "loss": 0.0711, + "mean_token_accuracy": 0.9779117107391357, + "num_tokens": 5563156.0, + "step": 631 + }, + { + "entropy": 1.9505741894245148, + "epoch": 2.2823529411764705, + "grad_norm": 0.6717761754989624, + "learning_rate": 0.0003786821266656512, + "loss": 0.1077, + "mean_token_accuracy": 0.9674138873815536, + "num_tokens": 5571618.0, + "step": 632 + }, + { + "entropy": 1.8377742171287537, + "epoch": 2.285972850678733, + "grad_norm": 0.6176472902297974, + "learning_rate": 0.0003782761955659185, + "loss": 0.1106, + "mean_token_accuracy": 0.9669957906007767, + "num_tokens": 5580668.0, + "step": 633 + }, + { + "entropy": 1.8336479365825653, + "epoch": 2.2895927601809953, + "grad_norm": 0.5120813846588135, + "learning_rate": 0.0003778698951887042, + "loss": 0.0732, + "mean_token_accuracy": 0.9774532318115234, + "num_tokens": 5589491.0, + "step": 634 + }, + { + "entropy": 1.9576656222343445, + "epoch": 2.2932126696832578, + "grad_norm": 0.9347079396247864, + "learning_rate": 0.00037746322707718895, + "loss": 0.2275, + "mean_token_accuracy": 0.9512088149785995, + "num_tokens": 5598327.0, + "step": 635 + }, + { + "entropy": 1.9309991896152496, + "epoch": 2.29683257918552, + "grad_norm": 0.506108283996582, + "learning_rate": 0.0003770561927759502, + "loss": 0.1046, + "mean_token_accuracy": 0.9633967131376266, + "num_tokens": 5606948.0, + "step": 636 + }, + { + "entropy": 1.963425725698471, + "epoch": 2.3004524886877826, + "grad_norm": 0.5499919056892395, + "learning_rate": 0.0003766487938309561, + "loss": 0.0804, + "mean_token_accuracy": 0.9783825874328613, + "num_tokens": 5615342.0, + "step": 637 + }, + { + "entropy": 1.8853708505630493, + "epoch": 2.304072398190045, + "grad_norm": 0.5846657156944275, + "learning_rate": 0.00037624103178955946, + "loss": 0.0904, + "mean_token_accuracy": 0.9774703830480576, + "num_tokens": 5624449.0, + "step": 638 + }, + { + "entropy": 1.928403079509735, + "epoch": 2.3076923076923075, + "grad_norm": 0.5203971266746521, + "learning_rate": 0.0003758329082004928, + "loss": 0.0917, + "mean_token_accuracy": 0.9723261743783951, + "num_tokens": 5633273.0, + "step": 639 + }, + { + "entropy": 1.8914157152175903, + "epoch": 2.31131221719457, + "grad_norm": 0.5215239524841309, + "learning_rate": 0.00037542442461386145, + "loss": 0.1072, + "mean_token_accuracy": 0.9704900681972504, + "num_tokens": 5642357.0, + "step": 640 + }, + { + "entropy": 1.9754666090011597, + "epoch": 2.3149321266968323, + "grad_norm": 0.6710624694824219, + "learning_rate": 0.0003750155825811379, + "loss": 0.1344, + "mean_token_accuracy": 0.9615458548069, + "num_tokens": 5651409.0, + "step": 641 + }, + { + "entropy": 1.97001314163208, + "epoch": 2.318552036199095, + "grad_norm": 0.6511638164520264, + "learning_rate": 0.00037460638365515673, + "loss": 0.0502, + "mean_token_accuracy": 0.9829420000314713, + "num_tokens": 5660362.0, + "step": 642 + }, + { + "entropy": 1.9473612904548645, + "epoch": 2.3221719457013577, + "grad_norm": 0.5315663814544678, + "learning_rate": 0.00037419682939010725, + "loss": 0.1004, + "mean_token_accuracy": 0.9741797298192978, + "num_tokens": 5669386.0, + "step": 643 + }, + { + "entropy": 1.9136508405208588, + "epoch": 2.32579185520362, + "grad_norm": 0.6636398434638977, + "learning_rate": 0.00037378692134152887, + "loss": 0.0928, + "mean_token_accuracy": 0.9753085225820541, + "num_tokens": 5678226.0, + "step": 644 + }, + { + "entropy": 2.0870893597602844, + "epoch": 2.3294117647058825, + "grad_norm": 0.45003074407577515, + "learning_rate": 0.00037337666106630464, + "loss": 0.0937, + "mean_token_accuracy": 0.9742898046970367, + "num_tokens": 5687017.0, + "step": 645 + }, + { + "entropy": 2.084017276763916, + "epoch": 2.333031674208145, + "grad_norm": 0.6305840611457825, + "learning_rate": 0.0003729660501226553, + "loss": 0.1085, + "mean_token_accuracy": 0.9696957617998123, + "num_tokens": 5695585.0, + "step": 646 + }, + { + "entropy": 2.0916273295879364, + "epoch": 2.3366515837104074, + "grad_norm": 0.6674802303314209, + "learning_rate": 0.00037255509007013353, + "loss": 0.1214, + "mean_token_accuracy": 0.9657080322504044, + "num_tokens": 5704167.0, + "step": 647 + }, + { + "entropy": 2.0445155799388885, + "epoch": 2.34027149321267, + "grad_norm": 0.9245135188102722, + "learning_rate": 0.0003721437824696181, + "loss": 0.124, + "mean_token_accuracy": 0.9668982475996017, + "num_tokens": 5712896.0, + "step": 648 + }, + { + "entropy": 2.040050685405731, + "epoch": 2.3438914027149322, + "grad_norm": 0.558266818523407, + "learning_rate": 0.00037173212888330756, + "loss": 0.103, + "mean_token_accuracy": 0.9663692861795425, + "num_tokens": 5721568.0, + "step": 649 + }, + { + "entropy": 2.078313887119293, + "epoch": 2.3475113122171947, + "grad_norm": 0.6157237887382507, + "learning_rate": 0.0003713201308747148, + "loss": 0.1247, + "mean_token_accuracy": 0.9645204842090607, + "num_tokens": 5730097.0, + "step": 650 + }, + { + "entropy": 1.9473297894001007, + "epoch": 2.351131221719457, + "grad_norm": 0.6460309028625488, + "learning_rate": 0.0003709077900086607, + "loss": 0.193, + "mean_token_accuracy": 0.9537071883678436, + "num_tokens": 5738953.0, + "step": 651 + }, + { + "entropy": 1.9319245219230652, + "epoch": 2.3547511312217195, + "grad_norm": 0.826302170753479, + "learning_rate": 0.0003704951078512684, + "loss": 0.2072, + "mean_token_accuracy": 0.9553762674331665, + "num_tokens": 5748421.0, + "step": 652 + }, + { + "entropy": 2.000667005777359, + "epoch": 2.358371040723982, + "grad_norm": 0.508975625038147, + "learning_rate": 0.00037008208596995743, + "loss": 0.1124, + "mean_token_accuracy": 0.9674097448587418, + "num_tokens": 5757333.0, + "step": 653 + }, + { + "entropy": 1.9692010879516602, + "epoch": 2.3619909502262444, + "grad_norm": 0.597391664981842, + "learning_rate": 0.00036966872593343747, + "loss": 0.0958, + "mean_token_accuracy": 0.9727880656719208, + "num_tokens": 5766427.0, + "step": 654 + }, + { + "entropy": 1.9356706142425537, + "epoch": 2.365610859728507, + "grad_norm": 0.6264978051185608, + "learning_rate": 0.0003692550293117025, + "loss": 0.0925, + "mean_token_accuracy": 0.9736592024564743, + "num_tokens": 5775578.0, + "step": 655 + }, + { + "entropy": 2.086688846349716, + "epoch": 2.3692307692307693, + "grad_norm": 0.926537811756134, + "learning_rate": 0.00036884099767602523, + "loss": 0.1772, + "mean_token_accuracy": 0.9588586837053299, + "num_tokens": 5783754.0, + "step": 656 + }, + { + "entropy": 1.8272685706615448, + "epoch": 2.3728506787330317, + "grad_norm": 0.5276276469230652, + "learning_rate": 0.0003684266325989504, + "loss": 0.106, + "mean_token_accuracy": 0.9692760407924652, + "num_tokens": 5793159.0, + "step": 657 + }, + { + "entropy": 1.8490014672279358, + "epoch": 2.376470588235294, + "grad_norm": 0.6970511078834534, + "learning_rate": 0.0003680119356542895, + "loss": 0.0849, + "mean_token_accuracy": 0.9812656790018082, + "num_tokens": 5802503.0, + "step": 658 + }, + { + "entropy": 1.8577990531921387, + "epoch": 2.3800904977375565, + "grad_norm": 0.49535682797431946, + "learning_rate": 0.00036759690841711435, + "loss": 0.0965, + "mean_token_accuracy": 0.9723764955997467, + "num_tokens": 5811839.0, + "step": 659 + }, + { + "entropy": 1.785957396030426, + "epoch": 2.383710407239819, + "grad_norm": 0.7373266220092773, + "learning_rate": 0.00036718155246375124, + "loss": 0.103, + "mean_token_accuracy": 0.9659082442522049, + "num_tokens": 5821076.0, + "step": 660 + }, + { + "entropy": 1.8944315016269684, + "epoch": 2.3873303167420814, + "grad_norm": 0.4784161448478699, + "learning_rate": 0.000366765869371775, + "loss": 0.0899, + "mean_token_accuracy": 0.9731316566467285, + "num_tokens": 5830098.0, + "step": 661 + }, + { + "entropy": 1.8901143372058868, + "epoch": 2.390950226244344, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.00036634986072000305, + "loss": 0.078, + "mean_token_accuracy": 0.9769923985004425, + "num_tokens": 5839149.0, + "step": 662 + }, + { + "entropy": 1.8183043003082275, + "epoch": 2.3945701357466063, + "grad_norm": 0.48431649804115295, + "learning_rate": 0.0003659335280884893, + "loss": 0.0669, + "mean_token_accuracy": 0.978607714176178, + "num_tokens": 5848064.0, + "step": 663 + }, + { + "entropy": 1.7216700911521912, + "epoch": 2.3981900452488687, + "grad_norm": 0.5597919821739197, + "learning_rate": 0.00036551687305851803, + "loss": 0.1026, + "mean_token_accuracy": 0.9733614027500153, + "num_tokens": 5857075.0, + "step": 664 + }, + { + "entropy": 1.7788107991218567, + "epoch": 2.401809954751131, + "grad_norm": 0.6780642867088318, + "learning_rate": 0.00036509989721259824, + "loss": 0.0895, + "mean_token_accuracy": 0.9711848199367523, + "num_tokens": 5866029.0, + "step": 665 + }, + { + "entropy": 1.8354471325874329, + "epoch": 2.4054298642533936, + "grad_norm": 0.6284046769142151, + "learning_rate": 0.0003646826021344573, + "loss": 0.1153, + "mean_token_accuracy": 0.9645407199859619, + "num_tokens": 5874523.0, + "step": 666 + }, + { + "entropy": 1.829980492591858, + "epoch": 2.409049773755656, + "grad_norm": 0.6398605704307556, + "learning_rate": 0.00036426498940903506, + "loss": 0.0605, + "mean_token_accuracy": 0.9823256582021713, + "num_tokens": 5883067.0, + "step": 667 + }, + { + "entropy": 1.839373379945755, + "epoch": 2.4126696832579184, + "grad_norm": 0.6254173517227173, + "learning_rate": 0.000363847060622478, + "loss": 0.0708, + "mean_token_accuracy": 0.978134423494339, + "num_tokens": 5891921.0, + "step": 668 + }, + { + "entropy": 1.7790280282497406, + "epoch": 2.416289592760181, + "grad_norm": 0.5987306833267212, + "learning_rate": 0.0003634288173621326, + "loss": 0.0888, + "mean_token_accuracy": 0.9814571887254715, + "num_tokens": 5900603.0, + "step": 669 + }, + { + "entropy": 1.6918425559997559, + "epoch": 2.4199095022624433, + "grad_norm": 0.784694492816925, + "learning_rate": 0.00036301026121654057, + "loss": 0.1353, + "mean_token_accuracy": 0.9646909832954407, + "num_tokens": 5910028.0, + "step": 670 + }, + { + "entropy": 1.726965218782425, + "epoch": 2.4235294117647057, + "grad_norm": 0.7017857432365417, + "learning_rate": 0.00036259139377543104, + "loss": 0.1531, + "mean_token_accuracy": 0.9617924690246582, + "num_tokens": 5919145.0, + "step": 671 + }, + { + "entropy": 1.7354467511177063, + "epoch": 2.427149321266968, + "grad_norm": 0.49217918515205383, + "learning_rate": 0.00036217221662971613, + "loss": 0.1217, + "mean_token_accuracy": 0.96451136469841, + "num_tokens": 5928203.0, + "step": 672 + }, + { + "entropy": 1.827672392129898, + "epoch": 2.430769230769231, + "grad_norm": 0.5875037312507629, + "learning_rate": 0.0003617527313714841, + "loss": 0.1151, + "mean_token_accuracy": 0.9714375436306, + "num_tokens": 5936876.0, + "step": 673 + }, + { + "entropy": 1.787518948316574, + "epoch": 2.4343891402714934, + "grad_norm": 0.5444310307502747, + "learning_rate": 0.0003613329395939933, + "loss": 0.1096, + "mean_token_accuracy": 0.9701481461524963, + "num_tokens": 5946025.0, + "step": 674 + }, + { + "entropy": 1.832441657781601, + "epoch": 2.438009049773756, + "grad_norm": 0.6885861754417419, + "learning_rate": 0.00036091284289166637, + "loss": 0.1409, + "mean_token_accuracy": 0.9587968736886978, + "num_tokens": 5954406.0, + "step": 675 + }, + { + "entropy": 1.7488494515419006, + "epoch": 2.4416289592760183, + "grad_norm": 0.4765988290309906, + "learning_rate": 0.0003604924428600843, + "loss": 0.1183, + "mean_token_accuracy": 0.9581810384988785, + "num_tokens": 5963472.0, + "step": 676 + }, + { + "entropy": 1.885668009519577, + "epoch": 2.4452488687782807, + "grad_norm": 0.7310354113578796, + "learning_rate": 0.00036007174109597983, + "loss": 0.1248, + "mean_token_accuracy": 0.9588694721460342, + "num_tokens": 5971771.0, + "step": 677 + }, + { + "entropy": 1.8329627513885498, + "epoch": 2.448868778280543, + "grad_norm": 0.37075191736221313, + "learning_rate": 0.00035965073919723206, + "loss": 0.0694, + "mean_token_accuracy": 0.9812011271715164, + "num_tokens": 5980536.0, + "step": 678 + }, + { + "entropy": 1.8218618333339691, + "epoch": 2.4524886877828056, + "grad_norm": 0.5196499228477478, + "learning_rate": 0.0003592294387628597, + "loss": 0.0833, + "mean_token_accuracy": 0.9765996187925339, + "num_tokens": 5989462.0, + "step": 679 + }, + { + "entropy": 1.7702144086360931, + "epoch": 2.456108597285068, + "grad_norm": 0.68550044298172, + "learning_rate": 0.0003588078413930155, + "loss": 0.1395, + "mean_token_accuracy": 0.9701545089483261, + "num_tokens": 5998702.0, + "step": 680 + }, + { + "entropy": 1.729397028684616, + "epoch": 2.4597285067873305, + "grad_norm": 0.6107930541038513, + "learning_rate": 0.00035838594868898004, + "loss": 0.1009, + "mean_token_accuracy": 0.9712544083595276, + "num_tokens": 6007594.0, + "step": 681 + }, + { + "entropy": 1.6558150053024292, + "epoch": 2.463348416289593, + "grad_norm": 0.45058509707450867, + "learning_rate": 0.0003579637622531555, + "loss": 0.0747, + "mean_token_accuracy": 0.9791784882545471, + "num_tokens": 6016874.0, + "step": 682 + }, + { + "entropy": 1.7209869921207428, + "epoch": 2.4669683257918553, + "grad_norm": 0.6103800535202026, + "learning_rate": 0.0003575412836890599, + "loss": 0.1096, + "mean_token_accuracy": 0.9665796160697937, + "num_tokens": 6026056.0, + "step": 683 + }, + { + "entropy": 1.790249615907669, + "epoch": 2.4705882352941178, + "grad_norm": 0.67525315284729, + "learning_rate": 0.0003571185146013205, + "loss": 0.0811, + "mean_token_accuracy": 0.9776998162269592, + "num_tokens": 6034624.0, + "step": 684 + }, + { + "entropy": 1.735906183719635, + "epoch": 2.47420814479638, + "grad_norm": 0.884986162185669, + "learning_rate": 0.00035669545659566836, + "loss": 0.2324, + "mean_token_accuracy": 0.9448857754468918, + "num_tokens": 6043557.0, + "step": 685 + }, + { + "entropy": 1.673194944858551, + "epoch": 2.4778280542986426, + "grad_norm": 0.7441328763961792, + "learning_rate": 0.0003562721112789316, + "loss": 0.1661, + "mean_token_accuracy": 0.9566781520843506, + "num_tokens": 6052623.0, + "step": 686 + }, + { + "entropy": 1.736072987318039, + "epoch": 2.481447963800905, + "grad_norm": 0.5674424767494202, + "learning_rate": 0.00035584848025902973, + "loss": 0.0751, + "mean_token_accuracy": 0.9750215858221054, + "num_tokens": 6061347.0, + "step": 687 + }, + { + "entropy": 1.625234305858612, + "epoch": 2.4850678733031675, + "grad_norm": 0.6596720218658447, + "learning_rate": 0.00035542456514496725, + "loss": 0.0796, + "mean_token_accuracy": 0.9773041009902954, + "num_tokens": 6070396.0, + "step": 688 + }, + { + "entropy": 1.6548752784729004, + "epoch": 2.48868778280543, + "grad_norm": 0.5798892378807068, + "learning_rate": 0.00035500036754682794, + "loss": 0.1412, + "mean_token_accuracy": 0.9653023481369019, + "num_tokens": 6079757.0, + "step": 689 + }, + { + "entropy": 1.6213977932929993, + "epoch": 2.4923076923076923, + "grad_norm": 0.44931474328041077, + "learning_rate": 0.00035457588907576823, + "loss": 0.0724, + "mean_token_accuracy": 0.9800422787666321, + "num_tokens": 6088646.0, + "step": 690 + }, + { + "entropy": 1.6762541830539703, + "epoch": 2.4959276018099548, + "grad_norm": 0.6818104386329651, + "learning_rate": 0.0003541511313440114, + "loss": 0.1217, + "mean_token_accuracy": 0.9675028026103973, + "num_tokens": 6097441.0, + "step": 691 + }, + { + "entropy": 1.7241974771022797, + "epoch": 2.499547511312217, + "grad_norm": 0.4126259982585907, + "learning_rate": 0.00035372609596484166, + "loss": 0.0615, + "mean_token_accuracy": 0.9799284338951111, + "num_tokens": 6105578.0, + "step": 692 + }, + { + "entropy": 1.6379709541797638, + "epoch": 2.5031674208144796, + "grad_norm": 0.47291842103004456, + "learning_rate": 0.00035330078455259734, + "loss": 0.0858, + "mean_token_accuracy": 0.9744312763214111, + "num_tokens": 6114404.0, + "step": 693 + }, + { + "entropy": 1.6317658722400665, + "epoch": 2.506787330316742, + "grad_norm": 0.5747683048248291, + "learning_rate": 0.00035287519872266544, + "loss": 0.1344, + "mean_token_accuracy": 0.9632531553506851, + "num_tokens": 6123319.0, + "step": 694 + }, + { + "entropy": 1.6969698369503021, + "epoch": 2.5104072398190045, + "grad_norm": 0.5810018181800842, + "learning_rate": 0.00035244934009147523, + "loss": 0.0927, + "mean_token_accuracy": 0.9729650169610977, + "num_tokens": 6131814.0, + "step": 695 + }, + { + "entropy": 1.631262481212616, + "epoch": 2.514027149321267, + "grad_norm": 0.44387346506118774, + "learning_rate": 0.00035202321027649205, + "loss": 0.0657, + "mean_token_accuracy": 0.9802225232124329, + "num_tokens": 6140967.0, + "step": 696 + }, + { + "entropy": 1.610716551542282, + "epoch": 2.5176470588235293, + "grad_norm": 0.6546471118927002, + "learning_rate": 0.0003515968108962112, + "loss": 0.1114, + "mean_token_accuracy": 0.9671156108379364, + "num_tokens": 6149938.0, + "step": 697 + }, + { + "entropy": 1.598843276500702, + "epoch": 2.521266968325792, + "grad_norm": 0.541953444480896, + "learning_rate": 0.0003511701435701519, + "loss": 0.0504, + "mean_token_accuracy": 0.98616062104702, + "num_tokens": 6158686.0, + "step": 698 + }, + { + "entropy": 1.7793676853179932, + "epoch": 2.524886877828054, + "grad_norm": 0.6303162574768066, + "learning_rate": 0.00035074320991885106, + "loss": 0.0797, + "mean_token_accuracy": 0.9783169627189636, + "num_tokens": 6166835.0, + "step": 699 + }, + { + "entropy": 1.598317414522171, + "epoch": 2.5285067873303166, + "grad_norm": 0.4783090054988861, + "learning_rate": 0.000350316011563857, + "loss": 0.0693, + "mean_token_accuracy": 0.9740357846021652, + "num_tokens": 6175978.0, + "step": 700 + }, + { + "entropy": 1.6361595392227173, + "epoch": 2.532126696832579, + "grad_norm": 0.46353498101234436, + "learning_rate": 0.00034988855012772367, + "loss": 0.0543, + "mean_token_accuracy": 0.9821173399686813, + "num_tokens": 6185071.0, + "step": 701 + }, + { + "entropy": 1.6333596408367157, + "epoch": 2.5357466063348415, + "grad_norm": 0.4968421459197998, + "learning_rate": 0.0003494608272340039, + "loss": 0.1588, + "mean_token_accuracy": 0.9692430347204208, + "num_tokens": 6194279.0, + "step": 702 + }, + { + "entropy": 1.6701206266880035, + "epoch": 2.539366515837104, + "grad_norm": 0.7050784826278687, + "learning_rate": 0.00034903284450724385, + "loss": 0.1298, + "mean_token_accuracy": 0.9623726159334183, + "num_tokens": 6203017.0, + "step": 703 + }, + { + "entropy": 1.6594900786876678, + "epoch": 2.5429864253393664, + "grad_norm": 0.7955659031867981, + "learning_rate": 0.0003486046035729765, + "loss": 0.1695, + "mean_token_accuracy": 0.9616524875164032, + "num_tokens": 6212016.0, + "step": 704 + }, + { + "entropy": 1.7208792865276337, + "epoch": 2.546606334841629, + "grad_norm": 0.7105070352554321, + "learning_rate": 0.00034817610605771546, + "loss": 0.1655, + "mean_token_accuracy": 0.9637335985898972, + "num_tokens": 6220619.0, + "step": 705 + }, + { + "entropy": 1.668517529964447, + "epoch": 2.5502262443438912, + "grad_norm": 0.3955032527446747, + "learning_rate": 0.0003477473535889488, + "loss": 0.0502, + "mean_token_accuracy": 0.9823585599660873, + "num_tokens": 6229785.0, + "step": 706 + }, + { + "entropy": 1.7515103816986084, + "epoch": 2.5538461538461537, + "grad_norm": 0.6166616082191467, + "learning_rate": 0.00034731834779513313, + "loss": 0.1113, + "mean_token_accuracy": 0.9675650298595428, + "num_tokens": 6238724.0, + "step": 707 + }, + { + "entropy": 1.8460631668567657, + "epoch": 2.557466063348416, + "grad_norm": 0.8243921399116516, + "learning_rate": 0.0003468890903056872, + "loss": 0.1625, + "mean_token_accuracy": 0.9648249596357346, + "num_tokens": 6246939.0, + "step": 708 + }, + { + "entropy": 1.784417450428009, + "epoch": 2.5610859728506785, + "grad_norm": 0.5633116960525513, + "learning_rate": 0.00034645958275098557, + "loss": 0.1074, + "mean_token_accuracy": 0.9705483913421631, + "num_tokens": 6255686.0, + "step": 709 + }, + { + "entropy": 1.7208334505558014, + "epoch": 2.564705882352941, + "grad_norm": 0.8083389401435852, + "learning_rate": 0.0003460298267623526, + "loss": 0.1184, + "mean_token_accuracy": 0.9747882932424545, + "num_tokens": 6265047.0, + "step": 710 + }, + { + "entropy": 1.7345463037490845, + "epoch": 2.5683257918552034, + "grad_norm": 0.6094368100166321, + "learning_rate": 0.0003455998239720565, + "loss": 0.1689, + "mean_token_accuracy": 0.9613602459430695, + "num_tokens": 6274460.0, + "step": 711 + }, + { + "entropy": 1.9464713335037231, + "epoch": 2.571945701357466, + "grad_norm": 0.6025084853172302, + "learning_rate": 0.0003451695760133025, + "loss": 0.1477, + "mean_token_accuracy": 0.9618766456842422, + "num_tokens": 6282700.0, + "step": 712 + }, + { + "entropy": 1.8449675738811493, + "epoch": 2.5755656108597282, + "grad_norm": 0.43869853019714355, + "learning_rate": 0.0003447390845202272, + "loss": 0.0892, + "mean_token_accuracy": 0.974039301276207, + "num_tokens": 6291627.0, + "step": 713 + }, + { + "entropy": 1.9028298556804657, + "epoch": 2.579185520361991, + "grad_norm": 0.5455291271209717, + "learning_rate": 0.0003443083511278922, + "loss": 0.0939, + "mean_token_accuracy": 0.9729337990283966, + "num_tokens": 6300198.0, + "step": 714 + }, + { + "entropy": 1.8395194113254547, + "epoch": 2.5828054298642535, + "grad_norm": 0.48734748363494873, + "learning_rate": 0.00034387737747227786, + "loss": 0.0791, + "mean_token_accuracy": 0.9785804748535156, + "num_tokens": 6309362.0, + "step": 715 + }, + { + "entropy": 1.8357026278972626, + "epoch": 2.586425339366516, + "grad_norm": 0.4359396994113922, + "learning_rate": 0.000343446165190277, + "loss": 0.0752, + "mean_token_accuracy": 0.9807359129190445, + "num_tokens": 6318232.0, + "step": 716 + }, + { + "entropy": 1.7531521618366241, + "epoch": 2.5900452488687784, + "grad_norm": 0.7446436882019043, + "learning_rate": 0.0003430147159196887, + "loss": 0.1467, + "mean_token_accuracy": 0.9661064445972443, + "num_tokens": 6327607.0, + "step": 717 + }, + { + "entropy": 1.83816197514534, + "epoch": 2.593665158371041, + "grad_norm": 0.3669150173664093, + "learning_rate": 0.0003425830312992125, + "loss": 0.076, + "mean_token_accuracy": 0.9777591675519943, + "num_tokens": 6336991.0, + "step": 718 + }, + { + "entropy": 1.9396244585514069, + "epoch": 2.5972850678733033, + "grad_norm": 0.6049129962921143, + "learning_rate": 0.00034215111296844147, + "loss": 0.1001, + "mean_token_accuracy": 0.968943640589714, + "num_tokens": 6345381.0, + "step": 719 + }, + { + "entropy": 1.8745197057724, + "epoch": 2.6009049773755657, + "grad_norm": 0.8561233878135681, + "learning_rate": 0.00034171896256785645, + "loss": 0.2378, + "mean_token_accuracy": 0.9442594349384308, + "num_tokens": 6354290.0, + "step": 720 + }, + { + "entropy": 1.8199078440666199, + "epoch": 2.604524886877828, + "grad_norm": 0.4546636939048767, + "learning_rate": 0.00034128658173881993, + "loss": 0.0407, + "mean_token_accuracy": 0.9873656630516052, + "num_tokens": 6362826.0, + "step": 721 + }, + { + "entropy": 1.8066097497940063, + "epoch": 2.6081447963800906, + "grad_norm": 0.6496687531471252, + "learning_rate": 0.0003408539721235691, + "loss": 0.1279, + "mean_token_accuracy": 0.9674505293369293, + "num_tokens": 6371666.0, + "step": 722 + }, + { + "entropy": 1.8027856945991516, + "epoch": 2.611764705882353, + "grad_norm": 0.6001412272453308, + "learning_rate": 0.0003404211353652106, + "loss": 0.1144, + "mean_token_accuracy": 0.9672902077436447, + "num_tokens": 6380469.0, + "step": 723 + }, + { + "entropy": 1.7859437465667725, + "epoch": 2.6153846153846154, + "grad_norm": 0.4654795229434967, + "learning_rate": 0.0003399880731077136, + "loss": 0.0655, + "mean_token_accuracy": 0.9804074019193649, + "num_tokens": 6389485.0, + "step": 724 + }, + { + "entropy": 1.722127079963684, + "epoch": 2.619004524886878, + "grad_norm": 0.5452624559402466, + "learning_rate": 0.0003395547869959037, + "loss": 0.0827, + "mean_token_accuracy": 0.972189649939537, + "num_tokens": 6398523.0, + "step": 725 + }, + { + "entropy": 1.7406074404716492, + "epoch": 2.6226244343891403, + "grad_norm": 0.5524203181266785, + "learning_rate": 0.00033912127867545685, + "loss": 0.1279, + "mean_token_accuracy": 0.9688322842121124, + "num_tokens": 6407560.0, + "step": 726 + }, + { + "entropy": 1.7783840000629425, + "epoch": 2.6262443438914027, + "grad_norm": 0.6428073644638062, + "learning_rate": 0.00033868754979289275, + "loss": 0.1392, + "mean_token_accuracy": 0.9665655642747879, + "num_tokens": 6416230.0, + "step": 727 + }, + { + "entropy": 1.7406431436538696, + "epoch": 2.629864253393665, + "grad_norm": 0.6197221875190735, + "learning_rate": 0.0003382536019955691, + "loss": 0.2688, + "mean_token_accuracy": 0.9567561745643616, + "num_tokens": 6425158.0, + "step": 728 + }, + { + "entropy": 1.7054848670959473, + "epoch": 2.6334841628959276, + "grad_norm": 0.499615877866745, + "learning_rate": 0.0003378194369316749, + "loss": 0.0765, + "mean_token_accuracy": 0.9788558930158615, + "num_tokens": 6434219.0, + "step": 729 + }, + { + "entropy": 1.8623437583446503, + "epoch": 2.63710407239819, + "grad_norm": 0.428608775138855, + "learning_rate": 0.0003373850562502243, + "loss": 0.044, + "mean_token_accuracy": 0.9862259030342102, + "num_tokens": 6442657.0, + "step": 730 + }, + { + "entropy": 1.6827208995819092, + "epoch": 2.6407239819004524, + "grad_norm": 0.46222713589668274, + "learning_rate": 0.00033695046160105076, + "loss": 0.0687, + "mean_token_accuracy": 0.9762164503335953, + "num_tokens": 6451550.0, + "step": 731 + }, + { + "entropy": 1.707773894071579, + "epoch": 2.644343891402715, + "grad_norm": 0.4701695442199707, + "learning_rate": 0.0003365156546347998, + "loss": 0.0622, + "mean_token_accuracy": 0.9804075062274933, + "num_tokens": 6460494.0, + "step": 732 + }, + { + "entropy": 1.7011042833328247, + "epoch": 2.6479638009049773, + "grad_norm": 0.5986224412918091, + "learning_rate": 0.0003360806370029239, + "loss": 0.0954, + "mean_token_accuracy": 0.9730664491653442, + "num_tokens": 6469728.0, + "step": 733 + }, + { + "entropy": 1.810427963733673, + "epoch": 2.6515837104072397, + "grad_norm": 0.8224559426307678, + "learning_rate": 0.0003356454103576754, + "loss": 0.1218, + "mean_token_accuracy": 0.9742488712072372, + "num_tokens": 6478643.0, + "step": 734 + }, + { + "entropy": 1.773183435201645, + "epoch": 2.655203619909502, + "grad_norm": 0.609344482421875, + "learning_rate": 0.0003352099763521006, + "loss": 0.0955, + "mean_token_accuracy": 0.9747250378131866, + "num_tokens": 6487314.0, + "step": 735 + }, + { + "entropy": 1.7761066555976868, + "epoch": 2.6588235294117646, + "grad_norm": 0.6947258114814758, + "learning_rate": 0.0003347743366400333, + "loss": 0.1188, + "mean_token_accuracy": 0.9693178832530975, + "num_tokens": 6496074.0, + "step": 736 + }, + { + "entropy": 1.7725336253643036, + "epoch": 2.662443438914027, + "grad_norm": 0.6928444504737854, + "learning_rate": 0.0003343384928760887, + "loss": 0.1589, + "mean_token_accuracy": 0.9603369683027267, + "num_tokens": 6504997.0, + "step": 737 + }, + { + "entropy": 1.8763961493968964, + "epoch": 2.6660633484162894, + "grad_norm": 0.6204855442047119, + "learning_rate": 0.00033390244671565694, + "loss": 0.1115, + "mean_token_accuracy": 0.9727036952972412, + "num_tokens": 6513639.0, + "step": 738 + }, + { + "entropy": 1.8347080647945404, + "epoch": 2.669683257918552, + "grad_norm": 0.4470975697040558, + "learning_rate": 0.00033346619981489687, + "loss": 0.0707, + "mean_token_accuracy": 0.9816004037857056, + "num_tokens": 6522524.0, + "step": 739 + }, + { + "entropy": 1.8440867066383362, + "epoch": 2.6733031674208148, + "grad_norm": 0.6848122477531433, + "learning_rate": 0.0003330297538307298, + "loss": 0.1133, + "mean_token_accuracy": 0.966602012515068, + "num_tokens": 6531421.0, + "step": 740 + }, + { + "entropy": 1.829009771347046, + "epoch": 2.676923076923077, + "grad_norm": 0.37875643372535706, + "learning_rate": 0.0003325931104208333, + "loss": 0.0539, + "mean_token_accuracy": 0.9850967526435852, + "num_tokens": 6540304.0, + "step": 741 + }, + { + "entropy": 1.8256315886974335, + "epoch": 2.6805429864253396, + "grad_norm": 0.4970630407333374, + "learning_rate": 0.00033215627124363466, + "loss": 0.1195, + "mean_token_accuracy": 0.9662436544895172, + "num_tokens": 6549267.0, + "step": 742 + }, + { + "entropy": 1.823629915714264, + "epoch": 2.684162895927602, + "grad_norm": 0.659981906414032, + "learning_rate": 0.0003317192379583047, + "loss": 0.1368, + "mean_token_accuracy": 0.9655566364526749, + "num_tokens": 6558447.0, + "step": 743 + }, + { + "entropy": 1.8459455370903015, + "epoch": 2.6877828054298645, + "grad_norm": 0.620197057723999, + "learning_rate": 0.0003312820122247515, + "loss": 0.1766, + "mean_token_accuracy": 0.9569400995969772, + "num_tokens": 6567424.0, + "step": 744 + }, + { + "entropy": 1.7685991525650024, + "epoch": 2.691402714932127, + "grad_norm": 0.34498465061187744, + "learning_rate": 0.0003308445957036142, + "loss": 0.0615, + "mean_token_accuracy": 0.982216015458107, + "num_tokens": 6577071.0, + "step": 745 + }, + { + "entropy": 1.8037284910678864, + "epoch": 2.6950226244343893, + "grad_norm": 0.5550521016120911, + "learning_rate": 0.00033040699005625654, + "loss": 0.0701, + "mean_token_accuracy": 0.9795115292072296, + "num_tokens": 6586396.0, + "step": 746 + }, + { + "entropy": 1.813001424074173, + "epoch": 2.6986425339366518, + "grad_norm": 0.4117080271244049, + "learning_rate": 0.0003299691969447603, + "loss": 0.0657, + "mean_token_accuracy": 0.978747770190239, + "num_tokens": 6595189.0, + "step": 747 + }, + { + "entropy": 1.844575196504593, + "epoch": 2.702262443438914, + "grad_norm": 0.32197874784469604, + "learning_rate": 0.00032953121803191976, + "loss": 0.0342, + "mean_token_accuracy": 0.9904316365718842, + "num_tokens": 6604169.0, + "step": 748 + }, + { + "entropy": 1.9490505158901215, + "epoch": 2.7058823529411766, + "grad_norm": 0.5810762047767639, + "learning_rate": 0.00032909305498123465, + "loss": 0.1419, + "mean_token_accuracy": 0.9646100401878357, + "num_tokens": 6612744.0, + "step": 749 + }, + { + "entropy": 1.9927488267421722, + "epoch": 2.709502262443439, + "grad_norm": 0.7435065507888794, + "learning_rate": 0.0003286547094569039, + "loss": 0.1368, + "mean_token_accuracy": 0.9609140008687973, + "num_tokens": 6621000.0, + "step": 750 + }, + { + "entropy": 1.8266884088516235, + "epoch": 2.7131221719457015, + "grad_norm": 0.6717537045478821, + "learning_rate": 0.00032821618312381975, + "loss": 0.1449, + "mean_token_accuracy": 0.9694183021783829, + "num_tokens": 6629893.0, + "step": 751 + }, + { + "entropy": 1.850794643163681, + "epoch": 2.716742081447964, + "grad_norm": 0.44241195917129517, + "learning_rate": 0.00032777747764756117, + "loss": 0.0602, + "mean_token_accuracy": 0.9823136776685715, + "num_tokens": 6638696.0, + "step": 752 + }, + { + "entropy": 1.8408480882644653, + "epoch": 2.7203619909502263, + "grad_norm": 0.6299809217453003, + "learning_rate": 0.00032733859469438736, + "loss": 0.1408, + "mean_token_accuracy": 0.9629880636930466, + "num_tokens": 6647431.0, + "step": 753 + }, + { + "entropy": 1.7875444293022156, + "epoch": 2.723981900452489, + "grad_norm": 0.48492106795310974, + "learning_rate": 0.00032689953593123175, + "loss": 0.0806, + "mean_token_accuracy": 0.9798424690961838, + "num_tokens": 6656443.0, + "step": 754 + }, + { + "entropy": 1.778283566236496, + "epoch": 2.727601809954751, + "grad_norm": 0.46145930886268616, + "learning_rate": 0.0003264603030256955, + "loss": 0.0707, + "mean_token_accuracy": 0.9741399586200714, + "num_tokens": 6665465.0, + "step": 755 + }, + { + "entropy": 1.7340950965881348, + "epoch": 2.7312217194570136, + "grad_norm": 0.5734900236129761, + "learning_rate": 0.00032602089764604126, + "loss": 0.1443, + "mean_token_accuracy": 0.96195288002491, + "num_tokens": 6674797.0, + "step": 756 + }, + { + "entropy": 1.7791962027549744, + "epoch": 2.734841628959276, + "grad_norm": 0.5199477076530457, + "learning_rate": 0.00032558132146118636, + "loss": 0.0794, + "mean_token_accuracy": 0.975062221288681, + "num_tokens": 6683578.0, + "step": 757 + }, + { + "entropy": 1.825905591249466, + "epoch": 2.7384615384615385, + "grad_norm": 0.5944926738739014, + "learning_rate": 0.0003251415761406975, + "loss": 0.0909, + "mean_token_accuracy": 0.954865038394928, + "num_tokens": 6691818.0, + "step": 758 + }, + { + "entropy": 1.804949015378952, + "epoch": 2.742081447963801, + "grad_norm": 0.7065241932868958, + "learning_rate": 0.0003247016633547833, + "loss": 0.1511, + "mean_token_accuracy": 0.9687065333127975, + "num_tokens": 6700619.0, + "step": 759 + }, + { + "entropy": 1.7419202327728271, + "epoch": 2.7457013574660634, + "grad_norm": 0.49316564202308655, + "learning_rate": 0.00032426158477428857, + "loss": 0.0867, + "mean_token_accuracy": 0.9774050414562225, + "num_tokens": 6709635.0, + "step": 760 + }, + { + "entropy": 1.8934829235076904, + "epoch": 2.749321266968326, + "grad_norm": 0.9417999386787415, + "learning_rate": 0.00032382134207068787, + "loss": 0.1464, + "mean_token_accuracy": 0.9591032713651657, + "num_tokens": 6717657.0, + "step": 761 + }, + { + "entropy": 1.7354997992515564, + "epoch": 2.7529411764705882, + "grad_norm": 0.7240809798240662, + "learning_rate": 0.00032338093691607907, + "loss": 0.13, + "mean_token_accuracy": 0.9705345183610916, + "num_tokens": 6726671.0, + "step": 762 + }, + { + "entropy": 1.7620687186717987, + "epoch": 2.7565610859728507, + "grad_norm": 0.4986638128757477, + "learning_rate": 0.0003229403709831772, + "loss": 0.0963, + "mean_token_accuracy": 0.9756871312856674, + "num_tokens": 6735157.0, + "step": 763 + }, + { + "entropy": 1.7719130218029022, + "epoch": 2.760180995475113, + "grad_norm": 0.6204966902732849, + "learning_rate": 0.00032249964594530757, + "loss": 0.0578, + "mean_token_accuracy": 0.9815829247236252, + "num_tokens": 6743855.0, + "step": 764 + }, + { + "entropy": 1.7228702902793884, + "epoch": 2.7638009049773755, + "grad_norm": 0.5283492207527161, + "learning_rate": 0.0003220587634764003, + "loss": 0.069, + "mean_token_accuracy": 0.9851528853178024, + "num_tokens": 6753040.0, + "step": 765 + }, + { + "entropy": 1.7129736840724945, + "epoch": 2.767420814479638, + "grad_norm": 0.49026060104370117, + "learning_rate": 0.0003216177252509831, + "loss": 0.0672, + "mean_token_accuracy": 0.9857761710882187, + "num_tokens": 6762014.0, + "step": 766 + }, + { + "entropy": 1.7600707411766052, + "epoch": 2.7710407239819004, + "grad_norm": 0.5250128507614136, + "learning_rate": 0.00032117653294417523, + "loss": 0.1134, + "mean_token_accuracy": 0.9638848602771759, + "num_tokens": 6771012.0, + "step": 767 + }, + { + "entropy": 1.768298476934433, + "epoch": 2.774660633484163, + "grad_norm": 0.5671310424804688, + "learning_rate": 0.00032073518823168143, + "loss": 0.057, + "mean_token_accuracy": 0.9840837568044662, + "num_tokens": 6779601.0, + "step": 768 + }, + { + "entropy": 1.7464122474193573, + "epoch": 2.7782805429864252, + "grad_norm": 0.6007266044616699, + "learning_rate": 0.0003202936927897852, + "loss": 0.081, + "mean_token_accuracy": 0.9773043692111969, + "num_tokens": 6788518.0, + "step": 769 + }, + { + "entropy": 1.6484523713588715, + "epoch": 2.7819004524886877, + "grad_norm": 0.5163906812667847, + "learning_rate": 0.00031985204829534236, + "loss": 0.1215, + "mean_token_accuracy": 0.9645300209522247, + "num_tokens": 6797924.0, + "step": 770 + }, + { + "entropy": 1.7306124567985535, + "epoch": 2.78552036199095, + "grad_norm": 0.5778948068618774, + "learning_rate": 0.00031941025642577515, + "loss": 0.127, + "mean_token_accuracy": 0.9713134616613388, + "num_tokens": 6806828.0, + "step": 771 + }, + { + "entropy": 1.6599189043045044, + "epoch": 2.7891402714932125, + "grad_norm": 0.5121646523475647, + "learning_rate": 0.0003189683188590653, + "loss": 0.1066, + "mean_token_accuracy": 0.9707446396350861, + "num_tokens": 6816144.0, + "step": 772 + }, + { + "entropy": 1.71377295255661, + "epoch": 2.792760180995475, + "grad_norm": 0.9535031318664551, + "learning_rate": 0.00031852623727374787, + "loss": 0.2316, + "mean_token_accuracy": 0.9587533473968506, + "num_tokens": 6824849.0, + "step": 773 + }, + { + "entropy": 1.7716725766658783, + "epoch": 2.7963800904977374, + "grad_norm": 0.5735589265823364, + "learning_rate": 0.00031808401334890537, + "loss": 0.1028, + "mean_token_accuracy": 0.9716143608093262, + "num_tokens": 6833331.0, + "step": 774 + }, + { + "entropy": 1.7134707272052765, + "epoch": 2.8, + "grad_norm": 0.7087857127189636, + "learning_rate": 0.00031764164876416036, + "loss": 0.1201, + "mean_token_accuracy": 0.9686445444822311, + "num_tokens": 6842254.0, + "step": 775 + }, + { + "entropy": 1.6055873930454254, + "epoch": 2.8036199095022623, + "grad_norm": 0.4578965902328491, + "learning_rate": 0.00031719914519967, + "loss": 0.0827, + "mean_token_accuracy": 0.972065269947052, + "num_tokens": 6851644.0, + "step": 776 + }, + { + "entropy": 1.6444376707077026, + "epoch": 2.8072398190045247, + "grad_norm": 0.5656917095184326, + "learning_rate": 0.0003167565043361194, + "loss": 0.1036, + "mean_token_accuracy": 0.9723617881536484, + "num_tokens": 6860787.0, + "step": 777 + }, + { + "entropy": 1.6980305314064026, + "epoch": 2.810859728506787, + "grad_norm": 0.7013098001480103, + "learning_rate": 0.0003163137278547146, + "loss": 0.0838, + "mean_token_accuracy": 0.9793482422828674, + "num_tokens": 6869378.0, + "step": 778 + }, + { + "entropy": 1.6744478940963745, + "epoch": 2.8144796380090495, + "grad_norm": 0.6889812350273132, + "learning_rate": 0.00031587081743717735, + "loss": 0.0964, + "mean_token_accuracy": 0.9762091189622879, + "num_tokens": 6878050.0, + "step": 779 + }, + { + "entropy": 1.6397214829921722, + "epoch": 2.818099547511312, + "grad_norm": 0.7166011333465576, + "learning_rate": 0.00031542777476573785, + "loss": 0.1792, + "mean_token_accuracy": 0.9539972990751266, + "num_tokens": 6887153.0, + "step": 780 + }, + { + "entropy": 1.6447750926017761, + "epoch": 2.8217194570135744, + "grad_norm": 0.7113035321235657, + "learning_rate": 0.0003149846015231286, + "loss": 0.1464, + "mean_token_accuracy": 0.96909099817276, + "num_tokens": 6895877.0, + "step": 781 + }, + { + "entropy": 1.6827795505523682, + "epoch": 2.825339366515837, + "grad_norm": 0.6915350556373596, + "learning_rate": 0.0003145412993925781, + "loss": 0.1335, + "mean_token_accuracy": 0.9615183472633362, + "num_tokens": 6904553.0, + "step": 782 + }, + { + "entropy": 1.6189779937267303, + "epoch": 2.8289592760180997, + "grad_norm": 0.467428982257843, + "learning_rate": 0.00031409787005780423, + "loss": 0.0829, + "mean_token_accuracy": 0.9781016558408737, + "num_tokens": 6913634.0, + "step": 783 + }, + { + "entropy": 1.6323690116405487, + "epoch": 2.832579185520362, + "grad_norm": 0.49170154333114624, + "learning_rate": 0.00031365431520300813, + "loss": 0.0828, + "mean_token_accuracy": 0.9719655811786652, + "num_tokens": 6922638.0, + "step": 784 + }, + { + "entropy": 1.6121336817741394, + "epoch": 2.8361990950226246, + "grad_norm": 0.5629302263259888, + "learning_rate": 0.00031321063651286777, + "loss": 0.0757, + "mean_token_accuracy": 0.9791934490203857, + "num_tokens": 6931590.0, + "step": 785 + }, + { + "entropy": 1.7345627546310425, + "epoch": 2.839819004524887, + "grad_norm": 0.5514137148857117, + "learning_rate": 0.0003127668356725313, + "loss": 0.0819, + "mean_token_accuracy": 0.9800210148096085, + "num_tokens": 6940137.0, + "step": 786 + }, + { + "entropy": 1.6671563386917114, + "epoch": 2.8434389140271494, + "grad_norm": 0.5090643167495728, + "learning_rate": 0.0003123229143676109, + "loss": 0.0794, + "mean_token_accuracy": 0.9826332330703735, + "num_tokens": 6948616.0, + "step": 787 + }, + { + "entropy": 1.551501840353012, + "epoch": 2.847058823529412, + "grad_norm": 0.3994922935962677, + "learning_rate": 0.0003118788742841761, + "loss": 0.0491, + "mean_token_accuracy": 0.9865831136703491, + "num_tokens": 6957369.0, + "step": 788 + }, + { + "entropy": 1.500845193862915, + "epoch": 2.8506787330316743, + "grad_norm": 0.6023295521736145, + "learning_rate": 0.00031143471710874795, + "loss": 0.114, + "mean_token_accuracy": 0.9669302552938461, + "num_tokens": 6966667.0, + "step": 789 + }, + { + "entropy": 1.5258118510246277, + "epoch": 2.8542986425339367, + "grad_norm": 0.5326524972915649, + "learning_rate": 0.00031099044452829186, + "loss": 0.0657, + "mean_token_accuracy": 0.9833361059427261, + "num_tokens": 6975880.0, + "step": 790 + }, + { + "entropy": 1.5674570798873901, + "epoch": 2.857918552036199, + "grad_norm": 0.4518730044364929, + "learning_rate": 0.00031054605823021186, + "loss": 0.0569, + "mean_token_accuracy": 0.9832890778779984, + "num_tokens": 6984824.0, + "step": 791 + }, + { + "entropy": 1.5301121771335602, + "epoch": 2.8615384615384616, + "grad_norm": 0.5933698415756226, + "learning_rate": 0.00031010155990234364, + "loss": 0.1129, + "mean_token_accuracy": 0.9684284627437592, + "num_tokens": 6994076.0, + "step": 792 + }, + { + "entropy": 1.5711756348609924, + "epoch": 2.865158371040724, + "grad_norm": 0.6634730696678162, + "learning_rate": 0.00030965695123294837, + "loss": 0.1204, + "mean_token_accuracy": 0.972825437784195, + "num_tokens": 7003048.0, + "step": 793 + }, + { + "entropy": 1.6537431180477142, + "epoch": 2.8687782805429864, + "grad_norm": 0.5688450336456299, + "learning_rate": 0.0003092122339107067, + "loss": 0.0659, + "mean_token_accuracy": 0.9861912727355957, + "num_tokens": 7011743.0, + "step": 794 + }, + { + "entropy": 1.731940358877182, + "epoch": 2.872398190045249, + "grad_norm": 0.9030163288116455, + "learning_rate": 0.0003087674096247115, + "loss": 0.0829, + "mean_token_accuracy": 0.9802074134349823, + "num_tokens": 7020003.0, + "step": 795 + }, + { + "entropy": 1.6672345995903015, + "epoch": 2.8760180995475113, + "grad_norm": 0.5129911303520203, + "learning_rate": 0.00030832248006446223, + "loss": 0.0823, + "mean_token_accuracy": 0.9805259853601456, + "num_tokens": 7029275.0, + "step": 796 + }, + { + "entropy": 1.7102139592170715, + "epoch": 2.8796380090497737, + "grad_norm": 0.6210790872573853, + "learning_rate": 0.00030787744691985797, + "loss": 0.1248, + "mean_token_accuracy": 0.9665560126304626, + "num_tokens": 7038068.0, + "step": 797 + }, + { + "entropy": 1.659182459115982, + "epoch": 2.883257918552036, + "grad_norm": 0.6379976868629456, + "learning_rate": 0.0003074323118811913, + "loss": 0.1065, + "mean_token_accuracy": 0.9647062122821808, + "num_tokens": 7047039.0, + "step": 798 + }, + { + "entropy": 1.6344517767429352, + "epoch": 2.8868778280542986, + "grad_norm": 0.5851842761039734, + "learning_rate": 0.00030698707663914186, + "loss": 0.1046, + "mean_token_accuracy": 0.9666399955749512, + "num_tokens": 7056105.0, + "step": 799 + }, + { + "entropy": 1.6803805828094482, + "epoch": 2.890497737556561, + "grad_norm": 0.5926725268363953, + "learning_rate": 0.00030654174288477, + "loss": 0.1019, + "mean_token_accuracy": 0.9712099581956863, + "num_tokens": 7064710.0, + "step": 800 + }, + { + "entropy": 1.7004003822803497, + "epoch": 2.8941176470588235, + "grad_norm": 0.6103729605674744, + "learning_rate": 0.0003060963123095098, + "loss": 0.091, + "mean_token_accuracy": 0.9780148714780807, + "num_tokens": 7073218.0, + "step": 801 + }, + { + "entropy": 1.8133964240550995, + "epoch": 2.897737556561086, + "grad_norm": 0.872008740901947, + "learning_rate": 0.0003056507866051636, + "loss": 0.3003, + "mean_token_accuracy": 0.9385994374752045, + "num_tokens": 7081791.0, + "step": 802 + }, + { + "entropy": 1.7527997195720673, + "epoch": 2.9013574660633483, + "grad_norm": 0.553669810295105, + "learning_rate": 0.0003052051674638945, + "loss": 0.0999, + "mean_token_accuracy": 0.9695112109184265, + "num_tokens": 7090196.0, + "step": 803 + }, + { + "entropy": 1.6374657154083252, + "epoch": 2.9049773755656108, + "grad_norm": 0.4158615469932556, + "learning_rate": 0.00030475945657822107, + "loss": 0.0682, + "mean_token_accuracy": 0.9802833646535873, + "num_tokens": 7099216.0, + "step": 804 + }, + { + "entropy": 1.6056133210659027, + "epoch": 2.908597285067873, + "grad_norm": 0.47468429803848267, + "learning_rate": 0.00030431365564101003, + "loss": 0.1188, + "mean_token_accuracy": 0.9720293581485748, + "num_tokens": 7108787.0, + "step": 805 + }, + { + "entropy": 1.7184821665287018, + "epoch": 2.9122171945701356, + "grad_norm": 0.6617569923400879, + "learning_rate": 0.00030386776634547003, + "loss": 0.1121, + "mean_token_accuracy": 0.9623472690582275, + "num_tokens": 7117158.0, + "step": 806 + }, + { + "entropy": 1.7546651065349579, + "epoch": 2.915837104072398, + "grad_norm": 0.5058173537254333, + "learning_rate": 0.0003034217903851454, + "loss": 0.0861, + "mean_token_accuracy": 0.9664297550916672, + "num_tokens": 7125800.0, + "step": 807 + }, + { + "entropy": 1.6985557675361633, + "epoch": 2.9194570135746605, + "grad_norm": 0.5197705626487732, + "learning_rate": 0.00030297572945390996, + "loss": 0.1009, + "mean_token_accuracy": 0.9677706956863403, + "num_tokens": 7134221.0, + "step": 808 + }, + { + "entropy": 1.6737182438373566, + "epoch": 2.9230769230769234, + "grad_norm": 0.4528989791870117, + "learning_rate": 0.00030252958524595966, + "loss": 0.0656, + "mean_token_accuracy": 0.9853187948465347, + "num_tokens": 7142716.0, + "step": 809 + }, + { + "entropy": 1.687746375799179, + "epoch": 2.926696832579186, + "grad_norm": 0.8552060723304749, + "learning_rate": 0.00030208335945580716, + "loss": 0.1584, + "mean_token_accuracy": 0.958037719130516, + "num_tokens": 7151288.0, + "step": 810 + }, + { + "entropy": 1.6994356215000153, + "epoch": 2.930316742081448, + "grad_norm": 0.470833957195282, + "learning_rate": 0.00030163705377827496, + "loss": 0.0537, + "mean_token_accuracy": 0.9804185479879379, + "num_tokens": 7159738.0, + "step": 811 + }, + { + "entropy": 1.7072536945343018, + "epoch": 2.9339366515837106, + "grad_norm": 0.5749104022979736, + "learning_rate": 0.0003011906699084888, + "loss": 0.0502, + "mean_token_accuracy": 0.9830235093832016, + "num_tokens": 7168101.0, + "step": 812 + }, + { + "entropy": 1.70310440659523, + "epoch": 2.937556561085973, + "grad_norm": 0.7587386965751648, + "learning_rate": 0.0003007442095418715, + "loss": 0.1362, + "mean_token_accuracy": 0.9594880938529968, + "num_tokens": 7176663.0, + "step": 813 + }, + { + "entropy": 1.6307457983493805, + "epoch": 2.9411764705882355, + "grad_norm": 0.5054190754890442, + "learning_rate": 0.00030029767437413665, + "loss": 0.0744, + "mean_token_accuracy": 0.9738886505365372, + "num_tokens": 7185376.0, + "step": 814 + }, + { + "entropy": 1.5872860848903656, + "epoch": 2.944796380090498, + "grad_norm": 0.5463546514511108, + "learning_rate": 0.00029985106610128147, + "loss": 0.0916, + "mean_token_accuracy": 0.9782509952783585, + "num_tokens": 7194304.0, + "step": 815 + }, + { + "entropy": 1.6643644273281097, + "epoch": 2.9484162895927604, + "grad_norm": 0.5434613823890686, + "learning_rate": 0.0002994043864195811, + "loss": 0.1007, + "mean_token_accuracy": 0.9665197134017944, + "num_tokens": 7202895.0, + "step": 816 + }, + { + "entropy": 1.701482743024826, + "epoch": 2.952036199095023, + "grad_norm": 1.2643967866897583, + "learning_rate": 0.00029895763702558206, + "loss": 0.1377, + "mean_token_accuracy": 0.9696027487516403, + "num_tokens": 7211000.0, + "step": 817 + }, + { + "entropy": 1.688760131597519, + "epoch": 2.9556561085972852, + "grad_norm": 0.5438109636306763, + "learning_rate": 0.00029851081961609536, + "loss": 0.0637, + "mean_token_accuracy": 0.9724639654159546, + "num_tokens": 7219274.0, + "step": 818 + }, + { + "entropy": 1.6547857522964478, + "epoch": 2.9592760180995477, + "grad_norm": 0.4520387649536133, + "learning_rate": 0.0002980639358881906, + "loss": 0.0376, + "mean_token_accuracy": 0.9887004494667053, + "num_tokens": 7228000.0, + "step": 819 + }, + { + "entropy": 1.5814381837844849, + "epoch": 2.96289592760181, + "grad_norm": 0.49122339487075806, + "learning_rate": 0.00029761698753918894, + "loss": 0.0533, + "mean_token_accuracy": 0.983299508690834, + "num_tokens": 7236798.0, + "step": 820 + }, + { + "entropy": 1.5796774625778198, + "epoch": 2.9665158371040725, + "grad_norm": 0.43303897976875305, + "learning_rate": 0.00029716997626665726, + "loss": 0.0517, + "mean_token_accuracy": 0.984140008687973, + "num_tokens": 7245570.0, + "step": 821 + }, + { + "entropy": 1.5434466302394867, + "epoch": 2.970135746606335, + "grad_norm": 0.5712567567825317, + "learning_rate": 0.0002967229037684014, + "loss": 0.0634, + "mean_token_accuracy": 0.9851510971784592, + "num_tokens": 7254482.0, + "step": 822 + }, + { + "entropy": 1.5368549823760986, + "epoch": 2.9737556561085974, + "grad_norm": 0.5042312741279602, + "learning_rate": 0.0002962757717424595, + "loss": 0.1041, + "mean_token_accuracy": 0.9698852747678757, + "num_tokens": 7263428.0, + "step": 823 + }, + { + "entropy": 1.5740615129470825, + "epoch": 2.97737556561086, + "grad_norm": 0.8506835699081421, + "learning_rate": 0.0002958285818870963, + "loss": 0.0653, + "mean_token_accuracy": 0.9827365875244141, + "num_tokens": 7272425.0, + "step": 824 + }, + { + "entropy": 1.625010073184967, + "epoch": 2.9809954751131222, + "grad_norm": 0.6260822415351868, + "learning_rate": 0.00029538133590079556, + "loss": 0.1112, + "mean_token_accuracy": 0.9715189933776855, + "num_tokens": 7281312.0, + "step": 825 + }, + { + "entropy": 1.6078990697860718, + "epoch": 2.9846153846153847, + "grad_norm": 0.4316014349460602, + "learning_rate": 0.00029493403548225467, + "loss": 0.059, + "mean_token_accuracy": 0.9821690768003464, + "num_tokens": 7289748.0, + "step": 826 + }, + { + "entropy": 1.6132618486881256, + "epoch": 2.988235294117647, + "grad_norm": 0.6471059322357178, + "learning_rate": 0.0002944866823303776, + "loss": 0.0839, + "mean_token_accuracy": 0.9747331887483597, + "num_tokens": 7298453.0, + "step": 827 + }, + { + "entropy": 1.6038751900196075, + "epoch": 2.9918552036199095, + "grad_norm": 0.5383681654930115, + "learning_rate": 0.0002940392781442686, + "loss": 0.0728, + "mean_token_accuracy": 0.9774085730314255, + "num_tokens": 7307116.0, + "step": 828 + }, + { + "entropy": 1.6446776688098907, + "epoch": 2.995475113122172, + "grad_norm": 0.5420554280281067, + "learning_rate": 0.0002935918246232259, + "loss": 0.0799, + "mean_token_accuracy": 0.977481946349144, + "num_tokens": 7315668.0, + "step": 829 + }, + { + "entropy": 1.5571844279766083, + "epoch": 2.9990950226244344, + "grad_norm": 0.6471306681632996, + "learning_rate": 0.00029314432346673485, + "loss": 0.1657, + "mean_token_accuracy": 0.9566951394081116, + "num_tokens": 7324721.0, + "step": 830 + }, + { + "entropy": 2.0783205032348633, + "epoch": 3.0, + "grad_norm": 3.195817232131958, + "learning_rate": 0.000292696776374462, + "loss": 0.0742, + "mean_token_accuracy": 0.96875, + "num_tokens": 7325175.0, + "step": 831 + }, + { + "epoch": 3.0, + "eval_entropy": 1.6213929740394033, + "eval_loss": 0.14780744910240173, + "eval_mean_token_accuracy": 0.9634173047251817, + "eval_num_tokens": 7325175.0, + "eval_runtime": 116.0041, + "eval_samples_per_second": 3.181, + "eval_steps_per_second": 1.06, + "step": 831 + }, + { + "entropy": 1.639732986688614, + "epoch": 3.0036199095022624, + "grad_norm": 0.45313218235969543, + "learning_rate": 0.00029224918504624814, + "loss": 0.0569, + "mean_token_accuracy": 0.9821487963199615, + "num_tokens": 7333756.0, + "step": 832 + }, + { + "entropy": 1.620821863412857, + "epoch": 3.007239819004525, + "grad_norm": 0.4471704363822937, + "learning_rate": 0.0002918015511821022, + "loss": 0.059, + "mean_token_accuracy": 0.9843536615371704, + "num_tokens": 7342266.0, + "step": 833 + }, + { + "entropy": 1.722977101802826, + "epoch": 3.0108597285067873, + "grad_norm": 0.5039600729942322, + "learning_rate": 0.0002913538764821947, + "loss": 0.0438, + "mean_token_accuracy": 0.9868119210004807, + "num_tokens": 7350541.0, + "step": 834 + }, + { + "entropy": 1.6466768980026245, + "epoch": 3.0144796380090497, + "grad_norm": 0.4470590054988861, + "learning_rate": 0.0002909061626468512, + "loss": 0.0418, + "mean_token_accuracy": 0.9859062284231186, + "num_tokens": 7359236.0, + "step": 835 + }, + { + "entropy": 1.6936305463314056, + "epoch": 3.018099547511312, + "grad_norm": 0.5103632211685181, + "learning_rate": 0.00029045841137654584, + "loss": 0.0649, + "mean_token_accuracy": 0.9817161113023758, + "num_tokens": 7367649.0, + "step": 836 + }, + { + "entropy": 1.5894218981266022, + "epoch": 3.0217194570135746, + "grad_norm": 0.4315621554851532, + "learning_rate": 0.000290010624371895, + "loss": 0.0779, + "mean_token_accuracy": 0.9756399989128113, + "num_tokens": 7376772.0, + "step": 837 + }, + { + "entropy": 1.6676535904407501, + "epoch": 3.025339366515837, + "grad_norm": 0.6142503023147583, + "learning_rate": 0.00028956280333365084, + "loss": 0.0601, + "mean_token_accuracy": 0.9850548654794693, + "num_tokens": 7385454.0, + "step": 838 + }, + { + "entropy": 1.6877512037754059, + "epoch": 3.0289592760180994, + "grad_norm": 0.5499544739723206, + "learning_rate": 0.0002891149499626948, + "loss": 0.06, + "mean_token_accuracy": 0.980460986495018, + "num_tokens": 7393843.0, + "step": 839 + }, + { + "entropy": 1.64662566781044, + "epoch": 3.032579185520362, + "grad_norm": 0.7865297198295593, + "learning_rate": 0.00028866706596003094, + "loss": 0.1098, + "mean_token_accuracy": 0.9614097326993942, + "num_tokens": 7402203.0, + "step": 840 + }, + { + "entropy": 1.5609408617019653, + "epoch": 3.0361990950226243, + "grad_norm": 0.5209096074104309, + "learning_rate": 0.0002882191530267797, + "loss": 0.0893, + "mean_token_accuracy": 0.9731233417987823, + "num_tokens": 7411227.0, + "step": 841 + }, + { + "entropy": 1.6110387742519379, + "epoch": 3.0398190045248867, + "grad_norm": 0.49672260880470276, + "learning_rate": 0.00028777121286417185, + "loss": 0.0512, + "mean_token_accuracy": 0.9793709367513657, + "num_tokens": 7419751.0, + "step": 842 + }, + { + "entropy": 1.5630280673503876, + "epoch": 3.043438914027149, + "grad_norm": 0.5099878907203674, + "learning_rate": 0.00028732324717354083, + "loss": 0.0447, + "mean_token_accuracy": 0.9830391556024551, + "num_tokens": 7428533.0, + "step": 843 + }, + { + "entropy": 1.5407153069972992, + "epoch": 3.0470588235294116, + "grad_norm": 0.7725343704223633, + "learning_rate": 0.0002868752576563175, + "loss": 0.071, + "mean_token_accuracy": 0.9820850938558578, + "num_tokens": 7437390.0, + "step": 844 + }, + { + "entropy": 1.5895936191082, + "epoch": 3.050678733031674, + "grad_norm": 0.5729185938835144, + "learning_rate": 0.0002864272460140234, + "loss": 0.0651, + "mean_token_accuracy": 0.9816865175962448, + "num_tokens": 7445715.0, + "step": 845 + }, + { + "entropy": 1.5614444315433502, + "epoch": 3.0542986425339365, + "grad_norm": 0.49079445004463196, + "learning_rate": 0.00028597921394826346, + "loss": 0.078, + "mean_token_accuracy": 0.9791339933872223, + "num_tokens": 7454770.0, + "step": 846 + }, + { + "entropy": 1.4948404431343079, + "epoch": 3.057918552036199, + "grad_norm": 0.45897549390792847, + "learning_rate": 0.0002855311631607209, + "loss": 0.0506, + "mean_token_accuracy": 0.9858186691999435, + "num_tokens": 7463578.0, + "step": 847 + }, + { + "entropy": 1.4837007820606232, + "epoch": 3.0615384615384613, + "grad_norm": 0.6153395771980286, + "learning_rate": 0.0002850830953531494, + "loss": 0.0862, + "mean_token_accuracy": 0.9767305850982666, + "num_tokens": 7472726.0, + "step": 848 + }, + { + "entropy": 1.5454865992069244, + "epoch": 3.065158371040724, + "grad_norm": 0.9645626544952393, + "learning_rate": 0.00028463501222736787, + "loss": 0.1448, + "mean_token_accuracy": 0.9689669013023376, + "num_tokens": 7481594.0, + "step": 849 + }, + { + "entropy": 1.503423035144806, + "epoch": 3.0687782805429866, + "grad_norm": 0.5449880361557007, + "learning_rate": 0.00028418691548525306, + "loss": 0.0809, + "mean_token_accuracy": 0.9776449203491211, + "num_tokens": 7490420.0, + "step": 850 + }, + { + "entropy": 1.43245068192482, + "epoch": 3.072398190045249, + "grad_norm": 0.7362976670265198, + "learning_rate": 0.0002837388068287334, + "loss": 0.0956, + "mean_token_accuracy": 0.9742193967103958, + "num_tokens": 7499567.0, + "step": 851 + }, + { + "entropy": 1.4874032735824585, + "epoch": 3.0760180995475115, + "grad_norm": 0.5615106821060181, + "learning_rate": 0.00028329068795978274, + "loss": 0.0837, + "mean_token_accuracy": 0.9790486842393875, + "num_tokens": 7508507.0, + "step": 852 + }, + { + "entropy": 1.4498116374015808, + "epoch": 3.079638009049774, + "grad_norm": 0.4348931610584259, + "learning_rate": 0.00028284256058041363, + "loss": 0.0634, + "mean_token_accuracy": 0.9843485057353973, + "num_tokens": 7517576.0, + "step": 853 + }, + { + "entropy": 1.5673332512378693, + "epoch": 3.0832579185520363, + "grad_norm": 0.5114635825157166, + "learning_rate": 0.000282394426392671, + "loss": 0.0663, + "mean_token_accuracy": 0.9789460599422455, + "num_tokens": 7526323.0, + "step": 854 + }, + { + "entropy": 1.5406886637210846, + "epoch": 3.086877828054299, + "grad_norm": 0.4056108593940735, + "learning_rate": 0.0002819462870986256, + "loss": 0.051, + "mean_token_accuracy": 0.985608771443367, + "num_tokens": 7535015.0, + "step": 855 + }, + { + "entropy": 1.5750982761383057, + "epoch": 3.090497737556561, + "grad_norm": 0.5252281427383423, + "learning_rate": 0.00028149814440036757, + "loss": 0.0426, + "mean_token_accuracy": 0.9839760363101959, + "num_tokens": 7543712.0, + "step": 856 + }, + { + "entropy": 1.5696458220481873, + "epoch": 3.0941176470588236, + "grad_norm": 0.7764946222305298, + "learning_rate": 0.00028105, + "loss": 0.1381, + "mean_token_accuracy": 0.9690622985363007, + "num_tokens": 7552651.0, + "step": 857 + }, + { + "entropy": 1.601443588733673, + "epoch": 3.097737556561086, + "grad_norm": 0.5605809688568115, + "learning_rate": 0.0002806018555996324, + "loss": 0.0847, + "mean_token_accuracy": 0.9771271497011185, + "num_tokens": 7561157.0, + "step": 858 + }, + { + "entropy": 1.5733444690704346, + "epoch": 3.1013574660633485, + "grad_norm": 0.6177924871444702, + "learning_rate": 0.00028015371290137443, + "loss": 0.0654, + "mean_token_accuracy": 0.9801760017871857, + "num_tokens": 7569884.0, + "step": 859 + }, + { + "entropy": 1.6089049577713013, + "epoch": 3.104977375565611, + "grad_norm": 0.9339480400085449, + "learning_rate": 0.000279705573607329, + "loss": 0.0924, + "mean_token_accuracy": 0.9692800939083099, + "num_tokens": 7578417.0, + "step": 860 + }, + { + "entropy": 1.58562570810318, + "epoch": 3.1085972850678734, + "grad_norm": 0.6229763031005859, + "learning_rate": 0.00027925743941958637, + "loss": 0.0689, + "mean_token_accuracy": 0.9820535033941269, + "num_tokens": 7586899.0, + "step": 861 + }, + { + "entropy": 1.6127779185771942, + "epoch": 3.112217194570136, + "grad_norm": 0.5199776291847229, + "learning_rate": 0.0002788093120402174, + "loss": 0.0696, + "mean_token_accuracy": 0.9842628389596939, + "num_tokens": 7595283.0, + "step": 862 + }, + { + "entropy": 1.5815349221229553, + "epoch": 3.1158371040723982, + "grad_norm": 0.3927786946296692, + "learning_rate": 0.0002783611931712666, + "loss": 0.0489, + "mean_token_accuracy": 0.9855190068483353, + "num_tokens": 7603800.0, + "step": 863 + }, + { + "entropy": 1.5128042995929718, + "epoch": 3.1194570135746607, + "grad_norm": 0.5245664715766907, + "learning_rate": 0.00027791308451474695, + "loss": 0.0916, + "mean_token_accuracy": 0.9793255031108856, + "num_tokens": 7612765.0, + "step": 864 + }, + { + "entropy": 1.4662578105926514, + "epoch": 3.123076923076923, + "grad_norm": 0.4836482107639313, + "learning_rate": 0.000277464987772632, + "loss": 0.0363, + "mean_token_accuracy": 0.9882297664880753, + "num_tokens": 7621842.0, + "step": 865 + }, + { + "entropy": 1.6075958013534546, + "epoch": 3.1266968325791855, + "grad_norm": 0.6621652841567993, + "learning_rate": 0.00027701690464685053, + "loss": 0.0703, + "mean_token_accuracy": 0.9801139384508133, + "num_tokens": 7630299.0, + "step": 866 + }, + { + "entropy": 1.5028826892375946, + "epoch": 3.130316742081448, + "grad_norm": 1.076515555381775, + "learning_rate": 0.00027656883683927917, + "loss": 0.1021, + "mean_token_accuracy": 0.9723865538835526, + "num_tokens": 7639269.0, + "step": 867 + }, + { + "entropy": 1.4604552686214447, + "epoch": 3.1339366515837104, + "grad_norm": 0.6197560429573059, + "learning_rate": 0.0002761207860517365, + "loss": 0.0831, + "mean_token_accuracy": 0.9773454070091248, + "num_tokens": 7648589.0, + "step": 868 + }, + { + "entropy": 1.5533301830291748, + "epoch": 3.137556561085973, + "grad_norm": 0.6384056806564331, + "learning_rate": 0.00027567275398597665, + "loss": 0.085, + "mean_token_accuracy": 0.9763429015874863, + "num_tokens": 7657465.0, + "step": 869 + }, + { + "entropy": 1.499713659286499, + "epoch": 3.1411764705882352, + "grad_norm": 0.5099884867668152, + "learning_rate": 0.0002752247423436825, + "loss": 0.0506, + "mean_token_accuracy": 0.9845949709415436, + "num_tokens": 7666239.0, + "step": 870 + }, + { + "entropy": 1.5065864324569702, + "epoch": 3.1447963800904977, + "grad_norm": 0.500906765460968, + "learning_rate": 0.00027477675282645917, + "loss": 0.0505, + "mean_token_accuracy": 0.9816035628318787, + "num_tokens": 7675002.0, + "step": 871 + }, + { + "entropy": 1.492633044719696, + "epoch": 3.14841628959276, + "grad_norm": 0.5848217606544495, + "learning_rate": 0.00027432878713582826, + "loss": 0.0714, + "mean_token_accuracy": 0.9832541942596436, + "num_tokens": 7683452.0, + "step": 872 + }, + { + "entropy": 1.5013020932674408, + "epoch": 3.1520361990950225, + "grad_norm": 0.7728188037872314, + "learning_rate": 0.0002738808469732202, + "loss": 0.1403, + "mean_token_accuracy": 0.9723454862833023, + "num_tokens": 7692088.0, + "step": 873 + }, + { + "entropy": 1.4020000398159027, + "epoch": 3.155656108597285, + "grad_norm": 0.7066675424575806, + "learning_rate": 0.00027343293403996906, + "loss": 0.0631, + "mean_token_accuracy": 0.9841864109039307, + "num_tokens": 7701066.0, + "step": 874 + }, + { + "entropy": 1.4469320476055145, + "epoch": 3.1592760180995474, + "grad_norm": 0.47683194279670715, + "learning_rate": 0.0002729850500373052, + "loss": 0.0787, + "mean_token_accuracy": 0.9787198454141617, + "num_tokens": 7710189.0, + "step": 875 + }, + { + "entropy": 1.4941265881061554, + "epoch": 3.16289592760181, + "grad_norm": 0.5534874796867371, + "learning_rate": 0.00027253719666634916, + "loss": 0.0681, + "mean_token_accuracy": 0.9741384238004684, + "num_tokens": 7718736.0, + "step": 876 + }, + { + "entropy": 1.48758664727211, + "epoch": 3.1665158371040723, + "grad_norm": 0.42443010210990906, + "learning_rate": 0.000272089375628105, + "loss": 0.0452, + "mean_token_accuracy": 0.986537754535675, + "num_tokens": 7727565.0, + "step": 877 + }, + { + "entropy": 1.4197124242782593, + "epoch": 3.1701357466063347, + "grad_norm": 0.4680332541465759, + "learning_rate": 0.00027164158862345416, + "loss": 0.0712, + "mean_token_accuracy": 0.979786142706871, + "num_tokens": 7736663.0, + "step": 878 + }, + { + "entropy": 1.4459567070007324, + "epoch": 3.173755656108597, + "grad_norm": 0.5269680619239807, + "learning_rate": 0.00027119383735314887, + "loss": 0.0527, + "mean_token_accuracy": 0.9839773774147034, + "num_tokens": 7745837.0, + "step": 879 + }, + { + "entropy": 1.4754200279712677, + "epoch": 3.1773755656108595, + "grad_norm": 0.39273717999458313, + "learning_rate": 0.00027074612351780524, + "loss": 0.0188, + "mean_token_accuracy": 0.9941024333238602, + "num_tokens": 7754747.0, + "step": 880 + }, + { + "entropy": 1.440185934305191, + "epoch": 3.180995475113122, + "grad_norm": 0.6401451826095581, + "learning_rate": 0.00027029844881789776, + "loss": 0.0825, + "mean_token_accuracy": 0.9758540540933609, + "num_tokens": 7763933.0, + "step": 881 + }, + { + "entropy": 1.4647364616394043, + "epoch": 3.184615384615385, + "grad_norm": 0.6890838146209717, + "learning_rate": 0.0002698508149537519, + "loss": 0.0609, + "mean_token_accuracy": 0.9836824238300323, + "num_tokens": 7772693.0, + "step": 882 + }, + { + "entropy": 1.510004311800003, + "epoch": 3.1882352941176473, + "grad_norm": 0.4847521185874939, + "learning_rate": 0.000269403223625538, + "loss": 0.0665, + "mean_token_accuracy": 0.9833553731441498, + "num_tokens": 7781591.0, + "step": 883 + }, + { + "entropy": 1.5321883261203766, + "epoch": 3.1918552036199097, + "grad_norm": 0.5583149790763855, + "learning_rate": 0.00026895567653326515, + "loss": 0.0481, + "mean_token_accuracy": 0.9884297996759415, + "num_tokens": 7789893.0, + "step": 884 + }, + { + "entropy": 1.5181719362735748, + "epoch": 3.195475113122172, + "grad_norm": 0.5727811455726624, + "learning_rate": 0.000268508175376774, + "loss": 0.051, + "mean_token_accuracy": 0.9885639101266861, + "num_tokens": 7798544.0, + "step": 885 + }, + { + "entropy": 1.6374200582504272, + "epoch": 3.1990950226244346, + "grad_norm": 0.5002682209014893, + "learning_rate": 0.0002680607218557314, + "loss": 0.0778, + "mean_token_accuracy": 0.9834531843662262, + "num_tokens": 7807030.0, + "step": 886 + }, + { + "entropy": 1.4485781788825989, + "epoch": 3.202714932126697, + "grad_norm": 0.5490010976791382, + "learning_rate": 0.0002676133176696224, + "loss": 0.0612, + "mean_token_accuracy": 0.9833452105522156, + "num_tokens": 7816008.0, + "step": 887 + }, + { + "entropy": 1.5048691630363464, + "epoch": 3.2063348416289594, + "grad_norm": 0.37134769558906555, + "learning_rate": 0.0002671659645177453, + "loss": 0.0411, + "mean_token_accuracy": 0.9869852215051651, + "num_tokens": 7825152.0, + "step": 888 + }, + { + "entropy": 1.522626668214798, + "epoch": 3.209954751131222, + "grad_norm": 0.3474898040294647, + "learning_rate": 0.00026671866409920444, + "loss": 0.0453, + "mean_token_accuracy": 0.9880259335041046, + "num_tokens": 7833517.0, + "step": 889 + }, + { + "entropy": 1.4796842634677887, + "epoch": 3.2135746606334843, + "grad_norm": 0.6107187271118164, + "learning_rate": 0.0002662714181129038, + "loss": 0.0587, + "mean_token_accuracy": 0.9835474342107773, + "num_tokens": 7842418.0, + "step": 890 + }, + { + "entropy": 1.527999073266983, + "epoch": 3.2171945701357467, + "grad_norm": 0.8143520355224609, + "learning_rate": 0.00026582422825754037, + "loss": 0.1435, + "mean_token_accuracy": 0.9624571949243546, + "num_tokens": 7851284.0, + "step": 891 + }, + { + "entropy": 1.488987773656845, + "epoch": 3.220814479638009, + "grad_norm": 0.4910070300102234, + "learning_rate": 0.0002653770962315986, + "loss": 0.0627, + "mean_token_accuracy": 0.9796515852212906, + "num_tokens": 7860011.0, + "step": 892 + }, + { + "entropy": 1.497105747461319, + "epoch": 3.2244343891402716, + "grad_norm": 0.6304562091827393, + "learning_rate": 0.00026493002373334274, + "loss": 0.0837, + "mean_token_accuracy": 0.975927010178566, + "num_tokens": 7868618.0, + "step": 893 + }, + { + "entropy": 1.4863994121551514, + "epoch": 3.228054298642534, + "grad_norm": 0.4768204092979431, + "learning_rate": 0.00026448301246081106, + "loss": 0.0449, + "mean_token_accuracy": 0.9877417385578156, + "num_tokens": 7877335.0, + "step": 894 + }, + { + "entropy": 1.4551187455654144, + "epoch": 3.2316742081447964, + "grad_norm": 0.5773951411247253, + "learning_rate": 0.0002640360641118095, + "loss": 0.0807, + "mean_token_accuracy": 0.974293515086174, + "num_tokens": 7886486.0, + "step": 895 + }, + { + "entropy": 1.4752719104290009, + "epoch": 3.235294117647059, + "grad_norm": 0.8372188806533813, + "learning_rate": 0.00026358918038390464, + "loss": 0.1428, + "mean_token_accuracy": 0.9693069010972977, + "num_tokens": 7895501.0, + "step": 896 + }, + { + "entropy": 1.462522953748703, + "epoch": 3.2389140271493213, + "grad_norm": 0.4307233393192291, + "learning_rate": 0.0002631423629744179, + "loss": 0.0574, + "mean_token_accuracy": 0.9867023974657059, + "num_tokens": 7904614.0, + "step": 897 + }, + { + "entropy": 1.5075399577617645, + "epoch": 3.2425339366515837, + "grad_norm": 0.6246724724769592, + "learning_rate": 0.00026269561358041886, + "loss": 0.074, + "mean_token_accuracy": 0.9773128777742386, + "num_tokens": 7913383.0, + "step": 898 + }, + { + "entropy": 1.407273530960083, + "epoch": 3.246153846153846, + "grad_norm": 0.31213951110839844, + "learning_rate": 0.0002622489338987186, + "loss": 0.0225, + "mean_token_accuracy": 0.994924858212471, + "num_tokens": 7922686.0, + "step": 899 + }, + { + "entropy": 1.3883163630962372, + "epoch": 3.2497737556561086, + "grad_norm": 0.476696252822876, + "learning_rate": 0.00026180232562586335, + "loss": 0.0727, + "mean_token_accuracy": 0.9775501936674118, + "num_tokens": 7931958.0, + "step": 900 + }, + { + "entropy": 1.4200710952281952, + "epoch": 3.253393665158371, + "grad_norm": 0.5860406756401062, + "learning_rate": 0.0002613557904581284, + "loss": 0.0658, + "mean_token_accuracy": 0.9809585213661194, + "num_tokens": 7940834.0, + "step": 901 + }, + { + "entropy": 1.4369202852249146, + "epoch": 3.2570135746606335, + "grad_norm": 0.47559866309165955, + "learning_rate": 0.0002609093300915112, + "loss": 0.0481, + "mean_token_accuracy": 0.9899342954158783, + "num_tokens": 7949907.0, + "step": 902 + }, + { + "entropy": 1.544773817062378, + "epoch": 3.260633484162896, + "grad_norm": 0.6772119402885437, + "learning_rate": 0.00026046294622172504, + "loss": 0.067, + "mean_token_accuracy": 0.9841168224811554, + "num_tokens": 7958556.0, + "step": 903 + }, + { + "entropy": 1.504111796617508, + "epoch": 3.2642533936651583, + "grad_norm": 0.4870680868625641, + "learning_rate": 0.0002600166405441928, + "loss": 0.0379, + "mean_token_accuracy": 0.990149587392807, + "num_tokens": 7967079.0, + "step": 904 + }, + { + "entropy": 1.349067509174347, + "epoch": 3.2678733031674208, + "grad_norm": 0.46113792061805725, + "learning_rate": 0.0002595704147540404, + "loss": 0.0521, + "mean_token_accuracy": 0.9874720871448517, + "num_tokens": 7976551.0, + "step": 905 + }, + { + "entropy": 1.462818831205368, + "epoch": 3.271493212669683, + "grad_norm": 0.7971535325050354, + "learning_rate": 0.0002591242705460901, + "loss": 0.041, + "mean_token_accuracy": 0.9884305745363235, + "num_tokens": 7985622.0, + "step": 906 + }, + { + "entropy": 1.3945342004299164, + "epoch": 3.2751131221719456, + "grad_norm": 0.7364558577537537, + "learning_rate": 0.00025867820961485453, + "loss": 0.0978, + "mean_token_accuracy": 0.9766480922698975, + "num_tokens": 7995012.0, + "step": 907 + }, + { + "entropy": 1.4662614464759827, + "epoch": 3.278733031674208, + "grad_norm": 0.4509989619255066, + "learning_rate": 0.0002582322336545299, + "loss": 0.0401, + "mean_token_accuracy": 0.9875599294900894, + "num_tokens": 8003688.0, + "step": 908 + }, + { + "entropy": 1.5244667828083038, + "epoch": 3.2823529411764705, + "grad_norm": 0.76254802942276, + "learning_rate": 0.00025778634435899, + "loss": 0.0706, + "mean_token_accuracy": 0.9814789742231369, + "num_tokens": 8011711.0, + "step": 909 + }, + { + "entropy": 1.498660922050476, + "epoch": 3.285972850678733, + "grad_norm": 0.5205233097076416, + "learning_rate": 0.0002573405434217788, + "loss": 0.0433, + "mean_token_accuracy": 0.9895520657300949, + "num_tokens": 8020268.0, + "step": 910 + }, + { + "entropy": 1.3673588633537292, + "epoch": 3.2895927601809953, + "grad_norm": 0.36727628111839294, + "learning_rate": 0.0002568948325361054, + "loss": 0.046, + "mean_token_accuracy": 0.9816896766424179, + "num_tokens": 8029676.0, + "step": 911 + }, + { + "entropy": 1.3924965262413025, + "epoch": 3.2932126696832578, + "grad_norm": 0.6359453797340393, + "learning_rate": 0.0002564492133948364, + "loss": 0.0677, + "mean_token_accuracy": 0.9825267344713211, + "num_tokens": 8038613.0, + "step": 912 + }, + { + "entropy": 1.426201194524765, + "epoch": 3.29683257918552, + "grad_norm": 0.5639982223510742, + "learning_rate": 0.0002560036876904902, + "loss": 0.0762, + "mean_token_accuracy": 0.9812760651111603, + "num_tokens": 8047516.0, + "step": 913 + }, + { + "entropy": 1.4323900640010834, + "epoch": 3.3004524886877826, + "grad_norm": 0.5035631060600281, + "learning_rate": 0.00025555825711522995, + "loss": 0.0479, + "mean_token_accuracy": 0.9820713251829147, + "num_tokens": 8056237.0, + "step": 914 + }, + { + "entropy": 1.433140367269516, + "epoch": 3.304072398190045, + "grad_norm": 0.5381770133972168, + "learning_rate": 0.00025511292336085804, + "loss": 0.0584, + "mean_token_accuracy": 0.9868257641792297, + "num_tokens": 8064918.0, + "step": 915 + }, + { + "entropy": 1.412838101387024, + "epoch": 3.3076923076923075, + "grad_norm": 0.46058139204978943, + "learning_rate": 0.00025466768811880866, + "loss": 0.0396, + "mean_token_accuracy": 0.9873918145895004, + "num_tokens": 8073881.0, + "step": 916 + }, + { + "entropy": 1.4484798610210419, + "epoch": 3.31131221719457, + "grad_norm": 0.8550136685371399, + "learning_rate": 0.000254222553080142, + "loss": 0.0744, + "mean_token_accuracy": 0.9780523777008057, + "num_tokens": 8082249.0, + "step": 917 + }, + { + "entropy": 1.4633181393146515, + "epoch": 3.3149321266968323, + "grad_norm": 0.8231784105300903, + "learning_rate": 0.00025377751993553777, + "loss": 0.0847, + "mean_token_accuracy": 0.9764655083417892, + "num_tokens": 8090772.0, + "step": 918 + }, + { + "entropy": 1.5348555445671082, + "epoch": 3.318552036199095, + "grad_norm": 0.6072585582733154, + "learning_rate": 0.00025333259037528847, + "loss": 0.0547, + "mean_token_accuracy": 0.983170285820961, + "num_tokens": 8098744.0, + "step": 919 + }, + { + "entropy": 1.4343461096286774, + "epoch": 3.3221719457013577, + "grad_norm": 0.5895786881446838, + "learning_rate": 0.0002528877660892933, + "loss": 0.033, + "mean_token_accuracy": 0.9907310158014297, + "num_tokens": 8107459.0, + "step": 920 + }, + { + "entropy": 1.3224802315235138, + "epoch": 3.32579185520362, + "grad_norm": 0.4657888114452362, + "learning_rate": 0.0002524430487670515, + "loss": 0.0581, + "mean_token_accuracy": 0.9821915626525879, + "num_tokens": 8116673.0, + "step": 921 + }, + { + "entropy": 1.4497299492359161, + "epoch": 3.3294117647058825, + "grad_norm": 0.5360382795333862, + "learning_rate": 0.0002519984400976564, + "loss": 0.0849, + "mean_token_accuracy": 0.9713759422302246, + "num_tokens": 8125774.0, + "step": 922 + }, + { + "entropy": 1.4038201570510864, + "epoch": 3.333031674208145, + "grad_norm": 0.5329150557518005, + "learning_rate": 0.00025155394176978814, + "loss": 0.0679, + "mean_token_accuracy": 0.9782100170850754, + "num_tokens": 8134777.0, + "step": 923 + }, + { + "entropy": 1.3989399075508118, + "epoch": 3.3366515837104074, + "grad_norm": 0.47847944498062134, + "learning_rate": 0.00025110955547170803, + "loss": 0.0579, + "mean_token_accuracy": 0.9826236069202423, + "num_tokens": 8143596.0, + "step": 924 + }, + { + "entropy": 1.4384986460208893, + "epoch": 3.34027149321267, + "grad_norm": 0.6291977763175964, + "learning_rate": 0.0002506652828912521, + "loss": 0.0826, + "mean_token_accuracy": 0.9759467244148254, + "num_tokens": 8152554.0, + "step": 925 + }, + { + "entropy": 1.3491226136684418, + "epoch": 3.3438914027149322, + "grad_norm": 0.4057374596595764, + "learning_rate": 0.00025022112571582383, + "loss": 0.0428, + "mean_token_accuracy": 0.9861899316310883, + "num_tokens": 8161845.0, + "step": 926 + }, + { + "entropy": 1.3831347525119781, + "epoch": 3.3475113122171947, + "grad_norm": 0.5007946491241455, + "learning_rate": 0.0002497770856323891, + "loss": 0.0417, + "mean_token_accuracy": 0.9847332686185837, + "num_tokens": 8170865.0, + "step": 927 + }, + { + "entropy": 1.4520001113414764, + "epoch": 3.351131221719457, + "grad_norm": 0.5229163765907288, + "learning_rate": 0.00024933316432746864, + "loss": 0.0515, + "mean_token_accuracy": 0.98235984146595, + "num_tokens": 8179738.0, + "step": 928 + }, + { + "entropy": 1.4497073292732239, + "epoch": 3.3547511312217195, + "grad_norm": 0.6086527705192566, + "learning_rate": 0.0002488893634871322, + "loss": 0.082, + "mean_token_accuracy": 0.9839034825563431, + "num_tokens": 8188402.0, + "step": 929 + }, + { + "entropy": 1.4439297020435333, + "epoch": 3.358371040723982, + "grad_norm": 0.6497851014137268, + "learning_rate": 0.00024844568479699187, + "loss": 0.0863, + "mean_token_accuracy": 0.9722652286291122, + "num_tokens": 8196956.0, + "step": 930 + }, + { + "entropy": 1.3755157589912415, + "epoch": 3.3619909502262444, + "grad_norm": 0.6988303661346436, + "learning_rate": 0.0002480021299421957, + "loss": 0.0999, + "mean_token_accuracy": 0.9738437533378601, + "num_tokens": 8205951.0, + "step": 931 + }, + { + "entropy": 1.3790476024150848, + "epoch": 3.365610859728507, + "grad_norm": 0.8188769221305847, + "learning_rate": 0.0002475587006074219, + "loss": 0.206, + "mean_token_accuracy": 0.9557942748069763, + "num_tokens": 8215256.0, + "step": 932 + }, + { + "entropy": 1.4337495565414429, + "epoch": 3.3692307692307693, + "grad_norm": 0.481511652469635, + "learning_rate": 0.00024711539847687135, + "loss": 0.0568, + "mean_token_accuracy": 0.982319638133049, + "num_tokens": 8224081.0, + "step": 933 + }, + { + "entropy": 1.4721867442131042, + "epoch": 3.3728506787330317, + "grad_norm": 0.595804750919342, + "learning_rate": 0.00024667222523426204, + "loss": 0.073, + "mean_token_accuracy": 0.979112833738327, + "num_tokens": 8232560.0, + "step": 934 + }, + { + "entropy": 1.4026366472244263, + "epoch": 3.376470588235294, + "grad_norm": 0.8112502098083496, + "learning_rate": 0.0002462291825628226, + "loss": 0.1302, + "mean_token_accuracy": 0.9592884331941605, + "num_tokens": 8241529.0, + "step": 935 + }, + { + "entropy": 1.4276806712150574, + "epoch": 3.3800904977375565, + "grad_norm": 0.3144559860229492, + "learning_rate": 0.0002457862721452854, + "loss": 0.0355, + "mean_token_accuracy": 0.9895562827587128, + "num_tokens": 8250331.0, + "step": 936 + }, + { + "entropy": 1.4367564022541046, + "epoch": 3.383710407239819, + "grad_norm": 0.6843166947364807, + "learning_rate": 0.0002453434956638806, + "loss": 0.0674, + "mean_token_accuracy": 0.9829154461622238, + "num_tokens": 8259137.0, + "step": 937 + }, + { + "entropy": 1.390118271112442, + "epoch": 3.3873303167420814, + "grad_norm": 0.437500536441803, + "learning_rate": 0.00024490085480032996, + "loss": 0.0323, + "mean_token_accuracy": 0.9916883558034897, + "num_tokens": 8268372.0, + "step": 938 + }, + { + "entropy": 1.3605903685092926, + "epoch": 3.390950226244344, + "grad_norm": 0.6721571087837219, + "learning_rate": 0.00024445835123583964, + "loss": 0.1217, + "mean_token_accuracy": 0.9565094709396362, + "num_tokens": 8277388.0, + "step": 939 + }, + { + "entropy": 1.3998730778694153, + "epoch": 3.3945701357466063, + "grad_norm": 0.38136187195777893, + "learning_rate": 0.00024401598665109463, + "loss": 0.0397, + "mean_token_accuracy": 0.9870003908872604, + "num_tokens": 8286150.0, + "step": 940 + }, + { + "entropy": 1.406863808631897, + "epoch": 3.3981900452488687, + "grad_norm": 0.5735233426094055, + "learning_rate": 0.00024357376272625205, + "loss": 0.0794, + "mean_token_accuracy": 0.9789908528327942, + "num_tokens": 8294719.0, + "step": 941 + }, + { + "entropy": 1.418317824602127, + "epoch": 3.401809954751131, + "grad_norm": 0.624377965927124, + "learning_rate": 0.00024313168114093475, + "loss": 0.0466, + "mean_token_accuracy": 0.9851591736078262, + "num_tokens": 8303298.0, + "step": 942 + }, + { + "entropy": 1.3575542867183685, + "epoch": 3.4054298642533936, + "grad_norm": 0.5194457173347473, + "learning_rate": 0.00024268974357422488, + "loss": 0.0743, + "mean_token_accuracy": 0.9743311256170273, + "num_tokens": 8312635.0, + "step": 943 + }, + { + "entropy": 1.392454832792282, + "epoch": 3.409049773755656, + "grad_norm": 0.5445207357406616, + "learning_rate": 0.00024224795170465756, + "loss": 0.0986, + "mean_token_accuracy": 0.9710196405649185, + "num_tokens": 8321364.0, + "step": 944 + }, + { + "entropy": 1.324178010225296, + "epoch": 3.4126696832579184, + "grad_norm": 0.4121778607368469, + "learning_rate": 0.0002418063072102148, + "loss": 0.0513, + "mean_token_accuracy": 0.9844248443841934, + "num_tokens": 8330452.0, + "step": 945 + }, + { + "entropy": 1.4191058278083801, + "epoch": 3.416289592760181, + "grad_norm": 0.48296698927879333, + "learning_rate": 0.00024136481176831854, + "loss": 0.0561, + "mean_token_accuracy": 0.9812565594911575, + "num_tokens": 8339243.0, + "step": 946 + }, + { + "entropy": 1.3743943274021149, + "epoch": 3.4199095022624433, + "grad_norm": 0.5322384834289551, + "learning_rate": 0.00024092346705582474, + "loss": 0.065, + "mean_token_accuracy": 0.9788537919521332, + "num_tokens": 8347866.0, + "step": 947 + }, + { + "entropy": 1.4042058885097504, + "epoch": 3.4235294117647057, + "grad_norm": 0.5542939901351929, + "learning_rate": 0.00024048227474901697, + "loss": 0.0835, + "mean_token_accuracy": 0.9758901000022888, + "num_tokens": 8356604.0, + "step": 948 + }, + { + "entropy": 1.4089910387992859, + "epoch": 3.427149321266968, + "grad_norm": 0.6025400757789612, + "learning_rate": 0.00024004123652359973, + "loss": 0.0736, + "mean_token_accuracy": 0.9723546206951141, + "num_tokens": 8365168.0, + "step": 949 + }, + { + "entropy": 1.3679145872592926, + "epoch": 3.430769230769231, + "grad_norm": 0.6585437059402466, + "learning_rate": 0.00023960035405469235, + "loss": 0.1387, + "mean_token_accuracy": 0.9651710242033005, + "num_tokens": 8374034.0, + "step": 950 + }, + { + "entropy": 1.3707129955291748, + "epoch": 3.4343891402714934, + "grad_norm": 0.639600932598114, + "learning_rate": 0.0002391596290168228, + "loss": 0.0491, + "mean_token_accuracy": 0.9869592487812042, + "num_tokens": 8383019.0, + "step": 951 + }, + { + "entropy": 1.3920492231845856, + "epoch": 3.438009049773756, + "grad_norm": 0.4947279393672943, + "learning_rate": 0.00023871906308392088, + "loss": 0.0647, + "mean_token_accuracy": 0.98161181807518, + "num_tokens": 8392191.0, + "step": 952 + }, + { + "entropy": 1.4259005188941956, + "epoch": 3.4416289592760183, + "grad_norm": 0.5486235618591309, + "learning_rate": 0.00023827865792931205, + "loss": 0.0581, + "mean_token_accuracy": 0.9796920716762543, + "num_tokens": 8400966.0, + "step": 953 + }, + { + "entropy": 1.4309614896774292, + "epoch": 3.4452488687782807, + "grad_norm": 0.6024688482284546, + "learning_rate": 0.00023783841522571138, + "loss": 0.1217, + "mean_token_accuracy": 0.9621599614620209, + "num_tokens": 8409878.0, + "step": 954 + }, + { + "entropy": 1.427911102771759, + "epoch": 3.448868778280543, + "grad_norm": 0.4339677691459656, + "learning_rate": 0.00023739833664521671, + "loss": 0.0521, + "mean_token_accuracy": 0.9818601310253143, + "num_tokens": 8418609.0, + "step": 955 + }, + { + "entropy": 1.4235480725765228, + "epoch": 3.4524886877828056, + "grad_norm": 0.5715889930725098, + "learning_rate": 0.00023695842385930242, + "loss": 0.0657, + "mean_token_accuracy": 0.9833882004022598, + "num_tokens": 8427265.0, + "step": 956 + }, + { + "entropy": 1.335403710603714, + "epoch": 3.456108597285068, + "grad_norm": 0.34678834676742554, + "learning_rate": 0.00023651867853881356, + "loss": 0.0507, + "mean_token_accuracy": 0.9843446165323257, + "num_tokens": 8436591.0, + "step": 957 + }, + { + "entropy": 1.3923978507518768, + "epoch": 3.4597285067873305, + "grad_norm": 0.8088510632514954, + "learning_rate": 0.00023607910235395882, + "loss": 0.1065, + "mean_token_accuracy": 0.9738757163286209, + "num_tokens": 8445472.0, + "step": 958 + }, + { + "entropy": 1.513672262430191, + "epoch": 3.463348416289593, + "grad_norm": 0.6919769048690796, + "learning_rate": 0.0002356396969743044, + "loss": 0.0846, + "mean_token_accuracy": 0.9809803068637848, + "num_tokens": 8453788.0, + "step": 959 + }, + { + "entropy": 1.3483870327472687, + "epoch": 3.4669683257918553, + "grad_norm": 0.5901163220405579, + "learning_rate": 0.00023520046406876822, + "loss": 0.1035, + "mean_token_accuracy": 0.9659459739923477, + "num_tokens": 8463134.0, + "step": 960 + }, + { + "entropy": 1.4076683819293976, + "epoch": 3.4705882352941178, + "grad_norm": 0.5772054195404053, + "learning_rate": 0.00023476140530561253, + "loss": 0.058, + "mean_token_accuracy": 0.9804215878248215, + "num_tokens": 8471959.0, + "step": 961 + }, + { + "entropy": 1.3382205069065094, + "epoch": 3.47420814479638, + "grad_norm": 0.4780332148075104, + "learning_rate": 0.00023432252235243883, + "loss": 0.074, + "mean_token_accuracy": 0.9757792204618454, + "num_tokens": 8480866.0, + "step": 962 + }, + { + "entropy": 1.432678759098053, + "epoch": 3.4778280542986426, + "grad_norm": 0.5997639298439026, + "learning_rate": 0.00023388381687618022, + "loss": 0.0641, + "mean_token_accuracy": 0.9824596792459488, + "num_tokens": 8489355.0, + "step": 963 + }, + { + "entropy": 1.3388859629631042, + "epoch": 3.481447963800905, + "grad_norm": 0.3654438257217407, + "learning_rate": 0.0002334452905430961, + "loss": 0.0553, + "mean_token_accuracy": 0.9859583377838135, + "num_tokens": 8498607.0, + "step": 964 + }, + { + "entropy": 1.4407559633255005, + "epoch": 3.4850678733031675, + "grad_norm": 0.6571084856987, + "learning_rate": 0.00023300694501876535, + "loss": 0.0887, + "mean_token_accuracy": 0.9736911207437515, + "num_tokens": 8506915.0, + "step": 965 + }, + { + "entropy": 1.4553894400596619, + "epoch": 3.48868778280543, + "grad_norm": 0.459780752658844, + "learning_rate": 0.00023256878196808019, + "loss": 0.0578, + "mean_token_accuracy": 0.98088139295578, + "num_tokens": 8515157.0, + "step": 966 + }, + { + "entropy": 1.4285954535007477, + "epoch": 3.4923076923076923, + "grad_norm": 0.4488624930381775, + "learning_rate": 0.0002321308030552396, + "loss": 0.0466, + "mean_token_accuracy": 0.9883453100919724, + "num_tokens": 8523741.0, + "step": 967 + }, + { + "entropy": 1.3596898019313812, + "epoch": 3.4959276018099548, + "grad_norm": 0.5626068711280823, + "learning_rate": 0.00023169300994374352, + "loss": 0.0663, + "mean_token_accuracy": 0.979169949889183, + "num_tokens": 8532431.0, + "step": 968 + }, + { + "entropy": 1.4428678750991821, + "epoch": 3.499547511312217, + "grad_norm": 0.546142578125, + "learning_rate": 0.0002312554042963858, + "loss": 0.0777, + "mean_token_accuracy": 0.9774435311555862, + "num_tokens": 8540889.0, + "step": 969 + }, + { + "entropy": 1.3509635627269745, + "epoch": 3.5031674208144796, + "grad_norm": 0.6781264543533325, + "learning_rate": 0.00023081798777524847, + "loss": 0.0941, + "mean_token_accuracy": 0.9698395729064941, + "num_tokens": 8550128.0, + "step": 970 + }, + { + "entropy": 1.2727701663970947, + "epoch": 3.506787330316742, + "grad_norm": 0.477498322725296, + "learning_rate": 0.00023038076204169534, + "loss": 0.0447, + "mean_token_accuracy": 0.98555026948452, + "num_tokens": 8559305.0, + "step": 971 + }, + { + "entropy": 1.3704063892364502, + "epoch": 3.5104072398190045, + "grad_norm": 0.5665515661239624, + "learning_rate": 0.00022994372875636534, + "loss": 0.0727, + "mean_token_accuracy": 0.9838419556617737, + "num_tokens": 8568175.0, + "step": 972 + }, + { + "entropy": 1.3341827094554901, + "epoch": 3.514027149321267, + "grad_norm": 0.7451890110969543, + "learning_rate": 0.00022950688957916666, + "loss": 0.0892, + "mean_token_accuracy": 0.9721158593893051, + "num_tokens": 8576916.0, + "step": 973 + }, + { + "entropy": 1.3170603513717651, + "epoch": 3.5176470588235293, + "grad_norm": 0.6274797916412354, + "learning_rate": 0.00022907024616927016, + "loss": 0.0867, + "mean_token_accuracy": 0.9760665446519852, + "num_tokens": 8585937.0, + "step": 974 + }, + { + "entropy": 1.324271559715271, + "epoch": 3.521266968325792, + "grad_norm": 0.49691009521484375, + "learning_rate": 0.00022863380018510321, + "loss": 0.0617, + "mean_token_accuracy": 0.9794053137302399, + "num_tokens": 8594885.0, + "step": 975 + }, + { + "entropy": 1.3782964646816254, + "epoch": 3.524886877828054, + "grad_norm": 0.5726630687713623, + "learning_rate": 0.00022819755328434306, + "loss": 0.0789, + "mean_token_accuracy": 0.9756396412849426, + "num_tokens": 8603243.0, + "step": 976 + }, + { + "entropy": 1.3255096673965454, + "epoch": 3.5285067873303166, + "grad_norm": 0.5568417906761169, + "learning_rate": 0.00022776150712391127, + "loss": 0.0734, + "mean_token_accuracy": 0.974893182516098, + "num_tokens": 8612414.0, + "step": 977 + }, + { + "entropy": 1.4089244902133942, + "epoch": 3.532126696832579, + "grad_norm": 0.5721971392631531, + "learning_rate": 0.00022732566335996674, + "loss": 0.0719, + "mean_token_accuracy": 0.976212814450264, + "num_tokens": 8620851.0, + "step": 978 + }, + { + "entropy": 1.278488278388977, + "epoch": 3.5357466063348415, + "grad_norm": 0.4737057387828827, + "learning_rate": 0.00022689002364789938, + "loss": 0.0329, + "mean_token_accuracy": 0.9908775240182877, + "num_tokens": 8630000.0, + "step": 979 + }, + { + "entropy": 1.3656818866729736, + "epoch": 3.539366515837104, + "grad_norm": 0.6749998927116394, + "learning_rate": 0.00022645458964232456, + "loss": 0.0875, + "mean_token_accuracy": 0.978403314948082, + "num_tokens": 8638635.0, + "step": 980 + }, + { + "entropy": 1.3721227645874023, + "epoch": 3.5429864253393664, + "grad_norm": 0.5295807719230652, + "learning_rate": 0.00022601936299707616, + "loss": 0.0694, + "mean_token_accuracy": 0.9826173633337021, + "num_tokens": 8647726.0, + "step": 981 + }, + { + "entropy": 1.3711304068565369, + "epoch": 3.546606334841629, + "grad_norm": 0.5617223381996155, + "learning_rate": 0.0002255843453652002, + "loss": 0.0745, + "mean_token_accuracy": 0.9778908938169479, + "num_tokens": 8656421.0, + "step": 982 + }, + { + "entropy": 1.3603121936321259, + "epoch": 3.5502262443438912, + "grad_norm": 0.5830493569374084, + "learning_rate": 0.00022514953839894932, + "loss": 0.0498, + "mean_token_accuracy": 0.9847024828195572, + "num_tokens": 8665206.0, + "step": 983 + }, + { + "entropy": 1.3419533371925354, + "epoch": 3.5538461538461537, + "grad_norm": 0.5035730004310608, + "learning_rate": 0.00022471494374977556, + "loss": 0.0873, + "mean_token_accuracy": 0.9755606353282928, + "num_tokens": 8674482.0, + "step": 984 + }, + { + "entropy": 1.4004729092121124, + "epoch": 3.557466063348416, + "grad_norm": 0.4822017252445221, + "learning_rate": 0.0002242805630683251, + "loss": 0.0574, + "mean_token_accuracy": 0.9790775179862976, + "num_tokens": 8683066.0, + "step": 985 + }, + { + "entropy": 1.32045316696167, + "epoch": 3.5610859728506785, + "grad_norm": 0.3949761688709259, + "learning_rate": 0.00022384639800443088, + "loss": 0.0396, + "mean_token_accuracy": 0.9879113733768463, + "num_tokens": 8691966.0, + "step": 986 + }, + { + "entropy": 1.3542158901691437, + "epoch": 3.564705882352941, + "grad_norm": 0.6060124635696411, + "learning_rate": 0.0002234124502071072, + "loss": 0.0827, + "mean_token_accuracy": 0.9750298708677292, + "num_tokens": 8700859.0, + "step": 987 + }, + { + "entropy": 1.2594963014125824, + "epoch": 3.5683257918552034, + "grad_norm": 0.5580794215202332, + "learning_rate": 0.00022297872132454318, + "loss": 0.0691, + "mean_token_accuracy": 0.9793778210878372, + "num_tokens": 8710316.0, + "step": 988 + }, + { + "entropy": 1.3390154540538788, + "epoch": 3.571945701357466, + "grad_norm": 0.38052669167518616, + "learning_rate": 0.00022254521300409626, + "loss": 0.0436, + "mean_token_accuracy": 0.9838068634271622, + "num_tokens": 8719219.0, + "step": 989 + }, + { + "entropy": 1.4103966355323792, + "epoch": 3.5755656108597282, + "grad_norm": 0.6793152093887329, + "learning_rate": 0.00022211192689228633, + "loss": 0.0738, + "mean_token_accuracy": 0.9803069233894348, + "num_tokens": 8727658.0, + "step": 990 + }, + { + "entropy": 1.3170084357261658, + "epoch": 3.579185520361991, + "grad_norm": 0.5633034110069275, + "learning_rate": 0.00022167886463478933, + "loss": 0.0483, + "mean_token_accuracy": 0.9852565824985504, + "num_tokens": 8736502.0, + "step": 991 + }, + { + "entropy": 1.3381566107273102, + "epoch": 3.5828054298642535, + "grad_norm": 0.46346399188041687, + "learning_rate": 0.00022124602787643088, + "loss": 0.0324, + "mean_token_accuracy": 0.9907149076461792, + "num_tokens": 8745057.0, + "step": 992 + }, + { + "entropy": 1.2907682061195374, + "epoch": 3.586425339366516, + "grad_norm": 0.5343260169029236, + "learning_rate": 0.00022081341826118013, + "loss": 0.0577, + "mean_token_accuracy": 0.982825830578804, + "num_tokens": 8754046.0, + "step": 993 + }, + { + "entropy": 1.3302249014377594, + "epoch": 3.5900452488687784, + "grad_norm": 0.7435888051986694, + "learning_rate": 0.00022038103743214345, + "loss": 0.0644, + "mean_token_accuracy": 0.9805946946144104, + "num_tokens": 8762749.0, + "step": 994 + }, + { + "entropy": 1.3043336868286133, + "epoch": 3.593665158371041, + "grad_norm": 0.5991454124450684, + "learning_rate": 0.00021994888703155853, + "loss": 0.1013, + "mean_token_accuracy": 0.9729356169700623, + "num_tokens": 8771617.0, + "step": 995 + }, + { + "entropy": 1.2590918838977814, + "epoch": 3.5972850678733033, + "grad_norm": 0.6732775568962097, + "learning_rate": 0.00021951696870078748, + "loss": 0.2119, + "mean_token_accuracy": 0.9542236030101776, + "num_tokens": 8781055.0, + "step": 996 + }, + { + "entropy": 1.3064957559108734, + "epoch": 3.6009049773755657, + "grad_norm": 0.6471344828605652, + "learning_rate": 0.00021908528408031124, + "loss": 0.0775, + "mean_token_accuracy": 0.9764008969068527, + "num_tokens": 8789616.0, + "step": 997 + }, + { + "entropy": 1.2735644578933716, + "epoch": 3.604524886877828, + "grad_norm": 0.4242008626461029, + "learning_rate": 0.00021865383480972308, + "loss": 0.0517, + "mean_token_accuracy": 0.9826734960079193, + "num_tokens": 8798420.0, + "step": 998 + }, + { + "entropy": 1.2687926590442657, + "epoch": 3.6081447963800906, + "grad_norm": 0.6521032452583313, + "learning_rate": 0.00021822262252772212, + "loss": 0.0831, + "mean_token_accuracy": 0.9806712120771408, + "num_tokens": 8807486.0, + "step": 999 + }, + { + "entropy": 1.2503767311573029, + "epoch": 3.611764705882353, + "grad_norm": 0.44378921389579773, + "learning_rate": 0.00021779164887210774, + "loss": 0.0709, + "mean_token_accuracy": 0.9845052361488342, + "num_tokens": 8816795.0, + "step": 1000 + }, + { + "entropy": 1.2962157726287842, + "epoch": 3.6153846153846154, + "grad_norm": 0.47278398275375366, + "learning_rate": 0.0002173609154797728, + "loss": 0.0321, + "mean_token_accuracy": 0.986628457903862, + "num_tokens": 8825449.0, + "step": 1001 + }, + { + "entropy": 1.335391879081726, + "epoch": 3.619004524886878, + "grad_norm": 0.3435405492782593, + "learning_rate": 0.00021693042398669747, + "loss": 0.0361, + "mean_token_accuracy": 0.9887901991605759, + "num_tokens": 8834296.0, + "step": 1002 + }, + { + "entropy": 1.295527994632721, + "epoch": 3.6226244343891403, + "grad_norm": 0.4150637686252594, + "learning_rate": 0.0002165001760279435, + "loss": 0.0419, + "mean_token_accuracy": 0.9862009286880493, + "num_tokens": 8843354.0, + "step": 1003 + }, + { + "entropy": 1.270320326089859, + "epoch": 3.6262443438914027, + "grad_norm": 0.4439278542995453, + "learning_rate": 0.0002160701732376474, + "loss": 0.0676, + "mean_token_accuracy": 0.9789925366640091, + "num_tokens": 8852311.0, + "step": 1004 + }, + { + "entropy": 1.2495850026607513, + "epoch": 3.629864253393665, + "grad_norm": 0.4471176266670227, + "learning_rate": 0.00021564041724901446, + "loss": 0.0469, + "mean_token_accuracy": 0.98641636967659, + "num_tokens": 8861126.0, + "step": 1005 + }, + { + "entropy": 1.3307124376296997, + "epoch": 3.6334841628959276, + "grad_norm": 0.547099769115448, + "learning_rate": 0.0002152109096943128, + "loss": 0.0861, + "mean_token_accuracy": 0.9793668240308762, + "num_tokens": 8870129.0, + "step": 1006 + }, + { + "entropy": 1.3895522952079773, + "epoch": 3.63710407239819, + "grad_norm": 0.5946421027183533, + "learning_rate": 0.00021478165220486674, + "loss": 0.0704, + "mean_token_accuracy": 0.9831217378377914, + "num_tokens": 8878385.0, + "step": 1007 + }, + { + "entropy": 1.3864755928516388, + "epoch": 3.6407239819004524, + "grad_norm": 0.42203575372695923, + "learning_rate": 0.00021435264641105116, + "loss": 0.0557, + "mean_token_accuracy": 0.9843680560588837, + "num_tokens": 8887161.0, + "step": 1008 + }, + { + "entropy": 1.3556683957576752, + "epoch": 3.644343891402715, + "grad_norm": 0.5707162618637085, + "learning_rate": 0.00021392389394228454, + "loss": 0.0523, + "mean_token_accuracy": 0.9845058023929596, + "num_tokens": 8896049.0, + "step": 1009 + }, + { + "entropy": 1.2712234854698181, + "epoch": 3.6479638009049773, + "grad_norm": 0.6082377433776855, + "learning_rate": 0.00021349539642702347, + "loss": 0.1082, + "mean_token_accuracy": 0.9710930436849594, + "num_tokens": 8905546.0, + "step": 1010 + }, + { + "entropy": 1.3434297740459442, + "epoch": 3.6515837104072397, + "grad_norm": 0.7305653095245361, + "learning_rate": 0.0002130671554927561, + "loss": 0.088, + "mean_token_accuracy": 0.9745359718799591, + "num_tokens": 8914502.0, + "step": 1011 + }, + { + "entropy": 1.3378058075904846, + "epoch": 3.655203619909502, + "grad_norm": 0.4537632167339325, + "learning_rate": 0.00021263917276599607, + "loss": 0.047, + "mean_token_accuracy": 0.9869710952043533, + "num_tokens": 8923463.0, + "step": 1012 + }, + { + "entropy": 1.3655290305614471, + "epoch": 3.6588235294117646, + "grad_norm": 0.5036798119544983, + "learning_rate": 0.0002122114498722763, + "loss": 0.0655, + "mean_token_accuracy": 0.982716903090477, + "num_tokens": 8932384.0, + "step": 1013 + }, + { + "entropy": 1.3198035657405853, + "epoch": 3.662443438914027, + "grad_norm": 0.3511429727077484, + "learning_rate": 0.000211783988436143, + "loss": 0.0382, + "mean_token_accuracy": 0.9901436120271683, + "num_tokens": 8941300.0, + "step": 1014 + }, + { + "entropy": 1.3130914568901062, + "epoch": 3.6660633484162894, + "grad_norm": 0.4056939482688904, + "learning_rate": 0.00021135679008114894, + "loss": 0.0639, + "mean_token_accuracy": 0.9808386266231537, + "num_tokens": 8950534.0, + "step": 1015 + }, + { + "entropy": 1.3055587410926819, + "epoch": 3.669683257918552, + "grad_norm": 0.33344724774360657, + "learning_rate": 0.00021092985642984802, + "loss": 0.0449, + "mean_token_accuracy": 0.9886894524097443, + "num_tokens": 8960263.0, + "step": 1016 + }, + { + "entropy": 1.3109475672245026, + "epoch": 3.6733031674208148, + "grad_norm": 0.490029901266098, + "learning_rate": 0.00021050318910378874, + "loss": 0.0876, + "mean_token_accuracy": 0.9755903035402298, + "num_tokens": 8969611.0, + "step": 1017 + }, + { + "entropy": 1.3801122307777405, + "epoch": 3.676923076923077, + "grad_norm": 0.3520437479019165, + "learning_rate": 0.00021007678972350798, + "loss": 0.0482, + "mean_token_accuracy": 0.9860682934522629, + "num_tokens": 8978283.0, + "step": 1018 + }, + { + "entropy": 1.309948354959488, + "epoch": 3.6805429864253396, + "grad_norm": 0.485009104013443, + "learning_rate": 0.00020965065990852474, + "loss": 0.0824, + "mean_token_accuracy": 0.9751296043395996, + "num_tokens": 8987535.0, + "step": 1019 + }, + { + "entropy": 1.3771768808364868, + "epoch": 3.684162895927602, + "grad_norm": 0.5419639945030212, + "learning_rate": 0.00020922480127733448, + "loss": 0.0649, + "mean_token_accuracy": 0.9826148748397827, + "num_tokens": 8996533.0, + "step": 1020 + }, + { + "entropy": 1.337918907403946, + "epoch": 3.6877828054298645, + "grad_norm": 0.36202654242515564, + "learning_rate": 0.00020879921544740264, + "loss": 0.0311, + "mean_token_accuracy": 0.9919043332338333, + "num_tokens": 9005497.0, + "step": 1021 + }, + { + "entropy": 1.439345896244049, + "epoch": 3.691402714932127, + "grad_norm": 0.6851293444633484, + "learning_rate": 0.0002083739040351584, + "loss": 0.096, + "mean_token_accuracy": 0.9736751765012741, + "num_tokens": 9014037.0, + "step": 1022 + }, + { + "entropy": 1.44906947016716, + "epoch": 3.6950226244343893, + "grad_norm": 0.4260176122188568, + "learning_rate": 0.00020794886865598848, + "loss": 0.0523, + "mean_token_accuracy": 0.9793268889188766, + "num_tokens": 9022452.0, + "step": 1023 + }, + { + "entropy": 1.449628233909607, + "epoch": 3.6986425339366518, + "grad_norm": 0.6072604656219482, + "learning_rate": 0.00020752411092423177, + "loss": 0.0727, + "mean_token_accuracy": 0.9774363785982132, + "num_tokens": 9030847.0, + "step": 1024 + }, + { + "entropy": 1.3873493075370789, + "epoch": 3.702262443438914, + "grad_norm": 0.44552555680274963, + "learning_rate": 0.00020709963245317209, + "loss": 0.0639, + "mean_token_accuracy": 0.9800115376710892, + "num_tokens": 9039891.0, + "step": 1025 + }, + { + "entropy": 1.4281193912029266, + "epoch": 3.7058823529411766, + "grad_norm": 0.5228530764579773, + "learning_rate": 0.0002066754348550327, + "loss": 0.0738, + "mean_token_accuracy": 0.9765488505363464, + "num_tokens": 9048686.0, + "step": 1026 + }, + { + "entropy": 1.3790385127067566, + "epoch": 3.709502262443439, + "grad_norm": 0.4316764175891876, + "learning_rate": 0.00020625151974097022, + "loss": 0.0641, + "mean_token_accuracy": 0.97920823097229, + "num_tokens": 9057678.0, + "step": 1027 + }, + { + "entropy": 1.4287641942501068, + "epoch": 3.7131221719457015, + "grad_norm": 0.4056229591369629, + "learning_rate": 0.00020582788872106842, + "loss": 0.036, + "mean_token_accuracy": 0.9899342656135559, + "num_tokens": 9066521.0, + "step": 1028 + }, + { + "entropy": 1.454606294631958, + "epoch": 3.716742081447964, + "grad_norm": 0.7525569200515747, + "learning_rate": 0.0002054045434043316, + "loss": 0.1423, + "mean_token_accuracy": 0.9605622440576553, + "num_tokens": 9075595.0, + "step": 1029 + }, + { + "entropy": 1.3942890167236328, + "epoch": 3.7203619909502263, + "grad_norm": 0.4933125078678131, + "learning_rate": 0.00020498148539867944, + "loss": 0.0773, + "mean_token_accuracy": 0.970758929848671, + "num_tokens": 9084661.0, + "step": 1030 + }, + { + "entropy": 1.384048968553543, + "epoch": 3.723981900452489, + "grad_norm": 0.43627068400382996, + "learning_rate": 0.00020455871631094017, + "loss": 0.0678, + "mean_token_accuracy": 0.983132854104042, + "num_tokens": 9094062.0, + "step": 1031 + }, + { + "entropy": 1.4216719567775726, + "epoch": 3.727601809954751, + "grad_norm": 0.6412005424499512, + "learning_rate": 0.0002041362377468445, + "loss": 0.1097, + "mean_token_accuracy": 0.9793971478939056, + "num_tokens": 9103015.0, + "step": 1032 + }, + { + "entropy": 1.4771287441253662, + "epoch": 3.7312217194570136, + "grad_norm": 0.5385004281997681, + "learning_rate": 0.00020371405131102002, + "loss": 0.0553, + "mean_token_accuracy": 0.9826144278049469, + "num_tokens": 9111433.0, + "step": 1033 + }, + { + "entropy": 1.4442466795444489, + "epoch": 3.734841628959276, + "grad_norm": 0.5972802042961121, + "learning_rate": 0.00020329215860698458, + "loss": 0.0584, + "mean_token_accuracy": 0.984418511390686, + "num_tokens": 9120012.0, + "step": 1034 + }, + { + "entropy": 1.4893062710762024, + "epoch": 3.7384615384615385, + "grad_norm": 0.7473769783973694, + "learning_rate": 0.00020287056123714035, + "loss": 0.1091, + "mean_token_accuracy": 0.9683271646499634, + "num_tokens": 9128636.0, + "step": 1035 + }, + { + "entropy": 1.3519982993602753, + "epoch": 3.742081447963801, + "grad_norm": 0.47699517011642456, + "learning_rate": 0.00020244926080276794, + "loss": 0.0525, + "mean_token_accuracy": 0.9904675185680389, + "num_tokens": 9137968.0, + "step": 1036 + }, + { + "entropy": 1.4561591148376465, + "epoch": 3.7457013574660634, + "grad_norm": 0.42199093103408813, + "learning_rate": 0.00020202825890402003, + "loss": 0.0451, + "mean_token_accuracy": 0.9883602410554886, + "num_tokens": 9146589.0, + "step": 1037 + }, + { + "entropy": 1.408205658197403, + "epoch": 3.749321266968326, + "grad_norm": 0.4122658371925354, + "learning_rate": 0.0002016075571399157, + "loss": 0.0489, + "mean_token_accuracy": 0.9833643138408661, + "num_tokens": 9155443.0, + "step": 1038 + }, + { + "entropy": 1.3683985471725464, + "epoch": 3.7529411764705882, + "grad_norm": 0.43202081322669983, + "learning_rate": 0.0002011871571083336, + "loss": 0.0424, + "mean_token_accuracy": 0.9905680269002914, + "num_tokens": 9164792.0, + "step": 1039 + }, + { + "entropy": 1.3635350167751312, + "epoch": 3.7565610859728507, + "grad_norm": 0.606826663017273, + "learning_rate": 0.00020076706040600672, + "loss": 0.0883, + "mean_token_accuracy": 0.9747144728899002, + "num_tokens": 9174060.0, + "step": 1040 + }, + { + "entropy": 1.3948090970516205, + "epoch": 3.760180995475113, + "grad_norm": 0.561805009841919, + "learning_rate": 0.00020034726862851594, + "loss": 0.1131, + "mean_token_accuracy": 0.9722562730312347, + "num_tokens": 9183245.0, + "step": 1041 + }, + { + "entropy": 1.4282923936843872, + "epoch": 3.7638009049773755, + "grad_norm": 0.5546544790267944, + "learning_rate": 0.00019992778337028384, + "loss": 0.0762, + "mean_token_accuracy": 0.9801356643438339, + "num_tokens": 9191879.0, + "step": 1042 + }, + { + "entropy": 1.3675439953804016, + "epoch": 3.767420814479638, + "grad_norm": 0.5167890787124634, + "learning_rate": 0.0001995086062245689, + "loss": 0.0804, + "mean_token_accuracy": 0.98072350025177, + "num_tokens": 9201014.0, + "step": 1043 + }, + { + "entropy": 1.424451231956482, + "epoch": 3.7710407239819004, + "grad_norm": 0.44696182012557983, + "learning_rate": 0.00019908973878345943, + "loss": 0.0583, + "mean_token_accuracy": 0.9818143099546432, + "num_tokens": 9209954.0, + "step": 1044 + }, + { + "entropy": 1.3515954911708832, + "epoch": 3.774660633484163, + "grad_norm": 0.619891345500946, + "learning_rate": 0.0001986711826378673, + "loss": 0.0949, + "mean_token_accuracy": 0.9688181430101395, + "num_tokens": 9219157.0, + "step": 1045 + }, + { + "entropy": 1.3176933526992798, + "epoch": 3.7782805429864252, + "grad_norm": 0.43150845170021057, + "learning_rate": 0.00019825293937752203, + "loss": 0.0459, + "mean_token_accuracy": 0.9851347357034683, + "num_tokens": 9228415.0, + "step": 1046 + }, + { + "entropy": 1.3939999043941498, + "epoch": 3.7819004524886877, + "grad_norm": 0.6242758631706238, + "learning_rate": 0.00019783501059096495, + "loss": 0.0703, + "mean_token_accuracy": 0.9822264909744263, + "num_tokens": 9237479.0, + "step": 1047 + }, + { + "entropy": 1.430876463651657, + "epoch": 3.78552036199095, + "grad_norm": 0.6107195019721985, + "learning_rate": 0.00019741739786554273, + "loss": 0.0758, + "mean_token_accuracy": 0.9829006642103195, + "num_tokens": 9245975.0, + "step": 1048 + }, + { + "entropy": 1.4005264639854431, + "epoch": 3.7891402714932125, + "grad_norm": 0.5321121215820312, + "learning_rate": 0.00019700010278740174, + "loss": 0.0636, + "mean_token_accuracy": 0.9827183485031128, + "num_tokens": 9254487.0, + "step": 1049 + }, + { + "entropy": 1.4024662375450134, + "epoch": 3.792760180995475, + "grad_norm": 0.5756775140762329, + "learning_rate": 0.00019658312694148191, + "loss": 0.0702, + "mean_token_accuracy": 0.9786443412303925, + "num_tokens": 9263345.0, + "step": 1050 + }, + { + "entropy": 1.3956109881401062, + "epoch": 3.7963800904977374, + "grad_norm": 0.5821980834007263, + "learning_rate": 0.00019616647191151077, + "loss": 0.0715, + "mean_token_accuracy": 0.97563835978508, + "num_tokens": 9271916.0, + "step": 1051 + }, + { + "entropy": 1.4688811898231506, + "epoch": 3.8, + "grad_norm": 0.3763403594493866, + "learning_rate": 0.00019575013927999692, + "loss": 0.0399, + "mean_token_accuracy": 0.9858106821775436, + "num_tokens": 9280577.0, + "step": 1052 + }, + { + "entropy": 1.482151448726654, + "epoch": 3.8036199095022623, + "grad_norm": 0.4746648371219635, + "learning_rate": 0.00019533413062822495, + "loss": 0.0338, + "mean_token_accuracy": 0.9888868033885956, + "num_tokens": 9289036.0, + "step": 1053 + }, + { + "entropy": 1.39437335729599, + "epoch": 3.8072398190045247, + "grad_norm": 0.34683090448379517, + "learning_rate": 0.00019491844753624884, + "loss": 0.0411, + "mean_token_accuracy": 0.98799167573452, + "num_tokens": 9297968.0, + "step": 1054 + }, + { + "entropy": 1.4089177548885345, + "epoch": 3.810859728506787, + "grad_norm": 0.6755173802375793, + "learning_rate": 0.00019450309158288562, + "loss": 0.08, + "mean_token_accuracy": 0.975567102432251, + "num_tokens": 9306399.0, + "step": 1055 + }, + { + "entropy": 1.4395931661128998, + "epoch": 3.8144796380090495, + "grad_norm": 0.6466923356056213, + "learning_rate": 0.00019408806434571043, + "loss": 0.0962, + "mean_token_accuracy": 0.9790873825550079, + "num_tokens": 9315067.0, + "step": 1056 + }, + { + "entropy": 1.463386446237564, + "epoch": 3.818099547511312, + "grad_norm": 0.7904548645019531, + "learning_rate": 0.0001936733674010496, + "loss": 0.0982, + "mean_token_accuracy": 0.9723253399133682, + "num_tokens": 9323453.0, + "step": 1057 + }, + { + "entropy": 1.4089987576007843, + "epoch": 3.8217194570135744, + "grad_norm": 0.5947125554084778, + "learning_rate": 0.00019325900232397477, + "loss": 0.0558, + "mean_token_accuracy": 0.985149696469307, + "num_tokens": 9332220.0, + "step": 1058 + }, + { + "entropy": 1.424832284450531, + "epoch": 3.825339366515837, + "grad_norm": 0.6046349406242371, + "learning_rate": 0.00019284497068829747, + "loss": 0.103, + "mean_token_accuracy": 0.9751139581203461, + "num_tokens": 9341074.0, + "step": 1059 + }, + { + "entropy": 1.3354915082454681, + "epoch": 3.8289592760180997, + "grad_norm": 0.20708034932613373, + "learning_rate": 0.00019243127406656248, + "loss": 0.0117, + "mean_token_accuracy": 0.9978606253862381, + "num_tokens": 9350232.0, + "step": 1060 + }, + { + "entropy": 1.335442215204239, + "epoch": 3.832579185520362, + "grad_norm": 0.7157071232795715, + "learning_rate": 0.00019201791403004257, + "loss": 0.0915, + "mean_token_accuracy": 0.9730544090270996, + "num_tokens": 9359486.0, + "step": 1061 + }, + { + "entropy": 1.4432708621025085, + "epoch": 3.8361990950226246, + "grad_norm": 0.7831724286079407, + "learning_rate": 0.00019160489214873155, + "loss": 0.1163, + "mean_token_accuracy": 0.9673851430416107, + "num_tokens": 9368235.0, + "step": 1062 + }, + { + "entropy": 1.440447449684143, + "epoch": 3.839819004524887, + "grad_norm": 0.6418187022209167, + "learning_rate": 0.00019119220999133923, + "loss": 0.0587, + "mean_token_accuracy": 0.9853110462427139, + "num_tokens": 9376622.0, + "step": 1063 + }, + { + "entropy": 1.3945489525794983, + "epoch": 3.8434389140271494, + "grad_norm": 0.5115446448326111, + "learning_rate": 0.0001907798691252852, + "loss": 0.0627, + "mean_token_accuracy": 0.9849587380886078, + "num_tokens": 9385653.0, + "step": 1064 + }, + { + "entropy": 1.4040117859840393, + "epoch": 3.847058823529412, + "grad_norm": 0.40980765223503113, + "learning_rate": 0.0001903678711166924, + "loss": 0.0319, + "mean_token_accuracy": 0.990267813205719, + "num_tokens": 9394335.0, + "step": 1065 + }, + { + "entropy": 1.440806269645691, + "epoch": 3.8506787330316743, + "grad_norm": 0.7762898206710815, + "learning_rate": 0.00018995621753038183, + "loss": 0.1477, + "mean_token_accuracy": 0.9675359576940536, + "num_tokens": 9402789.0, + "step": 1066 + }, + { + "entropy": 1.3455476462841034, + "epoch": 3.8542986425339367, + "grad_norm": 0.4371282458305359, + "learning_rate": 0.00018954490992986644, + "loss": 0.047, + "mean_token_accuracy": 0.9871475845575333, + "num_tokens": 9411665.0, + "step": 1067 + }, + { + "entropy": 1.3937756717205048, + "epoch": 3.857918552036199, + "grad_norm": 0.8712350726127625, + "learning_rate": 0.0001891339498773447, + "loss": 0.143, + "mean_token_accuracy": 0.9606377333402634, + "num_tokens": 9420475.0, + "step": 1068 + }, + { + "entropy": 1.4569672644138336, + "epoch": 3.8615384615384616, + "grad_norm": 0.6399717926979065, + "learning_rate": 0.00018872333893369536, + "loss": 0.0625, + "mean_token_accuracy": 0.9822671264410019, + "num_tokens": 9429062.0, + "step": 1069 + }, + { + "entropy": 1.397208034992218, + "epoch": 3.865158371040724, + "grad_norm": 0.41650331020355225, + "learning_rate": 0.00018831307865847108, + "loss": 0.0565, + "mean_token_accuracy": 0.9822796285152435, + "num_tokens": 9437938.0, + "step": 1070 + }, + { + "entropy": 1.3304217159748077, + "epoch": 3.8687782805429864, + "grad_norm": 0.34858253598213196, + "learning_rate": 0.00018790317060989273, + "loss": 0.0355, + "mean_token_accuracy": 0.9889863580465317, + "num_tokens": 9446897.0, + "step": 1071 + }, + { + "entropy": 1.4303509891033173, + "epoch": 3.872398190045249, + "grad_norm": 0.5634018182754517, + "learning_rate": 0.00018749361634484325, + "loss": 0.0999, + "mean_token_accuracy": 0.9707607924938202, + "num_tokens": 9455618.0, + "step": 1072 + }, + { + "entropy": 1.4059797525405884, + "epoch": 3.8760180995475113, + "grad_norm": 0.4992756247520447, + "learning_rate": 0.00018708441741886194, + "loss": 0.062, + "mean_token_accuracy": 0.9801923334598541, + "num_tokens": 9464254.0, + "step": 1073 + }, + { + "entropy": 1.329335242509842, + "epoch": 3.8796380090497737, + "grad_norm": 0.43501394987106323, + "learning_rate": 0.00018667557538613863, + "loss": 0.0474, + "mean_token_accuracy": 0.987145259976387, + "num_tokens": 9473340.0, + "step": 1074 + }, + { + "entropy": 1.3786957263946533, + "epoch": 3.883257918552036, + "grad_norm": 0.640612781047821, + "learning_rate": 0.00018626709179950717, + "loss": 0.1196, + "mean_token_accuracy": 0.9668680727481842, + "num_tokens": 9482286.0, + "step": 1075 + }, + { + "entropy": 1.3925435245037079, + "epoch": 3.8868778280542986, + "grad_norm": 0.6338940262794495, + "learning_rate": 0.0001858589682104405, + "loss": 0.0643, + "mean_token_accuracy": 0.982734814286232, + "num_tokens": 9490868.0, + "step": 1076 + }, + { + "entropy": 1.3901928961277008, + "epoch": 3.890497737556561, + "grad_norm": 0.5943475365638733, + "learning_rate": 0.000185451206169044, + "loss": 0.0636, + "mean_token_accuracy": 0.9796071499586105, + "num_tokens": 9499551.0, + "step": 1077 + }, + { + "entropy": 1.3408487439155579, + "epoch": 3.8941176470588235, + "grad_norm": 0.47063320875167847, + "learning_rate": 0.00018504380722404975, + "loss": 0.059, + "mean_token_accuracy": 0.98704494535923, + "num_tokens": 9508605.0, + "step": 1078 + }, + { + "entropy": 1.3606760799884796, + "epoch": 3.897737556561086, + "grad_norm": 0.5077705383300781, + "learning_rate": 0.00018463677292281092, + "loss": 0.0586, + "mean_token_accuracy": 0.9849795997142792, + "num_tokens": 9517376.0, + "step": 1079 + }, + { + "entropy": 1.389219492673874, + "epoch": 3.9013574660633483, + "grad_norm": 0.451435923576355, + "learning_rate": 0.00018423010481129584, + "loss": 0.0414, + "mean_token_accuracy": 0.9872728437185287, + "num_tokens": 9525724.0, + "step": 1080 + }, + { + "entropy": 1.3427990972995758, + "epoch": 3.9049773755656108, + "grad_norm": 0.4996180236339569, + "learning_rate": 0.00018382380443408158, + "loss": 0.0519, + "mean_token_accuracy": 0.9842040240764618, + "num_tokens": 9534581.0, + "step": 1081 + }, + { + "entropy": 1.3136717081069946, + "epoch": 3.908597285067873, + "grad_norm": 0.31684455275535583, + "learning_rate": 0.00018341787333434872, + "loss": 0.0367, + "mean_token_accuracy": 0.986624076962471, + "num_tokens": 9543780.0, + "step": 1082 + }, + { + "entropy": 1.357143759727478, + "epoch": 3.9122171945701356, + "grad_norm": 0.392623633146286, + "learning_rate": 0.00018301231305387552, + "loss": 0.0361, + "mean_token_accuracy": 0.9899974465370178, + "num_tokens": 9552316.0, + "step": 1083 + }, + { + "entropy": 1.4222826957702637, + "epoch": 3.915837104072398, + "grad_norm": 0.6109654903411865, + "learning_rate": 0.00018260712513303167, + "loss": 0.0801, + "mean_token_accuracy": 0.9758190959692001, + "num_tokens": 9560547.0, + "step": 1084 + }, + { + "entropy": 1.387441635131836, + "epoch": 3.9194570135746605, + "grad_norm": 0.611193060874939, + "learning_rate": 0.00018220231111077217, + "loss": 0.0627, + "mean_token_accuracy": 0.9828397631645203, + "num_tokens": 9569112.0, + "step": 1085 + }, + { + "entropy": 1.3265759348869324, + "epoch": 3.9230769230769234, + "grad_norm": 0.35626664757728577, + "learning_rate": 0.0001817978725246326, + "loss": 0.0347, + "mean_token_accuracy": 0.9868002831935883, + "num_tokens": 9577936.0, + "step": 1086 + }, + { + "entropy": 1.3036309480667114, + "epoch": 3.926696832579186, + "grad_norm": 0.9055293202400208, + "learning_rate": 0.00018139381091072213, + "loss": 0.0869, + "mean_token_accuracy": 0.976190984249115, + "num_tokens": 9586725.0, + "step": 1087 + }, + { + "entropy": 1.317764014005661, + "epoch": 3.930316742081448, + "grad_norm": 0.34299445152282715, + "learning_rate": 0.00018099012780371814, + "loss": 0.0193, + "mean_token_accuracy": 0.9950294345617294, + "num_tokens": 9595580.0, + "step": 1088 + }, + { + "entropy": 1.4780614078044891, + "epoch": 3.9339366515837106, + "grad_norm": 0.45136016607284546, + "learning_rate": 0.00018058682473686075, + "loss": 0.03, + "mean_token_accuracy": 0.9902182072401047, + "num_tokens": 9603693.0, + "step": 1089 + }, + { + "entropy": 1.3006681501865387, + "epoch": 3.937556561085973, + "grad_norm": 0.8072985410690308, + "learning_rate": 0.00018018390324194637, + "loss": 0.1406, + "mean_token_accuracy": 0.9719722718000412, + "num_tokens": 9613517.0, + "step": 1090 + }, + { + "entropy": 1.2329892814159393, + "epoch": 3.9411764705882355, + "grad_norm": 0.7343161702156067, + "learning_rate": 0.00017978136484932198, + "loss": 0.1221, + "mean_token_accuracy": 0.9715431183576584, + "num_tokens": 9623002.0, + "step": 1091 + }, + { + "entropy": 1.339203268289566, + "epoch": 3.944796380090498, + "grad_norm": 0.422463595867157, + "learning_rate": 0.00017937921108787986, + "loss": 0.0366, + "mean_token_accuracy": 0.9875014275312424, + "num_tokens": 9631474.0, + "step": 1092 + }, + { + "entropy": 1.3312835395336151, + "epoch": 3.9484162895927604, + "grad_norm": 0.5072442889213562, + "learning_rate": 0.00017897744348505123, + "loss": 0.0561, + "mean_token_accuracy": 0.9836284965276718, + "num_tokens": 9640156.0, + "step": 1093 + }, + { + "entropy": 1.4004390239715576, + "epoch": 3.952036199095023, + "grad_norm": 0.48746341466903687, + "learning_rate": 0.0001785760635668007, + "loss": 0.044, + "mean_token_accuracy": 0.9849557876586914, + "num_tokens": 9648458.0, + "step": 1094 + }, + { + "entropy": 1.350889652967453, + "epoch": 3.9556561085972852, + "grad_norm": 0.41746893525123596, + "learning_rate": 0.00017817507285762023, + "loss": 0.0532, + "mean_token_accuracy": 0.9855844229459763, + "num_tokens": 9657136.0, + "step": 1095 + }, + { + "entropy": 1.237421602010727, + "epoch": 3.9592760180995477, + "grad_norm": 0.46493780612945557, + "learning_rate": 0.00017777447288052373, + "loss": 0.0759, + "mean_token_accuracy": 0.9721266627311707, + "num_tokens": 9667058.0, + "step": 1096 + }, + { + "entropy": 1.3534648716449738, + "epoch": 3.96289592760181, + "grad_norm": 0.406345933675766, + "learning_rate": 0.000177374265157041, + "loss": 0.0537, + "mean_token_accuracy": 0.9838696867227554, + "num_tokens": 9675627.0, + "step": 1097 + }, + { + "entropy": 1.237879753112793, + "epoch": 3.9665158371040725, + "grad_norm": 0.527837872505188, + "learning_rate": 0.00017697445120721175, + "loss": 0.0737, + "mean_token_accuracy": 0.9752355068922043, + "num_tokens": 9685091.0, + "step": 1098 + }, + { + "entropy": 1.2298554480075836, + "epoch": 3.970135746606335, + "grad_norm": 0.45402729511260986, + "learning_rate": 0.00017657503254958054, + "loss": 0.0556, + "mean_token_accuracy": 0.9843012988567352, + "num_tokens": 9694688.0, + "step": 1099 + }, + { + "entropy": 1.270505130290985, + "epoch": 3.9737556561085974, + "grad_norm": 0.6557897329330444, + "learning_rate": 0.00017617601070119037, + "loss": 0.0918, + "mean_token_accuracy": 0.9786079078912735, + "num_tokens": 9704286.0, + "step": 1100 + }, + { + "entropy": 1.3577526807785034, + "epoch": 3.97737556561086, + "grad_norm": 0.48044729232788086, + "learning_rate": 0.0001757773871775768, + "loss": 0.0564, + "mean_token_accuracy": 0.9776984602212906, + "num_tokens": 9712668.0, + "step": 1101 + }, + { + "entropy": 1.309740036725998, + "epoch": 3.9809954751131222, + "grad_norm": 0.8556230664253235, + "learning_rate": 0.00017537916349276303, + "loss": 0.2013, + "mean_token_accuracy": 0.9610435962677002, + "num_tokens": 9722042.0, + "step": 1102 + }, + { + "entropy": 1.375863939523697, + "epoch": 3.9846153846153847, + "grad_norm": 0.4123291075229645, + "learning_rate": 0.00017498134115925327, + "loss": 0.0208, + "mean_token_accuracy": 0.9937012493610382, + "num_tokens": 9730420.0, + "step": 1103 + }, + { + "entropy": 1.3407017588615417, + "epoch": 3.988235294117647, + "grad_norm": 0.3886757493019104, + "learning_rate": 0.0001745839216880275, + "loss": 0.0223, + "mean_token_accuracy": 0.9922950863838196, + "num_tokens": 9739292.0, + "step": 1104 + }, + { + "entropy": 1.2783922851085663, + "epoch": 3.9918552036199095, + "grad_norm": 0.39245131611824036, + "learning_rate": 0.00017418690658853542, + "loss": 0.0607, + "mean_token_accuracy": 0.9823390543460846, + "num_tokens": 9748635.0, + "step": 1105 + }, + { + "entropy": 1.3240907490253448, + "epoch": 3.995475113122172, + "grad_norm": 0.925537645816803, + "learning_rate": 0.00017379029736869103, + "loss": 0.1301, + "mean_token_accuracy": 0.9688823968172073, + "num_tokens": 9757450.0, + "step": 1106 + }, + { + "entropy": 1.2996585667133331, + "epoch": 3.9990950226244344, + "grad_norm": 0.5589770674705505, + "learning_rate": 0.00017339409553486675, + "loss": 0.0833, + "mean_token_accuracy": 0.9765840470790863, + "num_tokens": 9766204.0, + "step": 1107 + }, + { + "entropy": 1.3336817026138306, + "epoch": 4.0, + "grad_norm": 1.8711317777633667, + "learning_rate": 0.00017299830259188753, + "loss": 0.0647, + "mean_token_accuracy": 0.9789473414421082, + "num_tokens": 9766900.0, + "step": 1108 + }, + { + "epoch": 4.0, + "eval_entropy": 1.3279241662684496, + "eval_loss": 0.13373936712741852, + "eval_mean_token_accuracy": 0.9691996351490176, + "eval_num_tokens": 9766900.0, + "eval_runtime": 116.1625, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 1.059, + "step": 1108 + } + ], + "logging_steps": 1, + "max_steps": 1662, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3262848215090586e+18, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1108/training_args.bin b/checkpoint-1108/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..070a2de135e794840c49e066215a1c9f2e550d1f --- /dev/null +++ b/checkpoint-1108/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc271f94ce32216bd6f2ee9866fb7d62a0583bc7ee0c7fa953fa57c302729c6c +size 6289 diff --git a/checkpoint-1385/README.md b/checkpoint-1385/README.md new file mode 100644 index 0000000000000000000000000000000000000000..58a4061707bcc32db3b543936f6b650c01f3dccb --- /dev/null +++ b/checkpoint-1385/README.md @@ -0,0 +1,208 @@ +--- +base_model: openai/gpt-oss-20b +library_name: peft +tags: +- base_model:adapter:openai/gpt-oss-20b +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-1385/adapter_config.json b/checkpoint-1385/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..076480eaf349cc658de2eb00b26c7360a85f8f56 --- /dev/null +++ b/checkpoint-1385/adapter_config.json @@ -0,0 +1,53 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "openai/gpt-oss-20b", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "o_proj", + "q_proj" + ], + "target_parameters": [ + "7.mlp.experts.gate_up_proj", + "7.mlp.experts.down_proj", + "15.mlp.experts.gate_up_proj", + "15.mlp.experts.down_proj", + "23.mlp.experts.gate_up_proj", + "23.mlp.experts.down_proj" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1385/adapter_model.safetensors b/checkpoint-1385/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..84bdf392c2da7551209c72b3ba62f5a4b6ec2a8c --- /dev/null +++ b/checkpoint-1385/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11039a14428ec46e436f20a30594362edacaabd6cc1801facc4113d6078503a7 +size 60189176 diff --git a/checkpoint-1385/chat_template.jinja b/checkpoint-1385/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc7bb11927d29f653ba2740f2db2c688fd77592f --- /dev/null +++ b/checkpoint-1385/chat_template.jinja @@ -0,0 +1,331 @@ +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". + #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties %} + {{- "(_: {\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {%- if param_spec.description %} + {{- "// " + param_spec.description + "\n" }} + {%- endif %} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- ",\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %} + {%- endif %} + {{- model_identity + "\n" }} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {{- "\n\n" }} + {%- endif %} + {%- if tools -%} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {#- Checks to ensure the messages are being passed in the format we expect #} + {%- if "content" in message %} + {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "thinking" in message %} + {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "tool_calls" in message %} + {#- We need very careful handling here - we want to drop the tool call analysis message if the model #} + {#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #} + {#- when we render CoT/analysis messages in inference. #} + {%- set future_final_message = namespace(found=false) %} + {%- for future_message in loop_messages[loop.index:] %} + {%- if future_message.role == 'assistant' and "tool_calls" not in future_message %} + {%- set future_final_message.found = true %} + {%- endif %} + {%- endfor %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content and message.thinking %} + {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }} + {%- elif message.content and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- elif message.thinking and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." + tool_call.name + "<|channel|>commentary " }} + {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {%- if "thinking" in message %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- else %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." + last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- elif message.role == 'user' -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1385/optimizer.pt b/checkpoint-1385/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8989c799bd85463e9c7278cacd37569418342344 --- /dev/null +++ b/checkpoint-1385/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dd221e8c2d4ec7451f217aa60891172fb47e3bedd1a7d14746f6763481e10f2 +size 120498699 diff --git a/checkpoint-1385/rng_state.pth b/checkpoint-1385/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9cd53ddd51087a39d299073b9d407413ac1f02a5 --- /dev/null +++ b/checkpoint-1385/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17cd930da9783ca70bad4b9cdeee6a06c0acea8f34645a333c93341f487f66a3 +size 14645 diff --git a/checkpoint-1385/scheduler.pt b/checkpoint-1385/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c64e0baf97ede434b54a16f8afd614d188e9786 --- /dev/null +++ b/checkpoint-1385/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a72d77a004fcee2805a327f62e84881c19258b5adac872dc7a57295e1ceb656 +size 1465 diff --git a/checkpoint-1385/special_tokens_map.json b/checkpoint-1385/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c47e282982a9c6856832947a72ded329fad2e8c --- /dev/null +++ b/checkpoint-1385/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|return|>" +} diff --git a/checkpoint-1385/tokenizer.json b/checkpoint-1385/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/checkpoint-1385/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/checkpoint-1385/tokenizer_config.json b/checkpoint-1385/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e86f6faa71de0fc3afe47ea8984da9e6138c031c --- /dev/null +++ b/checkpoint-1385/tokenizer_config.json @@ -0,0 +1,183 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|return|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1385/trainer_state.json b/checkpoint-1385/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3af2e7f08a3244c78678c612f172ea20114faac0 --- /dev/null +++ b/checkpoint-1385/trainer_state.json @@ -0,0 +1,13939 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1385, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.358862280845642, + "epoch": 0.0036199095022624436, + "grad_norm": 2.292628288269043, + "learning_rate": 0.0, + "loss": 0.7311, + "mean_token_accuracy": 0.8534883409738541, + "num_tokens": 9316.0, + "step": 1 + }, + { + "entropy": 2.674945294857025, + "epoch": 0.007239819004524887, + "grad_norm": 3.8950836658477783, + "learning_rate": 1.0219999999999999e-05, + "loss": 1.0621, + "mean_token_accuracy": 0.8183160275220871, + "num_tokens": 17707.0, + "step": 2 + }, + { + "entropy": 2.4915525913238525, + "epoch": 0.01085972850678733, + "grad_norm": 2.792142868041992, + "learning_rate": 2.0439999999999997e-05, + "loss": 0.8448, + "mean_token_accuracy": 0.8489587754011154, + "num_tokens": 26783.0, + "step": 3 + }, + { + "entropy": 2.525622010231018, + "epoch": 0.014479638009049774, + "grad_norm": 2.7071900367736816, + "learning_rate": 3.0659999999999994e-05, + "loss": 0.8847, + "mean_token_accuracy": 0.8486668318510056, + "num_tokens": 35947.0, + "step": 4 + }, + { + "entropy": 2.588509976863861, + "epoch": 0.01809954751131222, + "grad_norm": 2.981574773788452, + "learning_rate": 4.0879999999999995e-05, + "loss": 1.0783, + "mean_token_accuracy": 0.8135111033916473, + "num_tokens": 44505.0, + "step": 5 + }, + { + "entropy": 2.662865400314331, + "epoch": 0.02171945701357466, + "grad_norm": 2.629283905029297, + "learning_rate": 5.1099999999999995e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.8152717798948288, + "num_tokens": 53140.0, + "step": 6 + }, + { + "entropy": 2.6662243604660034, + "epoch": 0.025339366515837104, + "grad_norm": 2.730058431625366, + "learning_rate": 6.131999999999999e-05, + "loss": 0.6982, + "mean_token_accuracy": 0.8552135527133942, + "num_tokens": 61932.0, + "step": 7 + }, + { + "entropy": 2.661384105682373, + "epoch": 0.02895927601809955, + "grad_norm": 2.562839984893799, + "learning_rate": 7.154e-05, + "loss": 0.7296, + "mean_token_accuracy": 0.8579540699720383, + "num_tokens": 70973.0, + "step": 8 + }, + { + "entropy": 2.7889368534088135, + "epoch": 0.03257918552036199, + "grad_norm": 2.8640544414520264, + "learning_rate": 8.175999999999999e-05, + "loss": 0.5965, + "mean_token_accuracy": 0.8638457208871841, + "num_tokens": 79977.0, + "step": 9 + }, + { + "entropy": 2.811532199382782, + "epoch": 0.03619909502262444, + "grad_norm": 2.6199426651000977, + "learning_rate": 9.197999999999998e-05, + "loss": 0.4819, + "mean_token_accuracy": 0.8786454051733017, + "num_tokens": 88915.0, + "step": 10 + }, + { + "entropy": 2.941167712211609, + "epoch": 0.039819004524886875, + "grad_norm": 1.2497272491455078, + "learning_rate": 0.00010219999999999999, + "loss": 0.7192, + "mean_token_accuracy": 0.841494083404541, + "num_tokens": 97749.0, + "step": 11 + }, + { + "entropy": 3.0547962188720703, + "epoch": 0.04343891402714932, + "grad_norm": 1.436136245727539, + "learning_rate": 0.00011241999999999998, + "loss": 0.5908, + "mean_token_accuracy": 0.8657624870538712, + "num_tokens": 106048.0, + "step": 12 + }, + { + "entropy": 2.9914053082466125, + "epoch": 0.047058823529411764, + "grad_norm": 0.9903654456138611, + "learning_rate": 0.00012263999999999998, + "loss": 0.4008, + "mean_token_accuracy": 0.8985499292612076, + "num_tokens": 115216.0, + "step": 13 + }, + { + "entropy": 3.1867465376853943, + "epoch": 0.05067873303167421, + "grad_norm": 1.019572377204895, + "learning_rate": 0.00013286, + "loss": 0.5062, + "mean_token_accuracy": 0.8893097043037415, + "num_tokens": 124040.0, + "step": 14 + }, + { + "entropy": 3.2431325912475586, + "epoch": 0.05429864253393665, + "grad_norm": 1.2394084930419922, + "learning_rate": 0.00014308, + "loss": 0.361, + "mean_token_accuracy": 0.9009967148303986, + "num_tokens": 132447.0, + "step": 15 + }, + { + "entropy": 3.1858643889427185, + "epoch": 0.0579185520361991, + "grad_norm": 0.9859603643417358, + "learning_rate": 0.00015329999999999999, + "loss": 0.4498, + "mean_token_accuracy": 0.887280747294426, + "num_tokens": 141228.0, + "step": 16 + }, + { + "entropy": 3.5029141902923584, + "epoch": 0.06153846153846154, + "grad_norm": 1.453957438468933, + "learning_rate": 0.00016351999999999998, + "loss": 0.4949, + "mean_token_accuracy": 0.888081505894661, + "num_tokens": 149789.0, + "step": 17 + }, + { + "entropy": 3.4572895765304565, + "epoch": 0.06515837104072399, + "grad_norm": 1.390377402305603, + "learning_rate": 0.00017374, + "loss": 0.5449, + "mean_token_accuracy": 0.8745045810937881, + "num_tokens": 157813.0, + "step": 18 + }, + { + "entropy": 3.3081750869750977, + "epoch": 0.06877828054298643, + "grad_norm": 1.1171791553497314, + "learning_rate": 0.00018395999999999997, + "loss": 0.4786, + "mean_token_accuracy": 0.8893420845270157, + "num_tokens": 166315.0, + "step": 19 + }, + { + "entropy": 3.3776715993881226, + "epoch": 0.07239819004524888, + "grad_norm": 1.5567998886108398, + "learning_rate": 0.00019418, + "loss": 0.3669, + "mean_token_accuracy": 0.9146632701158524, + "num_tokens": 175207.0, + "step": 20 + }, + { + "entropy": 3.2677870988845825, + "epoch": 0.0760180995475113, + "grad_norm": 1.7404611110687256, + "learning_rate": 0.00020439999999999998, + "loss": 0.5287, + "mean_token_accuracy": 0.8777483552694321, + "num_tokens": 183833.0, + "step": 21 + }, + { + "entropy": 3.313201069831848, + "epoch": 0.07963800904977375, + "grad_norm": 1.0836979150772095, + "learning_rate": 0.00021461999999999997, + "loss": 0.3014, + "mean_token_accuracy": 0.9215261936187744, + "num_tokens": 192591.0, + "step": 22 + }, + { + "entropy": 3.208672881126404, + "epoch": 0.0832579185520362, + "grad_norm": 1.2197301387786865, + "learning_rate": 0.00022483999999999997, + "loss": 0.4401, + "mean_token_accuracy": 0.9031257778406143, + "num_tokens": 201372.0, + "step": 23 + }, + { + "entropy": 3.1830995082855225, + "epoch": 0.08687782805429864, + "grad_norm": 1.2422229051589966, + "learning_rate": 0.00023506, + "loss": 0.5144, + "mean_token_accuracy": 0.8915928155183792, + "num_tokens": 210348.0, + "step": 24 + }, + { + "entropy": 3.085207223892212, + "epoch": 0.09049773755656108, + "grad_norm": 0.8987624049186707, + "learning_rate": 0.00024527999999999996, + "loss": 0.3253, + "mean_token_accuracy": 0.9221627116203308, + "num_tokens": 219131.0, + "step": 25 + }, + { + "entropy": 3.026031017303467, + "epoch": 0.09411764705882353, + "grad_norm": 1.0273475646972656, + "learning_rate": 0.0002555, + "loss": 0.3495, + "mean_token_accuracy": 0.9147634357213974, + "num_tokens": 228292.0, + "step": 26 + }, + { + "entropy": 3.0420032739639282, + "epoch": 0.09773755656108597, + "grad_norm": 1.0590945482254028, + "learning_rate": 0.00026572, + "loss": 0.4495, + "mean_token_accuracy": 0.9019353687763214, + "num_tokens": 236942.0, + "step": 27 + }, + { + "entropy": 3.0469263792037964, + "epoch": 0.10135746606334842, + "grad_norm": 0.9584959745407104, + "learning_rate": 0.00027594, + "loss": 0.405, + "mean_token_accuracy": 0.9216890782117844, + "num_tokens": 245543.0, + "step": 28 + }, + { + "entropy": 2.92683744430542, + "epoch": 0.10497737556561086, + "grad_norm": 0.8826628923416138, + "learning_rate": 0.00028616, + "loss": 0.4004, + "mean_token_accuracy": 0.9173285663127899, + "num_tokens": 254264.0, + "step": 29 + }, + { + "entropy": 3.0086968541145325, + "epoch": 0.1085972850678733, + "grad_norm": 0.8521863222122192, + "learning_rate": 0.00029637999999999995, + "loss": 0.2876, + "mean_token_accuracy": 0.9335231184959412, + "num_tokens": 263143.0, + "step": 30 + }, + { + "entropy": 2.9086623191833496, + "epoch": 0.11221719457013575, + "grad_norm": 0.7830919623374939, + "learning_rate": 0.00030659999999999997, + "loss": 0.548, + "mean_token_accuracy": 0.8831343650817871, + "num_tokens": 272055.0, + "step": 31 + }, + { + "entropy": 2.9730575680732727, + "epoch": 0.1158371040723982, + "grad_norm": 0.7217472195625305, + "learning_rate": 0.00031682, + "loss": 0.3564, + "mean_token_accuracy": 0.9119151830673218, + "num_tokens": 280971.0, + "step": 32 + }, + { + "entropy": 3.081720530986786, + "epoch": 0.11945701357466064, + "grad_norm": 0.8697704076766968, + "learning_rate": 0.00032703999999999996, + "loss": 0.334, + "mean_token_accuracy": 0.9234935492277145, + "num_tokens": 289449.0, + "step": 33 + }, + { + "entropy": 3.1043431162834167, + "epoch": 0.12307692307692308, + "grad_norm": 0.7962514758110046, + "learning_rate": 0.00033726, + "loss": 0.1602, + "mean_token_accuracy": 0.9554370939731598, + "num_tokens": 297804.0, + "step": 34 + }, + { + "entropy": 3.0275490283966064, + "epoch": 0.12669683257918551, + "grad_norm": 0.5887104272842407, + "learning_rate": 0.00034748, + "loss": 0.2254, + "mean_token_accuracy": 0.9491932094097137, + "num_tokens": 306589.0, + "step": 35 + }, + { + "entropy": 3.099652886390686, + "epoch": 0.13031674208144797, + "grad_norm": 0.894397497177124, + "learning_rate": 0.00035769999999999997, + "loss": 0.6397, + "mean_token_accuracy": 0.8802188038825989, + "num_tokens": 315534.0, + "step": 36 + }, + { + "entropy": 3.0312134623527527, + "epoch": 0.1339366515837104, + "grad_norm": 0.6374682188034058, + "learning_rate": 0.00036791999999999993, + "loss": 0.2183, + "mean_token_accuracy": 0.9478497952222824, + "num_tokens": 324492.0, + "step": 37 + }, + { + "entropy": 3.28497713804245, + "epoch": 0.13755656108597286, + "grad_norm": 0.6740968823432922, + "learning_rate": 0.00037813999999999995, + "loss": 0.3619, + "mean_token_accuracy": 0.9288723170757294, + "num_tokens": 333195.0, + "step": 38 + }, + { + "entropy": 3.1478323340415955, + "epoch": 0.1411764705882353, + "grad_norm": 0.7235494256019592, + "learning_rate": 0.00038836, + "loss": 0.324, + "mean_token_accuracy": 0.9179254025220871, + "num_tokens": 342028.0, + "step": 39 + }, + { + "entropy": 3.279879152774811, + "epoch": 0.14479638009049775, + "grad_norm": 0.7512595653533936, + "learning_rate": 0.00039858, + "loss": 0.4804, + "mean_token_accuracy": 0.889826312661171, + "num_tokens": 350902.0, + "step": 40 + }, + { + "entropy": 3.173546612262726, + "epoch": 0.14841628959276018, + "grad_norm": 0.6978861689567566, + "learning_rate": 0.00040879999999999996, + "loss": 0.3442, + "mean_token_accuracy": 0.9205169230699539, + "num_tokens": 359787.0, + "step": 41 + }, + { + "entropy": 3.2385765314102173, + "epoch": 0.1520361990950226, + "grad_norm": 0.8108944892883301, + "learning_rate": 0.00041901999999999993, + "loss": 0.4223, + "mean_token_accuracy": 0.8979178965091705, + "num_tokens": 368426.0, + "step": 42 + }, + { + "entropy": 3.146568477153778, + "epoch": 0.15565610859728507, + "grad_norm": 0.5847787261009216, + "learning_rate": 0.00042923999999999995, + "loss": 0.1953, + "mean_token_accuracy": 0.9556037336587906, + "num_tokens": 377349.0, + "step": 43 + }, + { + "entropy": 3.066233277320862, + "epoch": 0.1592760180995475, + "grad_norm": 0.7887329459190369, + "learning_rate": 0.00043945999999999997, + "loss": 0.6815, + "mean_token_accuracy": 0.8654293268918991, + "num_tokens": 386603.0, + "step": 44 + }, + { + "entropy": 3.1745981574058533, + "epoch": 0.16289592760180996, + "grad_norm": 0.7280165553092957, + "learning_rate": 0.00044967999999999994, + "loss": 0.1932, + "mean_token_accuracy": 0.9479279220104218, + "num_tokens": 395070.0, + "step": 45 + }, + { + "entropy": 3.1094446182250977, + "epoch": 0.1665158371040724, + "grad_norm": 0.6453448534011841, + "learning_rate": 0.00045989999999999996, + "loss": 0.2608, + "mean_token_accuracy": 0.9249396026134491, + "num_tokens": 403651.0, + "step": 46 + }, + { + "entropy": 2.9050925970077515, + "epoch": 0.17013574660633485, + "grad_norm": 0.6689278483390808, + "learning_rate": 0.00047012, + "loss": 0.4489, + "mean_token_accuracy": 0.898686870932579, + "num_tokens": 412898.0, + "step": 47 + }, + { + "entropy": 3.2239145040512085, + "epoch": 0.17375565610859728, + "grad_norm": 1.0014020204544067, + "learning_rate": 0.00048033999999999994, + "loss": 0.3234, + "mean_token_accuracy": 0.9231891483068466, + "num_tokens": 421420.0, + "step": 48 + }, + { + "entropy": 3.035899817943573, + "epoch": 0.17737556561085974, + "grad_norm": 0.6415768265724182, + "learning_rate": 0.0004905599999999999, + "loss": 0.2259, + "mean_token_accuracy": 0.9447792917490005, + "num_tokens": 430258.0, + "step": 49 + }, + { + "entropy": 3.057477653026581, + "epoch": 0.18099547511312217, + "grad_norm": 0.6042271256446838, + "learning_rate": 0.0005007799999999999, + "loss": 0.2228, + "mean_token_accuracy": 0.9473378211259842, + "num_tokens": 439593.0, + "step": 50 + }, + { + "entropy": 2.8375911116600037, + "epoch": 0.18461538461538463, + "grad_norm": 0.739811897277832, + "learning_rate": 0.000511, + "loss": 0.3623, + "mean_token_accuracy": 0.9050924181938171, + "num_tokens": 449056.0, + "step": 51 + }, + { + "entropy": 2.9926682114601135, + "epoch": 0.18823529411764706, + "grad_norm": 0.6637321710586548, + "learning_rate": 0.0005109995633102972, + "loss": 0.2924, + "mean_token_accuracy": 0.9397273659706116, + "num_tokens": 457677.0, + "step": 52 + }, + { + "entropy": 2.7932987809181213, + "epoch": 0.19185520361990951, + "grad_norm": 0.5666584372520447, + "learning_rate": 0.0005109982532428477, + "loss": 0.2055, + "mean_token_accuracy": 0.9385408014059067, + "num_tokens": 466969.0, + "step": 53 + }, + { + "entropy": 2.765812337398529, + "epoch": 0.19547511312217195, + "grad_norm": 0.7875120639801025, + "learning_rate": 0.0005109960698026271, + "loss": 0.4549, + "mean_token_accuracy": 0.9052814990282059, + "num_tokens": 476285.0, + "step": 54 + }, + { + "entropy": 2.884207248687744, + "epoch": 0.19909502262443438, + "grad_norm": 0.7538661956787109, + "learning_rate": 0.0005109930129979285, + "loss": 0.3751, + "mean_token_accuracy": 0.9210246652364731, + "num_tokens": 484668.0, + "step": 55 + }, + { + "entropy": 2.779718518257141, + "epoch": 0.20271493212669683, + "grad_norm": 0.8069296479225159, + "learning_rate": 0.0005109890828403621, + "loss": 0.3664, + "mean_token_accuracy": 0.9219843596220016, + "num_tokens": 493292.0, + "step": 56 + }, + { + "entropy": 2.841543674468994, + "epoch": 0.20633484162895926, + "grad_norm": 0.5545904636383057, + "learning_rate": 0.0005109842793448548, + "loss": 0.1973, + "mean_token_accuracy": 0.9547395706176758, + "num_tokens": 501973.0, + "step": 57 + }, + { + "entropy": 2.8180030584335327, + "epoch": 0.20995475113122172, + "grad_norm": 1.015456199645996, + "learning_rate": 0.0005109786025296513, + "loss": 0.6019, + "mean_token_accuracy": 0.88613361120224, + "num_tokens": 510840.0, + "step": 58 + }, + { + "entropy": 2.7450912594795227, + "epoch": 0.21357466063348415, + "grad_norm": 0.6784740686416626, + "learning_rate": 0.0005109720524163127, + "loss": 0.2868, + "mean_token_accuracy": 0.9295425117015839, + "num_tokens": 519656.0, + "step": 59 + }, + { + "entropy": 2.822400987148285, + "epoch": 0.2171945701357466, + "grad_norm": 0.8780149817466736, + "learning_rate": 0.000510964629029717, + "loss": 0.4371, + "mean_token_accuracy": 0.9089596569538116, + "num_tokens": 528105.0, + "step": 60 + }, + { + "entropy": 2.522100865840912, + "epoch": 0.22081447963800904, + "grad_norm": 0.51394122838974, + "learning_rate": 0.0005109563323980594, + "loss": 0.2509, + "mean_token_accuracy": 0.941976860165596, + "num_tokens": 537707.0, + "step": 61 + }, + { + "entropy": 2.6596657633781433, + "epoch": 0.2244343891402715, + "grad_norm": 0.6359816789627075, + "learning_rate": 0.0005109471625528516, + "loss": 0.3685, + "mean_token_accuracy": 0.9191890209913254, + "num_tokens": 546517.0, + "step": 62 + }, + { + "entropy": 2.800311803817749, + "epoch": 0.22805429864253393, + "grad_norm": 0.6862941980361938, + "learning_rate": 0.0005109371195289215, + "loss": 0.2457, + "mean_token_accuracy": 0.9330879002809525, + "num_tokens": 555493.0, + "step": 63 + }, + { + "entropy": 2.7235344648361206, + "epoch": 0.2316742081447964, + "grad_norm": 1.0464682579040527, + "learning_rate": 0.0005109262033644142, + "loss": 0.4417, + "mean_token_accuracy": 0.8957678377628326, + "num_tokens": 564255.0, + "step": 64 + }, + { + "entropy": 2.6643534302711487, + "epoch": 0.23529411764705882, + "grad_norm": 1.0790019035339355, + "learning_rate": 0.0005109144141007903, + "loss": 0.4947, + "mean_token_accuracy": 0.8889007717370987, + "num_tokens": 573401.0, + "step": 65 + }, + { + "entropy": 2.760925054550171, + "epoch": 0.23891402714932128, + "grad_norm": 0.7957189679145813, + "learning_rate": 0.0005109017517828273, + "loss": 0.2259, + "mean_token_accuracy": 0.944578230381012, + "num_tokens": 581905.0, + "step": 66 + }, + { + "entropy": 2.7048792839050293, + "epoch": 0.2425339366515837, + "grad_norm": 0.9530714750289917, + "learning_rate": 0.0005108882164586181, + "loss": 0.3122, + "mean_token_accuracy": 0.9257418513298035, + "num_tokens": 590802.0, + "step": 67 + }, + { + "entropy": 2.6733291149139404, + "epoch": 0.24615384615384617, + "grad_norm": 0.8295993208885193, + "learning_rate": 0.0005108738081795716, + "loss": 0.3701, + "mean_token_accuracy": 0.898589238524437, + "num_tokens": 599279.0, + "step": 68 + }, + { + "entropy": 2.5613606572151184, + "epoch": 0.2497737556561086, + "grad_norm": 0.6205935478210449, + "learning_rate": 0.0005108585270004123, + "loss": 0.4372, + "mean_token_accuracy": 0.9116007685661316, + "num_tokens": 608107.0, + "step": 69 + }, + { + "entropy": 2.458296835422516, + "epoch": 0.25339366515837103, + "grad_norm": 0.7629838585853577, + "learning_rate": 0.0005108423729791799, + "loss": 0.2307, + "mean_token_accuracy": 0.9386163502931595, + "num_tokens": 616881.0, + "step": 70 + }, + { + "entropy": 2.4176695346832275, + "epoch": 0.25701357466063346, + "grad_norm": 0.902400016784668, + "learning_rate": 0.0005108253461772298, + "loss": 0.2853, + "mean_token_accuracy": 0.9237343072891235, + "num_tokens": 625323.0, + "step": 71 + }, + { + "entropy": 2.2265281677246094, + "epoch": 0.26063348416289595, + "grad_norm": 0.7744383811950684, + "learning_rate": 0.0005108074466592316, + "loss": 0.2435, + "mean_token_accuracy": 0.9508260935544968, + "num_tokens": 634260.0, + "step": 72 + }, + { + "entropy": 2.1855952441692352, + "epoch": 0.2642533936651584, + "grad_norm": 0.8615190386772156, + "learning_rate": 0.0005107886744931702, + "loss": 0.3323, + "mean_token_accuracy": 0.9276078194379807, + "num_tokens": 643235.0, + "step": 73 + }, + { + "entropy": 2.179121494293213, + "epoch": 0.2678733031674208, + "grad_norm": 0.8953279256820679, + "learning_rate": 0.0005107690297503444, + "loss": 0.2384, + "mean_token_accuracy": 0.9425230622291565, + "num_tokens": 652032.0, + "step": 74 + }, + { + "entropy": 2.1565526127815247, + "epoch": 0.27149321266968324, + "grad_norm": 0.6830486059188843, + "learning_rate": 0.0005107485125053678, + "loss": 0.2759, + "mean_token_accuracy": 0.9360661953687668, + "num_tokens": 660978.0, + "step": 75 + }, + { + "entropy": 2.0900665521621704, + "epoch": 0.2751131221719457, + "grad_norm": 0.786665141582489, + "learning_rate": 0.0005107271228361672, + "loss": 0.4061, + "mean_token_accuracy": 0.910009115934372, + "num_tokens": 669817.0, + "step": 76 + }, + { + "entropy": 2.1311859488487244, + "epoch": 0.27873303167420815, + "grad_norm": 0.6399909853935242, + "learning_rate": 0.0005107048608239836, + "loss": 0.272, + "mean_token_accuracy": 0.9424714297056198, + "num_tokens": 678469.0, + "step": 77 + }, + { + "entropy": 2.059997320175171, + "epoch": 0.2823529411764706, + "grad_norm": 0.8114754557609558, + "learning_rate": 0.0005106817265533706, + "loss": 0.4029, + "mean_token_accuracy": 0.9037660360336304, + "num_tokens": 687261.0, + "step": 78 + }, + { + "entropy": 1.9725019037723541, + "epoch": 0.285972850678733, + "grad_norm": 0.9420941472053528, + "learning_rate": 0.0005106577201121952, + "loss": 0.535, + "mean_token_accuracy": 0.8996377140283585, + "num_tokens": 695941.0, + "step": 79 + }, + { + "entropy": 1.9951164424419403, + "epoch": 0.2895927601809955, + "grad_norm": 0.6476142406463623, + "learning_rate": 0.0005106328415916372, + "loss": 0.2242, + "mean_token_accuracy": 0.941379725933075, + "num_tokens": 704643.0, + "step": 80 + }, + { + "entropy": 1.8962564170360565, + "epoch": 0.29321266968325793, + "grad_norm": 0.5974630117416382, + "learning_rate": 0.0005106070910861881, + "loss": 0.2934, + "mean_token_accuracy": 0.9217697530984879, + "num_tokens": 713605.0, + "step": 81 + }, + { + "entropy": 1.9781515896320343, + "epoch": 0.29683257918552036, + "grad_norm": 0.8755478262901306, + "learning_rate": 0.0005105804686936518, + "loss": 0.4551, + "mean_token_accuracy": 0.9051328897476196, + "num_tokens": 722385.0, + "step": 82 + }, + { + "entropy": 1.9892418384552002, + "epoch": 0.3004524886877828, + "grad_norm": 0.6887345314025879, + "learning_rate": 0.0005105529745151433, + "loss": 0.244, + "mean_token_accuracy": 0.9261117279529572, + "num_tokens": 730962.0, + "step": 83 + }, + { + "entropy": 2.0053181648254395, + "epoch": 0.3040723981900452, + "grad_norm": 0.6930885910987854, + "learning_rate": 0.0005105246086550893, + "loss": 0.3155, + "mean_token_accuracy": 0.9206147193908691, + "num_tokens": 739499.0, + "step": 84 + }, + { + "entropy": 1.9716475903987885, + "epoch": 0.3076923076923077, + "grad_norm": 0.5049461722373962, + "learning_rate": 0.0005104953712212266, + "loss": 0.2215, + "mean_token_accuracy": 0.9608763605356216, + "num_tokens": 748604.0, + "step": 85 + }, + { + "entropy": 1.9186978042125702, + "epoch": 0.31131221719457014, + "grad_norm": 0.5756685733795166, + "learning_rate": 0.000510465262324603, + "loss": 0.2658, + "mean_token_accuracy": 0.9372887462377548, + "num_tokens": 757919.0, + "step": 86 + }, + { + "entropy": 1.9738290905952454, + "epoch": 0.31493212669683257, + "grad_norm": 0.6163789629936218, + "learning_rate": 0.0005104342820795758, + "loss": 0.2472, + "mean_token_accuracy": 0.9430449157953262, + "num_tokens": 766708.0, + "step": 87 + }, + { + "entropy": 2.1927571892738342, + "epoch": 0.318552036199095, + "grad_norm": 0.7953162789344788, + "learning_rate": 0.0005104024306038119, + "loss": 0.261, + "mean_token_accuracy": 0.9425829648971558, + "num_tokens": 774601.0, + "step": 88 + }, + { + "entropy": 2.043731451034546, + "epoch": 0.3221719457013575, + "grad_norm": 0.8098088502883911, + "learning_rate": 0.0005103697080182872, + "loss": 0.3126, + "mean_token_accuracy": 0.9158089309930801, + "num_tokens": 783170.0, + "step": 89 + }, + { + "entropy": 1.9801572561264038, + "epoch": 0.3257918552036199, + "grad_norm": 0.5227240920066833, + "learning_rate": 0.0005103361144472864, + "loss": 0.1291, + "mean_token_accuracy": 0.9666071832180023, + "num_tokens": 791769.0, + "step": 90 + }, + { + "entropy": 1.9553790986537933, + "epoch": 0.32941176470588235, + "grad_norm": 0.7819464206695557, + "learning_rate": 0.0005103016500184022, + "loss": 0.531, + "mean_token_accuracy": 0.8817111849784851, + "num_tokens": 800824.0, + "step": 91 + }, + { + "entropy": 1.9291303753852844, + "epoch": 0.3330316742081448, + "grad_norm": 0.7178757190704346, + "learning_rate": 0.0005102663148625347, + "loss": 0.3301, + "mean_token_accuracy": 0.9357631802558899, + "num_tokens": 809347.0, + "step": 92 + }, + { + "entropy": 1.9846041798591614, + "epoch": 0.33665158371040727, + "grad_norm": 1.316636085510254, + "learning_rate": 0.0005102301091138916, + "loss": 0.4241, + "mean_token_accuracy": 0.8993304669857025, + "num_tokens": 817174.0, + "step": 93 + }, + { + "entropy": 1.814637303352356, + "epoch": 0.3402714932126697, + "grad_norm": 0.5486414432525635, + "learning_rate": 0.0005101930329099865, + "loss": 0.116, + "mean_token_accuracy": 0.9674727618694305, + "num_tokens": 826177.0, + "step": 94 + }, + { + "entropy": 1.9128066003322601, + "epoch": 0.3438914027149321, + "grad_norm": 0.620303750038147, + "learning_rate": 0.00051015508639164, + "loss": 0.1833, + "mean_token_accuracy": 0.9569521993398666, + "num_tokens": 835409.0, + "step": 95 + }, + { + "entropy": 1.7541870176792145, + "epoch": 0.34751131221719456, + "grad_norm": 0.8337438702583313, + "learning_rate": 0.0005101162697029776, + "loss": 0.3327, + "mean_token_accuracy": 0.9193180054426193, + "num_tokens": 844692.0, + "step": 96 + }, + { + "entropy": 1.8255240619182587, + "epoch": 0.351131221719457, + "grad_norm": 0.877780556678772, + "learning_rate": 0.00051007658299143, + "loss": 0.2106, + "mean_token_accuracy": 0.9527023881673813, + "num_tokens": 853309.0, + "step": 97 + }, + { + "entropy": 1.8611579239368439, + "epoch": 0.3547511312217195, + "grad_norm": 1.0667716264724731, + "learning_rate": 0.0005100360264077325, + "loss": 0.3196, + "mean_token_accuracy": 0.9195879399776459, + "num_tokens": 861859.0, + "step": 98 + }, + { + "entropy": 1.821915864944458, + "epoch": 0.3583710407239819, + "grad_norm": 0.8400309681892395, + "learning_rate": 0.0005099946001059241, + "loss": 0.4036, + "mean_token_accuracy": 0.8951036781072617, + "num_tokens": 871060.0, + "step": 99 + }, + { + "entropy": 1.7648265063762665, + "epoch": 0.36199095022624433, + "grad_norm": 1.1391404867172241, + "learning_rate": 0.0005099523042433472, + "loss": 0.389, + "mean_token_accuracy": 0.901309460401535, + "num_tokens": 880593.0, + "step": 100 + }, + { + "entropy": 1.8506875336170197, + "epoch": 0.36561085972850677, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.000509909138980647, + "loss": 0.2504, + "mean_token_accuracy": 0.9384842216968536, + "num_tokens": 889739.0, + "step": 101 + }, + { + "entropy": 1.9311015605926514, + "epoch": 0.36923076923076925, + "grad_norm": 0.9677391052246094, + "learning_rate": 0.0005098651044817704, + "loss": 0.6953, + "mean_token_accuracy": 0.8752655684947968, + "num_tokens": 898992.0, + "step": 102 + }, + { + "entropy": 1.9590983986854553, + "epoch": 0.3728506787330317, + "grad_norm": 0.6364567279815674, + "learning_rate": 0.0005098202009139663, + "loss": 0.4318, + "mean_token_accuracy": 0.9056479930877686, + "num_tokens": 908225.0, + "step": 103 + }, + { + "entropy": 1.9455370008945465, + "epoch": 0.3764705882352941, + "grad_norm": 0.6747863292694092, + "learning_rate": 0.0005097744284477839, + "loss": 0.244, + "mean_token_accuracy": 0.9428392052650452, + "num_tokens": 917134.0, + "step": 104 + }, + { + "entropy": 1.8632825911045074, + "epoch": 0.38009049773755654, + "grad_norm": 0.5705651044845581, + "learning_rate": 0.0005097277872570731, + "loss": 0.2508, + "mean_token_accuracy": 0.9325222969055176, + "num_tokens": 926573.0, + "step": 105 + }, + { + "entropy": 1.9370323717594147, + "epoch": 0.38371040723981903, + "grad_norm": 0.6298627853393555, + "learning_rate": 0.000509680277518983, + "loss": 0.2481, + "mean_token_accuracy": 0.9281332045793533, + "num_tokens": 935853.0, + "step": 106 + }, + { + "entropy": 2.0217572450637817, + "epoch": 0.38733031674208146, + "grad_norm": 0.5434353947639465, + "learning_rate": 0.0005096318994139617, + "loss": 0.1809, + "mean_token_accuracy": 0.9592084139585495, + "num_tokens": 944279.0, + "step": 107 + }, + { + "entropy": 1.9619770646095276, + "epoch": 0.3909502262443439, + "grad_norm": 0.6959638595581055, + "learning_rate": 0.0005095826531257552, + "loss": 0.1376, + "mean_token_accuracy": 0.9608310014009476, + "num_tokens": 953336.0, + "step": 108 + }, + { + "entropy": 2.12511146068573, + "epoch": 0.3945701357466063, + "grad_norm": 1.0152848958969116, + "learning_rate": 0.0005095325388414074, + "loss": 0.4382, + "mean_token_accuracy": 0.915201798081398, + "num_tokens": 962002.0, + "step": 109 + }, + { + "entropy": 2.0171878039836884, + "epoch": 0.39819004524886875, + "grad_norm": 0.8337467312812805, + "learning_rate": 0.0005094815567512587, + "loss": 0.2672, + "mean_token_accuracy": 0.9313560128211975, + "num_tokens": 970954.0, + "step": 110 + }, + { + "entropy": 2.1024146378040314, + "epoch": 0.40180995475113124, + "grad_norm": 0.8214333057403564, + "learning_rate": 0.0005094297070489455, + "loss": 0.3146, + "mean_token_accuracy": 0.9289091974496841, + "num_tokens": 979929.0, + "step": 111 + }, + { + "entropy": 2.260519325733185, + "epoch": 0.40542986425339367, + "grad_norm": 1.1298810243606567, + "learning_rate": 0.0005093769899313996, + "loss": 0.3055, + "mean_token_accuracy": 0.9213490188121796, + "num_tokens": 988477.0, + "step": 112 + }, + { + "entropy": 2.2228699326515198, + "epoch": 0.4090497737556561, + "grad_norm": 0.8601953983306885, + "learning_rate": 0.0005093234055988475, + "loss": 0.2738, + "mean_token_accuracy": 0.920888364315033, + "num_tokens": 997091.0, + "step": 113 + }, + { + "entropy": 2.2165185809135437, + "epoch": 0.41266968325791853, + "grad_norm": 0.6331561803817749, + "learning_rate": 0.0005092689542548091, + "loss": 0.2241, + "mean_token_accuracy": 0.9408514499664307, + "num_tokens": 1005866.0, + "step": 114 + }, + { + "entropy": 2.324040472507477, + "epoch": 0.416289592760181, + "grad_norm": 0.680496096611023, + "learning_rate": 0.0005092136361060975, + "loss": 0.2454, + "mean_token_accuracy": 0.9433349967002869, + "num_tokens": 1014277.0, + "step": 115 + }, + { + "entropy": 2.413789749145508, + "epoch": 0.41990950226244345, + "grad_norm": 0.7489557862281799, + "learning_rate": 0.0005091574513628183, + "loss": 0.2856, + "mean_token_accuracy": 0.934124082326889, + "num_tokens": 1023032.0, + "step": 116 + }, + { + "entropy": 2.4693005681037903, + "epoch": 0.4235294117647059, + "grad_norm": 0.6842612624168396, + "learning_rate": 0.0005091004002383682, + "loss": 0.2778, + "mean_token_accuracy": 0.9386793673038483, + "num_tokens": 1031883.0, + "step": 117 + }, + { + "entropy": 2.4351969361305237, + "epoch": 0.4271493212669683, + "grad_norm": 0.9150674343109131, + "learning_rate": 0.0005090424829494347, + "loss": 0.3151, + "mean_token_accuracy": 0.9177709072828293, + "num_tokens": 1040985.0, + "step": 118 + }, + { + "entropy": 2.5141562819480896, + "epoch": 0.4307692307692308, + "grad_norm": 1.0200655460357666, + "learning_rate": 0.000508983699715995, + "loss": 0.5134, + "mean_token_accuracy": 0.8835459351539612, + "num_tokens": 1049949.0, + "step": 119 + }, + { + "entropy": 2.479240596294403, + "epoch": 0.4343891402714932, + "grad_norm": 0.783278226852417, + "learning_rate": 0.0005089240507613151, + "loss": 0.2745, + "mean_token_accuracy": 0.9389322698116302, + "num_tokens": 1058953.0, + "step": 120 + }, + { + "entropy": 2.457803785800934, + "epoch": 0.43800904977375565, + "grad_norm": 0.7620834112167358, + "learning_rate": 0.0005088635363119497, + "loss": 0.3394, + "mean_token_accuracy": 0.9145695865154266, + "num_tokens": 1068624.0, + "step": 121 + }, + { + "entropy": 2.4909247756004333, + "epoch": 0.4416289592760181, + "grad_norm": 0.5868712067604065, + "learning_rate": 0.0005088021565977403, + "loss": 0.1726, + "mean_token_accuracy": 0.9567564129829407, + "num_tokens": 1077686.0, + "step": 122 + }, + { + "entropy": 2.5540462732315063, + "epoch": 0.4452488687782805, + "grad_norm": 1.1467291116714478, + "learning_rate": 0.0005087399118518148, + "loss": 0.2617, + "mean_token_accuracy": 0.9329706132411957, + "num_tokens": 1086230.0, + "step": 123 + }, + { + "entropy": 2.377680242061615, + "epoch": 0.448868778280543, + "grad_norm": 0.7021825909614563, + "learning_rate": 0.0005086768023105866, + "loss": 0.4124, + "mean_token_accuracy": 0.9093360006809235, + "num_tokens": 1095867.0, + "step": 124 + }, + { + "entropy": 2.55239599943161, + "epoch": 0.45248868778280543, + "grad_norm": 0.5947801470756531, + "learning_rate": 0.0005086128282137538, + "loss": 0.2752, + "mean_token_accuracy": 0.9248816668987274, + "num_tokens": 1105003.0, + "step": 125 + }, + { + "entropy": 2.4695483446121216, + "epoch": 0.45610859728506786, + "grad_norm": 1.345604658126831, + "learning_rate": 0.0005085479898042985, + "loss": 0.2577, + "mean_token_accuracy": 0.9318550229072571, + "num_tokens": 1114162.0, + "step": 126 + }, + { + "entropy": 2.4898732900619507, + "epoch": 0.4597285067873303, + "grad_norm": 0.8534179329872131, + "learning_rate": 0.0005084822873284848, + "loss": 0.3013, + "mean_token_accuracy": 0.9195661097764969, + "num_tokens": 1123457.0, + "step": 127 + }, + { + "entropy": 2.5951223969459534, + "epoch": 0.4633484162895928, + "grad_norm": 1.1677368879318237, + "learning_rate": 0.0005084157210358592, + "loss": 0.1612, + "mean_token_accuracy": 0.9599333852529526, + "num_tokens": 1131774.0, + "step": 128 + }, + { + "entropy": 2.7315847873687744, + "epoch": 0.4669683257918552, + "grad_norm": 0.7633224129676819, + "learning_rate": 0.0005083482911792492, + "loss": 0.2437, + "mean_token_accuracy": 0.9487509876489639, + "num_tokens": 1140301.0, + "step": 129 + }, + { + "entropy": 2.6348633766174316, + "epoch": 0.47058823529411764, + "grad_norm": 0.7573317885398865, + "learning_rate": 0.0005082799980147617, + "loss": 0.2426, + "mean_token_accuracy": 0.947308748960495, + "num_tokens": 1148929.0, + "step": 130 + }, + { + "entropy": 2.60002738237381, + "epoch": 0.47420814479638007, + "grad_norm": 1.8195319175720215, + "learning_rate": 0.0005082108418017829, + "loss": 0.1792, + "mean_token_accuracy": 0.9512491375207901, + "num_tokens": 1157682.0, + "step": 131 + }, + { + "entropy": 2.5319923162460327, + "epoch": 0.47782805429864256, + "grad_norm": 0.6342993378639221, + "learning_rate": 0.0005081408228029771, + "loss": 0.1843, + "mean_token_accuracy": 0.9440758228302002, + "num_tokens": 1166687.0, + "step": 132 + }, + { + "entropy": 2.5666881799697876, + "epoch": 0.481447963800905, + "grad_norm": 0.8979415893554688, + "learning_rate": 0.0005080699412842852, + "loss": 0.4824, + "mean_token_accuracy": 0.8837443292140961, + "num_tokens": 1175746.0, + "step": 133 + }, + { + "entropy": 2.6854636669158936, + "epoch": 0.4850678733031674, + "grad_norm": 0.8302125334739685, + "learning_rate": 0.0005079981975149243, + "loss": 0.267, + "mean_token_accuracy": 0.9279022663831711, + "num_tokens": 1184196.0, + "step": 134 + }, + { + "entropy": 2.564552128314972, + "epoch": 0.48868778280542985, + "grad_norm": 0.6785959005355835, + "learning_rate": 0.0005079255917673863, + "loss": 0.2031, + "mean_token_accuracy": 0.9463823586702347, + "num_tokens": 1192982.0, + "step": 135 + }, + { + "entropy": 2.673682928085327, + "epoch": 0.49230769230769234, + "grad_norm": 1.4760410785675049, + "learning_rate": 0.0005078521243174371, + "loss": 0.4791, + "mean_token_accuracy": 0.8969505727291107, + "num_tokens": 1201454.0, + "step": 136 + }, + { + "entropy": 2.6232714653015137, + "epoch": 0.49592760180995477, + "grad_norm": 0.7845668792724609, + "learning_rate": 0.0005077777954441157, + "loss": 0.2472, + "mean_token_accuracy": 0.9404618591070175, + "num_tokens": 1210182.0, + "step": 137 + }, + { + "entropy": 2.5614060163497925, + "epoch": 0.4995475113122172, + "grad_norm": 0.725419819355011, + "learning_rate": 0.0005077026054297322, + "loss": 0.3643, + "mean_token_accuracy": 0.9193316847085953, + "num_tokens": 1219487.0, + "step": 138 + }, + { + "entropy": 2.5907246470451355, + "epoch": 0.5031674208144796, + "grad_norm": 0.7741782665252686, + "learning_rate": 0.0005076265545598682, + "loss": 0.276, + "mean_token_accuracy": 0.9447730481624603, + "num_tokens": 1228066.0, + "step": 139 + }, + { + "entropy": 2.531104028224945, + "epoch": 0.5067873303167421, + "grad_norm": 0.680992603302002, + "learning_rate": 0.0005075496431233745, + "loss": 0.2004, + "mean_token_accuracy": 0.9470729678869247, + "num_tokens": 1236980.0, + "step": 140 + }, + { + "entropy": 2.590231478214264, + "epoch": 0.5104072398190045, + "grad_norm": 0.8260406255722046, + "learning_rate": 0.0005074718714123704, + "loss": 0.2756, + "mean_token_accuracy": 0.9301882535219193, + "num_tokens": 1245565.0, + "step": 141 + }, + { + "entropy": 2.4858668446540833, + "epoch": 0.5140271493212669, + "grad_norm": 0.8085922598838806, + "learning_rate": 0.0005073932397222429, + "loss": 0.2314, + "mean_token_accuracy": 0.9449103325605392, + "num_tokens": 1254366.0, + "step": 142 + }, + { + "entropy": 2.5374304056167603, + "epoch": 0.5176470588235295, + "grad_norm": 0.7858129143714905, + "learning_rate": 0.0005073137483516452, + "loss": 0.1622, + "mean_token_accuracy": 0.9510673582553864, + "num_tokens": 1263197.0, + "step": 143 + }, + { + "entropy": 2.608425199985504, + "epoch": 0.5212669683257919, + "grad_norm": 1.2698506116867065, + "learning_rate": 0.0005072333976024957, + "loss": 0.1729, + "mean_token_accuracy": 0.9509973376989365, + "num_tokens": 1271725.0, + "step": 144 + }, + { + "entropy": 2.437038242816925, + "epoch": 0.5248868778280543, + "grad_norm": 1.0788538455963135, + "learning_rate": 0.0005071521877799765, + "loss": 0.3344, + "mean_token_accuracy": 0.9166721999645233, + "num_tokens": 1280963.0, + "step": 145 + }, + { + "entropy": 2.589951515197754, + "epoch": 0.5285067873303168, + "grad_norm": 0.9228294491767883, + "learning_rate": 0.0005070701191925332, + "loss": 0.3095, + "mean_token_accuracy": 0.9239777624607086, + "num_tokens": 1289683.0, + "step": 146 + }, + { + "entropy": 2.575794994831085, + "epoch": 0.5321266968325792, + "grad_norm": 1.359767198562622, + "learning_rate": 0.0005069871921518726, + "loss": 0.2447, + "mean_token_accuracy": 0.9374738186597824, + "num_tokens": 1298397.0, + "step": 147 + }, + { + "entropy": 2.5628358721733093, + "epoch": 0.5357466063348416, + "grad_norm": 0.9870713353157043, + "learning_rate": 0.000506903406972962, + "loss": 0.4824, + "mean_token_accuracy": 0.9027767181396484, + "num_tokens": 1307191.0, + "step": 148 + }, + { + "entropy": 2.5513240098953247, + "epoch": 0.539366515837104, + "grad_norm": 0.7921387553215027, + "learning_rate": 0.0005068187639740286, + "loss": 0.3278, + "mean_token_accuracy": 0.9161934554576874, + "num_tokens": 1315878.0, + "step": 149 + }, + { + "entropy": 2.526439070701599, + "epoch": 0.5429864253393665, + "grad_norm": 0.6320391297340393, + "learning_rate": 0.000506733263476557, + "loss": 0.1701, + "mean_token_accuracy": 0.9575318098068237, + "num_tokens": 1324786.0, + "step": 150 + }, + { + "entropy": 2.4837265014648438, + "epoch": 0.5466063348416289, + "grad_norm": 0.5369354486465454, + "learning_rate": 0.000506646905805289, + "loss": 0.1328, + "mean_token_accuracy": 0.9636050164699554, + "num_tokens": 1333766.0, + "step": 151 + }, + { + "entropy": 2.5264737010002136, + "epoch": 0.5502262443438914, + "grad_norm": 0.7346852421760559, + "learning_rate": 0.0005065596912882222, + "loss": 0.2012, + "mean_token_accuracy": 0.9448132663965225, + "num_tokens": 1343004.0, + "step": 152 + }, + { + "entropy": 2.569309651851654, + "epoch": 0.5538461538461539, + "grad_norm": 0.9926508069038391, + "learning_rate": 0.0005064716202566082, + "loss": 0.2831, + "mean_token_accuracy": 0.9332023113965988, + "num_tokens": 1351561.0, + "step": 153 + }, + { + "entropy": 2.3148274421691895, + "epoch": 0.5574660633484163, + "grad_norm": 0.6301954984664917, + "learning_rate": 0.0005063826930449523, + "loss": 0.3622, + "mean_token_accuracy": 0.9349419325590134, + "num_tokens": 1360997.0, + "step": 154 + }, + { + "entropy": 2.497675657272339, + "epoch": 0.5610859728506787, + "grad_norm": 0.8846175670623779, + "learning_rate": 0.000506292909991011, + "loss": 0.2314, + "mean_token_accuracy": 0.9468862265348434, + "num_tokens": 1369600.0, + "step": 155 + }, + { + "entropy": 2.313987612724304, + "epoch": 0.5647058823529412, + "grad_norm": 0.5701894164085388, + "learning_rate": 0.0005062022714357922, + "loss": 0.2154, + "mean_token_accuracy": 0.945093959569931, + "num_tokens": 1379125.0, + "step": 156 + }, + { + "entropy": 2.4019755125045776, + "epoch": 0.5683257918552036, + "grad_norm": 0.8769335746765137, + "learning_rate": 0.0005061107777235524, + "loss": 0.3565, + "mean_token_accuracy": 0.9133864492177963, + "num_tokens": 1388111.0, + "step": 157 + }, + { + "entropy": 2.3127577900886536, + "epoch": 0.571945701357466, + "grad_norm": 1.1026453971862793, + "learning_rate": 0.0005060184292017965, + "loss": 0.2897, + "mean_token_accuracy": 0.899736076593399, + "num_tokens": 1397528.0, + "step": 158 + }, + { + "entropy": 2.2682697772979736, + "epoch": 0.5755656108597285, + "grad_norm": 0.5426591038703918, + "learning_rate": 0.000505925226221276, + "loss": 0.167, + "mean_token_accuracy": 0.9609879851341248, + "num_tokens": 1406809.0, + "step": 159 + }, + { + "entropy": 2.4639336466789246, + "epoch": 0.579185520361991, + "grad_norm": 0.6552363038063049, + "learning_rate": 0.0005058311691359875, + "loss": 0.2511, + "mean_token_accuracy": 0.9355164766311646, + "num_tokens": 1415498.0, + "step": 160 + }, + { + "entropy": 2.467900663614273, + "epoch": 0.5828054298642534, + "grad_norm": 0.7168154120445251, + "learning_rate": 0.000505736258303172, + "loss": 0.234, + "mean_token_accuracy": 0.9450509995222092, + "num_tokens": 1424524.0, + "step": 161 + }, + { + "entropy": 2.3683157563209534, + "epoch": 0.5864253393665159, + "grad_norm": 0.6433501839637756, + "learning_rate": 0.0005056404940833128, + "loss": 0.3441, + "mean_token_accuracy": 0.9261108189821243, + "num_tokens": 1434194.0, + "step": 162 + }, + { + "entropy": 2.4686295986175537, + "epoch": 0.5900452488687783, + "grad_norm": 0.9615177512168884, + "learning_rate": 0.0005055438768401348, + "loss": 0.1492, + "mean_token_accuracy": 0.966903567314148, + "num_tokens": 1442972.0, + "step": 163 + }, + { + "entropy": 2.5551892518997192, + "epoch": 0.5936651583710407, + "grad_norm": 0.4957484006881714, + "learning_rate": 0.0005054464069406023, + "loss": 0.1242, + "mean_token_accuracy": 0.969713419675827, + "num_tokens": 1451324.0, + "step": 164 + }, + { + "entropy": 2.554121434688568, + "epoch": 0.5972850678733032, + "grad_norm": 0.7399498224258423, + "learning_rate": 0.0005053480847549187, + "loss": 0.206, + "mean_token_accuracy": 0.9498797357082367, + "num_tokens": 1459698.0, + "step": 165 + }, + { + "entropy": 2.5181015729904175, + "epoch": 0.6009049773755656, + "grad_norm": 0.7433251142501831, + "learning_rate": 0.0005052489106565241, + "loss": 0.2883, + "mean_token_accuracy": 0.9419967085123062, + "num_tokens": 1468460.0, + "step": 166 + }, + { + "entropy": 2.3073930144309998, + "epoch": 0.604524886877828, + "grad_norm": 0.5920398831367493, + "learning_rate": 0.0005051488850220941, + "loss": 0.197, + "mean_token_accuracy": 0.952111005783081, + "num_tokens": 1477579.0, + "step": 167 + }, + { + "entropy": 2.532376289367676, + "epoch": 0.6081447963800904, + "grad_norm": 0.7033098936080933, + "learning_rate": 0.0005050480082315392, + "loss": 0.2122, + "mean_token_accuracy": 0.9488633275032043, + "num_tokens": 1486307.0, + "step": 168 + }, + { + "entropy": 2.397290349006653, + "epoch": 0.611764705882353, + "grad_norm": 0.8026869893074036, + "learning_rate": 0.0005049462806680021, + "loss": 0.2541, + "mean_token_accuracy": 0.9427233040332794, + "num_tokens": 1495152.0, + "step": 169 + }, + { + "entropy": 2.464823842048645, + "epoch": 0.6153846153846154, + "grad_norm": 0.6508225798606873, + "learning_rate": 0.0005048437027178571, + "loss": 0.2639, + "mean_token_accuracy": 0.9391255974769592, + "num_tokens": 1503903.0, + "step": 170 + }, + { + "entropy": 2.520734131336212, + "epoch": 0.6190045248868778, + "grad_norm": 0.8373616337776184, + "learning_rate": 0.0005047402747707084, + "loss": 0.3078, + "mean_token_accuracy": 0.9302930980920792, + "num_tokens": 1512588.0, + "step": 171 + }, + { + "entropy": 2.388108015060425, + "epoch": 0.6226244343891403, + "grad_norm": 0.6334089636802673, + "learning_rate": 0.0005046359972193884, + "loss": 0.1372, + "mean_token_accuracy": 0.9666119515895844, + "num_tokens": 1522011.0, + "step": 172 + }, + { + "entropy": 2.537126660346985, + "epoch": 0.6262443438914027, + "grad_norm": 0.7665116190910339, + "learning_rate": 0.0005045308704599566, + "loss": 0.2603, + "mean_token_accuracy": 0.9350012242794037, + "num_tokens": 1530767.0, + "step": 173 + }, + { + "entropy": 2.567205488681793, + "epoch": 0.6298642533936651, + "grad_norm": 0.8043875098228455, + "learning_rate": 0.0005044248948916977, + "loss": 0.2497, + "mean_token_accuracy": 0.9400482773780823, + "num_tokens": 1539971.0, + "step": 174 + }, + { + "entropy": 2.585887610912323, + "epoch": 0.6334841628959276, + "grad_norm": 0.5282150506973267, + "learning_rate": 0.0005043180709171206, + "loss": 0.1126, + "mean_token_accuracy": 0.9680279046297073, + "num_tokens": 1548971.0, + "step": 175 + }, + { + "entropy": 2.4289392232894897, + "epoch": 0.63710407239819, + "grad_norm": 0.6838382482528687, + "learning_rate": 0.0005042103989419563, + "loss": 0.2076, + "mean_token_accuracy": 0.9468046277761459, + "num_tokens": 1558403.0, + "step": 176 + }, + { + "entropy": 2.6080575585365295, + "epoch": 0.6407239819004525, + "grad_norm": 0.9058650732040405, + "learning_rate": 0.0005041018793751566, + "loss": 0.1781, + "mean_token_accuracy": 0.9432647377252579, + "num_tokens": 1567209.0, + "step": 177 + }, + { + "entropy": 2.5212480425834656, + "epoch": 0.644343891402715, + "grad_norm": 0.796381950378418, + "learning_rate": 0.0005039925126288929, + "loss": 0.2286, + "mean_token_accuracy": 0.9305787235498428, + "num_tokens": 1576255.0, + "step": 178 + }, + { + "entropy": 2.588195264339447, + "epoch": 0.6479638009049774, + "grad_norm": 0.6489388942718506, + "learning_rate": 0.0005038822991185536, + "loss": 0.1717, + "mean_token_accuracy": 0.9572225511074066, + "num_tokens": 1585335.0, + "step": 179 + }, + { + "entropy": 2.609215259552002, + "epoch": 0.6515837104072398, + "grad_norm": 0.8551130294799805, + "learning_rate": 0.0005037712392627441, + "loss": 0.2358, + "mean_token_accuracy": 0.9529621452093124, + "num_tokens": 1594354.0, + "step": 180 + }, + { + "entropy": 2.4199504256248474, + "epoch": 0.6552036199095023, + "grad_norm": 0.5775637030601501, + "learning_rate": 0.0005036593334832836, + "loss": 0.2402, + "mean_token_accuracy": 0.9437069743871689, + "num_tokens": 1603750.0, + "step": 181 + }, + { + "entropy": 2.516424596309662, + "epoch": 0.6588235294117647, + "grad_norm": 0.6967942118644714, + "learning_rate": 0.0005035465822052047, + "loss": 0.1624, + "mean_token_accuracy": 0.9518167823553085, + "num_tokens": 1612474.0, + "step": 182 + }, + { + "entropy": 2.463354170322418, + "epoch": 0.6624434389140271, + "grad_norm": 0.49672600626945496, + "learning_rate": 0.000503432985856751, + "loss": 0.1654, + "mean_token_accuracy": 0.9564716964960098, + "num_tokens": 1621563.0, + "step": 183 + }, + { + "entropy": 2.4456416964530945, + "epoch": 0.6660633484162896, + "grad_norm": 0.6207183003425598, + "learning_rate": 0.000503318544869376, + "loss": 0.1918, + "mean_token_accuracy": 0.9476529806852341, + "num_tokens": 1630801.0, + "step": 184 + }, + { + "entropy": 2.641440451145172, + "epoch": 0.669683257918552, + "grad_norm": 1.220821499824524, + "learning_rate": 0.000503203259677741, + "loss": 0.4019, + "mean_token_accuracy": 0.9172120243310928, + "num_tokens": 1639522.0, + "step": 185 + }, + { + "entropy": 2.6447275280952454, + "epoch": 0.6733031674208145, + "grad_norm": 0.7546490430831909, + "learning_rate": 0.000503087130719714, + "loss": 0.2484, + "mean_token_accuracy": 0.9387800246477127, + "num_tokens": 1647964.0, + "step": 186 + }, + { + "entropy": 2.4657886028289795, + "epoch": 0.676923076923077, + "grad_norm": 0.7679230570793152, + "learning_rate": 0.0005029701584363675, + "loss": 0.2659, + "mean_token_accuracy": 0.930300235748291, + "num_tokens": 1657181.0, + "step": 187 + }, + { + "entropy": 2.37973552942276, + "epoch": 0.6805429864253394, + "grad_norm": 0.7473414540290833, + "learning_rate": 0.0005028523432719772, + "loss": 0.32, + "mean_token_accuracy": 0.9233052879571915, + "num_tokens": 1666477.0, + "step": 188 + }, + { + "entropy": 2.5238219499588013, + "epoch": 0.6841628959276018, + "grad_norm": 0.5573673248291016, + "learning_rate": 0.0005027336856740201, + "loss": 0.1846, + "mean_token_accuracy": 0.9445535093545914, + "num_tokens": 1675002.0, + "step": 189 + }, + { + "entropy": 2.456815242767334, + "epoch": 0.6877828054298643, + "grad_norm": 0.47237634658813477, + "learning_rate": 0.0005026141860931728, + "loss": 0.1065, + "mean_token_accuracy": 0.964375838637352, + "num_tokens": 1683623.0, + "step": 190 + }, + { + "entropy": 2.548456132411957, + "epoch": 0.6914027149321267, + "grad_norm": 0.7699162364006042, + "learning_rate": 0.00050249384498331, + "loss": 0.1985, + "mean_token_accuracy": 0.9438774734735489, + "num_tokens": 1691718.0, + "step": 191 + }, + { + "entropy": 2.4514941573143005, + "epoch": 0.6950226244343891, + "grad_norm": 1.4113538265228271, + "learning_rate": 0.0005023726628015027, + "loss": 0.4541, + "mean_token_accuracy": 0.9207872897386551, + "num_tokens": 1699824.0, + "step": 192 + }, + { + "entropy": 2.2560824751853943, + "epoch": 0.6986425339366515, + "grad_norm": 0.6007948517799377, + "learning_rate": 0.0005022506400080161, + "loss": 0.1871, + "mean_token_accuracy": 0.9502484053373337, + "num_tokens": 1708722.0, + "step": 193 + }, + { + "entropy": 2.1833614110946655, + "epoch": 0.702262443438914, + "grad_norm": 0.7005489468574524, + "learning_rate": 0.0005021277770663082, + "loss": 0.2222, + "mean_token_accuracy": 0.9386974722146988, + "num_tokens": 1717592.0, + "step": 194 + }, + { + "entropy": 2.2031923830509186, + "epoch": 0.7058823529411765, + "grad_norm": 0.5830584764480591, + "learning_rate": 0.0005020040744430284, + "loss": 0.1106, + "mean_token_accuracy": 0.9719562232494354, + "num_tokens": 1726149.0, + "step": 195 + }, + { + "entropy": 2.199785351753235, + "epoch": 0.709502262443439, + "grad_norm": 0.7465847134590149, + "learning_rate": 0.0005018795326080149, + "loss": 0.1935, + "mean_token_accuracy": 0.9497270882129669, + "num_tokens": 1734541.0, + "step": 196 + }, + { + "entropy": 2.1103186309337616, + "epoch": 0.7131221719457014, + "grad_norm": 1.0782264471054077, + "learning_rate": 0.0005017541520342934, + "loss": 0.2895, + "mean_token_accuracy": 0.9274258464574814, + "num_tokens": 1743722.0, + "step": 197 + }, + { + "entropy": 2.2248528599739075, + "epoch": 0.7167420814479638, + "grad_norm": 0.6409780979156494, + "learning_rate": 0.0005016279331980754, + "loss": 0.1425, + "mean_token_accuracy": 0.96550352871418, + "num_tokens": 1752156.0, + "step": 198 + }, + { + "entropy": 2.19924658536911, + "epoch": 0.7203619909502262, + "grad_norm": 0.7019934058189392, + "learning_rate": 0.0005015008765787561, + "loss": 0.1969, + "mean_token_accuracy": 0.9429282248020172, + "num_tokens": 1760978.0, + "step": 199 + }, + { + "entropy": 2.297484815120697, + "epoch": 0.7239819004524887, + "grad_norm": 0.7826490998268127, + "learning_rate": 0.0005013729826589127, + "loss": 0.2399, + "mean_token_accuracy": 0.9416657984256744, + "num_tokens": 1769533.0, + "step": 200 + }, + { + "entropy": 2.2471498548984528, + "epoch": 0.7276018099547511, + "grad_norm": 0.621566891670227, + "learning_rate": 0.0005012442519243027, + "loss": 0.1876, + "mean_token_accuracy": 0.9460793286561966, + "num_tokens": 1778286.0, + "step": 201 + }, + { + "entropy": 2.2212815284729004, + "epoch": 0.7312217194570135, + "grad_norm": 0.622283935546875, + "learning_rate": 0.0005011146848638616, + "loss": 0.1617, + "mean_token_accuracy": 0.9482609927654266, + "num_tokens": 1787392.0, + "step": 202 + }, + { + "entropy": 2.308752655982971, + "epoch": 0.7348416289592761, + "grad_norm": 0.7263973355293274, + "learning_rate": 0.0005009842819697018, + "loss": 0.2043, + "mean_token_accuracy": 0.9378403723239899, + "num_tokens": 1796133.0, + "step": 203 + }, + { + "entropy": 2.3376497626304626, + "epoch": 0.7384615384615385, + "grad_norm": 0.5493630766868591, + "learning_rate": 0.0005008530437371101, + "loss": 0.1145, + "mean_token_accuracy": 0.970586434006691, + "num_tokens": 1804769.0, + "step": 204 + }, + { + "entropy": 2.373005509376526, + "epoch": 0.7420814479638009, + "grad_norm": 0.6313483119010925, + "learning_rate": 0.0005007209706645461, + "loss": 0.2183, + "mean_token_accuracy": 0.9472708404064178, + "num_tokens": 1813364.0, + "step": 205 + }, + { + "entropy": 2.468949854373932, + "epoch": 0.7457013574660634, + "grad_norm": 1.0125588178634644, + "learning_rate": 0.00050058806325364, + "loss": 0.2225, + "mean_token_accuracy": 0.9351322948932648, + "num_tokens": 1822149.0, + "step": 206 + }, + { + "entropy": 2.2420623898506165, + "epoch": 0.7493212669683258, + "grad_norm": 0.913761556148529, + "learning_rate": 0.0005004543220091911, + "loss": 0.2386, + "mean_token_accuracy": 0.9453927427530289, + "num_tokens": 1831533.0, + "step": 207 + }, + { + "entropy": 2.2966006994247437, + "epoch": 0.7529411764705882, + "grad_norm": 0.7386876940727234, + "learning_rate": 0.0005003197474391658, + "loss": 0.1768, + "mean_token_accuracy": 0.949826255440712, + "num_tokens": 1840157.0, + "step": 208 + }, + { + "entropy": 2.306001305580139, + "epoch": 0.7565610859728507, + "grad_norm": 0.8900741338729858, + "learning_rate": 0.0005001843400546955, + "loss": 0.2899, + "mean_token_accuracy": 0.9241485595703125, + "num_tokens": 1848898.0, + "step": 209 + }, + { + "entropy": 2.117514967918396, + "epoch": 0.7601809954751131, + "grad_norm": 0.644622802734375, + "learning_rate": 0.0005000481003700746, + "loss": 0.2714, + "mean_token_accuracy": 0.9299416691064835, + "num_tokens": 1858330.0, + "step": 210 + }, + { + "entropy": 2.3768392205238342, + "epoch": 0.7638009049773755, + "grad_norm": 0.9724471569061279, + "learning_rate": 0.0004999110289027587, + "loss": 0.1633, + "mean_token_accuracy": 0.9550061523914337, + "num_tokens": 1866806.0, + "step": 211 + }, + { + "entropy": 2.090679556131363, + "epoch": 0.7674208144796381, + "grad_norm": 0.5419518351554871, + "learning_rate": 0.0004997731261733628, + "loss": 0.1369, + "mean_token_accuracy": 0.9619670957326889, + "num_tokens": 1875937.0, + "step": 212 + }, + { + "entropy": 2.099909245967865, + "epoch": 0.7710407239819005, + "grad_norm": 0.6858121752738953, + "learning_rate": 0.0004996343927056592, + "loss": 0.1633, + "mean_token_accuracy": 0.9528832882642746, + "num_tokens": 1885145.0, + "step": 213 + }, + { + "entropy": 2.130059242248535, + "epoch": 0.7746606334841629, + "grad_norm": 0.7691065073013306, + "learning_rate": 0.000499494829026575, + "loss": 0.348, + "mean_token_accuracy": 0.9162366837263107, + "num_tokens": 1894255.0, + "step": 214 + }, + { + "entropy": 2.191373586654663, + "epoch": 0.7782805429864253, + "grad_norm": 0.7427324652671814, + "learning_rate": 0.000499354435666191, + "loss": 0.3373, + "mean_token_accuracy": 0.9311849176883698, + "num_tokens": 1902981.0, + "step": 215 + }, + { + "entropy": 2.1425398886203766, + "epoch": 0.7819004524886878, + "grad_norm": 0.6410383582115173, + "learning_rate": 0.0004992132131577392, + "loss": 0.2079, + "mean_token_accuracy": 0.949742391705513, + "num_tokens": 1912253.0, + "step": 216 + }, + { + "entropy": 2.1396586298942566, + "epoch": 0.7855203619909502, + "grad_norm": 0.5689850449562073, + "learning_rate": 0.0004990711620376003, + "loss": 0.1999, + "mean_token_accuracy": 0.946034774184227, + "num_tokens": 1921409.0, + "step": 217 + }, + { + "entropy": 2.2237865328788757, + "epoch": 0.7891402714932126, + "grad_norm": 0.6408923864364624, + "learning_rate": 0.0004989282828453029, + "loss": 0.2452, + "mean_token_accuracy": 0.9510752111673355, + "num_tokens": 1930397.0, + "step": 218 + }, + { + "entropy": 2.234771251678467, + "epoch": 0.7927601809954751, + "grad_norm": 0.751447856426239, + "learning_rate": 0.0004987845761235203, + "loss": 0.3057, + "mean_token_accuracy": 0.9217256307601929, + "num_tokens": 1939172.0, + "step": 219 + }, + { + "entropy": 2.2653815746307373, + "epoch": 0.7963800904977375, + "grad_norm": 0.751455545425415, + "learning_rate": 0.0004986400424180688, + "loss": 0.3245, + "mean_token_accuracy": 0.9256318956613541, + "num_tokens": 1947979.0, + "step": 220 + }, + { + "entropy": 2.3123483061790466, + "epoch": 0.8, + "grad_norm": 0.5939492583274841, + "learning_rate": 0.0004984946822779061, + "loss": 0.2429, + "mean_token_accuracy": 0.9333402067422867, + "num_tokens": 1956814.0, + "step": 221 + }, + { + "entropy": 2.3289234042167664, + "epoch": 0.8036199095022625, + "grad_norm": 0.5591994524002075, + "learning_rate": 0.0004983484962551284, + "loss": 0.1507, + "mean_token_accuracy": 0.96376833319664, + "num_tokens": 1965641.0, + "step": 222 + }, + { + "entropy": 2.4314023852348328, + "epoch": 0.8072398190045249, + "grad_norm": 0.5805783271789551, + "learning_rate": 0.0004982014849049687, + "loss": 0.2049, + "mean_token_accuracy": 0.9586948156356812, + "num_tokens": 1974180.0, + "step": 223 + }, + { + "entropy": 2.3639765977859497, + "epoch": 0.8108597285067873, + "grad_norm": 0.6924490332603455, + "learning_rate": 0.0004980536487857951, + "loss": 0.2137, + "mean_token_accuracy": 0.9441423565149307, + "num_tokens": 1982744.0, + "step": 224 + }, + { + "entropy": 2.3361759781837463, + "epoch": 0.8144796380090498, + "grad_norm": 0.4579620361328125, + "learning_rate": 0.0004979049884591077, + "loss": 0.1041, + "mean_token_accuracy": 0.9753208309412003, + "num_tokens": 1991583.0, + "step": 225 + }, + { + "entropy": 2.286989688873291, + "epoch": 0.8180995475113122, + "grad_norm": 0.6489312052726746, + "learning_rate": 0.0004977555044895377, + "loss": 0.2131, + "mean_token_accuracy": 0.9520440250635147, + "num_tokens": 2000193.0, + "step": 226 + }, + { + "entropy": 2.288672834634781, + "epoch": 0.8217194570135746, + "grad_norm": 0.7738961577415466, + "learning_rate": 0.0004976051974448441, + "loss": 0.325, + "mean_token_accuracy": 0.9060750156641006, + "num_tokens": 2009233.0, + "step": 227 + }, + { + "entropy": 2.288076102733612, + "epoch": 0.8253393665158371, + "grad_norm": 0.7042292356491089, + "learning_rate": 0.0004974540678959123, + "loss": 0.2206, + "mean_token_accuracy": 0.94980289041996, + "num_tokens": 2018417.0, + "step": 228 + }, + { + "entropy": 2.217707335948944, + "epoch": 0.8289592760180996, + "grad_norm": 0.6834208369255066, + "learning_rate": 0.0004973021164167515, + "loss": 0.2907, + "mean_token_accuracy": 0.951058641076088, + "num_tokens": 2027822.0, + "step": 229 + }, + { + "entropy": 2.1610691249370575, + "epoch": 0.832579185520362, + "grad_norm": 0.665044903755188, + "learning_rate": 0.0004971493435844928, + "loss": 0.2387, + "mean_token_accuracy": 0.9506549835205078, + "num_tokens": 2036983.0, + "step": 230 + }, + { + "entropy": 2.321135401725769, + "epoch": 0.8361990950226245, + "grad_norm": 0.8208273649215698, + "learning_rate": 0.0004969957499793869, + "loss": 0.2399, + "mean_token_accuracy": 0.9435176253318787, + "num_tokens": 2045574.0, + "step": 231 + }, + { + "entropy": 2.1943611800670624, + "epoch": 0.8398190045248869, + "grad_norm": 0.6293840408325195, + "learning_rate": 0.0004968413361848019, + "loss": 0.1784, + "mean_token_accuracy": 0.9559669345617294, + "num_tokens": 2054336.0, + "step": 232 + }, + { + "entropy": 2.2722273468971252, + "epoch": 0.8434389140271493, + "grad_norm": 0.6535817980766296, + "learning_rate": 0.0004966861027872211, + "loss": 0.1675, + "mean_token_accuracy": 0.9532535970211029, + "num_tokens": 2063225.0, + "step": 233 + }, + { + "entropy": 2.3278334736824036, + "epoch": 0.8470588235294118, + "grad_norm": 1.1610206365585327, + "learning_rate": 0.0004965300503762406, + "loss": 0.1588, + "mean_token_accuracy": 0.9641145765781403, + "num_tokens": 2071738.0, + "step": 234 + }, + { + "entropy": 2.202972888946533, + "epoch": 0.8506787330316742, + "grad_norm": 0.4811885356903076, + "learning_rate": 0.0004963731795445675, + "loss": 0.0813, + "mean_token_accuracy": 0.9766911715269089, + "num_tokens": 2080375.0, + "step": 235 + }, + { + "entropy": 2.2433705925941467, + "epoch": 0.8542986425339366, + "grad_norm": 0.8113318681716919, + "learning_rate": 0.0004962154908880171, + "loss": 0.2965, + "mean_token_accuracy": 0.9290606826543808, + "num_tokens": 2089522.0, + "step": 236 + }, + { + "entropy": 2.2168884873390198, + "epoch": 0.857918552036199, + "grad_norm": 0.6128959655761719, + "learning_rate": 0.0004960569850055111, + "loss": 0.1724, + "mean_token_accuracy": 0.9603384286165237, + "num_tokens": 2098162.0, + "step": 237 + }, + { + "entropy": 2.2738255858421326, + "epoch": 0.8615384615384616, + "grad_norm": 0.8557195663452148, + "learning_rate": 0.0004958976624990749, + "loss": 0.2596, + "mean_token_accuracy": 0.9487071484327316, + "num_tokens": 2106984.0, + "step": 238 + }, + { + "entropy": 2.2031425833702087, + "epoch": 0.865158371040724, + "grad_norm": 0.6621816158294678, + "learning_rate": 0.0004957375239738359, + "loss": 0.232, + "mean_token_accuracy": 0.9525040090084076, + "num_tokens": 2116040.0, + "step": 239 + }, + { + "entropy": 2.374737858772278, + "epoch": 0.8687782805429864, + "grad_norm": 0.8481062054634094, + "learning_rate": 0.0004955765700380204, + "loss": 0.2516, + "mean_token_accuracy": 0.9396061599254608, + "num_tokens": 2124862.0, + "step": 240 + }, + { + "entropy": 2.266704559326172, + "epoch": 0.8723981900452489, + "grad_norm": 0.6284282803535461, + "learning_rate": 0.0004954148013029521, + "loss": 0.3244, + "mean_token_accuracy": 0.9381244331598282, + "num_tokens": 2134018.0, + "step": 241 + }, + { + "entropy": 2.3935859203338623, + "epoch": 0.8760180995475113, + "grad_norm": 1.1564176082611084, + "learning_rate": 0.0004952522183830493, + "loss": 0.2706, + "mean_token_accuracy": 0.9297053664922714, + "num_tokens": 2142745.0, + "step": 242 + }, + { + "entropy": 2.281618118286133, + "epoch": 0.8796380090497737, + "grad_norm": 0.5324040055274963, + "learning_rate": 0.0004950888218958225, + "loss": 0.1573, + "mean_token_accuracy": 0.9568462073802948, + "num_tokens": 2151607.0, + "step": 243 + }, + { + "entropy": 2.230749189853668, + "epoch": 0.8832579185520362, + "grad_norm": 0.680780291557312, + "learning_rate": 0.0004949246124618726, + "loss": 0.1956, + "mean_token_accuracy": 0.9479999989271164, + "num_tokens": 2160904.0, + "step": 244 + }, + { + "entropy": 2.21382600069046, + "epoch": 0.8868778280542986, + "grad_norm": 0.6321626305580139, + "learning_rate": 0.0004947595907048877, + "loss": 0.2444, + "mean_token_accuracy": 0.9376699328422546, + "num_tokens": 2170021.0, + "step": 245 + }, + { + "entropy": 2.3659472465515137, + "epoch": 0.890497737556561, + "grad_norm": 0.9778954982757568, + "learning_rate": 0.0004945937572516417, + "loss": 0.3783, + "mean_token_accuracy": 0.9104805737733841, + "num_tokens": 2178995.0, + "step": 246 + }, + { + "entropy": 2.3233078718185425, + "epoch": 0.8941176470588236, + "grad_norm": 0.53229820728302, + "learning_rate": 0.0004944271127319909, + "loss": 0.0759, + "mean_token_accuracy": 0.9791453778743744, + "num_tokens": 2187823.0, + "step": 247 + }, + { + "entropy": 2.2469444274902344, + "epoch": 0.897737556561086, + "grad_norm": 0.6367197632789612, + "learning_rate": 0.0004942596577788728, + "loss": 0.2677, + "mean_token_accuracy": 0.9392691254615784, + "num_tokens": 2196923.0, + "step": 248 + }, + { + "entropy": 2.4508965611457825, + "epoch": 0.9013574660633484, + "grad_norm": 0.6042234897613525, + "learning_rate": 0.0004940913930283024, + "loss": 0.1102, + "mean_token_accuracy": 0.9762090593576431, + "num_tokens": 2205400.0, + "step": 249 + }, + { + "entropy": 2.365670144557953, + "epoch": 0.9049773755656109, + "grad_norm": 0.6490639448165894, + "learning_rate": 0.0004939223191193707, + "loss": 0.1532, + "mean_token_accuracy": 0.9489114433526993, + "num_tokens": 2214201.0, + "step": 250 + }, + { + "entropy": 2.4013625383377075, + "epoch": 0.9085972850678733, + "grad_norm": 0.5969854593276978, + "learning_rate": 0.0004937524366942419, + "loss": 0.1273, + "mean_token_accuracy": 0.9682519882917404, + "num_tokens": 2222979.0, + "step": 251 + }, + { + "entropy": 2.4402357935905457, + "epoch": 0.9122171945701357, + "grad_norm": 0.7559595704078674, + "learning_rate": 0.0004935817463981513, + "loss": 0.1979, + "mean_token_accuracy": 0.9483373910188675, + "num_tokens": 2231169.0, + "step": 252 + }, + { + "entropy": 2.4673256874084473, + "epoch": 0.9158371040723982, + "grad_norm": 0.8663308620452881, + "learning_rate": 0.0004934102488794023, + "loss": 0.2453, + "mean_token_accuracy": 0.9408974200487137, + "num_tokens": 2240099.0, + "step": 253 + }, + { + "entropy": 2.426262080669403, + "epoch": 0.9194570135746606, + "grad_norm": 0.7920467257499695, + "learning_rate": 0.0004932379447893643, + "loss": 0.2828, + "mean_token_accuracy": 0.9319239109754562, + "num_tokens": 2249088.0, + "step": 254 + }, + { + "entropy": 2.5018852949142456, + "epoch": 0.9230769230769231, + "grad_norm": 0.7216617465019226, + "learning_rate": 0.0004930648347824701, + "loss": 0.1647, + "mean_token_accuracy": 0.9551804810762405, + "num_tokens": 2257710.0, + "step": 255 + }, + { + "entropy": 2.43031644821167, + "epoch": 0.9266968325791856, + "grad_norm": 0.646794319152832, + "learning_rate": 0.0004928909195162138, + "loss": 0.1328, + "mean_token_accuracy": 0.9663553237915039, + "num_tokens": 2266883.0, + "step": 256 + }, + { + "entropy": 2.5406370759010315, + "epoch": 0.930316742081448, + "grad_norm": 0.5482825040817261, + "learning_rate": 0.0004927161996511474, + "loss": 0.1872, + "mean_token_accuracy": 0.9557004272937775, + "num_tokens": 2275728.0, + "step": 257 + }, + { + "entropy": 2.636320471763611, + "epoch": 0.9339366515837104, + "grad_norm": 0.7454632520675659, + "learning_rate": 0.0004925406758508797, + "loss": 0.1461, + "mean_token_accuracy": 0.9578974395990372, + "num_tokens": 2284319.0, + "step": 258 + }, + { + "entropy": 2.6067575812339783, + "epoch": 0.9375565610859729, + "grad_norm": 0.8695769309997559, + "learning_rate": 0.000492364348782072, + "loss": 0.1712, + "mean_token_accuracy": 0.9652896523475647, + "num_tokens": 2293035.0, + "step": 259 + }, + { + "entropy": 2.5837162137031555, + "epoch": 0.9411764705882353, + "grad_norm": 0.5752995014190674, + "learning_rate": 0.0004921872191144371, + "loss": 0.1398, + "mean_token_accuracy": 0.9553333520889282, + "num_tokens": 2301802.0, + "step": 260 + }, + { + "entropy": 2.713033616542816, + "epoch": 0.9447963800904977, + "grad_norm": 0.85626620054245, + "learning_rate": 0.0004920092875207363, + "loss": 0.2207, + "mean_token_accuracy": 0.9468346834182739, + "num_tokens": 2309981.0, + "step": 261 + }, + { + "entropy": 2.400112509727478, + "epoch": 0.9484162895927601, + "grad_norm": 0.6766608953475952, + "learning_rate": 0.0004918305546767764, + "loss": 0.1644, + "mean_token_accuracy": 0.9502440094947815, + "num_tokens": 2319212.0, + "step": 262 + }, + { + "entropy": 2.503827154636383, + "epoch": 0.9520361990950226, + "grad_norm": 0.789470911026001, + "learning_rate": 0.0004916510212614072, + "loss": 0.2117, + "mean_token_accuracy": 0.9454390555620193, + "num_tokens": 2328234.0, + "step": 263 + }, + { + "entropy": 2.669040560722351, + "epoch": 0.9556561085972851, + "grad_norm": 0.9579212069511414, + "learning_rate": 0.0004914706879565197, + "loss": 0.2193, + "mean_token_accuracy": 0.9321542829275131, + "num_tokens": 2336543.0, + "step": 264 + }, + { + "entropy": 2.507073998451233, + "epoch": 0.9592760180995475, + "grad_norm": 0.5315744876861572, + "learning_rate": 0.000491289555447043, + "loss": 0.0851, + "mean_token_accuracy": 0.9771326780319214, + "num_tokens": 2345292.0, + "step": 265 + }, + { + "entropy": 2.4205283522605896, + "epoch": 0.96289592760181, + "grad_norm": 0.5441373586654663, + "learning_rate": 0.000491107624420941, + "loss": 0.1323, + "mean_token_accuracy": 0.9541790336370468, + "num_tokens": 2354242.0, + "step": 266 + }, + { + "entropy": 2.3817258477211, + "epoch": 0.9665158371040724, + "grad_norm": 0.5946238040924072, + "learning_rate": 0.0004909248955692111, + "loss": 0.1708, + "mean_token_accuracy": 0.947738841176033, + "num_tokens": 2363183.0, + "step": 267 + }, + { + "entropy": 2.5073485374450684, + "epoch": 0.9701357466063348, + "grad_norm": 0.6979324817657471, + "learning_rate": 0.0004907413695858812, + "loss": 0.2099, + "mean_token_accuracy": 0.9423733651638031, + "num_tokens": 2371885.0, + "step": 268 + }, + { + "entropy": 2.5705007910728455, + "epoch": 0.9737556561085973, + "grad_norm": 0.8203943967819214, + "learning_rate": 0.0004905570471680057, + "loss": 0.217, + "mean_token_accuracy": 0.9511639326810837, + "num_tokens": 2380316.0, + "step": 269 + }, + { + "entropy": 2.2677993774414062, + "epoch": 0.9773755656108597, + "grad_norm": 0.5840432047843933, + "learning_rate": 0.0004903719290156649, + "loss": 0.2364, + "mean_token_accuracy": 0.9407180696725845, + "num_tokens": 2389723.0, + "step": 270 + }, + { + "entropy": 2.477886915206909, + "epoch": 0.9809954751131221, + "grad_norm": 0.818929135799408, + "learning_rate": 0.0004901860158319612, + "loss": 0.1707, + "mean_token_accuracy": 0.9579566866159439, + "num_tokens": 2398388.0, + "step": 271 + }, + { + "entropy": 2.549662232398987, + "epoch": 0.9846153846153847, + "grad_norm": 0.7804781198501587, + "learning_rate": 0.0004899993083230166, + "loss": 0.2944, + "mean_token_accuracy": 0.9381812512874603, + "num_tokens": 2406929.0, + "step": 272 + }, + { + "entropy": 2.4465304017066956, + "epoch": 0.9882352941176471, + "grad_norm": 0.5218799114227295, + "learning_rate": 0.0004898118071979699, + "loss": 0.1661, + "mean_token_accuracy": 0.9500218778848648, + "num_tokens": 2415631.0, + "step": 273 + }, + { + "entropy": 2.5852283239364624, + "epoch": 0.9918552036199095, + "grad_norm": 0.591163158416748, + "learning_rate": 0.0004896235131689743, + "loss": 0.2005, + "mean_token_accuracy": 0.9455285370349884, + "num_tokens": 2424091.0, + "step": 274 + }, + { + "entropy": 2.478701651096344, + "epoch": 0.995475113122172, + "grad_norm": 1.0615383386611938, + "learning_rate": 0.0004894344269511945, + "loss": 0.2864, + "mean_token_accuracy": 0.9306265562772751, + "num_tokens": 2432705.0, + "step": 275 + }, + { + "entropy": 2.600062847137451, + "epoch": 0.9990950226244344, + "grad_norm": 0.7011683583259583, + "learning_rate": 0.0004892445492628043, + "loss": 0.1664, + "mean_token_accuracy": 0.9547821134328842, + "num_tokens": 2440992.0, + "step": 276 + }, + { + "entropy": 2.3411240577697754, + "epoch": 1.0, + "grad_norm": 0.4944029450416565, + "learning_rate": 0.000489053880824983, + "loss": 0.022, + "mean_token_accuracy": 0.9929078221321106, + "num_tokens": 2441725.0, + "step": 277 + }, + { + "epoch": 1.0, + "eval_entropy": 2.5467925265552553, + "eval_loss": 0.21274714171886444, + "eval_mean_token_accuracy": 0.9444630068492114, + "eval_num_tokens": 2441725.0, + "eval_runtime": 116.0434, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 1.06, + "step": 277 + }, + { + "entropy": 2.609170138835907, + "epoch": 1.0036199095022624, + "grad_norm": 1.0785081386566162, + "learning_rate": 0.0004888624223619136, + "loss": 0.3167, + "mean_token_accuracy": 0.9296800643205643, + "num_tokens": 2450193.0, + "step": 278 + }, + { + "entropy": 2.497025430202484, + "epoch": 1.0072398190045249, + "grad_norm": 0.5221985578536987, + "learning_rate": 0.0004886701746007801, + "loss": 0.0854, + "mean_token_accuracy": 0.9753399342298508, + "num_tokens": 2459309.0, + "step": 279 + }, + { + "entropy": 2.5487362146377563, + "epoch": 1.0108597285067873, + "grad_norm": 0.5161958336830139, + "learning_rate": 0.0004884771382717638, + "loss": 0.0819, + "mean_token_accuracy": 0.9748431146144867, + "num_tokens": 2467844.0, + "step": 280 + }, + { + "entropy": 2.5276209115982056, + "epoch": 1.0144796380090497, + "grad_norm": 0.5731730461120605, + "learning_rate": 0.0004882833141080412, + "loss": 0.1541, + "mean_token_accuracy": 0.9567564427852631, + "num_tokens": 2476894.0, + "step": 281 + }, + { + "entropy": 2.4442760348320007, + "epoch": 1.0180995475113122, + "grad_norm": 0.7120366096496582, + "learning_rate": 0.0004880887028457813, + "loss": 0.1945, + "mean_token_accuracy": 0.9465379565954208, + "num_tokens": 2485971.0, + "step": 282 + }, + { + "entropy": 2.4069360494613647, + "epoch": 1.0217194570135746, + "grad_norm": 0.7468647360801697, + "learning_rate": 0.00048789330522414244, + "loss": 0.2345, + "mean_token_accuracy": 0.9446765780448914, + "num_tokens": 2495043.0, + "step": 283 + }, + { + "entropy": 2.468382716178894, + "epoch": 1.025339366515837, + "grad_norm": 0.666231632232666, + "learning_rate": 0.0004876971219852697, + "loss": 0.1779, + "mean_token_accuracy": 0.9534575343132019, + "num_tokens": 2503672.0, + "step": 284 + }, + { + "entropy": 2.4362316727638245, + "epoch": 1.0289592760180994, + "grad_norm": 0.8445858955383301, + "learning_rate": 0.000487500153874292, + "loss": 0.1698, + "mean_token_accuracy": 0.953661322593689, + "num_tokens": 2512322.0, + "step": 285 + }, + { + "entropy": 2.364333391189575, + "epoch": 1.032579185520362, + "grad_norm": 0.4805246591567993, + "learning_rate": 0.0004873024016393193, + "loss": 0.0778, + "mean_token_accuracy": 0.9824571758508682, + "num_tokens": 2520791.0, + "step": 286 + }, + { + "entropy": 2.223461151123047, + "epoch": 1.0361990950226245, + "grad_norm": 0.648465096950531, + "learning_rate": 0.0004871038660314399, + "loss": 0.2593, + "mean_token_accuracy": 0.9419913589954376, + "num_tokens": 2530082.0, + "step": 287 + }, + { + "entropy": 2.3313387036323547, + "epoch": 1.039819004524887, + "grad_norm": 0.6912294626235962, + "learning_rate": 0.00048690454780471725, + "loss": 0.1354, + "mean_token_accuracy": 0.9561934620141983, + "num_tokens": 2538728.0, + "step": 288 + }, + { + "entropy": 2.191806375980377, + "epoch": 1.0434389140271494, + "grad_norm": 0.8620694279670715, + "learning_rate": 0.0004867044477161874, + "loss": 0.1103, + "mean_token_accuracy": 0.968692272901535, + "num_tokens": 2547219.0, + "step": 289 + }, + { + "entropy": 2.167125165462494, + "epoch": 1.0470588235294118, + "grad_norm": 0.6192149519920349, + "learning_rate": 0.0004865035665258559, + "loss": 0.1288, + "mean_token_accuracy": 0.9643534421920776, + "num_tokens": 2555940.0, + "step": 290 + }, + { + "entropy": 2.2750985622406006, + "epoch": 1.0506787330316743, + "grad_norm": 1.7459602355957031, + "learning_rate": 0.0004863019049966953, + "loss": 0.393, + "mean_token_accuracy": 0.9146681725978851, + "num_tokens": 2564362.0, + "step": 291 + }, + { + "entropy": 2.236129105091095, + "epoch": 1.0542986425339367, + "grad_norm": 0.6311184167861938, + "learning_rate": 0.0004860994638946416, + "loss": 0.1536, + "mean_token_accuracy": 0.9636097103357315, + "num_tokens": 2573316.0, + "step": 292 + }, + { + "entropy": 2.2642418146133423, + "epoch": 1.0579185520361991, + "grad_norm": 0.6023411154747009, + "learning_rate": 0.000485896243988592, + "loss": 0.191, + "mean_token_accuracy": 0.9476015418767929, + "num_tokens": 2581835.0, + "step": 293 + }, + { + "entropy": 2.3589024543762207, + "epoch": 1.0615384615384615, + "grad_norm": 0.48049232363700867, + "learning_rate": 0.0004856922460504016, + "loss": 0.1017, + "mean_token_accuracy": 0.9713075459003448, + "num_tokens": 2590317.0, + "step": 294 + }, + { + "entropy": 2.4141315817832947, + "epoch": 1.065158371040724, + "grad_norm": 0.8456616997718811, + "learning_rate": 0.0004854874708548806, + "loss": 0.1422, + "mean_token_accuracy": 0.9622762501239777, + "num_tokens": 2598538.0, + "step": 295 + }, + { + "entropy": 2.069903999567032, + "epoch": 1.0687782805429864, + "grad_norm": 0.7641116380691528, + "learning_rate": 0.0004852819191797912, + "loss": 0.2185, + "mean_token_accuracy": 0.9464851468801498, + "num_tokens": 2608219.0, + "step": 296 + }, + { + "entropy": 2.163217008113861, + "epoch": 1.0723981900452488, + "grad_norm": 0.546085000038147, + "learning_rate": 0.0004850755918058449, + "loss": 0.1035, + "mean_token_accuracy": 0.9708487540483475, + "num_tokens": 2617261.0, + "step": 297 + }, + { + "entropy": 2.2678662836551666, + "epoch": 1.0760180995475113, + "grad_norm": 0.8699386119842529, + "learning_rate": 0.0004848684895166994, + "loss": 0.2384, + "mean_token_accuracy": 0.9486480504274368, + "num_tokens": 2626144.0, + "step": 298 + }, + { + "entropy": 2.13065105676651, + "epoch": 1.0796380090497737, + "grad_norm": 0.44323107600212097, + "learning_rate": 0.00048466061309895554, + "loss": 0.0818, + "mean_token_accuracy": 0.9722468554973602, + "num_tokens": 2635626.0, + "step": 299 + }, + { + "entropy": 2.184772551059723, + "epoch": 1.0832579185520361, + "grad_norm": 0.7928256988525391, + "learning_rate": 0.0004844519633421545, + "loss": 0.2378, + "mean_token_accuracy": 0.9477885961532593, + "num_tokens": 2644674.0, + "step": 300 + }, + { + "entropy": 2.1669145822525024, + "epoch": 1.0868778280542986, + "grad_norm": 0.5570158362388611, + "learning_rate": 0.00048424254103877456, + "loss": 0.1434, + "mean_token_accuracy": 0.9587411731481552, + "num_tokens": 2653658.0, + "step": 301 + }, + { + "entropy": 2.3057579398155212, + "epoch": 1.090497737556561, + "grad_norm": 0.9084392189979553, + "learning_rate": 0.00048403234698422837, + "loss": 0.3831, + "mean_token_accuracy": 0.8896283358335495, + "num_tokens": 2662350.0, + "step": 302 + }, + { + "entropy": 2.1741657853126526, + "epoch": 1.0941176470588236, + "grad_norm": 0.6791238784790039, + "learning_rate": 0.0004838213819768597, + "loss": 0.1648, + "mean_token_accuracy": 0.9576362520456314, + "num_tokens": 2671450.0, + "step": 303 + }, + { + "entropy": 2.089864045381546, + "epoch": 1.097737556561086, + "grad_norm": 0.5696312189102173, + "learning_rate": 0.0004836096468179406, + "loss": 0.1269, + "mean_token_accuracy": 0.9658148884773254, + "num_tokens": 2680581.0, + "step": 304 + }, + { + "entropy": 2.2657605409622192, + "epoch": 1.1013574660633485, + "grad_norm": 1.605503797531128, + "learning_rate": 0.0004833971423116682, + "loss": 0.1027, + "mean_token_accuracy": 0.9762597978115082, + "num_tokens": 2689001.0, + "step": 305 + }, + { + "entropy": 2.079287111759186, + "epoch": 1.104977375565611, + "grad_norm": 0.5804780721664429, + "learning_rate": 0.00048318386926516157, + "loss": 0.1137, + "mean_token_accuracy": 0.9633719325065613, + "num_tokens": 2698050.0, + "step": 306 + }, + { + "entropy": 2.201345145702362, + "epoch": 1.1085972850678734, + "grad_norm": 0.8606241941452026, + "learning_rate": 0.000482969828488459, + "loss": 0.2124, + "mean_token_accuracy": 0.9472681730985641, + "num_tokens": 2706704.0, + "step": 307 + }, + { + "entropy": 2.095236599445343, + "epoch": 1.1122171945701358, + "grad_norm": 0.7078782320022583, + "learning_rate": 0.0004827550207945147, + "loss": 0.1957, + "mean_token_accuracy": 0.9564679116010666, + "num_tokens": 2715745.0, + "step": 308 + }, + { + "entropy": 2.186302363872528, + "epoch": 1.1158371040723982, + "grad_norm": 0.7166503667831421, + "learning_rate": 0.0004825394469991956, + "loss": 0.1539, + "mean_token_accuracy": 0.9662427455186844, + "num_tokens": 2724296.0, + "step": 309 + }, + { + "entropy": 2.052559405565262, + "epoch": 1.1194570135746607, + "grad_norm": 0.6510501503944397, + "learning_rate": 0.00048232310792127846, + "loss": 0.1831, + "mean_token_accuracy": 0.9533994495868683, + "num_tokens": 2733482.0, + "step": 310 + }, + { + "entropy": 2.093154102563858, + "epoch": 1.123076923076923, + "grad_norm": 0.711121678352356, + "learning_rate": 0.0004821060043824466, + "loss": 0.2315, + "mean_token_accuracy": 0.9381555914878845, + "num_tokens": 2742912.0, + "step": 311 + }, + { + "entropy": 2.188497006893158, + "epoch": 1.1266968325791855, + "grad_norm": 0.6782490015029907, + "learning_rate": 0.00048188813720728707, + "loss": 0.2, + "mean_token_accuracy": 0.9501812607049942, + "num_tokens": 2751808.0, + "step": 312 + }, + { + "entropy": 2.0495824217796326, + "epoch": 1.130316742081448, + "grad_norm": 0.7644634246826172, + "learning_rate": 0.00048166950722328697, + "loss": 0.2152, + "mean_token_accuracy": 0.9440928995609283, + "num_tokens": 2761066.0, + "step": 313 + }, + { + "entropy": 2.1707025468349457, + "epoch": 1.1339366515837104, + "grad_norm": 0.655131459236145, + "learning_rate": 0.00048145011526083106, + "loss": 0.1637, + "mean_token_accuracy": 0.9500558227300644, + "num_tokens": 2769870.0, + "step": 314 + }, + { + "entropy": 2.1047372221946716, + "epoch": 1.1375565610859728, + "grad_norm": 0.5353516936302185, + "learning_rate": 0.0004812299621531979, + "loss": 0.1705, + "mean_token_accuracy": 0.9455999433994293, + "num_tokens": 2779383.0, + "step": 315 + }, + { + "entropy": 2.1921610236167908, + "epoch": 1.1411764705882352, + "grad_norm": 0.8998016119003296, + "learning_rate": 0.00048100904873655696, + "loss": 0.3918, + "mean_token_accuracy": 0.9382697492837906, + "num_tokens": 2788386.0, + "step": 316 + }, + { + "entropy": 2.0850723683834076, + "epoch": 1.1447963800904977, + "grad_norm": 0.867432713508606, + "learning_rate": 0.0004807873758499656, + "loss": 0.2196, + "mean_token_accuracy": 0.9498324394226074, + "num_tokens": 2797496.0, + "step": 317 + }, + { + "entropy": 2.1980925798416138, + "epoch": 1.14841628959276, + "grad_norm": 0.6076980233192444, + "learning_rate": 0.00048056494433536577, + "loss": 0.1086, + "mean_token_accuracy": 0.9642161130905151, + "num_tokens": 2805836.0, + "step": 318 + }, + { + "entropy": 2.15611070394516, + "epoch": 1.1520361990950225, + "grad_norm": 0.6276211738586426, + "learning_rate": 0.0004803417550375806, + "loss": 0.1463, + "mean_token_accuracy": 0.9622830748558044, + "num_tokens": 2814404.0, + "step": 319 + }, + { + "entropy": 2.0017230808734894, + "epoch": 1.155656108597285, + "grad_norm": 0.5840948820114136, + "learning_rate": 0.0004801178088043115, + "loss": 0.1869, + "mean_token_accuracy": 0.9506777077913284, + "num_tokens": 2823786.0, + "step": 320 + }, + { + "entropy": 2.1539418697357178, + "epoch": 1.1592760180995474, + "grad_norm": 1.074331283569336, + "learning_rate": 0.0004798931064861349, + "loss": 0.2797, + "mean_token_accuracy": 0.9271649420261383, + "num_tokens": 2832374.0, + "step": 321 + }, + { + "entropy": 1.930726408958435, + "epoch": 1.16289592760181, + "grad_norm": 0.5121958255767822, + "learning_rate": 0.0004796676489364988, + "loss": 0.1579, + "mean_token_accuracy": 0.9582571685314178, + "num_tokens": 2841561.0, + "step": 322 + }, + { + "entropy": 2.0205810368061066, + "epoch": 1.1665158371040725, + "grad_norm": 0.6360969543457031, + "learning_rate": 0.00047944143701171966, + "loss": 0.1582, + "mean_token_accuracy": 0.9620308429002762, + "num_tokens": 2850171.0, + "step": 323 + }, + { + "entropy": 1.9655758142471313, + "epoch": 1.170135746606335, + "grad_norm": 0.6647385358810425, + "learning_rate": 0.0004792144715709792, + "loss": 0.1594, + "mean_token_accuracy": 0.954497441649437, + "num_tokens": 2858905.0, + "step": 324 + }, + { + "entropy": 1.9725223183631897, + "epoch": 1.1737556561085973, + "grad_norm": 0.6429229974746704, + "learning_rate": 0.0004789867534763211, + "loss": 0.1407, + "mean_token_accuracy": 0.9645214527845383, + "num_tokens": 2867533.0, + "step": 325 + }, + { + "entropy": 1.9473685026168823, + "epoch": 1.1773755656108598, + "grad_norm": 0.811651349067688, + "learning_rate": 0.0004787582835926477, + "loss": 0.1608, + "mean_token_accuracy": 0.9479968994855881, + "num_tokens": 2876286.0, + "step": 326 + }, + { + "entropy": 1.8863109350204468, + "epoch": 1.1809954751131222, + "grad_norm": 0.5587059855461121, + "learning_rate": 0.00047852906278771686, + "loss": 0.131, + "mean_token_accuracy": 0.9684520065784454, + "num_tokens": 2885667.0, + "step": 327 + }, + { + "entropy": 1.8288891315460205, + "epoch": 1.1846153846153846, + "grad_norm": 0.8450536131858826, + "learning_rate": 0.0004782990919321383, + "loss": 0.2224, + "mean_token_accuracy": 0.9377491921186447, + "num_tokens": 2894765.0, + "step": 328 + }, + { + "entropy": 1.9347718358039856, + "epoch": 1.188235294117647, + "grad_norm": 0.7665867209434509, + "learning_rate": 0.0004780683718993705, + "loss": 0.167, + "mean_token_accuracy": 0.9583602845668793, + "num_tokens": 2903551.0, + "step": 329 + }, + { + "entropy": 1.9097798764705658, + "epoch": 1.1918552036199095, + "grad_norm": 0.7705667018890381, + "learning_rate": 0.00047783690356571784, + "loss": 0.2115, + "mean_token_accuracy": 0.9526428133249283, + "num_tokens": 2912197.0, + "step": 330 + }, + { + "entropy": 1.9174850285053253, + "epoch": 1.195475113122172, + "grad_norm": 0.5695499181747437, + "learning_rate": 0.00047760468781032634, + "loss": 0.1033, + "mean_token_accuracy": 0.969958484172821, + "num_tokens": 2920579.0, + "step": 331 + }, + { + "entropy": 1.8578442931175232, + "epoch": 1.1990950226244343, + "grad_norm": 0.7843735814094543, + "learning_rate": 0.000477371725515181, + "loss": 0.1664, + "mean_token_accuracy": 0.9545005410909653, + "num_tokens": 2929352.0, + "step": 332 + }, + { + "entropy": 1.8509328961372375, + "epoch": 1.2027149321266968, + "grad_norm": 0.5951048135757446, + "learning_rate": 0.0004771380175651026, + "loss": 0.1566, + "mean_token_accuracy": 0.9551403075456619, + "num_tokens": 2938387.0, + "step": 333 + }, + { + "entropy": 1.8236390948295593, + "epoch": 1.2063348416289592, + "grad_norm": 0.4988223910331726, + "learning_rate": 0.0004769035648477434, + "loss": 0.1242, + "mean_token_accuracy": 0.966319814324379, + "num_tokens": 2947741.0, + "step": 334 + }, + { + "entropy": 1.9594822525978088, + "epoch": 1.2099547511312216, + "grad_norm": 0.7550755143165588, + "learning_rate": 0.00047666836825358477, + "loss": 0.1591, + "mean_token_accuracy": 0.9666347652673721, + "num_tokens": 2956313.0, + "step": 335 + }, + { + "entropy": 1.9148444533348083, + "epoch": 1.213574660633484, + "grad_norm": 0.5889077186584473, + "learning_rate": 0.00047643242867593345, + "loss": 0.1343, + "mean_token_accuracy": 0.9611433297395706, + "num_tokens": 2964928.0, + "step": 336 + }, + { + "entropy": 1.8126957714557648, + "epoch": 1.2171945701357467, + "grad_norm": 0.5447750091552734, + "learning_rate": 0.0004761957470109179, + "loss": 0.1659, + "mean_token_accuracy": 0.9552300125360489, + "num_tokens": 2974160.0, + "step": 337 + }, + { + "entropy": 1.7981431782245636, + "epoch": 1.2208144796380092, + "grad_norm": 0.5400761365890503, + "learning_rate": 0.0004759583241574854, + "loss": 0.1339, + "mean_token_accuracy": 0.9620136916637421, + "num_tokens": 2982900.0, + "step": 338 + }, + { + "entropy": 1.8613979518413544, + "epoch": 1.2244343891402716, + "grad_norm": 0.7452914714813232, + "learning_rate": 0.0004757201610173981, + "loss": 0.4, + "mean_token_accuracy": 0.9068266004323959, + "num_tokens": 2991783.0, + "step": 339 + }, + { + "entropy": 1.8654026687145233, + "epoch": 1.228054298642534, + "grad_norm": 1.7142685651779175, + "learning_rate": 0.00047548125849523, + "loss": 0.3168, + "mean_token_accuracy": 0.9308896362781525, + "num_tokens": 3000530.0, + "step": 340 + }, + { + "entropy": 1.7702704071998596, + "epoch": 1.2316742081447964, + "grad_norm": 0.6687431931495667, + "learning_rate": 0.0004752416174983633, + "loss": 0.1697, + "mean_token_accuracy": 0.9530515670776367, + "num_tokens": 3009355.0, + "step": 341 + }, + { + "entropy": 1.735857516527176, + "epoch": 1.2352941176470589, + "grad_norm": 0.6127599477767944, + "learning_rate": 0.00047500123893698507, + "loss": 0.1706, + "mean_token_accuracy": 0.9593266248703003, + "num_tokens": 3018518.0, + "step": 342 + }, + { + "entropy": 1.7076368927955627, + "epoch": 1.2389140271493213, + "grad_norm": 0.6973987817764282, + "learning_rate": 0.0004747601237240836, + "loss": 0.1615, + "mean_token_accuracy": 0.9539438933134079, + "num_tokens": 3027752.0, + "step": 343 + }, + { + "entropy": 1.7353227138519287, + "epoch": 1.2425339366515837, + "grad_norm": 0.8406392335891724, + "learning_rate": 0.00047451827277544546, + "loss": 0.2063, + "mean_token_accuracy": 0.9488435834646225, + "num_tokens": 3036383.0, + "step": 344 + }, + { + "entropy": 1.6597246527671814, + "epoch": 1.2461538461538462, + "grad_norm": 0.5971431732177734, + "learning_rate": 0.00047427568700965107, + "loss": 0.1013, + "mean_token_accuracy": 0.9721864312887192, + "num_tokens": 3045375.0, + "step": 345 + }, + { + "entropy": 1.7100033462047577, + "epoch": 1.2497737556561086, + "grad_norm": 0.5883470773696899, + "learning_rate": 0.00047403236734807225, + "loss": 0.1164, + "mean_token_accuracy": 0.9664830714464188, + "num_tokens": 3054084.0, + "step": 346 + }, + { + "entropy": 1.7402609288692474, + "epoch": 1.253393665158371, + "grad_norm": 0.7355862855911255, + "learning_rate": 0.00047378831471486815, + "loss": 0.2007, + "mean_token_accuracy": 0.9560511559247971, + "num_tokens": 3062727.0, + "step": 347 + }, + { + "entropy": 1.79518261551857, + "epoch": 1.2570135746606335, + "grad_norm": 0.6006518006324768, + "learning_rate": 0.00047354353003698163, + "loss": 0.1085, + "mean_token_accuracy": 0.9598321914672852, + "num_tokens": 3071178.0, + "step": 348 + }, + { + "entropy": 1.7328391373157501, + "epoch": 1.260633484162896, + "grad_norm": 0.560342013835907, + "learning_rate": 0.0004732980142441362, + "loss": 0.1593, + "mean_token_accuracy": 0.9579409211874008, + "num_tokens": 3079927.0, + "step": 349 + }, + { + "entropy": 1.7356511652469635, + "epoch": 1.2642533936651583, + "grad_norm": 0.9149975776672363, + "learning_rate": 0.00047305176826883206, + "loss": 0.4064, + "mean_token_accuracy": 0.9265118837356567, + "num_tokens": 3089314.0, + "step": 350 + }, + { + "entropy": 1.8573569357395172, + "epoch": 1.2678733031674208, + "grad_norm": 0.8300670981407166, + "learning_rate": 0.0004728047930463428, + "loss": 0.195, + "mean_token_accuracy": 0.9453776180744171, + "num_tokens": 3097702.0, + "step": 351 + }, + { + "entropy": 1.7906217575073242, + "epoch": 1.2714932126696832, + "grad_norm": 0.5668906569480896, + "learning_rate": 0.0004725570895147118, + "loss": 0.1572, + "mean_token_accuracy": 0.962067037820816, + "num_tokens": 3106379.0, + "step": 352 + }, + { + "entropy": 1.6957395374774933, + "epoch": 1.2751131221719456, + "grad_norm": 0.4048328399658203, + "learning_rate": 0.0004723086586147487, + "loss": 0.0944, + "mean_token_accuracy": 0.9716819673776627, + "num_tokens": 3115622.0, + "step": 353 + }, + { + "entropy": 1.8158144056797028, + "epoch": 1.278733031674208, + "grad_norm": 0.6396092772483826, + "learning_rate": 0.00047205950129002564, + "loss": 0.1011, + "mean_token_accuracy": 0.9698463827371597, + "num_tokens": 3124016.0, + "step": 354 + }, + { + "entropy": 1.730194479227066, + "epoch": 1.2823529411764705, + "grad_norm": 0.662876307964325, + "learning_rate": 0.000471809618486874, + "loss": 0.1641, + "mean_token_accuracy": 0.9520179778337479, + "num_tokens": 3132712.0, + "step": 355 + }, + { + "entropy": 1.6776110529899597, + "epoch": 1.285972850678733, + "grad_norm": 0.868507981300354, + "learning_rate": 0.0004715590111543804, + "loss": 0.3374, + "mean_token_accuracy": 0.9303739666938782, + "num_tokens": 3142103.0, + "step": 356 + }, + { + "entropy": 1.6501678824424744, + "epoch": 1.2895927601809956, + "grad_norm": 0.5433686971664429, + "learning_rate": 0.0004713076802443834, + "loss": 0.1237, + "mean_token_accuracy": 0.9653612226247787, + "num_tokens": 3151192.0, + "step": 357 + }, + { + "entropy": 1.6524465382099152, + "epoch": 1.293212669683258, + "grad_norm": 0.6145523190498352, + "learning_rate": 0.00047105562671147, + "loss": 0.1204, + "mean_token_accuracy": 0.9690534323453903, + "num_tokens": 3159839.0, + "step": 358 + }, + { + "entropy": 1.5339214205741882, + "epoch": 1.2968325791855204, + "grad_norm": 0.500477135181427, + "learning_rate": 0.00047080285151297144, + "loss": 0.1295, + "mean_token_accuracy": 0.9571033865213394, + "num_tokens": 3169047.0, + "step": 359 + }, + { + "entropy": 1.6765435338020325, + "epoch": 1.3004524886877828, + "grad_norm": 0.6697553396224976, + "learning_rate": 0.00047054935560896026, + "loss": 0.135, + "mean_token_accuracy": 0.9672541171312332, + "num_tokens": 3177062.0, + "step": 360 + }, + { + "entropy": 1.5932062566280365, + "epoch": 1.3040723981900453, + "grad_norm": 0.706957221031189, + "learning_rate": 0.0004702951399622462, + "loss": 0.1229, + "mean_token_accuracy": 0.9634416699409485, + "num_tokens": 3185829.0, + "step": 361 + }, + { + "entropy": 1.5623145997524261, + "epoch": 1.3076923076923077, + "grad_norm": 0.6199461221694946, + "learning_rate": 0.00047004020553837275, + "loss": 0.1449, + "mean_token_accuracy": 0.9620065689086914, + "num_tokens": 3194426.0, + "step": 362 + }, + { + "entropy": 1.5226828753948212, + "epoch": 1.3113122171945701, + "grad_norm": 0.8962509036064148, + "learning_rate": 0.0004697845533056132, + "loss": 0.2207, + "mean_token_accuracy": 0.9403344839811325, + "num_tokens": 3203655.0, + "step": 363 + }, + { + "entropy": 1.5395641326904297, + "epoch": 1.3149321266968326, + "grad_norm": 0.5993619561195374, + "learning_rate": 0.00046952818423496727, + "loss": 0.1486, + "mean_token_accuracy": 0.9614185988903046, + "num_tokens": 3212069.0, + "step": 364 + }, + { + "entropy": 1.5738630294799805, + "epoch": 1.318552036199095, + "grad_norm": 0.7393983602523804, + "learning_rate": 0.00046927109930015756, + "loss": 0.1812, + "mean_token_accuracy": 0.9535021334886551, + "num_tokens": 3220482.0, + "step": 365 + }, + { + "entropy": 1.5462632775306702, + "epoch": 1.3221719457013574, + "grad_norm": 0.7453555464744568, + "learning_rate": 0.0004690132994776253, + "loss": 0.164, + "mean_token_accuracy": 0.9585814625024796, + "num_tokens": 3229505.0, + "step": 366 + }, + { + "entropy": 1.5241961777210236, + "epoch": 1.3257918552036199, + "grad_norm": 0.7553415298461914, + "learning_rate": 0.00046875478574652713, + "loss": 0.1445, + "mean_token_accuracy": 0.9682841598987579, + "num_tokens": 3238326.0, + "step": 367 + }, + { + "entropy": 1.5344699025154114, + "epoch": 1.3294117647058823, + "grad_norm": 0.8565949201583862, + "learning_rate": 0.0004684955590887311, + "loss": 0.2521, + "mean_token_accuracy": 0.920401468873024, + "num_tokens": 3247482.0, + "step": 368 + }, + { + "entropy": 1.5109277665615082, + "epoch": 1.3330316742081447, + "grad_norm": 0.5170580148696899, + "learning_rate": 0.00046823562048881295, + "loss": 0.1393, + "mean_token_accuracy": 0.9584086239337921, + "num_tokens": 3256464.0, + "step": 369 + }, + { + "entropy": 1.4666939079761505, + "epoch": 1.3366515837104074, + "grad_norm": 0.6995373368263245, + "learning_rate": 0.0004679749709340529, + "loss": 0.1726, + "mean_token_accuracy": 0.9477890431880951, + "num_tokens": 3265853.0, + "step": 370 + }, + { + "entropy": 1.4208430051803589, + "epoch": 1.3402714932126698, + "grad_norm": 1.1363991498947144, + "learning_rate": 0.000467713611414431, + "loss": 0.196, + "mean_token_accuracy": 0.9495431333780289, + "num_tokens": 3275367.0, + "step": 371 + }, + { + "entropy": 1.5009459853172302, + "epoch": 1.3438914027149322, + "grad_norm": 0.7883325219154358, + "learning_rate": 0.00046745154292262414, + "loss": 0.2526, + "mean_token_accuracy": 0.9334618002176285, + "num_tokens": 3284772.0, + "step": 372 + }, + { + "entropy": 1.5485479533672333, + "epoch": 1.3475113122171947, + "grad_norm": 0.6516429781913757, + "learning_rate": 0.00046718876645400156, + "loss": 0.2057, + "mean_token_accuracy": 0.9546459317207336, + "num_tokens": 3293493.0, + "step": 373 + }, + { + "entropy": 1.6237249970436096, + "epoch": 1.351131221719457, + "grad_norm": 0.8916263580322266, + "learning_rate": 0.00046692528300662213, + "loss": 0.2123, + "mean_token_accuracy": 0.9456845372915268, + "num_tokens": 3302063.0, + "step": 374 + }, + { + "entropy": 1.561572015285492, + "epoch": 1.3547511312217195, + "grad_norm": 0.7527791857719421, + "learning_rate": 0.00046666109358122935, + "loss": 0.2113, + "mean_token_accuracy": 0.9537477940320969, + "num_tokens": 3311037.0, + "step": 375 + }, + { + "entropy": 1.5594256818294525, + "epoch": 1.358371040723982, + "grad_norm": 1.25638747215271, + "learning_rate": 0.0004663961991812485, + "loss": 0.1629, + "mean_token_accuracy": 0.9508458077907562, + "num_tokens": 3319635.0, + "step": 376 + }, + { + "entropy": 1.6909976303577423, + "epoch": 1.3619909502262444, + "grad_norm": 0.7627813220024109, + "learning_rate": 0.00046613060081278194, + "loss": 0.2303, + "mean_token_accuracy": 0.9425801336765289, + "num_tokens": 3328043.0, + "step": 377 + }, + { + "entropy": 1.6074829697608948, + "epoch": 1.3656108597285068, + "grad_norm": 0.6584346294403076, + "learning_rate": 0.00046586429948460646, + "loss": 0.1815, + "mean_token_accuracy": 0.9536214470863342, + "num_tokens": 3337143.0, + "step": 378 + }, + { + "entropy": 1.7382183969020844, + "epoch": 1.3692307692307693, + "grad_norm": 1.37154221534729, + "learning_rate": 0.0004655972962081684, + "loss": 0.1849, + "mean_token_accuracy": 0.948440819978714, + "num_tokens": 3346033.0, + "step": 379 + }, + { + "entropy": 1.7148900926113129, + "epoch": 1.3728506787330317, + "grad_norm": 0.9487980604171753, + "learning_rate": 0.00046532959199758, + "loss": 0.2521, + "mean_token_accuracy": 0.9344504028558731, + "num_tokens": 3354849.0, + "step": 380 + }, + { + "entropy": 1.7164019346237183, + "epoch": 1.3764705882352941, + "grad_norm": 0.5609025359153748, + "learning_rate": 0.00046506118786961614, + "loss": 0.1425, + "mean_token_accuracy": 0.9571309834718704, + "num_tokens": 3363674.0, + "step": 381 + }, + { + "entropy": 1.894619107246399, + "epoch": 1.3800904977375565, + "grad_norm": 0.9811336994171143, + "learning_rate": 0.00046479208484370997, + "loss": 0.2522, + "mean_token_accuracy": 0.9424156546592712, + "num_tokens": 3372325.0, + "step": 382 + }, + { + "entropy": 1.78870290517807, + "epoch": 1.383710407239819, + "grad_norm": 0.5707085132598877, + "learning_rate": 0.00046452228394194893, + "loss": 0.1354, + "mean_token_accuracy": 0.9613165706396103, + "num_tokens": 3381270.0, + "step": 383 + }, + { + "entropy": 1.803922712802887, + "epoch": 1.3873303167420814, + "grad_norm": 0.5655364394187927, + "learning_rate": 0.0004642517861890713, + "loss": 0.0818, + "mean_token_accuracy": 0.9776160269975662, + "num_tokens": 3390363.0, + "step": 384 + }, + { + "entropy": 1.8172507882118225, + "epoch": 1.3909502262443438, + "grad_norm": 0.6950513124465942, + "learning_rate": 0.00046398059261246205, + "loss": 0.1145, + "mean_token_accuracy": 0.963288351893425, + "num_tokens": 3399176.0, + "step": 385 + }, + { + "entropy": 1.9182518422603607, + "epoch": 1.3945701357466063, + "grad_norm": 0.5900619029998779, + "learning_rate": 0.0004637087042421489, + "loss": 0.108, + "mean_token_accuracy": 0.9723307639360428, + "num_tokens": 3407978.0, + "step": 386 + }, + { + "entropy": 1.8558574616909027, + "epoch": 1.3981900452488687, + "grad_norm": 0.6279832124710083, + "learning_rate": 0.00046343612211079843, + "loss": 0.1471, + "mean_token_accuracy": 0.9603912532329559, + "num_tokens": 3416856.0, + "step": 387 + }, + { + "entropy": 1.8146779537200928, + "epoch": 1.4018099547511311, + "grad_norm": 0.6171274781227112, + "learning_rate": 0.0004631628472537125, + "loss": 0.1872, + "mean_token_accuracy": 0.9447146654129028, + "num_tokens": 3426044.0, + "step": 388 + }, + { + "entropy": 1.9342225790023804, + "epoch": 1.4054298642533936, + "grad_norm": 0.9947887659072876, + "learning_rate": 0.00046288888070882374, + "loss": 0.2966, + "mean_token_accuracy": 0.9279204607009888, + "num_tokens": 3435154.0, + "step": 389 + }, + { + "entropy": 1.9391801953315735, + "epoch": 1.409049773755656, + "grad_norm": 0.7155653834342957, + "learning_rate": 0.000462614223516692, + "loss": 0.1847, + "mean_token_accuracy": 0.9475171864032745, + "num_tokens": 3444563.0, + "step": 390 + }, + { + "entropy": 2.0716978013515472, + "epoch": 1.4126696832579184, + "grad_norm": 0.8198989629745483, + "learning_rate": 0.0004623388767205004, + "loss": 0.1317, + "mean_token_accuracy": 0.9608721435070038, + "num_tokens": 3453410.0, + "step": 391 + }, + { + "entropy": 2.1060431599617004, + "epoch": 1.416289592760181, + "grad_norm": 1.025406002998352, + "learning_rate": 0.00046206284136605106, + "loss": 0.2146, + "mean_token_accuracy": 0.9414294511079788, + "num_tokens": 3461958.0, + "step": 392 + }, + { + "entropy": 2.1459922194480896, + "epoch": 1.4199095022624435, + "grad_norm": 0.9209627509117126, + "learning_rate": 0.00046178611850176146, + "loss": 0.2137, + "mean_token_accuracy": 0.956874743103981, + "num_tokens": 3470547.0, + "step": 393 + }, + { + "entropy": 2.0233450531959534, + "epoch": 1.423529411764706, + "grad_norm": 0.5777944922447205, + "learning_rate": 0.00046150870917866025, + "loss": 0.122, + "mean_token_accuracy": 0.9672323018312454, + "num_tokens": 3479618.0, + "step": 394 + }, + { + "entropy": 2.035937190055847, + "epoch": 1.4271493212669684, + "grad_norm": 0.7945542931556702, + "learning_rate": 0.0004612306144503835, + "loss": 0.2879, + "mean_token_accuracy": 0.946587473154068, + "num_tokens": 3488533.0, + "step": 395 + }, + { + "entropy": 2.155315637588501, + "epoch": 1.4307692307692308, + "grad_norm": 0.6385292410850525, + "learning_rate": 0.00046095183537317035, + "loss": 0.1008, + "mean_token_accuracy": 0.9655124247074127, + "num_tokens": 3496686.0, + "step": 396 + }, + { + "entropy": 2.186827063560486, + "epoch": 1.4343891402714932, + "grad_norm": 0.4759826958179474, + "learning_rate": 0.0004606723730058593, + "loss": 0.0768, + "mean_token_accuracy": 0.9783597737550735, + "num_tokens": 3504958.0, + "step": 397 + }, + { + "entropy": 1.974392294883728, + "epoch": 1.4380090497737557, + "grad_norm": 0.6250292062759399, + "learning_rate": 0.00046039222840988406, + "loss": 0.1381, + "mean_token_accuracy": 0.9586146324872971, + "num_tokens": 3513694.0, + "step": 398 + }, + { + "entropy": 2.045738846063614, + "epoch": 1.441628959276018, + "grad_norm": 0.5517769455909729, + "learning_rate": 0.0004601114026492695, + "loss": 0.1312, + "mean_token_accuracy": 0.9682512134313583, + "num_tokens": 3522395.0, + "step": 399 + }, + { + "entropy": 2.105030357837677, + "epoch": 1.4452488687782805, + "grad_norm": 0.6748242974281311, + "learning_rate": 0.0004598298967906276, + "loss": 0.1056, + "mean_token_accuracy": 0.9701305478811264, + "num_tokens": 3530838.0, + "step": 400 + }, + { + "entropy": 2.024325281381607, + "epoch": 1.448868778280543, + "grad_norm": 0.6320233941078186, + "learning_rate": 0.00045954771190315344, + "loss": 0.1129, + "mean_token_accuracy": 0.9633017927408218, + "num_tokens": 3540184.0, + "step": 401 + }, + { + "entropy": 2.1561593413352966, + "epoch": 1.4524886877828054, + "grad_norm": 0.7380363941192627, + "learning_rate": 0.0004592648490586213, + "loss": 0.1304, + "mean_token_accuracy": 0.9599586874246597, + "num_tokens": 3548727.0, + "step": 402 + }, + { + "entropy": 2.2986454367637634, + "epoch": 1.4561085972850678, + "grad_norm": 0.669114351272583, + "learning_rate": 0.00045898130933138024, + "loss": 0.1005, + "mean_token_accuracy": 0.9724964797496796, + "num_tokens": 3556780.0, + "step": 403 + }, + { + "entropy": 2.103136509656906, + "epoch": 1.4597285067873302, + "grad_norm": 0.6677402853965759, + "learning_rate": 0.0004586970937983504, + "loss": 0.1177, + "mean_token_accuracy": 0.9597653448581696, + "num_tokens": 3565427.0, + "step": 404 + }, + { + "entropy": 2.112696200609207, + "epoch": 1.463348416289593, + "grad_norm": 0.4597342014312744, + "learning_rate": 0.0004584122035390185, + "loss": 0.0695, + "mean_token_accuracy": 0.9763098359107971, + "num_tokens": 3573902.0, + "step": 405 + }, + { + "entropy": 2.0472628474235535, + "epoch": 1.4669683257918553, + "grad_norm": 0.7842056751251221, + "learning_rate": 0.0004581266396354339, + "loss": 0.1981, + "mean_token_accuracy": 0.9521032422780991, + "num_tokens": 3582913.0, + "step": 406 + }, + { + "entropy": 2.236558735370636, + "epoch": 1.4705882352941178, + "grad_norm": 0.7634767293930054, + "learning_rate": 0.000457840403172205, + "loss": 0.1956, + "mean_token_accuracy": 0.9602932929992676, + "num_tokens": 3591197.0, + "step": 407 + }, + { + "entropy": 2.182949125766754, + "epoch": 1.4742081447963802, + "grad_norm": 0.7084661722183228, + "learning_rate": 0.00045755349523649415, + "loss": 0.2463, + "mean_token_accuracy": 0.9392582327127457, + "num_tokens": 3600134.0, + "step": 408 + }, + { + "entropy": 2.135133147239685, + "epoch": 1.4778280542986426, + "grad_norm": 0.8172940015792847, + "learning_rate": 0.00045726591691801433, + "loss": 0.2375, + "mean_token_accuracy": 0.9458330571651459, + "num_tokens": 3608945.0, + "step": 409 + }, + { + "entropy": 2.157473146915436, + "epoch": 1.481447963800905, + "grad_norm": 0.6165594458580017, + "learning_rate": 0.0004569776693090246, + "loss": 0.1628, + "mean_token_accuracy": 0.9586529731750488, + "num_tokens": 3617790.0, + "step": 410 + }, + { + "entropy": 2.15165376663208, + "epoch": 1.4850678733031675, + "grad_norm": 0.6619407534599304, + "learning_rate": 0.0004566887535043263, + "loss": 0.1866, + "mean_token_accuracy": 0.9545126557350159, + "num_tokens": 3626937.0, + "step": 411 + }, + { + "entropy": 2.271161735057831, + "epoch": 1.48868778280543, + "grad_norm": 0.5861835479736328, + "learning_rate": 0.0004563991706012582, + "loss": 0.1409, + "mean_token_accuracy": 0.9595955163240433, + "num_tokens": 3636025.0, + "step": 412 + }, + { + "entropy": 2.277799427509308, + "epoch": 1.4923076923076923, + "grad_norm": 0.6464956402778625, + "learning_rate": 0.00045610892169969323, + "loss": 0.0792, + "mean_token_accuracy": 0.9806316941976547, + "num_tokens": 3644746.0, + "step": 413 + }, + { + "entropy": 2.2143171429634094, + "epoch": 1.4959276018099548, + "grad_norm": 0.7531687021255493, + "learning_rate": 0.00045581800790203366, + "loss": 0.2584, + "mean_token_accuracy": 0.9225966930389404, + "num_tokens": 3654064.0, + "step": 414 + }, + { + "entropy": 2.231681764125824, + "epoch": 1.4995475113122172, + "grad_norm": 0.6902768015861511, + "learning_rate": 0.00045552643031320726, + "loss": 0.232, + "mean_token_accuracy": 0.9433842301368713, + "num_tokens": 3663130.0, + "step": 415 + }, + { + "entropy": 2.2672717571258545, + "epoch": 1.5031674208144796, + "grad_norm": 0.5134314894676208, + "learning_rate": 0.00045523419004066273, + "loss": 0.0874, + "mean_token_accuracy": 0.9708191752433777, + "num_tokens": 3671981.0, + "step": 416 + }, + { + "entropy": 2.3302834033966064, + "epoch": 1.506787330316742, + "grad_norm": 0.885969340801239, + "learning_rate": 0.0004549412881943659, + "loss": 0.0723, + "mean_token_accuracy": 0.9791463166475296, + "num_tokens": 3680525.0, + "step": 417 + }, + { + "entropy": 2.2693899869918823, + "epoch": 1.5104072398190045, + "grad_norm": 0.7424856424331665, + "learning_rate": 0.00045464772588679547, + "loss": 0.1509, + "mean_token_accuracy": 0.9600907415151596, + "num_tokens": 3689430.0, + "step": 418 + }, + { + "entropy": 2.4042725563049316, + "epoch": 1.514027149321267, + "grad_norm": 0.8968034982681274, + "learning_rate": 0.0004543535042329382, + "loss": 0.1984, + "mean_token_accuracy": 0.9488537162542343, + "num_tokens": 3697836.0, + "step": 419 + }, + { + "entropy": 2.2518428564071655, + "epoch": 1.5176470588235293, + "grad_norm": 0.5963534712791443, + "learning_rate": 0.0004540586243502858, + "loss": 0.1214, + "mean_token_accuracy": 0.9711381644010544, + "num_tokens": 3706675.0, + "step": 420 + }, + { + "entropy": 2.275522291660309, + "epoch": 1.5212669683257918, + "grad_norm": 1.0797090530395508, + "learning_rate": 0.0004537630873588293, + "loss": 0.2508, + "mean_token_accuracy": 0.9247037768363953, + "num_tokens": 3715631.0, + "step": 421 + }, + { + "entropy": 2.249617278575897, + "epoch": 1.5248868778280542, + "grad_norm": 0.7636313438415527, + "learning_rate": 0.000453466894381056, + "loss": 0.1112, + "mean_token_accuracy": 0.9681926071643829, + "num_tokens": 3724579.0, + "step": 422 + }, + { + "entropy": 2.280571699142456, + "epoch": 1.5285067873303166, + "grad_norm": 0.9915648698806763, + "learning_rate": 0.00045317004654194464, + "loss": 0.3532, + "mean_token_accuracy": 0.9360047876834869, + "num_tokens": 3733607.0, + "step": 423 + }, + { + "entropy": 2.241512656211853, + "epoch": 1.532126696832579, + "grad_norm": 0.924977719783783, + "learning_rate": 0.0004528725449689611, + "loss": 0.1997, + "mean_token_accuracy": 0.9475428760051727, + "num_tokens": 3742611.0, + "step": 424 + }, + { + "entropy": 2.201731503009796, + "epoch": 1.5357466063348415, + "grad_norm": 0.7018861770629883, + "learning_rate": 0.0004525743907920542, + "loss": 0.1683, + "mean_token_accuracy": 0.9465018659830093, + "num_tokens": 3751737.0, + "step": 425 + }, + { + "entropy": 2.28944593667984, + "epoch": 1.539366515837104, + "grad_norm": 0.5893452763557434, + "learning_rate": 0.00045227558514365166, + "loss": 0.0969, + "mean_token_accuracy": 0.9711766839027405, + "num_tokens": 3761245.0, + "step": 426 + }, + { + "entropy": 2.3497202396392822, + "epoch": 1.5429864253393664, + "grad_norm": 0.685279130935669, + "learning_rate": 0.0004519761291586551, + "loss": 0.106, + "mean_token_accuracy": 0.9663016647100449, + "num_tokens": 3769854.0, + "step": 427 + }, + { + "entropy": 2.308362066745758, + "epoch": 1.5466063348416288, + "grad_norm": 0.5116177797317505, + "learning_rate": 0.00045167602397443694, + "loss": 0.1132, + "mean_token_accuracy": 0.9700013697147369, + "num_tokens": 3778996.0, + "step": 428 + }, + { + "entropy": 2.238637685775757, + "epoch": 1.5502262443438914, + "grad_norm": 0.8374833464622498, + "learning_rate": 0.00045137527073083457, + "loss": 0.2539, + "mean_token_accuracy": 0.9407305717468262, + "num_tokens": 3787835.0, + "step": 429 + }, + { + "entropy": 2.3406758308410645, + "epoch": 1.5538461538461539, + "grad_norm": 0.5140913724899292, + "learning_rate": 0.0004510738705701473, + "loss": 0.1113, + "mean_token_accuracy": 0.9635641574859619, + "num_tokens": 3796498.0, + "step": 430 + }, + { + "entropy": 2.2642539143562317, + "epoch": 1.5574660633484163, + "grad_norm": 0.5750702023506165, + "learning_rate": 0.0004507718246371313, + "loss": 0.1127, + "mean_token_accuracy": 0.9660817235708237, + "num_tokens": 3805464.0, + "step": 431 + }, + { + "entropy": 2.2058264315128326, + "epoch": 1.5610859728506787, + "grad_norm": 0.6448659300804138, + "learning_rate": 0.0004504691340789955, + "loss": 0.0994, + "mean_token_accuracy": 0.96739861369133, + "num_tokens": 3814309.0, + "step": 432 + }, + { + "entropy": 2.330399215221405, + "epoch": 1.5647058823529412, + "grad_norm": 0.8432528376579285, + "learning_rate": 0.0004501658000453973, + "loss": 0.1999, + "mean_token_accuracy": 0.9510775059461594, + "num_tokens": 3823126.0, + "step": 433 + }, + { + "entropy": 2.4211326837539673, + "epoch": 1.5683257918552036, + "grad_norm": 0.8101194500923157, + "learning_rate": 0.00044986182368843806, + "loss": 0.144, + "mean_token_accuracy": 0.9656328558921814, + "num_tokens": 3831274.0, + "step": 434 + }, + { + "entropy": 2.2594956755638123, + "epoch": 1.571945701357466, + "grad_norm": 0.6753663420677185, + "learning_rate": 0.0004495572061626585, + "loss": 0.1433, + "mean_token_accuracy": 0.9572386592626572, + "num_tokens": 3840206.0, + "step": 435 + }, + { + "entropy": 2.1233682930469513, + "epoch": 1.5755656108597285, + "grad_norm": 0.48616713285446167, + "learning_rate": 0.000449251948625035, + "loss": 0.0934, + "mean_token_accuracy": 0.9740773588418961, + "num_tokens": 3849363.0, + "step": 436 + }, + { + "entropy": 2.325556695461273, + "epoch": 1.5791855203619911, + "grad_norm": 0.7744045853614807, + "learning_rate": 0.00044894605223497446, + "loss": 0.127, + "mean_token_accuracy": 0.9687052518129349, + "num_tokens": 3857733.0, + "step": 437 + }, + { + "entropy": 2.266542673110962, + "epoch": 1.5828054298642535, + "grad_norm": 2.373530387878418, + "learning_rate": 0.00044863951815431045, + "loss": 0.2404, + "mean_token_accuracy": 0.9437267184257507, + "num_tokens": 3866374.0, + "step": 438 + }, + { + "entropy": 2.1757248640060425, + "epoch": 1.586425339366516, + "grad_norm": 0.5588560700416565, + "learning_rate": 0.00044833234754729847, + "loss": 0.142, + "mean_token_accuracy": 0.9601300358772278, + "num_tokens": 3875520.0, + "step": 439 + }, + { + "entropy": 2.124377518892288, + "epoch": 1.5900452488687784, + "grad_norm": 0.5602438449859619, + "learning_rate": 0.0004480245415806116, + "loss": 0.1556, + "mean_token_accuracy": 0.9561446160078049, + "num_tokens": 3884345.0, + "step": 440 + }, + { + "entropy": 2.1571075320243835, + "epoch": 1.5936651583710408, + "grad_norm": 0.472598671913147, + "learning_rate": 0.0004477161014233361, + "loss": 0.0848, + "mean_token_accuracy": 0.9742853343486786, + "num_tokens": 3893129.0, + "step": 441 + }, + { + "entropy": 2.0434057414531708, + "epoch": 1.5972850678733033, + "grad_norm": 0.7104448676109314, + "learning_rate": 0.00044740702824696703, + "loss": 0.1524, + "mean_token_accuracy": 0.9542464315891266, + "num_tokens": 3902120.0, + "step": 442 + }, + { + "entropy": 2.1118403673171997, + "epoch": 1.6009049773755657, + "grad_norm": 0.6632394194602966, + "learning_rate": 0.0004470973232254037, + "loss": 0.3001, + "mean_token_accuracy": 0.928197592496872, + "num_tokens": 3910974.0, + "step": 443 + }, + { + "entropy": 2.0292475819587708, + "epoch": 1.6045248868778281, + "grad_norm": 1.050956130027771, + "learning_rate": 0.00044678698753494527, + "loss": 0.2226, + "mean_token_accuracy": 0.9448522627353668, + "num_tokens": 3920005.0, + "step": 444 + }, + { + "entropy": 1.991033524274826, + "epoch": 1.6081447963800906, + "grad_norm": 0.670244038105011, + "learning_rate": 0.00044647602235428624, + "loss": 0.2158, + "mean_token_accuracy": 0.9551118016242981, + "num_tokens": 3929334.0, + "step": 445 + }, + { + "entropy": 2.04949289560318, + "epoch": 1.611764705882353, + "grad_norm": 0.6321494579315186, + "learning_rate": 0.00044616442886451197, + "loss": 0.1743, + "mean_token_accuracy": 0.9494802355766296, + "num_tokens": 3938211.0, + "step": 446 + }, + { + "entropy": 2.1101951897144318, + "epoch": 1.6153846153846154, + "grad_norm": 0.6970012187957764, + "learning_rate": 0.0004458522082490943, + "loss": 0.1228, + "mean_token_accuracy": 0.9624926447868347, + "num_tokens": 3946534.0, + "step": 447 + }, + { + "entropy": 1.9337081909179688, + "epoch": 1.6190045248868778, + "grad_norm": 0.5971657633781433, + "learning_rate": 0.0004455393616938868, + "loss": 0.1431, + "mean_token_accuracy": 0.9635348320007324, + "num_tokens": 3955694.0, + "step": 448 + }, + { + "entropy": 1.9635128676891327, + "epoch": 1.6226244343891403, + "grad_norm": 0.8510827422142029, + "learning_rate": 0.00044522589038712074, + "loss": 0.2446, + "mean_token_accuracy": 0.9457641988992691, + "num_tokens": 3964907.0, + "step": 449 + }, + { + "entropy": 2.0336360335350037, + "epoch": 1.6262443438914027, + "grad_norm": 0.5803818106651306, + "learning_rate": 0.00044491179551939985, + "loss": 0.0872, + "mean_token_accuracy": 0.9734505414962769, + "num_tokens": 3973584.0, + "step": 450 + }, + { + "entropy": 2.0668878853321075, + "epoch": 1.6298642533936651, + "grad_norm": 0.6990496516227722, + "learning_rate": 0.0004445970782836967, + "loss": 0.1138, + "mean_token_accuracy": 0.9702571034431458, + "num_tokens": 3982632.0, + "step": 451 + }, + { + "entropy": 2.1481760144233704, + "epoch": 1.6334841628959276, + "grad_norm": 0.6156729459762573, + "learning_rate": 0.00044428173987534733, + "loss": 0.0936, + "mean_token_accuracy": 0.9739355593919754, + "num_tokens": 3991147.0, + "step": 452 + }, + { + "entropy": 2.0678701996803284, + "epoch": 1.63710407239819, + "grad_norm": 0.5441684126853943, + "learning_rate": 0.0004439657814920472, + "loss": 0.123, + "mean_token_accuracy": 0.9693446308374405, + "num_tokens": 3999990.0, + "step": 453 + }, + { + "entropy": 1.9867055118083954, + "epoch": 1.6407239819004524, + "grad_norm": 0.9218093156814575, + "learning_rate": 0.00044364920433384656, + "loss": 0.1997, + "mean_token_accuracy": 0.9564195573329926, + "num_tokens": 4009097.0, + "step": 454 + }, + { + "entropy": 2.145586997270584, + "epoch": 1.6443438914027149, + "grad_norm": 0.77643883228302, + "learning_rate": 0.0004433320096031458, + "loss": 0.1491, + "mean_token_accuracy": 0.9602408111095428, + "num_tokens": 4018059.0, + "step": 455 + }, + { + "entropy": 2.071108251810074, + "epoch": 1.6479638009049773, + "grad_norm": 0.5267088413238525, + "learning_rate": 0.0004430141985046909, + "loss": 0.0875, + "mean_token_accuracy": 0.9764399826526642, + "num_tokens": 4027089.0, + "step": 456 + }, + { + "entropy": 2.1659318804740906, + "epoch": 1.6515837104072397, + "grad_norm": 1.0642318725585938, + "learning_rate": 0.000442695772245569, + "loss": 0.2623, + "mean_token_accuracy": 0.9307756721973419, + "num_tokens": 4035719.0, + "step": 457 + }, + { + "entropy": 2.0232724249362946, + "epoch": 1.6552036199095022, + "grad_norm": 0.6213289499282837, + "learning_rate": 0.0004423767320352035, + "loss": 0.1597, + "mean_token_accuracy": 0.9599647223949432, + "num_tokens": 4045088.0, + "step": 458 + }, + { + "entropy": 2.047410547733307, + "epoch": 1.6588235294117646, + "grad_norm": 0.6346105933189392, + "learning_rate": 0.0004420570790853498, + "loss": 0.1422, + "mean_token_accuracy": 0.9649711549282074, + "num_tokens": 4054262.0, + "step": 459 + }, + { + "entropy": 2.0923012793064117, + "epoch": 1.662443438914027, + "grad_norm": 0.46477749943733215, + "learning_rate": 0.0004417368146100907, + "loss": 0.079, + "mean_token_accuracy": 0.9777993708848953, + "num_tokens": 4063107.0, + "step": 460 + }, + { + "entropy": 2.168913394212723, + "epoch": 1.6660633484162894, + "grad_norm": 0.5164734721183777, + "learning_rate": 0.0004414159398258312, + "loss": 0.0941, + "mean_token_accuracy": 0.9725133627653122, + "num_tokens": 4071656.0, + "step": 461 + }, + { + "entropy": 2.152670443058014, + "epoch": 1.6696832579185519, + "grad_norm": 0.8985757231712341, + "learning_rate": 0.00044109445595129495, + "loss": 0.2142, + "mean_token_accuracy": 0.9387252777814865, + "num_tokens": 4080023.0, + "step": 462 + }, + { + "entropy": 2.111784875392914, + "epoch": 1.6733031674208145, + "grad_norm": 0.47521084547042847, + "learning_rate": 0.0004407723642075184, + "loss": 0.0581, + "mean_token_accuracy": 0.9821985810995102, + "num_tokens": 4088469.0, + "step": 463 + }, + { + "entropy": 1.9784683287143707, + "epoch": 1.676923076923077, + "grad_norm": 0.5552536249160767, + "learning_rate": 0.0004404496658178472, + "loss": 0.1353, + "mean_token_accuracy": 0.9619844257831573, + "num_tokens": 4097737.0, + "step": 464 + }, + { + "entropy": 2.015674114227295, + "epoch": 1.6805429864253394, + "grad_norm": 0.6078305244445801, + "learning_rate": 0.0004401263620079309, + "loss": 0.1916, + "mean_token_accuracy": 0.9506707191467285, + "num_tokens": 4107156.0, + "step": 465 + }, + { + "entropy": 2.0832217931747437, + "epoch": 1.6841628959276018, + "grad_norm": 0.6618755459785461, + "learning_rate": 0.0004398024540057186, + "loss": 0.1671, + "mean_token_accuracy": 0.9617152661085129, + "num_tokens": 4116019.0, + "step": 466 + }, + { + "entropy": 2.0383114516735077, + "epoch": 1.6877828054298643, + "grad_norm": 0.5774693489074707, + "learning_rate": 0.0004394779430414541, + "loss": 0.2647, + "mean_token_accuracy": 0.9387127161026001, + "num_tokens": 4125001.0, + "step": 467 + }, + { + "entropy": 2.201409190893173, + "epoch": 1.6914027149321267, + "grad_norm": 0.7600311636924744, + "learning_rate": 0.0004391528303476715, + "loss": 0.073, + "mean_token_accuracy": 0.979825034737587, + "num_tokens": 4133467.0, + "step": 468 + }, + { + "entropy": 2.168666422367096, + "epoch": 1.6950226244343891, + "grad_norm": 0.7801902294158936, + "learning_rate": 0.00043882711715919015, + "loss": 0.2406, + "mean_token_accuracy": 0.9451306313276291, + "num_tokens": 4141765.0, + "step": 469 + }, + { + "entropy": 2.1429262161254883, + "epoch": 1.6986425339366515, + "grad_norm": 0.5192358493804932, + "learning_rate": 0.0004385008047131104, + "loss": 0.1052, + "mean_token_accuracy": 0.9749262481927872, + "num_tokens": 4150732.0, + "step": 470 + }, + { + "entropy": 2.1387495696544647, + "epoch": 1.702262443438914, + "grad_norm": 0.6219777464866638, + "learning_rate": 0.0004381738942488083, + "loss": 0.2127, + "mean_token_accuracy": 0.9398418068885803, + "num_tokens": 4159715.0, + "step": 471 + }, + { + "entropy": 2.1718398332595825, + "epoch": 1.7058823529411766, + "grad_norm": 0.5738123655319214, + "learning_rate": 0.0004378463870079316, + "loss": 0.1703, + "mean_token_accuracy": 0.9520847648382187, + "num_tokens": 4168526.0, + "step": 472 + }, + { + "entropy": 2.2768235206604004, + "epoch": 1.709502262443439, + "grad_norm": 0.662564754486084, + "learning_rate": 0.00043751828423439456, + "loss": 0.138, + "mean_token_accuracy": 0.9581841826438904, + "num_tokens": 4177189.0, + "step": 473 + }, + { + "entropy": 2.29143089056015, + "epoch": 1.7131221719457015, + "grad_norm": 0.8638074398040771, + "learning_rate": 0.00043718958717437324, + "loss": 0.1432, + "mean_token_accuracy": 0.9645630270242691, + "num_tokens": 4185367.0, + "step": 474 + }, + { + "entropy": 2.2810245156288147, + "epoch": 1.716742081447964, + "grad_norm": 0.6139346957206726, + "learning_rate": 0.00043686029707630097, + "loss": 0.173, + "mean_token_accuracy": 0.9592728316783905, + "num_tokens": 4194418.0, + "step": 475 + }, + { + "entropy": 2.1307725310325623, + "epoch": 1.7203619909502263, + "grad_norm": 0.5192779302597046, + "learning_rate": 0.00043653041519086354, + "loss": 0.1025, + "mean_token_accuracy": 0.970764696598053, + "num_tokens": 4203705.0, + "step": 476 + }, + { + "entropy": 2.160595118999481, + "epoch": 1.7239819004524888, + "grad_norm": 0.7398526668548584, + "learning_rate": 0.0004361999427709943, + "loss": 0.229, + "mean_token_accuracy": 0.9352773874998093, + "num_tokens": 4212648.0, + "step": 477 + }, + { + "entropy": 2.1865442991256714, + "epoch": 1.7276018099547512, + "grad_norm": 0.6227203011512756, + "learning_rate": 0.0004358688810718699, + "loss": 0.1118, + "mean_token_accuracy": 0.9689576476812363, + "num_tokens": 4221208.0, + "step": 478 + }, + { + "entropy": 2.086527943611145, + "epoch": 1.7312217194570136, + "grad_norm": 0.722144603729248, + "learning_rate": 0.00043553723135090447, + "loss": 0.1656, + "mean_token_accuracy": 0.9537550210952759, + "num_tokens": 4230810.0, + "step": 479 + }, + { + "entropy": 2.068355441093445, + "epoch": 1.734841628959276, + "grad_norm": 0.5781517028808594, + "learning_rate": 0.0004352049948677462, + "loss": 0.1497, + "mean_token_accuracy": 0.9600837379693985, + "num_tokens": 4240394.0, + "step": 480 + }, + { + "entropy": 2.185140371322632, + "epoch": 1.7384615384615385, + "grad_norm": 0.7261873483657837, + "learning_rate": 0.0004348721728842715, + "loss": 0.1582, + "mean_token_accuracy": 0.9584025889635086, + "num_tokens": 4249205.0, + "step": 481 + }, + { + "entropy": 2.21835720539093, + "epoch": 1.742081447963801, + "grad_norm": 0.5321667194366455, + "learning_rate": 0.0004345387666645807, + "loss": 0.1344, + "mean_token_accuracy": 0.9659005403518677, + "num_tokens": 4257808.0, + "step": 482 + }, + { + "entropy": 2.078131854534149, + "epoch": 1.7457013574660634, + "grad_norm": 0.5598498582839966, + "learning_rate": 0.00043420477747499307, + "loss": 0.1347, + "mean_token_accuracy": 0.9678008407354355, + "num_tokens": 4266728.0, + "step": 483 + }, + { + "entropy": 2.060504525899887, + "epoch": 1.7493212669683258, + "grad_norm": 0.5017166137695312, + "learning_rate": 0.0004338702065840422, + "loss": 0.0722, + "mean_token_accuracy": 0.9762782007455826, + "num_tokens": 4275514.0, + "step": 484 + }, + { + "entropy": 2.165244698524475, + "epoch": 1.7529411764705882, + "grad_norm": 0.4664002060890198, + "learning_rate": 0.00043353505526247084, + "loss": 0.1206, + "mean_token_accuracy": 0.9696767777204514, + "num_tokens": 4284013.0, + "step": 485 + }, + { + "entropy": 2.103049159049988, + "epoch": 1.7565610859728507, + "grad_norm": 0.6669000387191772, + "learning_rate": 0.0004331993247832265, + "loss": 0.1052, + "mean_token_accuracy": 0.9665459096431732, + "num_tokens": 4293011.0, + "step": 486 + }, + { + "entropy": 2.1286613941192627, + "epoch": 1.760180995475113, + "grad_norm": 0.7821269631385803, + "learning_rate": 0.00043286301642145634, + "loss": 0.3669, + "mean_token_accuracy": 0.9062697291374207, + "num_tokens": 4301965.0, + "step": 487 + }, + { + "entropy": 2.098009169101715, + "epoch": 1.7638009049773755, + "grad_norm": 0.5720731616020203, + "learning_rate": 0.0004325261314545024, + "loss": 0.1324, + "mean_token_accuracy": 0.9650943875312805, + "num_tokens": 4310914.0, + "step": 488 + }, + { + "entropy": 2.164614498615265, + "epoch": 1.767420814479638, + "grad_norm": 1.0500473976135254, + "learning_rate": 0.0004321886711618967, + "loss": 0.1182, + "mean_token_accuracy": 0.9720661342144012, + "num_tokens": 4319072.0, + "step": 489 + }, + { + "entropy": 2.2015402913093567, + "epoch": 1.7710407239819004, + "grad_norm": 0.5770253539085388, + "learning_rate": 0.00043185063682535634, + "loss": 0.1226, + "mean_token_accuracy": 0.9615659862756729, + "num_tokens": 4327539.0, + "step": 490 + }, + { + "entropy": 2.075456440448761, + "epoch": 1.7746606334841628, + "grad_norm": 0.6456925272941589, + "learning_rate": 0.0004315120297287789, + "loss": 0.1123, + "mean_token_accuracy": 0.9628709554672241, + "num_tokens": 4336523.0, + "step": 491 + }, + { + "entropy": 2.158169150352478, + "epoch": 1.7782805429864252, + "grad_norm": 0.8282069563865662, + "learning_rate": 0.00043117285115823733, + "loss": 0.2146, + "mean_token_accuracy": 0.9413971602916718, + "num_tokens": 4345294.0, + "step": 492 + }, + { + "entropy": 2.02735897898674, + "epoch": 1.7819004524886877, + "grad_norm": 0.783597469329834, + "learning_rate": 0.000430833102401975, + "loss": 0.1376, + "mean_token_accuracy": 0.964630737900734, + "num_tokens": 4354107.0, + "step": 493 + }, + { + "entropy": 2.138492166996002, + "epoch": 1.78552036199095, + "grad_norm": 0.6317175030708313, + "learning_rate": 0.000430492784750401, + "loss": 0.1005, + "mean_token_accuracy": 0.9734214246273041, + "num_tokens": 4362560.0, + "step": 494 + }, + { + "entropy": 2.0253217220306396, + "epoch": 1.7891402714932125, + "grad_norm": 0.5523395538330078, + "learning_rate": 0.000430151899496085, + "loss": 0.1633, + "mean_token_accuracy": 0.9558031558990479, + "num_tokens": 4371698.0, + "step": 495 + }, + { + "entropy": 2.160472810268402, + "epoch": 1.792760180995475, + "grad_norm": 0.6557935476303101, + "learning_rate": 0.00042981044793375295, + "loss": 0.1154, + "mean_token_accuracy": 0.9722230583429337, + "num_tokens": 4380612.0, + "step": 496 + }, + { + "entropy": 2.0284159183502197, + "epoch": 1.7963800904977374, + "grad_norm": 0.7357863187789917, + "learning_rate": 0.00042946843136028117, + "loss": 0.1166, + "mean_token_accuracy": 0.9629471153020859, + "num_tokens": 4389521.0, + "step": 497 + }, + { + "entropy": 2.1544791162014008, + "epoch": 1.8, + "grad_norm": 0.5604898929595947, + "learning_rate": 0.00042912585107469226, + "loss": 0.0834, + "mean_token_accuracy": 0.9783036410808563, + "num_tokens": 4398059.0, + "step": 498 + }, + { + "entropy": 2.1051094830036163, + "epoch": 1.8036199095022625, + "grad_norm": 0.4598539173603058, + "learning_rate": 0.0004287827083781497, + "loss": 0.0411, + "mean_token_accuracy": 0.9868490546941757, + "num_tokens": 4406453.0, + "step": 499 + }, + { + "entropy": 2.0219272077083588, + "epoch": 1.807239819004525, + "grad_norm": 0.8164628744125366, + "learning_rate": 0.00042843900457395343, + "loss": 0.1988, + "mean_token_accuracy": 0.9502352625131607, + "num_tokens": 4415440.0, + "step": 500 + }, + { + "entropy": 1.980013906955719, + "epoch": 1.8108597285067873, + "grad_norm": 0.572798490524292, + "learning_rate": 0.0004280947409675341, + "loss": 0.1148, + "mean_token_accuracy": 0.966580331325531, + "num_tokens": 4424532.0, + "step": 501 + }, + { + "entropy": 2.0646563172340393, + "epoch": 1.8144796380090498, + "grad_norm": 0.769386351108551, + "learning_rate": 0.00042774991886644875, + "loss": 0.1592, + "mean_token_accuracy": 0.9553463608026505, + "num_tokens": 4432913.0, + "step": 502 + }, + { + "entropy": 2.040877491235733, + "epoch": 1.8180995475113122, + "grad_norm": 0.7467371821403503, + "learning_rate": 0.0004274045395803758, + "loss": 0.2247, + "mean_token_accuracy": 0.9526964277029037, + "num_tokens": 4441425.0, + "step": 503 + }, + { + "entropy": 1.9934698939323425, + "epoch": 1.8217194570135746, + "grad_norm": 0.6602952480316162, + "learning_rate": 0.00042705860442110964, + "loss": 0.1681, + "mean_token_accuracy": 0.9594631940126419, + "num_tokens": 4450383.0, + "step": 504 + }, + { + "entropy": 2.0858289897441864, + "epoch": 1.825339366515837, + "grad_norm": 0.684380829334259, + "learning_rate": 0.0004267121147025562, + "loss": 0.1154, + "mean_token_accuracy": 0.9638111293315887, + "num_tokens": 4458862.0, + "step": 505 + }, + { + "entropy": 2.0886995792388916, + "epoch": 1.8289592760180997, + "grad_norm": 0.5784837007522583, + "learning_rate": 0.00042636507174072756, + "loss": 0.1026, + "mean_token_accuracy": 0.9676834791898727, + "num_tokens": 4467386.0, + "step": 506 + }, + { + "entropy": 2.0236063301563263, + "epoch": 1.8325791855203621, + "grad_norm": 0.5101180672645569, + "learning_rate": 0.00042601747685373716, + "loss": 0.1031, + "mean_token_accuracy": 0.9734093993902206, + "num_tokens": 4476054.0, + "step": 507 + }, + { + "entropy": 1.9801031053066254, + "epoch": 1.8361990950226246, + "grad_norm": 0.6581607460975647, + "learning_rate": 0.00042566933136179455, + "loss": 0.1548, + "mean_token_accuracy": 0.9581006914377213, + "num_tokens": 4484895.0, + "step": 508 + }, + { + "entropy": 2.0244787633419037, + "epoch": 1.839819004524887, + "grad_norm": 0.8100608587265015, + "learning_rate": 0.0004253206365872008, + "loss": 0.196, + "mean_token_accuracy": 0.9532899260520935, + "num_tokens": 4493737.0, + "step": 509 + }, + { + "entropy": 1.9108119010925293, + "epoch": 1.8434389140271494, + "grad_norm": 0.4903942048549652, + "learning_rate": 0.00042497139385434314, + "loss": 0.1313, + "mean_token_accuracy": 0.9667337089776993, + "num_tokens": 4502840.0, + "step": 510 + }, + { + "entropy": 2.009468197822571, + "epoch": 1.8470588235294119, + "grad_norm": 0.6010113954544067, + "learning_rate": 0.0004246216044896897, + "loss": 0.1013, + "mean_token_accuracy": 0.9692314714193344, + "num_tokens": 4511407.0, + "step": 511 + }, + { + "entropy": 2.0337170362472534, + "epoch": 1.8506787330316743, + "grad_norm": 0.7906802892684937, + "learning_rate": 0.00042427126982178546, + "loss": 0.1682, + "mean_token_accuracy": 0.9550099819898605, + "num_tokens": 4520018.0, + "step": 512 + }, + { + "entropy": 1.8813888728618622, + "epoch": 1.8542986425339367, + "grad_norm": 0.5353080034255981, + "learning_rate": 0.00042392039118124586, + "loss": 0.1228, + "mean_token_accuracy": 0.9624074995517731, + "num_tokens": 4529270.0, + "step": 513 + }, + { + "entropy": 2.012698233127594, + "epoch": 1.8579185520361992, + "grad_norm": 0.6713843941688538, + "learning_rate": 0.00042356896990075285, + "loss": 0.2225, + "mean_token_accuracy": 0.9417333751916885, + "num_tokens": 4538008.0, + "step": 514 + }, + { + "entropy": 1.880586564540863, + "epoch": 1.8615384615384616, + "grad_norm": 0.5821724534034729, + "learning_rate": 0.00042321700731504916, + "loss": 0.1144, + "mean_token_accuracy": 0.9677341282367706, + "num_tokens": 4546950.0, + "step": 515 + }, + { + "entropy": 2.0066279470920563, + "epoch": 1.865158371040724, + "grad_norm": 0.4095056354999542, + "learning_rate": 0.0004228645047609335, + "loss": 0.0424, + "mean_token_accuracy": 0.9854962974786758, + "num_tokens": 4555452.0, + "step": 516 + }, + { + "entropy": 2.042815536260605, + "epoch": 1.8687782805429864, + "grad_norm": 0.5398769974708557, + "learning_rate": 0.0004225114635772555, + "loss": 0.1343, + "mean_token_accuracy": 0.9615450948476791, + "num_tokens": 4564386.0, + "step": 517 + }, + { + "entropy": 2.0948933362960815, + "epoch": 1.8723981900452489, + "grad_norm": 0.6738974452018738, + "learning_rate": 0.0004221578851049107, + "loss": 0.1541, + "mean_token_accuracy": 0.9526563137769699, + "num_tokens": 4573041.0, + "step": 518 + }, + { + "entropy": 2.102545380592346, + "epoch": 1.8760180995475113, + "grad_norm": 0.7769943475723267, + "learning_rate": 0.00042180377068683504, + "loss": 0.2362, + "mean_token_accuracy": 0.9472651779651642, + "num_tokens": 4581666.0, + "step": 519 + }, + { + "entropy": 2.087820291519165, + "epoch": 1.8796380090497737, + "grad_norm": 0.5722424983978271, + "learning_rate": 0.0004214491216680004, + "loss": 0.1657, + "mean_token_accuracy": 0.9537082612514496, + "num_tokens": 4590238.0, + "step": 520 + }, + { + "entropy": 2.0093430876731873, + "epoch": 1.8832579185520362, + "grad_norm": 0.5844932198524475, + "learning_rate": 0.00042109393939540867, + "loss": 0.1485, + "mean_token_accuracy": 0.9624215811491013, + "num_tokens": 4599352.0, + "step": 521 + }, + { + "entropy": 1.9117147326469421, + "epoch": 1.8868778280542986, + "grad_norm": 0.46085676550865173, + "learning_rate": 0.0004207382252180876, + "loss": 0.0853, + "mean_token_accuracy": 0.9769327491521835, + "num_tokens": 4608571.0, + "step": 522 + }, + { + "entropy": 2.0205602943897247, + "epoch": 1.890497737556561, + "grad_norm": 0.5571608543395996, + "learning_rate": 0.000420381980487085, + "loss": 0.1517, + "mean_token_accuracy": 0.9646699875593185, + "num_tokens": 4617445.0, + "step": 523 + }, + { + "entropy": 1.9571953415870667, + "epoch": 1.8941176470588235, + "grad_norm": 0.470630943775177, + "learning_rate": 0.0004200252065554636, + "loss": 0.1005, + "mean_token_accuracy": 0.9750025719404221, + "num_tokens": 4626756.0, + "step": 524 + }, + { + "entropy": 2.063209116458893, + "epoch": 1.897737556561086, + "grad_norm": 0.6447069644927979, + "learning_rate": 0.00041966790477829637, + "loss": 0.113, + "mean_token_accuracy": 0.9695079624652863, + "num_tokens": 4635378.0, + "step": 525 + }, + { + "entropy": 1.9232109785079956, + "epoch": 1.9013574660633483, + "grad_norm": 0.5114295482635498, + "learning_rate": 0.000419310076512661, + "loss": 0.1492, + "mean_token_accuracy": 0.9653338938951492, + "num_tokens": 4644769.0, + "step": 526 + }, + { + "entropy": 2.1691197752952576, + "epoch": 1.9049773755656108, + "grad_norm": 0.7630137205123901, + "learning_rate": 0.00041895172311763476, + "loss": 0.212, + "mean_token_accuracy": 0.9533941894769669, + "num_tokens": 4652857.0, + "step": 527 + }, + { + "entropy": 2.04753240942955, + "epoch": 1.9085972850678732, + "grad_norm": 0.6423042416572571, + "learning_rate": 0.00041859284595428955, + "loss": 0.1455, + "mean_token_accuracy": 0.956505224108696, + "num_tokens": 4661591.0, + "step": 528 + }, + { + "entropy": 1.9440338611602783, + "epoch": 1.9122171945701356, + "grad_norm": 0.5011327266693115, + "learning_rate": 0.00041823344638568656, + "loss": 0.1255, + "mean_token_accuracy": 0.965131089091301, + "num_tokens": 4670594.0, + "step": 529 + }, + { + "entropy": 2.0554805397987366, + "epoch": 1.915837104072398, + "grad_norm": 0.5821590423583984, + "learning_rate": 0.0004178735257768713, + "loss": 0.0486, + "mean_token_accuracy": 0.9875282496213913, + "num_tokens": 4679344.0, + "step": 530 + }, + { + "entropy": 2.130349576473236, + "epoch": 1.9194570135746605, + "grad_norm": 0.5332052111625671, + "learning_rate": 0.0004175130854948679, + "loss": 0.0915, + "mean_token_accuracy": 0.9737034440040588, + "num_tokens": 4687922.0, + "step": 531 + }, + { + "entropy": 2.146788775920868, + "epoch": 1.9230769230769231, + "grad_norm": 0.5016877055168152, + "learning_rate": 0.00041715212690867455, + "loss": 0.1281, + "mean_token_accuracy": 0.9681432545185089, + "num_tokens": 4696593.0, + "step": 532 + }, + { + "entropy": 2.041268438100815, + "epoch": 1.9266968325791856, + "grad_norm": 0.5257729887962341, + "learning_rate": 0.00041679065138925807, + "loss": 0.1272, + "mean_token_accuracy": 0.9649266451597214, + "num_tokens": 4705792.0, + "step": 533 + }, + { + "entropy": 2.114819645881653, + "epoch": 1.930316742081448, + "grad_norm": 0.7085135579109192, + "learning_rate": 0.0004164286603095484, + "loss": 0.1545, + "mean_token_accuracy": 0.9581228941679001, + "num_tokens": 4714599.0, + "step": 534 + }, + { + "entropy": 2.022280514240265, + "epoch": 1.9339366515837104, + "grad_norm": 0.5309014320373535, + "learning_rate": 0.00041606615504443387, + "loss": 0.1933, + "mean_token_accuracy": 0.9562340676784515, + "num_tokens": 4724062.0, + "step": 535 + }, + { + "entropy": 2.0959260165691376, + "epoch": 1.9375565610859729, + "grad_norm": 0.6528061628341675, + "learning_rate": 0.0004157031369707557, + "loss": 0.1306, + "mean_token_accuracy": 0.9612343460321426, + "num_tokens": 4733077.0, + "step": 536 + }, + { + "entropy": 2.2772948145866394, + "epoch": 1.9411764705882353, + "grad_norm": 0.7351471185684204, + "learning_rate": 0.0004153396074673028, + "loss": 0.1494, + "mean_token_accuracy": 0.9608108699321747, + "num_tokens": 4741201.0, + "step": 537 + }, + { + "entropy": 2.0935052037239075, + "epoch": 1.9447963800904977, + "grad_norm": 0.5435840487480164, + "learning_rate": 0.0004149755679148065, + "loss": 0.0884, + "mean_token_accuracy": 0.9745689779520035, + "num_tokens": 4750306.0, + "step": 538 + }, + { + "entropy": 2.2082818746566772, + "epoch": 1.9484162895927601, + "grad_norm": 0.3780331611633301, + "learning_rate": 0.00041461101969593537, + "loss": 0.0739, + "mean_token_accuracy": 0.9777179658412933, + "num_tokens": 4758954.0, + "step": 539 + }, + { + "entropy": 2.1683040261268616, + "epoch": 1.9520361990950226, + "grad_norm": 0.4637961685657501, + "learning_rate": 0.00041424596419529017, + "loss": 0.0632, + "mean_token_accuracy": 0.9834533184766769, + "num_tokens": 4767615.0, + "step": 540 + }, + { + "entropy": 2.075555235147476, + "epoch": 1.9556561085972852, + "grad_norm": 0.7603118419647217, + "learning_rate": 0.00041388040279939804, + "loss": 0.2835, + "mean_token_accuracy": 0.9364205300807953, + "num_tokens": 4776714.0, + "step": 541 + }, + { + "entropy": 2.18926739692688, + "epoch": 1.9592760180995477, + "grad_norm": 0.8895708918571472, + "learning_rate": 0.0004135143368967079, + "loss": 0.2514, + "mean_token_accuracy": 0.9361050724983215, + "num_tokens": 4785402.0, + "step": 542 + }, + { + "entropy": 2.2387169003486633, + "epoch": 1.96289592760181, + "grad_norm": 0.6013544797897339, + "learning_rate": 0.00041314776787758454, + "loss": 0.1502, + "mean_token_accuracy": 0.9594238847494125, + "num_tokens": 4793928.0, + "step": 543 + }, + { + "entropy": 2.208383619785309, + "epoch": 1.9665158371040725, + "grad_norm": 0.6934756636619568, + "learning_rate": 0.00041278069713430386, + "loss": 0.1777, + "mean_token_accuracy": 0.9619583487510681, + "num_tokens": 4802612.0, + "step": 544 + }, + { + "entropy": 2.2621757984161377, + "epoch": 1.970135746606335, + "grad_norm": 0.6920077800750732, + "learning_rate": 0.00041241312606104743, + "loss": 0.1689, + "mean_token_accuracy": 0.9594835937023163, + "num_tokens": 4811332.0, + "step": 545 + }, + { + "entropy": 2.2654454112052917, + "epoch": 1.9737556561085974, + "grad_norm": 0.6259592771530151, + "learning_rate": 0.000412045056053897, + "loss": 0.142, + "mean_token_accuracy": 0.9648078680038452, + "num_tokens": 4820441.0, + "step": 546 + }, + { + "entropy": 2.218056857585907, + "epoch": 1.9773755656108598, + "grad_norm": 0.5390617847442627, + "learning_rate": 0.0004116764885108292, + "loss": 0.1737, + "mean_token_accuracy": 0.9595656991004944, + "num_tokens": 4829437.0, + "step": 547 + }, + { + "entropy": 2.2571592330932617, + "epoch": 1.9809954751131222, + "grad_norm": 0.3656528890132904, + "learning_rate": 0.0004113074248317108, + "loss": 0.0545, + "mean_token_accuracy": 0.9825418293476105, + "num_tokens": 4838118.0, + "step": 548 + }, + { + "entropy": 2.1890549659729004, + "epoch": 1.9846153846153847, + "grad_norm": 0.5716155767440796, + "learning_rate": 0.00041093786641829247, + "loss": 0.0997, + "mean_token_accuracy": 0.9715700745582581, + "num_tokens": 4847073.0, + "step": 549 + }, + { + "entropy": 2.2726192474365234, + "epoch": 1.988235294117647, + "grad_norm": 0.4709530770778656, + "learning_rate": 0.0004105678146742042, + "loss": 0.0746, + "mean_token_accuracy": 0.9799739569425583, + "num_tokens": 4855755.0, + "step": 550 + }, + { + "entropy": 2.2328362464904785, + "epoch": 1.9918552036199095, + "grad_norm": 0.6773779392242432, + "learning_rate": 0.0004101972710049498, + "loss": 0.1418, + "mean_token_accuracy": 0.9629421681165695, + "num_tokens": 4864601.0, + "step": 551 + }, + { + "entropy": 2.199812740087509, + "epoch": 1.995475113122172, + "grad_norm": 0.717012882232666, + "learning_rate": 0.00040982623681790113, + "loss": 0.2948, + "mean_token_accuracy": 0.9432803690433502, + "num_tokens": 4873630.0, + "step": 552 + }, + { + "entropy": 2.2102787494659424, + "epoch": 1.9990950226244344, + "grad_norm": 0.6925314664840698, + "learning_rate": 0.00040945471352229346, + "loss": 0.2579, + "mean_token_accuracy": 0.9435124397277832, + "num_tokens": 4882714.0, + "step": 553 + }, + { + "entropy": 2.3318979740142822, + "epoch": 2.0, + "grad_norm": 2.688188314437866, + "learning_rate": 0.0004090827025292197, + "loss": 0.0283, + "mean_token_accuracy": 0.9918032884597778, + "num_tokens": 4883450.0, + "step": 554 + }, + { + "epoch": 2.0, + "eval_entropy": 2.2165925522160723, + "eval_loss": 0.16817161440849304, + "eval_mean_token_accuracy": 0.9567220133494555, + "eval_num_tokens": 4883450.0, + "eval_runtime": 116.1556, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 1.059, + "step": 554 + }, + { + "entropy": 2.0389976799488068, + "epoch": 2.0036199095022624, + "grad_norm": 0.8596204519271851, + "learning_rate": 0.00040871020525162484, + "loss": 0.1341, + "mean_token_accuracy": 0.9626202881336212, + "num_tokens": 4893236.0, + "step": 555 + }, + { + "entropy": 2.245832860469818, + "epoch": 2.007239819004525, + "grad_norm": 0.39707237482070923, + "learning_rate": 0.00040833722310430114, + "loss": 0.0564, + "mean_token_accuracy": 0.9868980199098587, + "num_tokens": 4901819.0, + "step": 556 + }, + { + "entropy": 2.169717162847519, + "epoch": 2.0108597285067873, + "grad_norm": 0.46584129333496094, + "learning_rate": 0.0004079637575038822, + "loss": 0.0792, + "mean_token_accuracy": 0.9758767485618591, + "num_tokens": 4910892.0, + "step": 557 + }, + { + "entropy": 2.27083820104599, + "epoch": 2.0144796380090497, + "grad_norm": 0.8394352197647095, + "learning_rate": 0.0004075898098688381, + "loss": 0.0962, + "mean_token_accuracy": 0.9723308384418488, + "num_tokens": 4919510.0, + "step": 558 + }, + { + "entropy": 2.1067663431167603, + "epoch": 2.018099547511312, + "grad_norm": 0.4951268434524536, + "learning_rate": 0.0004072153816194696, + "loss": 0.1195, + "mean_token_accuracy": 0.9703402817249298, + "num_tokens": 4928439.0, + "step": 559 + }, + { + "entropy": 2.016420066356659, + "epoch": 2.0217194570135746, + "grad_norm": 0.5574740171432495, + "learning_rate": 0.00040684047417790273, + "loss": 0.1037, + "mean_token_accuracy": 0.9727325141429901, + "num_tokens": 4938061.0, + "step": 560 + }, + { + "entropy": 2.1843727231025696, + "epoch": 2.025339366515837, + "grad_norm": 0.786014199256897, + "learning_rate": 0.00040646508896808394, + "loss": 0.155, + "mean_token_accuracy": 0.9608975350856781, + "num_tokens": 4946619.0, + "step": 561 + }, + { + "entropy": 2.160427451133728, + "epoch": 2.0289592760180994, + "grad_norm": 0.5267161130905151, + "learning_rate": 0.000406089227415774, + "loss": 0.0632, + "mean_token_accuracy": 0.9791042655706406, + "num_tokens": 4955324.0, + "step": 562 + }, + { + "entropy": 2.0923200249671936, + "epoch": 2.032579185520362, + "grad_norm": 0.8306187987327576, + "learning_rate": 0.00040571289094854304, + "loss": 0.1976, + "mean_token_accuracy": 0.9538775235414505, + "num_tokens": 4964321.0, + "step": 563 + }, + { + "entropy": 2.0181354880332947, + "epoch": 2.0361990950226243, + "grad_norm": 0.6798867583274841, + "learning_rate": 0.0004053360809957649, + "loss": 0.1797, + "mean_token_accuracy": 0.9569422006607056, + "num_tokens": 4973937.0, + "step": 564 + }, + { + "entropy": 2.123030036687851, + "epoch": 2.0398190045248867, + "grad_norm": 0.4481683671474457, + "learning_rate": 0.00040495879898861173, + "loss": 0.0639, + "mean_token_accuracy": 0.9827965050935745, + "num_tokens": 4982779.0, + "step": 565 + }, + { + "entropy": 2.0797010362148285, + "epoch": 2.043438914027149, + "grad_norm": 0.7745859622955322, + "learning_rate": 0.00040458104636004877, + "loss": 0.1602, + "mean_token_accuracy": 0.9600242227315903, + "num_tokens": 4991793.0, + "step": 566 + }, + { + "entropy": 2.0320390164852142, + "epoch": 2.0470588235294116, + "grad_norm": 0.5792120695114136, + "learning_rate": 0.0004042028245448286, + "loss": 0.0816, + "mean_token_accuracy": 0.9757721722126007, + "num_tokens": 5000834.0, + "step": 567 + }, + { + "entropy": 2.1047743558883667, + "epoch": 2.050678733031674, + "grad_norm": 0.5770072937011719, + "learning_rate": 0.0004038241349794858, + "loss": 0.1367, + "mean_token_accuracy": 0.9598450362682343, + "num_tokens": 5010155.0, + "step": 568 + }, + { + "entropy": 2.022550255060196, + "epoch": 2.0542986425339365, + "grad_norm": 0.47085902094841003, + "learning_rate": 0.0004034449791023319, + "loss": 0.1005, + "mean_token_accuracy": 0.970214769244194, + "num_tokens": 5020010.0, + "step": 569 + }, + { + "entropy": 2.034317582845688, + "epoch": 2.057918552036199, + "grad_norm": 0.4816018044948578, + "learning_rate": 0.0004030653583534489, + "loss": 0.118, + "mean_token_accuracy": 0.9635649025440216, + "num_tokens": 5029205.0, + "step": 570 + }, + { + "entropy": 2.1142700910568237, + "epoch": 2.0615384615384613, + "grad_norm": 0.561765730381012, + "learning_rate": 0.0004026852741746849, + "loss": 0.0628, + "mean_token_accuracy": 0.9811093211174011, + "num_tokens": 5037830.0, + "step": 571 + }, + { + "entropy": 2.1506906747817993, + "epoch": 2.065158371040724, + "grad_norm": 0.9037840366363525, + "learning_rate": 0.0004023047280096482, + "loss": 0.1395, + "mean_token_accuracy": 0.9645196944475174, + "num_tokens": 5046618.0, + "step": 572 + }, + { + "entropy": 2.1811060309410095, + "epoch": 2.0687782805429866, + "grad_norm": 0.6224188208580017, + "learning_rate": 0.0004019237213037014, + "loss": 0.0766, + "mean_token_accuracy": 0.9752616137266159, + "num_tokens": 5055467.0, + "step": 573 + }, + { + "entropy": 2.0479070246219635, + "epoch": 2.072398190045249, + "grad_norm": 0.5052458643913269, + "learning_rate": 0.00040154225550395665, + "loss": 0.091, + "mean_token_accuracy": 0.9753529280424118, + "num_tokens": 5064518.0, + "step": 574 + }, + { + "entropy": 2.18623149394989, + "epoch": 2.0760180995475115, + "grad_norm": 0.49587905406951904, + "learning_rate": 0.00040116033205926964, + "loss": 0.0703, + "mean_token_accuracy": 0.979348823428154, + "num_tokens": 5072713.0, + "step": 575 + }, + { + "entropy": 2.131018817424774, + "epoch": 2.079638009049774, + "grad_norm": 0.607468843460083, + "learning_rate": 0.0004007779524202343, + "loss": 0.0988, + "mean_token_accuracy": 0.9756181836128235, + "num_tokens": 5081046.0, + "step": 576 + }, + { + "entropy": 2.0251292288303375, + "epoch": 2.0832579185520363, + "grad_norm": 0.867511510848999, + "learning_rate": 0.00040039511803917723, + "loss": 0.1672, + "mean_token_accuracy": 0.9638413190841675, + "num_tokens": 5089859.0, + "step": 577 + }, + { + "entropy": 2.0818732380867004, + "epoch": 2.086877828054299, + "grad_norm": 0.5915331840515137, + "learning_rate": 0.0004000118303701521, + "loss": 0.1103, + "mean_token_accuracy": 0.9715124219655991, + "num_tokens": 5098331.0, + "step": 578 + }, + { + "entropy": 1.9556698501110077, + "epoch": 2.090497737556561, + "grad_norm": 0.5216535329818726, + "learning_rate": 0.0003996280908689345, + "loss": 0.1481, + "mean_token_accuracy": 0.9601311087608337, + "num_tokens": 5107557.0, + "step": 579 + }, + { + "entropy": 2.015773117542267, + "epoch": 2.0941176470588236, + "grad_norm": 0.7138916254043579, + "learning_rate": 0.00039924390099301584, + "loss": 0.1173, + "mean_token_accuracy": 0.9670253992080688, + "num_tokens": 5116677.0, + "step": 580 + }, + { + "entropy": 2.0676984786987305, + "epoch": 2.097737556561086, + "grad_norm": 0.7776201963424683, + "learning_rate": 0.0003988592622015984, + "loss": 0.0668, + "mean_token_accuracy": 0.9766870141029358, + "num_tokens": 5125262.0, + "step": 581 + }, + { + "entropy": 2.0256679952144623, + "epoch": 2.1013574660633485, + "grad_norm": 0.5481430888175964, + "learning_rate": 0.00039847417595558903, + "loss": 0.0898, + "mean_token_accuracy": 0.9747780114412308, + "num_tokens": 5133848.0, + "step": 582 + }, + { + "entropy": 2.049301326274872, + "epoch": 2.104977375565611, + "grad_norm": 0.6634963154792786, + "learning_rate": 0.00039808864371759464, + "loss": 0.1012, + "mean_token_accuracy": 0.9695883542299271, + "num_tokens": 5142266.0, + "step": 583 + }, + { + "entropy": 1.8873322904109955, + "epoch": 2.1085972850678734, + "grad_norm": 0.6262965798377991, + "learning_rate": 0.0003977026669519156, + "loss": 0.1064, + "mean_token_accuracy": 0.9686857610940933, + "num_tokens": 5151297.0, + "step": 584 + }, + { + "entropy": 2.0208800733089447, + "epoch": 2.112217194570136, + "grad_norm": 0.6475429534912109, + "learning_rate": 0.0003973162471245411, + "loss": 0.126, + "mean_token_accuracy": 0.9671273976564407, + "num_tokens": 5159913.0, + "step": 585 + }, + { + "entropy": 2.0354510843753815, + "epoch": 2.1158371040723982, + "grad_norm": 0.6373077034950256, + "learning_rate": 0.0003969293857031426, + "loss": 0.1403, + "mean_token_accuracy": 0.9615094214677811, + "num_tokens": 5168392.0, + "step": 586 + }, + { + "entropy": 2.0489701330661774, + "epoch": 2.1194570135746607, + "grad_norm": 0.7459731698036194, + "learning_rate": 0.0003965420841570693, + "loss": 0.0847, + "mean_token_accuracy": 0.9742033332586288, + "num_tokens": 5176858.0, + "step": 587 + }, + { + "entropy": 2.0531455874443054, + "epoch": 2.123076923076923, + "grad_norm": 0.8357418179512024, + "learning_rate": 0.00039615434395734174, + "loss": 0.2558, + "mean_token_accuracy": 0.9348864704370499, + "num_tokens": 5185101.0, + "step": 588 + }, + { + "entropy": 1.9761857986450195, + "epoch": 2.1266968325791855, + "grad_norm": 0.4816463887691498, + "learning_rate": 0.00039576616657664666, + "loss": 0.0934, + "mean_token_accuracy": 0.9781179577112198, + "num_tokens": 5193987.0, + "step": 589 + }, + { + "entropy": 2.0150316655635834, + "epoch": 2.130316742081448, + "grad_norm": 0.7039950489997864, + "learning_rate": 0.0003953775534893311, + "loss": 0.1558, + "mean_token_accuracy": 0.9602096229791641, + "num_tokens": 5202598.0, + "step": 590 + }, + { + "entropy": 2.0542426705360413, + "epoch": 2.1339366515837104, + "grad_norm": 0.6318346858024597, + "learning_rate": 0.00039498850617139737, + "loss": 0.1277, + "mean_token_accuracy": 0.9658758789300919, + "num_tokens": 5211157.0, + "step": 591 + }, + { + "entropy": 2.0793416798114777, + "epoch": 2.137556561085973, + "grad_norm": 0.6513328552246094, + "learning_rate": 0.0003945990261004964, + "loss": 0.3452, + "mean_token_accuracy": 0.9376382231712341, + "num_tokens": 5220057.0, + "step": 592 + }, + { + "entropy": 1.834738850593567, + "epoch": 2.1411764705882352, + "grad_norm": 0.709550678730011, + "learning_rate": 0.0003942091147559234, + "loss": 0.1632, + "mean_token_accuracy": 0.9588025957345963, + "num_tokens": 5229649.0, + "step": 593 + }, + { + "entropy": 2.115740954875946, + "epoch": 2.1447963800904977, + "grad_norm": 0.6495632529258728, + "learning_rate": 0.00039381877361861127, + "loss": 0.0799, + "mean_token_accuracy": 0.9793208837509155, + "num_tokens": 5238060.0, + "step": 594 + }, + { + "entropy": 1.9325994551181793, + "epoch": 2.14841628959276, + "grad_norm": 0.3864371180534363, + "learning_rate": 0.0003934280041711253, + "loss": 0.0392, + "mean_token_accuracy": 0.9867032468318939, + "num_tokens": 5246715.0, + "step": 595 + }, + { + "entropy": 1.9573578834533691, + "epoch": 2.1520361990950225, + "grad_norm": 0.8978553414344788, + "learning_rate": 0.0003930368078976578, + "loss": 0.1043, + "mean_token_accuracy": 0.9700421690940857, + "num_tokens": 5255677.0, + "step": 596 + }, + { + "entropy": 2.017194092273712, + "epoch": 2.155656108597285, + "grad_norm": 0.8082290887832642, + "learning_rate": 0.0003926451862840221, + "loss": 0.193, + "mean_token_accuracy": 0.9494165182113647, + "num_tokens": 5264229.0, + "step": 597 + }, + { + "entropy": 1.8982190787792206, + "epoch": 2.1592760180995474, + "grad_norm": 0.7600063681602478, + "learning_rate": 0.00039225314081764673, + "loss": 0.2152, + "mean_token_accuracy": 0.9523166120052338, + "num_tokens": 5273397.0, + "step": 598 + }, + { + "entropy": 1.9896901845932007, + "epoch": 2.16289592760181, + "grad_norm": 0.45877528190612793, + "learning_rate": 0.0003918606729875706, + "loss": 0.0892, + "mean_token_accuracy": 0.9720247238874435, + "num_tokens": 5282376.0, + "step": 599 + }, + { + "entropy": 1.8235589861869812, + "epoch": 2.1665158371040723, + "grad_norm": 0.49329352378845215, + "learning_rate": 0.0003914677842844365, + "loss": 0.0803, + "mean_token_accuracy": 0.9721037000417709, + "num_tokens": 5291815.0, + "step": 600 + }, + { + "entropy": 1.9400377571582794, + "epoch": 2.1701357466063347, + "grad_norm": 0.5306346416473389, + "learning_rate": 0.0003910744762004857, + "loss": 0.0602, + "mean_token_accuracy": 0.9762802571058273, + "num_tokens": 5300394.0, + "step": 601 + }, + { + "entropy": 1.7808023691177368, + "epoch": 2.173755656108597, + "grad_norm": 0.5050559043884277, + "learning_rate": 0.00039068075022955255, + "loss": 0.0862, + "mean_token_accuracy": 0.9724314510822296, + "num_tokens": 5309685.0, + "step": 602 + }, + { + "entropy": 1.9939678311347961, + "epoch": 2.1773755656108595, + "grad_norm": 0.6879346966743469, + "learning_rate": 0.0003902866078670584, + "loss": 0.0936, + "mean_token_accuracy": 0.9765703976154327, + "num_tokens": 5318020.0, + "step": 603 + }, + { + "entropy": 1.9384137690067291, + "epoch": 2.180995475113122, + "grad_norm": 0.6881359219551086, + "learning_rate": 0.0003898920506100061, + "loss": 0.1303, + "mean_token_accuracy": 0.9615567773580551, + "num_tokens": 5326895.0, + "step": 604 + }, + { + "entropy": 1.9919665455818176, + "epoch": 2.184615384615385, + "grad_norm": 0.6181508302688599, + "learning_rate": 0.00038949707995697446, + "loss": 0.0745, + "mean_token_accuracy": 0.9808734804391861, + "num_tokens": 5335355.0, + "step": 605 + }, + { + "entropy": 1.9376583397388458, + "epoch": 2.1882352941176473, + "grad_norm": 0.46525871753692627, + "learning_rate": 0.0003891016974081125, + "loss": 0.0826, + "mean_token_accuracy": 0.9753947854042053, + "num_tokens": 5343879.0, + "step": 606 + }, + { + "entropy": 1.8252979516983032, + "epoch": 2.1918552036199097, + "grad_norm": 0.5332593321800232, + "learning_rate": 0.00038870590446513325, + "loss": 0.1218, + "mean_token_accuracy": 0.9644111543893814, + "num_tokens": 5352980.0, + "step": 607 + }, + { + "entropy": 1.8981524407863617, + "epoch": 2.195475113122172, + "grad_norm": 0.5849556922912598, + "learning_rate": 0.0003883097026313089, + "loss": 0.0854, + "mean_token_accuracy": 0.9766328930854797, + "num_tokens": 5361576.0, + "step": 608 + }, + { + "entropy": 1.9466857016086578, + "epoch": 2.1990950226244346, + "grad_norm": 1.0213185548782349, + "learning_rate": 0.00038791309341146453, + "loss": 0.1282, + "mean_token_accuracy": 0.975858062505722, + "num_tokens": 5369947.0, + "step": 609 + }, + { + "entropy": 1.9219308197498322, + "epoch": 2.202714932126697, + "grad_norm": 0.7259594798088074, + "learning_rate": 0.00038751607831197243, + "loss": 0.0986, + "mean_token_accuracy": 0.9709735363721848, + "num_tokens": 5378429.0, + "step": 610 + }, + { + "entropy": 1.934881567955017, + "epoch": 2.2063348416289594, + "grad_norm": 0.6190217137336731, + "learning_rate": 0.0003871186588407467, + "loss": 0.1259, + "mean_token_accuracy": 0.9606761038303375, + "num_tokens": 5386986.0, + "step": 611 + }, + { + "entropy": 1.9234256446361542, + "epoch": 2.209954751131222, + "grad_norm": 1.1731759309768677, + "learning_rate": 0.00038672083650723697, + "loss": 0.3705, + "mean_token_accuracy": 0.9448409974575043, + "num_tokens": 5395623.0, + "step": 612 + }, + { + "entropy": 1.9198957085609436, + "epoch": 2.2135746606334843, + "grad_norm": 0.38831791281700134, + "learning_rate": 0.00038632261282242316, + "loss": 0.0405, + "mean_token_accuracy": 0.9884084165096283, + "num_tokens": 5403964.0, + "step": 613 + }, + { + "entropy": 1.9401849210262299, + "epoch": 2.2171945701357467, + "grad_norm": 0.6391944885253906, + "learning_rate": 0.0003859239892988097, + "loss": 0.0803, + "mean_token_accuracy": 0.9763080179691315, + "num_tokens": 5412601.0, + "step": 614 + }, + { + "entropy": 1.906328171491623, + "epoch": 2.220814479638009, + "grad_norm": 0.5495765805244446, + "learning_rate": 0.00038552496745041935, + "loss": 0.0919, + "mean_token_accuracy": 0.9796502739191055, + "num_tokens": 5421112.0, + "step": 615 + }, + { + "entropy": 1.9130763709545135, + "epoch": 2.2244343891402716, + "grad_norm": 0.8233397006988525, + "learning_rate": 0.0003851255487927883, + "loss": 0.1246, + "mean_token_accuracy": 0.9621723592281342, + "num_tokens": 5429851.0, + "step": 616 + }, + { + "entropy": 1.8408336341381073, + "epoch": 2.228054298642534, + "grad_norm": 0.8857082724571228, + "learning_rate": 0.00038472573484295904, + "loss": 0.1061, + "mean_token_accuracy": 0.9664444029331207, + "num_tokens": 5438983.0, + "step": 617 + }, + { + "entropy": 1.8644142150878906, + "epoch": 2.2316742081447964, + "grad_norm": 0.6762974262237549, + "learning_rate": 0.0003843255271194762, + "loss": 0.1532, + "mean_token_accuracy": 0.952915757894516, + "num_tokens": 5447922.0, + "step": 618 + }, + { + "entropy": 1.7125722169876099, + "epoch": 2.235294117647059, + "grad_norm": 0.44111478328704834, + "learning_rate": 0.00038392492714237975, + "loss": 0.0819, + "mean_token_accuracy": 0.9738304615020752, + "num_tokens": 5457128.0, + "step": 619 + }, + { + "entropy": 1.7900195717811584, + "epoch": 2.2389140271493213, + "grad_norm": 0.5224407911300659, + "learning_rate": 0.0003835239364331993, + "loss": 0.1023, + "mean_token_accuracy": 0.975239485502243, + "num_tokens": 5465760.0, + "step": 620 + }, + { + "entropy": 1.715638667345047, + "epoch": 2.2425339366515837, + "grad_norm": 0.6327251195907593, + "learning_rate": 0.00038312255651494866, + "loss": 0.154, + "mean_token_accuracy": 0.9579339027404785, + "num_tokens": 5475190.0, + "step": 621 + }, + { + "entropy": 1.8499042093753815, + "epoch": 2.246153846153846, + "grad_norm": 0.6490166187286377, + "learning_rate": 0.00038272078891212017, + "loss": 0.1248, + "mean_token_accuracy": 0.9679877310991287, + "num_tokens": 5484011.0, + "step": 622 + }, + { + "entropy": 1.7533331513404846, + "epoch": 2.2497737556561086, + "grad_norm": 0.6320033073425293, + "learning_rate": 0.000382318635150678, + "loss": 0.1588, + "mean_token_accuracy": 0.9576389044523239, + "num_tokens": 5493123.0, + "step": 623 + }, + { + "entropy": 1.8554400503635406, + "epoch": 2.253393665158371, + "grad_norm": 0.7169481515884399, + "learning_rate": 0.0003819160967580536, + "loss": 0.1316, + "mean_token_accuracy": 0.966967299580574, + "num_tokens": 5501923.0, + "step": 624 + }, + { + "entropy": 1.9283805191516876, + "epoch": 2.2570135746606335, + "grad_norm": 0.599856436252594, + "learning_rate": 0.00038151317526313917, + "loss": 0.1326, + "mean_token_accuracy": 0.961080014705658, + "num_tokens": 5510356.0, + "step": 625 + }, + { + "entropy": 1.7921342253684998, + "epoch": 2.260633484162896, + "grad_norm": 0.7019768357276917, + "learning_rate": 0.0003811098721962818, + "loss": 0.0976, + "mean_token_accuracy": 0.970125287771225, + "num_tokens": 5519016.0, + "step": 626 + }, + { + "entropy": 1.7646876573562622, + "epoch": 2.2642533936651583, + "grad_norm": 0.7311795949935913, + "learning_rate": 0.00038070618908927784, + "loss": 0.0908, + "mean_token_accuracy": 0.9719386845827103, + "num_tokens": 5528139.0, + "step": 627 + }, + { + "entropy": 1.8233769237995148, + "epoch": 2.2678733031674208, + "grad_norm": 0.6742154955863953, + "learning_rate": 0.0003803021274753674, + "loss": 0.1348, + "mean_token_accuracy": 0.9619691967964172, + "num_tokens": 5537036.0, + "step": 628 + }, + { + "entropy": 1.7711736857891083, + "epoch": 2.271493212669683, + "grad_norm": 0.6000869274139404, + "learning_rate": 0.00037989768888922775, + "loss": 0.1086, + "mean_token_accuracy": 0.9672373533248901, + "num_tokens": 5545932.0, + "step": 629 + }, + { + "entropy": 1.8396382629871368, + "epoch": 2.2751131221719456, + "grad_norm": 0.541504979133606, + "learning_rate": 0.0003794928748669683, + "loss": 0.0775, + "mean_token_accuracy": 0.977355495095253, + "num_tokens": 5554403.0, + "step": 630 + }, + { + "entropy": 1.890054315328598, + "epoch": 2.278733031674208, + "grad_norm": 0.5629594326019287, + "learning_rate": 0.00037908768694612434, + "loss": 0.0711, + "mean_token_accuracy": 0.9779117107391357, + "num_tokens": 5563156.0, + "step": 631 + }, + { + "entropy": 1.9505741894245148, + "epoch": 2.2823529411764705, + "grad_norm": 0.6717761754989624, + "learning_rate": 0.0003786821266656512, + "loss": 0.1077, + "mean_token_accuracy": 0.9674138873815536, + "num_tokens": 5571618.0, + "step": 632 + }, + { + "entropy": 1.8377742171287537, + "epoch": 2.285972850678733, + "grad_norm": 0.6176472902297974, + "learning_rate": 0.0003782761955659185, + "loss": 0.1106, + "mean_token_accuracy": 0.9669957906007767, + "num_tokens": 5580668.0, + "step": 633 + }, + { + "entropy": 1.8336479365825653, + "epoch": 2.2895927601809953, + "grad_norm": 0.5120813846588135, + "learning_rate": 0.0003778698951887042, + "loss": 0.0732, + "mean_token_accuracy": 0.9774532318115234, + "num_tokens": 5589491.0, + "step": 634 + }, + { + "entropy": 1.9576656222343445, + "epoch": 2.2932126696832578, + "grad_norm": 0.9347079396247864, + "learning_rate": 0.00037746322707718895, + "loss": 0.2275, + "mean_token_accuracy": 0.9512088149785995, + "num_tokens": 5598327.0, + "step": 635 + }, + { + "entropy": 1.9309991896152496, + "epoch": 2.29683257918552, + "grad_norm": 0.506108283996582, + "learning_rate": 0.0003770561927759502, + "loss": 0.1046, + "mean_token_accuracy": 0.9633967131376266, + "num_tokens": 5606948.0, + "step": 636 + }, + { + "entropy": 1.963425725698471, + "epoch": 2.3004524886877826, + "grad_norm": 0.5499919056892395, + "learning_rate": 0.0003766487938309561, + "loss": 0.0804, + "mean_token_accuracy": 0.9783825874328613, + "num_tokens": 5615342.0, + "step": 637 + }, + { + "entropy": 1.8853708505630493, + "epoch": 2.304072398190045, + "grad_norm": 0.5846657156944275, + "learning_rate": 0.00037624103178955946, + "loss": 0.0904, + "mean_token_accuracy": 0.9774703830480576, + "num_tokens": 5624449.0, + "step": 638 + }, + { + "entropy": 1.928403079509735, + "epoch": 2.3076923076923075, + "grad_norm": 0.5203971266746521, + "learning_rate": 0.0003758329082004928, + "loss": 0.0917, + "mean_token_accuracy": 0.9723261743783951, + "num_tokens": 5633273.0, + "step": 639 + }, + { + "entropy": 1.8914157152175903, + "epoch": 2.31131221719457, + "grad_norm": 0.5215239524841309, + "learning_rate": 0.00037542442461386145, + "loss": 0.1072, + "mean_token_accuracy": 0.9704900681972504, + "num_tokens": 5642357.0, + "step": 640 + }, + { + "entropy": 1.9754666090011597, + "epoch": 2.3149321266968323, + "grad_norm": 0.6710624694824219, + "learning_rate": 0.0003750155825811379, + "loss": 0.1344, + "mean_token_accuracy": 0.9615458548069, + "num_tokens": 5651409.0, + "step": 641 + }, + { + "entropy": 1.97001314163208, + "epoch": 2.318552036199095, + "grad_norm": 0.6511638164520264, + "learning_rate": 0.00037460638365515673, + "loss": 0.0502, + "mean_token_accuracy": 0.9829420000314713, + "num_tokens": 5660362.0, + "step": 642 + }, + { + "entropy": 1.9473612904548645, + "epoch": 2.3221719457013577, + "grad_norm": 0.5315663814544678, + "learning_rate": 0.00037419682939010725, + "loss": 0.1004, + "mean_token_accuracy": 0.9741797298192978, + "num_tokens": 5669386.0, + "step": 643 + }, + { + "entropy": 1.9136508405208588, + "epoch": 2.32579185520362, + "grad_norm": 0.6636398434638977, + "learning_rate": 0.00037378692134152887, + "loss": 0.0928, + "mean_token_accuracy": 0.9753085225820541, + "num_tokens": 5678226.0, + "step": 644 + }, + { + "entropy": 2.0870893597602844, + "epoch": 2.3294117647058825, + "grad_norm": 0.45003074407577515, + "learning_rate": 0.00037337666106630464, + "loss": 0.0937, + "mean_token_accuracy": 0.9742898046970367, + "num_tokens": 5687017.0, + "step": 645 + }, + { + "entropy": 2.084017276763916, + "epoch": 2.333031674208145, + "grad_norm": 0.6305840611457825, + "learning_rate": 0.0003729660501226553, + "loss": 0.1085, + "mean_token_accuracy": 0.9696957617998123, + "num_tokens": 5695585.0, + "step": 646 + }, + { + "entropy": 2.0916273295879364, + "epoch": 2.3366515837104074, + "grad_norm": 0.6674802303314209, + "learning_rate": 0.00037255509007013353, + "loss": 0.1214, + "mean_token_accuracy": 0.9657080322504044, + "num_tokens": 5704167.0, + "step": 647 + }, + { + "entropy": 2.0445155799388885, + "epoch": 2.34027149321267, + "grad_norm": 0.9245135188102722, + "learning_rate": 0.0003721437824696181, + "loss": 0.124, + "mean_token_accuracy": 0.9668982475996017, + "num_tokens": 5712896.0, + "step": 648 + }, + { + "entropy": 2.040050685405731, + "epoch": 2.3438914027149322, + "grad_norm": 0.558266818523407, + "learning_rate": 0.00037173212888330756, + "loss": 0.103, + "mean_token_accuracy": 0.9663692861795425, + "num_tokens": 5721568.0, + "step": 649 + }, + { + "entropy": 2.078313887119293, + "epoch": 2.3475113122171947, + "grad_norm": 0.6157237887382507, + "learning_rate": 0.0003713201308747148, + "loss": 0.1247, + "mean_token_accuracy": 0.9645204842090607, + "num_tokens": 5730097.0, + "step": 650 + }, + { + "entropy": 1.9473297894001007, + "epoch": 2.351131221719457, + "grad_norm": 0.6460309028625488, + "learning_rate": 0.0003709077900086607, + "loss": 0.193, + "mean_token_accuracy": 0.9537071883678436, + "num_tokens": 5738953.0, + "step": 651 + }, + { + "entropy": 1.9319245219230652, + "epoch": 2.3547511312217195, + "grad_norm": 0.826302170753479, + "learning_rate": 0.0003704951078512684, + "loss": 0.2072, + "mean_token_accuracy": 0.9553762674331665, + "num_tokens": 5748421.0, + "step": 652 + }, + { + "entropy": 2.000667005777359, + "epoch": 2.358371040723982, + "grad_norm": 0.508975625038147, + "learning_rate": 0.00037008208596995743, + "loss": 0.1124, + "mean_token_accuracy": 0.9674097448587418, + "num_tokens": 5757333.0, + "step": 653 + }, + { + "entropy": 1.9692010879516602, + "epoch": 2.3619909502262444, + "grad_norm": 0.597391664981842, + "learning_rate": 0.00036966872593343747, + "loss": 0.0958, + "mean_token_accuracy": 0.9727880656719208, + "num_tokens": 5766427.0, + "step": 654 + }, + { + "entropy": 1.9356706142425537, + "epoch": 2.365610859728507, + "grad_norm": 0.6264978051185608, + "learning_rate": 0.0003692550293117025, + "loss": 0.0925, + "mean_token_accuracy": 0.9736592024564743, + "num_tokens": 5775578.0, + "step": 655 + }, + { + "entropy": 2.086688846349716, + "epoch": 2.3692307692307693, + "grad_norm": 0.926537811756134, + "learning_rate": 0.00036884099767602523, + "loss": 0.1772, + "mean_token_accuracy": 0.9588586837053299, + "num_tokens": 5783754.0, + "step": 656 + }, + { + "entropy": 1.8272685706615448, + "epoch": 2.3728506787330317, + "grad_norm": 0.5276276469230652, + "learning_rate": 0.0003684266325989504, + "loss": 0.106, + "mean_token_accuracy": 0.9692760407924652, + "num_tokens": 5793159.0, + "step": 657 + }, + { + "entropy": 1.8490014672279358, + "epoch": 2.376470588235294, + "grad_norm": 0.6970511078834534, + "learning_rate": 0.0003680119356542895, + "loss": 0.0849, + "mean_token_accuracy": 0.9812656790018082, + "num_tokens": 5802503.0, + "step": 658 + }, + { + "entropy": 1.8577990531921387, + "epoch": 2.3800904977375565, + "grad_norm": 0.49535682797431946, + "learning_rate": 0.00036759690841711435, + "loss": 0.0965, + "mean_token_accuracy": 0.9723764955997467, + "num_tokens": 5811839.0, + "step": 659 + }, + { + "entropy": 1.785957396030426, + "epoch": 2.383710407239819, + "grad_norm": 0.7373266220092773, + "learning_rate": 0.00036718155246375124, + "loss": 0.103, + "mean_token_accuracy": 0.9659082442522049, + "num_tokens": 5821076.0, + "step": 660 + }, + { + "entropy": 1.8944315016269684, + "epoch": 2.3873303167420814, + "grad_norm": 0.4784161448478699, + "learning_rate": 0.000366765869371775, + "loss": 0.0899, + "mean_token_accuracy": 0.9731316566467285, + "num_tokens": 5830098.0, + "step": 661 + }, + { + "entropy": 1.8901143372058868, + "epoch": 2.390950226244344, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.00036634986072000305, + "loss": 0.078, + "mean_token_accuracy": 0.9769923985004425, + "num_tokens": 5839149.0, + "step": 662 + }, + { + "entropy": 1.8183043003082275, + "epoch": 2.3945701357466063, + "grad_norm": 0.48431649804115295, + "learning_rate": 0.0003659335280884893, + "loss": 0.0669, + "mean_token_accuracy": 0.978607714176178, + "num_tokens": 5848064.0, + "step": 663 + }, + { + "entropy": 1.7216700911521912, + "epoch": 2.3981900452488687, + "grad_norm": 0.5597919821739197, + "learning_rate": 0.00036551687305851803, + "loss": 0.1026, + "mean_token_accuracy": 0.9733614027500153, + "num_tokens": 5857075.0, + "step": 664 + }, + { + "entropy": 1.7788107991218567, + "epoch": 2.401809954751131, + "grad_norm": 0.6780642867088318, + "learning_rate": 0.00036509989721259824, + "loss": 0.0895, + "mean_token_accuracy": 0.9711848199367523, + "num_tokens": 5866029.0, + "step": 665 + }, + { + "entropy": 1.8354471325874329, + "epoch": 2.4054298642533936, + "grad_norm": 0.6284046769142151, + "learning_rate": 0.0003646826021344573, + "loss": 0.1153, + "mean_token_accuracy": 0.9645407199859619, + "num_tokens": 5874523.0, + "step": 666 + }, + { + "entropy": 1.829980492591858, + "epoch": 2.409049773755656, + "grad_norm": 0.6398605704307556, + "learning_rate": 0.00036426498940903506, + "loss": 0.0605, + "mean_token_accuracy": 0.9823256582021713, + "num_tokens": 5883067.0, + "step": 667 + }, + { + "entropy": 1.839373379945755, + "epoch": 2.4126696832579184, + "grad_norm": 0.6254173517227173, + "learning_rate": 0.000363847060622478, + "loss": 0.0708, + "mean_token_accuracy": 0.978134423494339, + "num_tokens": 5891921.0, + "step": 668 + }, + { + "entropy": 1.7790280282497406, + "epoch": 2.416289592760181, + "grad_norm": 0.5987306833267212, + "learning_rate": 0.0003634288173621326, + "loss": 0.0888, + "mean_token_accuracy": 0.9814571887254715, + "num_tokens": 5900603.0, + "step": 669 + }, + { + "entropy": 1.6918425559997559, + "epoch": 2.4199095022624433, + "grad_norm": 0.784694492816925, + "learning_rate": 0.00036301026121654057, + "loss": 0.1353, + "mean_token_accuracy": 0.9646909832954407, + "num_tokens": 5910028.0, + "step": 670 + }, + { + "entropy": 1.726965218782425, + "epoch": 2.4235294117647057, + "grad_norm": 0.7017857432365417, + "learning_rate": 0.00036259139377543104, + "loss": 0.1531, + "mean_token_accuracy": 0.9617924690246582, + "num_tokens": 5919145.0, + "step": 671 + }, + { + "entropy": 1.7354467511177063, + "epoch": 2.427149321266968, + "grad_norm": 0.49217918515205383, + "learning_rate": 0.00036217221662971613, + "loss": 0.1217, + "mean_token_accuracy": 0.96451136469841, + "num_tokens": 5928203.0, + "step": 672 + }, + { + "entropy": 1.827672392129898, + "epoch": 2.430769230769231, + "grad_norm": 0.5875037312507629, + "learning_rate": 0.0003617527313714841, + "loss": 0.1151, + "mean_token_accuracy": 0.9714375436306, + "num_tokens": 5936876.0, + "step": 673 + }, + { + "entropy": 1.787518948316574, + "epoch": 2.4343891402714934, + "grad_norm": 0.5444310307502747, + "learning_rate": 0.0003613329395939933, + "loss": 0.1096, + "mean_token_accuracy": 0.9701481461524963, + "num_tokens": 5946025.0, + "step": 674 + }, + { + "entropy": 1.832441657781601, + "epoch": 2.438009049773756, + "grad_norm": 0.6885861754417419, + "learning_rate": 0.00036091284289166637, + "loss": 0.1409, + "mean_token_accuracy": 0.9587968736886978, + "num_tokens": 5954406.0, + "step": 675 + }, + { + "entropy": 1.7488494515419006, + "epoch": 2.4416289592760183, + "grad_norm": 0.4765988290309906, + "learning_rate": 0.0003604924428600843, + "loss": 0.1183, + "mean_token_accuracy": 0.9581810384988785, + "num_tokens": 5963472.0, + "step": 676 + }, + { + "entropy": 1.885668009519577, + "epoch": 2.4452488687782807, + "grad_norm": 0.7310354113578796, + "learning_rate": 0.00036007174109597983, + "loss": 0.1248, + "mean_token_accuracy": 0.9588694721460342, + "num_tokens": 5971771.0, + "step": 677 + }, + { + "entropy": 1.8329627513885498, + "epoch": 2.448868778280543, + "grad_norm": 0.37075191736221313, + "learning_rate": 0.00035965073919723206, + "loss": 0.0694, + "mean_token_accuracy": 0.9812011271715164, + "num_tokens": 5980536.0, + "step": 678 + }, + { + "entropy": 1.8218618333339691, + "epoch": 2.4524886877828056, + "grad_norm": 0.5196499228477478, + "learning_rate": 0.0003592294387628597, + "loss": 0.0833, + "mean_token_accuracy": 0.9765996187925339, + "num_tokens": 5989462.0, + "step": 679 + }, + { + "entropy": 1.7702144086360931, + "epoch": 2.456108597285068, + "grad_norm": 0.68550044298172, + "learning_rate": 0.0003588078413930155, + "loss": 0.1395, + "mean_token_accuracy": 0.9701545089483261, + "num_tokens": 5998702.0, + "step": 680 + }, + { + "entropy": 1.729397028684616, + "epoch": 2.4597285067873305, + "grad_norm": 0.6107930541038513, + "learning_rate": 0.00035838594868898004, + "loss": 0.1009, + "mean_token_accuracy": 0.9712544083595276, + "num_tokens": 6007594.0, + "step": 681 + }, + { + "entropy": 1.6558150053024292, + "epoch": 2.463348416289593, + "grad_norm": 0.45058509707450867, + "learning_rate": 0.0003579637622531555, + "loss": 0.0747, + "mean_token_accuracy": 0.9791784882545471, + "num_tokens": 6016874.0, + "step": 682 + }, + { + "entropy": 1.7209869921207428, + "epoch": 2.4669683257918553, + "grad_norm": 0.6103800535202026, + "learning_rate": 0.0003575412836890599, + "loss": 0.1096, + "mean_token_accuracy": 0.9665796160697937, + "num_tokens": 6026056.0, + "step": 683 + }, + { + "entropy": 1.790249615907669, + "epoch": 2.4705882352941178, + "grad_norm": 0.67525315284729, + "learning_rate": 0.0003571185146013205, + "loss": 0.0811, + "mean_token_accuracy": 0.9776998162269592, + "num_tokens": 6034624.0, + "step": 684 + }, + { + "entropy": 1.735906183719635, + "epoch": 2.47420814479638, + "grad_norm": 0.884986162185669, + "learning_rate": 0.00035669545659566836, + "loss": 0.2324, + "mean_token_accuracy": 0.9448857754468918, + "num_tokens": 6043557.0, + "step": 685 + }, + { + "entropy": 1.673194944858551, + "epoch": 2.4778280542986426, + "grad_norm": 0.7441328763961792, + "learning_rate": 0.0003562721112789316, + "loss": 0.1661, + "mean_token_accuracy": 0.9566781520843506, + "num_tokens": 6052623.0, + "step": 686 + }, + { + "entropy": 1.736072987318039, + "epoch": 2.481447963800905, + "grad_norm": 0.5674424767494202, + "learning_rate": 0.00035584848025902973, + "loss": 0.0751, + "mean_token_accuracy": 0.9750215858221054, + "num_tokens": 6061347.0, + "step": 687 + }, + { + "entropy": 1.625234305858612, + "epoch": 2.4850678733031675, + "grad_norm": 0.6596720218658447, + "learning_rate": 0.00035542456514496725, + "loss": 0.0796, + "mean_token_accuracy": 0.9773041009902954, + "num_tokens": 6070396.0, + "step": 688 + }, + { + "entropy": 1.6548752784729004, + "epoch": 2.48868778280543, + "grad_norm": 0.5798892378807068, + "learning_rate": 0.00035500036754682794, + "loss": 0.1412, + "mean_token_accuracy": 0.9653023481369019, + "num_tokens": 6079757.0, + "step": 689 + }, + { + "entropy": 1.6213977932929993, + "epoch": 2.4923076923076923, + "grad_norm": 0.44931474328041077, + "learning_rate": 0.00035457588907576823, + "loss": 0.0724, + "mean_token_accuracy": 0.9800422787666321, + "num_tokens": 6088646.0, + "step": 690 + }, + { + "entropy": 1.6762541830539703, + "epoch": 2.4959276018099548, + "grad_norm": 0.6818104386329651, + "learning_rate": 0.0003541511313440114, + "loss": 0.1217, + "mean_token_accuracy": 0.9675028026103973, + "num_tokens": 6097441.0, + "step": 691 + }, + { + "entropy": 1.7241974771022797, + "epoch": 2.499547511312217, + "grad_norm": 0.4126259982585907, + "learning_rate": 0.00035372609596484166, + "loss": 0.0615, + "mean_token_accuracy": 0.9799284338951111, + "num_tokens": 6105578.0, + "step": 692 + }, + { + "entropy": 1.6379709541797638, + "epoch": 2.5031674208144796, + "grad_norm": 0.47291842103004456, + "learning_rate": 0.00035330078455259734, + "loss": 0.0858, + "mean_token_accuracy": 0.9744312763214111, + "num_tokens": 6114404.0, + "step": 693 + }, + { + "entropy": 1.6317658722400665, + "epoch": 2.506787330316742, + "grad_norm": 0.5747683048248291, + "learning_rate": 0.00035287519872266544, + "loss": 0.1344, + "mean_token_accuracy": 0.9632531553506851, + "num_tokens": 6123319.0, + "step": 694 + }, + { + "entropy": 1.6969698369503021, + "epoch": 2.5104072398190045, + "grad_norm": 0.5810018181800842, + "learning_rate": 0.00035244934009147523, + "loss": 0.0927, + "mean_token_accuracy": 0.9729650169610977, + "num_tokens": 6131814.0, + "step": 695 + }, + { + "entropy": 1.631262481212616, + "epoch": 2.514027149321267, + "grad_norm": 0.44387346506118774, + "learning_rate": 0.00035202321027649205, + "loss": 0.0657, + "mean_token_accuracy": 0.9802225232124329, + "num_tokens": 6140967.0, + "step": 696 + }, + { + "entropy": 1.610716551542282, + "epoch": 2.5176470588235293, + "grad_norm": 0.6546471118927002, + "learning_rate": 0.0003515968108962112, + "loss": 0.1114, + "mean_token_accuracy": 0.9671156108379364, + "num_tokens": 6149938.0, + "step": 697 + }, + { + "entropy": 1.598843276500702, + "epoch": 2.521266968325792, + "grad_norm": 0.541953444480896, + "learning_rate": 0.0003511701435701519, + "loss": 0.0504, + "mean_token_accuracy": 0.98616062104702, + "num_tokens": 6158686.0, + "step": 698 + }, + { + "entropy": 1.7793676853179932, + "epoch": 2.524886877828054, + "grad_norm": 0.6303162574768066, + "learning_rate": 0.00035074320991885106, + "loss": 0.0797, + "mean_token_accuracy": 0.9783169627189636, + "num_tokens": 6166835.0, + "step": 699 + }, + { + "entropy": 1.598317414522171, + "epoch": 2.5285067873303166, + "grad_norm": 0.4783090054988861, + "learning_rate": 0.000350316011563857, + "loss": 0.0693, + "mean_token_accuracy": 0.9740357846021652, + "num_tokens": 6175978.0, + "step": 700 + }, + { + "entropy": 1.6361595392227173, + "epoch": 2.532126696832579, + "grad_norm": 0.46353498101234436, + "learning_rate": 0.00034988855012772367, + "loss": 0.0543, + "mean_token_accuracy": 0.9821173399686813, + "num_tokens": 6185071.0, + "step": 701 + }, + { + "entropy": 1.6333596408367157, + "epoch": 2.5357466063348415, + "grad_norm": 0.4968421459197998, + "learning_rate": 0.0003494608272340039, + "loss": 0.1588, + "mean_token_accuracy": 0.9692430347204208, + "num_tokens": 6194279.0, + "step": 702 + }, + { + "entropy": 1.6701206266880035, + "epoch": 2.539366515837104, + "grad_norm": 0.7050784826278687, + "learning_rate": 0.00034903284450724385, + "loss": 0.1298, + "mean_token_accuracy": 0.9623726159334183, + "num_tokens": 6203017.0, + "step": 703 + }, + { + "entropy": 1.6594900786876678, + "epoch": 2.5429864253393664, + "grad_norm": 0.7955659031867981, + "learning_rate": 0.0003486046035729765, + "loss": 0.1695, + "mean_token_accuracy": 0.9616524875164032, + "num_tokens": 6212016.0, + "step": 704 + }, + { + "entropy": 1.7208792865276337, + "epoch": 2.546606334841629, + "grad_norm": 0.7105070352554321, + "learning_rate": 0.00034817610605771546, + "loss": 0.1655, + "mean_token_accuracy": 0.9637335985898972, + "num_tokens": 6220619.0, + "step": 705 + }, + { + "entropy": 1.668517529964447, + "epoch": 2.5502262443438912, + "grad_norm": 0.3955032527446747, + "learning_rate": 0.0003477473535889488, + "loss": 0.0502, + "mean_token_accuracy": 0.9823585599660873, + "num_tokens": 6229785.0, + "step": 706 + }, + { + "entropy": 1.7515103816986084, + "epoch": 2.5538461538461537, + "grad_norm": 0.6166616082191467, + "learning_rate": 0.00034731834779513313, + "loss": 0.1113, + "mean_token_accuracy": 0.9675650298595428, + "num_tokens": 6238724.0, + "step": 707 + }, + { + "entropy": 1.8460631668567657, + "epoch": 2.557466063348416, + "grad_norm": 0.8243921399116516, + "learning_rate": 0.0003468890903056872, + "loss": 0.1625, + "mean_token_accuracy": 0.9648249596357346, + "num_tokens": 6246939.0, + "step": 708 + }, + { + "entropy": 1.784417450428009, + "epoch": 2.5610859728506785, + "grad_norm": 0.5633116960525513, + "learning_rate": 0.00034645958275098557, + "loss": 0.1074, + "mean_token_accuracy": 0.9705483913421631, + "num_tokens": 6255686.0, + "step": 709 + }, + { + "entropy": 1.7208334505558014, + "epoch": 2.564705882352941, + "grad_norm": 0.8083389401435852, + "learning_rate": 0.0003460298267623526, + "loss": 0.1184, + "mean_token_accuracy": 0.9747882932424545, + "num_tokens": 6265047.0, + "step": 710 + }, + { + "entropy": 1.7345463037490845, + "epoch": 2.5683257918552034, + "grad_norm": 0.6094368100166321, + "learning_rate": 0.0003455998239720565, + "loss": 0.1689, + "mean_token_accuracy": 0.9613602459430695, + "num_tokens": 6274460.0, + "step": 711 + }, + { + "entropy": 1.9464713335037231, + "epoch": 2.571945701357466, + "grad_norm": 0.6025084853172302, + "learning_rate": 0.0003451695760133025, + "loss": 0.1477, + "mean_token_accuracy": 0.9618766456842422, + "num_tokens": 6282700.0, + "step": 712 + }, + { + "entropy": 1.8449675738811493, + "epoch": 2.5755656108597282, + "grad_norm": 0.43869853019714355, + "learning_rate": 0.0003447390845202272, + "loss": 0.0892, + "mean_token_accuracy": 0.974039301276207, + "num_tokens": 6291627.0, + "step": 713 + }, + { + "entropy": 1.9028298556804657, + "epoch": 2.579185520361991, + "grad_norm": 0.5455291271209717, + "learning_rate": 0.0003443083511278922, + "loss": 0.0939, + "mean_token_accuracy": 0.9729337990283966, + "num_tokens": 6300198.0, + "step": 714 + }, + { + "entropy": 1.8395194113254547, + "epoch": 2.5828054298642535, + "grad_norm": 0.48734748363494873, + "learning_rate": 0.00034387737747227786, + "loss": 0.0791, + "mean_token_accuracy": 0.9785804748535156, + "num_tokens": 6309362.0, + "step": 715 + }, + { + "entropy": 1.8357026278972626, + "epoch": 2.586425339366516, + "grad_norm": 0.4359396994113922, + "learning_rate": 0.000343446165190277, + "loss": 0.0752, + "mean_token_accuracy": 0.9807359129190445, + "num_tokens": 6318232.0, + "step": 716 + }, + { + "entropy": 1.7531521618366241, + "epoch": 2.5900452488687784, + "grad_norm": 0.7446436882019043, + "learning_rate": 0.0003430147159196887, + "loss": 0.1467, + "mean_token_accuracy": 0.9661064445972443, + "num_tokens": 6327607.0, + "step": 717 + }, + { + "entropy": 1.83816197514534, + "epoch": 2.593665158371041, + "grad_norm": 0.3669150173664093, + "learning_rate": 0.0003425830312992125, + "loss": 0.076, + "mean_token_accuracy": 0.9777591675519943, + "num_tokens": 6336991.0, + "step": 718 + }, + { + "entropy": 1.9396244585514069, + "epoch": 2.5972850678733033, + "grad_norm": 0.6049129962921143, + "learning_rate": 0.00034215111296844147, + "loss": 0.1001, + "mean_token_accuracy": 0.968943640589714, + "num_tokens": 6345381.0, + "step": 719 + }, + { + "entropy": 1.8745197057724, + "epoch": 2.6009049773755657, + "grad_norm": 0.8561233878135681, + "learning_rate": 0.00034171896256785645, + "loss": 0.2378, + "mean_token_accuracy": 0.9442594349384308, + "num_tokens": 6354290.0, + "step": 720 + }, + { + "entropy": 1.8199078440666199, + "epoch": 2.604524886877828, + "grad_norm": 0.4546636939048767, + "learning_rate": 0.00034128658173881993, + "loss": 0.0407, + "mean_token_accuracy": 0.9873656630516052, + "num_tokens": 6362826.0, + "step": 721 + }, + { + "entropy": 1.8066097497940063, + "epoch": 2.6081447963800906, + "grad_norm": 0.6496687531471252, + "learning_rate": 0.0003408539721235691, + "loss": 0.1279, + "mean_token_accuracy": 0.9674505293369293, + "num_tokens": 6371666.0, + "step": 722 + }, + { + "entropy": 1.8027856945991516, + "epoch": 2.611764705882353, + "grad_norm": 0.6001412272453308, + "learning_rate": 0.0003404211353652106, + "loss": 0.1144, + "mean_token_accuracy": 0.9672902077436447, + "num_tokens": 6380469.0, + "step": 723 + }, + { + "entropy": 1.7859437465667725, + "epoch": 2.6153846153846154, + "grad_norm": 0.4654795229434967, + "learning_rate": 0.0003399880731077136, + "loss": 0.0655, + "mean_token_accuracy": 0.9804074019193649, + "num_tokens": 6389485.0, + "step": 724 + }, + { + "entropy": 1.722127079963684, + "epoch": 2.619004524886878, + "grad_norm": 0.5452624559402466, + "learning_rate": 0.0003395547869959037, + "loss": 0.0827, + "mean_token_accuracy": 0.972189649939537, + "num_tokens": 6398523.0, + "step": 725 + }, + { + "entropy": 1.7406074404716492, + "epoch": 2.6226244343891403, + "grad_norm": 0.5524203181266785, + "learning_rate": 0.00033912127867545685, + "loss": 0.1279, + "mean_token_accuracy": 0.9688322842121124, + "num_tokens": 6407560.0, + "step": 726 + }, + { + "entropy": 1.7783840000629425, + "epoch": 2.6262443438914027, + "grad_norm": 0.6428073644638062, + "learning_rate": 0.00033868754979289275, + "loss": 0.1392, + "mean_token_accuracy": 0.9665655642747879, + "num_tokens": 6416230.0, + "step": 727 + }, + { + "entropy": 1.7406431436538696, + "epoch": 2.629864253393665, + "grad_norm": 0.6197221875190735, + "learning_rate": 0.0003382536019955691, + "loss": 0.2688, + "mean_token_accuracy": 0.9567561745643616, + "num_tokens": 6425158.0, + "step": 728 + }, + { + "entropy": 1.7054848670959473, + "epoch": 2.6334841628959276, + "grad_norm": 0.499615877866745, + "learning_rate": 0.0003378194369316749, + "loss": 0.0765, + "mean_token_accuracy": 0.9788558930158615, + "num_tokens": 6434219.0, + "step": 729 + }, + { + "entropy": 1.8623437583446503, + "epoch": 2.63710407239819, + "grad_norm": 0.428608775138855, + "learning_rate": 0.0003373850562502243, + "loss": 0.044, + "mean_token_accuracy": 0.9862259030342102, + "num_tokens": 6442657.0, + "step": 730 + }, + { + "entropy": 1.6827208995819092, + "epoch": 2.6407239819004524, + "grad_norm": 0.46222713589668274, + "learning_rate": 0.00033695046160105076, + "loss": 0.0687, + "mean_token_accuracy": 0.9762164503335953, + "num_tokens": 6451550.0, + "step": 731 + }, + { + "entropy": 1.707773894071579, + "epoch": 2.644343891402715, + "grad_norm": 0.4701695442199707, + "learning_rate": 0.0003365156546347998, + "loss": 0.0622, + "mean_token_accuracy": 0.9804075062274933, + "num_tokens": 6460494.0, + "step": 732 + }, + { + "entropy": 1.7011042833328247, + "epoch": 2.6479638009049773, + "grad_norm": 0.5986224412918091, + "learning_rate": 0.0003360806370029239, + "loss": 0.0954, + "mean_token_accuracy": 0.9730664491653442, + "num_tokens": 6469728.0, + "step": 733 + }, + { + "entropy": 1.810427963733673, + "epoch": 2.6515837104072397, + "grad_norm": 0.8224559426307678, + "learning_rate": 0.0003356454103576754, + "loss": 0.1218, + "mean_token_accuracy": 0.9742488712072372, + "num_tokens": 6478643.0, + "step": 734 + }, + { + "entropy": 1.773183435201645, + "epoch": 2.655203619909502, + "grad_norm": 0.609344482421875, + "learning_rate": 0.0003352099763521006, + "loss": 0.0955, + "mean_token_accuracy": 0.9747250378131866, + "num_tokens": 6487314.0, + "step": 735 + }, + { + "entropy": 1.7761066555976868, + "epoch": 2.6588235294117646, + "grad_norm": 0.6947258114814758, + "learning_rate": 0.0003347743366400333, + "loss": 0.1188, + "mean_token_accuracy": 0.9693178832530975, + "num_tokens": 6496074.0, + "step": 736 + }, + { + "entropy": 1.7725336253643036, + "epoch": 2.662443438914027, + "grad_norm": 0.6928444504737854, + "learning_rate": 0.0003343384928760887, + "loss": 0.1589, + "mean_token_accuracy": 0.9603369683027267, + "num_tokens": 6504997.0, + "step": 737 + }, + { + "entropy": 1.8763961493968964, + "epoch": 2.6660633484162894, + "grad_norm": 0.6204855442047119, + "learning_rate": 0.00033390244671565694, + "loss": 0.1115, + "mean_token_accuracy": 0.9727036952972412, + "num_tokens": 6513639.0, + "step": 738 + }, + { + "entropy": 1.8347080647945404, + "epoch": 2.669683257918552, + "grad_norm": 0.4470975697040558, + "learning_rate": 0.00033346619981489687, + "loss": 0.0707, + "mean_token_accuracy": 0.9816004037857056, + "num_tokens": 6522524.0, + "step": 739 + }, + { + "entropy": 1.8440867066383362, + "epoch": 2.6733031674208148, + "grad_norm": 0.6848122477531433, + "learning_rate": 0.0003330297538307298, + "loss": 0.1133, + "mean_token_accuracy": 0.966602012515068, + "num_tokens": 6531421.0, + "step": 740 + }, + { + "entropy": 1.829009771347046, + "epoch": 2.676923076923077, + "grad_norm": 0.37875643372535706, + "learning_rate": 0.0003325931104208333, + "loss": 0.0539, + "mean_token_accuracy": 0.9850967526435852, + "num_tokens": 6540304.0, + "step": 741 + }, + { + "entropy": 1.8256315886974335, + "epoch": 2.6805429864253396, + "grad_norm": 0.4970630407333374, + "learning_rate": 0.00033215627124363466, + "loss": 0.1195, + "mean_token_accuracy": 0.9662436544895172, + "num_tokens": 6549267.0, + "step": 742 + }, + { + "entropy": 1.823629915714264, + "epoch": 2.684162895927602, + "grad_norm": 0.659981906414032, + "learning_rate": 0.0003317192379583047, + "loss": 0.1368, + "mean_token_accuracy": 0.9655566364526749, + "num_tokens": 6558447.0, + "step": 743 + }, + { + "entropy": 1.8459455370903015, + "epoch": 2.6877828054298645, + "grad_norm": 0.620197057723999, + "learning_rate": 0.0003312820122247515, + "loss": 0.1766, + "mean_token_accuracy": 0.9569400995969772, + "num_tokens": 6567424.0, + "step": 744 + }, + { + "entropy": 1.7685991525650024, + "epoch": 2.691402714932127, + "grad_norm": 0.34498465061187744, + "learning_rate": 0.0003308445957036142, + "loss": 0.0615, + "mean_token_accuracy": 0.982216015458107, + "num_tokens": 6577071.0, + "step": 745 + }, + { + "entropy": 1.8037284910678864, + "epoch": 2.6950226244343893, + "grad_norm": 0.5550521016120911, + "learning_rate": 0.00033040699005625654, + "loss": 0.0701, + "mean_token_accuracy": 0.9795115292072296, + "num_tokens": 6586396.0, + "step": 746 + }, + { + "entropy": 1.813001424074173, + "epoch": 2.6986425339366518, + "grad_norm": 0.4117080271244049, + "learning_rate": 0.0003299691969447603, + "loss": 0.0657, + "mean_token_accuracy": 0.978747770190239, + "num_tokens": 6595189.0, + "step": 747 + }, + { + "entropy": 1.844575196504593, + "epoch": 2.702262443438914, + "grad_norm": 0.32197874784469604, + "learning_rate": 0.00032953121803191976, + "loss": 0.0342, + "mean_token_accuracy": 0.9904316365718842, + "num_tokens": 6604169.0, + "step": 748 + }, + { + "entropy": 1.9490505158901215, + "epoch": 2.7058823529411766, + "grad_norm": 0.5810762047767639, + "learning_rate": 0.00032909305498123465, + "loss": 0.1419, + "mean_token_accuracy": 0.9646100401878357, + "num_tokens": 6612744.0, + "step": 749 + }, + { + "entropy": 1.9927488267421722, + "epoch": 2.709502262443439, + "grad_norm": 0.7435065507888794, + "learning_rate": 0.0003286547094569039, + "loss": 0.1368, + "mean_token_accuracy": 0.9609140008687973, + "num_tokens": 6621000.0, + "step": 750 + }, + { + "entropy": 1.8266884088516235, + "epoch": 2.7131221719457015, + "grad_norm": 0.6717537045478821, + "learning_rate": 0.00032821618312381975, + "loss": 0.1449, + "mean_token_accuracy": 0.9694183021783829, + "num_tokens": 6629893.0, + "step": 751 + }, + { + "entropy": 1.850794643163681, + "epoch": 2.716742081447964, + "grad_norm": 0.44241195917129517, + "learning_rate": 0.00032777747764756117, + "loss": 0.0602, + "mean_token_accuracy": 0.9823136776685715, + "num_tokens": 6638696.0, + "step": 752 + }, + { + "entropy": 1.8408480882644653, + "epoch": 2.7203619909502263, + "grad_norm": 0.6299809217453003, + "learning_rate": 0.00032733859469438736, + "loss": 0.1408, + "mean_token_accuracy": 0.9629880636930466, + "num_tokens": 6647431.0, + "step": 753 + }, + { + "entropy": 1.7875444293022156, + "epoch": 2.723981900452489, + "grad_norm": 0.48492106795310974, + "learning_rate": 0.00032689953593123175, + "loss": 0.0806, + "mean_token_accuracy": 0.9798424690961838, + "num_tokens": 6656443.0, + "step": 754 + }, + { + "entropy": 1.778283566236496, + "epoch": 2.727601809954751, + "grad_norm": 0.46145930886268616, + "learning_rate": 0.0003264603030256955, + "loss": 0.0707, + "mean_token_accuracy": 0.9741399586200714, + "num_tokens": 6665465.0, + "step": 755 + }, + { + "entropy": 1.7340950965881348, + "epoch": 2.7312217194570136, + "grad_norm": 0.5734900236129761, + "learning_rate": 0.00032602089764604126, + "loss": 0.1443, + "mean_token_accuracy": 0.96195288002491, + "num_tokens": 6674797.0, + "step": 756 + }, + { + "entropy": 1.7791962027549744, + "epoch": 2.734841628959276, + "grad_norm": 0.5199477076530457, + "learning_rate": 0.00032558132146118636, + "loss": 0.0794, + "mean_token_accuracy": 0.975062221288681, + "num_tokens": 6683578.0, + "step": 757 + }, + { + "entropy": 1.825905591249466, + "epoch": 2.7384615384615385, + "grad_norm": 0.5944926738739014, + "learning_rate": 0.0003251415761406975, + "loss": 0.0909, + "mean_token_accuracy": 0.954865038394928, + "num_tokens": 6691818.0, + "step": 758 + }, + { + "entropy": 1.804949015378952, + "epoch": 2.742081447963801, + "grad_norm": 0.7065241932868958, + "learning_rate": 0.0003247016633547833, + "loss": 0.1511, + "mean_token_accuracy": 0.9687065333127975, + "num_tokens": 6700619.0, + "step": 759 + }, + { + "entropy": 1.7419202327728271, + "epoch": 2.7457013574660634, + "grad_norm": 0.49316564202308655, + "learning_rate": 0.00032426158477428857, + "loss": 0.0867, + "mean_token_accuracy": 0.9774050414562225, + "num_tokens": 6709635.0, + "step": 760 + }, + { + "entropy": 1.8934829235076904, + "epoch": 2.749321266968326, + "grad_norm": 0.9417999386787415, + "learning_rate": 0.00032382134207068787, + "loss": 0.1464, + "mean_token_accuracy": 0.9591032713651657, + "num_tokens": 6717657.0, + "step": 761 + }, + { + "entropy": 1.7354997992515564, + "epoch": 2.7529411764705882, + "grad_norm": 0.7240809798240662, + "learning_rate": 0.00032338093691607907, + "loss": 0.13, + "mean_token_accuracy": 0.9705345183610916, + "num_tokens": 6726671.0, + "step": 762 + }, + { + "entropy": 1.7620687186717987, + "epoch": 2.7565610859728507, + "grad_norm": 0.4986638128757477, + "learning_rate": 0.0003229403709831772, + "loss": 0.0963, + "mean_token_accuracy": 0.9756871312856674, + "num_tokens": 6735157.0, + "step": 763 + }, + { + "entropy": 1.7719130218029022, + "epoch": 2.760180995475113, + "grad_norm": 0.6204966902732849, + "learning_rate": 0.00032249964594530757, + "loss": 0.0578, + "mean_token_accuracy": 0.9815829247236252, + "num_tokens": 6743855.0, + "step": 764 + }, + { + "entropy": 1.7228702902793884, + "epoch": 2.7638009049773755, + "grad_norm": 0.5283492207527161, + "learning_rate": 0.0003220587634764003, + "loss": 0.069, + "mean_token_accuracy": 0.9851528853178024, + "num_tokens": 6753040.0, + "step": 765 + }, + { + "entropy": 1.7129736840724945, + "epoch": 2.767420814479638, + "grad_norm": 0.49026060104370117, + "learning_rate": 0.0003216177252509831, + "loss": 0.0672, + "mean_token_accuracy": 0.9857761710882187, + "num_tokens": 6762014.0, + "step": 766 + }, + { + "entropy": 1.7600707411766052, + "epoch": 2.7710407239819004, + "grad_norm": 0.5250128507614136, + "learning_rate": 0.00032117653294417523, + "loss": 0.1134, + "mean_token_accuracy": 0.9638848602771759, + "num_tokens": 6771012.0, + "step": 767 + }, + { + "entropy": 1.768298476934433, + "epoch": 2.774660633484163, + "grad_norm": 0.5671310424804688, + "learning_rate": 0.00032073518823168143, + "loss": 0.057, + "mean_token_accuracy": 0.9840837568044662, + "num_tokens": 6779601.0, + "step": 768 + }, + { + "entropy": 1.7464122474193573, + "epoch": 2.7782805429864252, + "grad_norm": 0.6007266044616699, + "learning_rate": 0.0003202936927897852, + "loss": 0.081, + "mean_token_accuracy": 0.9773043692111969, + "num_tokens": 6788518.0, + "step": 769 + }, + { + "entropy": 1.6484523713588715, + "epoch": 2.7819004524886877, + "grad_norm": 0.5163906812667847, + "learning_rate": 0.00031985204829534236, + "loss": 0.1215, + "mean_token_accuracy": 0.9645300209522247, + "num_tokens": 6797924.0, + "step": 770 + }, + { + "entropy": 1.7306124567985535, + "epoch": 2.78552036199095, + "grad_norm": 0.5778948068618774, + "learning_rate": 0.00031941025642577515, + "loss": 0.127, + "mean_token_accuracy": 0.9713134616613388, + "num_tokens": 6806828.0, + "step": 771 + }, + { + "entropy": 1.6599189043045044, + "epoch": 2.7891402714932125, + "grad_norm": 0.5121646523475647, + "learning_rate": 0.0003189683188590653, + "loss": 0.1066, + "mean_token_accuracy": 0.9707446396350861, + "num_tokens": 6816144.0, + "step": 772 + }, + { + "entropy": 1.71377295255661, + "epoch": 2.792760180995475, + "grad_norm": 0.9535031318664551, + "learning_rate": 0.00031852623727374787, + "loss": 0.2316, + "mean_token_accuracy": 0.9587533473968506, + "num_tokens": 6824849.0, + "step": 773 + }, + { + "entropy": 1.7716725766658783, + "epoch": 2.7963800904977374, + "grad_norm": 0.5735589265823364, + "learning_rate": 0.00031808401334890537, + "loss": 0.1028, + "mean_token_accuracy": 0.9716143608093262, + "num_tokens": 6833331.0, + "step": 774 + }, + { + "entropy": 1.7134707272052765, + "epoch": 2.8, + "grad_norm": 0.7087857127189636, + "learning_rate": 0.00031764164876416036, + "loss": 0.1201, + "mean_token_accuracy": 0.9686445444822311, + "num_tokens": 6842254.0, + "step": 775 + }, + { + "entropy": 1.6055873930454254, + "epoch": 2.8036199095022623, + "grad_norm": 0.4578965902328491, + "learning_rate": 0.00031719914519967, + "loss": 0.0827, + "mean_token_accuracy": 0.972065269947052, + "num_tokens": 6851644.0, + "step": 776 + }, + { + "entropy": 1.6444376707077026, + "epoch": 2.8072398190045247, + "grad_norm": 0.5656917095184326, + "learning_rate": 0.0003167565043361194, + "loss": 0.1036, + "mean_token_accuracy": 0.9723617881536484, + "num_tokens": 6860787.0, + "step": 777 + }, + { + "entropy": 1.6980305314064026, + "epoch": 2.810859728506787, + "grad_norm": 0.7013098001480103, + "learning_rate": 0.0003163137278547146, + "loss": 0.0838, + "mean_token_accuracy": 0.9793482422828674, + "num_tokens": 6869378.0, + "step": 778 + }, + { + "entropy": 1.6744478940963745, + "epoch": 2.8144796380090495, + "grad_norm": 0.6889812350273132, + "learning_rate": 0.00031587081743717735, + "loss": 0.0964, + "mean_token_accuracy": 0.9762091189622879, + "num_tokens": 6878050.0, + "step": 779 + }, + { + "entropy": 1.6397214829921722, + "epoch": 2.818099547511312, + "grad_norm": 0.7166011333465576, + "learning_rate": 0.00031542777476573785, + "loss": 0.1792, + "mean_token_accuracy": 0.9539972990751266, + "num_tokens": 6887153.0, + "step": 780 + }, + { + "entropy": 1.6447750926017761, + "epoch": 2.8217194570135744, + "grad_norm": 0.7113035321235657, + "learning_rate": 0.0003149846015231286, + "loss": 0.1464, + "mean_token_accuracy": 0.96909099817276, + "num_tokens": 6895877.0, + "step": 781 + }, + { + "entropy": 1.6827795505523682, + "epoch": 2.825339366515837, + "grad_norm": 0.6915350556373596, + "learning_rate": 0.0003145412993925781, + "loss": 0.1335, + "mean_token_accuracy": 0.9615183472633362, + "num_tokens": 6904553.0, + "step": 782 + }, + { + "entropy": 1.6189779937267303, + "epoch": 2.8289592760180997, + "grad_norm": 0.467428982257843, + "learning_rate": 0.00031409787005780423, + "loss": 0.0829, + "mean_token_accuracy": 0.9781016558408737, + "num_tokens": 6913634.0, + "step": 783 + }, + { + "entropy": 1.6323690116405487, + "epoch": 2.832579185520362, + "grad_norm": 0.49170154333114624, + "learning_rate": 0.00031365431520300813, + "loss": 0.0828, + "mean_token_accuracy": 0.9719655811786652, + "num_tokens": 6922638.0, + "step": 784 + }, + { + "entropy": 1.6121336817741394, + "epoch": 2.8361990950226246, + "grad_norm": 0.5629302263259888, + "learning_rate": 0.00031321063651286777, + "loss": 0.0757, + "mean_token_accuracy": 0.9791934490203857, + "num_tokens": 6931590.0, + "step": 785 + }, + { + "entropy": 1.7345627546310425, + "epoch": 2.839819004524887, + "grad_norm": 0.5514137148857117, + "learning_rate": 0.0003127668356725313, + "loss": 0.0819, + "mean_token_accuracy": 0.9800210148096085, + "num_tokens": 6940137.0, + "step": 786 + }, + { + "entropy": 1.6671563386917114, + "epoch": 2.8434389140271494, + "grad_norm": 0.5090643167495728, + "learning_rate": 0.0003123229143676109, + "loss": 0.0794, + "mean_token_accuracy": 0.9826332330703735, + "num_tokens": 6948616.0, + "step": 787 + }, + { + "entropy": 1.551501840353012, + "epoch": 2.847058823529412, + "grad_norm": 0.3994922935962677, + "learning_rate": 0.0003118788742841761, + "loss": 0.0491, + "mean_token_accuracy": 0.9865831136703491, + "num_tokens": 6957369.0, + "step": 788 + }, + { + "entropy": 1.500845193862915, + "epoch": 2.8506787330316743, + "grad_norm": 0.6023295521736145, + "learning_rate": 0.00031143471710874795, + "loss": 0.114, + "mean_token_accuracy": 0.9669302552938461, + "num_tokens": 6966667.0, + "step": 789 + }, + { + "entropy": 1.5258118510246277, + "epoch": 2.8542986425339367, + "grad_norm": 0.5326524972915649, + "learning_rate": 0.00031099044452829186, + "loss": 0.0657, + "mean_token_accuracy": 0.9833361059427261, + "num_tokens": 6975880.0, + "step": 790 + }, + { + "entropy": 1.5674570798873901, + "epoch": 2.857918552036199, + "grad_norm": 0.4518730044364929, + "learning_rate": 0.00031054605823021186, + "loss": 0.0569, + "mean_token_accuracy": 0.9832890778779984, + "num_tokens": 6984824.0, + "step": 791 + }, + { + "entropy": 1.5301121771335602, + "epoch": 2.8615384615384616, + "grad_norm": 0.5933698415756226, + "learning_rate": 0.00031010155990234364, + "loss": 0.1129, + "mean_token_accuracy": 0.9684284627437592, + "num_tokens": 6994076.0, + "step": 792 + }, + { + "entropy": 1.5711756348609924, + "epoch": 2.865158371040724, + "grad_norm": 0.6634730696678162, + "learning_rate": 0.00030965695123294837, + "loss": 0.1204, + "mean_token_accuracy": 0.972825437784195, + "num_tokens": 7003048.0, + "step": 793 + }, + { + "entropy": 1.6537431180477142, + "epoch": 2.8687782805429864, + "grad_norm": 0.5688450336456299, + "learning_rate": 0.0003092122339107067, + "loss": 0.0659, + "mean_token_accuracy": 0.9861912727355957, + "num_tokens": 7011743.0, + "step": 794 + }, + { + "entropy": 1.731940358877182, + "epoch": 2.872398190045249, + "grad_norm": 0.9030163288116455, + "learning_rate": 0.0003087674096247115, + "loss": 0.0829, + "mean_token_accuracy": 0.9802074134349823, + "num_tokens": 7020003.0, + "step": 795 + }, + { + "entropy": 1.6672345995903015, + "epoch": 2.8760180995475113, + "grad_norm": 0.5129911303520203, + "learning_rate": 0.00030832248006446223, + "loss": 0.0823, + "mean_token_accuracy": 0.9805259853601456, + "num_tokens": 7029275.0, + "step": 796 + }, + { + "entropy": 1.7102139592170715, + "epoch": 2.8796380090497737, + "grad_norm": 0.6210790872573853, + "learning_rate": 0.00030787744691985797, + "loss": 0.1248, + "mean_token_accuracy": 0.9665560126304626, + "num_tokens": 7038068.0, + "step": 797 + }, + { + "entropy": 1.659182459115982, + "epoch": 2.883257918552036, + "grad_norm": 0.6379976868629456, + "learning_rate": 0.0003074323118811913, + "loss": 0.1065, + "mean_token_accuracy": 0.9647062122821808, + "num_tokens": 7047039.0, + "step": 798 + }, + { + "entropy": 1.6344517767429352, + "epoch": 2.8868778280542986, + "grad_norm": 0.5851842761039734, + "learning_rate": 0.00030698707663914186, + "loss": 0.1046, + "mean_token_accuracy": 0.9666399955749512, + "num_tokens": 7056105.0, + "step": 799 + }, + { + "entropy": 1.6803805828094482, + "epoch": 2.890497737556561, + "grad_norm": 0.5926725268363953, + "learning_rate": 0.00030654174288477, + "loss": 0.1019, + "mean_token_accuracy": 0.9712099581956863, + "num_tokens": 7064710.0, + "step": 800 + }, + { + "entropy": 1.7004003822803497, + "epoch": 2.8941176470588235, + "grad_norm": 0.6103729605674744, + "learning_rate": 0.0003060963123095098, + "loss": 0.091, + "mean_token_accuracy": 0.9780148714780807, + "num_tokens": 7073218.0, + "step": 801 + }, + { + "entropy": 1.8133964240550995, + "epoch": 2.897737556561086, + "grad_norm": 0.872008740901947, + "learning_rate": 0.0003056507866051636, + "loss": 0.3003, + "mean_token_accuracy": 0.9385994374752045, + "num_tokens": 7081791.0, + "step": 802 + }, + { + "entropy": 1.7527997195720673, + "epoch": 2.9013574660633483, + "grad_norm": 0.553669810295105, + "learning_rate": 0.0003052051674638945, + "loss": 0.0999, + "mean_token_accuracy": 0.9695112109184265, + "num_tokens": 7090196.0, + "step": 803 + }, + { + "entropy": 1.6374657154083252, + "epoch": 2.9049773755656108, + "grad_norm": 0.4158615469932556, + "learning_rate": 0.00030475945657822107, + "loss": 0.0682, + "mean_token_accuracy": 0.9802833646535873, + "num_tokens": 7099216.0, + "step": 804 + }, + { + "entropy": 1.6056133210659027, + "epoch": 2.908597285067873, + "grad_norm": 0.47468429803848267, + "learning_rate": 0.00030431365564101003, + "loss": 0.1188, + "mean_token_accuracy": 0.9720293581485748, + "num_tokens": 7108787.0, + "step": 805 + }, + { + "entropy": 1.7184821665287018, + "epoch": 2.9122171945701356, + "grad_norm": 0.6617569923400879, + "learning_rate": 0.00030386776634547003, + "loss": 0.1121, + "mean_token_accuracy": 0.9623472690582275, + "num_tokens": 7117158.0, + "step": 806 + }, + { + "entropy": 1.7546651065349579, + "epoch": 2.915837104072398, + "grad_norm": 0.5058173537254333, + "learning_rate": 0.0003034217903851454, + "loss": 0.0861, + "mean_token_accuracy": 0.9664297550916672, + "num_tokens": 7125800.0, + "step": 807 + }, + { + "entropy": 1.6985557675361633, + "epoch": 2.9194570135746605, + "grad_norm": 0.5197705626487732, + "learning_rate": 0.00030297572945390996, + "loss": 0.1009, + "mean_token_accuracy": 0.9677706956863403, + "num_tokens": 7134221.0, + "step": 808 + }, + { + "entropy": 1.6737182438373566, + "epoch": 2.9230769230769234, + "grad_norm": 0.4528989791870117, + "learning_rate": 0.00030252958524595966, + "loss": 0.0656, + "mean_token_accuracy": 0.9853187948465347, + "num_tokens": 7142716.0, + "step": 809 + }, + { + "entropy": 1.687746375799179, + "epoch": 2.926696832579186, + "grad_norm": 0.8552060723304749, + "learning_rate": 0.00030208335945580716, + "loss": 0.1584, + "mean_token_accuracy": 0.958037719130516, + "num_tokens": 7151288.0, + "step": 810 + }, + { + "entropy": 1.6994356215000153, + "epoch": 2.930316742081448, + "grad_norm": 0.470833957195282, + "learning_rate": 0.00030163705377827496, + "loss": 0.0537, + "mean_token_accuracy": 0.9804185479879379, + "num_tokens": 7159738.0, + "step": 811 + }, + { + "entropy": 1.7072536945343018, + "epoch": 2.9339366515837106, + "grad_norm": 0.5749104022979736, + "learning_rate": 0.0003011906699084888, + "loss": 0.0502, + "mean_token_accuracy": 0.9830235093832016, + "num_tokens": 7168101.0, + "step": 812 + }, + { + "entropy": 1.70310440659523, + "epoch": 2.937556561085973, + "grad_norm": 0.7587386965751648, + "learning_rate": 0.0003007442095418715, + "loss": 0.1362, + "mean_token_accuracy": 0.9594880938529968, + "num_tokens": 7176663.0, + "step": 813 + }, + { + "entropy": 1.6307457983493805, + "epoch": 2.9411764705882355, + "grad_norm": 0.5054190754890442, + "learning_rate": 0.00030029767437413665, + "loss": 0.0744, + "mean_token_accuracy": 0.9738886505365372, + "num_tokens": 7185376.0, + "step": 814 + }, + { + "entropy": 1.5872860848903656, + "epoch": 2.944796380090498, + "grad_norm": 0.5463546514511108, + "learning_rate": 0.00029985106610128147, + "loss": 0.0916, + "mean_token_accuracy": 0.9782509952783585, + "num_tokens": 7194304.0, + "step": 815 + }, + { + "entropy": 1.6643644273281097, + "epoch": 2.9484162895927604, + "grad_norm": 0.5434613823890686, + "learning_rate": 0.0002994043864195811, + "loss": 0.1007, + "mean_token_accuracy": 0.9665197134017944, + "num_tokens": 7202895.0, + "step": 816 + }, + { + "entropy": 1.701482743024826, + "epoch": 2.952036199095023, + "grad_norm": 1.2643967866897583, + "learning_rate": 0.00029895763702558206, + "loss": 0.1377, + "mean_token_accuracy": 0.9696027487516403, + "num_tokens": 7211000.0, + "step": 817 + }, + { + "entropy": 1.688760131597519, + "epoch": 2.9556561085972852, + "grad_norm": 0.5438109636306763, + "learning_rate": 0.00029851081961609536, + "loss": 0.0637, + "mean_token_accuracy": 0.9724639654159546, + "num_tokens": 7219274.0, + "step": 818 + }, + { + "entropy": 1.6547857522964478, + "epoch": 2.9592760180995477, + "grad_norm": 0.4520387649536133, + "learning_rate": 0.0002980639358881906, + "loss": 0.0376, + "mean_token_accuracy": 0.9887004494667053, + "num_tokens": 7228000.0, + "step": 819 + }, + { + "entropy": 1.5814381837844849, + "epoch": 2.96289592760181, + "grad_norm": 0.49122339487075806, + "learning_rate": 0.00029761698753918894, + "loss": 0.0533, + "mean_token_accuracy": 0.983299508690834, + "num_tokens": 7236798.0, + "step": 820 + }, + { + "entropy": 1.5796774625778198, + "epoch": 2.9665158371040725, + "grad_norm": 0.43303897976875305, + "learning_rate": 0.00029716997626665726, + "loss": 0.0517, + "mean_token_accuracy": 0.984140008687973, + "num_tokens": 7245570.0, + "step": 821 + }, + { + "entropy": 1.5434466302394867, + "epoch": 2.970135746606335, + "grad_norm": 0.5712567567825317, + "learning_rate": 0.0002967229037684014, + "loss": 0.0634, + "mean_token_accuracy": 0.9851510971784592, + "num_tokens": 7254482.0, + "step": 822 + }, + { + "entropy": 1.5368549823760986, + "epoch": 2.9737556561085974, + "grad_norm": 0.5042312741279602, + "learning_rate": 0.0002962757717424595, + "loss": 0.1041, + "mean_token_accuracy": 0.9698852747678757, + "num_tokens": 7263428.0, + "step": 823 + }, + { + "entropy": 1.5740615129470825, + "epoch": 2.97737556561086, + "grad_norm": 0.8506835699081421, + "learning_rate": 0.0002958285818870963, + "loss": 0.0653, + "mean_token_accuracy": 0.9827365875244141, + "num_tokens": 7272425.0, + "step": 824 + }, + { + "entropy": 1.625010073184967, + "epoch": 2.9809954751131222, + "grad_norm": 0.6260822415351868, + "learning_rate": 0.00029538133590079556, + "loss": 0.1112, + "mean_token_accuracy": 0.9715189933776855, + "num_tokens": 7281312.0, + "step": 825 + }, + { + "entropy": 1.6078990697860718, + "epoch": 2.9846153846153847, + "grad_norm": 0.4316014349460602, + "learning_rate": 0.00029493403548225467, + "loss": 0.059, + "mean_token_accuracy": 0.9821690768003464, + "num_tokens": 7289748.0, + "step": 826 + }, + { + "entropy": 1.6132618486881256, + "epoch": 2.988235294117647, + "grad_norm": 0.6471059322357178, + "learning_rate": 0.0002944866823303776, + "loss": 0.0839, + "mean_token_accuracy": 0.9747331887483597, + "num_tokens": 7298453.0, + "step": 827 + }, + { + "entropy": 1.6038751900196075, + "epoch": 2.9918552036199095, + "grad_norm": 0.5383681654930115, + "learning_rate": 0.0002940392781442686, + "loss": 0.0728, + "mean_token_accuracy": 0.9774085730314255, + "num_tokens": 7307116.0, + "step": 828 + }, + { + "entropy": 1.6446776688098907, + "epoch": 2.995475113122172, + "grad_norm": 0.5420554280281067, + "learning_rate": 0.0002935918246232259, + "loss": 0.0799, + "mean_token_accuracy": 0.977481946349144, + "num_tokens": 7315668.0, + "step": 829 + }, + { + "entropy": 1.5571844279766083, + "epoch": 2.9990950226244344, + "grad_norm": 0.6471306681632996, + "learning_rate": 0.00029314432346673485, + "loss": 0.1657, + "mean_token_accuracy": 0.9566951394081116, + "num_tokens": 7324721.0, + "step": 830 + }, + { + "entropy": 2.0783205032348633, + "epoch": 3.0, + "grad_norm": 3.195817232131958, + "learning_rate": 0.000292696776374462, + "loss": 0.0742, + "mean_token_accuracy": 0.96875, + "num_tokens": 7325175.0, + "step": 831 + }, + { + "epoch": 3.0, + "eval_entropy": 1.6213929740394033, + "eval_loss": 0.14780744910240173, + "eval_mean_token_accuracy": 0.9634173047251817, + "eval_num_tokens": 7325175.0, + "eval_runtime": 116.0041, + "eval_samples_per_second": 3.181, + "eval_steps_per_second": 1.06, + "step": 831 + }, + { + "entropy": 1.639732986688614, + "epoch": 3.0036199095022624, + "grad_norm": 0.45313218235969543, + "learning_rate": 0.00029224918504624814, + "loss": 0.0569, + "mean_token_accuracy": 0.9821487963199615, + "num_tokens": 7333756.0, + "step": 832 + }, + { + "entropy": 1.620821863412857, + "epoch": 3.007239819004525, + "grad_norm": 0.4471704363822937, + "learning_rate": 0.0002918015511821022, + "loss": 0.059, + "mean_token_accuracy": 0.9843536615371704, + "num_tokens": 7342266.0, + "step": 833 + }, + { + "entropy": 1.722977101802826, + "epoch": 3.0108597285067873, + "grad_norm": 0.5039600729942322, + "learning_rate": 0.0002913538764821947, + "loss": 0.0438, + "mean_token_accuracy": 0.9868119210004807, + "num_tokens": 7350541.0, + "step": 834 + }, + { + "entropy": 1.6466768980026245, + "epoch": 3.0144796380090497, + "grad_norm": 0.4470590054988861, + "learning_rate": 0.0002909061626468512, + "loss": 0.0418, + "mean_token_accuracy": 0.9859062284231186, + "num_tokens": 7359236.0, + "step": 835 + }, + { + "entropy": 1.6936305463314056, + "epoch": 3.018099547511312, + "grad_norm": 0.5103632211685181, + "learning_rate": 0.00029045841137654584, + "loss": 0.0649, + "mean_token_accuracy": 0.9817161113023758, + "num_tokens": 7367649.0, + "step": 836 + }, + { + "entropy": 1.5894218981266022, + "epoch": 3.0217194570135746, + "grad_norm": 0.4315621554851532, + "learning_rate": 0.000290010624371895, + "loss": 0.0779, + "mean_token_accuracy": 0.9756399989128113, + "num_tokens": 7376772.0, + "step": 837 + }, + { + "entropy": 1.6676535904407501, + "epoch": 3.025339366515837, + "grad_norm": 0.6142503023147583, + "learning_rate": 0.00028956280333365084, + "loss": 0.0601, + "mean_token_accuracy": 0.9850548654794693, + "num_tokens": 7385454.0, + "step": 838 + }, + { + "entropy": 1.6877512037754059, + "epoch": 3.0289592760180994, + "grad_norm": 0.5499544739723206, + "learning_rate": 0.0002891149499626948, + "loss": 0.06, + "mean_token_accuracy": 0.980460986495018, + "num_tokens": 7393843.0, + "step": 839 + }, + { + "entropy": 1.64662566781044, + "epoch": 3.032579185520362, + "grad_norm": 0.7865297198295593, + "learning_rate": 0.00028866706596003094, + "loss": 0.1098, + "mean_token_accuracy": 0.9614097326993942, + "num_tokens": 7402203.0, + "step": 840 + }, + { + "entropy": 1.5609408617019653, + "epoch": 3.0361990950226243, + "grad_norm": 0.5209096074104309, + "learning_rate": 0.0002882191530267797, + "loss": 0.0893, + "mean_token_accuracy": 0.9731233417987823, + "num_tokens": 7411227.0, + "step": 841 + }, + { + "entropy": 1.6110387742519379, + "epoch": 3.0398190045248867, + "grad_norm": 0.49672260880470276, + "learning_rate": 0.00028777121286417185, + "loss": 0.0512, + "mean_token_accuracy": 0.9793709367513657, + "num_tokens": 7419751.0, + "step": 842 + }, + { + "entropy": 1.5630280673503876, + "epoch": 3.043438914027149, + "grad_norm": 0.5099878907203674, + "learning_rate": 0.00028732324717354083, + "loss": 0.0447, + "mean_token_accuracy": 0.9830391556024551, + "num_tokens": 7428533.0, + "step": 843 + }, + { + "entropy": 1.5407153069972992, + "epoch": 3.0470588235294116, + "grad_norm": 0.7725343704223633, + "learning_rate": 0.0002868752576563175, + "loss": 0.071, + "mean_token_accuracy": 0.9820850938558578, + "num_tokens": 7437390.0, + "step": 844 + }, + { + "entropy": 1.5895936191082, + "epoch": 3.050678733031674, + "grad_norm": 0.5729185938835144, + "learning_rate": 0.0002864272460140234, + "loss": 0.0651, + "mean_token_accuracy": 0.9816865175962448, + "num_tokens": 7445715.0, + "step": 845 + }, + { + "entropy": 1.5614444315433502, + "epoch": 3.0542986425339365, + "grad_norm": 0.49079445004463196, + "learning_rate": 0.00028597921394826346, + "loss": 0.078, + "mean_token_accuracy": 0.9791339933872223, + "num_tokens": 7454770.0, + "step": 846 + }, + { + "entropy": 1.4948404431343079, + "epoch": 3.057918552036199, + "grad_norm": 0.45897549390792847, + "learning_rate": 0.0002855311631607209, + "loss": 0.0506, + "mean_token_accuracy": 0.9858186691999435, + "num_tokens": 7463578.0, + "step": 847 + }, + { + "entropy": 1.4837007820606232, + "epoch": 3.0615384615384613, + "grad_norm": 0.6153395771980286, + "learning_rate": 0.0002850830953531494, + "loss": 0.0862, + "mean_token_accuracy": 0.9767305850982666, + "num_tokens": 7472726.0, + "step": 848 + }, + { + "entropy": 1.5454865992069244, + "epoch": 3.065158371040724, + "grad_norm": 0.9645626544952393, + "learning_rate": 0.00028463501222736787, + "loss": 0.1448, + "mean_token_accuracy": 0.9689669013023376, + "num_tokens": 7481594.0, + "step": 849 + }, + { + "entropy": 1.503423035144806, + "epoch": 3.0687782805429866, + "grad_norm": 0.5449880361557007, + "learning_rate": 0.00028418691548525306, + "loss": 0.0809, + "mean_token_accuracy": 0.9776449203491211, + "num_tokens": 7490420.0, + "step": 850 + }, + { + "entropy": 1.43245068192482, + "epoch": 3.072398190045249, + "grad_norm": 0.7362976670265198, + "learning_rate": 0.0002837388068287334, + "loss": 0.0956, + "mean_token_accuracy": 0.9742193967103958, + "num_tokens": 7499567.0, + "step": 851 + }, + { + "entropy": 1.4874032735824585, + "epoch": 3.0760180995475115, + "grad_norm": 0.5615106821060181, + "learning_rate": 0.00028329068795978274, + "loss": 0.0837, + "mean_token_accuracy": 0.9790486842393875, + "num_tokens": 7508507.0, + "step": 852 + }, + { + "entropy": 1.4498116374015808, + "epoch": 3.079638009049774, + "grad_norm": 0.4348931610584259, + "learning_rate": 0.00028284256058041363, + "loss": 0.0634, + "mean_token_accuracy": 0.9843485057353973, + "num_tokens": 7517576.0, + "step": 853 + }, + { + "entropy": 1.5673332512378693, + "epoch": 3.0832579185520363, + "grad_norm": 0.5114635825157166, + "learning_rate": 0.000282394426392671, + "loss": 0.0663, + "mean_token_accuracy": 0.9789460599422455, + "num_tokens": 7526323.0, + "step": 854 + }, + { + "entropy": 1.5406886637210846, + "epoch": 3.086877828054299, + "grad_norm": 0.4056108593940735, + "learning_rate": 0.0002819462870986256, + "loss": 0.051, + "mean_token_accuracy": 0.985608771443367, + "num_tokens": 7535015.0, + "step": 855 + }, + { + "entropy": 1.5750982761383057, + "epoch": 3.090497737556561, + "grad_norm": 0.5252281427383423, + "learning_rate": 0.00028149814440036757, + "loss": 0.0426, + "mean_token_accuracy": 0.9839760363101959, + "num_tokens": 7543712.0, + "step": 856 + }, + { + "entropy": 1.5696458220481873, + "epoch": 3.0941176470588236, + "grad_norm": 0.7764946222305298, + "learning_rate": 0.00028105, + "loss": 0.1381, + "mean_token_accuracy": 0.9690622985363007, + "num_tokens": 7552651.0, + "step": 857 + }, + { + "entropy": 1.601443588733673, + "epoch": 3.097737556561086, + "grad_norm": 0.5605809688568115, + "learning_rate": 0.0002806018555996324, + "loss": 0.0847, + "mean_token_accuracy": 0.9771271497011185, + "num_tokens": 7561157.0, + "step": 858 + }, + { + "entropy": 1.5733444690704346, + "epoch": 3.1013574660633485, + "grad_norm": 0.6177924871444702, + "learning_rate": 0.00028015371290137443, + "loss": 0.0654, + "mean_token_accuracy": 0.9801760017871857, + "num_tokens": 7569884.0, + "step": 859 + }, + { + "entropy": 1.6089049577713013, + "epoch": 3.104977375565611, + "grad_norm": 0.9339480400085449, + "learning_rate": 0.000279705573607329, + "loss": 0.0924, + "mean_token_accuracy": 0.9692800939083099, + "num_tokens": 7578417.0, + "step": 860 + }, + { + "entropy": 1.58562570810318, + "epoch": 3.1085972850678734, + "grad_norm": 0.6229763031005859, + "learning_rate": 0.00027925743941958637, + "loss": 0.0689, + "mean_token_accuracy": 0.9820535033941269, + "num_tokens": 7586899.0, + "step": 861 + }, + { + "entropy": 1.6127779185771942, + "epoch": 3.112217194570136, + "grad_norm": 0.5199776291847229, + "learning_rate": 0.0002788093120402174, + "loss": 0.0696, + "mean_token_accuracy": 0.9842628389596939, + "num_tokens": 7595283.0, + "step": 862 + }, + { + "entropy": 1.5815349221229553, + "epoch": 3.1158371040723982, + "grad_norm": 0.3927786946296692, + "learning_rate": 0.0002783611931712666, + "loss": 0.0489, + "mean_token_accuracy": 0.9855190068483353, + "num_tokens": 7603800.0, + "step": 863 + }, + { + "entropy": 1.5128042995929718, + "epoch": 3.1194570135746607, + "grad_norm": 0.5245664715766907, + "learning_rate": 0.00027791308451474695, + "loss": 0.0916, + "mean_token_accuracy": 0.9793255031108856, + "num_tokens": 7612765.0, + "step": 864 + }, + { + "entropy": 1.4662578105926514, + "epoch": 3.123076923076923, + "grad_norm": 0.4836482107639313, + "learning_rate": 0.000277464987772632, + "loss": 0.0363, + "mean_token_accuracy": 0.9882297664880753, + "num_tokens": 7621842.0, + "step": 865 + }, + { + "entropy": 1.6075958013534546, + "epoch": 3.1266968325791855, + "grad_norm": 0.6621652841567993, + "learning_rate": 0.00027701690464685053, + "loss": 0.0703, + "mean_token_accuracy": 0.9801139384508133, + "num_tokens": 7630299.0, + "step": 866 + }, + { + "entropy": 1.5028826892375946, + "epoch": 3.130316742081448, + "grad_norm": 1.076515555381775, + "learning_rate": 0.00027656883683927917, + "loss": 0.1021, + "mean_token_accuracy": 0.9723865538835526, + "num_tokens": 7639269.0, + "step": 867 + }, + { + "entropy": 1.4604552686214447, + "epoch": 3.1339366515837104, + "grad_norm": 0.6197560429573059, + "learning_rate": 0.0002761207860517365, + "loss": 0.0831, + "mean_token_accuracy": 0.9773454070091248, + "num_tokens": 7648589.0, + "step": 868 + }, + { + "entropy": 1.5533301830291748, + "epoch": 3.137556561085973, + "grad_norm": 0.6384056806564331, + "learning_rate": 0.00027567275398597665, + "loss": 0.085, + "mean_token_accuracy": 0.9763429015874863, + "num_tokens": 7657465.0, + "step": 869 + }, + { + "entropy": 1.499713659286499, + "epoch": 3.1411764705882352, + "grad_norm": 0.5099884867668152, + "learning_rate": 0.0002752247423436825, + "loss": 0.0506, + "mean_token_accuracy": 0.9845949709415436, + "num_tokens": 7666239.0, + "step": 870 + }, + { + "entropy": 1.5065864324569702, + "epoch": 3.1447963800904977, + "grad_norm": 0.500906765460968, + "learning_rate": 0.00027477675282645917, + "loss": 0.0505, + "mean_token_accuracy": 0.9816035628318787, + "num_tokens": 7675002.0, + "step": 871 + }, + { + "entropy": 1.492633044719696, + "epoch": 3.14841628959276, + "grad_norm": 0.5848217606544495, + "learning_rate": 0.00027432878713582826, + "loss": 0.0714, + "mean_token_accuracy": 0.9832541942596436, + "num_tokens": 7683452.0, + "step": 872 + }, + { + "entropy": 1.5013020932674408, + "epoch": 3.1520361990950225, + "grad_norm": 0.7728188037872314, + "learning_rate": 0.0002738808469732202, + "loss": 0.1403, + "mean_token_accuracy": 0.9723454862833023, + "num_tokens": 7692088.0, + "step": 873 + }, + { + "entropy": 1.4020000398159027, + "epoch": 3.155656108597285, + "grad_norm": 0.7066675424575806, + "learning_rate": 0.00027343293403996906, + "loss": 0.0631, + "mean_token_accuracy": 0.9841864109039307, + "num_tokens": 7701066.0, + "step": 874 + }, + { + "entropy": 1.4469320476055145, + "epoch": 3.1592760180995474, + "grad_norm": 0.47683194279670715, + "learning_rate": 0.0002729850500373052, + "loss": 0.0787, + "mean_token_accuracy": 0.9787198454141617, + "num_tokens": 7710189.0, + "step": 875 + }, + { + "entropy": 1.4941265881061554, + "epoch": 3.16289592760181, + "grad_norm": 0.5534874796867371, + "learning_rate": 0.00027253719666634916, + "loss": 0.0681, + "mean_token_accuracy": 0.9741384238004684, + "num_tokens": 7718736.0, + "step": 876 + }, + { + "entropy": 1.48758664727211, + "epoch": 3.1665158371040723, + "grad_norm": 0.42443010210990906, + "learning_rate": 0.000272089375628105, + "loss": 0.0452, + "mean_token_accuracy": 0.986537754535675, + "num_tokens": 7727565.0, + "step": 877 + }, + { + "entropy": 1.4197124242782593, + "epoch": 3.1701357466063347, + "grad_norm": 0.4680332541465759, + "learning_rate": 0.00027164158862345416, + "loss": 0.0712, + "mean_token_accuracy": 0.979786142706871, + "num_tokens": 7736663.0, + "step": 878 + }, + { + "entropy": 1.4459567070007324, + "epoch": 3.173755656108597, + "grad_norm": 0.5269680619239807, + "learning_rate": 0.00027119383735314887, + "loss": 0.0527, + "mean_token_accuracy": 0.9839773774147034, + "num_tokens": 7745837.0, + "step": 879 + }, + { + "entropy": 1.4754200279712677, + "epoch": 3.1773755656108595, + "grad_norm": 0.39273717999458313, + "learning_rate": 0.00027074612351780524, + "loss": 0.0188, + "mean_token_accuracy": 0.9941024333238602, + "num_tokens": 7754747.0, + "step": 880 + }, + { + "entropy": 1.440185934305191, + "epoch": 3.180995475113122, + "grad_norm": 0.6401451826095581, + "learning_rate": 0.00027029844881789776, + "loss": 0.0825, + "mean_token_accuracy": 0.9758540540933609, + "num_tokens": 7763933.0, + "step": 881 + }, + { + "entropy": 1.4647364616394043, + "epoch": 3.184615384615385, + "grad_norm": 0.6890838146209717, + "learning_rate": 0.0002698508149537519, + "loss": 0.0609, + "mean_token_accuracy": 0.9836824238300323, + "num_tokens": 7772693.0, + "step": 882 + }, + { + "entropy": 1.510004311800003, + "epoch": 3.1882352941176473, + "grad_norm": 0.4847521185874939, + "learning_rate": 0.000269403223625538, + "loss": 0.0665, + "mean_token_accuracy": 0.9833553731441498, + "num_tokens": 7781591.0, + "step": 883 + }, + { + "entropy": 1.5321883261203766, + "epoch": 3.1918552036199097, + "grad_norm": 0.5583149790763855, + "learning_rate": 0.00026895567653326515, + "loss": 0.0481, + "mean_token_accuracy": 0.9884297996759415, + "num_tokens": 7789893.0, + "step": 884 + }, + { + "entropy": 1.5181719362735748, + "epoch": 3.195475113122172, + "grad_norm": 0.5727811455726624, + "learning_rate": 0.000268508175376774, + "loss": 0.051, + "mean_token_accuracy": 0.9885639101266861, + "num_tokens": 7798544.0, + "step": 885 + }, + { + "entropy": 1.6374200582504272, + "epoch": 3.1990950226244346, + "grad_norm": 0.5002682209014893, + "learning_rate": 0.0002680607218557314, + "loss": 0.0778, + "mean_token_accuracy": 0.9834531843662262, + "num_tokens": 7807030.0, + "step": 886 + }, + { + "entropy": 1.4485781788825989, + "epoch": 3.202714932126697, + "grad_norm": 0.5490010976791382, + "learning_rate": 0.0002676133176696224, + "loss": 0.0612, + "mean_token_accuracy": 0.9833452105522156, + "num_tokens": 7816008.0, + "step": 887 + }, + { + "entropy": 1.5048691630363464, + "epoch": 3.2063348416289594, + "grad_norm": 0.37134769558906555, + "learning_rate": 0.0002671659645177453, + "loss": 0.0411, + "mean_token_accuracy": 0.9869852215051651, + "num_tokens": 7825152.0, + "step": 888 + }, + { + "entropy": 1.522626668214798, + "epoch": 3.209954751131222, + "grad_norm": 0.3474898040294647, + "learning_rate": 0.00026671866409920444, + "loss": 0.0453, + "mean_token_accuracy": 0.9880259335041046, + "num_tokens": 7833517.0, + "step": 889 + }, + { + "entropy": 1.4796842634677887, + "epoch": 3.2135746606334843, + "grad_norm": 0.6107187271118164, + "learning_rate": 0.0002662714181129038, + "loss": 0.0587, + "mean_token_accuracy": 0.9835474342107773, + "num_tokens": 7842418.0, + "step": 890 + }, + { + "entropy": 1.527999073266983, + "epoch": 3.2171945701357467, + "grad_norm": 0.8143520355224609, + "learning_rate": 0.00026582422825754037, + "loss": 0.1435, + "mean_token_accuracy": 0.9624571949243546, + "num_tokens": 7851284.0, + "step": 891 + }, + { + "entropy": 1.488987773656845, + "epoch": 3.220814479638009, + "grad_norm": 0.4910070300102234, + "learning_rate": 0.0002653770962315986, + "loss": 0.0627, + "mean_token_accuracy": 0.9796515852212906, + "num_tokens": 7860011.0, + "step": 892 + }, + { + "entropy": 1.497105747461319, + "epoch": 3.2244343891402716, + "grad_norm": 0.6304562091827393, + "learning_rate": 0.00026493002373334274, + "loss": 0.0837, + "mean_token_accuracy": 0.975927010178566, + "num_tokens": 7868618.0, + "step": 893 + }, + { + "entropy": 1.4863994121551514, + "epoch": 3.228054298642534, + "grad_norm": 0.4768204092979431, + "learning_rate": 0.00026448301246081106, + "loss": 0.0449, + "mean_token_accuracy": 0.9877417385578156, + "num_tokens": 7877335.0, + "step": 894 + }, + { + "entropy": 1.4551187455654144, + "epoch": 3.2316742081447964, + "grad_norm": 0.5773951411247253, + "learning_rate": 0.0002640360641118095, + "loss": 0.0807, + "mean_token_accuracy": 0.974293515086174, + "num_tokens": 7886486.0, + "step": 895 + }, + { + "entropy": 1.4752719104290009, + "epoch": 3.235294117647059, + "grad_norm": 0.8372188806533813, + "learning_rate": 0.00026358918038390464, + "loss": 0.1428, + "mean_token_accuracy": 0.9693069010972977, + "num_tokens": 7895501.0, + "step": 896 + }, + { + "entropy": 1.462522953748703, + "epoch": 3.2389140271493213, + "grad_norm": 0.4307233393192291, + "learning_rate": 0.0002631423629744179, + "loss": 0.0574, + "mean_token_accuracy": 0.9867023974657059, + "num_tokens": 7904614.0, + "step": 897 + }, + { + "entropy": 1.5075399577617645, + "epoch": 3.2425339366515837, + "grad_norm": 0.6246724724769592, + "learning_rate": 0.00026269561358041886, + "loss": 0.074, + "mean_token_accuracy": 0.9773128777742386, + "num_tokens": 7913383.0, + "step": 898 + }, + { + "entropy": 1.407273530960083, + "epoch": 3.246153846153846, + "grad_norm": 0.31213951110839844, + "learning_rate": 0.0002622489338987186, + "loss": 0.0225, + "mean_token_accuracy": 0.994924858212471, + "num_tokens": 7922686.0, + "step": 899 + }, + { + "entropy": 1.3883163630962372, + "epoch": 3.2497737556561086, + "grad_norm": 0.476696252822876, + "learning_rate": 0.00026180232562586335, + "loss": 0.0727, + "mean_token_accuracy": 0.9775501936674118, + "num_tokens": 7931958.0, + "step": 900 + }, + { + "entropy": 1.4200710952281952, + "epoch": 3.253393665158371, + "grad_norm": 0.5860406756401062, + "learning_rate": 0.0002613557904581284, + "loss": 0.0658, + "mean_token_accuracy": 0.9809585213661194, + "num_tokens": 7940834.0, + "step": 901 + }, + { + "entropy": 1.4369202852249146, + "epoch": 3.2570135746606335, + "grad_norm": 0.47559866309165955, + "learning_rate": 0.0002609093300915112, + "loss": 0.0481, + "mean_token_accuracy": 0.9899342954158783, + "num_tokens": 7949907.0, + "step": 902 + }, + { + "entropy": 1.544773817062378, + "epoch": 3.260633484162896, + "grad_norm": 0.6772119402885437, + "learning_rate": 0.00026046294622172504, + "loss": 0.067, + "mean_token_accuracy": 0.9841168224811554, + "num_tokens": 7958556.0, + "step": 903 + }, + { + "entropy": 1.504111796617508, + "epoch": 3.2642533936651583, + "grad_norm": 0.4870680868625641, + "learning_rate": 0.0002600166405441928, + "loss": 0.0379, + "mean_token_accuracy": 0.990149587392807, + "num_tokens": 7967079.0, + "step": 904 + }, + { + "entropy": 1.349067509174347, + "epoch": 3.2678733031674208, + "grad_norm": 0.46113792061805725, + "learning_rate": 0.0002595704147540404, + "loss": 0.0521, + "mean_token_accuracy": 0.9874720871448517, + "num_tokens": 7976551.0, + "step": 905 + }, + { + "entropy": 1.462818831205368, + "epoch": 3.271493212669683, + "grad_norm": 0.7971535325050354, + "learning_rate": 0.0002591242705460901, + "loss": 0.041, + "mean_token_accuracy": 0.9884305745363235, + "num_tokens": 7985622.0, + "step": 906 + }, + { + "entropy": 1.3945342004299164, + "epoch": 3.2751131221719456, + "grad_norm": 0.7364558577537537, + "learning_rate": 0.00025867820961485453, + "loss": 0.0978, + "mean_token_accuracy": 0.9766480922698975, + "num_tokens": 7995012.0, + "step": 907 + }, + { + "entropy": 1.4662614464759827, + "epoch": 3.278733031674208, + "grad_norm": 0.4509989619255066, + "learning_rate": 0.0002582322336545299, + "loss": 0.0401, + "mean_token_accuracy": 0.9875599294900894, + "num_tokens": 8003688.0, + "step": 908 + }, + { + "entropy": 1.5244667828083038, + "epoch": 3.2823529411764705, + "grad_norm": 0.76254802942276, + "learning_rate": 0.00025778634435899, + "loss": 0.0706, + "mean_token_accuracy": 0.9814789742231369, + "num_tokens": 8011711.0, + "step": 909 + }, + { + "entropy": 1.498660922050476, + "epoch": 3.285972850678733, + "grad_norm": 0.5205233097076416, + "learning_rate": 0.0002573405434217788, + "loss": 0.0433, + "mean_token_accuracy": 0.9895520657300949, + "num_tokens": 8020268.0, + "step": 910 + }, + { + "entropy": 1.3673588633537292, + "epoch": 3.2895927601809953, + "grad_norm": 0.36727628111839294, + "learning_rate": 0.0002568948325361054, + "loss": 0.046, + "mean_token_accuracy": 0.9816896766424179, + "num_tokens": 8029676.0, + "step": 911 + }, + { + "entropy": 1.3924965262413025, + "epoch": 3.2932126696832578, + "grad_norm": 0.6359453797340393, + "learning_rate": 0.0002564492133948364, + "loss": 0.0677, + "mean_token_accuracy": 0.9825267344713211, + "num_tokens": 8038613.0, + "step": 912 + }, + { + "entropy": 1.426201194524765, + "epoch": 3.29683257918552, + "grad_norm": 0.5639982223510742, + "learning_rate": 0.0002560036876904902, + "loss": 0.0762, + "mean_token_accuracy": 0.9812760651111603, + "num_tokens": 8047516.0, + "step": 913 + }, + { + "entropy": 1.4323900640010834, + "epoch": 3.3004524886877826, + "grad_norm": 0.5035631060600281, + "learning_rate": 0.00025555825711522995, + "loss": 0.0479, + "mean_token_accuracy": 0.9820713251829147, + "num_tokens": 8056237.0, + "step": 914 + }, + { + "entropy": 1.433140367269516, + "epoch": 3.304072398190045, + "grad_norm": 0.5381770133972168, + "learning_rate": 0.00025511292336085804, + "loss": 0.0584, + "mean_token_accuracy": 0.9868257641792297, + "num_tokens": 8064918.0, + "step": 915 + }, + { + "entropy": 1.412838101387024, + "epoch": 3.3076923076923075, + "grad_norm": 0.46058139204978943, + "learning_rate": 0.00025466768811880866, + "loss": 0.0396, + "mean_token_accuracy": 0.9873918145895004, + "num_tokens": 8073881.0, + "step": 916 + }, + { + "entropy": 1.4484798610210419, + "epoch": 3.31131221719457, + "grad_norm": 0.8550136685371399, + "learning_rate": 0.000254222553080142, + "loss": 0.0744, + "mean_token_accuracy": 0.9780523777008057, + "num_tokens": 8082249.0, + "step": 917 + }, + { + "entropy": 1.4633181393146515, + "epoch": 3.3149321266968323, + "grad_norm": 0.8231784105300903, + "learning_rate": 0.00025377751993553777, + "loss": 0.0847, + "mean_token_accuracy": 0.9764655083417892, + "num_tokens": 8090772.0, + "step": 918 + }, + { + "entropy": 1.5348555445671082, + "epoch": 3.318552036199095, + "grad_norm": 0.6072585582733154, + "learning_rate": 0.00025333259037528847, + "loss": 0.0547, + "mean_token_accuracy": 0.983170285820961, + "num_tokens": 8098744.0, + "step": 919 + }, + { + "entropy": 1.4343461096286774, + "epoch": 3.3221719457013577, + "grad_norm": 0.5895786881446838, + "learning_rate": 0.0002528877660892933, + "loss": 0.033, + "mean_token_accuracy": 0.9907310158014297, + "num_tokens": 8107459.0, + "step": 920 + }, + { + "entropy": 1.3224802315235138, + "epoch": 3.32579185520362, + "grad_norm": 0.4657888114452362, + "learning_rate": 0.0002524430487670515, + "loss": 0.0581, + "mean_token_accuracy": 0.9821915626525879, + "num_tokens": 8116673.0, + "step": 921 + }, + { + "entropy": 1.4497299492359161, + "epoch": 3.3294117647058825, + "grad_norm": 0.5360382795333862, + "learning_rate": 0.0002519984400976564, + "loss": 0.0849, + "mean_token_accuracy": 0.9713759422302246, + "num_tokens": 8125774.0, + "step": 922 + }, + { + "entropy": 1.4038201570510864, + "epoch": 3.333031674208145, + "grad_norm": 0.5329150557518005, + "learning_rate": 0.00025155394176978814, + "loss": 0.0679, + "mean_token_accuracy": 0.9782100170850754, + "num_tokens": 8134777.0, + "step": 923 + }, + { + "entropy": 1.3989399075508118, + "epoch": 3.3366515837104074, + "grad_norm": 0.47847944498062134, + "learning_rate": 0.00025110955547170803, + "loss": 0.0579, + "mean_token_accuracy": 0.9826236069202423, + "num_tokens": 8143596.0, + "step": 924 + }, + { + "entropy": 1.4384986460208893, + "epoch": 3.34027149321267, + "grad_norm": 0.6291977763175964, + "learning_rate": 0.0002506652828912521, + "loss": 0.0826, + "mean_token_accuracy": 0.9759467244148254, + "num_tokens": 8152554.0, + "step": 925 + }, + { + "entropy": 1.3491226136684418, + "epoch": 3.3438914027149322, + "grad_norm": 0.4057374596595764, + "learning_rate": 0.00025022112571582383, + "loss": 0.0428, + "mean_token_accuracy": 0.9861899316310883, + "num_tokens": 8161845.0, + "step": 926 + }, + { + "entropy": 1.3831347525119781, + "epoch": 3.3475113122171947, + "grad_norm": 0.5007946491241455, + "learning_rate": 0.0002497770856323891, + "loss": 0.0417, + "mean_token_accuracy": 0.9847332686185837, + "num_tokens": 8170865.0, + "step": 927 + }, + { + "entropy": 1.4520001113414764, + "epoch": 3.351131221719457, + "grad_norm": 0.5229163765907288, + "learning_rate": 0.00024933316432746864, + "loss": 0.0515, + "mean_token_accuracy": 0.98235984146595, + "num_tokens": 8179738.0, + "step": 928 + }, + { + "entropy": 1.4497073292732239, + "epoch": 3.3547511312217195, + "grad_norm": 0.6086527705192566, + "learning_rate": 0.0002488893634871322, + "loss": 0.082, + "mean_token_accuracy": 0.9839034825563431, + "num_tokens": 8188402.0, + "step": 929 + }, + { + "entropy": 1.4439297020435333, + "epoch": 3.358371040723982, + "grad_norm": 0.6497851014137268, + "learning_rate": 0.00024844568479699187, + "loss": 0.0863, + "mean_token_accuracy": 0.9722652286291122, + "num_tokens": 8196956.0, + "step": 930 + }, + { + "entropy": 1.3755157589912415, + "epoch": 3.3619909502262444, + "grad_norm": 0.6988303661346436, + "learning_rate": 0.0002480021299421957, + "loss": 0.0999, + "mean_token_accuracy": 0.9738437533378601, + "num_tokens": 8205951.0, + "step": 931 + }, + { + "entropy": 1.3790476024150848, + "epoch": 3.365610859728507, + "grad_norm": 0.8188769221305847, + "learning_rate": 0.0002475587006074219, + "loss": 0.206, + "mean_token_accuracy": 0.9557942748069763, + "num_tokens": 8215256.0, + "step": 932 + }, + { + "entropy": 1.4337495565414429, + "epoch": 3.3692307692307693, + "grad_norm": 0.481511652469635, + "learning_rate": 0.00024711539847687135, + "loss": 0.0568, + "mean_token_accuracy": 0.982319638133049, + "num_tokens": 8224081.0, + "step": 933 + }, + { + "entropy": 1.4721867442131042, + "epoch": 3.3728506787330317, + "grad_norm": 0.595804750919342, + "learning_rate": 0.00024667222523426204, + "loss": 0.073, + "mean_token_accuracy": 0.979112833738327, + "num_tokens": 8232560.0, + "step": 934 + }, + { + "entropy": 1.4026366472244263, + "epoch": 3.376470588235294, + "grad_norm": 0.8112502098083496, + "learning_rate": 0.0002462291825628226, + "loss": 0.1302, + "mean_token_accuracy": 0.9592884331941605, + "num_tokens": 8241529.0, + "step": 935 + }, + { + "entropy": 1.4276806712150574, + "epoch": 3.3800904977375565, + "grad_norm": 0.3144559860229492, + "learning_rate": 0.0002457862721452854, + "loss": 0.0355, + "mean_token_accuracy": 0.9895562827587128, + "num_tokens": 8250331.0, + "step": 936 + }, + { + "entropy": 1.4367564022541046, + "epoch": 3.383710407239819, + "grad_norm": 0.6843166947364807, + "learning_rate": 0.0002453434956638806, + "loss": 0.0674, + "mean_token_accuracy": 0.9829154461622238, + "num_tokens": 8259137.0, + "step": 937 + }, + { + "entropy": 1.390118271112442, + "epoch": 3.3873303167420814, + "grad_norm": 0.437500536441803, + "learning_rate": 0.00024490085480032996, + "loss": 0.0323, + "mean_token_accuracy": 0.9916883558034897, + "num_tokens": 8268372.0, + "step": 938 + }, + { + "entropy": 1.3605903685092926, + "epoch": 3.390950226244344, + "grad_norm": 0.6721571087837219, + "learning_rate": 0.00024445835123583964, + "loss": 0.1217, + "mean_token_accuracy": 0.9565094709396362, + "num_tokens": 8277388.0, + "step": 939 + }, + { + "entropy": 1.3998730778694153, + "epoch": 3.3945701357466063, + "grad_norm": 0.38136187195777893, + "learning_rate": 0.00024401598665109463, + "loss": 0.0397, + "mean_token_accuracy": 0.9870003908872604, + "num_tokens": 8286150.0, + "step": 940 + }, + { + "entropy": 1.406863808631897, + "epoch": 3.3981900452488687, + "grad_norm": 0.5735233426094055, + "learning_rate": 0.00024357376272625205, + "loss": 0.0794, + "mean_token_accuracy": 0.9789908528327942, + "num_tokens": 8294719.0, + "step": 941 + }, + { + "entropy": 1.418317824602127, + "epoch": 3.401809954751131, + "grad_norm": 0.624377965927124, + "learning_rate": 0.00024313168114093475, + "loss": 0.0466, + "mean_token_accuracy": 0.9851591736078262, + "num_tokens": 8303298.0, + "step": 942 + }, + { + "entropy": 1.3575542867183685, + "epoch": 3.4054298642533936, + "grad_norm": 0.5194457173347473, + "learning_rate": 0.00024268974357422488, + "loss": 0.0743, + "mean_token_accuracy": 0.9743311256170273, + "num_tokens": 8312635.0, + "step": 943 + }, + { + "entropy": 1.392454832792282, + "epoch": 3.409049773755656, + "grad_norm": 0.5445207357406616, + "learning_rate": 0.00024224795170465756, + "loss": 0.0986, + "mean_token_accuracy": 0.9710196405649185, + "num_tokens": 8321364.0, + "step": 944 + }, + { + "entropy": 1.324178010225296, + "epoch": 3.4126696832579184, + "grad_norm": 0.4121778607368469, + "learning_rate": 0.0002418063072102148, + "loss": 0.0513, + "mean_token_accuracy": 0.9844248443841934, + "num_tokens": 8330452.0, + "step": 945 + }, + { + "entropy": 1.4191058278083801, + "epoch": 3.416289592760181, + "grad_norm": 0.48296698927879333, + "learning_rate": 0.00024136481176831854, + "loss": 0.0561, + "mean_token_accuracy": 0.9812565594911575, + "num_tokens": 8339243.0, + "step": 946 + }, + { + "entropy": 1.3743943274021149, + "epoch": 3.4199095022624433, + "grad_norm": 0.5322384834289551, + "learning_rate": 0.00024092346705582474, + "loss": 0.065, + "mean_token_accuracy": 0.9788537919521332, + "num_tokens": 8347866.0, + "step": 947 + }, + { + "entropy": 1.4042058885097504, + "epoch": 3.4235294117647057, + "grad_norm": 0.5542939901351929, + "learning_rate": 0.00024048227474901697, + "loss": 0.0835, + "mean_token_accuracy": 0.9758901000022888, + "num_tokens": 8356604.0, + "step": 948 + }, + { + "entropy": 1.4089910387992859, + "epoch": 3.427149321266968, + "grad_norm": 0.6025400757789612, + "learning_rate": 0.00024004123652359973, + "loss": 0.0736, + "mean_token_accuracy": 0.9723546206951141, + "num_tokens": 8365168.0, + "step": 949 + }, + { + "entropy": 1.3679145872592926, + "epoch": 3.430769230769231, + "grad_norm": 0.6585437059402466, + "learning_rate": 0.00023960035405469235, + "loss": 0.1387, + "mean_token_accuracy": 0.9651710242033005, + "num_tokens": 8374034.0, + "step": 950 + }, + { + "entropy": 1.3707129955291748, + "epoch": 3.4343891402714934, + "grad_norm": 0.639600932598114, + "learning_rate": 0.0002391596290168228, + "loss": 0.0491, + "mean_token_accuracy": 0.9869592487812042, + "num_tokens": 8383019.0, + "step": 951 + }, + { + "entropy": 1.3920492231845856, + "epoch": 3.438009049773756, + "grad_norm": 0.4947279393672943, + "learning_rate": 0.00023871906308392088, + "loss": 0.0647, + "mean_token_accuracy": 0.98161181807518, + "num_tokens": 8392191.0, + "step": 952 + }, + { + "entropy": 1.4259005188941956, + "epoch": 3.4416289592760183, + "grad_norm": 0.5486235618591309, + "learning_rate": 0.00023827865792931205, + "loss": 0.0581, + "mean_token_accuracy": 0.9796920716762543, + "num_tokens": 8400966.0, + "step": 953 + }, + { + "entropy": 1.4309614896774292, + "epoch": 3.4452488687782807, + "grad_norm": 0.6024688482284546, + "learning_rate": 0.00023783841522571138, + "loss": 0.1217, + "mean_token_accuracy": 0.9621599614620209, + "num_tokens": 8409878.0, + "step": 954 + }, + { + "entropy": 1.427911102771759, + "epoch": 3.448868778280543, + "grad_norm": 0.4339677691459656, + "learning_rate": 0.00023739833664521671, + "loss": 0.0521, + "mean_token_accuracy": 0.9818601310253143, + "num_tokens": 8418609.0, + "step": 955 + }, + { + "entropy": 1.4235480725765228, + "epoch": 3.4524886877828056, + "grad_norm": 0.5715889930725098, + "learning_rate": 0.00023695842385930242, + "loss": 0.0657, + "mean_token_accuracy": 0.9833882004022598, + "num_tokens": 8427265.0, + "step": 956 + }, + { + "entropy": 1.335403710603714, + "epoch": 3.456108597285068, + "grad_norm": 0.34678834676742554, + "learning_rate": 0.00023651867853881356, + "loss": 0.0507, + "mean_token_accuracy": 0.9843446165323257, + "num_tokens": 8436591.0, + "step": 957 + }, + { + "entropy": 1.3923978507518768, + "epoch": 3.4597285067873305, + "grad_norm": 0.8088510632514954, + "learning_rate": 0.00023607910235395882, + "loss": 0.1065, + "mean_token_accuracy": 0.9738757163286209, + "num_tokens": 8445472.0, + "step": 958 + }, + { + "entropy": 1.513672262430191, + "epoch": 3.463348416289593, + "grad_norm": 0.6919769048690796, + "learning_rate": 0.0002356396969743044, + "loss": 0.0846, + "mean_token_accuracy": 0.9809803068637848, + "num_tokens": 8453788.0, + "step": 959 + }, + { + "entropy": 1.3483870327472687, + "epoch": 3.4669683257918553, + "grad_norm": 0.5901163220405579, + "learning_rate": 0.00023520046406876822, + "loss": 0.1035, + "mean_token_accuracy": 0.9659459739923477, + "num_tokens": 8463134.0, + "step": 960 + }, + { + "entropy": 1.4076683819293976, + "epoch": 3.4705882352941178, + "grad_norm": 0.5772054195404053, + "learning_rate": 0.00023476140530561253, + "loss": 0.058, + "mean_token_accuracy": 0.9804215878248215, + "num_tokens": 8471959.0, + "step": 961 + }, + { + "entropy": 1.3382205069065094, + "epoch": 3.47420814479638, + "grad_norm": 0.4780332148075104, + "learning_rate": 0.00023432252235243883, + "loss": 0.074, + "mean_token_accuracy": 0.9757792204618454, + "num_tokens": 8480866.0, + "step": 962 + }, + { + "entropy": 1.432678759098053, + "epoch": 3.4778280542986426, + "grad_norm": 0.5997639298439026, + "learning_rate": 0.00023388381687618022, + "loss": 0.0641, + "mean_token_accuracy": 0.9824596792459488, + "num_tokens": 8489355.0, + "step": 963 + }, + { + "entropy": 1.3388859629631042, + "epoch": 3.481447963800905, + "grad_norm": 0.3654438257217407, + "learning_rate": 0.0002334452905430961, + "loss": 0.0553, + "mean_token_accuracy": 0.9859583377838135, + "num_tokens": 8498607.0, + "step": 964 + }, + { + "entropy": 1.4407559633255005, + "epoch": 3.4850678733031675, + "grad_norm": 0.6571084856987, + "learning_rate": 0.00023300694501876535, + "loss": 0.0887, + "mean_token_accuracy": 0.9736911207437515, + "num_tokens": 8506915.0, + "step": 965 + }, + { + "entropy": 1.4553894400596619, + "epoch": 3.48868778280543, + "grad_norm": 0.459780752658844, + "learning_rate": 0.00023256878196808019, + "loss": 0.0578, + "mean_token_accuracy": 0.98088139295578, + "num_tokens": 8515157.0, + "step": 966 + }, + { + "entropy": 1.4285954535007477, + "epoch": 3.4923076923076923, + "grad_norm": 0.4488624930381775, + "learning_rate": 0.0002321308030552396, + "loss": 0.0466, + "mean_token_accuracy": 0.9883453100919724, + "num_tokens": 8523741.0, + "step": 967 + }, + { + "entropy": 1.3596898019313812, + "epoch": 3.4959276018099548, + "grad_norm": 0.5626068711280823, + "learning_rate": 0.00023169300994374352, + "loss": 0.0663, + "mean_token_accuracy": 0.979169949889183, + "num_tokens": 8532431.0, + "step": 968 + }, + { + "entropy": 1.4428678750991821, + "epoch": 3.499547511312217, + "grad_norm": 0.546142578125, + "learning_rate": 0.0002312554042963858, + "loss": 0.0777, + "mean_token_accuracy": 0.9774435311555862, + "num_tokens": 8540889.0, + "step": 969 + }, + { + "entropy": 1.3509635627269745, + "epoch": 3.5031674208144796, + "grad_norm": 0.6781264543533325, + "learning_rate": 0.00023081798777524847, + "loss": 0.0941, + "mean_token_accuracy": 0.9698395729064941, + "num_tokens": 8550128.0, + "step": 970 + }, + { + "entropy": 1.2727701663970947, + "epoch": 3.506787330316742, + "grad_norm": 0.477498322725296, + "learning_rate": 0.00023038076204169534, + "loss": 0.0447, + "mean_token_accuracy": 0.98555026948452, + "num_tokens": 8559305.0, + "step": 971 + }, + { + "entropy": 1.3704063892364502, + "epoch": 3.5104072398190045, + "grad_norm": 0.5665515661239624, + "learning_rate": 0.00022994372875636534, + "loss": 0.0727, + "mean_token_accuracy": 0.9838419556617737, + "num_tokens": 8568175.0, + "step": 972 + }, + { + "entropy": 1.3341827094554901, + "epoch": 3.514027149321267, + "grad_norm": 0.7451890110969543, + "learning_rate": 0.00022950688957916666, + "loss": 0.0892, + "mean_token_accuracy": 0.9721158593893051, + "num_tokens": 8576916.0, + "step": 973 + }, + { + "entropy": 1.3170603513717651, + "epoch": 3.5176470588235293, + "grad_norm": 0.6274797916412354, + "learning_rate": 0.00022907024616927016, + "loss": 0.0867, + "mean_token_accuracy": 0.9760665446519852, + "num_tokens": 8585937.0, + "step": 974 + }, + { + "entropy": 1.324271559715271, + "epoch": 3.521266968325792, + "grad_norm": 0.49691009521484375, + "learning_rate": 0.00022863380018510321, + "loss": 0.0617, + "mean_token_accuracy": 0.9794053137302399, + "num_tokens": 8594885.0, + "step": 975 + }, + { + "entropy": 1.3782964646816254, + "epoch": 3.524886877828054, + "grad_norm": 0.5726630687713623, + "learning_rate": 0.00022819755328434306, + "loss": 0.0789, + "mean_token_accuracy": 0.9756396412849426, + "num_tokens": 8603243.0, + "step": 976 + }, + { + "entropy": 1.3255096673965454, + "epoch": 3.5285067873303166, + "grad_norm": 0.5568417906761169, + "learning_rate": 0.00022776150712391127, + "loss": 0.0734, + "mean_token_accuracy": 0.974893182516098, + "num_tokens": 8612414.0, + "step": 977 + }, + { + "entropy": 1.4089244902133942, + "epoch": 3.532126696832579, + "grad_norm": 0.5721971392631531, + "learning_rate": 0.00022732566335996674, + "loss": 0.0719, + "mean_token_accuracy": 0.976212814450264, + "num_tokens": 8620851.0, + "step": 978 + }, + { + "entropy": 1.278488278388977, + "epoch": 3.5357466063348415, + "grad_norm": 0.4737057387828827, + "learning_rate": 0.00022689002364789938, + "loss": 0.0329, + "mean_token_accuracy": 0.9908775240182877, + "num_tokens": 8630000.0, + "step": 979 + }, + { + "entropy": 1.3656818866729736, + "epoch": 3.539366515837104, + "grad_norm": 0.6749998927116394, + "learning_rate": 0.00022645458964232456, + "loss": 0.0875, + "mean_token_accuracy": 0.978403314948082, + "num_tokens": 8638635.0, + "step": 980 + }, + { + "entropy": 1.3721227645874023, + "epoch": 3.5429864253393664, + "grad_norm": 0.5295807719230652, + "learning_rate": 0.00022601936299707616, + "loss": 0.0694, + "mean_token_accuracy": 0.9826173633337021, + "num_tokens": 8647726.0, + "step": 981 + }, + { + "entropy": 1.3711304068565369, + "epoch": 3.546606334841629, + "grad_norm": 0.5617223381996155, + "learning_rate": 0.0002255843453652002, + "loss": 0.0745, + "mean_token_accuracy": 0.9778908938169479, + "num_tokens": 8656421.0, + "step": 982 + }, + { + "entropy": 1.3603121936321259, + "epoch": 3.5502262443438912, + "grad_norm": 0.5830493569374084, + "learning_rate": 0.00022514953839894932, + "loss": 0.0498, + "mean_token_accuracy": 0.9847024828195572, + "num_tokens": 8665206.0, + "step": 983 + }, + { + "entropy": 1.3419533371925354, + "epoch": 3.5538461538461537, + "grad_norm": 0.5035730004310608, + "learning_rate": 0.00022471494374977556, + "loss": 0.0873, + "mean_token_accuracy": 0.9755606353282928, + "num_tokens": 8674482.0, + "step": 984 + }, + { + "entropy": 1.4004729092121124, + "epoch": 3.557466063348416, + "grad_norm": 0.4822017252445221, + "learning_rate": 0.0002242805630683251, + "loss": 0.0574, + "mean_token_accuracy": 0.9790775179862976, + "num_tokens": 8683066.0, + "step": 985 + }, + { + "entropy": 1.32045316696167, + "epoch": 3.5610859728506785, + "grad_norm": 0.3949761688709259, + "learning_rate": 0.00022384639800443088, + "loss": 0.0396, + "mean_token_accuracy": 0.9879113733768463, + "num_tokens": 8691966.0, + "step": 986 + }, + { + "entropy": 1.3542158901691437, + "epoch": 3.564705882352941, + "grad_norm": 0.6060124635696411, + "learning_rate": 0.0002234124502071072, + "loss": 0.0827, + "mean_token_accuracy": 0.9750298708677292, + "num_tokens": 8700859.0, + "step": 987 + }, + { + "entropy": 1.2594963014125824, + "epoch": 3.5683257918552034, + "grad_norm": 0.5580794215202332, + "learning_rate": 0.00022297872132454318, + "loss": 0.0691, + "mean_token_accuracy": 0.9793778210878372, + "num_tokens": 8710316.0, + "step": 988 + }, + { + "entropy": 1.3390154540538788, + "epoch": 3.571945701357466, + "grad_norm": 0.38052669167518616, + "learning_rate": 0.00022254521300409626, + "loss": 0.0436, + "mean_token_accuracy": 0.9838068634271622, + "num_tokens": 8719219.0, + "step": 989 + }, + { + "entropy": 1.4103966355323792, + "epoch": 3.5755656108597282, + "grad_norm": 0.6793152093887329, + "learning_rate": 0.00022211192689228633, + "loss": 0.0738, + "mean_token_accuracy": 0.9803069233894348, + "num_tokens": 8727658.0, + "step": 990 + }, + { + "entropy": 1.3170084357261658, + "epoch": 3.579185520361991, + "grad_norm": 0.5633034110069275, + "learning_rate": 0.00022167886463478933, + "loss": 0.0483, + "mean_token_accuracy": 0.9852565824985504, + "num_tokens": 8736502.0, + "step": 991 + }, + { + "entropy": 1.3381566107273102, + "epoch": 3.5828054298642535, + "grad_norm": 0.46346399188041687, + "learning_rate": 0.00022124602787643088, + "loss": 0.0324, + "mean_token_accuracy": 0.9907149076461792, + "num_tokens": 8745057.0, + "step": 992 + }, + { + "entropy": 1.2907682061195374, + "epoch": 3.586425339366516, + "grad_norm": 0.5343260169029236, + "learning_rate": 0.00022081341826118013, + "loss": 0.0577, + "mean_token_accuracy": 0.982825830578804, + "num_tokens": 8754046.0, + "step": 993 + }, + { + "entropy": 1.3302249014377594, + "epoch": 3.5900452488687784, + "grad_norm": 0.7435888051986694, + "learning_rate": 0.00022038103743214345, + "loss": 0.0644, + "mean_token_accuracy": 0.9805946946144104, + "num_tokens": 8762749.0, + "step": 994 + }, + { + "entropy": 1.3043336868286133, + "epoch": 3.593665158371041, + "grad_norm": 0.5991454124450684, + "learning_rate": 0.00021994888703155853, + "loss": 0.1013, + "mean_token_accuracy": 0.9729356169700623, + "num_tokens": 8771617.0, + "step": 995 + }, + { + "entropy": 1.2590918838977814, + "epoch": 3.5972850678733033, + "grad_norm": 0.6732775568962097, + "learning_rate": 0.00021951696870078748, + "loss": 0.2119, + "mean_token_accuracy": 0.9542236030101776, + "num_tokens": 8781055.0, + "step": 996 + }, + { + "entropy": 1.3064957559108734, + "epoch": 3.6009049773755657, + "grad_norm": 0.6471344828605652, + "learning_rate": 0.00021908528408031124, + "loss": 0.0775, + "mean_token_accuracy": 0.9764008969068527, + "num_tokens": 8789616.0, + "step": 997 + }, + { + "entropy": 1.2735644578933716, + "epoch": 3.604524886877828, + "grad_norm": 0.4242008626461029, + "learning_rate": 0.00021865383480972308, + "loss": 0.0517, + "mean_token_accuracy": 0.9826734960079193, + "num_tokens": 8798420.0, + "step": 998 + }, + { + "entropy": 1.2687926590442657, + "epoch": 3.6081447963800906, + "grad_norm": 0.6521032452583313, + "learning_rate": 0.00021822262252772212, + "loss": 0.0831, + "mean_token_accuracy": 0.9806712120771408, + "num_tokens": 8807486.0, + "step": 999 + }, + { + "entropy": 1.2503767311573029, + "epoch": 3.611764705882353, + "grad_norm": 0.44378921389579773, + "learning_rate": 0.00021779164887210774, + "loss": 0.0709, + "mean_token_accuracy": 0.9845052361488342, + "num_tokens": 8816795.0, + "step": 1000 + }, + { + "entropy": 1.2962157726287842, + "epoch": 3.6153846153846154, + "grad_norm": 0.47278398275375366, + "learning_rate": 0.0002173609154797728, + "loss": 0.0321, + "mean_token_accuracy": 0.986628457903862, + "num_tokens": 8825449.0, + "step": 1001 + }, + { + "entropy": 1.335391879081726, + "epoch": 3.619004524886878, + "grad_norm": 0.3435405492782593, + "learning_rate": 0.00021693042398669747, + "loss": 0.0361, + "mean_token_accuracy": 0.9887901991605759, + "num_tokens": 8834296.0, + "step": 1002 + }, + { + "entropy": 1.295527994632721, + "epoch": 3.6226244343891403, + "grad_norm": 0.4150637686252594, + "learning_rate": 0.0002165001760279435, + "loss": 0.0419, + "mean_token_accuracy": 0.9862009286880493, + "num_tokens": 8843354.0, + "step": 1003 + }, + { + "entropy": 1.270320326089859, + "epoch": 3.6262443438914027, + "grad_norm": 0.4439278542995453, + "learning_rate": 0.0002160701732376474, + "loss": 0.0676, + "mean_token_accuracy": 0.9789925366640091, + "num_tokens": 8852311.0, + "step": 1004 + }, + { + "entropy": 1.2495850026607513, + "epoch": 3.629864253393665, + "grad_norm": 0.4471176266670227, + "learning_rate": 0.00021564041724901446, + "loss": 0.0469, + "mean_token_accuracy": 0.98641636967659, + "num_tokens": 8861126.0, + "step": 1005 + }, + { + "entropy": 1.3307124376296997, + "epoch": 3.6334841628959276, + "grad_norm": 0.547099769115448, + "learning_rate": 0.0002152109096943128, + "loss": 0.0861, + "mean_token_accuracy": 0.9793668240308762, + "num_tokens": 8870129.0, + "step": 1006 + }, + { + "entropy": 1.3895522952079773, + "epoch": 3.63710407239819, + "grad_norm": 0.5946421027183533, + "learning_rate": 0.00021478165220486674, + "loss": 0.0704, + "mean_token_accuracy": 0.9831217378377914, + "num_tokens": 8878385.0, + "step": 1007 + }, + { + "entropy": 1.3864755928516388, + "epoch": 3.6407239819004524, + "grad_norm": 0.42203575372695923, + "learning_rate": 0.00021435264641105116, + "loss": 0.0557, + "mean_token_accuracy": 0.9843680560588837, + "num_tokens": 8887161.0, + "step": 1008 + }, + { + "entropy": 1.3556683957576752, + "epoch": 3.644343891402715, + "grad_norm": 0.5707162618637085, + "learning_rate": 0.00021392389394228454, + "loss": 0.0523, + "mean_token_accuracy": 0.9845058023929596, + "num_tokens": 8896049.0, + "step": 1009 + }, + { + "entropy": 1.2712234854698181, + "epoch": 3.6479638009049773, + "grad_norm": 0.6082377433776855, + "learning_rate": 0.00021349539642702347, + "loss": 0.1082, + "mean_token_accuracy": 0.9710930436849594, + "num_tokens": 8905546.0, + "step": 1010 + }, + { + "entropy": 1.3434297740459442, + "epoch": 3.6515837104072397, + "grad_norm": 0.7305653095245361, + "learning_rate": 0.0002130671554927561, + "loss": 0.088, + "mean_token_accuracy": 0.9745359718799591, + "num_tokens": 8914502.0, + "step": 1011 + }, + { + "entropy": 1.3378058075904846, + "epoch": 3.655203619909502, + "grad_norm": 0.4537632167339325, + "learning_rate": 0.00021263917276599607, + "loss": 0.047, + "mean_token_accuracy": 0.9869710952043533, + "num_tokens": 8923463.0, + "step": 1012 + }, + { + "entropy": 1.3655290305614471, + "epoch": 3.6588235294117646, + "grad_norm": 0.5036798119544983, + "learning_rate": 0.0002122114498722763, + "loss": 0.0655, + "mean_token_accuracy": 0.982716903090477, + "num_tokens": 8932384.0, + "step": 1013 + }, + { + "entropy": 1.3198035657405853, + "epoch": 3.662443438914027, + "grad_norm": 0.3511429727077484, + "learning_rate": 0.000211783988436143, + "loss": 0.0382, + "mean_token_accuracy": 0.9901436120271683, + "num_tokens": 8941300.0, + "step": 1014 + }, + { + "entropy": 1.3130914568901062, + "epoch": 3.6660633484162894, + "grad_norm": 0.4056939482688904, + "learning_rate": 0.00021135679008114894, + "loss": 0.0639, + "mean_token_accuracy": 0.9808386266231537, + "num_tokens": 8950534.0, + "step": 1015 + }, + { + "entropy": 1.3055587410926819, + "epoch": 3.669683257918552, + "grad_norm": 0.33344724774360657, + "learning_rate": 0.00021092985642984802, + "loss": 0.0449, + "mean_token_accuracy": 0.9886894524097443, + "num_tokens": 8960263.0, + "step": 1016 + }, + { + "entropy": 1.3109475672245026, + "epoch": 3.6733031674208148, + "grad_norm": 0.490029901266098, + "learning_rate": 0.00021050318910378874, + "loss": 0.0876, + "mean_token_accuracy": 0.9755903035402298, + "num_tokens": 8969611.0, + "step": 1017 + }, + { + "entropy": 1.3801122307777405, + "epoch": 3.676923076923077, + "grad_norm": 0.3520437479019165, + "learning_rate": 0.00021007678972350798, + "loss": 0.0482, + "mean_token_accuracy": 0.9860682934522629, + "num_tokens": 8978283.0, + "step": 1018 + }, + { + "entropy": 1.309948354959488, + "epoch": 3.6805429864253396, + "grad_norm": 0.485009104013443, + "learning_rate": 0.00020965065990852474, + "loss": 0.0824, + "mean_token_accuracy": 0.9751296043395996, + "num_tokens": 8987535.0, + "step": 1019 + }, + { + "entropy": 1.3771768808364868, + "epoch": 3.684162895927602, + "grad_norm": 0.5419639945030212, + "learning_rate": 0.00020922480127733448, + "loss": 0.0649, + "mean_token_accuracy": 0.9826148748397827, + "num_tokens": 8996533.0, + "step": 1020 + }, + { + "entropy": 1.337918907403946, + "epoch": 3.6877828054298645, + "grad_norm": 0.36202654242515564, + "learning_rate": 0.00020879921544740264, + "loss": 0.0311, + "mean_token_accuracy": 0.9919043332338333, + "num_tokens": 9005497.0, + "step": 1021 + }, + { + "entropy": 1.439345896244049, + "epoch": 3.691402714932127, + "grad_norm": 0.6851293444633484, + "learning_rate": 0.0002083739040351584, + "loss": 0.096, + "mean_token_accuracy": 0.9736751765012741, + "num_tokens": 9014037.0, + "step": 1022 + }, + { + "entropy": 1.44906947016716, + "epoch": 3.6950226244343893, + "grad_norm": 0.4260176122188568, + "learning_rate": 0.00020794886865598848, + "loss": 0.0523, + "mean_token_accuracy": 0.9793268889188766, + "num_tokens": 9022452.0, + "step": 1023 + }, + { + "entropy": 1.449628233909607, + "epoch": 3.6986425339366518, + "grad_norm": 0.6072604656219482, + "learning_rate": 0.00020752411092423177, + "loss": 0.0727, + "mean_token_accuracy": 0.9774363785982132, + "num_tokens": 9030847.0, + "step": 1024 + }, + { + "entropy": 1.3873493075370789, + "epoch": 3.702262443438914, + "grad_norm": 0.44552555680274963, + "learning_rate": 0.00020709963245317209, + "loss": 0.0639, + "mean_token_accuracy": 0.9800115376710892, + "num_tokens": 9039891.0, + "step": 1025 + }, + { + "entropy": 1.4281193912029266, + "epoch": 3.7058823529411766, + "grad_norm": 0.5228530764579773, + "learning_rate": 0.0002066754348550327, + "loss": 0.0738, + "mean_token_accuracy": 0.9765488505363464, + "num_tokens": 9048686.0, + "step": 1026 + }, + { + "entropy": 1.3790385127067566, + "epoch": 3.709502262443439, + "grad_norm": 0.4316764175891876, + "learning_rate": 0.00020625151974097022, + "loss": 0.0641, + "mean_token_accuracy": 0.97920823097229, + "num_tokens": 9057678.0, + "step": 1027 + }, + { + "entropy": 1.4287641942501068, + "epoch": 3.7131221719457015, + "grad_norm": 0.4056229591369629, + "learning_rate": 0.00020582788872106842, + "loss": 0.036, + "mean_token_accuracy": 0.9899342656135559, + "num_tokens": 9066521.0, + "step": 1028 + }, + { + "entropy": 1.454606294631958, + "epoch": 3.716742081447964, + "grad_norm": 0.7525569200515747, + "learning_rate": 0.0002054045434043316, + "loss": 0.1423, + "mean_token_accuracy": 0.9605622440576553, + "num_tokens": 9075595.0, + "step": 1029 + }, + { + "entropy": 1.3942890167236328, + "epoch": 3.7203619909502263, + "grad_norm": 0.4933125078678131, + "learning_rate": 0.00020498148539867944, + "loss": 0.0773, + "mean_token_accuracy": 0.970758929848671, + "num_tokens": 9084661.0, + "step": 1030 + }, + { + "entropy": 1.384048968553543, + "epoch": 3.723981900452489, + "grad_norm": 0.43627068400382996, + "learning_rate": 0.00020455871631094017, + "loss": 0.0678, + "mean_token_accuracy": 0.983132854104042, + "num_tokens": 9094062.0, + "step": 1031 + }, + { + "entropy": 1.4216719567775726, + "epoch": 3.727601809954751, + "grad_norm": 0.6412005424499512, + "learning_rate": 0.0002041362377468445, + "loss": 0.1097, + "mean_token_accuracy": 0.9793971478939056, + "num_tokens": 9103015.0, + "step": 1032 + }, + { + "entropy": 1.4771287441253662, + "epoch": 3.7312217194570136, + "grad_norm": 0.5385004281997681, + "learning_rate": 0.00020371405131102002, + "loss": 0.0553, + "mean_token_accuracy": 0.9826144278049469, + "num_tokens": 9111433.0, + "step": 1033 + }, + { + "entropy": 1.4442466795444489, + "epoch": 3.734841628959276, + "grad_norm": 0.5972802042961121, + "learning_rate": 0.00020329215860698458, + "loss": 0.0584, + "mean_token_accuracy": 0.984418511390686, + "num_tokens": 9120012.0, + "step": 1034 + }, + { + "entropy": 1.4893062710762024, + "epoch": 3.7384615384615385, + "grad_norm": 0.7473769783973694, + "learning_rate": 0.00020287056123714035, + "loss": 0.1091, + "mean_token_accuracy": 0.9683271646499634, + "num_tokens": 9128636.0, + "step": 1035 + }, + { + "entropy": 1.3519982993602753, + "epoch": 3.742081447963801, + "grad_norm": 0.47699517011642456, + "learning_rate": 0.00020244926080276794, + "loss": 0.0525, + "mean_token_accuracy": 0.9904675185680389, + "num_tokens": 9137968.0, + "step": 1036 + }, + { + "entropy": 1.4561591148376465, + "epoch": 3.7457013574660634, + "grad_norm": 0.42199093103408813, + "learning_rate": 0.00020202825890402003, + "loss": 0.0451, + "mean_token_accuracy": 0.9883602410554886, + "num_tokens": 9146589.0, + "step": 1037 + }, + { + "entropy": 1.408205658197403, + "epoch": 3.749321266968326, + "grad_norm": 0.4122658371925354, + "learning_rate": 0.0002016075571399157, + "loss": 0.0489, + "mean_token_accuracy": 0.9833643138408661, + "num_tokens": 9155443.0, + "step": 1038 + }, + { + "entropy": 1.3683985471725464, + "epoch": 3.7529411764705882, + "grad_norm": 0.43202081322669983, + "learning_rate": 0.0002011871571083336, + "loss": 0.0424, + "mean_token_accuracy": 0.9905680269002914, + "num_tokens": 9164792.0, + "step": 1039 + }, + { + "entropy": 1.3635350167751312, + "epoch": 3.7565610859728507, + "grad_norm": 0.606826663017273, + "learning_rate": 0.00020076706040600672, + "loss": 0.0883, + "mean_token_accuracy": 0.9747144728899002, + "num_tokens": 9174060.0, + "step": 1040 + }, + { + "entropy": 1.3948090970516205, + "epoch": 3.760180995475113, + "grad_norm": 0.561805009841919, + "learning_rate": 0.00020034726862851594, + "loss": 0.1131, + "mean_token_accuracy": 0.9722562730312347, + "num_tokens": 9183245.0, + "step": 1041 + }, + { + "entropy": 1.4282923936843872, + "epoch": 3.7638009049773755, + "grad_norm": 0.5546544790267944, + "learning_rate": 0.00019992778337028384, + "loss": 0.0762, + "mean_token_accuracy": 0.9801356643438339, + "num_tokens": 9191879.0, + "step": 1042 + }, + { + "entropy": 1.3675439953804016, + "epoch": 3.767420814479638, + "grad_norm": 0.5167890787124634, + "learning_rate": 0.0001995086062245689, + "loss": 0.0804, + "mean_token_accuracy": 0.98072350025177, + "num_tokens": 9201014.0, + "step": 1043 + }, + { + "entropy": 1.424451231956482, + "epoch": 3.7710407239819004, + "grad_norm": 0.44696182012557983, + "learning_rate": 0.00019908973878345943, + "loss": 0.0583, + "mean_token_accuracy": 0.9818143099546432, + "num_tokens": 9209954.0, + "step": 1044 + }, + { + "entropy": 1.3515954911708832, + "epoch": 3.774660633484163, + "grad_norm": 0.619891345500946, + "learning_rate": 0.0001986711826378673, + "loss": 0.0949, + "mean_token_accuracy": 0.9688181430101395, + "num_tokens": 9219157.0, + "step": 1045 + }, + { + "entropy": 1.3176933526992798, + "epoch": 3.7782805429864252, + "grad_norm": 0.43150845170021057, + "learning_rate": 0.00019825293937752203, + "loss": 0.0459, + "mean_token_accuracy": 0.9851347357034683, + "num_tokens": 9228415.0, + "step": 1046 + }, + { + "entropy": 1.3939999043941498, + "epoch": 3.7819004524886877, + "grad_norm": 0.6242758631706238, + "learning_rate": 0.00019783501059096495, + "loss": 0.0703, + "mean_token_accuracy": 0.9822264909744263, + "num_tokens": 9237479.0, + "step": 1047 + }, + { + "entropy": 1.430876463651657, + "epoch": 3.78552036199095, + "grad_norm": 0.6107195019721985, + "learning_rate": 0.00019741739786554273, + "loss": 0.0758, + "mean_token_accuracy": 0.9829006642103195, + "num_tokens": 9245975.0, + "step": 1048 + }, + { + "entropy": 1.4005264639854431, + "epoch": 3.7891402714932125, + "grad_norm": 0.5321121215820312, + "learning_rate": 0.00019700010278740174, + "loss": 0.0636, + "mean_token_accuracy": 0.9827183485031128, + "num_tokens": 9254487.0, + "step": 1049 + }, + { + "entropy": 1.4024662375450134, + "epoch": 3.792760180995475, + "grad_norm": 0.5756775140762329, + "learning_rate": 0.00019658312694148191, + "loss": 0.0702, + "mean_token_accuracy": 0.9786443412303925, + "num_tokens": 9263345.0, + "step": 1050 + }, + { + "entropy": 1.3956109881401062, + "epoch": 3.7963800904977374, + "grad_norm": 0.5821980834007263, + "learning_rate": 0.00019616647191151077, + "loss": 0.0715, + "mean_token_accuracy": 0.97563835978508, + "num_tokens": 9271916.0, + "step": 1051 + }, + { + "entropy": 1.4688811898231506, + "epoch": 3.8, + "grad_norm": 0.3763403594493866, + "learning_rate": 0.00019575013927999692, + "loss": 0.0399, + "mean_token_accuracy": 0.9858106821775436, + "num_tokens": 9280577.0, + "step": 1052 + }, + { + "entropy": 1.482151448726654, + "epoch": 3.8036199095022623, + "grad_norm": 0.4746648371219635, + "learning_rate": 0.00019533413062822495, + "loss": 0.0338, + "mean_token_accuracy": 0.9888868033885956, + "num_tokens": 9289036.0, + "step": 1053 + }, + { + "entropy": 1.39437335729599, + "epoch": 3.8072398190045247, + "grad_norm": 0.34683090448379517, + "learning_rate": 0.00019491844753624884, + "loss": 0.0411, + "mean_token_accuracy": 0.98799167573452, + "num_tokens": 9297968.0, + "step": 1054 + }, + { + "entropy": 1.4089177548885345, + "epoch": 3.810859728506787, + "grad_norm": 0.6755173802375793, + "learning_rate": 0.00019450309158288562, + "loss": 0.08, + "mean_token_accuracy": 0.975567102432251, + "num_tokens": 9306399.0, + "step": 1055 + }, + { + "entropy": 1.4395931661128998, + "epoch": 3.8144796380090495, + "grad_norm": 0.6466923356056213, + "learning_rate": 0.00019408806434571043, + "loss": 0.0962, + "mean_token_accuracy": 0.9790873825550079, + "num_tokens": 9315067.0, + "step": 1056 + }, + { + "entropy": 1.463386446237564, + "epoch": 3.818099547511312, + "grad_norm": 0.7904548645019531, + "learning_rate": 0.0001936733674010496, + "loss": 0.0982, + "mean_token_accuracy": 0.9723253399133682, + "num_tokens": 9323453.0, + "step": 1057 + }, + { + "entropy": 1.4089987576007843, + "epoch": 3.8217194570135744, + "grad_norm": 0.5947125554084778, + "learning_rate": 0.00019325900232397477, + "loss": 0.0558, + "mean_token_accuracy": 0.985149696469307, + "num_tokens": 9332220.0, + "step": 1058 + }, + { + "entropy": 1.424832284450531, + "epoch": 3.825339366515837, + "grad_norm": 0.6046349406242371, + "learning_rate": 0.00019284497068829747, + "loss": 0.103, + "mean_token_accuracy": 0.9751139581203461, + "num_tokens": 9341074.0, + "step": 1059 + }, + { + "entropy": 1.3354915082454681, + "epoch": 3.8289592760180997, + "grad_norm": 0.20708034932613373, + "learning_rate": 0.00019243127406656248, + "loss": 0.0117, + "mean_token_accuracy": 0.9978606253862381, + "num_tokens": 9350232.0, + "step": 1060 + }, + { + "entropy": 1.335442215204239, + "epoch": 3.832579185520362, + "grad_norm": 0.7157071232795715, + "learning_rate": 0.00019201791403004257, + "loss": 0.0915, + "mean_token_accuracy": 0.9730544090270996, + "num_tokens": 9359486.0, + "step": 1061 + }, + { + "entropy": 1.4432708621025085, + "epoch": 3.8361990950226246, + "grad_norm": 0.7831724286079407, + "learning_rate": 0.00019160489214873155, + "loss": 0.1163, + "mean_token_accuracy": 0.9673851430416107, + "num_tokens": 9368235.0, + "step": 1062 + }, + { + "entropy": 1.440447449684143, + "epoch": 3.839819004524887, + "grad_norm": 0.6418187022209167, + "learning_rate": 0.00019119220999133923, + "loss": 0.0587, + "mean_token_accuracy": 0.9853110462427139, + "num_tokens": 9376622.0, + "step": 1063 + }, + { + "entropy": 1.3945489525794983, + "epoch": 3.8434389140271494, + "grad_norm": 0.5115446448326111, + "learning_rate": 0.0001907798691252852, + "loss": 0.0627, + "mean_token_accuracy": 0.9849587380886078, + "num_tokens": 9385653.0, + "step": 1064 + }, + { + "entropy": 1.4040117859840393, + "epoch": 3.847058823529412, + "grad_norm": 0.40980765223503113, + "learning_rate": 0.0001903678711166924, + "loss": 0.0319, + "mean_token_accuracy": 0.990267813205719, + "num_tokens": 9394335.0, + "step": 1065 + }, + { + "entropy": 1.440806269645691, + "epoch": 3.8506787330316743, + "grad_norm": 0.7762898206710815, + "learning_rate": 0.00018995621753038183, + "loss": 0.1477, + "mean_token_accuracy": 0.9675359576940536, + "num_tokens": 9402789.0, + "step": 1066 + }, + { + "entropy": 1.3455476462841034, + "epoch": 3.8542986425339367, + "grad_norm": 0.4371282458305359, + "learning_rate": 0.00018954490992986644, + "loss": 0.047, + "mean_token_accuracy": 0.9871475845575333, + "num_tokens": 9411665.0, + "step": 1067 + }, + { + "entropy": 1.3937756717205048, + "epoch": 3.857918552036199, + "grad_norm": 0.8712350726127625, + "learning_rate": 0.0001891339498773447, + "loss": 0.143, + "mean_token_accuracy": 0.9606377333402634, + "num_tokens": 9420475.0, + "step": 1068 + }, + { + "entropy": 1.4569672644138336, + "epoch": 3.8615384615384616, + "grad_norm": 0.6399717926979065, + "learning_rate": 0.00018872333893369536, + "loss": 0.0625, + "mean_token_accuracy": 0.9822671264410019, + "num_tokens": 9429062.0, + "step": 1069 + }, + { + "entropy": 1.397208034992218, + "epoch": 3.865158371040724, + "grad_norm": 0.41650331020355225, + "learning_rate": 0.00018831307865847108, + "loss": 0.0565, + "mean_token_accuracy": 0.9822796285152435, + "num_tokens": 9437938.0, + "step": 1070 + }, + { + "entropy": 1.3304217159748077, + "epoch": 3.8687782805429864, + "grad_norm": 0.34858253598213196, + "learning_rate": 0.00018790317060989273, + "loss": 0.0355, + "mean_token_accuracy": 0.9889863580465317, + "num_tokens": 9446897.0, + "step": 1071 + }, + { + "entropy": 1.4303509891033173, + "epoch": 3.872398190045249, + "grad_norm": 0.5634018182754517, + "learning_rate": 0.00018749361634484325, + "loss": 0.0999, + "mean_token_accuracy": 0.9707607924938202, + "num_tokens": 9455618.0, + "step": 1072 + }, + { + "entropy": 1.4059797525405884, + "epoch": 3.8760180995475113, + "grad_norm": 0.4992756247520447, + "learning_rate": 0.00018708441741886194, + "loss": 0.062, + "mean_token_accuracy": 0.9801923334598541, + "num_tokens": 9464254.0, + "step": 1073 + }, + { + "entropy": 1.329335242509842, + "epoch": 3.8796380090497737, + "grad_norm": 0.43501394987106323, + "learning_rate": 0.00018667557538613863, + "loss": 0.0474, + "mean_token_accuracy": 0.987145259976387, + "num_tokens": 9473340.0, + "step": 1074 + }, + { + "entropy": 1.3786957263946533, + "epoch": 3.883257918552036, + "grad_norm": 0.640612781047821, + "learning_rate": 0.00018626709179950717, + "loss": 0.1196, + "mean_token_accuracy": 0.9668680727481842, + "num_tokens": 9482286.0, + "step": 1075 + }, + { + "entropy": 1.3925435245037079, + "epoch": 3.8868778280542986, + "grad_norm": 0.6338940262794495, + "learning_rate": 0.0001858589682104405, + "loss": 0.0643, + "mean_token_accuracy": 0.982734814286232, + "num_tokens": 9490868.0, + "step": 1076 + }, + { + "entropy": 1.3901928961277008, + "epoch": 3.890497737556561, + "grad_norm": 0.5943475365638733, + "learning_rate": 0.000185451206169044, + "loss": 0.0636, + "mean_token_accuracy": 0.9796071499586105, + "num_tokens": 9499551.0, + "step": 1077 + }, + { + "entropy": 1.3408487439155579, + "epoch": 3.8941176470588235, + "grad_norm": 0.47063320875167847, + "learning_rate": 0.00018504380722404975, + "loss": 0.059, + "mean_token_accuracy": 0.98704494535923, + "num_tokens": 9508605.0, + "step": 1078 + }, + { + "entropy": 1.3606760799884796, + "epoch": 3.897737556561086, + "grad_norm": 0.5077705383300781, + "learning_rate": 0.00018463677292281092, + "loss": 0.0586, + "mean_token_accuracy": 0.9849795997142792, + "num_tokens": 9517376.0, + "step": 1079 + }, + { + "entropy": 1.389219492673874, + "epoch": 3.9013574660633483, + "grad_norm": 0.451435923576355, + "learning_rate": 0.00018423010481129584, + "loss": 0.0414, + "mean_token_accuracy": 0.9872728437185287, + "num_tokens": 9525724.0, + "step": 1080 + }, + { + "entropy": 1.3427990972995758, + "epoch": 3.9049773755656108, + "grad_norm": 0.4996180236339569, + "learning_rate": 0.00018382380443408158, + "loss": 0.0519, + "mean_token_accuracy": 0.9842040240764618, + "num_tokens": 9534581.0, + "step": 1081 + }, + { + "entropy": 1.3136717081069946, + "epoch": 3.908597285067873, + "grad_norm": 0.31684455275535583, + "learning_rate": 0.00018341787333434872, + "loss": 0.0367, + "mean_token_accuracy": 0.986624076962471, + "num_tokens": 9543780.0, + "step": 1082 + }, + { + "entropy": 1.357143759727478, + "epoch": 3.9122171945701356, + "grad_norm": 0.392623633146286, + "learning_rate": 0.00018301231305387552, + "loss": 0.0361, + "mean_token_accuracy": 0.9899974465370178, + "num_tokens": 9552316.0, + "step": 1083 + }, + { + "entropy": 1.4222826957702637, + "epoch": 3.915837104072398, + "grad_norm": 0.6109654903411865, + "learning_rate": 0.00018260712513303167, + "loss": 0.0801, + "mean_token_accuracy": 0.9758190959692001, + "num_tokens": 9560547.0, + "step": 1084 + }, + { + "entropy": 1.387441635131836, + "epoch": 3.9194570135746605, + "grad_norm": 0.611193060874939, + "learning_rate": 0.00018220231111077217, + "loss": 0.0627, + "mean_token_accuracy": 0.9828397631645203, + "num_tokens": 9569112.0, + "step": 1085 + }, + { + "entropy": 1.3265759348869324, + "epoch": 3.9230769230769234, + "grad_norm": 0.35626664757728577, + "learning_rate": 0.0001817978725246326, + "loss": 0.0347, + "mean_token_accuracy": 0.9868002831935883, + "num_tokens": 9577936.0, + "step": 1086 + }, + { + "entropy": 1.3036309480667114, + "epoch": 3.926696832579186, + "grad_norm": 0.9055293202400208, + "learning_rate": 0.00018139381091072213, + "loss": 0.0869, + "mean_token_accuracy": 0.976190984249115, + "num_tokens": 9586725.0, + "step": 1087 + }, + { + "entropy": 1.317764014005661, + "epoch": 3.930316742081448, + "grad_norm": 0.34299445152282715, + "learning_rate": 0.00018099012780371814, + "loss": 0.0193, + "mean_token_accuracy": 0.9950294345617294, + "num_tokens": 9595580.0, + "step": 1088 + }, + { + "entropy": 1.4780614078044891, + "epoch": 3.9339366515837106, + "grad_norm": 0.45136016607284546, + "learning_rate": 0.00018058682473686075, + "loss": 0.03, + "mean_token_accuracy": 0.9902182072401047, + "num_tokens": 9603693.0, + "step": 1089 + }, + { + "entropy": 1.3006681501865387, + "epoch": 3.937556561085973, + "grad_norm": 0.8072985410690308, + "learning_rate": 0.00018018390324194637, + "loss": 0.1406, + "mean_token_accuracy": 0.9719722718000412, + "num_tokens": 9613517.0, + "step": 1090 + }, + { + "entropy": 1.2329892814159393, + "epoch": 3.9411764705882355, + "grad_norm": 0.7343161702156067, + "learning_rate": 0.00017978136484932198, + "loss": 0.1221, + "mean_token_accuracy": 0.9715431183576584, + "num_tokens": 9623002.0, + "step": 1091 + }, + { + "entropy": 1.339203268289566, + "epoch": 3.944796380090498, + "grad_norm": 0.422463595867157, + "learning_rate": 0.00017937921108787986, + "loss": 0.0366, + "mean_token_accuracy": 0.9875014275312424, + "num_tokens": 9631474.0, + "step": 1092 + }, + { + "entropy": 1.3312835395336151, + "epoch": 3.9484162895927604, + "grad_norm": 0.5072442889213562, + "learning_rate": 0.00017897744348505123, + "loss": 0.0561, + "mean_token_accuracy": 0.9836284965276718, + "num_tokens": 9640156.0, + "step": 1093 + }, + { + "entropy": 1.4004390239715576, + "epoch": 3.952036199095023, + "grad_norm": 0.48746341466903687, + "learning_rate": 0.0001785760635668007, + "loss": 0.044, + "mean_token_accuracy": 0.9849557876586914, + "num_tokens": 9648458.0, + "step": 1094 + }, + { + "entropy": 1.350889652967453, + "epoch": 3.9556561085972852, + "grad_norm": 0.41746893525123596, + "learning_rate": 0.00017817507285762023, + "loss": 0.0532, + "mean_token_accuracy": 0.9855844229459763, + "num_tokens": 9657136.0, + "step": 1095 + }, + { + "entropy": 1.237421602010727, + "epoch": 3.9592760180995477, + "grad_norm": 0.46493780612945557, + "learning_rate": 0.00017777447288052373, + "loss": 0.0759, + "mean_token_accuracy": 0.9721266627311707, + "num_tokens": 9667058.0, + "step": 1096 + }, + { + "entropy": 1.3534648716449738, + "epoch": 3.96289592760181, + "grad_norm": 0.406345933675766, + "learning_rate": 0.000177374265157041, + "loss": 0.0537, + "mean_token_accuracy": 0.9838696867227554, + "num_tokens": 9675627.0, + "step": 1097 + }, + { + "entropy": 1.237879753112793, + "epoch": 3.9665158371040725, + "grad_norm": 0.527837872505188, + "learning_rate": 0.00017697445120721175, + "loss": 0.0737, + "mean_token_accuracy": 0.9752355068922043, + "num_tokens": 9685091.0, + "step": 1098 + }, + { + "entropy": 1.2298554480075836, + "epoch": 3.970135746606335, + "grad_norm": 0.45402729511260986, + "learning_rate": 0.00017657503254958054, + "loss": 0.0556, + "mean_token_accuracy": 0.9843012988567352, + "num_tokens": 9694688.0, + "step": 1099 + }, + { + "entropy": 1.270505130290985, + "epoch": 3.9737556561085974, + "grad_norm": 0.6557897329330444, + "learning_rate": 0.00017617601070119037, + "loss": 0.0918, + "mean_token_accuracy": 0.9786079078912735, + "num_tokens": 9704286.0, + "step": 1100 + }, + { + "entropy": 1.3577526807785034, + "epoch": 3.97737556561086, + "grad_norm": 0.48044729232788086, + "learning_rate": 0.0001757773871775768, + "loss": 0.0564, + "mean_token_accuracy": 0.9776984602212906, + "num_tokens": 9712668.0, + "step": 1101 + }, + { + "entropy": 1.309740036725998, + "epoch": 3.9809954751131222, + "grad_norm": 0.8556230664253235, + "learning_rate": 0.00017537916349276303, + "loss": 0.2013, + "mean_token_accuracy": 0.9610435962677002, + "num_tokens": 9722042.0, + "step": 1102 + }, + { + "entropy": 1.375863939523697, + "epoch": 3.9846153846153847, + "grad_norm": 0.4123291075229645, + "learning_rate": 0.00017498134115925327, + "loss": 0.0208, + "mean_token_accuracy": 0.9937012493610382, + "num_tokens": 9730420.0, + "step": 1103 + }, + { + "entropy": 1.3407017588615417, + "epoch": 3.988235294117647, + "grad_norm": 0.3886757493019104, + "learning_rate": 0.0001745839216880275, + "loss": 0.0223, + "mean_token_accuracy": 0.9922950863838196, + "num_tokens": 9739292.0, + "step": 1104 + }, + { + "entropy": 1.2783922851085663, + "epoch": 3.9918552036199095, + "grad_norm": 0.39245131611824036, + "learning_rate": 0.00017418690658853542, + "loss": 0.0607, + "mean_token_accuracy": 0.9823390543460846, + "num_tokens": 9748635.0, + "step": 1105 + }, + { + "entropy": 1.3240907490253448, + "epoch": 3.995475113122172, + "grad_norm": 0.925537645816803, + "learning_rate": 0.00017379029736869103, + "loss": 0.1301, + "mean_token_accuracy": 0.9688823968172073, + "num_tokens": 9757450.0, + "step": 1106 + }, + { + "entropy": 1.2996585667133331, + "epoch": 3.9990950226244344, + "grad_norm": 0.5589770674705505, + "learning_rate": 0.00017339409553486675, + "loss": 0.0833, + "mean_token_accuracy": 0.9765840470790863, + "num_tokens": 9766204.0, + "step": 1107 + }, + { + "entropy": 1.3336817026138306, + "epoch": 4.0, + "grad_norm": 1.8711317777633667, + "learning_rate": 0.00017299830259188753, + "loss": 0.0647, + "mean_token_accuracy": 0.9789473414421082, + "num_tokens": 9766900.0, + "step": 1108 + }, + { + "epoch": 4.0, + "eval_entropy": 1.3279241662684496, + "eval_loss": 0.13373936712741852, + "eval_mean_token_accuracy": 0.9691996351490176, + "eval_num_tokens": 9766900.0, + "eval_runtime": 116.1625, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 1.059, + "step": 1108 + }, + { + "entropy": 1.293619453907013, + "epoch": 4.003619909502262, + "grad_norm": 0.3031173050403595, + "learning_rate": 0.0001726029200430255, + "loss": 0.0413, + "mean_token_accuracy": 0.9907345324754715, + "num_tokens": 9775499.0, + "step": 1109 + }, + { + "entropy": 1.278919130563736, + "epoch": 4.007239819004525, + "grad_norm": 0.4092402160167694, + "learning_rate": 0.00017220794938999388, + "loss": 0.0471, + "mean_token_accuracy": 0.9856289625167847, + "num_tokens": 9784864.0, + "step": 1110 + }, + { + "entropy": 1.2768179178237915, + "epoch": 4.010859728506787, + "grad_norm": 0.4138973355293274, + "learning_rate": 0.0001718133921329416, + "loss": 0.0487, + "mean_token_accuracy": 0.9872913360595703, + "num_tokens": 9794077.0, + "step": 1111 + }, + { + "entropy": 1.2907516956329346, + "epoch": 4.01447963800905, + "grad_norm": 0.433987021446228, + "learning_rate": 0.0001714192497704474, + "loss": 0.0458, + "mean_token_accuracy": 0.9867971241474152, + "num_tokens": 9802781.0, + "step": 1112 + }, + { + "entropy": 1.2788557410240173, + "epoch": 4.018099547511312, + "grad_norm": 0.456443727016449, + "learning_rate": 0.0001710255237995142, + "loss": 0.054, + "mean_token_accuracy": 0.9820699095726013, + "num_tokens": 9811440.0, + "step": 1113 + }, + { + "entropy": 1.2931778728961945, + "epoch": 4.021719457013575, + "grad_norm": 0.5304563641548157, + "learning_rate": 0.00017063221571556348, + "loss": 0.0663, + "mean_token_accuracy": 0.9788531064987183, + "num_tokens": 9820394.0, + "step": 1114 + }, + { + "entropy": 1.251348078250885, + "epoch": 4.025339366515837, + "grad_norm": 0.32978400588035583, + "learning_rate": 0.00017023932701242932, + "loss": 0.0201, + "mean_token_accuracy": 0.9935282766819, + "num_tokens": 9829196.0, + "step": 1115 + }, + { + "entropy": 1.3026653826236725, + "epoch": 4.0289592760180994, + "grad_norm": 0.4994417726993561, + "learning_rate": 0.0001698468591823532, + "loss": 0.0546, + "mean_token_accuracy": 0.9885783791542053, + "num_tokens": 9838079.0, + "step": 1116 + }, + { + "entropy": 1.2028016149997711, + "epoch": 4.032579185520362, + "grad_norm": 0.3469240963459015, + "learning_rate": 0.00016945481371597793, + "loss": 0.0414, + "mean_token_accuracy": 0.9887481927871704, + "num_tokens": 9847654.0, + "step": 1117 + }, + { + "entropy": 1.234688937664032, + "epoch": 4.036199095022624, + "grad_norm": 0.35243600606918335, + "learning_rate": 0.00016906319210234218, + "loss": 0.037, + "mean_token_accuracy": 0.9897292852401733, + "num_tokens": 9856599.0, + "step": 1118 + }, + { + "entropy": 1.2084547579288483, + "epoch": 4.039819004524887, + "grad_norm": 0.31716811656951904, + "learning_rate": 0.00016867199582887464, + "loss": 0.0385, + "mean_token_accuracy": 0.9857838749885559, + "num_tokens": 9865697.0, + "step": 1119 + }, + { + "entropy": 1.2915122210979462, + "epoch": 4.043438914027149, + "grad_norm": 0.38569268584251404, + "learning_rate": 0.00016828122638138876, + "loss": 0.0208, + "mean_token_accuracy": 0.9932724088430405, + "num_tokens": 9874265.0, + "step": 1120 + }, + { + "entropy": 1.2242113649845123, + "epoch": 4.047058823529412, + "grad_norm": 0.5431196689605713, + "learning_rate": 0.0001678908852440766, + "loss": 0.0641, + "mean_token_accuracy": 0.9854831397533417, + "num_tokens": 9883597.0, + "step": 1121 + }, + { + "entropy": 1.256748080253601, + "epoch": 4.050678733031674, + "grad_norm": 0.5003210306167603, + "learning_rate": 0.00016750097389950358, + "loss": 0.0391, + "mean_token_accuracy": 0.9875418394804001, + "num_tokens": 9892067.0, + "step": 1122 + }, + { + "entropy": 1.1860899925231934, + "epoch": 4.0542986425339365, + "grad_norm": 0.35398924350738525, + "learning_rate": 0.00016711149382860266, + "loss": 0.0287, + "mean_token_accuracy": 0.9883342385292053, + "num_tokens": 9901411.0, + "step": 1123 + }, + { + "entropy": 1.347067803144455, + "epoch": 4.057918552036199, + "grad_norm": 0.5562195777893066, + "learning_rate": 0.00016672244651066883, + "loss": 0.045, + "mean_token_accuracy": 0.987530916929245, + "num_tokens": 9909638.0, + "step": 1124 + }, + { + "entropy": 1.2423038482666016, + "epoch": 4.061538461538461, + "grad_norm": 0.38534653186798096, + "learning_rate": 0.00016633383342335331, + "loss": 0.0301, + "mean_token_accuracy": 0.991975411772728, + "num_tokens": 9918264.0, + "step": 1125 + }, + { + "entropy": 1.2479912340641022, + "epoch": 4.065158371040724, + "grad_norm": 0.5592287182807922, + "learning_rate": 0.00016594565604265816, + "loss": 0.0598, + "mean_token_accuracy": 0.9825469106435776, + "num_tokens": 9926721.0, + "step": 1126 + }, + { + "entropy": 1.176967740058899, + "epoch": 4.068778280542986, + "grad_norm": 0.4851750433444977, + "learning_rate": 0.0001655579158429307, + "loss": 0.0481, + "mean_token_accuracy": 0.9884417057037354, + "num_tokens": 9936006.0, + "step": 1127 + }, + { + "entropy": 1.2568339705467224, + "epoch": 4.072398190045249, + "grad_norm": 0.3245483934879303, + "learning_rate": 0.00016517061429685738, + "loss": 0.0183, + "mean_token_accuracy": 0.9941931664943695, + "num_tokens": 9944541.0, + "step": 1128 + }, + { + "entropy": 1.192540168762207, + "epoch": 4.076018099547511, + "grad_norm": 0.529319703578949, + "learning_rate": 0.00016478375287545886, + "loss": 0.0641, + "mean_token_accuracy": 0.9852915108203888, + "num_tokens": 9953279.0, + "step": 1129 + }, + { + "entropy": 1.216693490743637, + "epoch": 4.0796380090497735, + "grad_norm": 0.5261855721473694, + "learning_rate": 0.00016439733304808436, + "loss": 0.044, + "mean_token_accuracy": 0.9879221767187119, + "num_tokens": 9962236.0, + "step": 1130 + }, + { + "entropy": 1.2447461783885956, + "epoch": 4.083257918552036, + "grad_norm": 0.667833149433136, + "learning_rate": 0.0001640113562824054, + "loss": 0.0361, + "mean_token_accuracy": 0.987928569316864, + "num_tokens": 9970769.0, + "step": 1131 + }, + { + "entropy": 1.1653542816638947, + "epoch": 4.086877828054298, + "grad_norm": 0.42759764194488525, + "learning_rate": 0.00016362582404441084, + "loss": 0.0342, + "mean_token_accuracy": 0.9896238744258881, + "num_tokens": 9979647.0, + "step": 1132 + }, + { + "entropy": 1.2345812320709229, + "epoch": 4.090497737556561, + "grad_norm": 0.850532591342926, + "learning_rate": 0.00016324073779840165, + "loss": 0.0528, + "mean_token_accuracy": 0.9796405136585236, + "num_tokens": 9988323.0, + "step": 1133 + }, + { + "entropy": 1.2570472955703735, + "epoch": 4.094117647058823, + "grad_norm": 0.666858971118927, + "learning_rate": 0.00016285609900698413, + "loss": 0.0377, + "mean_token_accuracy": 0.9860774129629135, + "num_tokens": 9996967.0, + "step": 1134 + }, + { + "entropy": 1.1051703989505768, + "epoch": 4.097737556561086, + "grad_norm": 0.4458142817020416, + "learning_rate": 0.0001624719091310654, + "loss": 0.0504, + "mean_token_accuracy": 0.9836345016956329, + "num_tokens": 10006676.0, + "step": 1135 + }, + { + "entropy": 1.2148622274398804, + "epoch": 4.101357466063348, + "grad_norm": 0.7275047898292542, + "learning_rate": 0.0001620881696298478, + "loss": 0.0539, + "mean_token_accuracy": 0.9839228391647339, + "num_tokens": 10015437.0, + "step": 1136 + }, + { + "entropy": 1.2343271374702454, + "epoch": 4.1049773755656105, + "grad_norm": 0.4777041971683502, + "learning_rate": 0.00016170488196082285, + "loss": 0.0441, + "mean_token_accuracy": 0.9889576286077499, + "num_tokens": 10024075.0, + "step": 1137 + }, + { + "entropy": 1.2086644172668457, + "epoch": 4.108597285067873, + "grad_norm": 0.4287410080432892, + "learning_rate": 0.00016132204757976563, + "loss": 0.0327, + "mean_token_accuracy": 0.9914772063493729, + "num_tokens": 10032954.0, + "step": 1138 + }, + { + "entropy": 1.3105964958667755, + "epoch": 4.112217194570135, + "grad_norm": 0.627275288105011, + "learning_rate": 0.0001609396679407303, + "loss": 0.2148, + "mean_token_accuracy": 0.9677064567804337, + "num_tokens": 10041500.0, + "step": 1139 + }, + { + "entropy": 1.213149219751358, + "epoch": 4.115837104072398, + "grad_norm": 0.545829713344574, + "learning_rate": 0.00016055774449604337, + "loss": 0.0537, + "mean_token_accuracy": 0.9844522625207901, + "num_tokens": 10050143.0, + "step": 1140 + }, + { + "entropy": 1.2647437453269958, + "epoch": 4.11945701357466, + "grad_norm": 0.3859199285507202, + "learning_rate": 0.00016017627869629853, + "loss": 0.0211, + "mean_token_accuracy": 0.9922139197587967, + "num_tokens": 10058737.0, + "step": 1141 + }, + { + "entropy": 1.1950034499168396, + "epoch": 4.123076923076923, + "grad_norm": 0.5298987030982971, + "learning_rate": 0.00015979527199035172, + "loss": 0.0476, + "mean_token_accuracy": 0.9850315600633621, + "num_tokens": 10067294.0, + "step": 1142 + }, + { + "entropy": 1.2080510556697845, + "epoch": 4.126696832579185, + "grad_norm": 1.1342438459396362, + "learning_rate": 0.00015941472582531505, + "loss": 0.0548, + "mean_token_accuracy": 0.9844682365655899, + "num_tokens": 10076175.0, + "step": 1143 + }, + { + "entropy": 1.1458761394023895, + "epoch": 4.130316742081448, + "grad_norm": 0.4891660213470459, + "learning_rate": 0.00015903464164655103, + "loss": 0.0612, + "mean_token_accuracy": 0.9851740747690201, + "num_tokens": 10085549.0, + "step": 1144 + }, + { + "entropy": 1.1645109951496124, + "epoch": 4.133936651583711, + "grad_norm": 0.47334766387939453, + "learning_rate": 0.00015865502089766807, + "loss": 0.0464, + "mean_token_accuracy": 0.9830625057220459, + "num_tokens": 10095378.0, + "step": 1145 + }, + { + "entropy": 1.220601737499237, + "epoch": 4.137556561085973, + "grad_norm": 0.2870275378227234, + "learning_rate": 0.00015827586502051405, + "loss": 0.0117, + "mean_token_accuracy": 0.9957176744937897, + "num_tokens": 10104010.0, + "step": 1146 + }, + { + "entropy": 1.2811335325241089, + "epoch": 4.141176470588236, + "grad_norm": 0.80066978931427, + "learning_rate": 0.00015789717545517136, + "loss": 0.0834, + "mean_token_accuracy": 0.9741235673427582, + "num_tokens": 10112986.0, + "step": 1147 + }, + { + "entropy": 1.2601549923419952, + "epoch": 4.144796380090498, + "grad_norm": 0.5500946640968323, + "learning_rate": 0.00015751895363995118, + "loss": 0.0322, + "mean_token_accuracy": 0.9874281883239746, + "num_tokens": 10121704.0, + "step": 1148 + }, + { + "entropy": 1.1824783682823181, + "epoch": 4.1484162895927605, + "grad_norm": 0.33207061886787415, + "learning_rate": 0.00015714120101138824, + "loss": 0.0276, + "mean_token_accuracy": 0.9928829818964005, + "num_tokens": 10130838.0, + "step": 1149 + }, + { + "entropy": 1.2213251888751984, + "epoch": 4.152036199095023, + "grad_norm": 0.5984644889831543, + "learning_rate": 0.00015676391900423513, + "loss": 0.0432, + "mean_token_accuracy": 0.988958552479744, + "num_tokens": 10139651.0, + "step": 1150 + }, + { + "entropy": 1.233760267496109, + "epoch": 4.155656108597285, + "grad_norm": 0.38205328583717346, + "learning_rate": 0.00015638710905145693, + "loss": 0.0327, + "mean_token_accuracy": 0.9904181510210037, + "num_tokens": 10148702.0, + "step": 1151 + }, + { + "entropy": 1.2735399901866913, + "epoch": 4.159276018099548, + "grad_norm": 0.4505186080932617, + "learning_rate": 0.00015601077258422597, + "loss": 0.0343, + "mean_token_accuracy": 0.9863469153642654, + "num_tokens": 10157322.0, + "step": 1152 + }, + { + "entropy": 1.248571664094925, + "epoch": 4.16289592760181, + "grad_norm": 0.5458792448043823, + "learning_rate": 0.00015563491103191604, + "loss": 0.0586, + "mean_token_accuracy": 0.9839989989995956, + "num_tokens": 10166116.0, + "step": 1153 + }, + { + "entropy": 1.2250197231769562, + "epoch": 4.166515837104073, + "grad_norm": 0.4827875792980194, + "learning_rate": 0.00015525952582209725, + "loss": 0.0432, + "mean_token_accuracy": 0.9888117164373398, + "num_tokens": 10174949.0, + "step": 1154 + }, + { + "entropy": 1.2532283961772919, + "epoch": 4.170135746606335, + "grad_norm": 0.6164746284484863, + "learning_rate": 0.0001548846183805304, + "loss": 0.0489, + "mean_token_accuracy": 0.989223524928093, + "num_tokens": 10183649.0, + "step": 1155 + }, + { + "entropy": 1.3029357194900513, + "epoch": 4.173755656108598, + "grad_norm": 0.4397593140602112, + "learning_rate": 0.00015451019013116186, + "loss": 0.0239, + "mean_token_accuracy": 0.9899367988109589, + "num_tokens": 10192293.0, + "step": 1156 + }, + { + "entropy": 1.3099361062049866, + "epoch": 4.17737556561086, + "grad_norm": 0.5233256816864014, + "learning_rate": 0.00015413624249611773, + "loss": 0.0361, + "mean_token_accuracy": 0.9909869581460953, + "num_tokens": 10200972.0, + "step": 1157 + }, + { + "entropy": 1.332322210073471, + "epoch": 4.180995475113122, + "grad_norm": 0.40877828001976013, + "learning_rate": 0.00015376277689569884, + "loss": 0.0378, + "mean_token_accuracy": 0.9867051988840103, + "num_tokens": 10209383.0, + "step": 1158 + }, + { + "entropy": 1.2286882102489471, + "epoch": 4.184615384615385, + "grad_norm": 0.4264864921569824, + "learning_rate": 0.00015338979474837508, + "loss": 0.0349, + "mean_token_accuracy": 0.9909752011299133, + "num_tokens": 10218431.0, + "step": 1159 + }, + { + "entropy": 1.3134422600269318, + "epoch": 4.188235294117647, + "grad_norm": 0.5659143328666687, + "learning_rate": 0.00015301729747078027, + "loss": 0.0472, + "mean_token_accuracy": 0.9854199290275574, + "num_tokens": 10227234.0, + "step": 1160 + }, + { + "entropy": 1.2364612519741058, + "epoch": 4.19185520361991, + "grad_norm": 0.556576132774353, + "learning_rate": 0.00015264528647770644, + "loss": 0.0572, + "mean_token_accuracy": 0.9870803952217102, + "num_tokens": 10236143.0, + "step": 1161 + }, + { + "entropy": 1.3200373947620392, + "epoch": 4.195475113122172, + "grad_norm": 0.5832968950271606, + "learning_rate": 0.0001522737631820988, + "loss": 0.0596, + "mean_token_accuracy": 0.9803946912288666, + "num_tokens": 10244533.0, + "step": 1162 + }, + { + "entropy": 1.2263735830783844, + "epoch": 4.199095022624435, + "grad_norm": 0.5366385579109192, + "learning_rate": 0.00015190272899505024, + "loss": 0.0557, + "mean_token_accuracy": 0.9855044633150101, + "num_tokens": 10253397.0, + "step": 1163 + }, + { + "entropy": 1.2398187220096588, + "epoch": 4.202714932126697, + "grad_norm": 0.3928399384021759, + "learning_rate": 0.0001515321853257958, + "loss": 0.0353, + "mean_token_accuracy": 0.9881556481122971, + "num_tokens": 10262218.0, + "step": 1164 + }, + { + "entropy": 1.2799296081066132, + "epoch": 4.206334841628959, + "grad_norm": 0.34111514687538147, + "learning_rate": 0.00015116213358170756, + "loss": 0.0205, + "mean_token_accuracy": 0.9952614158391953, + "num_tokens": 10270643.0, + "step": 1165 + }, + { + "entropy": 1.2533689439296722, + "epoch": 4.209954751131222, + "grad_norm": 0.6038467884063721, + "learning_rate": 0.00015079257516828923, + "loss": 0.0364, + "mean_token_accuracy": 0.9885422587394714, + "num_tokens": 10279192.0, + "step": 1166 + }, + { + "entropy": 1.286198616027832, + "epoch": 4.213574660633484, + "grad_norm": 0.3359139561653137, + "learning_rate": 0.00015042351148917074, + "loss": 0.0169, + "mean_token_accuracy": 0.9941065609455109, + "num_tokens": 10287955.0, + "step": 1167 + }, + { + "entropy": 1.246151328086853, + "epoch": 4.217194570135747, + "grad_norm": 0.5997951030731201, + "learning_rate": 0.00015005494394610306, + "loss": 0.046, + "mean_token_accuracy": 0.98799067735672, + "num_tokens": 10296726.0, + "step": 1168 + }, + { + "entropy": 1.1993427872657776, + "epoch": 4.220814479638009, + "grad_norm": 0.48044100403785706, + "learning_rate": 0.00014968687393895243, + "loss": 0.037, + "mean_token_accuracy": 0.9872793555259705, + "num_tokens": 10306144.0, + "step": 1169 + }, + { + "entropy": 1.283030241727829, + "epoch": 4.224434389140272, + "grad_norm": 0.5447496175765991, + "learning_rate": 0.00014931930286569606, + "loss": 0.0512, + "mean_token_accuracy": 0.9839747399091721, + "num_tokens": 10314787.0, + "step": 1170 + }, + { + "entropy": 1.289113700389862, + "epoch": 4.228054298642534, + "grad_norm": 0.4241376221179962, + "learning_rate": 0.00014895223212241547, + "loss": 0.0337, + "mean_token_accuracy": 0.9877157956361771, + "num_tokens": 10323498.0, + "step": 1171 + }, + { + "entropy": 1.2173149585723877, + "epoch": 4.2316742081447964, + "grad_norm": 0.4567115604877472, + "learning_rate": 0.00014858566310329204, + "loss": 0.0308, + "mean_token_accuracy": 0.9885731935501099, + "num_tokens": 10332577.0, + "step": 1172 + }, + { + "entropy": 1.2338224053382874, + "epoch": 4.235294117647059, + "grad_norm": 0.7391862869262695, + "learning_rate": 0.00014821959720060196, + "loss": 0.0566, + "mean_token_accuracy": 0.9849485456943512, + "num_tokens": 10341145.0, + "step": 1173 + }, + { + "entropy": 1.2072623670101166, + "epoch": 4.238914027149321, + "grad_norm": 0.4814695715904236, + "learning_rate": 0.00014785403580470983, + "loss": 0.0337, + "mean_token_accuracy": 0.9873705059289932, + "num_tokens": 10349995.0, + "step": 1174 + }, + { + "entropy": 1.2393099963665009, + "epoch": 4.242533936651584, + "grad_norm": 0.9794650673866272, + "learning_rate": 0.0001474889803040645, + "loss": 0.1513, + "mean_token_accuracy": 0.970646321773529, + "num_tokens": 10358839.0, + "step": 1175 + }, + { + "entropy": 1.2389378249645233, + "epoch": 4.246153846153846, + "grad_norm": 0.5181885361671448, + "learning_rate": 0.00014712443208519352, + "loss": 0.0486, + "mean_token_accuracy": 0.9840831756591797, + "num_tokens": 10367605.0, + "step": 1176 + }, + { + "entropy": 1.179212898015976, + "epoch": 4.249773755656109, + "grad_norm": 0.36691412329673767, + "learning_rate": 0.0001467603925326972, + "loss": 0.0328, + "mean_token_accuracy": 0.9896590262651443, + "num_tokens": 10377166.0, + "step": 1177 + }, + { + "entropy": 1.1962746381759644, + "epoch": 4.253393665158371, + "grad_norm": 0.44420450925827026, + "learning_rate": 0.00014639686302924418, + "loss": 0.0347, + "mean_token_accuracy": 0.9883467555046082, + "num_tokens": 10386492.0, + "step": 1178 + }, + { + "entropy": 1.280052810907364, + "epoch": 4.2570135746606335, + "grad_norm": 0.7159825563430786, + "learning_rate": 0.000146033844955566, + "loss": 0.0478, + "mean_token_accuracy": 0.9879229664802551, + "num_tokens": 10395177.0, + "step": 1179 + }, + { + "entropy": 1.201603651046753, + "epoch": 4.260633484162896, + "grad_norm": 0.5321839451789856, + "learning_rate": 0.00014567133969045157, + "loss": 0.0549, + "mean_token_accuracy": 0.9802237451076508, + "num_tokens": 10404061.0, + "step": 1180 + }, + { + "entropy": 1.2947501838207245, + "epoch": 4.264253393665158, + "grad_norm": 0.4968154728412628, + "learning_rate": 0.00014530934861074193, + "loss": 0.0483, + "mean_token_accuracy": 0.9880443960428238, + "num_tokens": 10412500.0, + "step": 1181 + }, + { + "entropy": 1.3257293999195099, + "epoch": 4.267873303167421, + "grad_norm": 0.5405517220497131, + "learning_rate": 0.00014494787309132537, + "loss": 0.0334, + "mean_token_accuracy": 0.9880732297897339, + "num_tokens": 10420905.0, + "step": 1182 + }, + { + "entropy": 1.1579481065273285, + "epoch": 4.271493212669683, + "grad_norm": 0.4960266053676605, + "learning_rate": 0.00014458691450513212, + "loss": 0.0459, + "mean_token_accuracy": 0.9850940555334091, + "num_tokens": 10430442.0, + "step": 1183 + }, + { + "entropy": 1.259652554988861, + "epoch": 4.275113122171946, + "grad_norm": 0.5519370436668396, + "learning_rate": 0.00014422647422312874, + "loss": 0.0451, + "mean_token_accuracy": 0.9860022515058517, + "num_tokens": 10439289.0, + "step": 1184 + }, + { + "entropy": 1.1983542442321777, + "epoch": 4.278733031674208, + "grad_norm": 0.46854233741760254, + "learning_rate": 0.00014386655361431336, + "loss": 0.0378, + "mean_token_accuracy": 0.984845370054245, + "num_tokens": 10448374.0, + "step": 1185 + }, + { + "entropy": 1.2637499272823334, + "epoch": 4.2823529411764705, + "grad_norm": 0.41014033555984497, + "learning_rate": 0.00014350715404571045, + "loss": 0.0276, + "mean_token_accuracy": 0.9908969849348068, + "num_tokens": 10457150.0, + "step": 1186 + }, + { + "entropy": 1.185476541519165, + "epoch": 4.285972850678733, + "grad_norm": 0.36182934045791626, + "learning_rate": 0.00014314827688236527, + "loss": 0.0232, + "mean_token_accuracy": 0.9917554408311844, + "num_tokens": 10466114.0, + "step": 1187 + }, + { + "entropy": 1.2526941895484924, + "epoch": 4.289592760180995, + "grad_norm": 0.42682337760925293, + "learning_rate": 0.00014278992348733897, + "loss": 0.0258, + "mean_token_accuracy": 0.9892576783895493, + "num_tokens": 10474841.0, + "step": 1188 + }, + { + "entropy": 1.2896224856376648, + "epoch": 4.293212669683258, + "grad_norm": 0.6532279253005981, + "learning_rate": 0.00014243209522170366, + "loss": 0.0403, + "mean_token_accuracy": 0.9877428412437439, + "num_tokens": 10483179.0, + "step": 1189 + }, + { + "entropy": 1.2147268652915955, + "epoch": 4.29683257918552, + "grad_norm": 0.5174675583839417, + "learning_rate": 0.0001420747934445364, + "loss": 0.049, + "mean_token_accuracy": 0.9841071367263794, + "num_tokens": 10492012.0, + "step": 1190 + }, + { + "entropy": 1.2130130529403687, + "epoch": 4.300452488687783, + "grad_norm": 0.3487169146537781, + "learning_rate": 0.00014171801951291495, + "loss": 0.0379, + "mean_token_accuracy": 0.987470880150795, + "num_tokens": 10500781.0, + "step": 1191 + }, + { + "entropy": 1.2114128768444061, + "epoch": 4.304072398190045, + "grad_norm": 0.3877721428871155, + "learning_rate": 0.00014136177478191232, + "loss": 0.0366, + "mean_token_accuracy": 0.9868324846029282, + "num_tokens": 10509827.0, + "step": 1192 + }, + { + "entropy": 1.2727918028831482, + "epoch": 4.3076923076923075, + "grad_norm": 0.4810413122177124, + "learning_rate": 0.00014100606060459136, + "loss": 0.0264, + "mean_token_accuracy": 0.9923188835382462, + "num_tokens": 10518320.0, + "step": 1193 + }, + { + "entropy": 1.2287248075008392, + "epoch": 4.31131221719457, + "grad_norm": 0.4871313273906708, + "learning_rate": 0.0001406508783319996, + "loss": 0.0291, + "mean_token_accuracy": 0.9893288463354111, + "num_tokens": 10527372.0, + "step": 1194 + }, + { + "entropy": 1.2256539463996887, + "epoch": 4.314932126696832, + "grad_norm": 0.4660762846469879, + "learning_rate": 0.00014029622931316488, + "loss": 0.0442, + "mean_token_accuracy": 0.9856550842523575, + "num_tokens": 10536221.0, + "step": 1195 + }, + { + "entropy": 1.220929890871048, + "epoch": 4.318552036199095, + "grad_norm": 0.5266750454902649, + "learning_rate": 0.00013994211489508937, + "loss": 0.045, + "mean_token_accuracy": 0.9869575053453445, + "num_tokens": 10544855.0, + "step": 1196 + }, + { + "entropy": 1.2149785161018372, + "epoch": 4.322171945701357, + "grad_norm": 0.38891637325286865, + "learning_rate": 0.00013958853642274445, + "loss": 0.0279, + "mean_token_accuracy": 0.9925510138273239, + "num_tokens": 10553663.0, + "step": 1197 + }, + { + "entropy": 1.2534555792808533, + "epoch": 4.32579185520362, + "grad_norm": 0.504973828792572, + "learning_rate": 0.0001392354952390665, + "loss": 0.0265, + "mean_token_accuracy": 0.9886986464262009, + "num_tokens": 10562210.0, + "step": 1198 + }, + { + "entropy": 1.1534670293331146, + "epoch": 4.329411764705882, + "grad_norm": 0.3674943745136261, + "learning_rate": 0.00013888299268495095, + "loss": 0.0287, + "mean_token_accuracy": 0.9921697527170181, + "num_tokens": 10571681.0, + "step": 1199 + }, + { + "entropy": 1.1742701530456543, + "epoch": 4.3330316742081445, + "grad_norm": 0.5762848854064941, + "learning_rate": 0.0001385310300992471, + "loss": 0.0652, + "mean_token_accuracy": 0.9827972054481506, + "num_tokens": 10580931.0, + "step": 1200 + }, + { + "entropy": 1.224473923444748, + "epoch": 4.336651583710407, + "grad_norm": 0.3944694697856903, + "learning_rate": 0.00013817960881875406, + "loss": 0.0371, + "mean_token_accuracy": 0.9884412586688995, + "num_tokens": 10589820.0, + "step": 1201 + }, + { + "entropy": 1.1943216919898987, + "epoch": 4.340271493212669, + "grad_norm": 0.44515159726142883, + "learning_rate": 0.0001378287301782145, + "loss": 0.0354, + "mean_token_accuracy": 0.9919027835130692, + "num_tokens": 10599295.0, + "step": 1202 + }, + { + "entropy": 1.2744194567203522, + "epoch": 4.343891402714932, + "grad_norm": 0.5868192911148071, + "learning_rate": 0.0001374783955103102, + "loss": 0.0329, + "mean_token_accuracy": 0.990682065486908, + "num_tokens": 10607671.0, + "step": 1203 + }, + { + "entropy": 1.2263163924217224, + "epoch": 4.347511312217194, + "grad_norm": 0.7657507658004761, + "learning_rate": 0.00013712860614565687, + "loss": 0.0557, + "mean_token_accuracy": 0.9864864498376846, + "num_tokens": 10616269.0, + "step": 1204 + }, + { + "entropy": 1.1996283829212189, + "epoch": 4.351131221719457, + "grad_norm": 0.4356515407562256, + "learning_rate": 0.00013677936341279913, + "loss": 0.034, + "mean_token_accuracy": 0.9890493154525757, + "num_tokens": 10624896.0, + "step": 1205 + }, + { + "entropy": 1.2461954951286316, + "epoch": 4.354751131221719, + "grad_norm": 0.4709055721759796, + "learning_rate": 0.0001364306686382054, + "loss": 0.0372, + "mean_token_accuracy": 0.9871190786361694, + "num_tokens": 10633634.0, + "step": 1206 + }, + { + "entropy": 1.2780955731868744, + "epoch": 4.3583710407239815, + "grad_norm": 0.4301148056983948, + "learning_rate": 0.00013608252314626284, + "loss": 0.0302, + "mean_token_accuracy": 0.990274652838707, + "num_tokens": 10641899.0, + "step": 1207 + }, + { + "entropy": 1.1826459169387817, + "epoch": 4.361990950226244, + "grad_norm": 0.42350223660469055, + "learning_rate": 0.00013573492825927238, + "loss": 0.0359, + "mean_token_accuracy": 0.9923158586025238, + "num_tokens": 10650903.0, + "step": 1208 + }, + { + "entropy": 1.2023819386959076, + "epoch": 4.365610859728506, + "grad_norm": 0.5379685759544373, + "learning_rate": 0.00013538788529744375, + "loss": 0.0416, + "mean_token_accuracy": 0.990043118596077, + "num_tokens": 10659772.0, + "step": 1209 + }, + { + "entropy": 1.2026404440402985, + "epoch": 4.36923076923077, + "grad_norm": 0.4013594388961792, + "learning_rate": 0.00013504139557889033, + "loss": 0.0287, + "mean_token_accuracy": 0.9898358583450317, + "num_tokens": 10668661.0, + "step": 1210 + }, + { + "entropy": 1.1808496117591858, + "epoch": 4.372850678733032, + "grad_norm": 0.4315294623374939, + "learning_rate": 0.0001346954604196242, + "loss": 0.0345, + "mean_token_accuracy": 0.9906547367572784, + "num_tokens": 10677439.0, + "step": 1211 + }, + { + "entropy": 1.233126848936081, + "epoch": 4.376470588235295, + "grad_norm": 0.46303990483283997, + "learning_rate": 0.00013435008113355125, + "loss": 0.038, + "mean_token_accuracy": 0.9871664345264435, + "num_tokens": 10686193.0, + "step": 1212 + }, + { + "entropy": 1.1450912654399872, + "epoch": 4.380090497737557, + "grad_norm": 0.39165055751800537, + "learning_rate": 0.0001340052590324659, + "loss": 0.0268, + "mean_token_accuracy": 0.9913052469491959, + "num_tokens": 10695309.0, + "step": 1213 + }, + { + "entropy": 1.213069200515747, + "epoch": 4.383710407239819, + "grad_norm": 0.6870453357696533, + "learning_rate": 0.00013366099542604657, + "loss": 0.0928, + "mean_token_accuracy": 0.9731069356203079, + "num_tokens": 10704279.0, + "step": 1214 + }, + { + "entropy": 1.2083907425403595, + "epoch": 4.387330316742082, + "grad_norm": 0.5568958520889282, + "learning_rate": 0.00013331729162185021, + "loss": 0.0457, + "mean_token_accuracy": 0.9876127988100052, + "num_tokens": 10712689.0, + "step": 1215 + }, + { + "entropy": 1.1905820965766907, + "epoch": 4.390950226244344, + "grad_norm": 0.677911639213562, + "learning_rate": 0.00013297414892530775, + "loss": 0.0578, + "mean_token_accuracy": 0.9796578139066696, + "num_tokens": 10721429.0, + "step": 1216 + }, + { + "entropy": 1.1438561081886292, + "epoch": 4.394570135746607, + "grad_norm": 0.3332656919956207, + "learning_rate": 0.00013263156863971883, + "loss": 0.0211, + "mean_token_accuracy": 0.9961917698383331, + "num_tokens": 10729998.0, + "step": 1217 + }, + { + "entropy": 1.2401678264141083, + "epoch": 4.398190045248869, + "grad_norm": 0.4719739854335785, + "learning_rate": 0.00013228955206624703, + "loss": 0.0205, + "mean_token_accuracy": 0.9942127913236618, + "num_tokens": 10738536.0, + "step": 1218 + }, + { + "entropy": 1.1324664056301117, + "epoch": 4.401809954751132, + "grad_norm": 0.44815942645072937, + "learning_rate": 0.0001319481005039149, + "loss": 0.0343, + "mean_token_accuracy": 0.9881940931081772, + "num_tokens": 10747845.0, + "step": 1219 + }, + { + "entropy": 1.226366937160492, + "epoch": 4.405429864253394, + "grad_norm": 0.53164142370224, + "learning_rate": 0.00013160721524959904, + "loss": 0.0301, + "mean_token_accuracy": 0.9902307987213135, + "num_tokens": 10756478.0, + "step": 1220 + }, + { + "entropy": 1.1090387403964996, + "epoch": 4.409049773755656, + "grad_norm": 0.5124022364616394, + "learning_rate": 0.00013126689759802504, + "loss": 0.0475, + "mean_token_accuracy": 0.9851136803627014, + "num_tokens": 10765981.0, + "step": 1221 + }, + { + "entropy": 1.224695086479187, + "epoch": 4.412669683257919, + "grad_norm": 0.5596076250076294, + "learning_rate": 0.00013092714884176262, + "loss": 0.0398, + "mean_token_accuracy": 0.9881332963705063, + "num_tokens": 10774937.0, + "step": 1222 + }, + { + "entropy": 1.1444518268108368, + "epoch": 4.416289592760181, + "grad_norm": 0.593393087387085, + "learning_rate": 0.00013058797027122108, + "loss": 0.0582, + "mean_token_accuracy": 0.986479327082634, + "num_tokens": 10784549.0, + "step": 1223 + }, + { + "entropy": 1.2317917943000793, + "epoch": 4.419909502262444, + "grad_norm": 0.5458840727806091, + "learning_rate": 0.00013024936317464366, + "loss": 0.0435, + "mean_token_accuracy": 0.9879386126995087, + "num_tokens": 10792945.0, + "step": 1224 + }, + { + "entropy": 1.1805288195610046, + "epoch": 4.423529411764706, + "grad_norm": 1.2501155138015747, + "learning_rate": 0.00012991132883810328, + "loss": 0.0468, + "mean_token_accuracy": 0.9834897071123123, + "num_tokens": 10801754.0, + "step": 1225 + }, + { + "entropy": 1.267040103673935, + "epoch": 4.427149321266969, + "grad_norm": 0.371787428855896, + "learning_rate": 0.0001295738685454976, + "loss": 0.0276, + "mean_token_accuracy": 0.9908801317214966, + "num_tokens": 10810153.0, + "step": 1226 + }, + { + "entropy": 1.1796127259731293, + "epoch": 4.430769230769231, + "grad_norm": 0.5949414968490601, + "learning_rate": 0.00012923698357854367, + "loss": 0.065, + "mean_token_accuracy": 0.9796924442052841, + "num_tokens": 10819188.0, + "step": 1227 + }, + { + "entropy": 1.1894229352474213, + "epoch": 4.4343891402714934, + "grad_norm": 0.5610167980194092, + "learning_rate": 0.00012890067521677343, + "loss": 0.0579, + "mean_token_accuracy": 0.9834547340869904, + "num_tokens": 10827890.0, + "step": 1228 + }, + { + "entropy": 1.193329244852066, + "epoch": 4.438009049773756, + "grad_norm": 0.5496513247489929, + "learning_rate": 0.00012856494473752919, + "loss": 0.0699, + "mean_token_accuracy": 0.9785247892141342, + "num_tokens": 10836838.0, + "step": 1229 + }, + { + "entropy": 1.203848272562027, + "epoch": 4.441628959276018, + "grad_norm": 0.4252963364124298, + "learning_rate": 0.00012822979341595785, + "loss": 0.0298, + "mean_token_accuracy": 0.9896065294742584, + "num_tokens": 10845834.0, + "step": 1230 + }, + { + "entropy": 1.207732379436493, + "epoch": 4.445248868778281, + "grad_norm": 0.517036497592926, + "learning_rate": 0.00012789522252500685, + "loss": 0.0638, + "mean_token_accuracy": 0.9777253717184067, + "num_tokens": 10855063.0, + "step": 1231 + }, + { + "entropy": 1.1849088370800018, + "epoch": 4.448868778280543, + "grad_norm": 0.2902492582798004, + "learning_rate": 0.0001275612333354193, + "loss": 0.019, + "mean_token_accuracy": 0.9935135394334793, + "num_tokens": 10863916.0, + "step": 1232 + }, + { + "entropy": 1.2052322030067444, + "epoch": 4.452488687782806, + "grad_norm": 0.39002323150634766, + "learning_rate": 0.00012722782711572852, + "loss": 0.0338, + "mean_token_accuracy": 0.9920508861541748, + "num_tokens": 10872704.0, + "step": 1233 + }, + { + "entropy": 1.206953376531601, + "epoch": 4.456108597285068, + "grad_norm": 0.4019605815410614, + "learning_rate": 0.00012689500513225372, + "loss": 0.0334, + "mean_token_accuracy": 0.993528202176094, + "num_tokens": 10881430.0, + "step": 1234 + }, + { + "entropy": 1.1898608207702637, + "epoch": 4.4597285067873305, + "grad_norm": 0.4211233854293823, + "learning_rate": 0.00012656276864909545, + "loss": 0.0279, + "mean_token_accuracy": 0.9896760433912277, + "num_tokens": 10890291.0, + "step": 1235 + }, + { + "entropy": 1.1882571280002594, + "epoch": 4.463348416289593, + "grad_norm": 0.6270121335983276, + "learning_rate": 0.00012623111892813018, + "loss": 0.067, + "mean_token_accuracy": 0.9778329133987427, + "num_tokens": 10899339.0, + "step": 1236 + }, + { + "entropy": 1.2167966961860657, + "epoch": 4.466968325791855, + "grad_norm": 0.5115417242050171, + "learning_rate": 0.00012590005722900558, + "loss": 0.0438, + "mean_token_accuracy": 0.9870314300060272, + "num_tokens": 10908084.0, + "step": 1237 + }, + { + "entropy": 1.2357282936573029, + "epoch": 4.470588235294118, + "grad_norm": 0.4048464894294739, + "learning_rate": 0.00012556958480913644, + "loss": 0.0211, + "mean_token_accuracy": 0.9938371032476425, + "num_tokens": 10916729.0, + "step": 1238 + }, + { + "entropy": 1.1960048377513885, + "epoch": 4.47420814479638, + "grad_norm": 0.519778847694397, + "learning_rate": 0.00012523970292369906, + "loss": 0.0352, + "mean_token_accuracy": 0.9902931302785873, + "num_tokens": 10925469.0, + "step": 1239 + }, + { + "entropy": 1.115236908197403, + "epoch": 4.477828054298643, + "grad_norm": 1.1697970628738403, + "learning_rate": 0.00012491041282562673, + "loss": 0.0516, + "mean_token_accuracy": 0.9826432168483734, + "num_tokens": 10935029.0, + "step": 1240 + }, + { + "entropy": 1.1740309596061707, + "epoch": 4.481447963800905, + "grad_norm": 0.5328389406204224, + "learning_rate": 0.00012458171576560541, + "loss": 0.0369, + "mean_token_accuracy": 0.9888178706169128, + "num_tokens": 10943983.0, + "step": 1241 + }, + { + "entropy": 1.2090014219284058, + "epoch": 4.4850678733031675, + "grad_norm": 0.3985499441623688, + "learning_rate": 0.0001242536129920684, + "loss": 0.024, + "mean_token_accuracy": 0.9895037710666656, + "num_tokens": 10952723.0, + "step": 1242 + }, + { + "entropy": 1.1451536118984222, + "epoch": 4.48868778280543, + "grad_norm": 0.4824701249599457, + "learning_rate": 0.00012392610575119164, + "loss": 0.0366, + "mean_token_accuracy": 0.9898587912321091, + "num_tokens": 10961935.0, + "step": 1243 + }, + { + "entropy": 1.256010115146637, + "epoch": 4.492307692307692, + "grad_norm": 0.9043450951576233, + "learning_rate": 0.00012359919528688959, + "loss": 0.1034, + "mean_token_accuracy": 0.9813459366559982, + "num_tokens": 10970679.0, + "step": 1244 + }, + { + "entropy": 1.1992116272449493, + "epoch": 4.495927601809955, + "grad_norm": 0.359037309885025, + "learning_rate": 0.00012327288284080977, + "loss": 0.0262, + "mean_token_accuracy": 0.9912933856248856, + "num_tokens": 10979305.0, + "step": 1245 + }, + { + "entropy": 1.157044231891632, + "epoch": 4.499547511312217, + "grad_norm": 0.6599268913269043, + "learning_rate": 0.00012294716965232847, + "loss": 0.0492, + "mean_token_accuracy": 0.9798050671815872, + "num_tokens": 10988385.0, + "step": 1246 + }, + { + "entropy": 1.2378050088882446, + "epoch": 4.50316742081448, + "grad_norm": 0.28043484687805176, + "learning_rate": 0.00012262205695854584, + "loss": 0.0139, + "mean_token_accuracy": 0.9976954162120819, + "num_tokens": 10996897.0, + "step": 1247 + }, + { + "entropy": 1.2076781392097473, + "epoch": 4.506787330316742, + "grad_norm": 0.5077516436576843, + "learning_rate": 0.0001222975459942814, + "loss": 0.0409, + "mean_token_accuracy": 0.9874622672796249, + "num_tokens": 11005528.0, + "step": 1248 + }, + { + "entropy": 1.1962661147117615, + "epoch": 4.5104072398190045, + "grad_norm": 0.33547911047935486, + "learning_rate": 0.00012197363799206908, + "loss": 0.0161, + "mean_token_accuracy": 0.9955142885446548, + "num_tokens": 11014356.0, + "step": 1249 + }, + { + "entropy": 1.1652462780475616, + "epoch": 4.514027149321267, + "grad_norm": 0.4315963685512543, + "learning_rate": 0.00012165033418215278, + "loss": 0.038, + "mean_token_accuracy": 0.986273393034935, + "num_tokens": 11023148.0, + "step": 1250 + }, + { + "entropy": 1.1507481038570404, + "epoch": 4.517647058823529, + "grad_norm": 0.5866735577583313, + "learning_rate": 0.00012132763579248157, + "loss": 0.0709, + "mean_token_accuracy": 0.9747334122657776, + "num_tokens": 11032698.0, + "step": 1251 + }, + { + "entropy": 1.1324940025806427, + "epoch": 4.521266968325792, + "grad_norm": 0.6004504561424255, + "learning_rate": 0.00012100554404870504, + "loss": 0.062, + "mean_token_accuracy": 0.9802116751670837, + "num_tokens": 11042191.0, + "step": 1252 + }, + { + "entropy": 1.2644992768764496, + "epoch": 4.524886877828054, + "grad_norm": 0.4573010504245758, + "learning_rate": 0.00012068406017416869, + "loss": 0.0454, + "mean_token_accuracy": 0.9845332503318787, + "num_tokens": 11050560.0, + "step": 1253 + }, + { + "entropy": 1.1976861953735352, + "epoch": 4.528506787330317, + "grad_norm": 0.5563387870788574, + "learning_rate": 0.00012036318538990926, + "loss": 0.0461, + "mean_token_accuracy": 0.988158106803894, + "num_tokens": 11059522.0, + "step": 1254 + }, + { + "entropy": 1.1570810675621033, + "epoch": 4.532126696832579, + "grad_norm": 0.32090041041374207, + "learning_rate": 0.00012004292091465011, + "loss": 0.0271, + "mean_token_accuracy": 0.9932841211557388, + "num_tokens": 11068647.0, + "step": 1255 + }, + { + "entropy": 1.1704442203044891, + "epoch": 4.5357466063348415, + "grad_norm": 0.4426371455192566, + "learning_rate": 0.00011972326796479646, + "loss": 0.0382, + "mean_token_accuracy": 0.991062343120575, + "num_tokens": 11077626.0, + "step": 1256 + }, + { + "entropy": 1.1977724432945251, + "epoch": 4.539366515837104, + "grad_norm": 0.37808912992477417, + "learning_rate": 0.00011940422775443095, + "loss": 0.0294, + "mean_token_accuracy": 0.9922671616077423, + "num_tokens": 11086470.0, + "step": 1257 + }, + { + "entropy": 1.1857715249061584, + "epoch": 4.542986425339366, + "grad_norm": 0.5283500552177429, + "learning_rate": 0.00011908580149530903, + "loss": 0.0498, + "mean_token_accuracy": 0.983918771147728, + "num_tokens": 11095267.0, + "step": 1258 + }, + { + "entropy": 1.2057181596755981, + "epoch": 4.546606334841629, + "grad_norm": 0.6116226315498352, + "learning_rate": 0.00011876799039685415, + "loss": 0.0617, + "mean_token_accuracy": 0.9871800243854523, + "num_tokens": 11103982.0, + "step": 1259 + }, + { + "entropy": 1.1275395154953003, + "epoch": 4.550226244343891, + "grad_norm": 0.374963641166687, + "learning_rate": 0.0001184507956661534, + "loss": 0.0392, + "mean_token_accuracy": 0.9863047152757645, + "num_tokens": 11113139.0, + "step": 1260 + }, + { + "entropy": 1.2039974927902222, + "epoch": 4.553846153846154, + "grad_norm": 0.37970781326293945, + "learning_rate": 0.0001181342185079528, + "loss": 0.0316, + "mean_token_accuracy": 0.9907213151454926, + "num_tokens": 11121932.0, + "step": 1261 + }, + { + "entropy": 1.1615934669971466, + "epoch": 4.557466063348416, + "grad_norm": 0.5714005827903748, + "learning_rate": 0.00011781826012465267, + "loss": 0.076, + "mean_token_accuracy": 0.9788525849580765, + "num_tokens": 11131104.0, + "step": 1262 + }, + { + "entropy": 1.2253602147102356, + "epoch": 4.5610859728506785, + "grad_norm": 0.39638063311576843, + "learning_rate": 0.0001175029217163033, + "loss": 0.029, + "mean_token_accuracy": 0.9923384636640549, + "num_tokens": 11139439.0, + "step": 1263 + }, + { + "entropy": 1.226563960313797, + "epoch": 4.564705882352941, + "grad_norm": 0.6515656113624573, + "learning_rate": 0.00011718820448060013, + "loss": 0.039, + "mean_token_accuracy": 0.9874402731657028, + "num_tokens": 11148163.0, + "step": 1264 + }, + { + "entropy": 1.2068665623664856, + "epoch": 4.568325791855203, + "grad_norm": 0.4194830656051636, + "learning_rate": 0.00011687410961287929, + "loss": 0.0315, + "mean_token_accuracy": 0.9902697205543518, + "num_tokens": 11156698.0, + "step": 1265 + }, + { + "entropy": 1.185595840215683, + "epoch": 4.571945701357466, + "grad_norm": 0.3697413504123688, + "learning_rate": 0.00011656063830611315, + "loss": 0.0192, + "mean_token_accuracy": 0.9939739406108856, + "num_tokens": 11165563.0, + "step": 1266 + }, + { + "entropy": 1.146008312702179, + "epoch": 4.575565610859728, + "grad_norm": 0.6688143610954285, + "learning_rate": 0.0001162477917509057, + "loss": 0.0578, + "mean_token_accuracy": 0.9821225702762604, + "num_tokens": 11175007.0, + "step": 1267 + }, + { + "entropy": 1.1765292882919312, + "epoch": 4.579185520361991, + "grad_norm": 0.40471139550209045, + "learning_rate": 0.00011593557113548798, + "loss": 0.0424, + "mean_token_accuracy": 0.9884167313575745, + "num_tokens": 11184158.0, + "step": 1268 + }, + { + "entropy": 1.236164003610611, + "epoch": 4.582805429864253, + "grad_norm": 0.3746803104877472, + "learning_rate": 0.00011562397764571371, + "loss": 0.0214, + "mean_token_accuracy": 0.9929858446121216, + "num_tokens": 11192773.0, + "step": 1269 + }, + { + "entropy": 1.2086671888828278, + "epoch": 4.5864253393665155, + "grad_norm": 0.45422643423080444, + "learning_rate": 0.00011531301246505468, + "loss": 0.0348, + "mean_token_accuracy": 0.9905667155981064, + "num_tokens": 11201449.0, + "step": 1270 + }, + { + "entropy": 1.2117674052715302, + "epoch": 4.590045248868778, + "grad_norm": 0.6983218193054199, + "learning_rate": 0.00011500267677459625, + "loss": 0.0222, + "mean_token_accuracy": 0.9888782948255539, + "num_tokens": 11210311.0, + "step": 1271 + }, + { + "entropy": 1.2101182341575623, + "epoch": 4.59366515837104, + "grad_norm": 0.6044661998748779, + "learning_rate": 0.00011469297175303293, + "loss": 0.046, + "mean_token_accuracy": 0.9895824193954468, + "num_tokens": 11219016.0, + "step": 1272 + }, + { + "entropy": 1.1193795204162598, + "epoch": 4.597285067873303, + "grad_norm": 0.7723874449729919, + "learning_rate": 0.00011438389857666392, + "loss": 0.0758, + "mean_token_accuracy": 0.981050118803978, + "num_tokens": 11228429.0, + "step": 1273 + }, + { + "entropy": 1.240187257528305, + "epoch": 4.600904977375565, + "grad_norm": 0.680292010307312, + "learning_rate": 0.00011407545841938842, + "loss": 0.0418, + "mean_token_accuracy": 0.986311137676239, + "num_tokens": 11236929.0, + "step": 1274 + }, + { + "entropy": 1.2108842730522156, + "epoch": 4.604524886877828, + "grad_norm": 0.6665503978729248, + "learning_rate": 0.00011376765245270154, + "loss": 0.0679, + "mean_token_accuracy": 0.9780846238136292, + "num_tokens": 11245589.0, + "step": 1275 + }, + { + "entropy": 1.1833328604698181, + "epoch": 4.60814479638009, + "grad_norm": 0.4157174825668335, + "learning_rate": 0.00011346048184568953, + "loss": 0.0244, + "mean_token_accuracy": 0.9866289645433426, + "num_tokens": 11254225.0, + "step": 1276 + }, + { + "entropy": 1.1474134027957916, + "epoch": 4.6117647058823525, + "grad_norm": 0.388327032327652, + "learning_rate": 0.00011315394776502554, + "loss": 0.0305, + "mean_token_accuracy": 0.9905170947313309, + "num_tokens": 11262729.0, + "step": 1277 + }, + { + "entropy": 1.1579011976718903, + "epoch": 4.615384615384615, + "grad_norm": 0.5559958219528198, + "learning_rate": 0.00011284805137496494, + "loss": 0.0263, + "mean_token_accuracy": 0.9905281066894531, + "num_tokens": 11271565.0, + "step": 1278 + }, + { + "entropy": 1.1825282573699951, + "epoch": 4.619004524886877, + "grad_norm": 0.3398473858833313, + "learning_rate": 0.0001125427938373415, + "loss": 0.0175, + "mean_token_accuracy": 0.9951401799917221, + "num_tokens": 11280546.0, + "step": 1279 + }, + { + "entropy": 1.1961012184619904, + "epoch": 4.62262443438914, + "grad_norm": 0.3031075894832611, + "learning_rate": 0.00011223817631156197, + "loss": 0.0161, + "mean_token_accuracy": 0.9932055175304413, + "num_tokens": 11289253.0, + "step": 1280 + }, + { + "entropy": 1.1445344388484955, + "epoch": 4.626244343891402, + "grad_norm": 0.4055817127227783, + "learning_rate": 0.00011193419995460257, + "loss": 0.0305, + "mean_token_accuracy": 0.9872564822435379, + "num_tokens": 11298445.0, + "step": 1281 + }, + { + "entropy": 1.1870165169239044, + "epoch": 4.629864253393665, + "grad_norm": 0.5154533386230469, + "learning_rate": 0.00011163086592100444, + "loss": 0.0318, + "mean_token_accuracy": 0.9910313338041306, + "num_tokens": 11307495.0, + "step": 1282 + }, + { + "entropy": 1.2139601707458496, + "epoch": 4.633484162895927, + "grad_norm": 0.5279651880264282, + "learning_rate": 0.00011132817536286869, + "loss": 0.0293, + "mean_token_accuracy": 0.9919774979352951, + "num_tokens": 11315776.0, + "step": 1283 + }, + { + "entropy": 1.1790938973426819, + "epoch": 4.63710407239819, + "grad_norm": 0.3685486614704132, + "learning_rate": 0.00011102612942985265, + "loss": 0.0185, + "mean_token_accuracy": 0.9948403984308243, + "num_tokens": 11324516.0, + "step": 1284 + }, + { + "entropy": 1.090241402387619, + "epoch": 4.640723981900453, + "grad_norm": 0.35041630268096924, + "learning_rate": 0.00011072472926916545, + "loss": 0.0356, + "mean_token_accuracy": 0.9899317622184753, + "num_tokens": 11333970.0, + "step": 1285 + }, + { + "entropy": 1.113451361656189, + "epoch": 4.644343891402715, + "grad_norm": 0.2749859094619751, + "learning_rate": 0.00011042397602556312, + "loss": 0.0199, + "mean_token_accuracy": 0.9949481189250946, + "num_tokens": 11343096.0, + "step": 1286 + }, + { + "entropy": 1.1552523374557495, + "epoch": 4.647963800904978, + "grad_norm": 0.5669251084327698, + "learning_rate": 0.0001101238708413448, + "loss": 0.0656, + "mean_token_accuracy": 0.9822637885808945, + "num_tokens": 11351931.0, + "step": 1287 + }, + { + "entropy": 1.136662244796753, + "epoch": 4.65158371040724, + "grad_norm": 0.3863164484500885, + "learning_rate": 0.00010982441485634835, + "loss": 0.025, + "mean_token_accuracy": 0.9904049932956696, + "num_tokens": 11360920.0, + "step": 1288 + }, + { + "entropy": 1.1108895540237427, + "epoch": 4.655203619909503, + "grad_norm": 0.31534653902053833, + "learning_rate": 0.0001095256092079458, + "loss": 0.0204, + "mean_token_accuracy": 0.9938828349113464, + "num_tokens": 11369947.0, + "step": 1289 + }, + { + "entropy": 1.1851194500923157, + "epoch": 4.658823529411765, + "grad_norm": 0.5289696455001831, + "learning_rate": 0.00010922745503103884, + "loss": 0.0514, + "mean_token_accuracy": 0.9875812381505966, + "num_tokens": 11378552.0, + "step": 1290 + }, + { + "entropy": 1.0774552524089813, + "epoch": 4.6624434389140275, + "grad_norm": 0.40037772059440613, + "learning_rate": 0.00010892995345805528, + "loss": 0.0364, + "mean_token_accuracy": 0.9890797883272171, + "num_tokens": 11387967.0, + "step": 1291 + }, + { + "entropy": 1.2189326882362366, + "epoch": 4.66606334841629, + "grad_norm": 0.5457913279533386, + "learning_rate": 0.00010863310561894397, + "loss": 0.0478, + "mean_token_accuracy": 0.9867587238550186, + "num_tokens": 11396578.0, + "step": 1292 + }, + { + "entropy": 1.1517588794231415, + "epoch": 4.669683257918552, + "grad_norm": 0.796279788017273, + "learning_rate": 0.00010833691264117066, + "loss": 0.065, + "mean_token_accuracy": 0.9865336418151855, + "num_tokens": 11405363.0, + "step": 1293 + }, + { + "entropy": 1.1254721879959106, + "epoch": 4.673303167420815, + "grad_norm": 0.3716721832752228, + "learning_rate": 0.00010804137564971422, + "loss": 0.023, + "mean_token_accuracy": 0.9923341125249863, + "num_tokens": 11414795.0, + "step": 1294 + }, + { + "entropy": 1.0851550549268723, + "epoch": 4.676923076923077, + "grad_norm": 0.36168938875198364, + "learning_rate": 0.00010774649576706178, + "loss": 0.0284, + "mean_token_accuracy": 0.9884274005889893, + "num_tokens": 11424330.0, + "step": 1295 + }, + { + "entropy": 1.1117421388626099, + "epoch": 4.68054298642534, + "grad_norm": 0.36146998405456543, + "learning_rate": 0.0001074522741132045, + "loss": 0.0308, + "mean_token_accuracy": 0.9892974644899368, + "num_tokens": 11433315.0, + "step": 1296 + }, + { + "entropy": 1.1173599064350128, + "epoch": 4.684162895927602, + "grad_norm": 0.5733943581581116, + "learning_rate": 0.00010715871180563403, + "loss": 0.0595, + "mean_token_accuracy": 0.9814283847808838, + "num_tokens": 11442321.0, + "step": 1297 + }, + { + "entropy": 1.1422770619392395, + "epoch": 4.6877828054298645, + "grad_norm": 0.5190064907073975, + "learning_rate": 0.00010686580995933731, + "loss": 0.0527, + "mean_token_accuracy": 0.9888507723808289, + "num_tokens": 11451531.0, + "step": 1298 + }, + { + "entropy": 1.1887125968933105, + "epoch": 4.691402714932127, + "grad_norm": 0.43336015939712524, + "learning_rate": 0.00010657356968679273, + "loss": 0.0205, + "mean_token_accuracy": 0.9921245276927948, + "num_tokens": 11460372.0, + "step": 1299 + }, + { + "entropy": 1.1815907955169678, + "epoch": 4.695022624434389, + "grad_norm": 0.9948742389678955, + "learning_rate": 0.00010628199209796627, + "loss": 0.1664, + "mean_token_accuracy": 0.9725013822317123, + "num_tokens": 11469511.0, + "step": 1300 + }, + { + "entropy": 1.2282527089118958, + "epoch": 4.698642533936652, + "grad_norm": 0.24515807628631592, + "learning_rate": 0.00010599107830030672, + "loss": 0.0113, + "mean_token_accuracy": 0.9958342462778091, + "num_tokens": 11477768.0, + "step": 1301 + }, + { + "entropy": 1.1868560314178467, + "epoch": 4.702262443438914, + "grad_norm": 0.44611793756484985, + "learning_rate": 0.00010570082939874174, + "loss": 0.0294, + "mean_token_accuracy": 0.9905449897050858, + "num_tokens": 11486161.0, + "step": 1302 + }, + { + "entropy": 1.1398615539073944, + "epoch": 4.705882352941177, + "grad_norm": 0.4731442332267761, + "learning_rate": 0.00010541124649567368, + "loss": 0.0481, + "mean_token_accuracy": 0.9839041233062744, + "num_tokens": 11495424.0, + "step": 1303 + }, + { + "entropy": 1.1303377449512482, + "epoch": 4.709502262443439, + "grad_norm": 0.44781962037086487, + "learning_rate": 0.00010512233069097528, + "loss": 0.0157, + "mean_token_accuracy": 0.9963311403989792, + "num_tokens": 11504201.0, + "step": 1304 + }, + { + "entropy": 1.2015290260314941, + "epoch": 4.7131221719457015, + "grad_norm": 0.7324572801589966, + "learning_rate": 0.00010483408308198563, + "loss": 0.0383, + "mean_token_accuracy": 0.9885118752717972, + "num_tokens": 11513025.0, + "step": 1305 + }, + { + "entropy": 1.111644297838211, + "epoch": 4.716742081447964, + "grad_norm": 0.3878529369831085, + "learning_rate": 0.00010454650476350581, + "loss": 0.0365, + "mean_token_accuracy": 0.9912507086992264, + "num_tokens": 11522515.0, + "step": 1306 + }, + { + "entropy": 1.1407475769519806, + "epoch": 4.720361990950226, + "grad_norm": 0.4501092731952667, + "learning_rate": 0.000104259596827795, + "loss": 0.0506, + "mean_token_accuracy": 0.9859120547771454, + "num_tokens": 11531359.0, + "step": 1307 + }, + { + "entropy": 1.1798086017370224, + "epoch": 4.723981900452489, + "grad_norm": 0.5489926338195801, + "learning_rate": 0.00010397336036456606, + "loss": 0.0427, + "mean_token_accuracy": 0.9872020483016968, + "num_tokens": 11540219.0, + "step": 1308 + }, + { + "entropy": 1.1605547368526459, + "epoch": 4.727601809954751, + "grad_norm": 0.46645691990852356, + "learning_rate": 0.00010368779646098153, + "loss": 0.0341, + "mean_token_accuracy": 0.991564467549324, + "num_tokens": 11548703.0, + "step": 1309 + }, + { + "entropy": 1.160649299621582, + "epoch": 4.731221719457014, + "grad_norm": 0.41181880235671997, + "learning_rate": 0.00010340290620164959, + "loss": 0.0335, + "mean_token_accuracy": 0.9900417178869247, + "num_tokens": 11557884.0, + "step": 1310 + }, + { + "entropy": 1.2060245275497437, + "epoch": 4.734841628959276, + "grad_norm": 0.593100368976593, + "learning_rate": 0.00010311869066861967, + "loss": 0.0489, + "mean_token_accuracy": 0.9868187755346298, + "num_tokens": 11566192.0, + "step": 1311 + }, + { + "entropy": 1.1960155069828033, + "epoch": 4.7384615384615385, + "grad_norm": 0.6581326723098755, + "learning_rate": 0.00010283515094137866, + "loss": 0.0488, + "mean_token_accuracy": 0.9871827214956284, + "num_tokens": 11575309.0, + "step": 1312 + }, + { + "entropy": 1.1478987336158752, + "epoch": 4.742081447963801, + "grad_norm": 0.7423575520515442, + "learning_rate": 0.00010255228809684654, + "loss": 0.0442, + "mean_token_accuracy": 0.9883019477128983, + "num_tokens": 11584290.0, + "step": 1313 + }, + { + "entropy": 1.1630191802978516, + "epoch": 4.745701357466063, + "grad_norm": 0.4025084376335144, + "learning_rate": 0.00010227010320937243, + "loss": 0.0289, + "mean_token_accuracy": 0.9910438656806946, + "num_tokens": 11593101.0, + "step": 1314 + }, + { + "entropy": 1.201496183872223, + "epoch": 4.749321266968326, + "grad_norm": 0.2735002338886261, + "learning_rate": 0.0001019885973507305, + "loss": 0.0091, + "mean_token_accuracy": 0.997909814119339, + "num_tokens": 11601492.0, + "step": 1315 + }, + { + "entropy": 1.1884644627571106, + "epoch": 4.752941176470588, + "grad_norm": 0.36079543828964233, + "learning_rate": 0.00010170777159011589, + "loss": 0.0177, + "mean_token_accuracy": 0.9946515262126923, + "num_tokens": 11610063.0, + "step": 1316 + }, + { + "entropy": 1.1655499041080475, + "epoch": 4.756561085972851, + "grad_norm": 0.28955161571502686, + "learning_rate": 0.00010142762699414064, + "loss": 0.0213, + "mean_token_accuracy": 0.9961933195590973, + "num_tokens": 11618979.0, + "step": 1317 + }, + { + "entropy": 1.104924738407135, + "epoch": 4.760180995475113, + "grad_norm": 0.5413517355918884, + "learning_rate": 0.00010114816462682961, + "loss": 0.0523, + "mean_token_accuracy": 0.9876614063978195, + "num_tokens": 11628224.0, + "step": 1318 + }, + { + "entropy": 1.2001541256904602, + "epoch": 4.7638009049773755, + "grad_norm": 0.38713425397872925, + "learning_rate": 0.00010086938554961647, + "loss": 0.0289, + "mean_token_accuracy": 0.9902506768703461, + "num_tokens": 11636542.0, + "step": 1319 + }, + { + "entropy": 1.1230383515357971, + "epoch": 4.767420814479638, + "grad_norm": 0.42886537313461304, + "learning_rate": 0.00010059129082133972, + "loss": 0.0278, + "mean_token_accuracy": 0.991959884762764, + "num_tokens": 11645767.0, + "step": 1320 + }, + { + "entropy": 1.2156026661396027, + "epoch": 4.7710407239819, + "grad_norm": 0.5437722206115723, + "learning_rate": 0.00010031388149823848, + "loss": 0.0379, + "mean_token_accuracy": 0.9888860136270523, + "num_tokens": 11654434.0, + "step": 1321 + }, + { + "entropy": 1.1663733422756195, + "epoch": 4.774660633484163, + "grad_norm": 0.3484131693840027, + "learning_rate": 0.00010003715863394893, + "loss": 0.0179, + "mean_token_accuracy": 0.9933156818151474, + "num_tokens": 11663217.0, + "step": 1322 + }, + { + "entropy": 1.184800922870636, + "epoch": 4.778280542986425, + "grad_norm": 0.44003114104270935, + "learning_rate": 9.976112327949957e-05, + "loss": 0.0402, + "mean_token_accuracy": 0.9876485913991928, + "num_tokens": 11672149.0, + "step": 1323 + }, + { + "entropy": 1.0641057193279266, + "epoch": 4.781900452488688, + "grad_norm": 0.5698227882385254, + "learning_rate": 9.94857764833079e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9845984429121017, + "num_tokens": 11681304.0, + "step": 1324 + }, + { + "entropy": 1.2146041095256805, + "epoch": 4.78552036199095, + "grad_norm": 0.6113767623901367, + "learning_rate": 9.921111929117624e-05, + "loss": 0.0421, + "mean_token_accuracy": 0.9896378368139267, + "num_tokens": 11689662.0, + "step": 1325 + }, + { + "entropy": 1.1616670489311218, + "epoch": 4.7891402714932125, + "grad_norm": 0.41098496317863464, + "learning_rate": 9.893715274628749e-05, + "loss": 0.0372, + "mean_token_accuracy": 0.9886336326599121, + "num_tokens": 11698852.0, + "step": 1326 + }, + { + "entropy": 1.258986234664917, + "epoch": 4.792760180995475, + "grad_norm": 0.372736394405365, + "learning_rate": 9.866387788920149e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.9942413419485092, + "num_tokens": 11707118.0, + "step": 1327 + }, + { + "entropy": 1.181258738040924, + "epoch": 4.796380090497737, + "grad_norm": 0.5828834176063538, + "learning_rate": 9.839129575785117e-05, + "loss": 0.0445, + "mean_token_accuracy": 0.9854382127523422, + "num_tokens": 11715957.0, + "step": 1328 + }, + { + "entropy": 1.1307817101478577, + "epoch": 4.8, + "grad_norm": 0.48134756088256836, + "learning_rate": 9.811940738753796e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9897171407938004, + "num_tokens": 11725287.0, + "step": 1329 + }, + { + "entropy": 1.19740691781044, + "epoch": 4.803619909502262, + "grad_norm": 0.7992010712623596, + "learning_rate": 9.784821381092864e-05, + "loss": 0.0699, + "mean_token_accuracy": 0.9800475984811783, + "num_tokens": 11733857.0, + "step": 1330 + }, + { + "entropy": 1.2161820828914642, + "epoch": 4.807239819004525, + "grad_norm": 0.4690302908420563, + "learning_rate": 9.7577716058051e-05, + "loss": 0.0305, + "mean_token_accuracy": 0.9906628876924515, + "num_tokens": 11742243.0, + "step": 1331 + }, + { + "entropy": 1.194101721048355, + "epoch": 4.810859728506787, + "grad_norm": 0.382661908864975, + "learning_rate": 9.730791515629003e-05, + "loss": 0.0271, + "mean_token_accuracy": 0.9905851632356644, + "num_tokens": 11751039.0, + "step": 1332 + }, + { + "entropy": 1.2171461582183838, + "epoch": 4.8144796380090495, + "grad_norm": 0.41641414165496826, + "learning_rate": 9.703881213038375e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.991615504026413, + "num_tokens": 11759786.0, + "step": 1333 + }, + { + "entropy": 1.15550696849823, + "epoch": 4.818099547511312, + "grad_norm": 0.43042320013046265, + "learning_rate": 9.677040800241995e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.9883392006158829, + "num_tokens": 11768817.0, + "step": 1334 + }, + { + "entropy": 1.1094126105308533, + "epoch": 4.821719457013574, + "grad_norm": 0.4802487790584564, + "learning_rate": 9.650270379183166e-05, + "loss": 0.0357, + "mean_token_accuracy": 0.9853959828615189, + "num_tokens": 11778098.0, + "step": 1335 + }, + { + "entropy": 1.1772915720939636, + "epoch": 4.825339366515837, + "grad_norm": 0.47741183638572693, + "learning_rate": 9.623570051539347e-05, + "loss": 0.0488, + "mean_token_accuracy": 0.9828261733055115, + "num_tokens": 11787472.0, + "step": 1336 + }, + { + "entropy": 1.1935390532016754, + "epoch": 4.828959276018099, + "grad_norm": 0.49854576587677, + "learning_rate": 9.596939918721795e-05, + "loss": 0.0461, + "mean_token_accuracy": 0.9878488183021545, + "num_tokens": 11796269.0, + "step": 1337 + }, + { + "entropy": 1.2000310122966766, + "epoch": 4.832579185520362, + "grad_norm": 0.449413925409317, + "learning_rate": 9.570380081875159e-05, + "loss": 0.0326, + "mean_token_accuracy": 0.9885634779930115, + "num_tokens": 11804800.0, + "step": 1338 + }, + { + "entropy": 1.1534567177295685, + "epoch": 4.836199095022624, + "grad_norm": 0.4129447638988495, + "learning_rate": 9.543890641877057e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9913550019264221, + "num_tokens": 11813742.0, + "step": 1339 + }, + { + "entropy": 1.2593884468078613, + "epoch": 4.839819004524887, + "grad_norm": 0.5344672203063965, + "learning_rate": 9.51747169933778e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.993433803319931, + "num_tokens": 11821810.0, + "step": 1340 + }, + { + "entropy": 1.1583607494831085, + "epoch": 4.843438914027149, + "grad_norm": 0.5206009745597839, + "learning_rate": 9.491123354599839e-05, + "loss": 0.0404, + "mean_token_accuracy": 0.9856080114841461, + "num_tokens": 11830477.0, + "step": 1341 + }, + { + "entropy": 1.1516379415988922, + "epoch": 4.847058823529411, + "grad_norm": 0.40458643436431885, + "learning_rate": 9.464845707737593e-05, + "loss": 0.0457, + "mean_token_accuracy": 0.9855477511882782, + "num_tokens": 11839532.0, + "step": 1342 + }, + { + "entropy": 1.268717736005783, + "epoch": 4.850678733031674, + "grad_norm": 0.49775683879852295, + "learning_rate": 9.438638858556893e-05, + "loss": 0.0394, + "mean_token_accuracy": 0.9892454147338867, + "num_tokens": 11847796.0, + "step": 1343 + }, + { + "entropy": 1.204954743385315, + "epoch": 4.854298642533936, + "grad_norm": 0.5535430312156677, + "learning_rate": 9.412502906594703e-05, + "loss": 0.0279, + "mean_token_accuracy": 0.9906836897134781, + "num_tokens": 11856597.0, + "step": 1344 + }, + { + "entropy": 1.239096075296402, + "epoch": 4.857918552036199, + "grad_norm": 0.43477335572242737, + "learning_rate": 9.3864379511187e-05, + "loss": 0.0282, + "mean_token_accuracy": 0.9896904081106186, + "num_tokens": 11864932.0, + "step": 1345 + }, + { + "entropy": 1.181561678647995, + "epoch": 4.861538461538462, + "grad_norm": 0.7839770317077637, + "learning_rate": 9.360444091126893e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9811979532241821, + "num_tokens": 11873696.0, + "step": 1346 + }, + { + "entropy": 1.2574385702610016, + "epoch": 4.8651583710407245, + "grad_norm": 0.8974226117134094, + "learning_rate": 9.334521425347285e-05, + "loss": 0.052, + "mean_token_accuracy": 0.9802359342575073, + "num_tokens": 11881907.0, + "step": 1347 + }, + { + "entropy": 1.2081020176410675, + "epoch": 4.868778280542987, + "grad_norm": 0.5048856735229492, + "learning_rate": 9.30867005223747e-05, + "loss": 0.0405, + "mean_token_accuracy": 0.9917744398117065, + "num_tokens": 11890265.0, + "step": 1348 + }, + { + "entropy": 1.1894653141498566, + "epoch": 4.872398190045249, + "grad_norm": 0.37338632345199585, + "learning_rate": 9.282890069984239e-05, + "loss": 0.0295, + "mean_token_accuracy": 0.9903924912214279, + "num_tokens": 11899137.0, + "step": 1349 + }, + { + "entropy": 1.1611916571855545, + "epoch": 4.876018099547512, + "grad_norm": 0.48497217893600464, + "learning_rate": 9.257181576503266e-05, + "loss": 0.0399, + "mean_token_accuracy": 0.9874439090490341, + "num_tokens": 11908436.0, + "step": 1350 + }, + { + "entropy": 1.1653602123260498, + "epoch": 4.879638009049774, + "grad_norm": 0.4942433834075928, + "learning_rate": 9.231544669438686e-05, + "loss": 0.0444, + "mean_token_accuracy": 0.9836005568504333, + "num_tokens": 11917077.0, + "step": 1351 + }, + { + "entropy": 1.170697033405304, + "epoch": 4.883257918552037, + "grad_norm": 0.4725663959980011, + "learning_rate": 9.205979446162726e-05, + "loss": 0.0285, + "mean_token_accuracy": 0.9913990944623947, + "num_tokens": 11925805.0, + "step": 1352 + }, + { + "entropy": 1.159929096698761, + "epoch": 4.886877828054299, + "grad_norm": 0.40604647994041443, + "learning_rate": 9.180486003775372e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9921021014451981, + "num_tokens": 11934655.0, + "step": 1353 + }, + { + "entropy": 1.2127621173858643, + "epoch": 4.8904977375565615, + "grad_norm": 0.5436980724334717, + "learning_rate": 9.155064439103966e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9923296123743057, + "num_tokens": 11943642.0, + "step": 1354 + }, + { + "entropy": 1.191734939813614, + "epoch": 4.894117647058824, + "grad_norm": 0.6527659893035889, + "learning_rate": 9.12971484870285e-05, + "loss": 0.0626, + "mean_token_accuracy": 0.9802115112543106, + "num_tokens": 11952498.0, + "step": 1355 + }, + { + "entropy": 1.161993145942688, + "epoch": 4.897737556561086, + "grad_norm": 0.44556480646133423, + "learning_rate": 9.104437328852997e-05, + "loss": 0.0351, + "mean_token_accuracy": 0.9911103397607803, + "num_tokens": 11961424.0, + "step": 1356 + }, + { + "entropy": 1.1761930584907532, + "epoch": 4.901357466063349, + "grad_norm": 0.3631705343723297, + "learning_rate": 9.079231975561655e-05, + "loss": 0.0402, + "mean_token_accuracy": 0.9864005595445633, + "num_tokens": 11970226.0, + "step": 1357 + }, + { + "entropy": 1.127123236656189, + "epoch": 4.904977375565611, + "grad_norm": 0.3652251064777374, + "learning_rate": 9.054098884561962e-05, + "loss": 0.0319, + "mean_token_accuracy": 0.9879902452230453, + "num_tokens": 11978695.0, + "step": 1358 + }, + { + "entropy": 1.1227464973926544, + "epoch": 4.908597285067874, + "grad_norm": 0.4769943058490753, + "learning_rate": 9.029038151312601e-05, + "loss": 0.0447, + "mean_token_accuracy": 0.9882356822490692, + "num_tokens": 11987609.0, + "step": 1359 + }, + { + "entropy": 1.067475363612175, + "epoch": 4.912217194570136, + "grad_norm": 0.35615095496177673, + "learning_rate": 9.004049870997432e-05, + "loss": 0.0419, + "mean_token_accuracy": 0.9882748872041702, + "num_tokens": 11996892.0, + "step": 1360 + }, + { + "entropy": 1.1167294383049011, + "epoch": 4.9158371040723985, + "grad_norm": 0.2417716383934021, + "learning_rate": 8.979134138525127e-05, + "loss": 0.0146, + "mean_token_accuracy": 0.9959355145692825, + "num_tokens": 12005729.0, + "step": 1361 + }, + { + "entropy": 1.1310055255889893, + "epoch": 4.919457013574661, + "grad_norm": 0.612522542476654, + "learning_rate": 8.954291048528816e-05, + "loss": 0.04, + "mean_token_accuracy": 0.9909398257732391, + "num_tokens": 12014648.0, + "step": 1362 + }, + { + "entropy": 1.1927096545696259, + "epoch": 4.923076923076923, + "grad_norm": 0.30201828479766846, + "learning_rate": 8.929520695365718e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.9962878674268723, + "num_tokens": 12023243.0, + "step": 1363 + }, + { + "entropy": 1.1450499594211578, + "epoch": 4.926696832579186, + "grad_norm": 0.45451363921165466, + "learning_rate": 8.904823173116795e-05, + "loss": 0.0342, + "mean_token_accuracy": 0.9906298071146011, + "num_tokens": 12032220.0, + "step": 1364 + }, + { + "entropy": 1.1808405816555023, + "epoch": 4.930316742081448, + "grad_norm": 0.5427790284156799, + "learning_rate": 8.880198575586377e-05, + "loss": 0.037, + "mean_token_accuracy": 0.9875056445598602, + "num_tokens": 12041159.0, + "step": 1365 + }, + { + "entropy": 1.196801632642746, + "epoch": 4.933936651583711, + "grad_norm": 0.5426363348960876, + "learning_rate": 8.855646996301831e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9816399365663528, + "num_tokens": 12049774.0, + "step": 1366 + }, + { + "entropy": 1.1708534061908722, + "epoch": 4.937556561085973, + "grad_norm": 0.4115329384803772, + "learning_rate": 8.831168528513182e-05, + "loss": 0.0271, + "mean_token_accuracy": 0.9913832694292068, + "num_tokens": 12058589.0, + "step": 1367 + }, + { + "entropy": 1.2116675972938538, + "epoch": 4.9411764705882355, + "grad_norm": 0.6414517760276794, + "learning_rate": 8.806763265192764e-05, + "loss": 0.0665, + "mean_token_accuracy": 0.9832449555397034, + "num_tokens": 12067158.0, + "step": 1368 + }, + { + "entropy": 1.1851932406425476, + "epoch": 4.944796380090498, + "grad_norm": 0.5957241654396057, + "learning_rate": 8.782431299034888e-05, + "loss": 0.057, + "mean_token_accuracy": 0.9863469302654266, + "num_tokens": 12076142.0, + "step": 1369 + }, + { + "entropy": 1.1860441267490387, + "epoch": 4.94841628959276, + "grad_norm": 0.4882605969905853, + "learning_rate": 8.758172722455456e-05, + "loss": 0.0352, + "mean_token_accuracy": 0.9856973141431808, + "num_tokens": 12085136.0, + "step": 1370 + }, + { + "entropy": 1.1902669072151184, + "epoch": 4.952036199095023, + "grad_norm": 0.4293587803840637, + "learning_rate": 8.733987627591634e-05, + "loss": 0.0351, + "mean_token_accuracy": 0.9872110784053802, + "num_tokens": 12093958.0, + "step": 1371 + }, + { + "entropy": 1.1657917201519012, + "epoch": 4.955656108597285, + "grad_norm": 0.4974465072154999, + "learning_rate": 8.709876106301494e-05, + "loss": 0.0422, + "mean_token_accuracy": 0.9845677465200424, + "num_tokens": 12103052.0, + "step": 1372 + }, + { + "entropy": 1.1982821822166443, + "epoch": 4.959276018099548, + "grad_norm": 0.4955083131790161, + "learning_rate": 8.685838250163667e-05, + "loss": 0.034, + "mean_token_accuracy": 0.9909563511610031, + "num_tokens": 12111490.0, + "step": 1373 + }, + { + "entropy": 1.1515125632286072, + "epoch": 4.96289592760181, + "grad_norm": 0.3922666907310486, + "learning_rate": 8.661874150476996e-05, + "loss": 0.0295, + "mean_token_accuracy": 0.9927395433187485, + "num_tokens": 12119907.0, + "step": 1374 + }, + { + "entropy": 1.1215294301509857, + "epoch": 4.9665158371040725, + "grad_norm": 0.4663509130477905, + "learning_rate": 8.637983898260185e-05, + "loss": 0.0462, + "mean_token_accuracy": 0.9870833456516266, + "num_tokens": 12128753.0, + "step": 1375 + }, + { + "entropy": 1.2042448818683624, + "epoch": 4.970135746606335, + "grad_norm": 0.37983185052871704, + "learning_rate": 8.614167584251458e-05, + "loss": 0.0183, + "mean_token_accuracy": 0.9940395504236221, + "num_tokens": 12137481.0, + "step": 1376 + }, + { + "entropy": 1.1789166033267975, + "epoch": 4.973755656108597, + "grad_norm": 0.6218724250793457, + "learning_rate": 8.5904252989082e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9865689128637314, + "num_tokens": 12146367.0, + "step": 1377 + }, + { + "entropy": 1.0459700524806976, + "epoch": 4.97737556561086, + "grad_norm": 0.3584859073162079, + "learning_rate": 8.566757132406655e-05, + "loss": 0.022, + "mean_token_accuracy": 0.9925664961338043, + "num_tokens": 12156304.0, + "step": 1378 + }, + { + "entropy": 1.1764253973960876, + "epoch": 4.980995475113122, + "grad_norm": 0.455917626619339, + "learning_rate": 8.543163174641523e-05, + "loss": 0.0306, + "mean_token_accuracy": 0.9904853105545044, + "num_tokens": 12165145.0, + "step": 1379 + }, + { + "entropy": 1.2412675023078918, + "epoch": 4.984615384615385, + "grad_norm": 0.5041368007659912, + "learning_rate": 8.519643515225658e-05, + "loss": 0.0365, + "mean_token_accuracy": 0.9898426681756973, + "num_tokens": 12173502.0, + "step": 1380 + }, + { + "entropy": 1.1146800816059113, + "epoch": 4.988235294117647, + "grad_norm": 0.30426302552223206, + "learning_rate": 8.496198243489743e-05, + "loss": 0.0205, + "mean_token_accuracy": 0.9935296326875687, + "num_tokens": 12182445.0, + "step": 1381 + }, + { + "entropy": 1.2658378779888153, + "epoch": 4.9918552036199095, + "grad_norm": 0.602274477481842, + "learning_rate": 8.472827448481894e-05, + "loss": 0.0435, + "mean_token_accuracy": 0.9829444736242294, + "num_tokens": 12191040.0, + "step": 1382 + }, + { + "entropy": 1.1924521923065186, + "epoch": 4.995475113122172, + "grad_norm": 0.5336406230926514, + "learning_rate": 8.449531218967363e-05, + "loss": 0.0237, + "mean_token_accuracy": 0.993032917380333, + "num_tokens": 12199468.0, + "step": 1383 + }, + { + "entropy": 1.2425891757011414, + "epoch": 4.999095022624434, + "grad_norm": 1.141570806503296, + "learning_rate": 8.426309643428217e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.9930049031972885, + "num_tokens": 12207979.0, + "step": 1384 + }, + { + "entropy": 1.0880640745162964, + "epoch": 5.0, + "grad_norm": 0.33857738971710205, + "learning_rate": 8.403162810062945e-05, + "loss": 0.0059, + "mean_token_accuracy": 1.0, + "num_tokens": 12208625.0, + "step": 1385 + }, + { + "epoch": 5.0, + "eval_entropy": 1.186980954030665, + "eval_loss": 0.13543975353240967, + "eval_mean_token_accuracy": 0.971454709526, + "eval_num_tokens": 12208625.0, + "eval_runtime": 116.009, + "eval_samples_per_second": 3.181, + "eval_steps_per_second": 1.06, + "step": 1385 + } + ], + "logging_steps": 1, + "max_steps": 1662, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6573609868029627e+18, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1385/training_args.bin b/checkpoint-1385/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..070a2de135e794840c49e066215a1c9f2e550d1f --- /dev/null +++ b/checkpoint-1385/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc271f94ce32216bd6f2ee9866fb7d62a0583bc7ee0c7fa953fa57c302729c6c +size 6289 diff --git a/checkpoint-1662/README.md b/checkpoint-1662/README.md new file mode 100644 index 0000000000000000000000000000000000000000..58a4061707bcc32db3b543936f6b650c01f3dccb --- /dev/null +++ b/checkpoint-1662/README.md @@ -0,0 +1,208 @@ +--- +base_model: openai/gpt-oss-20b +library_name: peft +tags: +- base_model:adapter:openai/gpt-oss-20b +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-1662/adapter_config.json b/checkpoint-1662/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..076480eaf349cc658de2eb00b26c7360a85f8f56 --- /dev/null +++ b/checkpoint-1662/adapter_config.json @@ -0,0 +1,53 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "openai/gpt-oss-20b", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "o_proj", + "q_proj" + ], + "target_parameters": [ + "7.mlp.experts.gate_up_proj", + "7.mlp.experts.down_proj", + "15.mlp.experts.gate_up_proj", + "15.mlp.experts.down_proj", + "23.mlp.experts.gate_up_proj", + "23.mlp.experts.down_proj" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1662/adapter_model.safetensors b/checkpoint-1662/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..441ba7da699c3189ce9a3b4c7b9665bb477011ee --- /dev/null +++ b/checkpoint-1662/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbb7e0563f99e663dc1295f65f2fd5fa33a2cf7adec8ea455a3bee433b491f61 +size 60189176 diff --git a/checkpoint-1662/chat_template.jinja b/checkpoint-1662/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc7bb11927d29f653ba2740f2db2c688fd77592f --- /dev/null +++ b/checkpoint-1662/chat_template.jinja @@ -0,0 +1,331 @@ +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". + #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties %} + {{- "(_: {\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {%- if param_spec.description %} + {{- "// " + param_spec.description + "\n" }} + {%- endif %} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- ",\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %} + {%- endif %} + {{- model_identity + "\n" }} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {{- "\n\n" }} + {%- endif %} + {%- if tools -%} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {#- Checks to ensure the messages are being passed in the format we expect #} + {%- if "content" in message %} + {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "thinking" in message %} + {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "tool_calls" in message %} + {#- We need very careful handling here - we want to drop the tool call analysis message if the model #} + {#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #} + {#- when we render CoT/analysis messages in inference. #} + {%- set future_final_message = namespace(found=false) %} + {%- for future_message in loop_messages[loop.index:] %} + {%- if future_message.role == 'assistant' and "tool_calls" not in future_message %} + {%- set future_final_message.found = true %} + {%- endif %} + {%- endfor %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content and message.thinking %} + {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }} + {%- elif message.content and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- elif message.thinking and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." + tool_call.name + "<|channel|>commentary " }} + {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {%- if "thinking" in message %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- else %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." + last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- elif message.role == 'user' -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1662/optimizer.pt b/checkpoint-1662/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..028ba4e5381e180f83bcedb3a55bf0d4fb374407 --- /dev/null +++ b/checkpoint-1662/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc7b01a9488a720ed11a59bedf239c88ef42607fedf0ec9998927354ad6a2bb3 +size 120498699 diff --git a/checkpoint-1662/rng_state.pth b/checkpoint-1662/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0a59c7834bf1be9f4f699331266ac781573bb8ff --- /dev/null +++ b/checkpoint-1662/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1255b82ec9f462fef6f8c2f4a8ee876c4b6d998a53c99546a4eddcc1aabaefc7 +size 14645 diff --git a/checkpoint-1662/scheduler.pt b/checkpoint-1662/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..df9fd10ba7bdc480574b11ce1fa112dc5bfd729f --- /dev/null +++ b/checkpoint-1662/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eaaaae63de81e7b46c65e4c5fc1abd835a35114669dbf898405cd571daed1cf +size 1465 diff --git a/checkpoint-1662/special_tokens_map.json b/checkpoint-1662/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c47e282982a9c6856832947a72ded329fad2e8c --- /dev/null +++ b/checkpoint-1662/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|return|>" +} diff --git a/checkpoint-1662/tokenizer.json b/checkpoint-1662/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/checkpoint-1662/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/checkpoint-1662/tokenizer_config.json b/checkpoint-1662/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e86f6faa71de0fc3afe47ea8984da9e6138c031c --- /dev/null +++ b/checkpoint-1662/tokenizer_config.json @@ -0,0 +1,183 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|return|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-1662/trainer_state.json b/checkpoint-1662/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1f5e937551b576f7e184e1964326b1fe777db8f5 --- /dev/null +++ b/checkpoint-1662/trainer_state.json @@ -0,0 +1,16720 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 1662, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.358862280845642, + "epoch": 0.0036199095022624436, + "grad_norm": 2.292628288269043, + "learning_rate": 0.0, + "loss": 0.7311, + "mean_token_accuracy": 0.8534883409738541, + "num_tokens": 9316.0, + "step": 1 + }, + { + "entropy": 2.674945294857025, + "epoch": 0.007239819004524887, + "grad_norm": 3.8950836658477783, + "learning_rate": 1.0219999999999999e-05, + "loss": 1.0621, + "mean_token_accuracy": 0.8183160275220871, + "num_tokens": 17707.0, + "step": 2 + }, + { + "entropy": 2.4915525913238525, + "epoch": 0.01085972850678733, + "grad_norm": 2.792142868041992, + "learning_rate": 2.0439999999999997e-05, + "loss": 0.8448, + "mean_token_accuracy": 0.8489587754011154, + "num_tokens": 26783.0, + "step": 3 + }, + { + "entropy": 2.525622010231018, + "epoch": 0.014479638009049774, + "grad_norm": 2.7071900367736816, + "learning_rate": 3.0659999999999994e-05, + "loss": 0.8847, + "mean_token_accuracy": 0.8486668318510056, + "num_tokens": 35947.0, + "step": 4 + }, + { + "entropy": 2.588509976863861, + "epoch": 0.01809954751131222, + "grad_norm": 2.981574773788452, + "learning_rate": 4.0879999999999995e-05, + "loss": 1.0783, + "mean_token_accuracy": 0.8135111033916473, + "num_tokens": 44505.0, + "step": 5 + }, + { + "entropy": 2.662865400314331, + "epoch": 0.02171945701357466, + "grad_norm": 2.629283905029297, + "learning_rate": 5.1099999999999995e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.8152717798948288, + "num_tokens": 53140.0, + "step": 6 + }, + { + "entropy": 2.6662243604660034, + "epoch": 0.025339366515837104, + "grad_norm": 2.730058431625366, + "learning_rate": 6.131999999999999e-05, + "loss": 0.6982, + "mean_token_accuracy": 0.8552135527133942, + "num_tokens": 61932.0, + "step": 7 + }, + { + "entropy": 2.661384105682373, + "epoch": 0.02895927601809955, + "grad_norm": 2.562839984893799, + "learning_rate": 7.154e-05, + "loss": 0.7296, + "mean_token_accuracy": 0.8579540699720383, + "num_tokens": 70973.0, + "step": 8 + }, + { + "entropy": 2.7889368534088135, + "epoch": 0.03257918552036199, + "grad_norm": 2.8640544414520264, + "learning_rate": 8.175999999999999e-05, + "loss": 0.5965, + "mean_token_accuracy": 0.8638457208871841, + "num_tokens": 79977.0, + "step": 9 + }, + { + "entropy": 2.811532199382782, + "epoch": 0.03619909502262444, + "grad_norm": 2.6199426651000977, + "learning_rate": 9.197999999999998e-05, + "loss": 0.4819, + "mean_token_accuracy": 0.8786454051733017, + "num_tokens": 88915.0, + "step": 10 + }, + { + "entropy": 2.941167712211609, + "epoch": 0.039819004524886875, + "grad_norm": 1.2497272491455078, + "learning_rate": 0.00010219999999999999, + "loss": 0.7192, + "mean_token_accuracy": 0.841494083404541, + "num_tokens": 97749.0, + "step": 11 + }, + { + "entropy": 3.0547962188720703, + "epoch": 0.04343891402714932, + "grad_norm": 1.436136245727539, + "learning_rate": 0.00011241999999999998, + "loss": 0.5908, + "mean_token_accuracy": 0.8657624870538712, + "num_tokens": 106048.0, + "step": 12 + }, + { + "entropy": 2.9914053082466125, + "epoch": 0.047058823529411764, + "grad_norm": 0.9903654456138611, + "learning_rate": 0.00012263999999999998, + "loss": 0.4008, + "mean_token_accuracy": 0.8985499292612076, + "num_tokens": 115216.0, + "step": 13 + }, + { + "entropy": 3.1867465376853943, + "epoch": 0.05067873303167421, + "grad_norm": 1.019572377204895, + "learning_rate": 0.00013286, + "loss": 0.5062, + "mean_token_accuracy": 0.8893097043037415, + "num_tokens": 124040.0, + "step": 14 + }, + { + "entropy": 3.2431325912475586, + "epoch": 0.05429864253393665, + "grad_norm": 1.2394084930419922, + "learning_rate": 0.00014308, + "loss": 0.361, + "mean_token_accuracy": 0.9009967148303986, + "num_tokens": 132447.0, + "step": 15 + }, + { + "entropy": 3.1858643889427185, + "epoch": 0.0579185520361991, + "grad_norm": 0.9859603643417358, + "learning_rate": 0.00015329999999999999, + "loss": 0.4498, + "mean_token_accuracy": 0.887280747294426, + "num_tokens": 141228.0, + "step": 16 + }, + { + "entropy": 3.5029141902923584, + "epoch": 0.06153846153846154, + "grad_norm": 1.453957438468933, + "learning_rate": 0.00016351999999999998, + "loss": 0.4949, + "mean_token_accuracy": 0.888081505894661, + "num_tokens": 149789.0, + "step": 17 + }, + { + "entropy": 3.4572895765304565, + "epoch": 0.06515837104072399, + "grad_norm": 1.390377402305603, + "learning_rate": 0.00017374, + "loss": 0.5449, + "mean_token_accuracy": 0.8745045810937881, + "num_tokens": 157813.0, + "step": 18 + }, + { + "entropy": 3.3081750869750977, + "epoch": 0.06877828054298643, + "grad_norm": 1.1171791553497314, + "learning_rate": 0.00018395999999999997, + "loss": 0.4786, + "mean_token_accuracy": 0.8893420845270157, + "num_tokens": 166315.0, + "step": 19 + }, + { + "entropy": 3.3776715993881226, + "epoch": 0.07239819004524888, + "grad_norm": 1.5567998886108398, + "learning_rate": 0.00019418, + "loss": 0.3669, + "mean_token_accuracy": 0.9146632701158524, + "num_tokens": 175207.0, + "step": 20 + }, + { + "entropy": 3.2677870988845825, + "epoch": 0.0760180995475113, + "grad_norm": 1.7404611110687256, + "learning_rate": 0.00020439999999999998, + "loss": 0.5287, + "mean_token_accuracy": 0.8777483552694321, + "num_tokens": 183833.0, + "step": 21 + }, + { + "entropy": 3.313201069831848, + "epoch": 0.07963800904977375, + "grad_norm": 1.0836979150772095, + "learning_rate": 0.00021461999999999997, + "loss": 0.3014, + "mean_token_accuracy": 0.9215261936187744, + "num_tokens": 192591.0, + "step": 22 + }, + { + "entropy": 3.208672881126404, + "epoch": 0.0832579185520362, + "grad_norm": 1.2197301387786865, + "learning_rate": 0.00022483999999999997, + "loss": 0.4401, + "mean_token_accuracy": 0.9031257778406143, + "num_tokens": 201372.0, + "step": 23 + }, + { + "entropy": 3.1830995082855225, + "epoch": 0.08687782805429864, + "grad_norm": 1.2422229051589966, + "learning_rate": 0.00023506, + "loss": 0.5144, + "mean_token_accuracy": 0.8915928155183792, + "num_tokens": 210348.0, + "step": 24 + }, + { + "entropy": 3.085207223892212, + "epoch": 0.09049773755656108, + "grad_norm": 0.8987624049186707, + "learning_rate": 0.00024527999999999996, + "loss": 0.3253, + "mean_token_accuracy": 0.9221627116203308, + "num_tokens": 219131.0, + "step": 25 + }, + { + "entropy": 3.026031017303467, + "epoch": 0.09411764705882353, + "grad_norm": 1.0273475646972656, + "learning_rate": 0.0002555, + "loss": 0.3495, + "mean_token_accuracy": 0.9147634357213974, + "num_tokens": 228292.0, + "step": 26 + }, + { + "entropy": 3.0420032739639282, + "epoch": 0.09773755656108597, + "grad_norm": 1.0590945482254028, + "learning_rate": 0.00026572, + "loss": 0.4495, + "mean_token_accuracy": 0.9019353687763214, + "num_tokens": 236942.0, + "step": 27 + }, + { + "entropy": 3.0469263792037964, + "epoch": 0.10135746606334842, + "grad_norm": 0.9584959745407104, + "learning_rate": 0.00027594, + "loss": 0.405, + "mean_token_accuracy": 0.9216890782117844, + "num_tokens": 245543.0, + "step": 28 + }, + { + "entropy": 2.92683744430542, + "epoch": 0.10497737556561086, + "grad_norm": 0.8826628923416138, + "learning_rate": 0.00028616, + "loss": 0.4004, + "mean_token_accuracy": 0.9173285663127899, + "num_tokens": 254264.0, + "step": 29 + }, + { + "entropy": 3.0086968541145325, + "epoch": 0.1085972850678733, + "grad_norm": 0.8521863222122192, + "learning_rate": 0.00029637999999999995, + "loss": 0.2876, + "mean_token_accuracy": 0.9335231184959412, + "num_tokens": 263143.0, + "step": 30 + }, + { + "entropy": 2.9086623191833496, + "epoch": 0.11221719457013575, + "grad_norm": 0.7830919623374939, + "learning_rate": 0.00030659999999999997, + "loss": 0.548, + "mean_token_accuracy": 0.8831343650817871, + "num_tokens": 272055.0, + "step": 31 + }, + { + "entropy": 2.9730575680732727, + "epoch": 0.1158371040723982, + "grad_norm": 0.7217472195625305, + "learning_rate": 0.00031682, + "loss": 0.3564, + "mean_token_accuracy": 0.9119151830673218, + "num_tokens": 280971.0, + "step": 32 + }, + { + "entropy": 3.081720530986786, + "epoch": 0.11945701357466064, + "grad_norm": 0.8697704076766968, + "learning_rate": 0.00032703999999999996, + "loss": 0.334, + "mean_token_accuracy": 0.9234935492277145, + "num_tokens": 289449.0, + "step": 33 + }, + { + "entropy": 3.1043431162834167, + "epoch": 0.12307692307692308, + "grad_norm": 0.7962514758110046, + "learning_rate": 0.00033726, + "loss": 0.1602, + "mean_token_accuracy": 0.9554370939731598, + "num_tokens": 297804.0, + "step": 34 + }, + { + "entropy": 3.0275490283966064, + "epoch": 0.12669683257918551, + "grad_norm": 0.5887104272842407, + "learning_rate": 0.00034748, + "loss": 0.2254, + "mean_token_accuracy": 0.9491932094097137, + "num_tokens": 306589.0, + "step": 35 + }, + { + "entropy": 3.099652886390686, + "epoch": 0.13031674208144797, + "grad_norm": 0.894397497177124, + "learning_rate": 0.00035769999999999997, + "loss": 0.6397, + "mean_token_accuracy": 0.8802188038825989, + "num_tokens": 315534.0, + "step": 36 + }, + { + "entropy": 3.0312134623527527, + "epoch": 0.1339366515837104, + "grad_norm": 0.6374682188034058, + "learning_rate": 0.00036791999999999993, + "loss": 0.2183, + "mean_token_accuracy": 0.9478497952222824, + "num_tokens": 324492.0, + "step": 37 + }, + { + "entropy": 3.28497713804245, + "epoch": 0.13755656108597286, + "grad_norm": 0.6740968823432922, + "learning_rate": 0.00037813999999999995, + "loss": 0.3619, + "mean_token_accuracy": 0.9288723170757294, + "num_tokens": 333195.0, + "step": 38 + }, + { + "entropy": 3.1478323340415955, + "epoch": 0.1411764705882353, + "grad_norm": 0.7235494256019592, + "learning_rate": 0.00038836, + "loss": 0.324, + "mean_token_accuracy": 0.9179254025220871, + "num_tokens": 342028.0, + "step": 39 + }, + { + "entropy": 3.279879152774811, + "epoch": 0.14479638009049775, + "grad_norm": 0.7512595653533936, + "learning_rate": 0.00039858, + "loss": 0.4804, + "mean_token_accuracy": 0.889826312661171, + "num_tokens": 350902.0, + "step": 40 + }, + { + "entropy": 3.173546612262726, + "epoch": 0.14841628959276018, + "grad_norm": 0.6978861689567566, + "learning_rate": 0.00040879999999999996, + "loss": 0.3442, + "mean_token_accuracy": 0.9205169230699539, + "num_tokens": 359787.0, + "step": 41 + }, + { + "entropy": 3.2385765314102173, + "epoch": 0.1520361990950226, + "grad_norm": 0.8108944892883301, + "learning_rate": 0.00041901999999999993, + "loss": 0.4223, + "mean_token_accuracy": 0.8979178965091705, + "num_tokens": 368426.0, + "step": 42 + }, + { + "entropy": 3.146568477153778, + "epoch": 0.15565610859728507, + "grad_norm": 0.5847787261009216, + "learning_rate": 0.00042923999999999995, + "loss": 0.1953, + "mean_token_accuracy": 0.9556037336587906, + "num_tokens": 377349.0, + "step": 43 + }, + { + "entropy": 3.066233277320862, + "epoch": 0.1592760180995475, + "grad_norm": 0.7887329459190369, + "learning_rate": 0.00043945999999999997, + "loss": 0.6815, + "mean_token_accuracy": 0.8654293268918991, + "num_tokens": 386603.0, + "step": 44 + }, + { + "entropy": 3.1745981574058533, + "epoch": 0.16289592760180996, + "grad_norm": 0.7280165553092957, + "learning_rate": 0.00044967999999999994, + "loss": 0.1932, + "mean_token_accuracy": 0.9479279220104218, + "num_tokens": 395070.0, + "step": 45 + }, + { + "entropy": 3.1094446182250977, + "epoch": 0.1665158371040724, + "grad_norm": 0.6453448534011841, + "learning_rate": 0.00045989999999999996, + "loss": 0.2608, + "mean_token_accuracy": 0.9249396026134491, + "num_tokens": 403651.0, + "step": 46 + }, + { + "entropy": 2.9050925970077515, + "epoch": 0.17013574660633485, + "grad_norm": 0.6689278483390808, + "learning_rate": 0.00047012, + "loss": 0.4489, + "mean_token_accuracy": 0.898686870932579, + "num_tokens": 412898.0, + "step": 47 + }, + { + "entropy": 3.2239145040512085, + "epoch": 0.17375565610859728, + "grad_norm": 1.0014020204544067, + "learning_rate": 0.00048033999999999994, + "loss": 0.3234, + "mean_token_accuracy": 0.9231891483068466, + "num_tokens": 421420.0, + "step": 48 + }, + { + "entropy": 3.035899817943573, + "epoch": 0.17737556561085974, + "grad_norm": 0.6415768265724182, + "learning_rate": 0.0004905599999999999, + "loss": 0.2259, + "mean_token_accuracy": 0.9447792917490005, + "num_tokens": 430258.0, + "step": 49 + }, + { + "entropy": 3.057477653026581, + "epoch": 0.18099547511312217, + "grad_norm": 0.6042271256446838, + "learning_rate": 0.0005007799999999999, + "loss": 0.2228, + "mean_token_accuracy": 0.9473378211259842, + "num_tokens": 439593.0, + "step": 50 + }, + { + "entropy": 2.8375911116600037, + "epoch": 0.18461538461538463, + "grad_norm": 0.739811897277832, + "learning_rate": 0.000511, + "loss": 0.3623, + "mean_token_accuracy": 0.9050924181938171, + "num_tokens": 449056.0, + "step": 51 + }, + { + "entropy": 2.9926682114601135, + "epoch": 0.18823529411764706, + "grad_norm": 0.6637321710586548, + "learning_rate": 0.0005109995633102972, + "loss": 0.2924, + "mean_token_accuracy": 0.9397273659706116, + "num_tokens": 457677.0, + "step": 52 + }, + { + "entropy": 2.7932987809181213, + "epoch": 0.19185520361990951, + "grad_norm": 0.5666584372520447, + "learning_rate": 0.0005109982532428477, + "loss": 0.2055, + "mean_token_accuracy": 0.9385408014059067, + "num_tokens": 466969.0, + "step": 53 + }, + { + "entropy": 2.765812337398529, + "epoch": 0.19547511312217195, + "grad_norm": 0.7875120639801025, + "learning_rate": 0.0005109960698026271, + "loss": 0.4549, + "mean_token_accuracy": 0.9052814990282059, + "num_tokens": 476285.0, + "step": 54 + }, + { + "entropy": 2.884207248687744, + "epoch": 0.19909502262443438, + "grad_norm": 0.7538661956787109, + "learning_rate": 0.0005109930129979285, + "loss": 0.3751, + "mean_token_accuracy": 0.9210246652364731, + "num_tokens": 484668.0, + "step": 55 + }, + { + "entropy": 2.779718518257141, + "epoch": 0.20271493212669683, + "grad_norm": 0.8069296479225159, + "learning_rate": 0.0005109890828403621, + "loss": 0.3664, + "mean_token_accuracy": 0.9219843596220016, + "num_tokens": 493292.0, + "step": 56 + }, + { + "entropy": 2.841543674468994, + "epoch": 0.20633484162895926, + "grad_norm": 0.5545904636383057, + "learning_rate": 0.0005109842793448548, + "loss": 0.1973, + "mean_token_accuracy": 0.9547395706176758, + "num_tokens": 501973.0, + "step": 57 + }, + { + "entropy": 2.8180030584335327, + "epoch": 0.20995475113122172, + "grad_norm": 1.015456199645996, + "learning_rate": 0.0005109786025296513, + "loss": 0.6019, + "mean_token_accuracy": 0.88613361120224, + "num_tokens": 510840.0, + "step": 58 + }, + { + "entropy": 2.7450912594795227, + "epoch": 0.21357466063348415, + "grad_norm": 0.6784740686416626, + "learning_rate": 0.0005109720524163127, + "loss": 0.2868, + "mean_token_accuracy": 0.9295425117015839, + "num_tokens": 519656.0, + "step": 59 + }, + { + "entropy": 2.822400987148285, + "epoch": 0.2171945701357466, + "grad_norm": 0.8780149817466736, + "learning_rate": 0.000510964629029717, + "loss": 0.4371, + "mean_token_accuracy": 0.9089596569538116, + "num_tokens": 528105.0, + "step": 60 + }, + { + "entropy": 2.522100865840912, + "epoch": 0.22081447963800904, + "grad_norm": 0.51394122838974, + "learning_rate": 0.0005109563323980594, + "loss": 0.2509, + "mean_token_accuracy": 0.941976860165596, + "num_tokens": 537707.0, + "step": 61 + }, + { + "entropy": 2.6596657633781433, + "epoch": 0.2244343891402715, + "grad_norm": 0.6359816789627075, + "learning_rate": 0.0005109471625528516, + "loss": 0.3685, + "mean_token_accuracy": 0.9191890209913254, + "num_tokens": 546517.0, + "step": 62 + }, + { + "entropy": 2.800311803817749, + "epoch": 0.22805429864253393, + "grad_norm": 0.6862941980361938, + "learning_rate": 0.0005109371195289215, + "loss": 0.2457, + "mean_token_accuracy": 0.9330879002809525, + "num_tokens": 555493.0, + "step": 63 + }, + { + "entropy": 2.7235344648361206, + "epoch": 0.2316742081447964, + "grad_norm": 1.0464682579040527, + "learning_rate": 0.0005109262033644142, + "loss": 0.4417, + "mean_token_accuracy": 0.8957678377628326, + "num_tokens": 564255.0, + "step": 64 + }, + { + "entropy": 2.6643534302711487, + "epoch": 0.23529411764705882, + "grad_norm": 1.0790019035339355, + "learning_rate": 0.0005109144141007903, + "loss": 0.4947, + "mean_token_accuracy": 0.8889007717370987, + "num_tokens": 573401.0, + "step": 65 + }, + { + "entropy": 2.760925054550171, + "epoch": 0.23891402714932128, + "grad_norm": 0.7957189679145813, + "learning_rate": 0.0005109017517828273, + "loss": 0.2259, + "mean_token_accuracy": 0.944578230381012, + "num_tokens": 581905.0, + "step": 66 + }, + { + "entropy": 2.7048792839050293, + "epoch": 0.2425339366515837, + "grad_norm": 0.9530714750289917, + "learning_rate": 0.0005108882164586181, + "loss": 0.3122, + "mean_token_accuracy": 0.9257418513298035, + "num_tokens": 590802.0, + "step": 67 + }, + { + "entropy": 2.6733291149139404, + "epoch": 0.24615384615384617, + "grad_norm": 0.8295993208885193, + "learning_rate": 0.0005108738081795716, + "loss": 0.3701, + "mean_token_accuracy": 0.898589238524437, + "num_tokens": 599279.0, + "step": 68 + }, + { + "entropy": 2.5613606572151184, + "epoch": 0.2497737556561086, + "grad_norm": 0.6205935478210449, + "learning_rate": 0.0005108585270004123, + "loss": 0.4372, + "mean_token_accuracy": 0.9116007685661316, + "num_tokens": 608107.0, + "step": 69 + }, + { + "entropy": 2.458296835422516, + "epoch": 0.25339366515837103, + "grad_norm": 0.7629838585853577, + "learning_rate": 0.0005108423729791799, + "loss": 0.2307, + "mean_token_accuracy": 0.9386163502931595, + "num_tokens": 616881.0, + "step": 70 + }, + { + "entropy": 2.4176695346832275, + "epoch": 0.25701357466063346, + "grad_norm": 0.902400016784668, + "learning_rate": 0.0005108253461772298, + "loss": 0.2853, + "mean_token_accuracy": 0.9237343072891235, + "num_tokens": 625323.0, + "step": 71 + }, + { + "entropy": 2.2265281677246094, + "epoch": 0.26063348416289595, + "grad_norm": 0.7744383811950684, + "learning_rate": 0.0005108074466592316, + "loss": 0.2435, + "mean_token_accuracy": 0.9508260935544968, + "num_tokens": 634260.0, + "step": 72 + }, + { + "entropy": 2.1855952441692352, + "epoch": 0.2642533936651584, + "grad_norm": 0.8615190386772156, + "learning_rate": 0.0005107886744931702, + "loss": 0.3323, + "mean_token_accuracy": 0.9276078194379807, + "num_tokens": 643235.0, + "step": 73 + }, + { + "entropy": 2.179121494293213, + "epoch": 0.2678733031674208, + "grad_norm": 0.8953279256820679, + "learning_rate": 0.0005107690297503444, + "loss": 0.2384, + "mean_token_accuracy": 0.9425230622291565, + "num_tokens": 652032.0, + "step": 74 + }, + { + "entropy": 2.1565526127815247, + "epoch": 0.27149321266968324, + "grad_norm": 0.6830486059188843, + "learning_rate": 0.0005107485125053678, + "loss": 0.2759, + "mean_token_accuracy": 0.9360661953687668, + "num_tokens": 660978.0, + "step": 75 + }, + { + "entropy": 2.0900665521621704, + "epoch": 0.2751131221719457, + "grad_norm": 0.786665141582489, + "learning_rate": 0.0005107271228361672, + "loss": 0.4061, + "mean_token_accuracy": 0.910009115934372, + "num_tokens": 669817.0, + "step": 76 + }, + { + "entropy": 2.1311859488487244, + "epoch": 0.27873303167420815, + "grad_norm": 0.6399909853935242, + "learning_rate": 0.0005107048608239836, + "loss": 0.272, + "mean_token_accuracy": 0.9424714297056198, + "num_tokens": 678469.0, + "step": 77 + }, + { + "entropy": 2.059997320175171, + "epoch": 0.2823529411764706, + "grad_norm": 0.8114754557609558, + "learning_rate": 0.0005106817265533706, + "loss": 0.4029, + "mean_token_accuracy": 0.9037660360336304, + "num_tokens": 687261.0, + "step": 78 + }, + { + "entropy": 1.9725019037723541, + "epoch": 0.285972850678733, + "grad_norm": 0.9420941472053528, + "learning_rate": 0.0005106577201121952, + "loss": 0.535, + "mean_token_accuracy": 0.8996377140283585, + "num_tokens": 695941.0, + "step": 79 + }, + { + "entropy": 1.9951164424419403, + "epoch": 0.2895927601809955, + "grad_norm": 0.6476142406463623, + "learning_rate": 0.0005106328415916372, + "loss": 0.2242, + "mean_token_accuracy": 0.941379725933075, + "num_tokens": 704643.0, + "step": 80 + }, + { + "entropy": 1.8962564170360565, + "epoch": 0.29321266968325793, + "grad_norm": 0.5974630117416382, + "learning_rate": 0.0005106070910861881, + "loss": 0.2934, + "mean_token_accuracy": 0.9217697530984879, + "num_tokens": 713605.0, + "step": 81 + }, + { + "entropy": 1.9781515896320343, + "epoch": 0.29683257918552036, + "grad_norm": 0.8755478262901306, + "learning_rate": 0.0005105804686936518, + "loss": 0.4551, + "mean_token_accuracy": 0.9051328897476196, + "num_tokens": 722385.0, + "step": 82 + }, + { + "entropy": 1.9892418384552002, + "epoch": 0.3004524886877828, + "grad_norm": 0.6887345314025879, + "learning_rate": 0.0005105529745151433, + "loss": 0.244, + "mean_token_accuracy": 0.9261117279529572, + "num_tokens": 730962.0, + "step": 83 + }, + { + "entropy": 2.0053181648254395, + "epoch": 0.3040723981900452, + "grad_norm": 0.6930885910987854, + "learning_rate": 0.0005105246086550893, + "loss": 0.3155, + "mean_token_accuracy": 0.9206147193908691, + "num_tokens": 739499.0, + "step": 84 + }, + { + "entropy": 1.9716475903987885, + "epoch": 0.3076923076923077, + "grad_norm": 0.5049461722373962, + "learning_rate": 0.0005104953712212266, + "loss": 0.2215, + "mean_token_accuracy": 0.9608763605356216, + "num_tokens": 748604.0, + "step": 85 + }, + { + "entropy": 1.9186978042125702, + "epoch": 0.31131221719457014, + "grad_norm": 0.5756685733795166, + "learning_rate": 0.000510465262324603, + "loss": 0.2658, + "mean_token_accuracy": 0.9372887462377548, + "num_tokens": 757919.0, + "step": 86 + }, + { + "entropy": 1.9738290905952454, + "epoch": 0.31493212669683257, + "grad_norm": 0.6163789629936218, + "learning_rate": 0.0005104342820795758, + "loss": 0.2472, + "mean_token_accuracy": 0.9430449157953262, + "num_tokens": 766708.0, + "step": 87 + }, + { + "entropy": 2.1927571892738342, + "epoch": 0.318552036199095, + "grad_norm": 0.7953162789344788, + "learning_rate": 0.0005104024306038119, + "loss": 0.261, + "mean_token_accuracy": 0.9425829648971558, + "num_tokens": 774601.0, + "step": 88 + }, + { + "entropy": 2.043731451034546, + "epoch": 0.3221719457013575, + "grad_norm": 0.8098088502883911, + "learning_rate": 0.0005103697080182872, + "loss": 0.3126, + "mean_token_accuracy": 0.9158089309930801, + "num_tokens": 783170.0, + "step": 89 + }, + { + "entropy": 1.9801572561264038, + "epoch": 0.3257918552036199, + "grad_norm": 0.5227240920066833, + "learning_rate": 0.0005103361144472864, + "loss": 0.1291, + "mean_token_accuracy": 0.9666071832180023, + "num_tokens": 791769.0, + "step": 90 + }, + { + "entropy": 1.9553790986537933, + "epoch": 0.32941176470588235, + "grad_norm": 0.7819464206695557, + "learning_rate": 0.0005103016500184022, + "loss": 0.531, + "mean_token_accuracy": 0.8817111849784851, + "num_tokens": 800824.0, + "step": 91 + }, + { + "entropy": 1.9291303753852844, + "epoch": 0.3330316742081448, + "grad_norm": 0.7178757190704346, + "learning_rate": 0.0005102663148625347, + "loss": 0.3301, + "mean_token_accuracy": 0.9357631802558899, + "num_tokens": 809347.0, + "step": 92 + }, + { + "entropy": 1.9846041798591614, + "epoch": 0.33665158371040727, + "grad_norm": 1.316636085510254, + "learning_rate": 0.0005102301091138916, + "loss": 0.4241, + "mean_token_accuracy": 0.8993304669857025, + "num_tokens": 817174.0, + "step": 93 + }, + { + "entropy": 1.814637303352356, + "epoch": 0.3402714932126697, + "grad_norm": 0.5486414432525635, + "learning_rate": 0.0005101930329099865, + "loss": 0.116, + "mean_token_accuracy": 0.9674727618694305, + "num_tokens": 826177.0, + "step": 94 + }, + { + "entropy": 1.9128066003322601, + "epoch": 0.3438914027149321, + "grad_norm": 0.620303750038147, + "learning_rate": 0.00051015508639164, + "loss": 0.1833, + "mean_token_accuracy": 0.9569521993398666, + "num_tokens": 835409.0, + "step": 95 + }, + { + "entropy": 1.7541870176792145, + "epoch": 0.34751131221719456, + "grad_norm": 0.8337438702583313, + "learning_rate": 0.0005101162697029776, + "loss": 0.3327, + "mean_token_accuracy": 0.9193180054426193, + "num_tokens": 844692.0, + "step": 96 + }, + { + "entropy": 1.8255240619182587, + "epoch": 0.351131221719457, + "grad_norm": 0.877780556678772, + "learning_rate": 0.00051007658299143, + "loss": 0.2106, + "mean_token_accuracy": 0.9527023881673813, + "num_tokens": 853309.0, + "step": 97 + }, + { + "entropy": 1.8611579239368439, + "epoch": 0.3547511312217195, + "grad_norm": 1.0667716264724731, + "learning_rate": 0.0005100360264077325, + "loss": 0.3196, + "mean_token_accuracy": 0.9195879399776459, + "num_tokens": 861859.0, + "step": 98 + }, + { + "entropy": 1.821915864944458, + "epoch": 0.3583710407239819, + "grad_norm": 0.8400309681892395, + "learning_rate": 0.0005099946001059241, + "loss": 0.4036, + "mean_token_accuracy": 0.8951036781072617, + "num_tokens": 871060.0, + "step": 99 + }, + { + "entropy": 1.7648265063762665, + "epoch": 0.36199095022624433, + "grad_norm": 1.1391404867172241, + "learning_rate": 0.0005099523042433472, + "loss": 0.389, + "mean_token_accuracy": 0.901309460401535, + "num_tokens": 880593.0, + "step": 100 + }, + { + "entropy": 1.8506875336170197, + "epoch": 0.36561085972850677, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.000509909138980647, + "loss": 0.2504, + "mean_token_accuracy": 0.9384842216968536, + "num_tokens": 889739.0, + "step": 101 + }, + { + "entropy": 1.9311015605926514, + "epoch": 0.36923076923076925, + "grad_norm": 0.9677391052246094, + "learning_rate": 0.0005098651044817704, + "loss": 0.6953, + "mean_token_accuracy": 0.8752655684947968, + "num_tokens": 898992.0, + "step": 102 + }, + { + "entropy": 1.9590983986854553, + "epoch": 0.3728506787330317, + "grad_norm": 0.6364567279815674, + "learning_rate": 0.0005098202009139663, + "loss": 0.4318, + "mean_token_accuracy": 0.9056479930877686, + "num_tokens": 908225.0, + "step": 103 + }, + { + "entropy": 1.9455370008945465, + "epoch": 0.3764705882352941, + "grad_norm": 0.6747863292694092, + "learning_rate": 0.0005097744284477839, + "loss": 0.244, + "mean_token_accuracy": 0.9428392052650452, + "num_tokens": 917134.0, + "step": 104 + }, + { + "entropy": 1.8632825911045074, + "epoch": 0.38009049773755654, + "grad_norm": 0.5705651044845581, + "learning_rate": 0.0005097277872570731, + "loss": 0.2508, + "mean_token_accuracy": 0.9325222969055176, + "num_tokens": 926573.0, + "step": 105 + }, + { + "entropy": 1.9370323717594147, + "epoch": 0.38371040723981903, + "grad_norm": 0.6298627853393555, + "learning_rate": 0.000509680277518983, + "loss": 0.2481, + "mean_token_accuracy": 0.9281332045793533, + "num_tokens": 935853.0, + "step": 106 + }, + { + "entropy": 2.0217572450637817, + "epoch": 0.38733031674208146, + "grad_norm": 0.5434353947639465, + "learning_rate": 0.0005096318994139617, + "loss": 0.1809, + "mean_token_accuracy": 0.9592084139585495, + "num_tokens": 944279.0, + "step": 107 + }, + { + "entropy": 1.9619770646095276, + "epoch": 0.3909502262443439, + "grad_norm": 0.6959638595581055, + "learning_rate": 0.0005095826531257552, + "loss": 0.1376, + "mean_token_accuracy": 0.9608310014009476, + "num_tokens": 953336.0, + "step": 108 + }, + { + "entropy": 2.12511146068573, + "epoch": 0.3945701357466063, + "grad_norm": 1.0152848958969116, + "learning_rate": 0.0005095325388414074, + "loss": 0.4382, + "mean_token_accuracy": 0.915201798081398, + "num_tokens": 962002.0, + "step": 109 + }, + { + "entropy": 2.0171878039836884, + "epoch": 0.39819004524886875, + "grad_norm": 0.8337467312812805, + "learning_rate": 0.0005094815567512587, + "loss": 0.2672, + "mean_token_accuracy": 0.9313560128211975, + "num_tokens": 970954.0, + "step": 110 + }, + { + "entropy": 2.1024146378040314, + "epoch": 0.40180995475113124, + "grad_norm": 0.8214333057403564, + "learning_rate": 0.0005094297070489455, + "loss": 0.3146, + "mean_token_accuracy": 0.9289091974496841, + "num_tokens": 979929.0, + "step": 111 + }, + { + "entropy": 2.260519325733185, + "epoch": 0.40542986425339367, + "grad_norm": 1.1298810243606567, + "learning_rate": 0.0005093769899313996, + "loss": 0.3055, + "mean_token_accuracy": 0.9213490188121796, + "num_tokens": 988477.0, + "step": 112 + }, + { + "entropy": 2.2228699326515198, + "epoch": 0.4090497737556561, + "grad_norm": 0.8601953983306885, + "learning_rate": 0.0005093234055988475, + "loss": 0.2738, + "mean_token_accuracy": 0.920888364315033, + "num_tokens": 997091.0, + "step": 113 + }, + { + "entropy": 2.2165185809135437, + "epoch": 0.41266968325791853, + "grad_norm": 0.6331561803817749, + "learning_rate": 0.0005092689542548091, + "loss": 0.2241, + "mean_token_accuracy": 0.9408514499664307, + "num_tokens": 1005866.0, + "step": 114 + }, + { + "entropy": 2.324040472507477, + "epoch": 0.416289592760181, + "grad_norm": 0.680496096611023, + "learning_rate": 0.0005092136361060975, + "loss": 0.2454, + "mean_token_accuracy": 0.9433349967002869, + "num_tokens": 1014277.0, + "step": 115 + }, + { + "entropy": 2.413789749145508, + "epoch": 0.41990950226244345, + "grad_norm": 0.7489557862281799, + "learning_rate": 0.0005091574513628183, + "loss": 0.2856, + "mean_token_accuracy": 0.934124082326889, + "num_tokens": 1023032.0, + "step": 116 + }, + { + "entropy": 2.4693005681037903, + "epoch": 0.4235294117647059, + "grad_norm": 0.6842612624168396, + "learning_rate": 0.0005091004002383682, + "loss": 0.2778, + "mean_token_accuracy": 0.9386793673038483, + "num_tokens": 1031883.0, + "step": 117 + }, + { + "entropy": 2.4351969361305237, + "epoch": 0.4271493212669683, + "grad_norm": 0.9150674343109131, + "learning_rate": 0.0005090424829494347, + "loss": 0.3151, + "mean_token_accuracy": 0.9177709072828293, + "num_tokens": 1040985.0, + "step": 118 + }, + { + "entropy": 2.5141562819480896, + "epoch": 0.4307692307692308, + "grad_norm": 1.0200655460357666, + "learning_rate": 0.000508983699715995, + "loss": 0.5134, + "mean_token_accuracy": 0.8835459351539612, + "num_tokens": 1049949.0, + "step": 119 + }, + { + "entropy": 2.479240596294403, + "epoch": 0.4343891402714932, + "grad_norm": 0.783278226852417, + "learning_rate": 0.0005089240507613151, + "loss": 0.2745, + "mean_token_accuracy": 0.9389322698116302, + "num_tokens": 1058953.0, + "step": 120 + }, + { + "entropy": 2.457803785800934, + "epoch": 0.43800904977375565, + "grad_norm": 0.7620834112167358, + "learning_rate": 0.0005088635363119497, + "loss": 0.3394, + "mean_token_accuracy": 0.9145695865154266, + "num_tokens": 1068624.0, + "step": 121 + }, + { + "entropy": 2.4909247756004333, + "epoch": 0.4416289592760181, + "grad_norm": 0.5868712067604065, + "learning_rate": 0.0005088021565977403, + "loss": 0.1726, + "mean_token_accuracy": 0.9567564129829407, + "num_tokens": 1077686.0, + "step": 122 + }, + { + "entropy": 2.5540462732315063, + "epoch": 0.4452488687782805, + "grad_norm": 1.1467291116714478, + "learning_rate": 0.0005087399118518148, + "loss": 0.2617, + "mean_token_accuracy": 0.9329706132411957, + "num_tokens": 1086230.0, + "step": 123 + }, + { + "entropy": 2.377680242061615, + "epoch": 0.448868778280543, + "grad_norm": 0.7021825909614563, + "learning_rate": 0.0005086768023105866, + "loss": 0.4124, + "mean_token_accuracy": 0.9093360006809235, + "num_tokens": 1095867.0, + "step": 124 + }, + { + "entropy": 2.55239599943161, + "epoch": 0.45248868778280543, + "grad_norm": 0.5947801470756531, + "learning_rate": 0.0005086128282137538, + "loss": 0.2752, + "mean_token_accuracy": 0.9248816668987274, + "num_tokens": 1105003.0, + "step": 125 + }, + { + "entropy": 2.4695483446121216, + "epoch": 0.45610859728506786, + "grad_norm": 1.345604658126831, + "learning_rate": 0.0005085479898042985, + "loss": 0.2577, + "mean_token_accuracy": 0.9318550229072571, + "num_tokens": 1114162.0, + "step": 126 + }, + { + "entropy": 2.4898732900619507, + "epoch": 0.4597285067873303, + "grad_norm": 0.8534179329872131, + "learning_rate": 0.0005084822873284848, + "loss": 0.3013, + "mean_token_accuracy": 0.9195661097764969, + "num_tokens": 1123457.0, + "step": 127 + }, + { + "entropy": 2.5951223969459534, + "epoch": 0.4633484162895928, + "grad_norm": 1.1677368879318237, + "learning_rate": 0.0005084157210358592, + "loss": 0.1612, + "mean_token_accuracy": 0.9599333852529526, + "num_tokens": 1131774.0, + "step": 128 + }, + { + "entropy": 2.7315847873687744, + "epoch": 0.4669683257918552, + "grad_norm": 0.7633224129676819, + "learning_rate": 0.0005083482911792492, + "loss": 0.2437, + "mean_token_accuracy": 0.9487509876489639, + "num_tokens": 1140301.0, + "step": 129 + }, + { + "entropy": 2.6348633766174316, + "epoch": 0.47058823529411764, + "grad_norm": 0.7573317885398865, + "learning_rate": 0.0005082799980147617, + "loss": 0.2426, + "mean_token_accuracy": 0.947308748960495, + "num_tokens": 1148929.0, + "step": 130 + }, + { + "entropy": 2.60002738237381, + "epoch": 0.47420814479638007, + "grad_norm": 1.8195319175720215, + "learning_rate": 0.0005082108418017829, + "loss": 0.1792, + "mean_token_accuracy": 0.9512491375207901, + "num_tokens": 1157682.0, + "step": 131 + }, + { + "entropy": 2.5319923162460327, + "epoch": 0.47782805429864256, + "grad_norm": 0.6342993378639221, + "learning_rate": 0.0005081408228029771, + "loss": 0.1843, + "mean_token_accuracy": 0.9440758228302002, + "num_tokens": 1166687.0, + "step": 132 + }, + { + "entropy": 2.5666881799697876, + "epoch": 0.481447963800905, + "grad_norm": 0.8979415893554688, + "learning_rate": 0.0005080699412842852, + "loss": 0.4824, + "mean_token_accuracy": 0.8837443292140961, + "num_tokens": 1175746.0, + "step": 133 + }, + { + "entropy": 2.6854636669158936, + "epoch": 0.4850678733031674, + "grad_norm": 0.8302125334739685, + "learning_rate": 0.0005079981975149243, + "loss": 0.267, + "mean_token_accuracy": 0.9279022663831711, + "num_tokens": 1184196.0, + "step": 134 + }, + { + "entropy": 2.564552128314972, + "epoch": 0.48868778280542985, + "grad_norm": 0.6785959005355835, + "learning_rate": 0.0005079255917673863, + "loss": 0.2031, + "mean_token_accuracy": 0.9463823586702347, + "num_tokens": 1192982.0, + "step": 135 + }, + { + "entropy": 2.673682928085327, + "epoch": 0.49230769230769234, + "grad_norm": 1.4760410785675049, + "learning_rate": 0.0005078521243174371, + "loss": 0.4791, + "mean_token_accuracy": 0.8969505727291107, + "num_tokens": 1201454.0, + "step": 136 + }, + { + "entropy": 2.6232714653015137, + "epoch": 0.49592760180995477, + "grad_norm": 0.7845668792724609, + "learning_rate": 0.0005077777954441157, + "loss": 0.2472, + "mean_token_accuracy": 0.9404618591070175, + "num_tokens": 1210182.0, + "step": 137 + }, + { + "entropy": 2.5614060163497925, + "epoch": 0.4995475113122172, + "grad_norm": 0.725419819355011, + "learning_rate": 0.0005077026054297322, + "loss": 0.3643, + "mean_token_accuracy": 0.9193316847085953, + "num_tokens": 1219487.0, + "step": 138 + }, + { + "entropy": 2.5907246470451355, + "epoch": 0.5031674208144796, + "grad_norm": 0.7741782665252686, + "learning_rate": 0.0005076265545598682, + "loss": 0.276, + "mean_token_accuracy": 0.9447730481624603, + "num_tokens": 1228066.0, + "step": 139 + }, + { + "entropy": 2.531104028224945, + "epoch": 0.5067873303167421, + "grad_norm": 0.680992603302002, + "learning_rate": 0.0005075496431233745, + "loss": 0.2004, + "mean_token_accuracy": 0.9470729678869247, + "num_tokens": 1236980.0, + "step": 140 + }, + { + "entropy": 2.590231478214264, + "epoch": 0.5104072398190045, + "grad_norm": 0.8260406255722046, + "learning_rate": 0.0005074718714123704, + "loss": 0.2756, + "mean_token_accuracy": 0.9301882535219193, + "num_tokens": 1245565.0, + "step": 141 + }, + { + "entropy": 2.4858668446540833, + "epoch": 0.5140271493212669, + "grad_norm": 0.8085922598838806, + "learning_rate": 0.0005073932397222429, + "loss": 0.2314, + "mean_token_accuracy": 0.9449103325605392, + "num_tokens": 1254366.0, + "step": 142 + }, + { + "entropy": 2.5374304056167603, + "epoch": 0.5176470588235295, + "grad_norm": 0.7858129143714905, + "learning_rate": 0.0005073137483516452, + "loss": 0.1622, + "mean_token_accuracy": 0.9510673582553864, + "num_tokens": 1263197.0, + "step": 143 + }, + { + "entropy": 2.608425199985504, + "epoch": 0.5212669683257919, + "grad_norm": 1.2698506116867065, + "learning_rate": 0.0005072333976024957, + "loss": 0.1729, + "mean_token_accuracy": 0.9509973376989365, + "num_tokens": 1271725.0, + "step": 144 + }, + { + "entropy": 2.437038242816925, + "epoch": 0.5248868778280543, + "grad_norm": 1.0788538455963135, + "learning_rate": 0.0005071521877799765, + "loss": 0.3344, + "mean_token_accuracy": 0.9166721999645233, + "num_tokens": 1280963.0, + "step": 145 + }, + { + "entropy": 2.589951515197754, + "epoch": 0.5285067873303168, + "grad_norm": 0.9228294491767883, + "learning_rate": 0.0005070701191925332, + "loss": 0.3095, + "mean_token_accuracy": 0.9239777624607086, + "num_tokens": 1289683.0, + "step": 146 + }, + { + "entropy": 2.575794994831085, + "epoch": 0.5321266968325792, + "grad_norm": 1.359767198562622, + "learning_rate": 0.0005069871921518726, + "loss": 0.2447, + "mean_token_accuracy": 0.9374738186597824, + "num_tokens": 1298397.0, + "step": 147 + }, + { + "entropy": 2.5628358721733093, + "epoch": 0.5357466063348416, + "grad_norm": 0.9870713353157043, + "learning_rate": 0.000506903406972962, + "loss": 0.4824, + "mean_token_accuracy": 0.9027767181396484, + "num_tokens": 1307191.0, + "step": 148 + }, + { + "entropy": 2.5513240098953247, + "epoch": 0.539366515837104, + "grad_norm": 0.7921387553215027, + "learning_rate": 0.0005068187639740286, + "loss": 0.3278, + "mean_token_accuracy": 0.9161934554576874, + "num_tokens": 1315878.0, + "step": 149 + }, + { + "entropy": 2.526439070701599, + "epoch": 0.5429864253393665, + "grad_norm": 0.6320391297340393, + "learning_rate": 0.000506733263476557, + "loss": 0.1701, + "mean_token_accuracy": 0.9575318098068237, + "num_tokens": 1324786.0, + "step": 150 + }, + { + "entropy": 2.4837265014648438, + "epoch": 0.5466063348416289, + "grad_norm": 0.5369354486465454, + "learning_rate": 0.000506646905805289, + "loss": 0.1328, + "mean_token_accuracy": 0.9636050164699554, + "num_tokens": 1333766.0, + "step": 151 + }, + { + "entropy": 2.5264737010002136, + "epoch": 0.5502262443438914, + "grad_norm": 0.7346852421760559, + "learning_rate": 0.0005065596912882222, + "loss": 0.2012, + "mean_token_accuracy": 0.9448132663965225, + "num_tokens": 1343004.0, + "step": 152 + }, + { + "entropy": 2.569309651851654, + "epoch": 0.5538461538461539, + "grad_norm": 0.9926508069038391, + "learning_rate": 0.0005064716202566082, + "loss": 0.2831, + "mean_token_accuracy": 0.9332023113965988, + "num_tokens": 1351561.0, + "step": 153 + }, + { + "entropy": 2.3148274421691895, + "epoch": 0.5574660633484163, + "grad_norm": 0.6301954984664917, + "learning_rate": 0.0005063826930449523, + "loss": 0.3622, + "mean_token_accuracy": 0.9349419325590134, + "num_tokens": 1360997.0, + "step": 154 + }, + { + "entropy": 2.497675657272339, + "epoch": 0.5610859728506787, + "grad_norm": 0.8846175670623779, + "learning_rate": 0.000506292909991011, + "loss": 0.2314, + "mean_token_accuracy": 0.9468862265348434, + "num_tokens": 1369600.0, + "step": 155 + }, + { + "entropy": 2.313987612724304, + "epoch": 0.5647058823529412, + "grad_norm": 0.5701894164085388, + "learning_rate": 0.0005062022714357922, + "loss": 0.2154, + "mean_token_accuracy": 0.945093959569931, + "num_tokens": 1379125.0, + "step": 156 + }, + { + "entropy": 2.4019755125045776, + "epoch": 0.5683257918552036, + "grad_norm": 0.8769335746765137, + "learning_rate": 0.0005061107777235524, + "loss": 0.3565, + "mean_token_accuracy": 0.9133864492177963, + "num_tokens": 1388111.0, + "step": 157 + }, + { + "entropy": 2.3127577900886536, + "epoch": 0.571945701357466, + "grad_norm": 1.1026453971862793, + "learning_rate": 0.0005060184292017965, + "loss": 0.2897, + "mean_token_accuracy": 0.899736076593399, + "num_tokens": 1397528.0, + "step": 158 + }, + { + "entropy": 2.2682697772979736, + "epoch": 0.5755656108597285, + "grad_norm": 0.5426591038703918, + "learning_rate": 0.000505925226221276, + "loss": 0.167, + "mean_token_accuracy": 0.9609879851341248, + "num_tokens": 1406809.0, + "step": 159 + }, + { + "entropy": 2.4639336466789246, + "epoch": 0.579185520361991, + "grad_norm": 0.6552363038063049, + "learning_rate": 0.0005058311691359875, + "loss": 0.2511, + "mean_token_accuracy": 0.9355164766311646, + "num_tokens": 1415498.0, + "step": 160 + }, + { + "entropy": 2.467900663614273, + "epoch": 0.5828054298642534, + "grad_norm": 0.7168154120445251, + "learning_rate": 0.000505736258303172, + "loss": 0.234, + "mean_token_accuracy": 0.9450509995222092, + "num_tokens": 1424524.0, + "step": 161 + }, + { + "entropy": 2.3683157563209534, + "epoch": 0.5864253393665159, + "grad_norm": 0.6433501839637756, + "learning_rate": 0.0005056404940833128, + "loss": 0.3441, + "mean_token_accuracy": 0.9261108189821243, + "num_tokens": 1434194.0, + "step": 162 + }, + { + "entropy": 2.4686295986175537, + "epoch": 0.5900452488687783, + "grad_norm": 0.9615177512168884, + "learning_rate": 0.0005055438768401348, + "loss": 0.1492, + "mean_token_accuracy": 0.966903567314148, + "num_tokens": 1442972.0, + "step": 163 + }, + { + "entropy": 2.5551892518997192, + "epoch": 0.5936651583710407, + "grad_norm": 0.4957484006881714, + "learning_rate": 0.0005054464069406023, + "loss": 0.1242, + "mean_token_accuracy": 0.969713419675827, + "num_tokens": 1451324.0, + "step": 164 + }, + { + "entropy": 2.554121434688568, + "epoch": 0.5972850678733032, + "grad_norm": 0.7399498224258423, + "learning_rate": 0.0005053480847549187, + "loss": 0.206, + "mean_token_accuracy": 0.9498797357082367, + "num_tokens": 1459698.0, + "step": 165 + }, + { + "entropy": 2.5181015729904175, + "epoch": 0.6009049773755656, + "grad_norm": 0.7433251142501831, + "learning_rate": 0.0005052489106565241, + "loss": 0.2883, + "mean_token_accuracy": 0.9419967085123062, + "num_tokens": 1468460.0, + "step": 166 + }, + { + "entropy": 2.3073930144309998, + "epoch": 0.604524886877828, + "grad_norm": 0.5920398831367493, + "learning_rate": 0.0005051488850220941, + "loss": 0.197, + "mean_token_accuracy": 0.952111005783081, + "num_tokens": 1477579.0, + "step": 167 + }, + { + "entropy": 2.532376289367676, + "epoch": 0.6081447963800904, + "grad_norm": 0.7033098936080933, + "learning_rate": 0.0005050480082315392, + "loss": 0.2122, + "mean_token_accuracy": 0.9488633275032043, + "num_tokens": 1486307.0, + "step": 168 + }, + { + "entropy": 2.397290349006653, + "epoch": 0.611764705882353, + "grad_norm": 0.8026869893074036, + "learning_rate": 0.0005049462806680021, + "loss": 0.2541, + "mean_token_accuracy": 0.9427233040332794, + "num_tokens": 1495152.0, + "step": 169 + }, + { + "entropy": 2.464823842048645, + "epoch": 0.6153846153846154, + "grad_norm": 0.6508225798606873, + "learning_rate": 0.0005048437027178571, + "loss": 0.2639, + "mean_token_accuracy": 0.9391255974769592, + "num_tokens": 1503903.0, + "step": 170 + }, + { + "entropy": 2.520734131336212, + "epoch": 0.6190045248868778, + "grad_norm": 0.8373616337776184, + "learning_rate": 0.0005047402747707084, + "loss": 0.3078, + "mean_token_accuracy": 0.9302930980920792, + "num_tokens": 1512588.0, + "step": 171 + }, + { + "entropy": 2.388108015060425, + "epoch": 0.6226244343891403, + "grad_norm": 0.6334089636802673, + "learning_rate": 0.0005046359972193884, + "loss": 0.1372, + "mean_token_accuracy": 0.9666119515895844, + "num_tokens": 1522011.0, + "step": 172 + }, + { + "entropy": 2.537126660346985, + "epoch": 0.6262443438914027, + "grad_norm": 0.7665116190910339, + "learning_rate": 0.0005045308704599566, + "loss": 0.2603, + "mean_token_accuracy": 0.9350012242794037, + "num_tokens": 1530767.0, + "step": 173 + }, + { + "entropy": 2.567205488681793, + "epoch": 0.6298642533936651, + "grad_norm": 0.8043875098228455, + "learning_rate": 0.0005044248948916977, + "loss": 0.2497, + "mean_token_accuracy": 0.9400482773780823, + "num_tokens": 1539971.0, + "step": 174 + }, + { + "entropy": 2.585887610912323, + "epoch": 0.6334841628959276, + "grad_norm": 0.5282150506973267, + "learning_rate": 0.0005043180709171206, + "loss": 0.1126, + "mean_token_accuracy": 0.9680279046297073, + "num_tokens": 1548971.0, + "step": 175 + }, + { + "entropy": 2.4289392232894897, + "epoch": 0.63710407239819, + "grad_norm": 0.6838382482528687, + "learning_rate": 0.0005042103989419563, + "loss": 0.2076, + "mean_token_accuracy": 0.9468046277761459, + "num_tokens": 1558403.0, + "step": 176 + }, + { + "entropy": 2.6080575585365295, + "epoch": 0.6407239819004525, + "grad_norm": 0.9058650732040405, + "learning_rate": 0.0005041018793751566, + "loss": 0.1781, + "mean_token_accuracy": 0.9432647377252579, + "num_tokens": 1567209.0, + "step": 177 + }, + { + "entropy": 2.5212480425834656, + "epoch": 0.644343891402715, + "grad_norm": 0.796381950378418, + "learning_rate": 0.0005039925126288929, + "loss": 0.2286, + "mean_token_accuracy": 0.9305787235498428, + "num_tokens": 1576255.0, + "step": 178 + }, + { + "entropy": 2.588195264339447, + "epoch": 0.6479638009049774, + "grad_norm": 0.6489388942718506, + "learning_rate": 0.0005038822991185536, + "loss": 0.1717, + "mean_token_accuracy": 0.9572225511074066, + "num_tokens": 1585335.0, + "step": 179 + }, + { + "entropy": 2.609215259552002, + "epoch": 0.6515837104072398, + "grad_norm": 0.8551130294799805, + "learning_rate": 0.0005037712392627441, + "loss": 0.2358, + "mean_token_accuracy": 0.9529621452093124, + "num_tokens": 1594354.0, + "step": 180 + }, + { + "entropy": 2.4199504256248474, + "epoch": 0.6552036199095023, + "grad_norm": 0.5775637030601501, + "learning_rate": 0.0005036593334832836, + "loss": 0.2402, + "mean_token_accuracy": 0.9437069743871689, + "num_tokens": 1603750.0, + "step": 181 + }, + { + "entropy": 2.516424596309662, + "epoch": 0.6588235294117647, + "grad_norm": 0.6967942118644714, + "learning_rate": 0.0005035465822052047, + "loss": 0.1624, + "mean_token_accuracy": 0.9518167823553085, + "num_tokens": 1612474.0, + "step": 182 + }, + { + "entropy": 2.463354170322418, + "epoch": 0.6624434389140271, + "grad_norm": 0.49672600626945496, + "learning_rate": 0.000503432985856751, + "loss": 0.1654, + "mean_token_accuracy": 0.9564716964960098, + "num_tokens": 1621563.0, + "step": 183 + }, + { + "entropy": 2.4456416964530945, + "epoch": 0.6660633484162896, + "grad_norm": 0.6207183003425598, + "learning_rate": 0.000503318544869376, + "loss": 0.1918, + "mean_token_accuracy": 0.9476529806852341, + "num_tokens": 1630801.0, + "step": 184 + }, + { + "entropy": 2.641440451145172, + "epoch": 0.669683257918552, + "grad_norm": 1.220821499824524, + "learning_rate": 0.000503203259677741, + "loss": 0.4019, + "mean_token_accuracy": 0.9172120243310928, + "num_tokens": 1639522.0, + "step": 185 + }, + { + "entropy": 2.6447275280952454, + "epoch": 0.6733031674208145, + "grad_norm": 0.7546490430831909, + "learning_rate": 0.000503087130719714, + "loss": 0.2484, + "mean_token_accuracy": 0.9387800246477127, + "num_tokens": 1647964.0, + "step": 186 + }, + { + "entropy": 2.4657886028289795, + "epoch": 0.676923076923077, + "grad_norm": 0.7679230570793152, + "learning_rate": 0.0005029701584363675, + "loss": 0.2659, + "mean_token_accuracy": 0.930300235748291, + "num_tokens": 1657181.0, + "step": 187 + }, + { + "entropy": 2.37973552942276, + "epoch": 0.6805429864253394, + "grad_norm": 0.7473414540290833, + "learning_rate": 0.0005028523432719772, + "loss": 0.32, + "mean_token_accuracy": 0.9233052879571915, + "num_tokens": 1666477.0, + "step": 188 + }, + { + "entropy": 2.5238219499588013, + "epoch": 0.6841628959276018, + "grad_norm": 0.5573673248291016, + "learning_rate": 0.0005027336856740201, + "loss": 0.1846, + "mean_token_accuracy": 0.9445535093545914, + "num_tokens": 1675002.0, + "step": 189 + }, + { + "entropy": 2.456815242767334, + "epoch": 0.6877828054298643, + "grad_norm": 0.47237634658813477, + "learning_rate": 0.0005026141860931728, + "loss": 0.1065, + "mean_token_accuracy": 0.964375838637352, + "num_tokens": 1683623.0, + "step": 190 + }, + { + "entropy": 2.548456132411957, + "epoch": 0.6914027149321267, + "grad_norm": 0.7699162364006042, + "learning_rate": 0.00050249384498331, + "loss": 0.1985, + "mean_token_accuracy": 0.9438774734735489, + "num_tokens": 1691718.0, + "step": 191 + }, + { + "entropy": 2.4514941573143005, + "epoch": 0.6950226244343891, + "grad_norm": 1.4113538265228271, + "learning_rate": 0.0005023726628015027, + "loss": 0.4541, + "mean_token_accuracy": 0.9207872897386551, + "num_tokens": 1699824.0, + "step": 192 + }, + { + "entropy": 2.2560824751853943, + "epoch": 0.6986425339366515, + "grad_norm": 0.6007948517799377, + "learning_rate": 0.0005022506400080161, + "loss": 0.1871, + "mean_token_accuracy": 0.9502484053373337, + "num_tokens": 1708722.0, + "step": 193 + }, + { + "entropy": 2.1833614110946655, + "epoch": 0.702262443438914, + "grad_norm": 0.7005489468574524, + "learning_rate": 0.0005021277770663082, + "loss": 0.2222, + "mean_token_accuracy": 0.9386974722146988, + "num_tokens": 1717592.0, + "step": 194 + }, + { + "entropy": 2.2031923830509186, + "epoch": 0.7058823529411765, + "grad_norm": 0.5830584764480591, + "learning_rate": 0.0005020040744430284, + "loss": 0.1106, + "mean_token_accuracy": 0.9719562232494354, + "num_tokens": 1726149.0, + "step": 195 + }, + { + "entropy": 2.199785351753235, + "epoch": 0.709502262443439, + "grad_norm": 0.7465847134590149, + "learning_rate": 0.0005018795326080149, + "loss": 0.1935, + "mean_token_accuracy": 0.9497270882129669, + "num_tokens": 1734541.0, + "step": 196 + }, + { + "entropy": 2.1103186309337616, + "epoch": 0.7131221719457014, + "grad_norm": 1.0782264471054077, + "learning_rate": 0.0005017541520342934, + "loss": 0.2895, + "mean_token_accuracy": 0.9274258464574814, + "num_tokens": 1743722.0, + "step": 197 + }, + { + "entropy": 2.2248528599739075, + "epoch": 0.7167420814479638, + "grad_norm": 0.6409780979156494, + "learning_rate": 0.0005016279331980754, + "loss": 0.1425, + "mean_token_accuracy": 0.96550352871418, + "num_tokens": 1752156.0, + "step": 198 + }, + { + "entropy": 2.19924658536911, + "epoch": 0.7203619909502262, + "grad_norm": 0.7019934058189392, + "learning_rate": 0.0005015008765787561, + "loss": 0.1969, + "mean_token_accuracy": 0.9429282248020172, + "num_tokens": 1760978.0, + "step": 199 + }, + { + "entropy": 2.297484815120697, + "epoch": 0.7239819004524887, + "grad_norm": 0.7826490998268127, + "learning_rate": 0.0005013729826589127, + "loss": 0.2399, + "mean_token_accuracy": 0.9416657984256744, + "num_tokens": 1769533.0, + "step": 200 + }, + { + "entropy": 2.2471498548984528, + "epoch": 0.7276018099547511, + "grad_norm": 0.621566891670227, + "learning_rate": 0.0005012442519243027, + "loss": 0.1876, + "mean_token_accuracy": 0.9460793286561966, + "num_tokens": 1778286.0, + "step": 201 + }, + { + "entropy": 2.2212815284729004, + "epoch": 0.7312217194570135, + "grad_norm": 0.622283935546875, + "learning_rate": 0.0005011146848638616, + "loss": 0.1617, + "mean_token_accuracy": 0.9482609927654266, + "num_tokens": 1787392.0, + "step": 202 + }, + { + "entropy": 2.308752655982971, + "epoch": 0.7348416289592761, + "grad_norm": 0.7263973355293274, + "learning_rate": 0.0005009842819697018, + "loss": 0.2043, + "mean_token_accuracy": 0.9378403723239899, + "num_tokens": 1796133.0, + "step": 203 + }, + { + "entropy": 2.3376497626304626, + "epoch": 0.7384615384615385, + "grad_norm": 0.5493630766868591, + "learning_rate": 0.0005008530437371101, + "loss": 0.1145, + "mean_token_accuracy": 0.970586434006691, + "num_tokens": 1804769.0, + "step": 204 + }, + { + "entropy": 2.373005509376526, + "epoch": 0.7420814479638009, + "grad_norm": 0.6313483119010925, + "learning_rate": 0.0005007209706645461, + "loss": 0.2183, + "mean_token_accuracy": 0.9472708404064178, + "num_tokens": 1813364.0, + "step": 205 + }, + { + "entropy": 2.468949854373932, + "epoch": 0.7457013574660634, + "grad_norm": 1.0125588178634644, + "learning_rate": 0.00050058806325364, + "loss": 0.2225, + "mean_token_accuracy": 0.9351322948932648, + "num_tokens": 1822149.0, + "step": 206 + }, + { + "entropy": 2.2420623898506165, + "epoch": 0.7493212669683258, + "grad_norm": 0.913761556148529, + "learning_rate": 0.0005004543220091911, + "loss": 0.2386, + "mean_token_accuracy": 0.9453927427530289, + "num_tokens": 1831533.0, + "step": 207 + }, + { + "entropy": 2.2966006994247437, + "epoch": 0.7529411764705882, + "grad_norm": 0.7386876940727234, + "learning_rate": 0.0005003197474391658, + "loss": 0.1768, + "mean_token_accuracy": 0.949826255440712, + "num_tokens": 1840157.0, + "step": 208 + }, + { + "entropy": 2.306001305580139, + "epoch": 0.7565610859728507, + "grad_norm": 0.8900741338729858, + "learning_rate": 0.0005001843400546955, + "loss": 0.2899, + "mean_token_accuracy": 0.9241485595703125, + "num_tokens": 1848898.0, + "step": 209 + }, + { + "entropy": 2.117514967918396, + "epoch": 0.7601809954751131, + "grad_norm": 0.644622802734375, + "learning_rate": 0.0005000481003700746, + "loss": 0.2714, + "mean_token_accuracy": 0.9299416691064835, + "num_tokens": 1858330.0, + "step": 210 + }, + { + "entropy": 2.3768392205238342, + "epoch": 0.7638009049773755, + "grad_norm": 0.9724471569061279, + "learning_rate": 0.0004999110289027587, + "loss": 0.1633, + "mean_token_accuracy": 0.9550061523914337, + "num_tokens": 1866806.0, + "step": 211 + }, + { + "entropy": 2.090679556131363, + "epoch": 0.7674208144796381, + "grad_norm": 0.5419518351554871, + "learning_rate": 0.0004997731261733628, + "loss": 0.1369, + "mean_token_accuracy": 0.9619670957326889, + "num_tokens": 1875937.0, + "step": 212 + }, + { + "entropy": 2.099909245967865, + "epoch": 0.7710407239819005, + "grad_norm": 0.6858121752738953, + "learning_rate": 0.0004996343927056592, + "loss": 0.1633, + "mean_token_accuracy": 0.9528832882642746, + "num_tokens": 1885145.0, + "step": 213 + }, + { + "entropy": 2.130059242248535, + "epoch": 0.7746606334841629, + "grad_norm": 0.7691065073013306, + "learning_rate": 0.000499494829026575, + "loss": 0.348, + "mean_token_accuracy": 0.9162366837263107, + "num_tokens": 1894255.0, + "step": 214 + }, + { + "entropy": 2.191373586654663, + "epoch": 0.7782805429864253, + "grad_norm": 0.7427324652671814, + "learning_rate": 0.000499354435666191, + "loss": 0.3373, + "mean_token_accuracy": 0.9311849176883698, + "num_tokens": 1902981.0, + "step": 215 + }, + { + "entropy": 2.1425398886203766, + "epoch": 0.7819004524886878, + "grad_norm": 0.6410383582115173, + "learning_rate": 0.0004992132131577392, + "loss": 0.2079, + "mean_token_accuracy": 0.949742391705513, + "num_tokens": 1912253.0, + "step": 216 + }, + { + "entropy": 2.1396586298942566, + "epoch": 0.7855203619909502, + "grad_norm": 0.5689850449562073, + "learning_rate": 0.0004990711620376003, + "loss": 0.1999, + "mean_token_accuracy": 0.946034774184227, + "num_tokens": 1921409.0, + "step": 217 + }, + { + "entropy": 2.2237865328788757, + "epoch": 0.7891402714932126, + "grad_norm": 0.6408923864364624, + "learning_rate": 0.0004989282828453029, + "loss": 0.2452, + "mean_token_accuracy": 0.9510752111673355, + "num_tokens": 1930397.0, + "step": 218 + }, + { + "entropy": 2.234771251678467, + "epoch": 0.7927601809954751, + "grad_norm": 0.751447856426239, + "learning_rate": 0.0004987845761235203, + "loss": 0.3057, + "mean_token_accuracy": 0.9217256307601929, + "num_tokens": 1939172.0, + "step": 219 + }, + { + "entropy": 2.2653815746307373, + "epoch": 0.7963800904977375, + "grad_norm": 0.751455545425415, + "learning_rate": 0.0004986400424180688, + "loss": 0.3245, + "mean_token_accuracy": 0.9256318956613541, + "num_tokens": 1947979.0, + "step": 220 + }, + { + "entropy": 2.3123483061790466, + "epoch": 0.8, + "grad_norm": 0.5939492583274841, + "learning_rate": 0.0004984946822779061, + "loss": 0.2429, + "mean_token_accuracy": 0.9333402067422867, + "num_tokens": 1956814.0, + "step": 221 + }, + { + "entropy": 2.3289234042167664, + "epoch": 0.8036199095022625, + "grad_norm": 0.5591994524002075, + "learning_rate": 0.0004983484962551284, + "loss": 0.1507, + "mean_token_accuracy": 0.96376833319664, + "num_tokens": 1965641.0, + "step": 222 + }, + { + "entropy": 2.4314023852348328, + "epoch": 0.8072398190045249, + "grad_norm": 0.5805783271789551, + "learning_rate": 0.0004982014849049687, + "loss": 0.2049, + "mean_token_accuracy": 0.9586948156356812, + "num_tokens": 1974180.0, + "step": 223 + }, + { + "entropy": 2.3639765977859497, + "epoch": 0.8108597285067873, + "grad_norm": 0.6924490332603455, + "learning_rate": 0.0004980536487857951, + "loss": 0.2137, + "mean_token_accuracy": 0.9441423565149307, + "num_tokens": 1982744.0, + "step": 224 + }, + { + "entropy": 2.3361759781837463, + "epoch": 0.8144796380090498, + "grad_norm": 0.4579620361328125, + "learning_rate": 0.0004979049884591077, + "loss": 0.1041, + "mean_token_accuracy": 0.9753208309412003, + "num_tokens": 1991583.0, + "step": 225 + }, + { + "entropy": 2.286989688873291, + "epoch": 0.8180995475113122, + "grad_norm": 0.6489312052726746, + "learning_rate": 0.0004977555044895377, + "loss": 0.2131, + "mean_token_accuracy": 0.9520440250635147, + "num_tokens": 2000193.0, + "step": 226 + }, + { + "entropy": 2.288672834634781, + "epoch": 0.8217194570135746, + "grad_norm": 0.7738961577415466, + "learning_rate": 0.0004976051974448441, + "loss": 0.325, + "mean_token_accuracy": 0.9060750156641006, + "num_tokens": 2009233.0, + "step": 227 + }, + { + "entropy": 2.288076102733612, + "epoch": 0.8253393665158371, + "grad_norm": 0.7042292356491089, + "learning_rate": 0.0004974540678959123, + "loss": 0.2206, + "mean_token_accuracy": 0.94980289041996, + "num_tokens": 2018417.0, + "step": 228 + }, + { + "entropy": 2.217707335948944, + "epoch": 0.8289592760180996, + "grad_norm": 0.6834208369255066, + "learning_rate": 0.0004973021164167515, + "loss": 0.2907, + "mean_token_accuracy": 0.951058641076088, + "num_tokens": 2027822.0, + "step": 229 + }, + { + "entropy": 2.1610691249370575, + "epoch": 0.832579185520362, + "grad_norm": 0.665044903755188, + "learning_rate": 0.0004971493435844928, + "loss": 0.2387, + "mean_token_accuracy": 0.9506549835205078, + "num_tokens": 2036983.0, + "step": 230 + }, + { + "entropy": 2.321135401725769, + "epoch": 0.8361990950226245, + "grad_norm": 0.8208273649215698, + "learning_rate": 0.0004969957499793869, + "loss": 0.2399, + "mean_token_accuracy": 0.9435176253318787, + "num_tokens": 2045574.0, + "step": 231 + }, + { + "entropy": 2.1943611800670624, + "epoch": 0.8398190045248869, + "grad_norm": 0.6293840408325195, + "learning_rate": 0.0004968413361848019, + "loss": 0.1784, + "mean_token_accuracy": 0.9559669345617294, + "num_tokens": 2054336.0, + "step": 232 + }, + { + "entropy": 2.2722273468971252, + "epoch": 0.8434389140271493, + "grad_norm": 0.6535817980766296, + "learning_rate": 0.0004966861027872211, + "loss": 0.1675, + "mean_token_accuracy": 0.9532535970211029, + "num_tokens": 2063225.0, + "step": 233 + }, + { + "entropy": 2.3278334736824036, + "epoch": 0.8470588235294118, + "grad_norm": 1.1610206365585327, + "learning_rate": 0.0004965300503762406, + "loss": 0.1588, + "mean_token_accuracy": 0.9641145765781403, + "num_tokens": 2071738.0, + "step": 234 + }, + { + "entropy": 2.202972888946533, + "epoch": 0.8506787330316742, + "grad_norm": 0.4811885356903076, + "learning_rate": 0.0004963731795445675, + "loss": 0.0813, + "mean_token_accuracy": 0.9766911715269089, + "num_tokens": 2080375.0, + "step": 235 + }, + { + "entropy": 2.2433705925941467, + "epoch": 0.8542986425339366, + "grad_norm": 0.8113318681716919, + "learning_rate": 0.0004962154908880171, + "loss": 0.2965, + "mean_token_accuracy": 0.9290606826543808, + "num_tokens": 2089522.0, + "step": 236 + }, + { + "entropy": 2.2168884873390198, + "epoch": 0.857918552036199, + "grad_norm": 0.6128959655761719, + "learning_rate": 0.0004960569850055111, + "loss": 0.1724, + "mean_token_accuracy": 0.9603384286165237, + "num_tokens": 2098162.0, + "step": 237 + }, + { + "entropy": 2.2738255858421326, + "epoch": 0.8615384615384616, + "grad_norm": 0.8557195663452148, + "learning_rate": 0.0004958976624990749, + "loss": 0.2596, + "mean_token_accuracy": 0.9487071484327316, + "num_tokens": 2106984.0, + "step": 238 + }, + { + "entropy": 2.2031425833702087, + "epoch": 0.865158371040724, + "grad_norm": 0.6621816158294678, + "learning_rate": 0.0004957375239738359, + "loss": 0.232, + "mean_token_accuracy": 0.9525040090084076, + "num_tokens": 2116040.0, + "step": 239 + }, + { + "entropy": 2.374737858772278, + "epoch": 0.8687782805429864, + "grad_norm": 0.8481062054634094, + "learning_rate": 0.0004955765700380204, + "loss": 0.2516, + "mean_token_accuracy": 0.9396061599254608, + "num_tokens": 2124862.0, + "step": 240 + }, + { + "entropy": 2.266704559326172, + "epoch": 0.8723981900452489, + "grad_norm": 0.6284282803535461, + "learning_rate": 0.0004954148013029521, + "loss": 0.3244, + "mean_token_accuracy": 0.9381244331598282, + "num_tokens": 2134018.0, + "step": 241 + }, + { + "entropy": 2.3935859203338623, + "epoch": 0.8760180995475113, + "grad_norm": 1.1564176082611084, + "learning_rate": 0.0004952522183830493, + "loss": 0.2706, + "mean_token_accuracy": 0.9297053664922714, + "num_tokens": 2142745.0, + "step": 242 + }, + { + "entropy": 2.281618118286133, + "epoch": 0.8796380090497737, + "grad_norm": 0.5324040055274963, + "learning_rate": 0.0004950888218958225, + "loss": 0.1573, + "mean_token_accuracy": 0.9568462073802948, + "num_tokens": 2151607.0, + "step": 243 + }, + { + "entropy": 2.230749189853668, + "epoch": 0.8832579185520362, + "grad_norm": 0.680780291557312, + "learning_rate": 0.0004949246124618726, + "loss": 0.1956, + "mean_token_accuracy": 0.9479999989271164, + "num_tokens": 2160904.0, + "step": 244 + }, + { + "entropy": 2.21382600069046, + "epoch": 0.8868778280542986, + "grad_norm": 0.6321626305580139, + "learning_rate": 0.0004947595907048877, + "loss": 0.2444, + "mean_token_accuracy": 0.9376699328422546, + "num_tokens": 2170021.0, + "step": 245 + }, + { + "entropy": 2.3659472465515137, + "epoch": 0.890497737556561, + "grad_norm": 0.9778954982757568, + "learning_rate": 0.0004945937572516417, + "loss": 0.3783, + "mean_token_accuracy": 0.9104805737733841, + "num_tokens": 2178995.0, + "step": 246 + }, + { + "entropy": 2.3233078718185425, + "epoch": 0.8941176470588236, + "grad_norm": 0.53229820728302, + "learning_rate": 0.0004944271127319909, + "loss": 0.0759, + "mean_token_accuracy": 0.9791453778743744, + "num_tokens": 2187823.0, + "step": 247 + }, + { + "entropy": 2.2469444274902344, + "epoch": 0.897737556561086, + "grad_norm": 0.6367197632789612, + "learning_rate": 0.0004942596577788728, + "loss": 0.2677, + "mean_token_accuracy": 0.9392691254615784, + "num_tokens": 2196923.0, + "step": 248 + }, + { + "entropy": 2.4508965611457825, + "epoch": 0.9013574660633484, + "grad_norm": 0.6042234897613525, + "learning_rate": 0.0004940913930283024, + "loss": 0.1102, + "mean_token_accuracy": 0.9762090593576431, + "num_tokens": 2205400.0, + "step": 249 + }, + { + "entropy": 2.365670144557953, + "epoch": 0.9049773755656109, + "grad_norm": 0.6490639448165894, + "learning_rate": 0.0004939223191193707, + "loss": 0.1532, + "mean_token_accuracy": 0.9489114433526993, + "num_tokens": 2214201.0, + "step": 250 + }, + { + "entropy": 2.4013625383377075, + "epoch": 0.9085972850678733, + "grad_norm": 0.5969854593276978, + "learning_rate": 0.0004937524366942419, + "loss": 0.1273, + "mean_token_accuracy": 0.9682519882917404, + "num_tokens": 2222979.0, + "step": 251 + }, + { + "entropy": 2.4402357935905457, + "epoch": 0.9122171945701357, + "grad_norm": 0.7559595704078674, + "learning_rate": 0.0004935817463981513, + "loss": 0.1979, + "mean_token_accuracy": 0.9483373910188675, + "num_tokens": 2231169.0, + "step": 252 + }, + { + "entropy": 2.4673256874084473, + "epoch": 0.9158371040723982, + "grad_norm": 0.8663308620452881, + "learning_rate": 0.0004934102488794023, + "loss": 0.2453, + "mean_token_accuracy": 0.9408974200487137, + "num_tokens": 2240099.0, + "step": 253 + }, + { + "entropy": 2.426262080669403, + "epoch": 0.9194570135746606, + "grad_norm": 0.7920467257499695, + "learning_rate": 0.0004932379447893643, + "loss": 0.2828, + "mean_token_accuracy": 0.9319239109754562, + "num_tokens": 2249088.0, + "step": 254 + }, + { + "entropy": 2.5018852949142456, + "epoch": 0.9230769230769231, + "grad_norm": 0.7216617465019226, + "learning_rate": 0.0004930648347824701, + "loss": 0.1647, + "mean_token_accuracy": 0.9551804810762405, + "num_tokens": 2257710.0, + "step": 255 + }, + { + "entropy": 2.43031644821167, + "epoch": 0.9266968325791856, + "grad_norm": 0.646794319152832, + "learning_rate": 0.0004928909195162138, + "loss": 0.1328, + "mean_token_accuracy": 0.9663553237915039, + "num_tokens": 2266883.0, + "step": 256 + }, + { + "entropy": 2.5406370759010315, + "epoch": 0.930316742081448, + "grad_norm": 0.5482825040817261, + "learning_rate": 0.0004927161996511474, + "loss": 0.1872, + "mean_token_accuracy": 0.9557004272937775, + "num_tokens": 2275728.0, + "step": 257 + }, + { + "entropy": 2.636320471763611, + "epoch": 0.9339366515837104, + "grad_norm": 0.7454632520675659, + "learning_rate": 0.0004925406758508797, + "loss": 0.1461, + "mean_token_accuracy": 0.9578974395990372, + "num_tokens": 2284319.0, + "step": 258 + }, + { + "entropy": 2.6067575812339783, + "epoch": 0.9375565610859729, + "grad_norm": 0.8695769309997559, + "learning_rate": 0.000492364348782072, + "loss": 0.1712, + "mean_token_accuracy": 0.9652896523475647, + "num_tokens": 2293035.0, + "step": 259 + }, + { + "entropy": 2.5837162137031555, + "epoch": 0.9411764705882353, + "grad_norm": 0.5752995014190674, + "learning_rate": 0.0004921872191144371, + "loss": 0.1398, + "mean_token_accuracy": 0.9553333520889282, + "num_tokens": 2301802.0, + "step": 260 + }, + { + "entropy": 2.713033616542816, + "epoch": 0.9447963800904977, + "grad_norm": 0.85626620054245, + "learning_rate": 0.0004920092875207363, + "loss": 0.2207, + "mean_token_accuracy": 0.9468346834182739, + "num_tokens": 2309981.0, + "step": 261 + }, + { + "entropy": 2.400112509727478, + "epoch": 0.9484162895927601, + "grad_norm": 0.6766608953475952, + "learning_rate": 0.0004918305546767764, + "loss": 0.1644, + "mean_token_accuracy": 0.9502440094947815, + "num_tokens": 2319212.0, + "step": 262 + }, + { + "entropy": 2.503827154636383, + "epoch": 0.9520361990950226, + "grad_norm": 0.789470911026001, + "learning_rate": 0.0004916510212614072, + "loss": 0.2117, + "mean_token_accuracy": 0.9454390555620193, + "num_tokens": 2328234.0, + "step": 263 + }, + { + "entropy": 2.669040560722351, + "epoch": 0.9556561085972851, + "grad_norm": 0.9579212069511414, + "learning_rate": 0.0004914706879565197, + "loss": 0.2193, + "mean_token_accuracy": 0.9321542829275131, + "num_tokens": 2336543.0, + "step": 264 + }, + { + "entropy": 2.507073998451233, + "epoch": 0.9592760180995475, + "grad_norm": 0.5315744876861572, + "learning_rate": 0.000491289555447043, + "loss": 0.0851, + "mean_token_accuracy": 0.9771326780319214, + "num_tokens": 2345292.0, + "step": 265 + }, + { + "entropy": 2.4205283522605896, + "epoch": 0.96289592760181, + "grad_norm": 0.5441373586654663, + "learning_rate": 0.000491107624420941, + "loss": 0.1323, + "mean_token_accuracy": 0.9541790336370468, + "num_tokens": 2354242.0, + "step": 266 + }, + { + "entropy": 2.3817258477211, + "epoch": 0.9665158371040724, + "grad_norm": 0.5946238040924072, + "learning_rate": 0.0004909248955692111, + "loss": 0.1708, + "mean_token_accuracy": 0.947738841176033, + "num_tokens": 2363183.0, + "step": 267 + }, + { + "entropy": 2.5073485374450684, + "epoch": 0.9701357466063348, + "grad_norm": 0.6979324817657471, + "learning_rate": 0.0004907413695858812, + "loss": 0.2099, + "mean_token_accuracy": 0.9423733651638031, + "num_tokens": 2371885.0, + "step": 268 + }, + { + "entropy": 2.5705007910728455, + "epoch": 0.9737556561085973, + "grad_norm": 0.8203943967819214, + "learning_rate": 0.0004905570471680057, + "loss": 0.217, + "mean_token_accuracy": 0.9511639326810837, + "num_tokens": 2380316.0, + "step": 269 + }, + { + "entropy": 2.2677993774414062, + "epoch": 0.9773755656108597, + "grad_norm": 0.5840432047843933, + "learning_rate": 0.0004903719290156649, + "loss": 0.2364, + "mean_token_accuracy": 0.9407180696725845, + "num_tokens": 2389723.0, + "step": 270 + }, + { + "entropy": 2.477886915206909, + "epoch": 0.9809954751131221, + "grad_norm": 0.818929135799408, + "learning_rate": 0.0004901860158319612, + "loss": 0.1707, + "mean_token_accuracy": 0.9579566866159439, + "num_tokens": 2398388.0, + "step": 271 + }, + { + "entropy": 2.549662232398987, + "epoch": 0.9846153846153847, + "grad_norm": 0.7804781198501587, + "learning_rate": 0.0004899993083230166, + "loss": 0.2944, + "mean_token_accuracy": 0.9381812512874603, + "num_tokens": 2406929.0, + "step": 272 + }, + { + "entropy": 2.4465304017066956, + "epoch": 0.9882352941176471, + "grad_norm": 0.5218799114227295, + "learning_rate": 0.0004898118071979699, + "loss": 0.1661, + "mean_token_accuracy": 0.9500218778848648, + "num_tokens": 2415631.0, + "step": 273 + }, + { + "entropy": 2.5852283239364624, + "epoch": 0.9918552036199095, + "grad_norm": 0.591163158416748, + "learning_rate": 0.0004896235131689743, + "loss": 0.2005, + "mean_token_accuracy": 0.9455285370349884, + "num_tokens": 2424091.0, + "step": 274 + }, + { + "entropy": 2.478701651096344, + "epoch": 0.995475113122172, + "grad_norm": 1.0615383386611938, + "learning_rate": 0.0004894344269511945, + "loss": 0.2864, + "mean_token_accuracy": 0.9306265562772751, + "num_tokens": 2432705.0, + "step": 275 + }, + { + "entropy": 2.600062847137451, + "epoch": 0.9990950226244344, + "grad_norm": 0.7011683583259583, + "learning_rate": 0.0004892445492628043, + "loss": 0.1664, + "mean_token_accuracy": 0.9547821134328842, + "num_tokens": 2440992.0, + "step": 276 + }, + { + "entropy": 2.3411240577697754, + "epoch": 1.0, + "grad_norm": 0.4944029450416565, + "learning_rate": 0.000489053880824983, + "loss": 0.022, + "mean_token_accuracy": 0.9929078221321106, + "num_tokens": 2441725.0, + "step": 277 + }, + { + "epoch": 1.0, + "eval_entropy": 2.5467925265552553, + "eval_loss": 0.21274714171886444, + "eval_mean_token_accuracy": 0.9444630068492114, + "eval_num_tokens": 2441725.0, + "eval_runtime": 116.0434, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 1.06, + "step": 277 + }, + { + "entropy": 2.609170138835907, + "epoch": 1.0036199095022624, + "grad_norm": 1.0785081386566162, + "learning_rate": 0.0004888624223619136, + "loss": 0.3167, + "mean_token_accuracy": 0.9296800643205643, + "num_tokens": 2450193.0, + "step": 278 + }, + { + "entropy": 2.497025430202484, + "epoch": 1.0072398190045249, + "grad_norm": 0.5221985578536987, + "learning_rate": 0.0004886701746007801, + "loss": 0.0854, + "mean_token_accuracy": 0.9753399342298508, + "num_tokens": 2459309.0, + "step": 279 + }, + { + "entropy": 2.5487362146377563, + "epoch": 1.0108597285067873, + "grad_norm": 0.5161958336830139, + "learning_rate": 0.0004884771382717638, + "loss": 0.0819, + "mean_token_accuracy": 0.9748431146144867, + "num_tokens": 2467844.0, + "step": 280 + }, + { + "entropy": 2.5276209115982056, + "epoch": 1.0144796380090497, + "grad_norm": 0.5731730461120605, + "learning_rate": 0.0004882833141080412, + "loss": 0.1541, + "mean_token_accuracy": 0.9567564427852631, + "num_tokens": 2476894.0, + "step": 281 + }, + { + "entropy": 2.4442760348320007, + "epoch": 1.0180995475113122, + "grad_norm": 0.7120366096496582, + "learning_rate": 0.0004880887028457813, + "loss": 0.1945, + "mean_token_accuracy": 0.9465379565954208, + "num_tokens": 2485971.0, + "step": 282 + }, + { + "entropy": 2.4069360494613647, + "epoch": 1.0217194570135746, + "grad_norm": 0.7468647360801697, + "learning_rate": 0.00048789330522414244, + "loss": 0.2345, + "mean_token_accuracy": 0.9446765780448914, + "num_tokens": 2495043.0, + "step": 283 + }, + { + "entropy": 2.468382716178894, + "epoch": 1.025339366515837, + "grad_norm": 0.666231632232666, + "learning_rate": 0.0004876971219852697, + "loss": 0.1779, + "mean_token_accuracy": 0.9534575343132019, + "num_tokens": 2503672.0, + "step": 284 + }, + { + "entropy": 2.4362316727638245, + "epoch": 1.0289592760180994, + "grad_norm": 0.8445858955383301, + "learning_rate": 0.000487500153874292, + "loss": 0.1698, + "mean_token_accuracy": 0.953661322593689, + "num_tokens": 2512322.0, + "step": 285 + }, + { + "entropy": 2.364333391189575, + "epoch": 1.032579185520362, + "grad_norm": 0.4805246591567993, + "learning_rate": 0.0004873024016393193, + "loss": 0.0778, + "mean_token_accuracy": 0.9824571758508682, + "num_tokens": 2520791.0, + "step": 286 + }, + { + "entropy": 2.223461151123047, + "epoch": 1.0361990950226245, + "grad_norm": 0.648465096950531, + "learning_rate": 0.0004871038660314399, + "loss": 0.2593, + "mean_token_accuracy": 0.9419913589954376, + "num_tokens": 2530082.0, + "step": 287 + }, + { + "entropy": 2.3313387036323547, + "epoch": 1.039819004524887, + "grad_norm": 0.6912294626235962, + "learning_rate": 0.00048690454780471725, + "loss": 0.1354, + "mean_token_accuracy": 0.9561934620141983, + "num_tokens": 2538728.0, + "step": 288 + }, + { + "entropy": 2.191806375980377, + "epoch": 1.0434389140271494, + "grad_norm": 0.8620694279670715, + "learning_rate": 0.0004867044477161874, + "loss": 0.1103, + "mean_token_accuracy": 0.968692272901535, + "num_tokens": 2547219.0, + "step": 289 + }, + { + "entropy": 2.167125165462494, + "epoch": 1.0470588235294118, + "grad_norm": 0.6192149519920349, + "learning_rate": 0.0004865035665258559, + "loss": 0.1288, + "mean_token_accuracy": 0.9643534421920776, + "num_tokens": 2555940.0, + "step": 290 + }, + { + "entropy": 2.2750985622406006, + "epoch": 1.0506787330316743, + "grad_norm": 1.7459602355957031, + "learning_rate": 0.0004863019049966953, + "loss": 0.393, + "mean_token_accuracy": 0.9146681725978851, + "num_tokens": 2564362.0, + "step": 291 + }, + { + "entropy": 2.236129105091095, + "epoch": 1.0542986425339367, + "grad_norm": 0.6311184167861938, + "learning_rate": 0.0004860994638946416, + "loss": 0.1536, + "mean_token_accuracy": 0.9636097103357315, + "num_tokens": 2573316.0, + "step": 292 + }, + { + "entropy": 2.2642418146133423, + "epoch": 1.0579185520361991, + "grad_norm": 0.6023411154747009, + "learning_rate": 0.000485896243988592, + "loss": 0.191, + "mean_token_accuracy": 0.9476015418767929, + "num_tokens": 2581835.0, + "step": 293 + }, + { + "entropy": 2.3589024543762207, + "epoch": 1.0615384615384615, + "grad_norm": 0.48049232363700867, + "learning_rate": 0.0004856922460504016, + "loss": 0.1017, + "mean_token_accuracy": 0.9713075459003448, + "num_tokens": 2590317.0, + "step": 294 + }, + { + "entropy": 2.4141315817832947, + "epoch": 1.065158371040724, + "grad_norm": 0.8456616997718811, + "learning_rate": 0.0004854874708548806, + "loss": 0.1422, + "mean_token_accuracy": 0.9622762501239777, + "num_tokens": 2598538.0, + "step": 295 + }, + { + "entropy": 2.069903999567032, + "epoch": 1.0687782805429864, + "grad_norm": 0.7641116380691528, + "learning_rate": 0.0004852819191797912, + "loss": 0.2185, + "mean_token_accuracy": 0.9464851468801498, + "num_tokens": 2608219.0, + "step": 296 + }, + { + "entropy": 2.163217008113861, + "epoch": 1.0723981900452488, + "grad_norm": 0.546085000038147, + "learning_rate": 0.0004850755918058449, + "loss": 0.1035, + "mean_token_accuracy": 0.9708487540483475, + "num_tokens": 2617261.0, + "step": 297 + }, + { + "entropy": 2.2678662836551666, + "epoch": 1.0760180995475113, + "grad_norm": 0.8699386119842529, + "learning_rate": 0.0004848684895166994, + "loss": 0.2384, + "mean_token_accuracy": 0.9486480504274368, + "num_tokens": 2626144.0, + "step": 298 + }, + { + "entropy": 2.13065105676651, + "epoch": 1.0796380090497737, + "grad_norm": 0.44323107600212097, + "learning_rate": 0.00048466061309895554, + "loss": 0.0818, + "mean_token_accuracy": 0.9722468554973602, + "num_tokens": 2635626.0, + "step": 299 + }, + { + "entropy": 2.184772551059723, + "epoch": 1.0832579185520361, + "grad_norm": 0.7928256988525391, + "learning_rate": 0.0004844519633421545, + "loss": 0.2378, + "mean_token_accuracy": 0.9477885961532593, + "num_tokens": 2644674.0, + "step": 300 + }, + { + "entropy": 2.1669145822525024, + "epoch": 1.0868778280542986, + "grad_norm": 0.5570158362388611, + "learning_rate": 0.00048424254103877456, + "loss": 0.1434, + "mean_token_accuracy": 0.9587411731481552, + "num_tokens": 2653658.0, + "step": 301 + }, + { + "entropy": 2.3057579398155212, + "epoch": 1.090497737556561, + "grad_norm": 0.9084392189979553, + "learning_rate": 0.00048403234698422837, + "loss": 0.3831, + "mean_token_accuracy": 0.8896283358335495, + "num_tokens": 2662350.0, + "step": 302 + }, + { + "entropy": 2.1741657853126526, + "epoch": 1.0941176470588236, + "grad_norm": 0.6791238784790039, + "learning_rate": 0.0004838213819768597, + "loss": 0.1648, + "mean_token_accuracy": 0.9576362520456314, + "num_tokens": 2671450.0, + "step": 303 + }, + { + "entropy": 2.089864045381546, + "epoch": 1.097737556561086, + "grad_norm": 0.5696312189102173, + "learning_rate": 0.0004836096468179406, + "loss": 0.1269, + "mean_token_accuracy": 0.9658148884773254, + "num_tokens": 2680581.0, + "step": 304 + }, + { + "entropy": 2.2657605409622192, + "epoch": 1.1013574660633485, + "grad_norm": 1.605503797531128, + "learning_rate": 0.0004833971423116682, + "loss": 0.1027, + "mean_token_accuracy": 0.9762597978115082, + "num_tokens": 2689001.0, + "step": 305 + }, + { + "entropy": 2.079287111759186, + "epoch": 1.104977375565611, + "grad_norm": 0.5804780721664429, + "learning_rate": 0.00048318386926516157, + "loss": 0.1137, + "mean_token_accuracy": 0.9633719325065613, + "num_tokens": 2698050.0, + "step": 306 + }, + { + "entropy": 2.201345145702362, + "epoch": 1.1085972850678734, + "grad_norm": 0.8606241941452026, + "learning_rate": 0.000482969828488459, + "loss": 0.2124, + "mean_token_accuracy": 0.9472681730985641, + "num_tokens": 2706704.0, + "step": 307 + }, + { + "entropy": 2.095236599445343, + "epoch": 1.1122171945701358, + "grad_norm": 0.7078782320022583, + "learning_rate": 0.0004827550207945147, + "loss": 0.1957, + "mean_token_accuracy": 0.9564679116010666, + "num_tokens": 2715745.0, + "step": 308 + }, + { + "entropy": 2.186302363872528, + "epoch": 1.1158371040723982, + "grad_norm": 0.7166503667831421, + "learning_rate": 0.0004825394469991956, + "loss": 0.1539, + "mean_token_accuracy": 0.9662427455186844, + "num_tokens": 2724296.0, + "step": 309 + }, + { + "entropy": 2.052559405565262, + "epoch": 1.1194570135746607, + "grad_norm": 0.6510501503944397, + "learning_rate": 0.00048232310792127846, + "loss": 0.1831, + "mean_token_accuracy": 0.9533994495868683, + "num_tokens": 2733482.0, + "step": 310 + }, + { + "entropy": 2.093154102563858, + "epoch": 1.123076923076923, + "grad_norm": 0.711121678352356, + "learning_rate": 0.0004821060043824466, + "loss": 0.2315, + "mean_token_accuracy": 0.9381555914878845, + "num_tokens": 2742912.0, + "step": 311 + }, + { + "entropy": 2.188497006893158, + "epoch": 1.1266968325791855, + "grad_norm": 0.6782490015029907, + "learning_rate": 0.00048188813720728707, + "loss": 0.2, + "mean_token_accuracy": 0.9501812607049942, + "num_tokens": 2751808.0, + "step": 312 + }, + { + "entropy": 2.0495824217796326, + "epoch": 1.130316742081448, + "grad_norm": 0.7644634246826172, + "learning_rate": 0.00048166950722328697, + "loss": 0.2152, + "mean_token_accuracy": 0.9440928995609283, + "num_tokens": 2761066.0, + "step": 313 + }, + { + "entropy": 2.1707025468349457, + "epoch": 1.1339366515837104, + "grad_norm": 0.655131459236145, + "learning_rate": 0.00048145011526083106, + "loss": 0.1637, + "mean_token_accuracy": 0.9500558227300644, + "num_tokens": 2769870.0, + "step": 314 + }, + { + "entropy": 2.1047372221946716, + "epoch": 1.1375565610859728, + "grad_norm": 0.5353516936302185, + "learning_rate": 0.0004812299621531979, + "loss": 0.1705, + "mean_token_accuracy": 0.9455999433994293, + "num_tokens": 2779383.0, + "step": 315 + }, + { + "entropy": 2.1921610236167908, + "epoch": 1.1411764705882352, + "grad_norm": 0.8998016119003296, + "learning_rate": 0.00048100904873655696, + "loss": 0.3918, + "mean_token_accuracy": 0.9382697492837906, + "num_tokens": 2788386.0, + "step": 316 + }, + { + "entropy": 2.0850723683834076, + "epoch": 1.1447963800904977, + "grad_norm": 0.867432713508606, + "learning_rate": 0.0004807873758499656, + "loss": 0.2196, + "mean_token_accuracy": 0.9498324394226074, + "num_tokens": 2797496.0, + "step": 317 + }, + { + "entropy": 2.1980925798416138, + "epoch": 1.14841628959276, + "grad_norm": 0.6076980233192444, + "learning_rate": 0.00048056494433536577, + "loss": 0.1086, + "mean_token_accuracy": 0.9642161130905151, + "num_tokens": 2805836.0, + "step": 318 + }, + { + "entropy": 2.15611070394516, + "epoch": 1.1520361990950225, + "grad_norm": 0.6276211738586426, + "learning_rate": 0.0004803417550375806, + "loss": 0.1463, + "mean_token_accuracy": 0.9622830748558044, + "num_tokens": 2814404.0, + "step": 319 + }, + { + "entropy": 2.0017230808734894, + "epoch": 1.155656108597285, + "grad_norm": 0.5840948820114136, + "learning_rate": 0.0004801178088043115, + "loss": 0.1869, + "mean_token_accuracy": 0.9506777077913284, + "num_tokens": 2823786.0, + "step": 320 + }, + { + "entropy": 2.1539418697357178, + "epoch": 1.1592760180995474, + "grad_norm": 1.074331283569336, + "learning_rate": 0.0004798931064861349, + "loss": 0.2797, + "mean_token_accuracy": 0.9271649420261383, + "num_tokens": 2832374.0, + "step": 321 + }, + { + "entropy": 1.930726408958435, + "epoch": 1.16289592760181, + "grad_norm": 0.5121958255767822, + "learning_rate": 0.0004796676489364988, + "loss": 0.1579, + "mean_token_accuracy": 0.9582571685314178, + "num_tokens": 2841561.0, + "step": 322 + }, + { + "entropy": 2.0205810368061066, + "epoch": 1.1665158371040725, + "grad_norm": 0.6360969543457031, + "learning_rate": 0.00047944143701171966, + "loss": 0.1582, + "mean_token_accuracy": 0.9620308429002762, + "num_tokens": 2850171.0, + "step": 323 + }, + { + "entropy": 1.9655758142471313, + "epoch": 1.170135746606335, + "grad_norm": 0.6647385358810425, + "learning_rate": 0.0004792144715709792, + "loss": 0.1594, + "mean_token_accuracy": 0.954497441649437, + "num_tokens": 2858905.0, + "step": 324 + }, + { + "entropy": 1.9725223183631897, + "epoch": 1.1737556561085973, + "grad_norm": 0.6429229974746704, + "learning_rate": 0.0004789867534763211, + "loss": 0.1407, + "mean_token_accuracy": 0.9645214527845383, + "num_tokens": 2867533.0, + "step": 325 + }, + { + "entropy": 1.9473685026168823, + "epoch": 1.1773755656108598, + "grad_norm": 0.811651349067688, + "learning_rate": 0.0004787582835926477, + "loss": 0.1608, + "mean_token_accuracy": 0.9479968994855881, + "num_tokens": 2876286.0, + "step": 326 + }, + { + "entropy": 1.8863109350204468, + "epoch": 1.1809954751131222, + "grad_norm": 0.5587059855461121, + "learning_rate": 0.00047852906278771686, + "loss": 0.131, + "mean_token_accuracy": 0.9684520065784454, + "num_tokens": 2885667.0, + "step": 327 + }, + { + "entropy": 1.8288891315460205, + "epoch": 1.1846153846153846, + "grad_norm": 0.8450536131858826, + "learning_rate": 0.0004782990919321383, + "loss": 0.2224, + "mean_token_accuracy": 0.9377491921186447, + "num_tokens": 2894765.0, + "step": 328 + }, + { + "entropy": 1.9347718358039856, + "epoch": 1.188235294117647, + "grad_norm": 0.7665867209434509, + "learning_rate": 0.0004780683718993705, + "loss": 0.167, + "mean_token_accuracy": 0.9583602845668793, + "num_tokens": 2903551.0, + "step": 329 + }, + { + "entropy": 1.9097798764705658, + "epoch": 1.1918552036199095, + "grad_norm": 0.7705667018890381, + "learning_rate": 0.00047783690356571784, + "loss": 0.2115, + "mean_token_accuracy": 0.9526428133249283, + "num_tokens": 2912197.0, + "step": 330 + }, + { + "entropy": 1.9174850285053253, + "epoch": 1.195475113122172, + "grad_norm": 0.5695499181747437, + "learning_rate": 0.00047760468781032634, + "loss": 0.1033, + "mean_token_accuracy": 0.969958484172821, + "num_tokens": 2920579.0, + "step": 331 + }, + { + "entropy": 1.8578442931175232, + "epoch": 1.1990950226244343, + "grad_norm": 0.7843735814094543, + "learning_rate": 0.000477371725515181, + "loss": 0.1664, + "mean_token_accuracy": 0.9545005410909653, + "num_tokens": 2929352.0, + "step": 332 + }, + { + "entropy": 1.8509328961372375, + "epoch": 1.2027149321266968, + "grad_norm": 0.5951048135757446, + "learning_rate": 0.0004771380175651026, + "loss": 0.1566, + "mean_token_accuracy": 0.9551403075456619, + "num_tokens": 2938387.0, + "step": 333 + }, + { + "entropy": 1.8236390948295593, + "epoch": 1.2063348416289592, + "grad_norm": 0.4988223910331726, + "learning_rate": 0.0004769035648477434, + "loss": 0.1242, + "mean_token_accuracy": 0.966319814324379, + "num_tokens": 2947741.0, + "step": 334 + }, + { + "entropy": 1.9594822525978088, + "epoch": 1.2099547511312216, + "grad_norm": 0.7550755143165588, + "learning_rate": 0.00047666836825358477, + "loss": 0.1591, + "mean_token_accuracy": 0.9666347652673721, + "num_tokens": 2956313.0, + "step": 335 + }, + { + "entropy": 1.9148444533348083, + "epoch": 1.213574660633484, + "grad_norm": 0.5889077186584473, + "learning_rate": 0.00047643242867593345, + "loss": 0.1343, + "mean_token_accuracy": 0.9611433297395706, + "num_tokens": 2964928.0, + "step": 336 + }, + { + "entropy": 1.8126957714557648, + "epoch": 1.2171945701357467, + "grad_norm": 0.5447750091552734, + "learning_rate": 0.0004761957470109179, + "loss": 0.1659, + "mean_token_accuracy": 0.9552300125360489, + "num_tokens": 2974160.0, + "step": 337 + }, + { + "entropy": 1.7981431782245636, + "epoch": 1.2208144796380092, + "grad_norm": 0.5400761365890503, + "learning_rate": 0.0004759583241574854, + "loss": 0.1339, + "mean_token_accuracy": 0.9620136916637421, + "num_tokens": 2982900.0, + "step": 338 + }, + { + "entropy": 1.8613979518413544, + "epoch": 1.2244343891402716, + "grad_norm": 0.7452914714813232, + "learning_rate": 0.0004757201610173981, + "loss": 0.4, + "mean_token_accuracy": 0.9068266004323959, + "num_tokens": 2991783.0, + "step": 339 + }, + { + "entropy": 1.8654026687145233, + "epoch": 1.228054298642534, + "grad_norm": 1.7142685651779175, + "learning_rate": 0.00047548125849523, + "loss": 0.3168, + "mean_token_accuracy": 0.9308896362781525, + "num_tokens": 3000530.0, + "step": 340 + }, + { + "entropy": 1.7702704071998596, + "epoch": 1.2316742081447964, + "grad_norm": 0.6687431931495667, + "learning_rate": 0.0004752416174983633, + "loss": 0.1697, + "mean_token_accuracy": 0.9530515670776367, + "num_tokens": 3009355.0, + "step": 341 + }, + { + "entropy": 1.735857516527176, + "epoch": 1.2352941176470589, + "grad_norm": 0.6127599477767944, + "learning_rate": 0.00047500123893698507, + "loss": 0.1706, + "mean_token_accuracy": 0.9593266248703003, + "num_tokens": 3018518.0, + "step": 342 + }, + { + "entropy": 1.7076368927955627, + "epoch": 1.2389140271493213, + "grad_norm": 0.6973987817764282, + "learning_rate": 0.0004747601237240836, + "loss": 0.1615, + "mean_token_accuracy": 0.9539438933134079, + "num_tokens": 3027752.0, + "step": 343 + }, + { + "entropy": 1.7353227138519287, + "epoch": 1.2425339366515837, + "grad_norm": 0.8406392335891724, + "learning_rate": 0.00047451827277544546, + "loss": 0.2063, + "mean_token_accuracy": 0.9488435834646225, + "num_tokens": 3036383.0, + "step": 344 + }, + { + "entropy": 1.6597246527671814, + "epoch": 1.2461538461538462, + "grad_norm": 0.5971431732177734, + "learning_rate": 0.00047427568700965107, + "loss": 0.1013, + "mean_token_accuracy": 0.9721864312887192, + "num_tokens": 3045375.0, + "step": 345 + }, + { + "entropy": 1.7100033462047577, + "epoch": 1.2497737556561086, + "grad_norm": 0.5883470773696899, + "learning_rate": 0.00047403236734807225, + "loss": 0.1164, + "mean_token_accuracy": 0.9664830714464188, + "num_tokens": 3054084.0, + "step": 346 + }, + { + "entropy": 1.7402609288692474, + "epoch": 1.253393665158371, + "grad_norm": 0.7355862855911255, + "learning_rate": 0.00047378831471486815, + "loss": 0.2007, + "mean_token_accuracy": 0.9560511559247971, + "num_tokens": 3062727.0, + "step": 347 + }, + { + "entropy": 1.79518261551857, + "epoch": 1.2570135746606335, + "grad_norm": 0.6006518006324768, + "learning_rate": 0.00047354353003698163, + "loss": 0.1085, + "mean_token_accuracy": 0.9598321914672852, + "num_tokens": 3071178.0, + "step": 348 + }, + { + "entropy": 1.7328391373157501, + "epoch": 1.260633484162896, + "grad_norm": 0.560342013835907, + "learning_rate": 0.0004732980142441362, + "loss": 0.1593, + "mean_token_accuracy": 0.9579409211874008, + "num_tokens": 3079927.0, + "step": 349 + }, + { + "entropy": 1.7356511652469635, + "epoch": 1.2642533936651583, + "grad_norm": 0.9149975776672363, + "learning_rate": 0.00047305176826883206, + "loss": 0.4064, + "mean_token_accuracy": 0.9265118837356567, + "num_tokens": 3089314.0, + "step": 350 + }, + { + "entropy": 1.8573569357395172, + "epoch": 1.2678733031674208, + "grad_norm": 0.8300670981407166, + "learning_rate": 0.0004728047930463428, + "loss": 0.195, + "mean_token_accuracy": 0.9453776180744171, + "num_tokens": 3097702.0, + "step": 351 + }, + { + "entropy": 1.7906217575073242, + "epoch": 1.2714932126696832, + "grad_norm": 0.5668906569480896, + "learning_rate": 0.0004725570895147118, + "loss": 0.1572, + "mean_token_accuracy": 0.962067037820816, + "num_tokens": 3106379.0, + "step": 352 + }, + { + "entropy": 1.6957395374774933, + "epoch": 1.2751131221719456, + "grad_norm": 0.4048328399658203, + "learning_rate": 0.0004723086586147487, + "loss": 0.0944, + "mean_token_accuracy": 0.9716819673776627, + "num_tokens": 3115622.0, + "step": 353 + }, + { + "entropy": 1.8158144056797028, + "epoch": 1.278733031674208, + "grad_norm": 0.6396092772483826, + "learning_rate": 0.00047205950129002564, + "loss": 0.1011, + "mean_token_accuracy": 0.9698463827371597, + "num_tokens": 3124016.0, + "step": 354 + }, + { + "entropy": 1.730194479227066, + "epoch": 1.2823529411764705, + "grad_norm": 0.662876307964325, + "learning_rate": 0.000471809618486874, + "loss": 0.1641, + "mean_token_accuracy": 0.9520179778337479, + "num_tokens": 3132712.0, + "step": 355 + }, + { + "entropy": 1.6776110529899597, + "epoch": 1.285972850678733, + "grad_norm": 0.868507981300354, + "learning_rate": 0.0004715590111543804, + "loss": 0.3374, + "mean_token_accuracy": 0.9303739666938782, + "num_tokens": 3142103.0, + "step": 356 + }, + { + "entropy": 1.6501678824424744, + "epoch": 1.2895927601809956, + "grad_norm": 0.5433686971664429, + "learning_rate": 0.0004713076802443834, + "loss": 0.1237, + "mean_token_accuracy": 0.9653612226247787, + "num_tokens": 3151192.0, + "step": 357 + }, + { + "entropy": 1.6524465382099152, + "epoch": 1.293212669683258, + "grad_norm": 0.6145523190498352, + "learning_rate": 0.00047105562671147, + "loss": 0.1204, + "mean_token_accuracy": 0.9690534323453903, + "num_tokens": 3159839.0, + "step": 358 + }, + { + "entropy": 1.5339214205741882, + "epoch": 1.2968325791855204, + "grad_norm": 0.500477135181427, + "learning_rate": 0.00047080285151297144, + "loss": 0.1295, + "mean_token_accuracy": 0.9571033865213394, + "num_tokens": 3169047.0, + "step": 359 + }, + { + "entropy": 1.6765435338020325, + "epoch": 1.3004524886877828, + "grad_norm": 0.6697553396224976, + "learning_rate": 0.00047054935560896026, + "loss": 0.135, + "mean_token_accuracy": 0.9672541171312332, + "num_tokens": 3177062.0, + "step": 360 + }, + { + "entropy": 1.5932062566280365, + "epoch": 1.3040723981900453, + "grad_norm": 0.706957221031189, + "learning_rate": 0.0004702951399622462, + "loss": 0.1229, + "mean_token_accuracy": 0.9634416699409485, + "num_tokens": 3185829.0, + "step": 361 + }, + { + "entropy": 1.5623145997524261, + "epoch": 1.3076923076923077, + "grad_norm": 0.6199461221694946, + "learning_rate": 0.00047004020553837275, + "loss": 0.1449, + "mean_token_accuracy": 0.9620065689086914, + "num_tokens": 3194426.0, + "step": 362 + }, + { + "entropy": 1.5226828753948212, + "epoch": 1.3113122171945701, + "grad_norm": 0.8962509036064148, + "learning_rate": 0.0004697845533056132, + "loss": 0.2207, + "mean_token_accuracy": 0.9403344839811325, + "num_tokens": 3203655.0, + "step": 363 + }, + { + "entropy": 1.5395641326904297, + "epoch": 1.3149321266968326, + "grad_norm": 0.5993619561195374, + "learning_rate": 0.00046952818423496727, + "loss": 0.1486, + "mean_token_accuracy": 0.9614185988903046, + "num_tokens": 3212069.0, + "step": 364 + }, + { + "entropy": 1.5738630294799805, + "epoch": 1.318552036199095, + "grad_norm": 0.7393983602523804, + "learning_rate": 0.00046927109930015756, + "loss": 0.1812, + "mean_token_accuracy": 0.9535021334886551, + "num_tokens": 3220482.0, + "step": 365 + }, + { + "entropy": 1.5462632775306702, + "epoch": 1.3221719457013574, + "grad_norm": 0.7453555464744568, + "learning_rate": 0.0004690132994776253, + "loss": 0.164, + "mean_token_accuracy": 0.9585814625024796, + "num_tokens": 3229505.0, + "step": 366 + }, + { + "entropy": 1.5241961777210236, + "epoch": 1.3257918552036199, + "grad_norm": 0.7553415298461914, + "learning_rate": 0.00046875478574652713, + "loss": 0.1445, + "mean_token_accuracy": 0.9682841598987579, + "num_tokens": 3238326.0, + "step": 367 + }, + { + "entropy": 1.5344699025154114, + "epoch": 1.3294117647058823, + "grad_norm": 0.8565949201583862, + "learning_rate": 0.0004684955590887311, + "loss": 0.2521, + "mean_token_accuracy": 0.920401468873024, + "num_tokens": 3247482.0, + "step": 368 + }, + { + "entropy": 1.5109277665615082, + "epoch": 1.3330316742081447, + "grad_norm": 0.5170580148696899, + "learning_rate": 0.00046823562048881295, + "loss": 0.1393, + "mean_token_accuracy": 0.9584086239337921, + "num_tokens": 3256464.0, + "step": 369 + }, + { + "entropy": 1.4666939079761505, + "epoch": 1.3366515837104074, + "grad_norm": 0.6995373368263245, + "learning_rate": 0.0004679749709340529, + "loss": 0.1726, + "mean_token_accuracy": 0.9477890431880951, + "num_tokens": 3265853.0, + "step": 370 + }, + { + "entropy": 1.4208430051803589, + "epoch": 1.3402714932126698, + "grad_norm": 1.1363991498947144, + "learning_rate": 0.000467713611414431, + "loss": 0.196, + "mean_token_accuracy": 0.9495431333780289, + "num_tokens": 3275367.0, + "step": 371 + }, + { + "entropy": 1.5009459853172302, + "epoch": 1.3438914027149322, + "grad_norm": 0.7883325219154358, + "learning_rate": 0.00046745154292262414, + "loss": 0.2526, + "mean_token_accuracy": 0.9334618002176285, + "num_tokens": 3284772.0, + "step": 372 + }, + { + "entropy": 1.5485479533672333, + "epoch": 1.3475113122171947, + "grad_norm": 0.6516429781913757, + "learning_rate": 0.00046718876645400156, + "loss": 0.2057, + "mean_token_accuracy": 0.9546459317207336, + "num_tokens": 3293493.0, + "step": 373 + }, + { + "entropy": 1.6237249970436096, + "epoch": 1.351131221719457, + "grad_norm": 0.8916263580322266, + "learning_rate": 0.00046692528300662213, + "loss": 0.2123, + "mean_token_accuracy": 0.9456845372915268, + "num_tokens": 3302063.0, + "step": 374 + }, + { + "entropy": 1.561572015285492, + "epoch": 1.3547511312217195, + "grad_norm": 0.7527791857719421, + "learning_rate": 0.00046666109358122935, + "loss": 0.2113, + "mean_token_accuracy": 0.9537477940320969, + "num_tokens": 3311037.0, + "step": 375 + }, + { + "entropy": 1.5594256818294525, + "epoch": 1.358371040723982, + "grad_norm": 1.25638747215271, + "learning_rate": 0.0004663961991812485, + "loss": 0.1629, + "mean_token_accuracy": 0.9508458077907562, + "num_tokens": 3319635.0, + "step": 376 + }, + { + "entropy": 1.6909976303577423, + "epoch": 1.3619909502262444, + "grad_norm": 0.7627813220024109, + "learning_rate": 0.00046613060081278194, + "loss": 0.2303, + "mean_token_accuracy": 0.9425801336765289, + "num_tokens": 3328043.0, + "step": 377 + }, + { + "entropy": 1.6074829697608948, + "epoch": 1.3656108597285068, + "grad_norm": 0.6584346294403076, + "learning_rate": 0.00046586429948460646, + "loss": 0.1815, + "mean_token_accuracy": 0.9536214470863342, + "num_tokens": 3337143.0, + "step": 378 + }, + { + "entropy": 1.7382183969020844, + "epoch": 1.3692307692307693, + "grad_norm": 1.37154221534729, + "learning_rate": 0.0004655972962081684, + "loss": 0.1849, + "mean_token_accuracy": 0.948440819978714, + "num_tokens": 3346033.0, + "step": 379 + }, + { + "entropy": 1.7148900926113129, + "epoch": 1.3728506787330317, + "grad_norm": 0.9487980604171753, + "learning_rate": 0.00046532959199758, + "loss": 0.2521, + "mean_token_accuracy": 0.9344504028558731, + "num_tokens": 3354849.0, + "step": 380 + }, + { + "entropy": 1.7164019346237183, + "epoch": 1.3764705882352941, + "grad_norm": 0.5609025359153748, + "learning_rate": 0.00046506118786961614, + "loss": 0.1425, + "mean_token_accuracy": 0.9571309834718704, + "num_tokens": 3363674.0, + "step": 381 + }, + { + "entropy": 1.894619107246399, + "epoch": 1.3800904977375565, + "grad_norm": 0.9811336994171143, + "learning_rate": 0.00046479208484370997, + "loss": 0.2522, + "mean_token_accuracy": 0.9424156546592712, + "num_tokens": 3372325.0, + "step": 382 + }, + { + "entropy": 1.78870290517807, + "epoch": 1.383710407239819, + "grad_norm": 0.5707085132598877, + "learning_rate": 0.00046452228394194893, + "loss": 0.1354, + "mean_token_accuracy": 0.9613165706396103, + "num_tokens": 3381270.0, + "step": 383 + }, + { + "entropy": 1.803922712802887, + "epoch": 1.3873303167420814, + "grad_norm": 0.5655364394187927, + "learning_rate": 0.0004642517861890713, + "loss": 0.0818, + "mean_token_accuracy": 0.9776160269975662, + "num_tokens": 3390363.0, + "step": 384 + }, + { + "entropy": 1.8172507882118225, + "epoch": 1.3909502262443438, + "grad_norm": 0.6950513124465942, + "learning_rate": 0.00046398059261246205, + "loss": 0.1145, + "mean_token_accuracy": 0.963288351893425, + "num_tokens": 3399176.0, + "step": 385 + }, + { + "entropy": 1.9182518422603607, + "epoch": 1.3945701357466063, + "grad_norm": 0.5900619029998779, + "learning_rate": 0.0004637087042421489, + "loss": 0.108, + "mean_token_accuracy": 0.9723307639360428, + "num_tokens": 3407978.0, + "step": 386 + }, + { + "entropy": 1.8558574616909027, + "epoch": 1.3981900452488687, + "grad_norm": 0.6279832124710083, + "learning_rate": 0.00046343612211079843, + "loss": 0.1471, + "mean_token_accuracy": 0.9603912532329559, + "num_tokens": 3416856.0, + "step": 387 + }, + { + "entropy": 1.8146779537200928, + "epoch": 1.4018099547511311, + "grad_norm": 0.6171274781227112, + "learning_rate": 0.0004631628472537125, + "loss": 0.1872, + "mean_token_accuracy": 0.9447146654129028, + "num_tokens": 3426044.0, + "step": 388 + }, + { + "entropy": 1.9342225790023804, + "epoch": 1.4054298642533936, + "grad_norm": 0.9947887659072876, + "learning_rate": 0.00046288888070882374, + "loss": 0.2966, + "mean_token_accuracy": 0.9279204607009888, + "num_tokens": 3435154.0, + "step": 389 + }, + { + "entropy": 1.9391801953315735, + "epoch": 1.409049773755656, + "grad_norm": 0.7155653834342957, + "learning_rate": 0.000462614223516692, + "loss": 0.1847, + "mean_token_accuracy": 0.9475171864032745, + "num_tokens": 3444563.0, + "step": 390 + }, + { + "entropy": 2.0716978013515472, + "epoch": 1.4126696832579184, + "grad_norm": 0.8198989629745483, + "learning_rate": 0.0004623388767205004, + "loss": 0.1317, + "mean_token_accuracy": 0.9608721435070038, + "num_tokens": 3453410.0, + "step": 391 + }, + { + "entropy": 2.1060431599617004, + "epoch": 1.416289592760181, + "grad_norm": 1.025406002998352, + "learning_rate": 0.00046206284136605106, + "loss": 0.2146, + "mean_token_accuracy": 0.9414294511079788, + "num_tokens": 3461958.0, + "step": 392 + }, + { + "entropy": 2.1459922194480896, + "epoch": 1.4199095022624435, + "grad_norm": 0.9209627509117126, + "learning_rate": 0.00046178611850176146, + "loss": 0.2137, + "mean_token_accuracy": 0.956874743103981, + "num_tokens": 3470547.0, + "step": 393 + }, + { + "entropy": 2.0233450531959534, + "epoch": 1.423529411764706, + "grad_norm": 0.5777944922447205, + "learning_rate": 0.00046150870917866025, + "loss": 0.122, + "mean_token_accuracy": 0.9672323018312454, + "num_tokens": 3479618.0, + "step": 394 + }, + { + "entropy": 2.035937190055847, + "epoch": 1.4271493212669684, + "grad_norm": 0.7945542931556702, + "learning_rate": 0.0004612306144503835, + "loss": 0.2879, + "mean_token_accuracy": 0.946587473154068, + "num_tokens": 3488533.0, + "step": 395 + }, + { + "entropy": 2.155315637588501, + "epoch": 1.4307692307692308, + "grad_norm": 0.6385292410850525, + "learning_rate": 0.00046095183537317035, + "loss": 0.1008, + "mean_token_accuracy": 0.9655124247074127, + "num_tokens": 3496686.0, + "step": 396 + }, + { + "entropy": 2.186827063560486, + "epoch": 1.4343891402714932, + "grad_norm": 0.4759826958179474, + "learning_rate": 0.0004606723730058593, + "loss": 0.0768, + "mean_token_accuracy": 0.9783597737550735, + "num_tokens": 3504958.0, + "step": 397 + }, + { + "entropy": 1.974392294883728, + "epoch": 1.4380090497737557, + "grad_norm": 0.6250292062759399, + "learning_rate": 0.00046039222840988406, + "loss": 0.1381, + "mean_token_accuracy": 0.9586146324872971, + "num_tokens": 3513694.0, + "step": 398 + }, + { + "entropy": 2.045738846063614, + "epoch": 1.441628959276018, + "grad_norm": 0.5517769455909729, + "learning_rate": 0.0004601114026492695, + "loss": 0.1312, + "mean_token_accuracy": 0.9682512134313583, + "num_tokens": 3522395.0, + "step": 399 + }, + { + "entropy": 2.105030357837677, + "epoch": 1.4452488687782805, + "grad_norm": 0.6748242974281311, + "learning_rate": 0.0004598298967906276, + "loss": 0.1056, + "mean_token_accuracy": 0.9701305478811264, + "num_tokens": 3530838.0, + "step": 400 + }, + { + "entropy": 2.024325281381607, + "epoch": 1.448868778280543, + "grad_norm": 0.6320233941078186, + "learning_rate": 0.00045954771190315344, + "loss": 0.1129, + "mean_token_accuracy": 0.9633017927408218, + "num_tokens": 3540184.0, + "step": 401 + }, + { + "entropy": 2.1561593413352966, + "epoch": 1.4524886877828054, + "grad_norm": 0.7380363941192627, + "learning_rate": 0.0004592648490586213, + "loss": 0.1304, + "mean_token_accuracy": 0.9599586874246597, + "num_tokens": 3548727.0, + "step": 402 + }, + { + "entropy": 2.2986454367637634, + "epoch": 1.4561085972850678, + "grad_norm": 0.669114351272583, + "learning_rate": 0.00045898130933138024, + "loss": 0.1005, + "mean_token_accuracy": 0.9724964797496796, + "num_tokens": 3556780.0, + "step": 403 + }, + { + "entropy": 2.103136509656906, + "epoch": 1.4597285067873302, + "grad_norm": 0.6677402853965759, + "learning_rate": 0.0004586970937983504, + "loss": 0.1177, + "mean_token_accuracy": 0.9597653448581696, + "num_tokens": 3565427.0, + "step": 404 + }, + { + "entropy": 2.112696200609207, + "epoch": 1.463348416289593, + "grad_norm": 0.4597342014312744, + "learning_rate": 0.0004584122035390185, + "loss": 0.0695, + "mean_token_accuracy": 0.9763098359107971, + "num_tokens": 3573902.0, + "step": 405 + }, + { + "entropy": 2.0472628474235535, + "epoch": 1.4669683257918553, + "grad_norm": 0.7842056751251221, + "learning_rate": 0.0004581266396354339, + "loss": 0.1981, + "mean_token_accuracy": 0.9521032422780991, + "num_tokens": 3582913.0, + "step": 406 + }, + { + "entropy": 2.236558735370636, + "epoch": 1.4705882352941178, + "grad_norm": 0.7634767293930054, + "learning_rate": 0.000457840403172205, + "loss": 0.1956, + "mean_token_accuracy": 0.9602932929992676, + "num_tokens": 3591197.0, + "step": 407 + }, + { + "entropy": 2.182949125766754, + "epoch": 1.4742081447963802, + "grad_norm": 0.7084661722183228, + "learning_rate": 0.00045755349523649415, + "loss": 0.2463, + "mean_token_accuracy": 0.9392582327127457, + "num_tokens": 3600134.0, + "step": 408 + }, + { + "entropy": 2.135133147239685, + "epoch": 1.4778280542986426, + "grad_norm": 0.8172940015792847, + "learning_rate": 0.00045726591691801433, + "loss": 0.2375, + "mean_token_accuracy": 0.9458330571651459, + "num_tokens": 3608945.0, + "step": 409 + }, + { + "entropy": 2.157473146915436, + "epoch": 1.481447963800905, + "grad_norm": 0.6165594458580017, + "learning_rate": 0.0004569776693090246, + "loss": 0.1628, + "mean_token_accuracy": 0.9586529731750488, + "num_tokens": 3617790.0, + "step": 410 + }, + { + "entropy": 2.15165376663208, + "epoch": 1.4850678733031675, + "grad_norm": 0.6619407534599304, + "learning_rate": 0.0004566887535043263, + "loss": 0.1866, + "mean_token_accuracy": 0.9545126557350159, + "num_tokens": 3626937.0, + "step": 411 + }, + { + "entropy": 2.271161735057831, + "epoch": 1.48868778280543, + "grad_norm": 0.5861835479736328, + "learning_rate": 0.0004563991706012582, + "loss": 0.1409, + "mean_token_accuracy": 0.9595955163240433, + "num_tokens": 3636025.0, + "step": 412 + }, + { + "entropy": 2.277799427509308, + "epoch": 1.4923076923076923, + "grad_norm": 0.6464956402778625, + "learning_rate": 0.00045610892169969323, + "loss": 0.0792, + "mean_token_accuracy": 0.9806316941976547, + "num_tokens": 3644746.0, + "step": 413 + }, + { + "entropy": 2.2143171429634094, + "epoch": 1.4959276018099548, + "grad_norm": 0.7531687021255493, + "learning_rate": 0.00045581800790203366, + "loss": 0.2584, + "mean_token_accuracy": 0.9225966930389404, + "num_tokens": 3654064.0, + "step": 414 + }, + { + "entropy": 2.231681764125824, + "epoch": 1.4995475113122172, + "grad_norm": 0.6902768015861511, + "learning_rate": 0.00045552643031320726, + "loss": 0.232, + "mean_token_accuracy": 0.9433842301368713, + "num_tokens": 3663130.0, + "step": 415 + }, + { + "entropy": 2.2672717571258545, + "epoch": 1.5031674208144796, + "grad_norm": 0.5134314894676208, + "learning_rate": 0.00045523419004066273, + "loss": 0.0874, + "mean_token_accuracy": 0.9708191752433777, + "num_tokens": 3671981.0, + "step": 416 + }, + { + "entropy": 2.3302834033966064, + "epoch": 1.506787330316742, + "grad_norm": 0.885969340801239, + "learning_rate": 0.0004549412881943659, + "loss": 0.0723, + "mean_token_accuracy": 0.9791463166475296, + "num_tokens": 3680525.0, + "step": 417 + }, + { + "entropy": 2.2693899869918823, + "epoch": 1.5104072398190045, + "grad_norm": 0.7424856424331665, + "learning_rate": 0.00045464772588679547, + "loss": 0.1509, + "mean_token_accuracy": 0.9600907415151596, + "num_tokens": 3689430.0, + "step": 418 + }, + { + "entropy": 2.4042725563049316, + "epoch": 1.514027149321267, + "grad_norm": 0.8968034982681274, + "learning_rate": 0.0004543535042329382, + "loss": 0.1984, + "mean_token_accuracy": 0.9488537162542343, + "num_tokens": 3697836.0, + "step": 419 + }, + { + "entropy": 2.2518428564071655, + "epoch": 1.5176470588235293, + "grad_norm": 0.5963534712791443, + "learning_rate": 0.0004540586243502858, + "loss": 0.1214, + "mean_token_accuracy": 0.9711381644010544, + "num_tokens": 3706675.0, + "step": 420 + }, + { + "entropy": 2.275522291660309, + "epoch": 1.5212669683257918, + "grad_norm": 1.0797090530395508, + "learning_rate": 0.0004537630873588293, + "loss": 0.2508, + "mean_token_accuracy": 0.9247037768363953, + "num_tokens": 3715631.0, + "step": 421 + }, + { + "entropy": 2.249617278575897, + "epoch": 1.5248868778280542, + "grad_norm": 0.7636313438415527, + "learning_rate": 0.000453466894381056, + "loss": 0.1112, + "mean_token_accuracy": 0.9681926071643829, + "num_tokens": 3724579.0, + "step": 422 + }, + { + "entropy": 2.280571699142456, + "epoch": 1.5285067873303166, + "grad_norm": 0.9915648698806763, + "learning_rate": 0.00045317004654194464, + "loss": 0.3532, + "mean_token_accuracy": 0.9360047876834869, + "num_tokens": 3733607.0, + "step": 423 + }, + { + "entropy": 2.241512656211853, + "epoch": 1.532126696832579, + "grad_norm": 0.924977719783783, + "learning_rate": 0.0004528725449689611, + "loss": 0.1997, + "mean_token_accuracy": 0.9475428760051727, + "num_tokens": 3742611.0, + "step": 424 + }, + { + "entropy": 2.201731503009796, + "epoch": 1.5357466063348415, + "grad_norm": 0.7018861770629883, + "learning_rate": 0.0004525743907920542, + "loss": 0.1683, + "mean_token_accuracy": 0.9465018659830093, + "num_tokens": 3751737.0, + "step": 425 + }, + { + "entropy": 2.28944593667984, + "epoch": 1.539366515837104, + "grad_norm": 0.5893452763557434, + "learning_rate": 0.00045227558514365166, + "loss": 0.0969, + "mean_token_accuracy": 0.9711766839027405, + "num_tokens": 3761245.0, + "step": 426 + }, + { + "entropy": 2.3497202396392822, + "epoch": 1.5429864253393664, + "grad_norm": 0.685279130935669, + "learning_rate": 0.0004519761291586551, + "loss": 0.106, + "mean_token_accuracy": 0.9663016647100449, + "num_tokens": 3769854.0, + "step": 427 + }, + { + "entropy": 2.308362066745758, + "epoch": 1.5466063348416288, + "grad_norm": 0.5116177797317505, + "learning_rate": 0.00045167602397443694, + "loss": 0.1132, + "mean_token_accuracy": 0.9700013697147369, + "num_tokens": 3778996.0, + "step": 428 + }, + { + "entropy": 2.238637685775757, + "epoch": 1.5502262443438914, + "grad_norm": 0.8374833464622498, + "learning_rate": 0.00045137527073083457, + "loss": 0.2539, + "mean_token_accuracy": 0.9407305717468262, + "num_tokens": 3787835.0, + "step": 429 + }, + { + "entropy": 2.3406758308410645, + "epoch": 1.5538461538461539, + "grad_norm": 0.5140913724899292, + "learning_rate": 0.0004510738705701473, + "loss": 0.1113, + "mean_token_accuracy": 0.9635641574859619, + "num_tokens": 3796498.0, + "step": 430 + }, + { + "entropy": 2.2642539143562317, + "epoch": 1.5574660633484163, + "grad_norm": 0.5750702023506165, + "learning_rate": 0.0004507718246371313, + "loss": 0.1127, + "mean_token_accuracy": 0.9660817235708237, + "num_tokens": 3805464.0, + "step": 431 + }, + { + "entropy": 2.2058264315128326, + "epoch": 1.5610859728506787, + "grad_norm": 0.6448659300804138, + "learning_rate": 0.0004504691340789955, + "loss": 0.0994, + "mean_token_accuracy": 0.96739861369133, + "num_tokens": 3814309.0, + "step": 432 + }, + { + "entropy": 2.330399215221405, + "epoch": 1.5647058823529412, + "grad_norm": 0.8432528376579285, + "learning_rate": 0.0004501658000453973, + "loss": 0.1999, + "mean_token_accuracy": 0.9510775059461594, + "num_tokens": 3823126.0, + "step": 433 + }, + { + "entropy": 2.4211326837539673, + "epoch": 1.5683257918552036, + "grad_norm": 0.8101194500923157, + "learning_rate": 0.00044986182368843806, + "loss": 0.144, + "mean_token_accuracy": 0.9656328558921814, + "num_tokens": 3831274.0, + "step": 434 + }, + { + "entropy": 2.2594956755638123, + "epoch": 1.571945701357466, + "grad_norm": 0.6753663420677185, + "learning_rate": 0.0004495572061626585, + "loss": 0.1433, + "mean_token_accuracy": 0.9572386592626572, + "num_tokens": 3840206.0, + "step": 435 + }, + { + "entropy": 2.1233682930469513, + "epoch": 1.5755656108597285, + "grad_norm": 0.48616713285446167, + "learning_rate": 0.000449251948625035, + "loss": 0.0934, + "mean_token_accuracy": 0.9740773588418961, + "num_tokens": 3849363.0, + "step": 436 + }, + { + "entropy": 2.325556695461273, + "epoch": 1.5791855203619911, + "grad_norm": 0.7744045853614807, + "learning_rate": 0.00044894605223497446, + "loss": 0.127, + "mean_token_accuracy": 0.9687052518129349, + "num_tokens": 3857733.0, + "step": 437 + }, + { + "entropy": 2.266542673110962, + "epoch": 1.5828054298642535, + "grad_norm": 2.373530387878418, + "learning_rate": 0.00044863951815431045, + "loss": 0.2404, + "mean_token_accuracy": 0.9437267184257507, + "num_tokens": 3866374.0, + "step": 438 + }, + { + "entropy": 2.1757248640060425, + "epoch": 1.586425339366516, + "grad_norm": 0.5588560700416565, + "learning_rate": 0.00044833234754729847, + "loss": 0.142, + "mean_token_accuracy": 0.9601300358772278, + "num_tokens": 3875520.0, + "step": 439 + }, + { + "entropy": 2.124377518892288, + "epoch": 1.5900452488687784, + "grad_norm": 0.5602438449859619, + "learning_rate": 0.0004480245415806116, + "loss": 0.1556, + "mean_token_accuracy": 0.9561446160078049, + "num_tokens": 3884345.0, + "step": 440 + }, + { + "entropy": 2.1571075320243835, + "epoch": 1.5936651583710408, + "grad_norm": 0.472598671913147, + "learning_rate": 0.0004477161014233361, + "loss": 0.0848, + "mean_token_accuracy": 0.9742853343486786, + "num_tokens": 3893129.0, + "step": 441 + }, + { + "entropy": 2.0434057414531708, + "epoch": 1.5972850678733033, + "grad_norm": 0.7104448676109314, + "learning_rate": 0.00044740702824696703, + "loss": 0.1524, + "mean_token_accuracy": 0.9542464315891266, + "num_tokens": 3902120.0, + "step": 442 + }, + { + "entropy": 2.1118403673171997, + "epoch": 1.6009049773755657, + "grad_norm": 0.6632394194602966, + "learning_rate": 0.0004470973232254037, + "loss": 0.3001, + "mean_token_accuracy": 0.928197592496872, + "num_tokens": 3910974.0, + "step": 443 + }, + { + "entropy": 2.0292475819587708, + "epoch": 1.6045248868778281, + "grad_norm": 1.050956130027771, + "learning_rate": 0.00044678698753494527, + "loss": 0.2226, + "mean_token_accuracy": 0.9448522627353668, + "num_tokens": 3920005.0, + "step": 444 + }, + { + "entropy": 1.991033524274826, + "epoch": 1.6081447963800906, + "grad_norm": 0.670244038105011, + "learning_rate": 0.00044647602235428624, + "loss": 0.2158, + "mean_token_accuracy": 0.9551118016242981, + "num_tokens": 3929334.0, + "step": 445 + }, + { + "entropy": 2.04949289560318, + "epoch": 1.611764705882353, + "grad_norm": 0.6321494579315186, + "learning_rate": 0.00044616442886451197, + "loss": 0.1743, + "mean_token_accuracy": 0.9494802355766296, + "num_tokens": 3938211.0, + "step": 446 + }, + { + "entropy": 2.1101951897144318, + "epoch": 1.6153846153846154, + "grad_norm": 0.6970012187957764, + "learning_rate": 0.0004458522082490943, + "loss": 0.1228, + "mean_token_accuracy": 0.9624926447868347, + "num_tokens": 3946534.0, + "step": 447 + }, + { + "entropy": 1.9337081909179688, + "epoch": 1.6190045248868778, + "grad_norm": 0.5971657633781433, + "learning_rate": 0.0004455393616938868, + "loss": 0.1431, + "mean_token_accuracy": 0.9635348320007324, + "num_tokens": 3955694.0, + "step": 448 + }, + { + "entropy": 1.9635128676891327, + "epoch": 1.6226244343891403, + "grad_norm": 0.8510827422142029, + "learning_rate": 0.00044522589038712074, + "loss": 0.2446, + "mean_token_accuracy": 0.9457641988992691, + "num_tokens": 3964907.0, + "step": 449 + }, + { + "entropy": 2.0336360335350037, + "epoch": 1.6262443438914027, + "grad_norm": 0.5803818106651306, + "learning_rate": 0.00044491179551939985, + "loss": 0.0872, + "mean_token_accuracy": 0.9734505414962769, + "num_tokens": 3973584.0, + "step": 450 + }, + { + "entropy": 2.0668878853321075, + "epoch": 1.6298642533936651, + "grad_norm": 0.6990496516227722, + "learning_rate": 0.0004445970782836967, + "loss": 0.1138, + "mean_token_accuracy": 0.9702571034431458, + "num_tokens": 3982632.0, + "step": 451 + }, + { + "entropy": 2.1481760144233704, + "epoch": 1.6334841628959276, + "grad_norm": 0.6156729459762573, + "learning_rate": 0.00044428173987534733, + "loss": 0.0936, + "mean_token_accuracy": 0.9739355593919754, + "num_tokens": 3991147.0, + "step": 452 + }, + { + "entropy": 2.0678701996803284, + "epoch": 1.63710407239819, + "grad_norm": 0.5441684126853943, + "learning_rate": 0.0004439657814920472, + "loss": 0.123, + "mean_token_accuracy": 0.9693446308374405, + "num_tokens": 3999990.0, + "step": 453 + }, + { + "entropy": 1.9867055118083954, + "epoch": 1.6407239819004524, + "grad_norm": 0.9218093156814575, + "learning_rate": 0.00044364920433384656, + "loss": 0.1997, + "mean_token_accuracy": 0.9564195573329926, + "num_tokens": 4009097.0, + "step": 454 + }, + { + "entropy": 2.145586997270584, + "epoch": 1.6443438914027149, + "grad_norm": 0.77643883228302, + "learning_rate": 0.0004433320096031458, + "loss": 0.1491, + "mean_token_accuracy": 0.9602408111095428, + "num_tokens": 4018059.0, + "step": 455 + }, + { + "entropy": 2.071108251810074, + "epoch": 1.6479638009049773, + "grad_norm": 0.5267088413238525, + "learning_rate": 0.0004430141985046909, + "loss": 0.0875, + "mean_token_accuracy": 0.9764399826526642, + "num_tokens": 4027089.0, + "step": 456 + }, + { + "entropy": 2.1659318804740906, + "epoch": 1.6515837104072397, + "grad_norm": 1.0642318725585938, + "learning_rate": 0.000442695772245569, + "loss": 0.2623, + "mean_token_accuracy": 0.9307756721973419, + "num_tokens": 4035719.0, + "step": 457 + }, + { + "entropy": 2.0232724249362946, + "epoch": 1.6552036199095022, + "grad_norm": 0.6213289499282837, + "learning_rate": 0.0004423767320352035, + "loss": 0.1597, + "mean_token_accuracy": 0.9599647223949432, + "num_tokens": 4045088.0, + "step": 458 + }, + { + "entropy": 2.047410547733307, + "epoch": 1.6588235294117646, + "grad_norm": 0.6346105933189392, + "learning_rate": 0.0004420570790853498, + "loss": 0.1422, + "mean_token_accuracy": 0.9649711549282074, + "num_tokens": 4054262.0, + "step": 459 + }, + { + "entropy": 2.0923012793064117, + "epoch": 1.662443438914027, + "grad_norm": 0.46477749943733215, + "learning_rate": 0.0004417368146100907, + "loss": 0.079, + "mean_token_accuracy": 0.9777993708848953, + "num_tokens": 4063107.0, + "step": 460 + }, + { + "entropy": 2.168913394212723, + "epoch": 1.6660633484162894, + "grad_norm": 0.5164734721183777, + "learning_rate": 0.0004414159398258312, + "loss": 0.0941, + "mean_token_accuracy": 0.9725133627653122, + "num_tokens": 4071656.0, + "step": 461 + }, + { + "entropy": 2.152670443058014, + "epoch": 1.6696832579185519, + "grad_norm": 0.8985757231712341, + "learning_rate": 0.00044109445595129495, + "loss": 0.2142, + "mean_token_accuracy": 0.9387252777814865, + "num_tokens": 4080023.0, + "step": 462 + }, + { + "entropy": 2.111784875392914, + "epoch": 1.6733031674208145, + "grad_norm": 0.47521084547042847, + "learning_rate": 0.0004407723642075184, + "loss": 0.0581, + "mean_token_accuracy": 0.9821985810995102, + "num_tokens": 4088469.0, + "step": 463 + }, + { + "entropy": 1.9784683287143707, + "epoch": 1.676923076923077, + "grad_norm": 0.5552536249160767, + "learning_rate": 0.0004404496658178472, + "loss": 0.1353, + "mean_token_accuracy": 0.9619844257831573, + "num_tokens": 4097737.0, + "step": 464 + }, + { + "entropy": 2.015674114227295, + "epoch": 1.6805429864253394, + "grad_norm": 0.6078305244445801, + "learning_rate": 0.0004401263620079309, + "loss": 0.1916, + "mean_token_accuracy": 0.9506707191467285, + "num_tokens": 4107156.0, + "step": 465 + }, + { + "entropy": 2.0832217931747437, + "epoch": 1.6841628959276018, + "grad_norm": 0.6618755459785461, + "learning_rate": 0.0004398024540057186, + "loss": 0.1671, + "mean_token_accuracy": 0.9617152661085129, + "num_tokens": 4116019.0, + "step": 466 + }, + { + "entropy": 2.0383114516735077, + "epoch": 1.6877828054298643, + "grad_norm": 0.5774693489074707, + "learning_rate": 0.0004394779430414541, + "loss": 0.2647, + "mean_token_accuracy": 0.9387127161026001, + "num_tokens": 4125001.0, + "step": 467 + }, + { + "entropy": 2.201409190893173, + "epoch": 1.6914027149321267, + "grad_norm": 0.7600311636924744, + "learning_rate": 0.0004391528303476715, + "loss": 0.073, + "mean_token_accuracy": 0.979825034737587, + "num_tokens": 4133467.0, + "step": 468 + }, + { + "entropy": 2.168666422367096, + "epoch": 1.6950226244343891, + "grad_norm": 0.7801902294158936, + "learning_rate": 0.00043882711715919015, + "loss": 0.2406, + "mean_token_accuracy": 0.9451306313276291, + "num_tokens": 4141765.0, + "step": 469 + }, + { + "entropy": 2.1429262161254883, + "epoch": 1.6986425339366515, + "grad_norm": 0.5192358493804932, + "learning_rate": 0.0004385008047131104, + "loss": 0.1052, + "mean_token_accuracy": 0.9749262481927872, + "num_tokens": 4150732.0, + "step": 470 + }, + { + "entropy": 2.1387495696544647, + "epoch": 1.702262443438914, + "grad_norm": 0.6219777464866638, + "learning_rate": 0.0004381738942488083, + "loss": 0.2127, + "mean_token_accuracy": 0.9398418068885803, + "num_tokens": 4159715.0, + "step": 471 + }, + { + "entropy": 2.1718398332595825, + "epoch": 1.7058823529411766, + "grad_norm": 0.5738123655319214, + "learning_rate": 0.0004378463870079316, + "loss": 0.1703, + "mean_token_accuracy": 0.9520847648382187, + "num_tokens": 4168526.0, + "step": 472 + }, + { + "entropy": 2.2768235206604004, + "epoch": 1.709502262443439, + "grad_norm": 0.662564754486084, + "learning_rate": 0.00043751828423439456, + "loss": 0.138, + "mean_token_accuracy": 0.9581841826438904, + "num_tokens": 4177189.0, + "step": 473 + }, + { + "entropy": 2.29143089056015, + "epoch": 1.7131221719457015, + "grad_norm": 0.8638074398040771, + "learning_rate": 0.00043718958717437324, + "loss": 0.1432, + "mean_token_accuracy": 0.9645630270242691, + "num_tokens": 4185367.0, + "step": 474 + }, + { + "entropy": 2.2810245156288147, + "epoch": 1.716742081447964, + "grad_norm": 0.6139346957206726, + "learning_rate": 0.00043686029707630097, + "loss": 0.173, + "mean_token_accuracy": 0.9592728316783905, + "num_tokens": 4194418.0, + "step": 475 + }, + { + "entropy": 2.1307725310325623, + "epoch": 1.7203619909502263, + "grad_norm": 0.5192779302597046, + "learning_rate": 0.00043653041519086354, + "loss": 0.1025, + "mean_token_accuracy": 0.970764696598053, + "num_tokens": 4203705.0, + "step": 476 + }, + { + "entropy": 2.160595118999481, + "epoch": 1.7239819004524888, + "grad_norm": 0.7398526668548584, + "learning_rate": 0.0004361999427709943, + "loss": 0.229, + "mean_token_accuracy": 0.9352773874998093, + "num_tokens": 4212648.0, + "step": 477 + }, + { + "entropy": 2.1865442991256714, + "epoch": 1.7276018099547512, + "grad_norm": 0.6227203011512756, + "learning_rate": 0.0004358688810718699, + "loss": 0.1118, + "mean_token_accuracy": 0.9689576476812363, + "num_tokens": 4221208.0, + "step": 478 + }, + { + "entropy": 2.086527943611145, + "epoch": 1.7312217194570136, + "grad_norm": 0.722144603729248, + "learning_rate": 0.00043553723135090447, + "loss": 0.1656, + "mean_token_accuracy": 0.9537550210952759, + "num_tokens": 4230810.0, + "step": 479 + }, + { + "entropy": 2.068355441093445, + "epoch": 1.734841628959276, + "grad_norm": 0.5781517028808594, + "learning_rate": 0.0004352049948677462, + "loss": 0.1497, + "mean_token_accuracy": 0.9600837379693985, + "num_tokens": 4240394.0, + "step": 480 + }, + { + "entropy": 2.185140371322632, + "epoch": 1.7384615384615385, + "grad_norm": 0.7261873483657837, + "learning_rate": 0.0004348721728842715, + "loss": 0.1582, + "mean_token_accuracy": 0.9584025889635086, + "num_tokens": 4249205.0, + "step": 481 + }, + { + "entropy": 2.21835720539093, + "epoch": 1.742081447963801, + "grad_norm": 0.5321667194366455, + "learning_rate": 0.0004345387666645807, + "loss": 0.1344, + "mean_token_accuracy": 0.9659005403518677, + "num_tokens": 4257808.0, + "step": 482 + }, + { + "entropy": 2.078131854534149, + "epoch": 1.7457013574660634, + "grad_norm": 0.5598498582839966, + "learning_rate": 0.00043420477747499307, + "loss": 0.1347, + "mean_token_accuracy": 0.9678008407354355, + "num_tokens": 4266728.0, + "step": 483 + }, + { + "entropy": 2.060504525899887, + "epoch": 1.7493212669683258, + "grad_norm": 0.5017166137695312, + "learning_rate": 0.0004338702065840422, + "loss": 0.0722, + "mean_token_accuracy": 0.9762782007455826, + "num_tokens": 4275514.0, + "step": 484 + }, + { + "entropy": 2.165244698524475, + "epoch": 1.7529411764705882, + "grad_norm": 0.4664002060890198, + "learning_rate": 0.00043353505526247084, + "loss": 0.1206, + "mean_token_accuracy": 0.9696767777204514, + "num_tokens": 4284013.0, + "step": 485 + }, + { + "entropy": 2.103049159049988, + "epoch": 1.7565610859728507, + "grad_norm": 0.6669000387191772, + "learning_rate": 0.0004331993247832265, + "loss": 0.1052, + "mean_token_accuracy": 0.9665459096431732, + "num_tokens": 4293011.0, + "step": 486 + }, + { + "entropy": 2.1286613941192627, + "epoch": 1.760180995475113, + "grad_norm": 0.7821269631385803, + "learning_rate": 0.00043286301642145634, + "loss": 0.3669, + "mean_token_accuracy": 0.9062697291374207, + "num_tokens": 4301965.0, + "step": 487 + }, + { + "entropy": 2.098009169101715, + "epoch": 1.7638009049773755, + "grad_norm": 0.5720731616020203, + "learning_rate": 0.0004325261314545024, + "loss": 0.1324, + "mean_token_accuracy": 0.9650943875312805, + "num_tokens": 4310914.0, + "step": 488 + }, + { + "entropy": 2.164614498615265, + "epoch": 1.767420814479638, + "grad_norm": 1.0500473976135254, + "learning_rate": 0.0004321886711618967, + "loss": 0.1182, + "mean_token_accuracy": 0.9720661342144012, + "num_tokens": 4319072.0, + "step": 489 + }, + { + "entropy": 2.2015402913093567, + "epoch": 1.7710407239819004, + "grad_norm": 0.5770253539085388, + "learning_rate": 0.00043185063682535634, + "loss": 0.1226, + "mean_token_accuracy": 0.9615659862756729, + "num_tokens": 4327539.0, + "step": 490 + }, + { + "entropy": 2.075456440448761, + "epoch": 1.7746606334841628, + "grad_norm": 0.6456925272941589, + "learning_rate": 0.0004315120297287789, + "loss": 0.1123, + "mean_token_accuracy": 0.9628709554672241, + "num_tokens": 4336523.0, + "step": 491 + }, + { + "entropy": 2.158169150352478, + "epoch": 1.7782805429864252, + "grad_norm": 0.8282069563865662, + "learning_rate": 0.00043117285115823733, + "loss": 0.2146, + "mean_token_accuracy": 0.9413971602916718, + "num_tokens": 4345294.0, + "step": 492 + }, + { + "entropy": 2.02735897898674, + "epoch": 1.7819004524886877, + "grad_norm": 0.783597469329834, + "learning_rate": 0.000430833102401975, + "loss": 0.1376, + "mean_token_accuracy": 0.964630737900734, + "num_tokens": 4354107.0, + "step": 493 + }, + { + "entropy": 2.138492166996002, + "epoch": 1.78552036199095, + "grad_norm": 0.6317175030708313, + "learning_rate": 0.000430492784750401, + "loss": 0.1005, + "mean_token_accuracy": 0.9734214246273041, + "num_tokens": 4362560.0, + "step": 494 + }, + { + "entropy": 2.0253217220306396, + "epoch": 1.7891402714932125, + "grad_norm": 0.5523395538330078, + "learning_rate": 0.000430151899496085, + "loss": 0.1633, + "mean_token_accuracy": 0.9558031558990479, + "num_tokens": 4371698.0, + "step": 495 + }, + { + "entropy": 2.160472810268402, + "epoch": 1.792760180995475, + "grad_norm": 0.6557935476303101, + "learning_rate": 0.00042981044793375295, + "loss": 0.1154, + "mean_token_accuracy": 0.9722230583429337, + "num_tokens": 4380612.0, + "step": 496 + }, + { + "entropy": 2.0284159183502197, + "epoch": 1.7963800904977374, + "grad_norm": 0.7357863187789917, + "learning_rate": 0.00042946843136028117, + "loss": 0.1166, + "mean_token_accuracy": 0.9629471153020859, + "num_tokens": 4389521.0, + "step": 497 + }, + { + "entropy": 2.1544791162014008, + "epoch": 1.8, + "grad_norm": 0.5604898929595947, + "learning_rate": 0.00042912585107469226, + "loss": 0.0834, + "mean_token_accuracy": 0.9783036410808563, + "num_tokens": 4398059.0, + "step": 498 + }, + { + "entropy": 2.1051094830036163, + "epoch": 1.8036199095022625, + "grad_norm": 0.4598539173603058, + "learning_rate": 0.0004287827083781497, + "loss": 0.0411, + "mean_token_accuracy": 0.9868490546941757, + "num_tokens": 4406453.0, + "step": 499 + }, + { + "entropy": 2.0219272077083588, + "epoch": 1.807239819004525, + "grad_norm": 0.8164628744125366, + "learning_rate": 0.00042843900457395343, + "loss": 0.1988, + "mean_token_accuracy": 0.9502352625131607, + "num_tokens": 4415440.0, + "step": 500 + }, + { + "entropy": 1.980013906955719, + "epoch": 1.8108597285067873, + "grad_norm": 0.572798490524292, + "learning_rate": 0.0004280947409675341, + "loss": 0.1148, + "mean_token_accuracy": 0.966580331325531, + "num_tokens": 4424532.0, + "step": 501 + }, + { + "entropy": 2.0646563172340393, + "epoch": 1.8144796380090498, + "grad_norm": 0.769386351108551, + "learning_rate": 0.00042774991886644875, + "loss": 0.1592, + "mean_token_accuracy": 0.9553463608026505, + "num_tokens": 4432913.0, + "step": 502 + }, + { + "entropy": 2.040877491235733, + "epoch": 1.8180995475113122, + "grad_norm": 0.7467371821403503, + "learning_rate": 0.0004274045395803758, + "loss": 0.2247, + "mean_token_accuracy": 0.9526964277029037, + "num_tokens": 4441425.0, + "step": 503 + }, + { + "entropy": 1.9934698939323425, + "epoch": 1.8217194570135746, + "grad_norm": 0.6602952480316162, + "learning_rate": 0.00042705860442110964, + "loss": 0.1681, + "mean_token_accuracy": 0.9594631940126419, + "num_tokens": 4450383.0, + "step": 504 + }, + { + "entropy": 2.0858289897441864, + "epoch": 1.825339366515837, + "grad_norm": 0.684380829334259, + "learning_rate": 0.0004267121147025562, + "loss": 0.1154, + "mean_token_accuracy": 0.9638111293315887, + "num_tokens": 4458862.0, + "step": 505 + }, + { + "entropy": 2.0886995792388916, + "epoch": 1.8289592760180997, + "grad_norm": 0.5784837007522583, + "learning_rate": 0.00042636507174072756, + "loss": 0.1026, + "mean_token_accuracy": 0.9676834791898727, + "num_tokens": 4467386.0, + "step": 506 + }, + { + "entropy": 2.0236063301563263, + "epoch": 1.8325791855203621, + "grad_norm": 0.5101180672645569, + "learning_rate": 0.00042601747685373716, + "loss": 0.1031, + "mean_token_accuracy": 0.9734093993902206, + "num_tokens": 4476054.0, + "step": 507 + }, + { + "entropy": 1.9801031053066254, + "epoch": 1.8361990950226246, + "grad_norm": 0.6581607460975647, + "learning_rate": 0.00042566933136179455, + "loss": 0.1548, + "mean_token_accuracy": 0.9581006914377213, + "num_tokens": 4484895.0, + "step": 508 + }, + { + "entropy": 2.0244787633419037, + "epoch": 1.839819004524887, + "grad_norm": 0.8100608587265015, + "learning_rate": 0.0004253206365872008, + "loss": 0.196, + "mean_token_accuracy": 0.9532899260520935, + "num_tokens": 4493737.0, + "step": 509 + }, + { + "entropy": 1.9108119010925293, + "epoch": 1.8434389140271494, + "grad_norm": 0.4903942048549652, + "learning_rate": 0.00042497139385434314, + "loss": 0.1313, + "mean_token_accuracy": 0.9667337089776993, + "num_tokens": 4502840.0, + "step": 510 + }, + { + "entropy": 2.009468197822571, + "epoch": 1.8470588235294119, + "grad_norm": 0.6010113954544067, + "learning_rate": 0.0004246216044896897, + "loss": 0.1013, + "mean_token_accuracy": 0.9692314714193344, + "num_tokens": 4511407.0, + "step": 511 + }, + { + "entropy": 2.0337170362472534, + "epoch": 1.8506787330316743, + "grad_norm": 0.7906802892684937, + "learning_rate": 0.00042427126982178546, + "loss": 0.1682, + "mean_token_accuracy": 0.9550099819898605, + "num_tokens": 4520018.0, + "step": 512 + }, + { + "entropy": 1.8813888728618622, + "epoch": 1.8542986425339367, + "grad_norm": 0.5353080034255981, + "learning_rate": 0.00042392039118124586, + "loss": 0.1228, + "mean_token_accuracy": 0.9624074995517731, + "num_tokens": 4529270.0, + "step": 513 + }, + { + "entropy": 2.012698233127594, + "epoch": 1.8579185520361992, + "grad_norm": 0.6713843941688538, + "learning_rate": 0.00042356896990075285, + "loss": 0.2225, + "mean_token_accuracy": 0.9417333751916885, + "num_tokens": 4538008.0, + "step": 514 + }, + { + "entropy": 1.880586564540863, + "epoch": 1.8615384615384616, + "grad_norm": 0.5821724534034729, + "learning_rate": 0.00042321700731504916, + "loss": 0.1144, + "mean_token_accuracy": 0.9677341282367706, + "num_tokens": 4546950.0, + "step": 515 + }, + { + "entropy": 2.0066279470920563, + "epoch": 1.865158371040724, + "grad_norm": 0.4095056354999542, + "learning_rate": 0.0004228645047609335, + "loss": 0.0424, + "mean_token_accuracy": 0.9854962974786758, + "num_tokens": 4555452.0, + "step": 516 + }, + { + "entropy": 2.042815536260605, + "epoch": 1.8687782805429864, + "grad_norm": 0.5398769974708557, + "learning_rate": 0.0004225114635772555, + "loss": 0.1343, + "mean_token_accuracy": 0.9615450948476791, + "num_tokens": 4564386.0, + "step": 517 + }, + { + "entropy": 2.0948933362960815, + "epoch": 1.8723981900452489, + "grad_norm": 0.6738974452018738, + "learning_rate": 0.0004221578851049107, + "loss": 0.1541, + "mean_token_accuracy": 0.9526563137769699, + "num_tokens": 4573041.0, + "step": 518 + }, + { + "entropy": 2.102545380592346, + "epoch": 1.8760180995475113, + "grad_norm": 0.7769943475723267, + "learning_rate": 0.00042180377068683504, + "loss": 0.2362, + "mean_token_accuracy": 0.9472651779651642, + "num_tokens": 4581666.0, + "step": 519 + }, + { + "entropy": 2.087820291519165, + "epoch": 1.8796380090497737, + "grad_norm": 0.5722424983978271, + "learning_rate": 0.0004214491216680004, + "loss": 0.1657, + "mean_token_accuracy": 0.9537082612514496, + "num_tokens": 4590238.0, + "step": 520 + }, + { + "entropy": 2.0093430876731873, + "epoch": 1.8832579185520362, + "grad_norm": 0.5844932198524475, + "learning_rate": 0.00042109393939540867, + "loss": 0.1485, + "mean_token_accuracy": 0.9624215811491013, + "num_tokens": 4599352.0, + "step": 521 + }, + { + "entropy": 1.9117147326469421, + "epoch": 1.8868778280542986, + "grad_norm": 0.46085676550865173, + "learning_rate": 0.0004207382252180876, + "loss": 0.0853, + "mean_token_accuracy": 0.9769327491521835, + "num_tokens": 4608571.0, + "step": 522 + }, + { + "entropy": 2.0205602943897247, + "epoch": 1.890497737556561, + "grad_norm": 0.5571608543395996, + "learning_rate": 0.000420381980487085, + "loss": 0.1517, + "mean_token_accuracy": 0.9646699875593185, + "num_tokens": 4617445.0, + "step": 523 + }, + { + "entropy": 1.9571953415870667, + "epoch": 1.8941176470588235, + "grad_norm": 0.470630943775177, + "learning_rate": 0.0004200252065554636, + "loss": 0.1005, + "mean_token_accuracy": 0.9750025719404221, + "num_tokens": 4626756.0, + "step": 524 + }, + { + "entropy": 2.063209116458893, + "epoch": 1.897737556561086, + "grad_norm": 0.6447069644927979, + "learning_rate": 0.00041966790477829637, + "loss": 0.113, + "mean_token_accuracy": 0.9695079624652863, + "num_tokens": 4635378.0, + "step": 525 + }, + { + "entropy": 1.9232109785079956, + "epoch": 1.9013574660633483, + "grad_norm": 0.5114295482635498, + "learning_rate": 0.000419310076512661, + "loss": 0.1492, + "mean_token_accuracy": 0.9653338938951492, + "num_tokens": 4644769.0, + "step": 526 + }, + { + "entropy": 2.1691197752952576, + "epoch": 1.9049773755656108, + "grad_norm": 0.7630137205123901, + "learning_rate": 0.00041895172311763476, + "loss": 0.212, + "mean_token_accuracy": 0.9533941894769669, + "num_tokens": 4652857.0, + "step": 527 + }, + { + "entropy": 2.04753240942955, + "epoch": 1.9085972850678732, + "grad_norm": 0.6423042416572571, + "learning_rate": 0.00041859284595428955, + "loss": 0.1455, + "mean_token_accuracy": 0.956505224108696, + "num_tokens": 4661591.0, + "step": 528 + }, + { + "entropy": 1.9440338611602783, + "epoch": 1.9122171945701356, + "grad_norm": 0.5011327266693115, + "learning_rate": 0.00041823344638568656, + "loss": 0.1255, + "mean_token_accuracy": 0.965131089091301, + "num_tokens": 4670594.0, + "step": 529 + }, + { + "entropy": 2.0554805397987366, + "epoch": 1.915837104072398, + "grad_norm": 0.5821590423583984, + "learning_rate": 0.0004178735257768713, + "loss": 0.0486, + "mean_token_accuracy": 0.9875282496213913, + "num_tokens": 4679344.0, + "step": 530 + }, + { + "entropy": 2.130349576473236, + "epoch": 1.9194570135746605, + "grad_norm": 0.5332052111625671, + "learning_rate": 0.0004175130854948679, + "loss": 0.0915, + "mean_token_accuracy": 0.9737034440040588, + "num_tokens": 4687922.0, + "step": 531 + }, + { + "entropy": 2.146788775920868, + "epoch": 1.9230769230769231, + "grad_norm": 0.5016877055168152, + "learning_rate": 0.00041715212690867455, + "loss": 0.1281, + "mean_token_accuracy": 0.9681432545185089, + "num_tokens": 4696593.0, + "step": 532 + }, + { + "entropy": 2.041268438100815, + "epoch": 1.9266968325791856, + "grad_norm": 0.5257729887962341, + "learning_rate": 0.00041679065138925807, + "loss": 0.1272, + "mean_token_accuracy": 0.9649266451597214, + "num_tokens": 4705792.0, + "step": 533 + }, + { + "entropy": 2.114819645881653, + "epoch": 1.930316742081448, + "grad_norm": 0.7085135579109192, + "learning_rate": 0.0004164286603095484, + "loss": 0.1545, + "mean_token_accuracy": 0.9581228941679001, + "num_tokens": 4714599.0, + "step": 534 + }, + { + "entropy": 2.022280514240265, + "epoch": 1.9339366515837104, + "grad_norm": 0.5309014320373535, + "learning_rate": 0.00041606615504443387, + "loss": 0.1933, + "mean_token_accuracy": 0.9562340676784515, + "num_tokens": 4724062.0, + "step": 535 + }, + { + "entropy": 2.0959260165691376, + "epoch": 1.9375565610859729, + "grad_norm": 0.6528061628341675, + "learning_rate": 0.0004157031369707557, + "loss": 0.1306, + "mean_token_accuracy": 0.9612343460321426, + "num_tokens": 4733077.0, + "step": 536 + }, + { + "entropy": 2.2772948145866394, + "epoch": 1.9411764705882353, + "grad_norm": 0.7351471185684204, + "learning_rate": 0.0004153396074673028, + "loss": 0.1494, + "mean_token_accuracy": 0.9608108699321747, + "num_tokens": 4741201.0, + "step": 537 + }, + { + "entropy": 2.0935052037239075, + "epoch": 1.9447963800904977, + "grad_norm": 0.5435840487480164, + "learning_rate": 0.0004149755679148065, + "loss": 0.0884, + "mean_token_accuracy": 0.9745689779520035, + "num_tokens": 4750306.0, + "step": 538 + }, + { + "entropy": 2.2082818746566772, + "epoch": 1.9484162895927601, + "grad_norm": 0.3780331611633301, + "learning_rate": 0.00041461101969593537, + "loss": 0.0739, + "mean_token_accuracy": 0.9777179658412933, + "num_tokens": 4758954.0, + "step": 539 + }, + { + "entropy": 2.1683040261268616, + "epoch": 1.9520361990950226, + "grad_norm": 0.4637961685657501, + "learning_rate": 0.00041424596419529017, + "loss": 0.0632, + "mean_token_accuracy": 0.9834533184766769, + "num_tokens": 4767615.0, + "step": 540 + }, + { + "entropy": 2.075555235147476, + "epoch": 1.9556561085972852, + "grad_norm": 0.7603118419647217, + "learning_rate": 0.00041388040279939804, + "loss": 0.2835, + "mean_token_accuracy": 0.9364205300807953, + "num_tokens": 4776714.0, + "step": 541 + }, + { + "entropy": 2.18926739692688, + "epoch": 1.9592760180995477, + "grad_norm": 0.8895708918571472, + "learning_rate": 0.0004135143368967079, + "loss": 0.2514, + "mean_token_accuracy": 0.9361050724983215, + "num_tokens": 4785402.0, + "step": 542 + }, + { + "entropy": 2.2387169003486633, + "epoch": 1.96289592760181, + "grad_norm": 0.6013544797897339, + "learning_rate": 0.00041314776787758454, + "loss": 0.1502, + "mean_token_accuracy": 0.9594238847494125, + "num_tokens": 4793928.0, + "step": 543 + }, + { + "entropy": 2.208383619785309, + "epoch": 1.9665158371040725, + "grad_norm": 0.6934756636619568, + "learning_rate": 0.00041278069713430386, + "loss": 0.1777, + "mean_token_accuracy": 0.9619583487510681, + "num_tokens": 4802612.0, + "step": 544 + }, + { + "entropy": 2.2621757984161377, + "epoch": 1.970135746606335, + "grad_norm": 0.6920077800750732, + "learning_rate": 0.00041241312606104743, + "loss": 0.1689, + "mean_token_accuracy": 0.9594835937023163, + "num_tokens": 4811332.0, + "step": 545 + }, + { + "entropy": 2.2654454112052917, + "epoch": 1.9737556561085974, + "grad_norm": 0.6259592771530151, + "learning_rate": 0.000412045056053897, + "loss": 0.142, + "mean_token_accuracy": 0.9648078680038452, + "num_tokens": 4820441.0, + "step": 546 + }, + { + "entropy": 2.218056857585907, + "epoch": 1.9773755656108598, + "grad_norm": 0.5390617847442627, + "learning_rate": 0.0004116764885108292, + "loss": 0.1737, + "mean_token_accuracy": 0.9595656991004944, + "num_tokens": 4829437.0, + "step": 547 + }, + { + "entropy": 2.2571592330932617, + "epoch": 1.9809954751131222, + "grad_norm": 0.3656528890132904, + "learning_rate": 0.0004113074248317108, + "loss": 0.0545, + "mean_token_accuracy": 0.9825418293476105, + "num_tokens": 4838118.0, + "step": 548 + }, + { + "entropy": 2.1890549659729004, + "epoch": 1.9846153846153847, + "grad_norm": 0.5716155767440796, + "learning_rate": 0.00041093786641829247, + "loss": 0.0997, + "mean_token_accuracy": 0.9715700745582581, + "num_tokens": 4847073.0, + "step": 549 + }, + { + "entropy": 2.2726192474365234, + "epoch": 1.988235294117647, + "grad_norm": 0.4709530770778656, + "learning_rate": 0.0004105678146742042, + "loss": 0.0746, + "mean_token_accuracy": 0.9799739569425583, + "num_tokens": 4855755.0, + "step": 550 + }, + { + "entropy": 2.2328362464904785, + "epoch": 1.9918552036199095, + "grad_norm": 0.6773779392242432, + "learning_rate": 0.0004101972710049498, + "loss": 0.1418, + "mean_token_accuracy": 0.9629421681165695, + "num_tokens": 4864601.0, + "step": 551 + }, + { + "entropy": 2.199812740087509, + "epoch": 1.995475113122172, + "grad_norm": 0.717012882232666, + "learning_rate": 0.00040982623681790113, + "loss": 0.2948, + "mean_token_accuracy": 0.9432803690433502, + "num_tokens": 4873630.0, + "step": 552 + }, + { + "entropy": 2.2102787494659424, + "epoch": 1.9990950226244344, + "grad_norm": 0.6925314664840698, + "learning_rate": 0.00040945471352229346, + "loss": 0.2579, + "mean_token_accuracy": 0.9435124397277832, + "num_tokens": 4882714.0, + "step": 553 + }, + { + "entropy": 2.3318979740142822, + "epoch": 2.0, + "grad_norm": 2.688188314437866, + "learning_rate": 0.0004090827025292197, + "loss": 0.0283, + "mean_token_accuracy": 0.9918032884597778, + "num_tokens": 4883450.0, + "step": 554 + }, + { + "epoch": 2.0, + "eval_entropy": 2.2165925522160723, + "eval_loss": 0.16817161440849304, + "eval_mean_token_accuracy": 0.9567220133494555, + "eval_num_tokens": 4883450.0, + "eval_runtime": 116.1556, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 1.059, + "step": 554 + }, + { + "entropy": 2.0389976799488068, + "epoch": 2.0036199095022624, + "grad_norm": 0.8596204519271851, + "learning_rate": 0.00040871020525162484, + "loss": 0.1341, + "mean_token_accuracy": 0.9626202881336212, + "num_tokens": 4893236.0, + "step": 555 + }, + { + "entropy": 2.245832860469818, + "epoch": 2.007239819004525, + "grad_norm": 0.39707237482070923, + "learning_rate": 0.00040833722310430114, + "loss": 0.0564, + "mean_token_accuracy": 0.9868980199098587, + "num_tokens": 4901819.0, + "step": 556 + }, + { + "entropy": 2.169717162847519, + "epoch": 2.0108597285067873, + "grad_norm": 0.46584129333496094, + "learning_rate": 0.0004079637575038822, + "loss": 0.0792, + "mean_token_accuracy": 0.9758767485618591, + "num_tokens": 4910892.0, + "step": 557 + }, + { + "entropy": 2.27083820104599, + "epoch": 2.0144796380090497, + "grad_norm": 0.8394352197647095, + "learning_rate": 0.0004075898098688381, + "loss": 0.0962, + "mean_token_accuracy": 0.9723308384418488, + "num_tokens": 4919510.0, + "step": 558 + }, + { + "entropy": 2.1067663431167603, + "epoch": 2.018099547511312, + "grad_norm": 0.4951268434524536, + "learning_rate": 0.0004072153816194696, + "loss": 0.1195, + "mean_token_accuracy": 0.9703402817249298, + "num_tokens": 4928439.0, + "step": 559 + }, + { + "entropy": 2.016420066356659, + "epoch": 2.0217194570135746, + "grad_norm": 0.5574740171432495, + "learning_rate": 0.00040684047417790273, + "loss": 0.1037, + "mean_token_accuracy": 0.9727325141429901, + "num_tokens": 4938061.0, + "step": 560 + }, + { + "entropy": 2.1843727231025696, + "epoch": 2.025339366515837, + "grad_norm": 0.786014199256897, + "learning_rate": 0.00040646508896808394, + "loss": 0.155, + "mean_token_accuracy": 0.9608975350856781, + "num_tokens": 4946619.0, + "step": 561 + }, + { + "entropy": 2.160427451133728, + "epoch": 2.0289592760180994, + "grad_norm": 0.5267161130905151, + "learning_rate": 0.000406089227415774, + "loss": 0.0632, + "mean_token_accuracy": 0.9791042655706406, + "num_tokens": 4955324.0, + "step": 562 + }, + { + "entropy": 2.0923200249671936, + "epoch": 2.032579185520362, + "grad_norm": 0.8306187987327576, + "learning_rate": 0.00040571289094854304, + "loss": 0.1976, + "mean_token_accuracy": 0.9538775235414505, + "num_tokens": 4964321.0, + "step": 563 + }, + { + "entropy": 2.0181354880332947, + "epoch": 2.0361990950226243, + "grad_norm": 0.6798867583274841, + "learning_rate": 0.0004053360809957649, + "loss": 0.1797, + "mean_token_accuracy": 0.9569422006607056, + "num_tokens": 4973937.0, + "step": 564 + }, + { + "entropy": 2.123030036687851, + "epoch": 2.0398190045248867, + "grad_norm": 0.4481683671474457, + "learning_rate": 0.00040495879898861173, + "loss": 0.0639, + "mean_token_accuracy": 0.9827965050935745, + "num_tokens": 4982779.0, + "step": 565 + }, + { + "entropy": 2.0797010362148285, + "epoch": 2.043438914027149, + "grad_norm": 0.7745859622955322, + "learning_rate": 0.00040458104636004877, + "loss": 0.1602, + "mean_token_accuracy": 0.9600242227315903, + "num_tokens": 4991793.0, + "step": 566 + }, + { + "entropy": 2.0320390164852142, + "epoch": 2.0470588235294116, + "grad_norm": 0.5792120695114136, + "learning_rate": 0.0004042028245448286, + "loss": 0.0816, + "mean_token_accuracy": 0.9757721722126007, + "num_tokens": 5000834.0, + "step": 567 + }, + { + "entropy": 2.1047743558883667, + "epoch": 2.050678733031674, + "grad_norm": 0.5770072937011719, + "learning_rate": 0.0004038241349794858, + "loss": 0.1367, + "mean_token_accuracy": 0.9598450362682343, + "num_tokens": 5010155.0, + "step": 568 + }, + { + "entropy": 2.022550255060196, + "epoch": 2.0542986425339365, + "grad_norm": 0.47085902094841003, + "learning_rate": 0.0004034449791023319, + "loss": 0.1005, + "mean_token_accuracy": 0.970214769244194, + "num_tokens": 5020010.0, + "step": 569 + }, + { + "entropy": 2.034317582845688, + "epoch": 2.057918552036199, + "grad_norm": 0.4816018044948578, + "learning_rate": 0.0004030653583534489, + "loss": 0.118, + "mean_token_accuracy": 0.9635649025440216, + "num_tokens": 5029205.0, + "step": 570 + }, + { + "entropy": 2.1142700910568237, + "epoch": 2.0615384615384613, + "grad_norm": 0.561765730381012, + "learning_rate": 0.0004026852741746849, + "loss": 0.0628, + "mean_token_accuracy": 0.9811093211174011, + "num_tokens": 5037830.0, + "step": 571 + }, + { + "entropy": 2.1506906747817993, + "epoch": 2.065158371040724, + "grad_norm": 0.9037840366363525, + "learning_rate": 0.0004023047280096482, + "loss": 0.1395, + "mean_token_accuracy": 0.9645196944475174, + "num_tokens": 5046618.0, + "step": 572 + }, + { + "entropy": 2.1811060309410095, + "epoch": 2.0687782805429866, + "grad_norm": 0.6224188208580017, + "learning_rate": 0.0004019237213037014, + "loss": 0.0766, + "mean_token_accuracy": 0.9752616137266159, + "num_tokens": 5055467.0, + "step": 573 + }, + { + "entropy": 2.0479070246219635, + "epoch": 2.072398190045249, + "grad_norm": 0.5052458643913269, + "learning_rate": 0.00040154225550395665, + "loss": 0.091, + "mean_token_accuracy": 0.9753529280424118, + "num_tokens": 5064518.0, + "step": 574 + }, + { + "entropy": 2.18623149394989, + "epoch": 2.0760180995475115, + "grad_norm": 0.49587905406951904, + "learning_rate": 0.00040116033205926964, + "loss": 0.0703, + "mean_token_accuracy": 0.979348823428154, + "num_tokens": 5072713.0, + "step": 575 + }, + { + "entropy": 2.131018817424774, + "epoch": 2.079638009049774, + "grad_norm": 0.607468843460083, + "learning_rate": 0.0004007779524202343, + "loss": 0.0988, + "mean_token_accuracy": 0.9756181836128235, + "num_tokens": 5081046.0, + "step": 576 + }, + { + "entropy": 2.0251292288303375, + "epoch": 2.0832579185520363, + "grad_norm": 0.867511510848999, + "learning_rate": 0.00040039511803917723, + "loss": 0.1672, + "mean_token_accuracy": 0.9638413190841675, + "num_tokens": 5089859.0, + "step": 577 + }, + { + "entropy": 2.0818732380867004, + "epoch": 2.086877828054299, + "grad_norm": 0.5915331840515137, + "learning_rate": 0.0004000118303701521, + "loss": 0.1103, + "mean_token_accuracy": 0.9715124219655991, + "num_tokens": 5098331.0, + "step": 578 + }, + { + "entropy": 1.9556698501110077, + "epoch": 2.090497737556561, + "grad_norm": 0.5216535329818726, + "learning_rate": 0.0003996280908689345, + "loss": 0.1481, + "mean_token_accuracy": 0.9601311087608337, + "num_tokens": 5107557.0, + "step": 579 + }, + { + "entropy": 2.015773117542267, + "epoch": 2.0941176470588236, + "grad_norm": 0.7138916254043579, + "learning_rate": 0.00039924390099301584, + "loss": 0.1173, + "mean_token_accuracy": 0.9670253992080688, + "num_tokens": 5116677.0, + "step": 580 + }, + { + "entropy": 2.0676984786987305, + "epoch": 2.097737556561086, + "grad_norm": 0.7776201963424683, + "learning_rate": 0.0003988592622015984, + "loss": 0.0668, + "mean_token_accuracy": 0.9766870141029358, + "num_tokens": 5125262.0, + "step": 581 + }, + { + "entropy": 2.0256679952144623, + "epoch": 2.1013574660633485, + "grad_norm": 0.5481430888175964, + "learning_rate": 0.00039847417595558903, + "loss": 0.0898, + "mean_token_accuracy": 0.9747780114412308, + "num_tokens": 5133848.0, + "step": 582 + }, + { + "entropy": 2.049301326274872, + "epoch": 2.104977375565611, + "grad_norm": 0.6634963154792786, + "learning_rate": 0.00039808864371759464, + "loss": 0.1012, + "mean_token_accuracy": 0.9695883542299271, + "num_tokens": 5142266.0, + "step": 583 + }, + { + "entropy": 1.8873322904109955, + "epoch": 2.1085972850678734, + "grad_norm": 0.6262965798377991, + "learning_rate": 0.0003977026669519156, + "loss": 0.1064, + "mean_token_accuracy": 0.9686857610940933, + "num_tokens": 5151297.0, + "step": 584 + }, + { + "entropy": 2.0208800733089447, + "epoch": 2.112217194570136, + "grad_norm": 0.6475429534912109, + "learning_rate": 0.0003973162471245411, + "loss": 0.126, + "mean_token_accuracy": 0.9671273976564407, + "num_tokens": 5159913.0, + "step": 585 + }, + { + "entropy": 2.0354510843753815, + "epoch": 2.1158371040723982, + "grad_norm": 0.6373077034950256, + "learning_rate": 0.0003969293857031426, + "loss": 0.1403, + "mean_token_accuracy": 0.9615094214677811, + "num_tokens": 5168392.0, + "step": 586 + }, + { + "entropy": 2.0489701330661774, + "epoch": 2.1194570135746607, + "grad_norm": 0.7459731698036194, + "learning_rate": 0.0003965420841570693, + "loss": 0.0847, + "mean_token_accuracy": 0.9742033332586288, + "num_tokens": 5176858.0, + "step": 587 + }, + { + "entropy": 2.0531455874443054, + "epoch": 2.123076923076923, + "grad_norm": 0.8357418179512024, + "learning_rate": 0.00039615434395734174, + "loss": 0.2558, + "mean_token_accuracy": 0.9348864704370499, + "num_tokens": 5185101.0, + "step": 588 + }, + { + "entropy": 1.9761857986450195, + "epoch": 2.1266968325791855, + "grad_norm": 0.4816463887691498, + "learning_rate": 0.00039576616657664666, + "loss": 0.0934, + "mean_token_accuracy": 0.9781179577112198, + "num_tokens": 5193987.0, + "step": 589 + }, + { + "entropy": 2.0150316655635834, + "epoch": 2.130316742081448, + "grad_norm": 0.7039950489997864, + "learning_rate": 0.0003953775534893311, + "loss": 0.1558, + "mean_token_accuracy": 0.9602096229791641, + "num_tokens": 5202598.0, + "step": 590 + }, + { + "entropy": 2.0542426705360413, + "epoch": 2.1339366515837104, + "grad_norm": 0.6318346858024597, + "learning_rate": 0.00039498850617139737, + "loss": 0.1277, + "mean_token_accuracy": 0.9658758789300919, + "num_tokens": 5211157.0, + "step": 591 + }, + { + "entropy": 2.0793416798114777, + "epoch": 2.137556561085973, + "grad_norm": 0.6513328552246094, + "learning_rate": 0.0003945990261004964, + "loss": 0.3452, + "mean_token_accuracy": 0.9376382231712341, + "num_tokens": 5220057.0, + "step": 592 + }, + { + "entropy": 1.834738850593567, + "epoch": 2.1411764705882352, + "grad_norm": 0.709550678730011, + "learning_rate": 0.0003942091147559234, + "loss": 0.1632, + "mean_token_accuracy": 0.9588025957345963, + "num_tokens": 5229649.0, + "step": 593 + }, + { + "entropy": 2.115740954875946, + "epoch": 2.1447963800904977, + "grad_norm": 0.6495632529258728, + "learning_rate": 0.00039381877361861127, + "loss": 0.0799, + "mean_token_accuracy": 0.9793208837509155, + "num_tokens": 5238060.0, + "step": 594 + }, + { + "entropy": 1.9325994551181793, + "epoch": 2.14841628959276, + "grad_norm": 0.3864371180534363, + "learning_rate": 0.0003934280041711253, + "loss": 0.0392, + "mean_token_accuracy": 0.9867032468318939, + "num_tokens": 5246715.0, + "step": 595 + }, + { + "entropy": 1.9573578834533691, + "epoch": 2.1520361990950225, + "grad_norm": 0.8978553414344788, + "learning_rate": 0.0003930368078976578, + "loss": 0.1043, + "mean_token_accuracy": 0.9700421690940857, + "num_tokens": 5255677.0, + "step": 596 + }, + { + "entropy": 2.017194092273712, + "epoch": 2.155656108597285, + "grad_norm": 0.8082290887832642, + "learning_rate": 0.0003926451862840221, + "loss": 0.193, + "mean_token_accuracy": 0.9494165182113647, + "num_tokens": 5264229.0, + "step": 597 + }, + { + "entropy": 1.8982190787792206, + "epoch": 2.1592760180995474, + "grad_norm": 0.7600063681602478, + "learning_rate": 0.00039225314081764673, + "loss": 0.2152, + "mean_token_accuracy": 0.9523166120052338, + "num_tokens": 5273397.0, + "step": 598 + }, + { + "entropy": 1.9896901845932007, + "epoch": 2.16289592760181, + "grad_norm": 0.45877528190612793, + "learning_rate": 0.0003918606729875706, + "loss": 0.0892, + "mean_token_accuracy": 0.9720247238874435, + "num_tokens": 5282376.0, + "step": 599 + }, + { + "entropy": 1.8235589861869812, + "epoch": 2.1665158371040723, + "grad_norm": 0.49329352378845215, + "learning_rate": 0.0003914677842844365, + "loss": 0.0803, + "mean_token_accuracy": 0.9721037000417709, + "num_tokens": 5291815.0, + "step": 600 + }, + { + "entropy": 1.9400377571582794, + "epoch": 2.1701357466063347, + "grad_norm": 0.5306346416473389, + "learning_rate": 0.0003910744762004857, + "loss": 0.0602, + "mean_token_accuracy": 0.9762802571058273, + "num_tokens": 5300394.0, + "step": 601 + }, + { + "entropy": 1.7808023691177368, + "epoch": 2.173755656108597, + "grad_norm": 0.5050559043884277, + "learning_rate": 0.00039068075022955255, + "loss": 0.0862, + "mean_token_accuracy": 0.9724314510822296, + "num_tokens": 5309685.0, + "step": 602 + }, + { + "entropy": 1.9939678311347961, + "epoch": 2.1773755656108595, + "grad_norm": 0.6879346966743469, + "learning_rate": 0.0003902866078670584, + "loss": 0.0936, + "mean_token_accuracy": 0.9765703976154327, + "num_tokens": 5318020.0, + "step": 603 + }, + { + "entropy": 1.9384137690067291, + "epoch": 2.180995475113122, + "grad_norm": 0.6881359219551086, + "learning_rate": 0.0003898920506100061, + "loss": 0.1303, + "mean_token_accuracy": 0.9615567773580551, + "num_tokens": 5326895.0, + "step": 604 + }, + { + "entropy": 1.9919665455818176, + "epoch": 2.184615384615385, + "grad_norm": 0.6181508302688599, + "learning_rate": 0.00038949707995697446, + "loss": 0.0745, + "mean_token_accuracy": 0.9808734804391861, + "num_tokens": 5335355.0, + "step": 605 + }, + { + "entropy": 1.9376583397388458, + "epoch": 2.1882352941176473, + "grad_norm": 0.46525871753692627, + "learning_rate": 0.0003891016974081125, + "loss": 0.0826, + "mean_token_accuracy": 0.9753947854042053, + "num_tokens": 5343879.0, + "step": 606 + }, + { + "entropy": 1.8252979516983032, + "epoch": 2.1918552036199097, + "grad_norm": 0.5332593321800232, + "learning_rate": 0.00038870590446513325, + "loss": 0.1218, + "mean_token_accuracy": 0.9644111543893814, + "num_tokens": 5352980.0, + "step": 607 + }, + { + "entropy": 1.8981524407863617, + "epoch": 2.195475113122172, + "grad_norm": 0.5849556922912598, + "learning_rate": 0.0003883097026313089, + "loss": 0.0854, + "mean_token_accuracy": 0.9766328930854797, + "num_tokens": 5361576.0, + "step": 608 + }, + { + "entropy": 1.9466857016086578, + "epoch": 2.1990950226244346, + "grad_norm": 1.0213185548782349, + "learning_rate": 0.00038791309341146453, + "loss": 0.1282, + "mean_token_accuracy": 0.975858062505722, + "num_tokens": 5369947.0, + "step": 609 + }, + { + "entropy": 1.9219308197498322, + "epoch": 2.202714932126697, + "grad_norm": 0.7259594798088074, + "learning_rate": 0.00038751607831197243, + "loss": 0.0986, + "mean_token_accuracy": 0.9709735363721848, + "num_tokens": 5378429.0, + "step": 610 + }, + { + "entropy": 1.934881567955017, + "epoch": 2.2063348416289594, + "grad_norm": 0.6190217137336731, + "learning_rate": 0.0003871186588407467, + "loss": 0.1259, + "mean_token_accuracy": 0.9606761038303375, + "num_tokens": 5386986.0, + "step": 611 + }, + { + "entropy": 1.9234256446361542, + "epoch": 2.209954751131222, + "grad_norm": 1.1731759309768677, + "learning_rate": 0.00038672083650723697, + "loss": 0.3705, + "mean_token_accuracy": 0.9448409974575043, + "num_tokens": 5395623.0, + "step": 612 + }, + { + "entropy": 1.9198957085609436, + "epoch": 2.2135746606334843, + "grad_norm": 0.38831791281700134, + "learning_rate": 0.00038632261282242316, + "loss": 0.0405, + "mean_token_accuracy": 0.9884084165096283, + "num_tokens": 5403964.0, + "step": 613 + }, + { + "entropy": 1.9401849210262299, + "epoch": 2.2171945701357467, + "grad_norm": 0.6391944885253906, + "learning_rate": 0.0003859239892988097, + "loss": 0.0803, + "mean_token_accuracy": 0.9763080179691315, + "num_tokens": 5412601.0, + "step": 614 + }, + { + "entropy": 1.906328171491623, + "epoch": 2.220814479638009, + "grad_norm": 0.5495765805244446, + "learning_rate": 0.00038552496745041935, + "loss": 0.0919, + "mean_token_accuracy": 0.9796502739191055, + "num_tokens": 5421112.0, + "step": 615 + }, + { + "entropy": 1.9130763709545135, + "epoch": 2.2244343891402716, + "grad_norm": 0.8233397006988525, + "learning_rate": 0.0003851255487927883, + "loss": 0.1246, + "mean_token_accuracy": 0.9621723592281342, + "num_tokens": 5429851.0, + "step": 616 + }, + { + "entropy": 1.8408336341381073, + "epoch": 2.228054298642534, + "grad_norm": 0.8857082724571228, + "learning_rate": 0.00038472573484295904, + "loss": 0.1061, + "mean_token_accuracy": 0.9664444029331207, + "num_tokens": 5438983.0, + "step": 617 + }, + { + "entropy": 1.8644142150878906, + "epoch": 2.2316742081447964, + "grad_norm": 0.6762974262237549, + "learning_rate": 0.0003843255271194762, + "loss": 0.1532, + "mean_token_accuracy": 0.952915757894516, + "num_tokens": 5447922.0, + "step": 618 + }, + { + "entropy": 1.7125722169876099, + "epoch": 2.235294117647059, + "grad_norm": 0.44111478328704834, + "learning_rate": 0.00038392492714237975, + "loss": 0.0819, + "mean_token_accuracy": 0.9738304615020752, + "num_tokens": 5457128.0, + "step": 619 + }, + { + "entropy": 1.7900195717811584, + "epoch": 2.2389140271493213, + "grad_norm": 0.5224407911300659, + "learning_rate": 0.0003835239364331993, + "loss": 0.1023, + "mean_token_accuracy": 0.975239485502243, + "num_tokens": 5465760.0, + "step": 620 + }, + { + "entropy": 1.715638667345047, + "epoch": 2.2425339366515837, + "grad_norm": 0.6327251195907593, + "learning_rate": 0.00038312255651494866, + "loss": 0.154, + "mean_token_accuracy": 0.9579339027404785, + "num_tokens": 5475190.0, + "step": 621 + }, + { + "entropy": 1.8499042093753815, + "epoch": 2.246153846153846, + "grad_norm": 0.6490166187286377, + "learning_rate": 0.00038272078891212017, + "loss": 0.1248, + "mean_token_accuracy": 0.9679877310991287, + "num_tokens": 5484011.0, + "step": 622 + }, + { + "entropy": 1.7533331513404846, + "epoch": 2.2497737556561086, + "grad_norm": 0.6320033073425293, + "learning_rate": 0.000382318635150678, + "loss": 0.1588, + "mean_token_accuracy": 0.9576389044523239, + "num_tokens": 5493123.0, + "step": 623 + }, + { + "entropy": 1.8554400503635406, + "epoch": 2.253393665158371, + "grad_norm": 0.7169481515884399, + "learning_rate": 0.0003819160967580536, + "loss": 0.1316, + "mean_token_accuracy": 0.966967299580574, + "num_tokens": 5501923.0, + "step": 624 + }, + { + "entropy": 1.9283805191516876, + "epoch": 2.2570135746606335, + "grad_norm": 0.599856436252594, + "learning_rate": 0.00038151317526313917, + "loss": 0.1326, + "mean_token_accuracy": 0.961080014705658, + "num_tokens": 5510356.0, + "step": 625 + }, + { + "entropy": 1.7921342253684998, + "epoch": 2.260633484162896, + "grad_norm": 0.7019768357276917, + "learning_rate": 0.0003811098721962818, + "loss": 0.0976, + "mean_token_accuracy": 0.970125287771225, + "num_tokens": 5519016.0, + "step": 626 + }, + { + "entropy": 1.7646876573562622, + "epoch": 2.2642533936651583, + "grad_norm": 0.7311795949935913, + "learning_rate": 0.00038070618908927784, + "loss": 0.0908, + "mean_token_accuracy": 0.9719386845827103, + "num_tokens": 5528139.0, + "step": 627 + }, + { + "entropy": 1.8233769237995148, + "epoch": 2.2678733031674208, + "grad_norm": 0.6742154955863953, + "learning_rate": 0.0003803021274753674, + "loss": 0.1348, + "mean_token_accuracy": 0.9619691967964172, + "num_tokens": 5537036.0, + "step": 628 + }, + { + "entropy": 1.7711736857891083, + "epoch": 2.271493212669683, + "grad_norm": 0.6000869274139404, + "learning_rate": 0.00037989768888922775, + "loss": 0.1086, + "mean_token_accuracy": 0.9672373533248901, + "num_tokens": 5545932.0, + "step": 629 + }, + { + "entropy": 1.8396382629871368, + "epoch": 2.2751131221719456, + "grad_norm": 0.541504979133606, + "learning_rate": 0.0003794928748669683, + "loss": 0.0775, + "mean_token_accuracy": 0.977355495095253, + "num_tokens": 5554403.0, + "step": 630 + }, + { + "entropy": 1.890054315328598, + "epoch": 2.278733031674208, + "grad_norm": 0.5629594326019287, + "learning_rate": 0.00037908768694612434, + "loss": 0.0711, + "mean_token_accuracy": 0.9779117107391357, + "num_tokens": 5563156.0, + "step": 631 + }, + { + "entropy": 1.9505741894245148, + "epoch": 2.2823529411764705, + "grad_norm": 0.6717761754989624, + "learning_rate": 0.0003786821266656512, + "loss": 0.1077, + "mean_token_accuracy": 0.9674138873815536, + "num_tokens": 5571618.0, + "step": 632 + }, + { + "entropy": 1.8377742171287537, + "epoch": 2.285972850678733, + "grad_norm": 0.6176472902297974, + "learning_rate": 0.0003782761955659185, + "loss": 0.1106, + "mean_token_accuracy": 0.9669957906007767, + "num_tokens": 5580668.0, + "step": 633 + }, + { + "entropy": 1.8336479365825653, + "epoch": 2.2895927601809953, + "grad_norm": 0.5120813846588135, + "learning_rate": 0.0003778698951887042, + "loss": 0.0732, + "mean_token_accuracy": 0.9774532318115234, + "num_tokens": 5589491.0, + "step": 634 + }, + { + "entropy": 1.9576656222343445, + "epoch": 2.2932126696832578, + "grad_norm": 0.9347079396247864, + "learning_rate": 0.00037746322707718895, + "loss": 0.2275, + "mean_token_accuracy": 0.9512088149785995, + "num_tokens": 5598327.0, + "step": 635 + }, + { + "entropy": 1.9309991896152496, + "epoch": 2.29683257918552, + "grad_norm": 0.506108283996582, + "learning_rate": 0.0003770561927759502, + "loss": 0.1046, + "mean_token_accuracy": 0.9633967131376266, + "num_tokens": 5606948.0, + "step": 636 + }, + { + "entropy": 1.963425725698471, + "epoch": 2.3004524886877826, + "grad_norm": 0.5499919056892395, + "learning_rate": 0.0003766487938309561, + "loss": 0.0804, + "mean_token_accuracy": 0.9783825874328613, + "num_tokens": 5615342.0, + "step": 637 + }, + { + "entropy": 1.8853708505630493, + "epoch": 2.304072398190045, + "grad_norm": 0.5846657156944275, + "learning_rate": 0.00037624103178955946, + "loss": 0.0904, + "mean_token_accuracy": 0.9774703830480576, + "num_tokens": 5624449.0, + "step": 638 + }, + { + "entropy": 1.928403079509735, + "epoch": 2.3076923076923075, + "grad_norm": 0.5203971266746521, + "learning_rate": 0.0003758329082004928, + "loss": 0.0917, + "mean_token_accuracy": 0.9723261743783951, + "num_tokens": 5633273.0, + "step": 639 + }, + { + "entropy": 1.8914157152175903, + "epoch": 2.31131221719457, + "grad_norm": 0.5215239524841309, + "learning_rate": 0.00037542442461386145, + "loss": 0.1072, + "mean_token_accuracy": 0.9704900681972504, + "num_tokens": 5642357.0, + "step": 640 + }, + { + "entropy": 1.9754666090011597, + "epoch": 2.3149321266968323, + "grad_norm": 0.6710624694824219, + "learning_rate": 0.0003750155825811379, + "loss": 0.1344, + "mean_token_accuracy": 0.9615458548069, + "num_tokens": 5651409.0, + "step": 641 + }, + { + "entropy": 1.97001314163208, + "epoch": 2.318552036199095, + "grad_norm": 0.6511638164520264, + "learning_rate": 0.00037460638365515673, + "loss": 0.0502, + "mean_token_accuracy": 0.9829420000314713, + "num_tokens": 5660362.0, + "step": 642 + }, + { + "entropy": 1.9473612904548645, + "epoch": 2.3221719457013577, + "grad_norm": 0.5315663814544678, + "learning_rate": 0.00037419682939010725, + "loss": 0.1004, + "mean_token_accuracy": 0.9741797298192978, + "num_tokens": 5669386.0, + "step": 643 + }, + { + "entropy": 1.9136508405208588, + "epoch": 2.32579185520362, + "grad_norm": 0.6636398434638977, + "learning_rate": 0.00037378692134152887, + "loss": 0.0928, + "mean_token_accuracy": 0.9753085225820541, + "num_tokens": 5678226.0, + "step": 644 + }, + { + "entropy": 2.0870893597602844, + "epoch": 2.3294117647058825, + "grad_norm": 0.45003074407577515, + "learning_rate": 0.00037337666106630464, + "loss": 0.0937, + "mean_token_accuracy": 0.9742898046970367, + "num_tokens": 5687017.0, + "step": 645 + }, + { + "entropy": 2.084017276763916, + "epoch": 2.333031674208145, + "grad_norm": 0.6305840611457825, + "learning_rate": 0.0003729660501226553, + "loss": 0.1085, + "mean_token_accuracy": 0.9696957617998123, + "num_tokens": 5695585.0, + "step": 646 + }, + { + "entropy": 2.0916273295879364, + "epoch": 2.3366515837104074, + "grad_norm": 0.6674802303314209, + "learning_rate": 0.00037255509007013353, + "loss": 0.1214, + "mean_token_accuracy": 0.9657080322504044, + "num_tokens": 5704167.0, + "step": 647 + }, + { + "entropy": 2.0445155799388885, + "epoch": 2.34027149321267, + "grad_norm": 0.9245135188102722, + "learning_rate": 0.0003721437824696181, + "loss": 0.124, + "mean_token_accuracy": 0.9668982475996017, + "num_tokens": 5712896.0, + "step": 648 + }, + { + "entropy": 2.040050685405731, + "epoch": 2.3438914027149322, + "grad_norm": 0.558266818523407, + "learning_rate": 0.00037173212888330756, + "loss": 0.103, + "mean_token_accuracy": 0.9663692861795425, + "num_tokens": 5721568.0, + "step": 649 + }, + { + "entropy": 2.078313887119293, + "epoch": 2.3475113122171947, + "grad_norm": 0.6157237887382507, + "learning_rate": 0.0003713201308747148, + "loss": 0.1247, + "mean_token_accuracy": 0.9645204842090607, + "num_tokens": 5730097.0, + "step": 650 + }, + { + "entropy": 1.9473297894001007, + "epoch": 2.351131221719457, + "grad_norm": 0.6460309028625488, + "learning_rate": 0.0003709077900086607, + "loss": 0.193, + "mean_token_accuracy": 0.9537071883678436, + "num_tokens": 5738953.0, + "step": 651 + }, + { + "entropy": 1.9319245219230652, + "epoch": 2.3547511312217195, + "grad_norm": 0.826302170753479, + "learning_rate": 0.0003704951078512684, + "loss": 0.2072, + "mean_token_accuracy": 0.9553762674331665, + "num_tokens": 5748421.0, + "step": 652 + }, + { + "entropy": 2.000667005777359, + "epoch": 2.358371040723982, + "grad_norm": 0.508975625038147, + "learning_rate": 0.00037008208596995743, + "loss": 0.1124, + "mean_token_accuracy": 0.9674097448587418, + "num_tokens": 5757333.0, + "step": 653 + }, + { + "entropy": 1.9692010879516602, + "epoch": 2.3619909502262444, + "grad_norm": 0.597391664981842, + "learning_rate": 0.00036966872593343747, + "loss": 0.0958, + "mean_token_accuracy": 0.9727880656719208, + "num_tokens": 5766427.0, + "step": 654 + }, + { + "entropy": 1.9356706142425537, + "epoch": 2.365610859728507, + "grad_norm": 0.6264978051185608, + "learning_rate": 0.0003692550293117025, + "loss": 0.0925, + "mean_token_accuracy": 0.9736592024564743, + "num_tokens": 5775578.0, + "step": 655 + }, + { + "entropy": 2.086688846349716, + "epoch": 2.3692307692307693, + "grad_norm": 0.926537811756134, + "learning_rate": 0.00036884099767602523, + "loss": 0.1772, + "mean_token_accuracy": 0.9588586837053299, + "num_tokens": 5783754.0, + "step": 656 + }, + { + "entropy": 1.8272685706615448, + "epoch": 2.3728506787330317, + "grad_norm": 0.5276276469230652, + "learning_rate": 0.0003684266325989504, + "loss": 0.106, + "mean_token_accuracy": 0.9692760407924652, + "num_tokens": 5793159.0, + "step": 657 + }, + { + "entropy": 1.8490014672279358, + "epoch": 2.376470588235294, + "grad_norm": 0.6970511078834534, + "learning_rate": 0.0003680119356542895, + "loss": 0.0849, + "mean_token_accuracy": 0.9812656790018082, + "num_tokens": 5802503.0, + "step": 658 + }, + { + "entropy": 1.8577990531921387, + "epoch": 2.3800904977375565, + "grad_norm": 0.49535682797431946, + "learning_rate": 0.00036759690841711435, + "loss": 0.0965, + "mean_token_accuracy": 0.9723764955997467, + "num_tokens": 5811839.0, + "step": 659 + }, + { + "entropy": 1.785957396030426, + "epoch": 2.383710407239819, + "grad_norm": 0.7373266220092773, + "learning_rate": 0.00036718155246375124, + "loss": 0.103, + "mean_token_accuracy": 0.9659082442522049, + "num_tokens": 5821076.0, + "step": 660 + }, + { + "entropy": 1.8944315016269684, + "epoch": 2.3873303167420814, + "grad_norm": 0.4784161448478699, + "learning_rate": 0.000366765869371775, + "loss": 0.0899, + "mean_token_accuracy": 0.9731316566467285, + "num_tokens": 5830098.0, + "step": 661 + }, + { + "entropy": 1.8901143372058868, + "epoch": 2.390950226244344, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.00036634986072000305, + "loss": 0.078, + "mean_token_accuracy": 0.9769923985004425, + "num_tokens": 5839149.0, + "step": 662 + }, + { + "entropy": 1.8183043003082275, + "epoch": 2.3945701357466063, + "grad_norm": 0.48431649804115295, + "learning_rate": 0.0003659335280884893, + "loss": 0.0669, + "mean_token_accuracy": 0.978607714176178, + "num_tokens": 5848064.0, + "step": 663 + }, + { + "entropy": 1.7216700911521912, + "epoch": 2.3981900452488687, + "grad_norm": 0.5597919821739197, + "learning_rate": 0.00036551687305851803, + "loss": 0.1026, + "mean_token_accuracy": 0.9733614027500153, + "num_tokens": 5857075.0, + "step": 664 + }, + { + "entropy": 1.7788107991218567, + "epoch": 2.401809954751131, + "grad_norm": 0.6780642867088318, + "learning_rate": 0.00036509989721259824, + "loss": 0.0895, + "mean_token_accuracy": 0.9711848199367523, + "num_tokens": 5866029.0, + "step": 665 + }, + { + "entropy": 1.8354471325874329, + "epoch": 2.4054298642533936, + "grad_norm": 0.6284046769142151, + "learning_rate": 0.0003646826021344573, + "loss": 0.1153, + "mean_token_accuracy": 0.9645407199859619, + "num_tokens": 5874523.0, + "step": 666 + }, + { + "entropy": 1.829980492591858, + "epoch": 2.409049773755656, + "grad_norm": 0.6398605704307556, + "learning_rate": 0.00036426498940903506, + "loss": 0.0605, + "mean_token_accuracy": 0.9823256582021713, + "num_tokens": 5883067.0, + "step": 667 + }, + { + "entropy": 1.839373379945755, + "epoch": 2.4126696832579184, + "grad_norm": 0.6254173517227173, + "learning_rate": 0.000363847060622478, + "loss": 0.0708, + "mean_token_accuracy": 0.978134423494339, + "num_tokens": 5891921.0, + "step": 668 + }, + { + "entropy": 1.7790280282497406, + "epoch": 2.416289592760181, + "grad_norm": 0.5987306833267212, + "learning_rate": 0.0003634288173621326, + "loss": 0.0888, + "mean_token_accuracy": 0.9814571887254715, + "num_tokens": 5900603.0, + "step": 669 + }, + { + "entropy": 1.6918425559997559, + "epoch": 2.4199095022624433, + "grad_norm": 0.784694492816925, + "learning_rate": 0.00036301026121654057, + "loss": 0.1353, + "mean_token_accuracy": 0.9646909832954407, + "num_tokens": 5910028.0, + "step": 670 + }, + { + "entropy": 1.726965218782425, + "epoch": 2.4235294117647057, + "grad_norm": 0.7017857432365417, + "learning_rate": 0.00036259139377543104, + "loss": 0.1531, + "mean_token_accuracy": 0.9617924690246582, + "num_tokens": 5919145.0, + "step": 671 + }, + { + "entropy": 1.7354467511177063, + "epoch": 2.427149321266968, + "grad_norm": 0.49217918515205383, + "learning_rate": 0.00036217221662971613, + "loss": 0.1217, + "mean_token_accuracy": 0.96451136469841, + "num_tokens": 5928203.0, + "step": 672 + }, + { + "entropy": 1.827672392129898, + "epoch": 2.430769230769231, + "grad_norm": 0.5875037312507629, + "learning_rate": 0.0003617527313714841, + "loss": 0.1151, + "mean_token_accuracy": 0.9714375436306, + "num_tokens": 5936876.0, + "step": 673 + }, + { + "entropy": 1.787518948316574, + "epoch": 2.4343891402714934, + "grad_norm": 0.5444310307502747, + "learning_rate": 0.0003613329395939933, + "loss": 0.1096, + "mean_token_accuracy": 0.9701481461524963, + "num_tokens": 5946025.0, + "step": 674 + }, + { + "entropy": 1.832441657781601, + "epoch": 2.438009049773756, + "grad_norm": 0.6885861754417419, + "learning_rate": 0.00036091284289166637, + "loss": 0.1409, + "mean_token_accuracy": 0.9587968736886978, + "num_tokens": 5954406.0, + "step": 675 + }, + { + "entropy": 1.7488494515419006, + "epoch": 2.4416289592760183, + "grad_norm": 0.4765988290309906, + "learning_rate": 0.0003604924428600843, + "loss": 0.1183, + "mean_token_accuracy": 0.9581810384988785, + "num_tokens": 5963472.0, + "step": 676 + }, + { + "entropy": 1.885668009519577, + "epoch": 2.4452488687782807, + "grad_norm": 0.7310354113578796, + "learning_rate": 0.00036007174109597983, + "loss": 0.1248, + "mean_token_accuracy": 0.9588694721460342, + "num_tokens": 5971771.0, + "step": 677 + }, + { + "entropy": 1.8329627513885498, + "epoch": 2.448868778280543, + "grad_norm": 0.37075191736221313, + "learning_rate": 0.00035965073919723206, + "loss": 0.0694, + "mean_token_accuracy": 0.9812011271715164, + "num_tokens": 5980536.0, + "step": 678 + }, + { + "entropy": 1.8218618333339691, + "epoch": 2.4524886877828056, + "grad_norm": 0.5196499228477478, + "learning_rate": 0.0003592294387628597, + "loss": 0.0833, + "mean_token_accuracy": 0.9765996187925339, + "num_tokens": 5989462.0, + "step": 679 + }, + { + "entropy": 1.7702144086360931, + "epoch": 2.456108597285068, + "grad_norm": 0.68550044298172, + "learning_rate": 0.0003588078413930155, + "loss": 0.1395, + "mean_token_accuracy": 0.9701545089483261, + "num_tokens": 5998702.0, + "step": 680 + }, + { + "entropy": 1.729397028684616, + "epoch": 2.4597285067873305, + "grad_norm": 0.6107930541038513, + "learning_rate": 0.00035838594868898004, + "loss": 0.1009, + "mean_token_accuracy": 0.9712544083595276, + "num_tokens": 6007594.0, + "step": 681 + }, + { + "entropy": 1.6558150053024292, + "epoch": 2.463348416289593, + "grad_norm": 0.45058509707450867, + "learning_rate": 0.0003579637622531555, + "loss": 0.0747, + "mean_token_accuracy": 0.9791784882545471, + "num_tokens": 6016874.0, + "step": 682 + }, + { + "entropy": 1.7209869921207428, + "epoch": 2.4669683257918553, + "grad_norm": 0.6103800535202026, + "learning_rate": 0.0003575412836890599, + "loss": 0.1096, + "mean_token_accuracy": 0.9665796160697937, + "num_tokens": 6026056.0, + "step": 683 + }, + { + "entropy": 1.790249615907669, + "epoch": 2.4705882352941178, + "grad_norm": 0.67525315284729, + "learning_rate": 0.0003571185146013205, + "loss": 0.0811, + "mean_token_accuracy": 0.9776998162269592, + "num_tokens": 6034624.0, + "step": 684 + }, + { + "entropy": 1.735906183719635, + "epoch": 2.47420814479638, + "grad_norm": 0.884986162185669, + "learning_rate": 0.00035669545659566836, + "loss": 0.2324, + "mean_token_accuracy": 0.9448857754468918, + "num_tokens": 6043557.0, + "step": 685 + }, + { + "entropy": 1.673194944858551, + "epoch": 2.4778280542986426, + "grad_norm": 0.7441328763961792, + "learning_rate": 0.0003562721112789316, + "loss": 0.1661, + "mean_token_accuracy": 0.9566781520843506, + "num_tokens": 6052623.0, + "step": 686 + }, + { + "entropy": 1.736072987318039, + "epoch": 2.481447963800905, + "grad_norm": 0.5674424767494202, + "learning_rate": 0.00035584848025902973, + "loss": 0.0751, + "mean_token_accuracy": 0.9750215858221054, + "num_tokens": 6061347.0, + "step": 687 + }, + { + "entropy": 1.625234305858612, + "epoch": 2.4850678733031675, + "grad_norm": 0.6596720218658447, + "learning_rate": 0.00035542456514496725, + "loss": 0.0796, + "mean_token_accuracy": 0.9773041009902954, + "num_tokens": 6070396.0, + "step": 688 + }, + { + "entropy": 1.6548752784729004, + "epoch": 2.48868778280543, + "grad_norm": 0.5798892378807068, + "learning_rate": 0.00035500036754682794, + "loss": 0.1412, + "mean_token_accuracy": 0.9653023481369019, + "num_tokens": 6079757.0, + "step": 689 + }, + { + "entropy": 1.6213977932929993, + "epoch": 2.4923076923076923, + "grad_norm": 0.44931474328041077, + "learning_rate": 0.00035457588907576823, + "loss": 0.0724, + "mean_token_accuracy": 0.9800422787666321, + "num_tokens": 6088646.0, + "step": 690 + }, + { + "entropy": 1.6762541830539703, + "epoch": 2.4959276018099548, + "grad_norm": 0.6818104386329651, + "learning_rate": 0.0003541511313440114, + "loss": 0.1217, + "mean_token_accuracy": 0.9675028026103973, + "num_tokens": 6097441.0, + "step": 691 + }, + { + "entropy": 1.7241974771022797, + "epoch": 2.499547511312217, + "grad_norm": 0.4126259982585907, + "learning_rate": 0.00035372609596484166, + "loss": 0.0615, + "mean_token_accuracy": 0.9799284338951111, + "num_tokens": 6105578.0, + "step": 692 + }, + { + "entropy": 1.6379709541797638, + "epoch": 2.5031674208144796, + "grad_norm": 0.47291842103004456, + "learning_rate": 0.00035330078455259734, + "loss": 0.0858, + "mean_token_accuracy": 0.9744312763214111, + "num_tokens": 6114404.0, + "step": 693 + }, + { + "entropy": 1.6317658722400665, + "epoch": 2.506787330316742, + "grad_norm": 0.5747683048248291, + "learning_rate": 0.00035287519872266544, + "loss": 0.1344, + "mean_token_accuracy": 0.9632531553506851, + "num_tokens": 6123319.0, + "step": 694 + }, + { + "entropy": 1.6969698369503021, + "epoch": 2.5104072398190045, + "grad_norm": 0.5810018181800842, + "learning_rate": 0.00035244934009147523, + "loss": 0.0927, + "mean_token_accuracy": 0.9729650169610977, + "num_tokens": 6131814.0, + "step": 695 + }, + { + "entropy": 1.631262481212616, + "epoch": 2.514027149321267, + "grad_norm": 0.44387346506118774, + "learning_rate": 0.00035202321027649205, + "loss": 0.0657, + "mean_token_accuracy": 0.9802225232124329, + "num_tokens": 6140967.0, + "step": 696 + }, + { + "entropy": 1.610716551542282, + "epoch": 2.5176470588235293, + "grad_norm": 0.6546471118927002, + "learning_rate": 0.0003515968108962112, + "loss": 0.1114, + "mean_token_accuracy": 0.9671156108379364, + "num_tokens": 6149938.0, + "step": 697 + }, + { + "entropy": 1.598843276500702, + "epoch": 2.521266968325792, + "grad_norm": 0.541953444480896, + "learning_rate": 0.0003511701435701519, + "loss": 0.0504, + "mean_token_accuracy": 0.98616062104702, + "num_tokens": 6158686.0, + "step": 698 + }, + { + "entropy": 1.7793676853179932, + "epoch": 2.524886877828054, + "grad_norm": 0.6303162574768066, + "learning_rate": 0.00035074320991885106, + "loss": 0.0797, + "mean_token_accuracy": 0.9783169627189636, + "num_tokens": 6166835.0, + "step": 699 + }, + { + "entropy": 1.598317414522171, + "epoch": 2.5285067873303166, + "grad_norm": 0.4783090054988861, + "learning_rate": 0.000350316011563857, + "loss": 0.0693, + "mean_token_accuracy": 0.9740357846021652, + "num_tokens": 6175978.0, + "step": 700 + }, + { + "entropy": 1.6361595392227173, + "epoch": 2.532126696832579, + "grad_norm": 0.46353498101234436, + "learning_rate": 0.00034988855012772367, + "loss": 0.0543, + "mean_token_accuracy": 0.9821173399686813, + "num_tokens": 6185071.0, + "step": 701 + }, + { + "entropy": 1.6333596408367157, + "epoch": 2.5357466063348415, + "grad_norm": 0.4968421459197998, + "learning_rate": 0.0003494608272340039, + "loss": 0.1588, + "mean_token_accuracy": 0.9692430347204208, + "num_tokens": 6194279.0, + "step": 702 + }, + { + "entropy": 1.6701206266880035, + "epoch": 2.539366515837104, + "grad_norm": 0.7050784826278687, + "learning_rate": 0.00034903284450724385, + "loss": 0.1298, + "mean_token_accuracy": 0.9623726159334183, + "num_tokens": 6203017.0, + "step": 703 + }, + { + "entropy": 1.6594900786876678, + "epoch": 2.5429864253393664, + "grad_norm": 0.7955659031867981, + "learning_rate": 0.0003486046035729765, + "loss": 0.1695, + "mean_token_accuracy": 0.9616524875164032, + "num_tokens": 6212016.0, + "step": 704 + }, + { + "entropy": 1.7208792865276337, + "epoch": 2.546606334841629, + "grad_norm": 0.7105070352554321, + "learning_rate": 0.00034817610605771546, + "loss": 0.1655, + "mean_token_accuracy": 0.9637335985898972, + "num_tokens": 6220619.0, + "step": 705 + }, + { + "entropy": 1.668517529964447, + "epoch": 2.5502262443438912, + "grad_norm": 0.3955032527446747, + "learning_rate": 0.0003477473535889488, + "loss": 0.0502, + "mean_token_accuracy": 0.9823585599660873, + "num_tokens": 6229785.0, + "step": 706 + }, + { + "entropy": 1.7515103816986084, + "epoch": 2.5538461538461537, + "grad_norm": 0.6166616082191467, + "learning_rate": 0.00034731834779513313, + "loss": 0.1113, + "mean_token_accuracy": 0.9675650298595428, + "num_tokens": 6238724.0, + "step": 707 + }, + { + "entropy": 1.8460631668567657, + "epoch": 2.557466063348416, + "grad_norm": 0.8243921399116516, + "learning_rate": 0.0003468890903056872, + "loss": 0.1625, + "mean_token_accuracy": 0.9648249596357346, + "num_tokens": 6246939.0, + "step": 708 + }, + { + "entropy": 1.784417450428009, + "epoch": 2.5610859728506785, + "grad_norm": 0.5633116960525513, + "learning_rate": 0.00034645958275098557, + "loss": 0.1074, + "mean_token_accuracy": 0.9705483913421631, + "num_tokens": 6255686.0, + "step": 709 + }, + { + "entropy": 1.7208334505558014, + "epoch": 2.564705882352941, + "grad_norm": 0.8083389401435852, + "learning_rate": 0.0003460298267623526, + "loss": 0.1184, + "mean_token_accuracy": 0.9747882932424545, + "num_tokens": 6265047.0, + "step": 710 + }, + { + "entropy": 1.7345463037490845, + "epoch": 2.5683257918552034, + "grad_norm": 0.6094368100166321, + "learning_rate": 0.0003455998239720565, + "loss": 0.1689, + "mean_token_accuracy": 0.9613602459430695, + "num_tokens": 6274460.0, + "step": 711 + }, + { + "entropy": 1.9464713335037231, + "epoch": 2.571945701357466, + "grad_norm": 0.6025084853172302, + "learning_rate": 0.0003451695760133025, + "loss": 0.1477, + "mean_token_accuracy": 0.9618766456842422, + "num_tokens": 6282700.0, + "step": 712 + }, + { + "entropy": 1.8449675738811493, + "epoch": 2.5755656108597282, + "grad_norm": 0.43869853019714355, + "learning_rate": 0.0003447390845202272, + "loss": 0.0892, + "mean_token_accuracy": 0.974039301276207, + "num_tokens": 6291627.0, + "step": 713 + }, + { + "entropy": 1.9028298556804657, + "epoch": 2.579185520361991, + "grad_norm": 0.5455291271209717, + "learning_rate": 0.0003443083511278922, + "loss": 0.0939, + "mean_token_accuracy": 0.9729337990283966, + "num_tokens": 6300198.0, + "step": 714 + }, + { + "entropy": 1.8395194113254547, + "epoch": 2.5828054298642535, + "grad_norm": 0.48734748363494873, + "learning_rate": 0.00034387737747227786, + "loss": 0.0791, + "mean_token_accuracy": 0.9785804748535156, + "num_tokens": 6309362.0, + "step": 715 + }, + { + "entropy": 1.8357026278972626, + "epoch": 2.586425339366516, + "grad_norm": 0.4359396994113922, + "learning_rate": 0.000343446165190277, + "loss": 0.0752, + "mean_token_accuracy": 0.9807359129190445, + "num_tokens": 6318232.0, + "step": 716 + }, + { + "entropy": 1.7531521618366241, + "epoch": 2.5900452488687784, + "grad_norm": 0.7446436882019043, + "learning_rate": 0.0003430147159196887, + "loss": 0.1467, + "mean_token_accuracy": 0.9661064445972443, + "num_tokens": 6327607.0, + "step": 717 + }, + { + "entropy": 1.83816197514534, + "epoch": 2.593665158371041, + "grad_norm": 0.3669150173664093, + "learning_rate": 0.0003425830312992125, + "loss": 0.076, + "mean_token_accuracy": 0.9777591675519943, + "num_tokens": 6336991.0, + "step": 718 + }, + { + "entropy": 1.9396244585514069, + "epoch": 2.5972850678733033, + "grad_norm": 0.6049129962921143, + "learning_rate": 0.00034215111296844147, + "loss": 0.1001, + "mean_token_accuracy": 0.968943640589714, + "num_tokens": 6345381.0, + "step": 719 + }, + { + "entropy": 1.8745197057724, + "epoch": 2.6009049773755657, + "grad_norm": 0.8561233878135681, + "learning_rate": 0.00034171896256785645, + "loss": 0.2378, + "mean_token_accuracy": 0.9442594349384308, + "num_tokens": 6354290.0, + "step": 720 + }, + { + "entropy": 1.8199078440666199, + "epoch": 2.604524886877828, + "grad_norm": 0.4546636939048767, + "learning_rate": 0.00034128658173881993, + "loss": 0.0407, + "mean_token_accuracy": 0.9873656630516052, + "num_tokens": 6362826.0, + "step": 721 + }, + { + "entropy": 1.8066097497940063, + "epoch": 2.6081447963800906, + "grad_norm": 0.6496687531471252, + "learning_rate": 0.0003408539721235691, + "loss": 0.1279, + "mean_token_accuracy": 0.9674505293369293, + "num_tokens": 6371666.0, + "step": 722 + }, + { + "entropy": 1.8027856945991516, + "epoch": 2.611764705882353, + "grad_norm": 0.6001412272453308, + "learning_rate": 0.0003404211353652106, + "loss": 0.1144, + "mean_token_accuracy": 0.9672902077436447, + "num_tokens": 6380469.0, + "step": 723 + }, + { + "entropy": 1.7859437465667725, + "epoch": 2.6153846153846154, + "grad_norm": 0.4654795229434967, + "learning_rate": 0.0003399880731077136, + "loss": 0.0655, + "mean_token_accuracy": 0.9804074019193649, + "num_tokens": 6389485.0, + "step": 724 + }, + { + "entropy": 1.722127079963684, + "epoch": 2.619004524886878, + "grad_norm": 0.5452624559402466, + "learning_rate": 0.0003395547869959037, + "loss": 0.0827, + "mean_token_accuracy": 0.972189649939537, + "num_tokens": 6398523.0, + "step": 725 + }, + { + "entropy": 1.7406074404716492, + "epoch": 2.6226244343891403, + "grad_norm": 0.5524203181266785, + "learning_rate": 0.00033912127867545685, + "loss": 0.1279, + "mean_token_accuracy": 0.9688322842121124, + "num_tokens": 6407560.0, + "step": 726 + }, + { + "entropy": 1.7783840000629425, + "epoch": 2.6262443438914027, + "grad_norm": 0.6428073644638062, + "learning_rate": 0.00033868754979289275, + "loss": 0.1392, + "mean_token_accuracy": 0.9665655642747879, + "num_tokens": 6416230.0, + "step": 727 + }, + { + "entropy": 1.7406431436538696, + "epoch": 2.629864253393665, + "grad_norm": 0.6197221875190735, + "learning_rate": 0.0003382536019955691, + "loss": 0.2688, + "mean_token_accuracy": 0.9567561745643616, + "num_tokens": 6425158.0, + "step": 728 + }, + { + "entropy": 1.7054848670959473, + "epoch": 2.6334841628959276, + "grad_norm": 0.499615877866745, + "learning_rate": 0.0003378194369316749, + "loss": 0.0765, + "mean_token_accuracy": 0.9788558930158615, + "num_tokens": 6434219.0, + "step": 729 + }, + { + "entropy": 1.8623437583446503, + "epoch": 2.63710407239819, + "grad_norm": 0.428608775138855, + "learning_rate": 0.0003373850562502243, + "loss": 0.044, + "mean_token_accuracy": 0.9862259030342102, + "num_tokens": 6442657.0, + "step": 730 + }, + { + "entropy": 1.6827208995819092, + "epoch": 2.6407239819004524, + "grad_norm": 0.46222713589668274, + "learning_rate": 0.00033695046160105076, + "loss": 0.0687, + "mean_token_accuracy": 0.9762164503335953, + "num_tokens": 6451550.0, + "step": 731 + }, + { + "entropy": 1.707773894071579, + "epoch": 2.644343891402715, + "grad_norm": 0.4701695442199707, + "learning_rate": 0.0003365156546347998, + "loss": 0.0622, + "mean_token_accuracy": 0.9804075062274933, + "num_tokens": 6460494.0, + "step": 732 + }, + { + "entropy": 1.7011042833328247, + "epoch": 2.6479638009049773, + "grad_norm": 0.5986224412918091, + "learning_rate": 0.0003360806370029239, + "loss": 0.0954, + "mean_token_accuracy": 0.9730664491653442, + "num_tokens": 6469728.0, + "step": 733 + }, + { + "entropy": 1.810427963733673, + "epoch": 2.6515837104072397, + "grad_norm": 0.8224559426307678, + "learning_rate": 0.0003356454103576754, + "loss": 0.1218, + "mean_token_accuracy": 0.9742488712072372, + "num_tokens": 6478643.0, + "step": 734 + }, + { + "entropy": 1.773183435201645, + "epoch": 2.655203619909502, + "grad_norm": 0.609344482421875, + "learning_rate": 0.0003352099763521006, + "loss": 0.0955, + "mean_token_accuracy": 0.9747250378131866, + "num_tokens": 6487314.0, + "step": 735 + }, + { + "entropy": 1.7761066555976868, + "epoch": 2.6588235294117646, + "grad_norm": 0.6947258114814758, + "learning_rate": 0.0003347743366400333, + "loss": 0.1188, + "mean_token_accuracy": 0.9693178832530975, + "num_tokens": 6496074.0, + "step": 736 + }, + { + "entropy": 1.7725336253643036, + "epoch": 2.662443438914027, + "grad_norm": 0.6928444504737854, + "learning_rate": 0.0003343384928760887, + "loss": 0.1589, + "mean_token_accuracy": 0.9603369683027267, + "num_tokens": 6504997.0, + "step": 737 + }, + { + "entropy": 1.8763961493968964, + "epoch": 2.6660633484162894, + "grad_norm": 0.6204855442047119, + "learning_rate": 0.00033390244671565694, + "loss": 0.1115, + "mean_token_accuracy": 0.9727036952972412, + "num_tokens": 6513639.0, + "step": 738 + }, + { + "entropy": 1.8347080647945404, + "epoch": 2.669683257918552, + "grad_norm": 0.4470975697040558, + "learning_rate": 0.00033346619981489687, + "loss": 0.0707, + "mean_token_accuracy": 0.9816004037857056, + "num_tokens": 6522524.0, + "step": 739 + }, + { + "entropy": 1.8440867066383362, + "epoch": 2.6733031674208148, + "grad_norm": 0.6848122477531433, + "learning_rate": 0.0003330297538307298, + "loss": 0.1133, + "mean_token_accuracy": 0.966602012515068, + "num_tokens": 6531421.0, + "step": 740 + }, + { + "entropy": 1.829009771347046, + "epoch": 2.676923076923077, + "grad_norm": 0.37875643372535706, + "learning_rate": 0.0003325931104208333, + "loss": 0.0539, + "mean_token_accuracy": 0.9850967526435852, + "num_tokens": 6540304.0, + "step": 741 + }, + { + "entropy": 1.8256315886974335, + "epoch": 2.6805429864253396, + "grad_norm": 0.4970630407333374, + "learning_rate": 0.00033215627124363466, + "loss": 0.1195, + "mean_token_accuracy": 0.9662436544895172, + "num_tokens": 6549267.0, + "step": 742 + }, + { + "entropy": 1.823629915714264, + "epoch": 2.684162895927602, + "grad_norm": 0.659981906414032, + "learning_rate": 0.0003317192379583047, + "loss": 0.1368, + "mean_token_accuracy": 0.9655566364526749, + "num_tokens": 6558447.0, + "step": 743 + }, + { + "entropy": 1.8459455370903015, + "epoch": 2.6877828054298645, + "grad_norm": 0.620197057723999, + "learning_rate": 0.0003312820122247515, + "loss": 0.1766, + "mean_token_accuracy": 0.9569400995969772, + "num_tokens": 6567424.0, + "step": 744 + }, + { + "entropy": 1.7685991525650024, + "epoch": 2.691402714932127, + "grad_norm": 0.34498465061187744, + "learning_rate": 0.0003308445957036142, + "loss": 0.0615, + "mean_token_accuracy": 0.982216015458107, + "num_tokens": 6577071.0, + "step": 745 + }, + { + "entropy": 1.8037284910678864, + "epoch": 2.6950226244343893, + "grad_norm": 0.5550521016120911, + "learning_rate": 0.00033040699005625654, + "loss": 0.0701, + "mean_token_accuracy": 0.9795115292072296, + "num_tokens": 6586396.0, + "step": 746 + }, + { + "entropy": 1.813001424074173, + "epoch": 2.6986425339366518, + "grad_norm": 0.4117080271244049, + "learning_rate": 0.0003299691969447603, + "loss": 0.0657, + "mean_token_accuracy": 0.978747770190239, + "num_tokens": 6595189.0, + "step": 747 + }, + { + "entropy": 1.844575196504593, + "epoch": 2.702262443438914, + "grad_norm": 0.32197874784469604, + "learning_rate": 0.00032953121803191976, + "loss": 0.0342, + "mean_token_accuracy": 0.9904316365718842, + "num_tokens": 6604169.0, + "step": 748 + }, + { + "entropy": 1.9490505158901215, + "epoch": 2.7058823529411766, + "grad_norm": 0.5810762047767639, + "learning_rate": 0.00032909305498123465, + "loss": 0.1419, + "mean_token_accuracy": 0.9646100401878357, + "num_tokens": 6612744.0, + "step": 749 + }, + { + "entropy": 1.9927488267421722, + "epoch": 2.709502262443439, + "grad_norm": 0.7435065507888794, + "learning_rate": 0.0003286547094569039, + "loss": 0.1368, + "mean_token_accuracy": 0.9609140008687973, + "num_tokens": 6621000.0, + "step": 750 + }, + { + "entropy": 1.8266884088516235, + "epoch": 2.7131221719457015, + "grad_norm": 0.6717537045478821, + "learning_rate": 0.00032821618312381975, + "loss": 0.1449, + "mean_token_accuracy": 0.9694183021783829, + "num_tokens": 6629893.0, + "step": 751 + }, + { + "entropy": 1.850794643163681, + "epoch": 2.716742081447964, + "grad_norm": 0.44241195917129517, + "learning_rate": 0.00032777747764756117, + "loss": 0.0602, + "mean_token_accuracy": 0.9823136776685715, + "num_tokens": 6638696.0, + "step": 752 + }, + { + "entropy": 1.8408480882644653, + "epoch": 2.7203619909502263, + "grad_norm": 0.6299809217453003, + "learning_rate": 0.00032733859469438736, + "loss": 0.1408, + "mean_token_accuracy": 0.9629880636930466, + "num_tokens": 6647431.0, + "step": 753 + }, + { + "entropy": 1.7875444293022156, + "epoch": 2.723981900452489, + "grad_norm": 0.48492106795310974, + "learning_rate": 0.00032689953593123175, + "loss": 0.0806, + "mean_token_accuracy": 0.9798424690961838, + "num_tokens": 6656443.0, + "step": 754 + }, + { + "entropy": 1.778283566236496, + "epoch": 2.727601809954751, + "grad_norm": 0.46145930886268616, + "learning_rate": 0.0003264603030256955, + "loss": 0.0707, + "mean_token_accuracy": 0.9741399586200714, + "num_tokens": 6665465.0, + "step": 755 + }, + { + "entropy": 1.7340950965881348, + "epoch": 2.7312217194570136, + "grad_norm": 0.5734900236129761, + "learning_rate": 0.00032602089764604126, + "loss": 0.1443, + "mean_token_accuracy": 0.96195288002491, + "num_tokens": 6674797.0, + "step": 756 + }, + { + "entropy": 1.7791962027549744, + "epoch": 2.734841628959276, + "grad_norm": 0.5199477076530457, + "learning_rate": 0.00032558132146118636, + "loss": 0.0794, + "mean_token_accuracy": 0.975062221288681, + "num_tokens": 6683578.0, + "step": 757 + }, + { + "entropy": 1.825905591249466, + "epoch": 2.7384615384615385, + "grad_norm": 0.5944926738739014, + "learning_rate": 0.0003251415761406975, + "loss": 0.0909, + "mean_token_accuracy": 0.954865038394928, + "num_tokens": 6691818.0, + "step": 758 + }, + { + "entropy": 1.804949015378952, + "epoch": 2.742081447963801, + "grad_norm": 0.7065241932868958, + "learning_rate": 0.0003247016633547833, + "loss": 0.1511, + "mean_token_accuracy": 0.9687065333127975, + "num_tokens": 6700619.0, + "step": 759 + }, + { + "entropy": 1.7419202327728271, + "epoch": 2.7457013574660634, + "grad_norm": 0.49316564202308655, + "learning_rate": 0.00032426158477428857, + "loss": 0.0867, + "mean_token_accuracy": 0.9774050414562225, + "num_tokens": 6709635.0, + "step": 760 + }, + { + "entropy": 1.8934829235076904, + "epoch": 2.749321266968326, + "grad_norm": 0.9417999386787415, + "learning_rate": 0.00032382134207068787, + "loss": 0.1464, + "mean_token_accuracy": 0.9591032713651657, + "num_tokens": 6717657.0, + "step": 761 + }, + { + "entropy": 1.7354997992515564, + "epoch": 2.7529411764705882, + "grad_norm": 0.7240809798240662, + "learning_rate": 0.00032338093691607907, + "loss": 0.13, + "mean_token_accuracy": 0.9705345183610916, + "num_tokens": 6726671.0, + "step": 762 + }, + { + "entropy": 1.7620687186717987, + "epoch": 2.7565610859728507, + "grad_norm": 0.4986638128757477, + "learning_rate": 0.0003229403709831772, + "loss": 0.0963, + "mean_token_accuracy": 0.9756871312856674, + "num_tokens": 6735157.0, + "step": 763 + }, + { + "entropy": 1.7719130218029022, + "epoch": 2.760180995475113, + "grad_norm": 0.6204966902732849, + "learning_rate": 0.00032249964594530757, + "loss": 0.0578, + "mean_token_accuracy": 0.9815829247236252, + "num_tokens": 6743855.0, + "step": 764 + }, + { + "entropy": 1.7228702902793884, + "epoch": 2.7638009049773755, + "grad_norm": 0.5283492207527161, + "learning_rate": 0.0003220587634764003, + "loss": 0.069, + "mean_token_accuracy": 0.9851528853178024, + "num_tokens": 6753040.0, + "step": 765 + }, + { + "entropy": 1.7129736840724945, + "epoch": 2.767420814479638, + "grad_norm": 0.49026060104370117, + "learning_rate": 0.0003216177252509831, + "loss": 0.0672, + "mean_token_accuracy": 0.9857761710882187, + "num_tokens": 6762014.0, + "step": 766 + }, + { + "entropy": 1.7600707411766052, + "epoch": 2.7710407239819004, + "grad_norm": 0.5250128507614136, + "learning_rate": 0.00032117653294417523, + "loss": 0.1134, + "mean_token_accuracy": 0.9638848602771759, + "num_tokens": 6771012.0, + "step": 767 + }, + { + "entropy": 1.768298476934433, + "epoch": 2.774660633484163, + "grad_norm": 0.5671310424804688, + "learning_rate": 0.00032073518823168143, + "loss": 0.057, + "mean_token_accuracy": 0.9840837568044662, + "num_tokens": 6779601.0, + "step": 768 + }, + { + "entropy": 1.7464122474193573, + "epoch": 2.7782805429864252, + "grad_norm": 0.6007266044616699, + "learning_rate": 0.0003202936927897852, + "loss": 0.081, + "mean_token_accuracy": 0.9773043692111969, + "num_tokens": 6788518.0, + "step": 769 + }, + { + "entropy": 1.6484523713588715, + "epoch": 2.7819004524886877, + "grad_norm": 0.5163906812667847, + "learning_rate": 0.00031985204829534236, + "loss": 0.1215, + "mean_token_accuracy": 0.9645300209522247, + "num_tokens": 6797924.0, + "step": 770 + }, + { + "entropy": 1.7306124567985535, + "epoch": 2.78552036199095, + "grad_norm": 0.5778948068618774, + "learning_rate": 0.00031941025642577515, + "loss": 0.127, + "mean_token_accuracy": 0.9713134616613388, + "num_tokens": 6806828.0, + "step": 771 + }, + { + "entropy": 1.6599189043045044, + "epoch": 2.7891402714932125, + "grad_norm": 0.5121646523475647, + "learning_rate": 0.0003189683188590653, + "loss": 0.1066, + "mean_token_accuracy": 0.9707446396350861, + "num_tokens": 6816144.0, + "step": 772 + }, + { + "entropy": 1.71377295255661, + "epoch": 2.792760180995475, + "grad_norm": 0.9535031318664551, + "learning_rate": 0.00031852623727374787, + "loss": 0.2316, + "mean_token_accuracy": 0.9587533473968506, + "num_tokens": 6824849.0, + "step": 773 + }, + { + "entropy": 1.7716725766658783, + "epoch": 2.7963800904977374, + "grad_norm": 0.5735589265823364, + "learning_rate": 0.00031808401334890537, + "loss": 0.1028, + "mean_token_accuracy": 0.9716143608093262, + "num_tokens": 6833331.0, + "step": 774 + }, + { + "entropy": 1.7134707272052765, + "epoch": 2.8, + "grad_norm": 0.7087857127189636, + "learning_rate": 0.00031764164876416036, + "loss": 0.1201, + "mean_token_accuracy": 0.9686445444822311, + "num_tokens": 6842254.0, + "step": 775 + }, + { + "entropy": 1.6055873930454254, + "epoch": 2.8036199095022623, + "grad_norm": 0.4578965902328491, + "learning_rate": 0.00031719914519967, + "loss": 0.0827, + "mean_token_accuracy": 0.972065269947052, + "num_tokens": 6851644.0, + "step": 776 + }, + { + "entropy": 1.6444376707077026, + "epoch": 2.8072398190045247, + "grad_norm": 0.5656917095184326, + "learning_rate": 0.0003167565043361194, + "loss": 0.1036, + "mean_token_accuracy": 0.9723617881536484, + "num_tokens": 6860787.0, + "step": 777 + }, + { + "entropy": 1.6980305314064026, + "epoch": 2.810859728506787, + "grad_norm": 0.7013098001480103, + "learning_rate": 0.0003163137278547146, + "loss": 0.0838, + "mean_token_accuracy": 0.9793482422828674, + "num_tokens": 6869378.0, + "step": 778 + }, + { + "entropy": 1.6744478940963745, + "epoch": 2.8144796380090495, + "grad_norm": 0.6889812350273132, + "learning_rate": 0.00031587081743717735, + "loss": 0.0964, + "mean_token_accuracy": 0.9762091189622879, + "num_tokens": 6878050.0, + "step": 779 + }, + { + "entropy": 1.6397214829921722, + "epoch": 2.818099547511312, + "grad_norm": 0.7166011333465576, + "learning_rate": 0.00031542777476573785, + "loss": 0.1792, + "mean_token_accuracy": 0.9539972990751266, + "num_tokens": 6887153.0, + "step": 780 + }, + { + "entropy": 1.6447750926017761, + "epoch": 2.8217194570135744, + "grad_norm": 0.7113035321235657, + "learning_rate": 0.0003149846015231286, + "loss": 0.1464, + "mean_token_accuracy": 0.96909099817276, + "num_tokens": 6895877.0, + "step": 781 + }, + { + "entropy": 1.6827795505523682, + "epoch": 2.825339366515837, + "grad_norm": 0.6915350556373596, + "learning_rate": 0.0003145412993925781, + "loss": 0.1335, + "mean_token_accuracy": 0.9615183472633362, + "num_tokens": 6904553.0, + "step": 782 + }, + { + "entropy": 1.6189779937267303, + "epoch": 2.8289592760180997, + "grad_norm": 0.467428982257843, + "learning_rate": 0.00031409787005780423, + "loss": 0.0829, + "mean_token_accuracy": 0.9781016558408737, + "num_tokens": 6913634.0, + "step": 783 + }, + { + "entropy": 1.6323690116405487, + "epoch": 2.832579185520362, + "grad_norm": 0.49170154333114624, + "learning_rate": 0.00031365431520300813, + "loss": 0.0828, + "mean_token_accuracy": 0.9719655811786652, + "num_tokens": 6922638.0, + "step": 784 + }, + { + "entropy": 1.6121336817741394, + "epoch": 2.8361990950226246, + "grad_norm": 0.5629302263259888, + "learning_rate": 0.00031321063651286777, + "loss": 0.0757, + "mean_token_accuracy": 0.9791934490203857, + "num_tokens": 6931590.0, + "step": 785 + }, + { + "entropy": 1.7345627546310425, + "epoch": 2.839819004524887, + "grad_norm": 0.5514137148857117, + "learning_rate": 0.0003127668356725313, + "loss": 0.0819, + "mean_token_accuracy": 0.9800210148096085, + "num_tokens": 6940137.0, + "step": 786 + }, + { + "entropy": 1.6671563386917114, + "epoch": 2.8434389140271494, + "grad_norm": 0.5090643167495728, + "learning_rate": 0.0003123229143676109, + "loss": 0.0794, + "mean_token_accuracy": 0.9826332330703735, + "num_tokens": 6948616.0, + "step": 787 + }, + { + "entropy": 1.551501840353012, + "epoch": 2.847058823529412, + "grad_norm": 0.3994922935962677, + "learning_rate": 0.0003118788742841761, + "loss": 0.0491, + "mean_token_accuracy": 0.9865831136703491, + "num_tokens": 6957369.0, + "step": 788 + }, + { + "entropy": 1.500845193862915, + "epoch": 2.8506787330316743, + "grad_norm": 0.6023295521736145, + "learning_rate": 0.00031143471710874795, + "loss": 0.114, + "mean_token_accuracy": 0.9669302552938461, + "num_tokens": 6966667.0, + "step": 789 + }, + { + "entropy": 1.5258118510246277, + "epoch": 2.8542986425339367, + "grad_norm": 0.5326524972915649, + "learning_rate": 0.00031099044452829186, + "loss": 0.0657, + "mean_token_accuracy": 0.9833361059427261, + "num_tokens": 6975880.0, + "step": 790 + }, + { + "entropy": 1.5674570798873901, + "epoch": 2.857918552036199, + "grad_norm": 0.4518730044364929, + "learning_rate": 0.00031054605823021186, + "loss": 0.0569, + "mean_token_accuracy": 0.9832890778779984, + "num_tokens": 6984824.0, + "step": 791 + }, + { + "entropy": 1.5301121771335602, + "epoch": 2.8615384615384616, + "grad_norm": 0.5933698415756226, + "learning_rate": 0.00031010155990234364, + "loss": 0.1129, + "mean_token_accuracy": 0.9684284627437592, + "num_tokens": 6994076.0, + "step": 792 + }, + { + "entropy": 1.5711756348609924, + "epoch": 2.865158371040724, + "grad_norm": 0.6634730696678162, + "learning_rate": 0.00030965695123294837, + "loss": 0.1204, + "mean_token_accuracy": 0.972825437784195, + "num_tokens": 7003048.0, + "step": 793 + }, + { + "entropy": 1.6537431180477142, + "epoch": 2.8687782805429864, + "grad_norm": 0.5688450336456299, + "learning_rate": 0.0003092122339107067, + "loss": 0.0659, + "mean_token_accuracy": 0.9861912727355957, + "num_tokens": 7011743.0, + "step": 794 + }, + { + "entropy": 1.731940358877182, + "epoch": 2.872398190045249, + "grad_norm": 0.9030163288116455, + "learning_rate": 0.0003087674096247115, + "loss": 0.0829, + "mean_token_accuracy": 0.9802074134349823, + "num_tokens": 7020003.0, + "step": 795 + }, + { + "entropy": 1.6672345995903015, + "epoch": 2.8760180995475113, + "grad_norm": 0.5129911303520203, + "learning_rate": 0.00030832248006446223, + "loss": 0.0823, + "mean_token_accuracy": 0.9805259853601456, + "num_tokens": 7029275.0, + "step": 796 + }, + { + "entropy": 1.7102139592170715, + "epoch": 2.8796380090497737, + "grad_norm": 0.6210790872573853, + "learning_rate": 0.00030787744691985797, + "loss": 0.1248, + "mean_token_accuracy": 0.9665560126304626, + "num_tokens": 7038068.0, + "step": 797 + }, + { + "entropy": 1.659182459115982, + "epoch": 2.883257918552036, + "grad_norm": 0.6379976868629456, + "learning_rate": 0.0003074323118811913, + "loss": 0.1065, + "mean_token_accuracy": 0.9647062122821808, + "num_tokens": 7047039.0, + "step": 798 + }, + { + "entropy": 1.6344517767429352, + "epoch": 2.8868778280542986, + "grad_norm": 0.5851842761039734, + "learning_rate": 0.00030698707663914186, + "loss": 0.1046, + "mean_token_accuracy": 0.9666399955749512, + "num_tokens": 7056105.0, + "step": 799 + }, + { + "entropy": 1.6803805828094482, + "epoch": 2.890497737556561, + "grad_norm": 0.5926725268363953, + "learning_rate": 0.00030654174288477, + "loss": 0.1019, + "mean_token_accuracy": 0.9712099581956863, + "num_tokens": 7064710.0, + "step": 800 + }, + { + "entropy": 1.7004003822803497, + "epoch": 2.8941176470588235, + "grad_norm": 0.6103729605674744, + "learning_rate": 0.0003060963123095098, + "loss": 0.091, + "mean_token_accuracy": 0.9780148714780807, + "num_tokens": 7073218.0, + "step": 801 + }, + { + "entropy": 1.8133964240550995, + "epoch": 2.897737556561086, + "grad_norm": 0.872008740901947, + "learning_rate": 0.0003056507866051636, + "loss": 0.3003, + "mean_token_accuracy": 0.9385994374752045, + "num_tokens": 7081791.0, + "step": 802 + }, + { + "entropy": 1.7527997195720673, + "epoch": 2.9013574660633483, + "grad_norm": 0.553669810295105, + "learning_rate": 0.0003052051674638945, + "loss": 0.0999, + "mean_token_accuracy": 0.9695112109184265, + "num_tokens": 7090196.0, + "step": 803 + }, + { + "entropy": 1.6374657154083252, + "epoch": 2.9049773755656108, + "grad_norm": 0.4158615469932556, + "learning_rate": 0.00030475945657822107, + "loss": 0.0682, + "mean_token_accuracy": 0.9802833646535873, + "num_tokens": 7099216.0, + "step": 804 + }, + { + "entropy": 1.6056133210659027, + "epoch": 2.908597285067873, + "grad_norm": 0.47468429803848267, + "learning_rate": 0.00030431365564101003, + "loss": 0.1188, + "mean_token_accuracy": 0.9720293581485748, + "num_tokens": 7108787.0, + "step": 805 + }, + { + "entropy": 1.7184821665287018, + "epoch": 2.9122171945701356, + "grad_norm": 0.6617569923400879, + "learning_rate": 0.00030386776634547003, + "loss": 0.1121, + "mean_token_accuracy": 0.9623472690582275, + "num_tokens": 7117158.0, + "step": 806 + }, + { + "entropy": 1.7546651065349579, + "epoch": 2.915837104072398, + "grad_norm": 0.5058173537254333, + "learning_rate": 0.0003034217903851454, + "loss": 0.0861, + "mean_token_accuracy": 0.9664297550916672, + "num_tokens": 7125800.0, + "step": 807 + }, + { + "entropy": 1.6985557675361633, + "epoch": 2.9194570135746605, + "grad_norm": 0.5197705626487732, + "learning_rate": 0.00030297572945390996, + "loss": 0.1009, + "mean_token_accuracy": 0.9677706956863403, + "num_tokens": 7134221.0, + "step": 808 + }, + { + "entropy": 1.6737182438373566, + "epoch": 2.9230769230769234, + "grad_norm": 0.4528989791870117, + "learning_rate": 0.00030252958524595966, + "loss": 0.0656, + "mean_token_accuracy": 0.9853187948465347, + "num_tokens": 7142716.0, + "step": 809 + }, + { + "entropy": 1.687746375799179, + "epoch": 2.926696832579186, + "grad_norm": 0.8552060723304749, + "learning_rate": 0.00030208335945580716, + "loss": 0.1584, + "mean_token_accuracy": 0.958037719130516, + "num_tokens": 7151288.0, + "step": 810 + }, + { + "entropy": 1.6994356215000153, + "epoch": 2.930316742081448, + "grad_norm": 0.470833957195282, + "learning_rate": 0.00030163705377827496, + "loss": 0.0537, + "mean_token_accuracy": 0.9804185479879379, + "num_tokens": 7159738.0, + "step": 811 + }, + { + "entropy": 1.7072536945343018, + "epoch": 2.9339366515837106, + "grad_norm": 0.5749104022979736, + "learning_rate": 0.0003011906699084888, + "loss": 0.0502, + "mean_token_accuracy": 0.9830235093832016, + "num_tokens": 7168101.0, + "step": 812 + }, + { + "entropy": 1.70310440659523, + "epoch": 2.937556561085973, + "grad_norm": 0.7587386965751648, + "learning_rate": 0.0003007442095418715, + "loss": 0.1362, + "mean_token_accuracy": 0.9594880938529968, + "num_tokens": 7176663.0, + "step": 813 + }, + { + "entropy": 1.6307457983493805, + "epoch": 2.9411764705882355, + "grad_norm": 0.5054190754890442, + "learning_rate": 0.00030029767437413665, + "loss": 0.0744, + "mean_token_accuracy": 0.9738886505365372, + "num_tokens": 7185376.0, + "step": 814 + }, + { + "entropy": 1.5872860848903656, + "epoch": 2.944796380090498, + "grad_norm": 0.5463546514511108, + "learning_rate": 0.00029985106610128147, + "loss": 0.0916, + "mean_token_accuracy": 0.9782509952783585, + "num_tokens": 7194304.0, + "step": 815 + }, + { + "entropy": 1.6643644273281097, + "epoch": 2.9484162895927604, + "grad_norm": 0.5434613823890686, + "learning_rate": 0.0002994043864195811, + "loss": 0.1007, + "mean_token_accuracy": 0.9665197134017944, + "num_tokens": 7202895.0, + "step": 816 + }, + { + "entropy": 1.701482743024826, + "epoch": 2.952036199095023, + "grad_norm": 1.2643967866897583, + "learning_rate": 0.00029895763702558206, + "loss": 0.1377, + "mean_token_accuracy": 0.9696027487516403, + "num_tokens": 7211000.0, + "step": 817 + }, + { + "entropy": 1.688760131597519, + "epoch": 2.9556561085972852, + "grad_norm": 0.5438109636306763, + "learning_rate": 0.00029851081961609536, + "loss": 0.0637, + "mean_token_accuracy": 0.9724639654159546, + "num_tokens": 7219274.0, + "step": 818 + }, + { + "entropy": 1.6547857522964478, + "epoch": 2.9592760180995477, + "grad_norm": 0.4520387649536133, + "learning_rate": 0.0002980639358881906, + "loss": 0.0376, + "mean_token_accuracy": 0.9887004494667053, + "num_tokens": 7228000.0, + "step": 819 + }, + { + "entropy": 1.5814381837844849, + "epoch": 2.96289592760181, + "grad_norm": 0.49122339487075806, + "learning_rate": 0.00029761698753918894, + "loss": 0.0533, + "mean_token_accuracy": 0.983299508690834, + "num_tokens": 7236798.0, + "step": 820 + }, + { + "entropy": 1.5796774625778198, + "epoch": 2.9665158371040725, + "grad_norm": 0.43303897976875305, + "learning_rate": 0.00029716997626665726, + "loss": 0.0517, + "mean_token_accuracy": 0.984140008687973, + "num_tokens": 7245570.0, + "step": 821 + }, + { + "entropy": 1.5434466302394867, + "epoch": 2.970135746606335, + "grad_norm": 0.5712567567825317, + "learning_rate": 0.0002967229037684014, + "loss": 0.0634, + "mean_token_accuracy": 0.9851510971784592, + "num_tokens": 7254482.0, + "step": 822 + }, + { + "entropy": 1.5368549823760986, + "epoch": 2.9737556561085974, + "grad_norm": 0.5042312741279602, + "learning_rate": 0.0002962757717424595, + "loss": 0.1041, + "mean_token_accuracy": 0.9698852747678757, + "num_tokens": 7263428.0, + "step": 823 + }, + { + "entropy": 1.5740615129470825, + "epoch": 2.97737556561086, + "grad_norm": 0.8506835699081421, + "learning_rate": 0.0002958285818870963, + "loss": 0.0653, + "mean_token_accuracy": 0.9827365875244141, + "num_tokens": 7272425.0, + "step": 824 + }, + { + "entropy": 1.625010073184967, + "epoch": 2.9809954751131222, + "grad_norm": 0.6260822415351868, + "learning_rate": 0.00029538133590079556, + "loss": 0.1112, + "mean_token_accuracy": 0.9715189933776855, + "num_tokens": 7281312.0, + "step": 825 + }, + { + "entropy": 1.6078990697860718, + "epoch": 2.9846153846153847, + "grad_norm": 0.4316014349460602, + "learning_rate": 0.00029493403548225467, + "loss": 0.059, + "mean_token_accuracy": 0.9821690768003464, + "num_tokens": 7289748.0, + "step": 826 + }, + { + "entropy": 1.6132618486881256, + "epoch": 2.988235294117647, + "grad_norm": 0.6471059322357178, + "learning_rate": 0.0002944866823303776, + "loss": 0.0839, + "mean_token_accuracy": 0.9747331887483597, + "num_tokens": 7298453.0, + "step": 827 + }, + { + "entropy": 1.6038751900196075, + "epoch": 2.9918552036199095, + "grad_norm": 0.5383681654930115, + "learning_rate": 0.0002940392781442686, + "loss": 0.0728, + "mean_token_accuracy": 0.9774085730314255, + "num_tokens": 7307116.0, + "step": 828 + }, + { + "entropy": 1.6446776688098907, + "epoch": 2.995475113122172, + "grad_norm": 0.5420554280281067, + "learning_rate": 0.0002935918246232259, + "loss": 0.0799, + "mean_token_accuracy": 0.977481946349144, + "num_tokens": 7315668.0, + "step": 829 + }, + { + "entropy": 1.5571844279766083, + "epoch": 2.9990950226244344, + "grad_norm": 0.6471306681632996, + "learning_rate": 0.00029314432346673485, + "loss": 0.1657, + "mean_token_accuracy": 0.9566951394081116, + "num_tokens": 7324721.0, + "step": 830 + }, + { + "entropy": 2.0783205032348633, + "epoch": 3.0, + "grad_norm": 3.195817232131958, + "learning_rate": 0.000292696776374462, + "loss": 0.0742, + "mean_token_accuracy": 0.96875, + "num_tokens": 7325175.0, + "step": 831 + }, + { + "epoch": 3.0, + "eval_entropy": 1.6213929740394033, + "eval_loss": 0.14780744910240173, + "eval_mean_token_accuracy": 0.9634173047251817, + "eval_num_tokens": 7325175.0, + "eval_runtime": 116.0041, + "eval_samples_per_second": 3.181, + "eval_steps_per_second": 1.06, + "step": 831 + }, + { + "entropy": 1.639732986688614, + "epoch": 3.0036199095022624, + "grad_norm": 0.45313218235969543, + "learning_rate": 0.00029224918504624814, + "loss": 0.0569, + "mean_token_accuracy": 0.9821487963199615, + "num_tokens": 7333756.0, + "step": 832 + }, + { + "entropy": 1.620821863412857, + "epoch": 3.007239819004525, + "grad_norm": 0.4471704363822937, + "learning_rate": 0.0002918015511821022, + "loss": 0.059, + "mean_token_accuracy": 0.9843536615371704, + "num_tokens": 7342266.0, + "step": 833 + }, + { + "entropy": 1.722977101802826, + "epoch": 3.0108597285067873, + "grad_norm": 0.5039600729942322, + "learning_rate": 0.0002913538764821947, + "loss": 0.0438, + "mean_token_accuracy": 0.9868119210004807, + "num_tokens": 7350541.0, + "step": 834 + }, + { + "entropy": 1.6466768980026245, + "epoch": 3.0144796380090497, + "grad_norm": 0.4470590054988861, + "learning_rate": 0.0002909061626468512, + "loss": 0.0418, + "mean_token_accuracy": 0.9859062284231186, + "num_tokens": 7359236.0, + "step": 835 + }, + { + "entropy": 1.6936305463314056, + "epoch": 3.018099547511312, + "grad_norm": 0.5103632211685181, + "learning_rate": 0.00029045841137654584, + "loss": 0.0649, + "mean_token_accuracy": 0.9817161113023758, + "num_tokens": 7367649.0, + "step": 836 + }, + { + "entropy": 1.5894218981266022, + "epoch": 3.0217194570135746, + "grad_norm": 0.4315621554851532, + "learning_rate": 0.000290010624371895, + "loss": 0.0779, + "mean_token_accuracy": 0.9756399989128113, + "num_tokens": 7376772.0, + "step": 837 + }, + { + "entropy": 1.6676535904407501, + "epoch": 3.025339366515837, + "grad_norm": 0.6142503023147583, + "learning_rate": 0.00028956280333365084, + "loss": 0.0601, + "mean_token_accuracy": 0.9850548654794693, + "num_tokens": 7385454.0, + "step": 838 + }, + { + "entropy": 1.6877512037754059, + "epoch": 3.0289592760180994, + "grad_norm": 0.5499544739723206, + "learning_rate": 0.0002891149499626948, + "loss": 0.06, + "mean_token_accuracy": 0.980460986495018, + "num_tokens": 7393843.0, + "step": 839 + }, + { + "entropy": 1.64662566781044, + "epoch": 3.032579185520362, + "grad_norm": 0.7865297198295593, + "learning_rate": 0.00028866706596003094, + "loss": 0.1098, + "mean_token_accuracy": 0.9614097326993942, + "num_tokens": 7402203.0, + "step": 840 + }, + { + "entropy": 1.5609408617019653, + "epoch": 3.0361990950226243, + "grad_norm": 0.5209096074104309, + "learning_rate": 0.0002882191530267797, + "loss": 0.0893, + "mean_token_accuracy": 0.9731233417987823, + "num_tokens": 7411227.0, + "step": 841 + }, + { + "entropy": 1.6110387742519379, + "epoch": 3.0398190045248867, + "grad_norm": 0.49672260880470276, + "learning_rate": 0.00028777121286417185, + "loss": 0.0512, + "mean_token_accuracy": 0.9793709367513657, + "num_tokens": 7419751.0, + "step": 842 + }, + { + "entropy": 1.5630280673503876, + "epoch": 3.043438914027149, + "grad_norm": 0.5099878907203674, + "learning_rate": 0.00028732324717354083, + "loss": 0.0447, + "mean_token_accuracy": 0.9830391556024551, + "num_tokens": 7428533.0, + "step": 843 + }, + { + "entropy": 1.5407153069972992, + "epoch": 3.0470588235294116, + "grad_norm": 0.7725343704223633, + "learning_rate": 0.0002868752576563175, + "loss": 0.071, + "mean_token_accuracy": 0.9820850938558578, + "num_tokens": 7437390.0, + "step": 844 + }, + { + "entropy": 1.5895936191082, + "epoch": 3.050678733031674, + "grad_norm": 0.5729185938835144, + "learning_rate": 0.0002864272460140234, + "loss": 0.0651, + "mean_token_accuracy": 0.9816865175962448, + "num_tokens": 7445715.0, + "step": 845 + }, + { + "entropy": 1.5614444315433502, + "epoch": 3.0542986425339365, + "grad_norm": 0.49079445004463196, + "learning_rate": 0.00028597921394826346, + "loss": 0.078, + "mean_token_accuracy": 0.9791339933872223, + "num_tokens": 7454770.0, + "step": 846 + }, + { + "entropy": 1.4948404431343079, + "epoch": 3.057918552036199, + "grad_norm": 0.45897549390792847, + "learning_rate": 0.0002855311631607209, + "loss": 0.0506, + "mean_token_accuracy": 0.9858186691999435, + "num_tokens": 7463578.0, + "step": 847 + }, + { + "entropy": 1.4837007820606232, + "epoch": 3.0615384615384613, + "grad_norm": 0.6153395771980286, + "learning_rate": 0.0002850830953531494, + "loss": 0.0862, + "mean_token_accuracy": 0.9767305850982666, + "num_tokens": 7472726.0, + "step": 848 + }, + { + "entropy": 1.5454865992069244, + "epoch": 3.065158371040724, + "grad_norm": 0.9645626544952393, + "learning_rate": 0.00028463501222736787, + "loss": 0.1448, + "mean_token_accuracy": 0.9689669013023376, + "num_tokens": 7481594.0, + "step": 849 + }, + { + "entropy": 1.503423035144806, + "epoch": 3.0687782805429866, + "grad_norm": 0.5449880361557007, + "learning_rate": 0.00028418691548525306, + "loss": 0.0809, + "mean_token_accuracy": 0.9776449203491211, + "num_tokens": 7490420.0, + "step": 850 + }, + { + "entropy": 1.43245068192482, + "epoch": 3.072398190045249, + "grad_norm": 0.7362976670265198, + "learning_rate": 0.0002837388068287334, + "loss": 0.0956, + "mean_token_accuracy": 0.9742193967103958, + "num_tokens": 7499567.0, + "step": 851 + }, + { + "entropy": 1.4874032735824585, + "epoch": 3.0760180995475115, + "grad_norm": 0.5615106821060181, + "learning_rate": 0.00028329068795978274, + "loss": 0.0837, + "mean_token_accuracy": 0.9790486842393875, + "num_tokens": 7508507.0, + "step": 852 + }, + { + "entropy": 1.4498116374015808, + "epoch": 3.079638009049774, + "grad_norm": 0.4348931610584259, + "learning_rate": 0.00028284256058041363, + "loss": 0.0634, + "mean_token_accuracy": 0.9843485057353973, + "num_tokens": 7517576.0, + "step": 853 + }, + { + "entropy": 1.5673332512378693, + "epoch": 3.0832579185520363, + "grad_norm": 0.5114635825157166, + "learning_rate": 0.000282394426392671, + "loss": 0.0663, + "mean_token_accuracy": 0.9789460599422455, + "num_tokens": 7526323.0, + "step": 854 + }, + { + "entropy": 1.5406886637210846, + "epoch": 3.086877828054299, + "grad_norm": 0.4056108593940735, + "learning_rate": 0.0002819462870986256, + "loss": 0.051, + "mean_token_accuracy": 0.985608771443367, + "num_tokens": 7535015.0, + "step": 855 + }, + { + "entropy": 1.5750982761383057, + "epoch": 3.090497737556561, + "grad_norm": 0.5252281427383423, + "learning_rate": 0.00028149814440036757, + "loss": 0.0426, + "mean_token_accuracy": 0.9839760363101959, + "num_tokens": 7543712.0, + "step": 856 + }, + { + "entropy": 1.5696458220481873, + "epoch": 3.0941176470588236, + "grad_norm": 0.7764946222305298, + "learning_rate": 0.00028105, + "loss": 0.1381, + "mean_token_accuracy": 0.9690622985363007, + "num_tokens": 7552651.0, + "step": 857 + }, + { + "entropy": 1.601443588733673, + "epoch": 3.097737556561086, + "grad_norm": 0.5605809688568115, + "learning_rate": 0.0002806018555996324, + "loss": 0.0847, + "mean_token_accuracy": 0.9771271497011185, + "num_tokens": 7561157.0, + "step": 858 + }, + { + "entropy": 1.5733444690704346, + "epoch": 3.1013574660633485, + "grad_norm": 0.6177924871444702, + "learning_rate": 0.00028015371290137443, + "loss": 0.0654, + "mean_token_accuracy": 0.9801760017871857, + "num_tokens": 7569884.0, + "step": 859 + }, + { + "entropy": 1.6089049577713013, + "epoch": 3.104977375565611, + "grad_norm": 0.9339480400085449, + "learning_rate": 0.000279705573607329, + "loss": 0.0924, + "mean_token_accuracy": 0.9692800939083099, + "num_tokens": 7578417.0, + "step": 860 + }, + { + "entropy": 1.58562570810318, + "epoch": 3.1085972850678734, + "grad_norm": 0.6229763031005859, + "learning_rate": 0.00027925743941958637, + "loss": 0.0689, + "mean_token_accuracy": 0.9820535033941269, + "num_tokens": 7586899.0, + "step": 861 + }, + { + "entropy": 1.6127779185771942, + "epoch": 3.112217194570136, + "grad_norm": 0.5199776291847229, + "learning_rate": 0.0002788093120402174, + "loss": 0.0696, + "mean_token_accuracy": 0.9842628389596939, + "num_tokens": 7595283.0, + "step": 862 + }, + { + "entropy": 1.5815349221229553, + "epoch": 3.1158371040723982, + "grad_norm": 0.3927786946296692, + "learning_rate": 0.0002783611931712666, + "loss": 0.0489, + "mean_token_accuracy": 0.9855190068483353, + "num_tokens": 7603800.0, + "step": 863 + }, + { + "entropy": 1.5128042995929718, + "epoch": 3.1194570135746607, + "grad_norm": 0.5245664715766907, + "learning_rate": 0.00027791308451474695, + "loss": 0.0916, + "mean_token_accuracy": 0.9793255031108856, + "num_tokens": 7612765.0, + "step": 864 + }, + { + "entropy": 1.4662578105926514, + "epoch": 3.123076923076923, + "grad_norm": 0.4836482107639313, + "learning_rate": 0.000277464987772632, + "loss": 0.0363, + "mean_token_accuracy": 0.9882297664880753, + "num_tokens": 7621842.0, + "step": 865 + }, + { + "entropy": 1.6075958013534546, + "epoch": 3.1266968325791855, + "grad_norm": 0.6621652841567993, + "learning_rate": 0.00027701690464685053, + "loss": 0.0703, + "mean_token_accuracy": 0.9801139384508133, + "num_tokens": 7630299.0, + "step": 866 + }, + { + "entropy": 1.5028826892375946, + "epoch": 3.130316742081448, + "grad_norm": 1.076515555381775, + "learning_rate": 0.00027656883683927917, + "loss": 0.1021, + "mean_token_accuracy": 0.9723865538835526, + "num_tokens": 7639269.0, + "step": 867 + }, + { + "entropy": 1.4604552686214447, + "epoch": 3.1339366515837104, + "grad_norm": 0.6197560429573059, + "learning_rate": 0.0002761207860517365, + "loss": 0.0831, + "mean_token_accuracy": 0.9773454070091248, + "num_tokens": 7648589.0, + "step": 868 + }, + { + "entropy": 1.5533301830291748, + "epoch": 3.137556561085973, + "grad_norm": 0.6384056806564331, + "learning_rate": 0.00027567275398597665, + "loss": 0.085, + "mean_token_accuracy": 0.9763429015874863, + "num_tokens": 7657465.0, + "step": 869 + }, + { + "entropy": 1.499713659286499, + "epoch": 3.1411764705882352, + "grad_norm": 0.5099884867668152, + "learning_rate": 0.0002752247423436825, + "loss": 0.0506, + "mean_token_accuracy": 0.9845949709415436, + "num_tokens": 7666239.0, + "step": 870 + }, + { + "entropy": 1.5065864324569702, + "epoch": 3.1447963800904977, + "grad_norm": 0.500906765460968, + "learning_rate": 0.00027477675282645917, + "loss": 0.0505, + "mean_token_accuracy": 0.9816035628318787, + "num_tokens": 7675002.0, + "step": 871 + }, + { + "entropy": 1.492633044719696, + "epoch": 3.14841628959276, + "grad_norm": 0.5848217606544495, + "learning_rate": 0.00027432878713582826, + "loss": 0.0714, + "mean_token_accuracy": 0.9832541942596436, + "num_tokens": 7683452.0, + "step": 872 + }, + { + "entropy": 1.5013020932674408, + "epoch": 3.1520361990950225, + "grad_norm": 0.7728188037872314, + "learning_rate": 0.0002738808469732202, + "loss": 0.1403, + "mean_token_accuracy": 0.9723454862833023, + "num_tokens": 7692088.0, + "step": 873 + }, + { + "entropy": 1.4020000398159027, + "epoch": 3.155656108597285, + "grad_norm": 0.7066675424575806, + "learning_rate": 0.00027343293403996906, + "loss": 0.0631, + "mean_token_accuracy": 0.9841864109039307, + "num_tokens": 7701066.0, + "step": 874 + }, + { + "entropy": 1.4469320476055145, + "epoch": 3.1592760180995474, + "grad_norm": 0.47683194279670715, + "learning_rate": 0.0002729850500373052, + "loss": 0.0787, + "mean_token_accuracy": 0.9787198454141617, + "num_tokens": 7710189.0, + "step": 875 + }, + { + "entropy": 1.4941265881061554, + "epoch": 3.16289592760181, + "grad_norm": 0.5534874796867371, + "learning_rate": 0.00027253719666634916, + "loss": 0.0681, + "mean_token_accuracy": 0.9741384238004684, + "num_tokens": 7718736.0, + "step": 876 + }, + { + "entropy": 1.48758664727211, + "epoch": 3.1665158371040723, + "grad_norm": 0.42443010210990906, + "learning_rate": 0.000272089375628105, + "loss": 0.0452, + "mean_token_accuracy": 0.986537754535675, + "num_tokens": 7727565.0, + "step": 877 + }, + { + "entropy": 1.4197124242782593, + "epoch": 3.1701357466063347, + "grad_norm": 0.4680332541465759, + "learning_rate": 0.00027164158862345416, + "loss": 0.0712, + "mean_token_accuracy": 0.979786142706871, + "num_tokens": 7736663.0, + "step": 878 + }, + { + "entropy": 1.4459567070007324, + "epoch": 3.173755656108597, + "grad_norm": 0.5269680619239807, + "learning_rate": 0.00027119383735314887, + "loss": 0.0527, + "mean_token_accuracy": 0.9839773774147034, + "num_tokens": 7745837.0, + "step": 879 + }, + { + "entropy": 1.4754200279712677, + "epoch": 3.1773755656108595, + "grad_norm": 0.39273717999458313, + "learning_rate": 0.00027074612351780524, + "loss": 0.0188, + "mean_token_accuracy": 0.9941024333238602, + "num_tokens": 7754747.0, + "step": 880 + }, + { + "entropy": 1.440185934305191, + "epoch": 3.180995475113122, + "grad_norm": 0.6401451826095581, + "learning_rate": 0.00027029844881789776, + "loss": 0.0825, + "mean_token_accuracy": 0.9758540540933609, + "num_tokens": 7763933.0, + "step": 881 + }, + { + "entropy": 1.4647364616394043, + "epoch": 3.184615384615385, + "grad_norm": 0.6890838146209717, + "learning_rate": 0.0002698508149537519, + "loss": 0.0609, + "mean_token_accuracy": 0.9836824238300323, + "num_tokens": 7772693.0, + "step": 882 + }, + { + "entropy": 1.510004311800003, + "epoch": 3.1882352941176473, + "grad_norm": 0.4847521185874939, + "learning_rate": 0.000269403223625538, + "loss": 0.0665, + "mean_token_accuracy": 0.9833553731441498, + "num_tokens": 7781591.0, + "step": 883 + }, + { + "entropy": 1.5321883261203766, + "epoch": 3.1918552036199097, + "grad_norm": 0.5583149790763855, + "learning_rate": 0.00026895567653326515, + "loss": 0.0481, + "mean_token_accuracy": 0.9884297996759415, + "num_tokens": 7789893.0, + "step": 884 + }, + { + "entropy": 1.5181719362735748, + "epoch": 3.195475113122172, + "grad_norm": 0.5727811455726624, + "learning_rate": 0.000268508175376774, + "loss": 0.051, + "mean_token_accuracy": 0.9885639101266861, + "num_tokens": 7798544.0, + "step": 885 + }, + { + "entropy": 1.6374200582504272, + "epoch": 3.1990950226244346, + "grad_norm": 0.5002682209014893, + "learning_rate": 0.0002680607218557314, + "loss": 0.0778, + "mean_token_accuracy": 0.9834531843662262, + "num_tokens": 7807030.0, + "step": 886 + }, + { + "entropy": 1.4485781788825989, + "epoch": 3.202714932126697, + "grad_norm": 0.5490010976791382, + "learning_rate": 0.0002676133176696224, + "loss": 0.0612, + "mean_token_accuracy": 0.9833452105522156, + "num_tokens": 7816008.0, + "step": 887 + }, + { + "entropy": 1.5048691630363464, + "epoch": 3.2063348416289594, + "grad_norm": 0.37134769558906555, + "learning_rate": 0.0002671659645177453, + "loss": 0.0411, + "mean_token_accuracy": 0.9869852215051651, + "num_tokens": 7825152.0, + "step": 888 + }, + { + "entropy": 1.522626668214798, + "epoch": 3.209954751131222, + "grad_norm": 0.3474898040294647, + "learning_rate": 0.00026671866409920444, + "loss": 0.0453, + "mean_token_accuracy": 0.9880259335041046, + "num_tokens": 7833517.0, + "step": 889 + }, + { + "entropy": 1.4796842634677887, + "epoch": 3.2135746606334843, + "grad_norm": 0.6107187271118164, + "learning_rate": 0.0002662714181129038, + "loss": 0.0587, + "mean_token_accuracy": 0.9835474342107773, + "num_tokens": 7842418.0, + "step": 890 + }, + { + "entropy": 1.527999073266983, + "epoch": 3.2171945701357467, + "grad_norm": 0.8143520355224609, + "learning_rate": 0.00026582422825754037, + "loss": 0.1435, + "mean_token_accuracy": 0.9624571949243546, + "num_tokens": 7851284.0, + "step": 891 + }, + { + "entropy": 1.488987773656845, + "epoch": 3.220814479638009, + "grad_norm": 0.4910070300102234, + "learning_rate": 0.0002653770962315986, + "loss": 0.0627, + "mean_token_accuracy": 0.9796515852212906, + "num_tokens": 7860011.0, + "step": 892 + }, + { + "entropy": 1.497105747461319, + "epoch": 3.2244343891402716, + "grad_norm": 0.6304562091827393, + "learning_rate": 0.00026493002373334274, + "loss": 0.0837, + "mean_token_accuracy": 0.975927010178566, + "num_tokens": 7868618.0, + "step": 893 + }, + { + "entropy": 1.4863994121551514, + "epoch": 3.228054298642534, + "grad_norm": 0.4768204092979431, + "learning_rate": 0.00026448301246081106, + "loss": 0.0449, + "mean_token_accuracy": 0.9877417385578156, + "num_tokens": 7877335.0, + "step": 894 + }, + { + "entropy": 1.4551187455654144, + "epoch": 3.2316742081447964, + "grad_norm": 0.5773951411247253, + "learning_rate": 0.0002640360641118095, + "loss": 0.0807, + "mean_token_accuracy": 0.974293515086174, + "num_tokens": 7886486.0, + "step": 895 + }, + { + "entropy": 1.4752719104290009, + "epoch": 3.235294117647059, + "grad_norm": 0.8372188806533813, + "learning_rate": 0.00026358918038390464, + "loss": 0.1428, + "mean_token_accuracy": 0.9693069010972977, + "num_tokens": 7895501.0, + "step": 896 + }, + { + "entropy": 1.462522953748703, + "epoch": 3.2389140271493213, + "grad_norm": 0.4307233393192291, + "learning_rate": 0.0002631423629744179, + "loss": 0.0574, + "mean_token_accuracy": 0.9867023974657059, + "num_tokens": 7904614.0, + "step": 897 + }, + { + "entropy": 1.5075399577617645, + "epoch": 3.2425339366515837, + "grad_norm": 0.6246724724769592, + "learning_rate": 0.00026269561358041886, + "loss": 0.074, + "mean_token_accuracy": 0.9773128777742386, + "num_tokens": 7913383.0, + "step": 898 + }, + { + "entropy": 1.407273530960083, + "epoch": 3.246153846153846, + "grad_norm": 0.31213951110839844, + "learning_rate": 0.0002622489338987186, + "loss": 0.0225, + "mean_token_accuracy": 0.994924858212471, + "num_tokens": 7922686.0, + "step": 899 + }, + { + "entropy": 1.3883163630962372, + "epoch": 3.2497737556561086, + "grad_norm": 0.476696252822876, + "learning_rate": 0.00026180232562586335, + "loss": 0.0727, + "mean_token_accuracy": 0.9775501936674118, + "num_tokens": 7931958.0, + "step": 900 + }, + { + "entropy": 1.4200710952281952, + "epoch": 3.253393665158371, + "grad_norm": 0.5860406756401062, + "learning_rate": 0.0002613557904581284, + "loss": 0.0658, + "mean_token_accuracy": 0.9809585213661194, + "num_tokens": 7940834.0, + "step": 901 + }, + { + "entropy": 1.4369202852249146, + "epoch": 3.2570135746606335, + "grad_norm": 0.47559866309165955, + "learning_rate": 0.0002609093300915112, + "loss": 0.0481, + "mean_token_accuracy": 0.9899342954158783, + "num_tokens": 7949907.0, + "step": 902 + }, + { + "entropy": 1.544773817062378, + "epoch": 3.260633484162896, + "grad_norm": 0.6772119402885437, + "learning_rate": 0.00026046294622172504, + "loss": 0.067, + "mean_token_accuracy": 0.9841168224811554, + "num_tokens": 7958556.0, + "step": 903 + }, + { + "entropy": 1.504111796617508, + "epoch": 3.2642533936651583, + "grad_norm": 0.4870680868625641, + "learning_rate": 0.0002600166405441928, + "loss": 0.0379, + "mean_token_accuracy": 0.990149587392807, + "num_tokens": 7967079.0, + "step": 904 + }, + { + "entropy": 1.349067509174347, + "epoch": 3.2678733031674208, + "grad_norm": 0.46113792061805725, + "learning_rate": 0.0002595704147540404, + "loss": 0.0521, + "mean_token_accuracy": 0.9874720871448517, + "num_tokens": 7976551.0, + "step": 905 + }, + { + "entropy": 1.462818831205368, + "epoch": 3.271493212669683, + "grad_norm": 0.7971535325050354, + "learning_rate": 0.0002591242705460901, + "loss": 0.041, + "mean_token_accuracy": 0.9884305745363235, + "num_tokens": 7985622.0, + "step": 906 + }, + { + "entropy": 1.3945342004299164, + "epoch": 3.2751131221719456, + "grad_norm": 0.7364558577537537, + "learning_rate": 0.00025867820961485453, + "loss": 0.0978, + "mean_token_accuracy": 0.9766480922698975, + "num_tokens": 7995012.0, + "step": 907 + }, + { + "entropy": 1.4662614464759827, + "epoch": 3.278733031674208, + "grad_norm": 0.4509989619255066, + "learning_rate": 0.0002582322336545299, + "loss": 0.0401, + "mean_token_accuracy": 0.9875599294900894, + "num_tokens": 8003688.0, + "step": 908 + }, + { + "entropy": 1.5244667828083038, + "epoch": 3.2823529411764705, + "grad_norm": 0.76254802942276, + "learning_rate": 0.00025778634435899, + "loss": 0.0706, + "mean_token_accuracy": 0.9814789742231369, + "num_tokens": 8011711.0, + "step": 909 + }, + { + "entropy": 1.498660922050476, + "epoch": 3.285972850678733, + "grad_norm": 0.5205233097076416, + "learning_rate": 0.0002573405434217788, + "loss": 0.0433, + "mean_token_accuracy": 0.9895520657300949, + "num_tokens": 8020268.0, + "step": 910 + }, + { + "entropy": 1.3673588633537292, + "epoch": 3.2895927601809953, + "grad_norm": 0.36727628111839294, + "learning_rate": 0.0002568948325361054, + "loss": 0.046, + "mean_token_accuracy": 0.9816896766424179, + "num_tokens": 8029676.0, + "step": 911 + }, + { + "entropy": 1.3924965262413025, + "epoch": 3.2932126696832578, + "grad_norm": 0.6359453797340393, + "learning_rate": 0.0002564492133948364, + "loss": 0.0677, + "mean_token_accuracy": 0.9825267344713211, + "num_tokens": 8038613.0, + "step": 912 + }, + { + "entropy": 1.426201194524765, + "epoch": 3.29683257918552, + "grad_norm": 0.5639982223510742, + "learning_rate": 0.0002560036876904902, + "loss": 0.0762, + "mean_token_accuracy": 0.9812760651111603, + "num_tokens": 8047516.0, + "step": 913 + }, + { + "entropy": 1.4323900640010834, + "epoch": 3.3004524886877826, + "grad_norm": 0.5035631060600281, + "learning_rate": 0.00025555825711522995, + "loss": 0.0479, + "mean_token_accuracy": 0.9820713251829147, + "num_tokens": 8056237.0, + "step": 914 + }, + { + "entropy": 1.433140367269516, + "epoch": 3.304072398190045, + "grad_norm": 0.5381770133972168, + "learning_rate": 0.00025511292336085804, + "loss": 0.0584, + "mean_token_accuracy": 0.9868257641792297, + "num_tokens": 8064918.0, + "step": 915 + }, + { + "entropy": 1.412838101387024, + "epoch": 3.3076923076923075, + "grad_norm": 0.46058139204978943, + "learning_rate": 0.00025466768811880866, + "loss": 0.0396, + "mean_token_accuracy": 0.9873918145895004, + "num_tokens": 8073881.0, + "step": 916 + }, + { + "entropy": 1.4484798610210419, + "epoch": 3.31131221719457, + "grad_norm": 0.8550136685371399, + "learning_rate": 0.000254222553080142, + "loss": 0.0744, + "mean_token_accuracy": 0.9780523777008057, + "num_tokens": 8082249.0, + "step": 917 + }, + { + "entropy": 1.4633181393146515, + "epoch": 3.3149321266968323, + "grad_norm": 0.8231784105300903, + "learning_rate": 0.00025377751993553777, + "loss": 0.0847, + "mean_token_accuracy": 0.9764655083417892, + "num_tokens": 8090772.0, + "step": 918 + }, + { + "entropy": 1.5348555445671082, + "epoch": 3.318552036199095, + "grad_norm": 0.6072585582733154, + "learning_rate": 0.00025333259037528847, + "loss": 0.0547, + "mean_token_accuracy": 0.983170285820961, + "num_tokens": 8098744.0, + "step": 919 + }, + { + "entropy": 1.4343461096286774, + "epoch": 3.3221719457013577, + "grad_norm": 0.5895786881446838, + "learning_rate": 0.0002528877660892933, + "loss": 0.033, + "mean_token_accuracy": 0.9907310158014297, + "num_tokens": 8107459.0, + "step": 920 + }, + { + "entropy": 1.3224802315235138, + "epoch": 3.32579185520362, + "grad_norm": 0.4657888114452362, + "learning_rate": 0.0002524430487670515, + "loss": 0.0581, + "mean_token_accuracy": 0.9821915626525879, + "num_tokens": 8116673.0, + "step": 921 + }, + { + "entropy": 1.4497299492359161, + "epoch": 3.3294117647058825, + "grad_norm": 0.5360382795333862, + "learning_rate": 0.0002519984400976564, + "loss": 0.0849, + "mean_token_accuracy": 0.9713759422302246, + "num_tokens": 8125774.0, + "step": 922 + }, + { + "entropy": 1.4038201570510864, + "epoch": 3.333031674208145, + "grad_norm": 0.5329150557518005, + "learning_rate": 0.00025155394176978814, + "loss": 0.0679, + "mean_token_accuracy": 0.9782100170850754, + "num_tokens": 8134777.0, + "step": 923 + }, + { + "entropy": 1.3989399075508118, + "epoch": 3.3366515837104074, + "grad_norm": 0.47847944498062134, + "learning_rate": 0.00025110955547170803, + "loss": 0.0579, + "mean_token_accuracy": 0.9826236069202423, + "num_tokens": 8143596.0, + "step": 924 + }, + { + "entropy": 1.4384986460208893, + "epoch": 3.34027149321267, + "grad_norm": 0.6291977763175964, + "learning_rate": 0.0002506652828912521, + "loss": 0.0826, + "mean_token_accuracy": 0.9759467244148254, + "num_tokens": 8152554.0, + "step": 925 + }, + { + "entropy": 1.3491226136684418, + "epoch": 3.3438914027149322, + "grad_norm": 0.4057374596595764, + "learning_rate": 0.00025022112571582383, + "loss": 0.0428, + "mean_token_accuracy": 0.9861899316310883, + "num_tokens": 8161845.0, + "step": 926 + }, + { + "entropy": 1.3831347525119781, + "epoch": 3.3475113122171947, + "grad_norm": 0.5007946491241455, + "learning_rate": 0.0002497770856323891, + "loss": 0.0417, + "mean_token_accuracy": 0.9847332686185837, + "num_tokens": 8170865.0, + "step": 927 + }, + { + "entropy": 1.4520001113414764, + "epoch": 3.351131221719457, + "grad_norm": 0.5229163765907288, + "learning_rate": 0.00024933316432746864, + "loss": 0.0515, + "mean_token_accuracy": 0.98235984146595, + "num_tokens": 8179738.0, + "step": 928 + }, + { + "entropy": 1.4497073292732239, + "epoch": 3.3547511312217195, + "grad_norm": 0.6086527705192566, + "learning_rate": 0.0002488893634871322, + "loss": 0.082, + "mean_token_accuracy": 0.9839034825563431, + "num_tokens": 8188402.0, + "step": 929 + }, + { + "entropy": 1.4439297020435333, + "epoch": 3.358371040723982, + "grad_norm": 0.6497851014137268, + "learning_rate": 0.00024844568479699187, + "loss": 0.0863, + "mean_token_accuracy": 0.9722652286291122, + "num_tokens": 8196956.0, + "step": 930 + }, + { + "entropy": 1.3755157589912415, + "epoch": 3.3619909502262444, + "grad_norm": 0.6988303661346436, + "learning_rate": 0.0002480021299421957, + "loss": 0.0999, + "mean_token_accuracy": 0.9738437533378601, + "num_tokens": 8205951.0, + "step": 931 + }, + { + "entropy": 1.3790476024150848, + "epoch": 3.365610859728507, + "grad_norm": 0.8188769221305847, + "learning_rate": 0.0002475587006074219, + "loss": 0.206, + "mean_token_accuracy": 0.9557942748069763, + "num_tokens": 8215256.0, + "step": 932 + }, + { + "entropy": 1.4337495565414429, + "epoch": 3.3692307692307693, + "grad_norm": 0.481511652469635, + "learning_rate": 0.00024711539847687135, + "loss": 0.0568, + "mean_token_accuracy": 0.982319638133049, + "num_tokens": 8224081.0, + "step": 933 + }, + { + "entropy": 1.4721867442131042, + "epoch": 3.3728506787330317, + "grad_norm": 0.595804750919342, + "learning_rate": 0.00024667222523426204, + "loss": 0.073, + "mean_token_accuracy": 0.979112833738327, + "num_tokens": 8232560.0, + "step": 934 + }, + { + "entropy": 1.4026366472244263, + "epoch": 3.376470588235294, + "grad_norm": 0.8112502098083496, + "learning_rate": 0.0002462291825628226, + "loss": 0.1302, + "mean_token_accuracy": 0.9592884331941605, + "num_tokens": 8241529.0, + "step": 935 + }, + { + "entropy": 1.4276806712150574, + "epoch": 3.3800904977375565, + "grad_norm": 0.3144559860229492, + "learning_rate": 0.0002457862721452854, + "loss": 0.0355, + "mean_token_accuracy": 0.9895562827587128, + "num_tokens": 8250331.0, + "step": 936 + }, + { + "entropy": 1.4367564022541046, + "epoch": 3.383710407239819, + "grad_norm": 0.6843166947364807, + "learning_rate": 0.0002453434956638806, + "loss": 0.0674, + "mean_token_accuracy": 0.9829154461622238, + "num_tokens": 8259137.0, + "step": 937 + }, + { + "entropy": 1.390118271112442, + "epoch": 3.3873303167420814, + "grad_norm": 0.437500536441803, + "learning_rate": 0.00024490085480032996, + "loss": 0.0323, + "mean_token_accuracy": 0.9916883558034897, + "num_tokens": 8268372.0, + "step": 938 + }, + { + "entropy": 1.3605903685092926, + "epoch": 3.390950226244344, + "grad_norm": 0.6721571087837219, + "learning_rate": 0.00024445835123583964, + "loss": 0.1217, + "mean_token_accuracy": 0.9565094709396362, + "num_tokens": 8277388.0, + "step": 939 + }, + { + "entropy": 1.3998730778694153, + "epoch": 3.3945701357466063, + "grad_norm": 0.38136187195777893, + "learning_rate": 0.00024401598665109463, + "loss": 0.0397, + "mean_token_accuracy": 0.9870003908872604, + "num_tokens": 8286150.0, + "step": 940 + }, + { + "entropy": 1.406863808631897, + "epoch": 3.3981900452488687, + "grad_norm": 0.5735233426094055, + "learning_rate": 0.00024357376272625205, + "loss": 0.0794, + "mean_token_accuracy": 0.9789908528327942, + "num_tokens": 8294719.0, + "step": 941 + }, + { + "entropy": 1.418317824602127, + "epoch": 3.401809954751131, + "grad_norm": 0.624377965927124, + "learning_rate": 0.00024313168114093475, + "loss": 0.0466, + "mean_token_accuracy": 0.9851591736078262, + "num_tokens": 8303298.0, + "step": 942 + }, + { + "entropy": 1.3575542867183685, + "epoch": 3.4054298642533936, + "grad_norm": 0.5194457173347473, + "learning_rate": 0.00024268974357422488, + "loss": 0.0743, + "mean_token_accuracy": 0.9743311256170273, + "num_tokens": 8312635.0, + "step": 943 + }, + { + "entropy": 1.392454832792282, + "epoch": 3.409049773755656, + "grad_norm": 0.5445207357406616, + "learning_rate": 0.00024224795170465756, + "loss": 0.0986, + "mean_token_accuracy": 0.9710196405649185, + "num_tokens": 8321364.0, + "step": 944 + }, + { + "entropy": 1.324178010225296, + "epoch": 3.4126696832579184, + "grad_norm": 0.4121778607368469, + "learning_rate": 0.0002418063072102148, + "loss": 0.0513, + "mean_token_accuracy": 0.9844248443841934, + "num_tokens": 8330452.0, + "step": 945 + }, + { + "entropy": 1.4191058278083801, + "epoch": 3.416289592760181, + "grad_norm": 0.48296698927879333, + "learning_rate": 0.00024136481176831854, + "loss": 0.0561, + "mean_token_accuracy": 0.9812565594911575, + "num_tokens": 8339243.0, + "step": 946 + }, + { + "entropy": 1.3743943274021149, + "epoch": 3.4199095022624433, + "grad_norm": 0.5322384834289551, + "learning_rate": 0.00024092346705582474, + "loss": 0.065, + "mean_token_accuracy": 0.9788537919521332, + "num_tokens": 8347866.0, + "step": 947 + }, + { + "entropy": 1.4042058885097504, + "epoch": 3.4235294117647057, + "grad_norm": 0.5542939901351929, + "learning_rate": 0.00024048227474901697, + "loss": 0.0835, + "mean_token_accuracy": 0.9758901000022888, + "num_tokens": 8356604.0, + "step": 948 + }, + { + "entropy": 1.4089910387992859, + "epoch": 3.427149321266968, + "grad_norm": 0.6025400757789612, + "learning_rate": 0.00024004123652359973, + "loss": 0.0736, + "mean_token_accuracy": 0.9723546206951141, + "num_tokens": 8365168.0, + "step": 949 + }, + { + "entropy": 1.3679145872592926, + "epoch": 3.430769230769231, + "grad_norm": 0.6585437059402466, + "learning_rate": 0.00023960035405469235, + "loss": 0.1387, + "mean_token_accuracy": 0.9651710242033005, + "num_tokens": 8374034.0, + "step": 950 + }, + { + "entropy": 1.3707129955291748, + "epoch": 3.4343891402714934, + "grad_norm": 0.639600932598114, + "learning_rate": 0.0002391596290168228, + "loss": 0.0491, + "mean_token_accuracy": 0.9869592487812042, + "num_tokens": 8383019.0, + "step": 951 + }, + { + "entropy": 1.3920492231845856, + "epoch": 3.438009049773756, + "grad_norm": 0.4947279393672943, + "learning_rate": 0.00023871906308392088, + "loss": 0.0647, + "mean_token_accuracy": 0.98161181807518, + "num_tokens": 8392191.0, + "step": 952 + }, + { + "entropy": 1.4259005188941956, + "epoch": 3.4416289592760183, + "grad_norm": 0.5486235618591309, + "learning_rate": 0.00023827865792931205, + "loss": 0.0581, + "mean_token_accuracy": 0.9796920716762543, + "num_tokens": 8400966.0, + "step": 953 + }, + { + "entropy": 1.4309614896774292, + "epoch": 3.4452488687782807, + "grad_norm": 0.6024688482284546, + "learning_rate": 0.00023783841522571138, + "loss": 0.1217, + "mean_token_accuracy": 0.9621599614620209, + "num_tokens": 8409878.0, + "step": 954 + }, + { + "entropy": 1.427911102771759, + "epoch": 3.448868778280543, + "grad_norm": 0.4339677691459656, + "learning_rate": 0.00023739833664521671, + "loss": 0.0521, + "mean_token_accuracy": 0.9818601310253143, + "num_tokens": 8418609.0, + "step": 955 + }, + { + "entropy": 1.4235480725765228, + "epoch": 3.4524886877828056, + "grad_norm": 0.5715889930725098, + "learning_rate": 0.00023695842385930242, + "loss": 0.0657, + "mean_token_accuracy": 0.9833882004022598, + "num_tokens": 8427265.0, + "step": 956 + }, + { + "entropy": 1.335403710603714, + "epoch": 3.456108597285068, + "grad_norm": 0.34678834676742554, + "learning_rate": 0.00023651867853881356, + "loss": 0.0507, + "mean_token_accuracy": 0.9843446165323257, + "num_tokens": 8436591.0, + "step": 957 + }, + { + "entropy": 1.3923978507518768, + "epoch": 3.4597285067873305, + "grad_norm": 0.8088510632514954, + "learning_rate": 0.00023607910235395882, + "loss": 0.1065, + "mean_token_accuracy": 0.9738757163286209, + "num_tokens": 8445472.0, + "step": 958 + }, + { + "entropy": 1.513672262430191, + "epoch": 3.463348416289593, + "grad_norm": 0.6919769048690796, + "learning_rate": 0.0002356396969743044, + "loss": 0.0846, + "mean_token_accuracy": 0.9809803068637848, + "num_tokens": 8453788.0, + "step": 959 + }, + { + "entropy": 1.3483870327472687, + "epoch": 3.4669683257918553, + "grad_norm": 0.5901163220405579, + "learning_rate": 0.00023520046406876822, + "loss": 0.1035, + "mean_token_accuracy": 0.9659459739923477, + "num_tokens": 8463134.0, + "step": 960 + }, + { + "entropy": 1.4076683819293976, + "epoch": 3.4705882352941178, + "grad_norm": 0.5772054195404053, + "learning_rate": 0.00023476140530561253, + "loss": 0.058, + "mean_token_accuracy": 0.9804215878248215, + "num_tokens": 8471959.0, + "step": 961 + }, + { + "entropy": 1.3382205069065094, + "epoch": 3.47420814479638, + "grad_norm": 0.4780332148075104, + "learning_rate": 0.00023432252235243883, + "loss": 0.074, + "mean_token_accuracy": 0.9757792204618454, + "num_tokens": 8480866.0, + "step": 962 + }, + { + "entropy": 1.432678759098053, + "epoch": 3.4778280542986426, + "grad_norm": 0.5997639298439026, + "learning_rate": 0.00023388381687618022, + "loss": 0.0641, + "mean_token_accuracy": 0.9824596792459488, + "num_tokens": 8489355.0, + "step": 963 + }, + { + "entropy": 1.3388859629631042, + "epoch": 3.481447963800905, + "grad_norm": 0.3654438257217407, + "learning_rate": 0.0002334452905430961, + "loss": 0.0553, + "mean_token_accuracy": 0.9859583377838135, + "num_tokens": 8498607.0, + "step": 964 + }, + { + "entropy": 1.4407559633255005, + "epoch": 3.4850678733031675, + "grad_norm": 0.6571084856987, + "learning_rate": 0.00023300694501876535, + "loss": 0.0887, + "mean_token_accuracy": 0.9736911207437515, + "num_tokens": 8506915.0, + "step": 965 + }, + { + "entropy": 1.4553894400596619, + "epoch": 3.48868778280543, + "grad_norm": 0.459780752658844, + "learning_rate": 0.00023256878196808019, + "loss": 0.0578, + "mean_token_accuracy": 0.98088139295578, + "num_tokens": 8515157.0, + "step": 966 + }, + { + "entropy": 1.4285954535007477, + "epoch": 3.4923076923076923, + "grad_norm": 0.4488624930381775, + "learning_rate": 0.0002321308030552396, + "loss": 0.0466, + "mean_token_accuracy": 0.9883453100919724, + "num_tokens": 8523741.0, + "step": 967 + }, + { + "entropy": 1.3596898019313812, + "epoch": 3.4959276018099548, + "grad_norm": 0.5626068711280823, + "learning_rate": 0.00023169300994374352, + "loss": 0.0663, + "mean_token_accuracy": 0.979169949889183, + "num_tokens": 8532431.0, + "step": 968 + }, + { + "entropy": 1.4428678750991821, + "epoch": 3.499547511312217, + "grad_norm": 0.546142578125, + "learning_rate": 0.0002312554042963858, + "loss": 0.0777, + "mean_token_accuracy": 0.9774435311555862, + "num_tokens": 8540889.0, + "step": 969 + }, + { + "entropy": 1.3509635627269745, + "epoch": 3.5031674208144796, + "grad_norm": 0.6781264543533325, + "learning_rate": 0.00023081798777524847, + "loss": 0.0941, + "mean_token_accuracy": 0.9698395729064941, + "num_tokens": 8550128.0, + "step": 970 + }, + { + "entropy": 1.2727701663970947, + "epoch": 3.506787330316742, + "grad_norm": 0.477498322725296, + "learning_rate": 0.00023038076204169534, + "loss": 0.0447, + "mean_token_accuracy": 0.98555026948452, + "num_tokens": 8559305.0, + "step": 971 + }, + { + "entropy": 1.3704063892364502, + "epoch": 3.5104072398190045, + "grad_norm": 0.5665515661239624, + "learning_rate": 0.00022994372875636534, + "loss": 0.0727, + "mean_token_accuracy": 0.9838419556617737, + "num_tokens": 8568175.0, + "step": 972 + }, + { + "entropy": 1.3341827094554901, + "epoch": 3.514027149321267, + "grad_norm": 0.7451890110969543, + "learning_rate": 0.00022950688957916666, + "loss": 0.0892, + "mean_token_accuracy": 0.9721158593893051, + "num_tokens": 8576916.0, + "step": 973 + }, + { + "entropy": 1.3170603513717651, + "epoch": 3.5176470588235293, + "grad_norm": 0.6274797916412354, + "learning_rate": 0.00022907024616927016, + "loss": 0.0867, + "mean_token_accuracy": 0.9760665446519852, + "num_tokens": 8585937.0, + "step": 974 + }, + { + "entropy": 1.324271559715271, + "epoch": 3.521266968325792, + "grad_norm": 0.49691009521484375, + "learning_rate": 0.00022863380018510321, + "loss": 0.0617, + "mean_token_accuracy": 0.9794053137302399, + "num_tokens": 8594885.0, + "step": 975 + }, + { + "entropy": 1.3782964646816254, + "epoch": 3.524886877828054, + "grad_norm": 0.5726630687713623, + "learning_rate": 0.00022819755328434306, + "loss": 0.0789, + "mean_token_accuracy": 0.9756396412849426, + "num_tokens": 8603243.0, + "step": 976 + }, + { + "entropy": 1.3255096673965454, + "epoch": 3.5285067873303166, + "grad_norm": 0.5568417906761169, + "learning_rate": 0.00022776150712391127, + "loss": 0.0734, + "mean_token_accuracy": 0.974893182516098, + "num_tokens": 8612414.0, + "step": 977 + }, + { + "entropy": 1.4089244902133942, + "epoch": 3.532126696832579, + "grad_norm": 0.5721971392631531, + "learning_rate": 0.00022732566335996674, + "loss": 0.0719, + "mean_token_accuracy": 0.976212814450264, + "num_tokens": 8620851.0, + "step": 978 + }, + { + "entropy": 1.278488278388977, + "epoch": 3.5357466063348415, + "grad_norm": 0.4737057387828827, + "learning_rate": 0.00022689002364789938, + "loss": 0.0329, + "mean_token_accuracy": 0.9908775240182877, + "num_tokens": 8630000.0, + "step": 979 + }, + { + "entropy": 1.3656818866729736, + "epoch": 3.539366515837104, + "grad_norm": 0.6749998927116394, + "learning_rate": 0.00022645458964232456, + "loss": 0.0875, + "mean_token_accuracy": 0.978403314948082, + "num_tokens": 8638635.0, + "step": 980 + }, + { + "entropy": 1.3721227645874023, + "epoch": 3.5429864253393664, + "grad_norm": 0.5295807719230652, + "learning_rate": 0.00022601936299707616, + "loss": 0.0694, + "mean_token_accuracy": 0.9826173633337021, + "num_tokens": 8647726.0, + "step": 981 + }, + { + "entropy": 1.3711304068565369, + "epoch": 3.546606334841629, + "grad_norm": 0.5617223381996155, + "learning_rate": 0.0002255843453652002, + "loss": 0.0745, + "mean_token_accuracy": 0.9778908938169479, + "num_tokens": 8656421.0, + "step": 982 + }, + { + "entropy": 1.3603121936321259, + "epoch": 3.5502262443438912, + "grad_norm": 0.5830493569374084, + "learning_rate": 0.00022514953839894932, + "loss": 0.0498, + "mean_token_accuracy": 0.9847024828195572, + "num_tokens": 8665206.0, + "step": 983 + }, + { + "entropy": 1.3419533371925354, + "epoch": 3.5538461538461537, + "grad_norm": 0.5035730004310608, + "learning_rate": 0.00022471494374977556, + "loss": 0.0873, + "mean_token_accuracy": 0.9755606353282928, + "num_tokens": 8674482.0, + "step": 984 + }, + { + "entropy": 1.4004729092121124, + "epoch": 3.557466063348416, + "grad_norm": 0.4822017252445221, + "learning_rate": 0.0002242805630683251, + "loss": 0.0574, + "mean_token_accuracy": 0.9790775179862976, + "num_tokens": 8683066.0, + "step": 985 + }, + { + "entropy": 1.32045316696167, + "epoch": 3.5610859728506785, + "grad_norm": 0.3949761688709259, + "learning_rate": 0.00022384639800443088, + "loss": 0.0396, + "mean_token_accuracy": 0.9879113733768463, + "num_tokens": 8691966.0, + "step": 986 + }, + { + "entropy": 1.3542158901691437, + "epoch": 3.564705882352941, + "grad_norm": 0.6060124635696411, + "learning_rate": 0.0002234124502071072, + "loss": 0.0827, + "mean_token_accuracy": 0.9750298708677292, + "num_tokens": 8700859.0, + "step": 987 + }, + { + "entropy": 1.2594963014125824, + "epoch": 3.5683257918552034, + "grad_norm": 0.5580794215202332, + "learning_rate": 0.00022297872132454318, + "loss": 0.0691, + "mean_token_accuracy": 0.9793778210878372, + "num_tokens": 8710316.0, + "step": 988 + }, + { + "entropy": 1.3390154540538788, + "epoch": 3.571945701357466, + "grad_norm": 0.38052669167518616, + "learning_rate": 0.00022254521300409626, + "loss": 0.0436, + "mean_token_accuracy": 0.9838068634271622, + "num_tokens": 8719219.0, + "step": 989 + }, + { + "entropy": 1.4103966355323792, + "epoch": 3.5755656108597282, + "grad_norm": 0.6793152093887329, + "learning_rate": 0.00022211192689228633, + "loss": 0.0738, + "mean_token_accuracy": 0.9803069233894348, + "num_tokens": 8727658.0, + "step": 990 + }, + { + "entropy": 1.3170084357261658, + "epoch": 3.579185520361991, + "grad_norm": 0.5633034110069275, + "learning_rate": 0.00022167886463478933, + "loss": 0.0483, + "mean_token_accuracy": 0.9852565824985504, + "num_tokens": 8736502.0, + "step": 991 + }, + { + "entropy": 1.3381566107273102, + "epoch": 3.5828054298642535, + "grad_norm": 0.46346399188041687, + "learning_rate": 0.00022124602787643088, + "loss": 0.0324, + "mean_token_accuracy": 0.9907149076461792, + "num_tokens": 8745057.0, + "step": 992 + }, + { + "entropy": 1.2907682061195374, + "epoch": 3.586425339366516, + "grad_norm": 0.5343260169029236, + "learning_rate": 0.00022081341826118013, + "loss": 0.0577, + "mean_token_accuracy": 0.982825830578804, + "num_tokens": 8754046.0, + "step": 993 + }, + { + "entropy": 1.3302249014377594, + "epoch": 3.5900452488687784, + "grad_norm": 0.7435888051986694, + "learning_rate": 0.00022038103743214345, + "loss": 0.0644, + "mean_token_accuracy": 0.9805946946144104, + "num_tokens": 8762749.0, + "step": 994 + }, + { + "entropy": 1.3043336868286133, + "epoch": 3.593665158371041, + "grad_norm": 0.5991454124450684, + "learning_rate": 0.00021994888703155853, + "loss": 0.1013, + "mean_token_accuracy": 0.9729356169700623, + "num_tokens": 8771617.0, + "step": 995 + }, + { + "entropy": 1.2590918838977814, + "epoch": 3.5972850678733033, + "grad_norm": 0.6732775568962097, + "learning_rate": 0.00021951696870078748, + "loss": 0.2119, + "mean_token_accuracy": 0.9542236030101776, + "num_tokens": 8781055.0, + "step": 996 + }, + { + "entropy": 1.3064957559108734, + "epoch": 3.6009049773755657, + "grad_norm": 0.6471344828605652, + "learning_rate": 0.00021908528408031124, + "loss": 0.0775, + "mean_token_accuracy": 0.9764008969068527, + "num_tokens": 8789616.0, + "step": 997 + }, + { + "entropy": 1.2735644578933716, + "epoch": 3.604524886877828, + "grad_norm": 0.4242008626461029, + "learning_rate": 0.00021865383480972308, + "loss": 0.0517, + "mean_token_accuracy": 0.9826734960079193, + "num_tokens": 8798420.0, + "step": 998 + }, + { + "entropy": 1.2687926590442657, + "epoch": 3.6081447963800906, + "grad_norm": 0.6521032452583313, + "learning_rate": 0.00021822262252772212, + "loss": 0.0831, + "mean_token_accuracy": 0.9806712120771408, + "num_tokens": 8807486.0, + "step": 999 + }, + { + "entropy": 1.2503767311573029, + "epoch": 3.611764705882353, + "grad_norm": 0.44378921389579773, + "learning_rate": 0.00021779164887210774, + "loss": 0.0709, + "mean_token_accuracy": 0.9845052361488342, + "num_tokens": 8816795.0, + "step": 1000 + }, + { + "entropy": 1.2962157726287842, + "epoch": 3.6153846153846154, + "grad_norm": 0.47278398275375366, + "learning_rate": 0.0002173609154797728, + "loss": 0.0321, + "mean_token_accuracy": 0.986628457903862, + "num_tokens": 8825449.0, + "step": 1001 + }, + { + "entropy": 1.335391879081726, + "epoch": 3.619004524886878, + "grad_norm": 0.3435405492782593, + "learning_rate": 0.00021693042398669747, + "loss": 0.0361, + "mean_token_accuracy": 0.9887901991605759, + "num_tokens": 8834296.0, + "step": 1002 + }, + { + "entropy": 1.295527994632721, + "epoch": 3.6226244343891403, + "grad_norm": 0.4150637686252594, + "learning_rate": 0.0002165001760279435, + "loss": 0.0419, + "mean_token_accuracy": 0.9862009286880493, + "num_tokens": 8843354.0, + "step": 1003 + }, + { + "entropy": 1.270320326089859, + "epoch": 3.6262443438914027, + "grad_norm": 0.4439278542995453, + "learning_rate": 0.0002160701732376474, + "loss": 0.0676, + "mean_token_accuracy": 0.9789925366640091, + "num_tokens": 8852311.0, + "step": 1004 + }, + { + "entropy": 1.2495850026607513, + "epoch": 3.629864253393665, + "grad_norm": 0.4471176266670227, + "learning_rate": 0.00021564041724901446, + "loss": 0.0469, + "mean_token_accuracy": 0.98641636967659, + "num_tokens": 8861126.0, + "step": 1005 + }, + { + "entropy": 1.3307124376296997, + "epoch": 3.6334841628959276, + "grad_norm": 0.547099769115448, + "learning_rate": 0.0002152109096943128, + "loss": 0.0861, + "mean_token_accuracy": 0.9793668240308762, + "num_tokens": 8870129.0, + "step": 1006 + }, + { + "entropy": 1.3895522952079773, + "epoch": 3.63710407239819, + "grad_norm": 0.5946421027183533, + "learning_rate": 0.00021478165220486674, + "loss": 0.0704, + "mean_token_accuracy": 0.9831217378377914, + "num_tokens": 8878385.0, + "step": 1007 + }, + { + "entropy": 1.3864755928516388, + "epoch": 3.6407239819004524, + "grad_norm": 0.42203575372695923, + "learning_rate": 0.00021435264641105116, + "loss": 0.0557, + "mean_token_accuracy": 0.9843680560588837, + "num_tokens": 8887161.0, + "step": 1008 + }, + { + "entropy": 1.3556683957576752, + "epoch": 3.644343891402715, + "grad_norm": 0.5707162618637085, + "learning_rate": 0.00021392389394228454, + "loss": 0.0523, + "mean_token_accuracy": 0.9845058023929596, + "num_tokens": 8896049.0, + "step": 1009 + }, + { + "entropy": 1.2712234854698181, + "epoch": 3.6479638009049773, + "grad_norm": 0.6082377433776855, + "learning_rate": 0.00021349539642702347, + "loss": 0.1082, + "mean_token_accuracy": 0.9710930436849594, + "num_tokens": 8905546.0, + "step": 1010 + }, + { + "entropy": 1.3434297740459442, + "epoch": 3.6515837104072397, + "grad_norm": 0.7305653095245361, + "learning_rate": 0.0002130671554927561, + "loss": 0.088, + "mean_token_accuracy": 0.9745359718799591, + "num_tokens": 8914502.0, + "step": 1011 + }, + { + "entropy": 1.3378058075904846, + "epoch": 3.655203619909502, + "grad_norm": 0.4537632167339325, + "learning_rate": 0.00021263917276599607, + "loss": 0.047, + "mean_token_accuracy": 0.9869710952043533, + "num_tokens": 8923463.0, + "step": 1012 + }, + { + "entropy": 1.3655290305614471, + "epoch": 3.6588235294117646, + "grad_norm": 0.5036798119544983, + "learning_rate": 0.0002122114498722763, + "loss": 0.0655, + "mean_token_accuracy": 0.982716903090477, + "num_tokens": 8932384.0, + "step": 1013 + }, + { + "entropy": 1.3198035657405853, + "epoch": 3.662443438914027, + "grad_norm": 0.3511429727077484, + "learning_rate": 0.000211783988436143, + "loss": 0.0382, + "mean_token_accuracy": 0.9901436120271683, + "num_tokens": 8941300.0, + "step": 1014 + }, + { + "entropy": 1.3130914568901062, + "epoch": 3.6660633484162894, + "grad_norm": 0.4056939482688904, + "learning_rate": 0.00021135679008114894, + "loss": 0.0639, + "mean_token_accuracy": 0.9808386266231537, + "num_tokens": 8950534.0, + "step": 1015 + }, + { + "entropy": 1.3055587410926819, + "epoch": 3.669683257918552, + "grad_norm": 0.33344724774360657, + "learning_rate": 0.00021092985642984802, + "loss": 0.0449, + "mean_token_accuracy": 0.9886894524097443, + "num_tokens": 8960263.0, + "step": 1016 + }, + { + "entropy": 1.3109475672245026, + "epoch": 3.6733031674208148, + "grad_norm": 0.490029901266098, + "learning_rate": 0.00021050318910378874, + "loss": 0.0876, + "mean_token_accuracy": 0.9755903035402298, + "num_tokens": 8969611.0, + "step": 1017 + }, + { + "entropy": 1.3801122307777405, + "epoch": 3.676923076923077, + "grad_norm": 0.3520437479019165, + "learning_rate": 0.00021007678972350798, + "loss": 0.0482, + "mean_token_accuracy": 0.9860682934522629, + "num_tokens": 8978283.0, + "step": 1018 + }, + { + "entropy": 1.309948354959488, + "epoch": 3.6805429864253396, + "grad_norm": 0.485009104013443, + "learning_rate": 0.00020965065990852474, + "loss": 0.0824, + "mean_token_accuracy": 0.9751296043395996, + "num_tokens": 8987535.0, + "step": 1019 + }, + { + "entropy": 1.3771768808364868, + "epoch": 3.684162895927602, + "grad_norm": 0.5419639945030212, + "learning_rate": 0.00020922480127733448, + "loss": 0.0649, + "mean_token_accuracy": 0.9826148748397827, + "num_tokens": 8996533.0, + "step": 1020 + }, + { + "entropy": 1.337918907403946, + "epoch": 3.6877828054298645, + "grad_norm": 0.36202654242515564, + "learning_rate": 0.00020879921544740264, + "loss": 0.0311, + "mean_token_accuracy": 0.9919043332338333, + "num_tokens": 9005497.0, + "step": 1021 + }, + { + "entropy": 1.439345896244049, + "epoch": 3.691402714932127, + "grad_norm": 0.6851293444633484, + "learning_rate": 0.0002083739040351584, + "loss": 0.096, + "mean_token_accuracy": 0.9736751765012741, + "num_tokens": 9014037.0, + "step": 1022 + }, + { + "entropy": 1.44906947016716, + "epoch": 3.6950226244343893, + "grad_norm": 0.4260176122188568, + "learning_rate": 0.00020794886865598848, + "loss": 0.0523, + "mean_token_accuracy": 0.9793268889188766, + "num_tokens": 9022452.0, + "step": 1023 + }, + { + "entropy": 1.449628233909607, + "epoch": 3.6986425339366518, + "grad_norm": 0.6072604656219482, + "learning_rate": 0.00020752411092423177, + "loss": 0.0727, + "mean_token_accuracy": 0.9774363785982132, + "num_tokens": 9030847.0, + "step": 1024 + }, + { + "entropy": 1.3873493075370789, + "epoch": 3.702262443438914, + "grad_norm": 0.44552555680274963, + "learning_rate": 0.00020709963245317209, + "loss": 0.0639, + "mean_token_accuracy": 0.9800115376710892, + "num_tokens": 9039891.0, + "step": 1025 + }, + { + "entropy": 1.4281193912029266, + "epoch": 3.7058823529411766, + "grad_norm": 0.5228530764579773, + "learning_rate": 0.0002066754348550327, + "loss": 0.0738, + "mean_token_accuracy": 0.9765488505363464, + "num_tokens": 9048686.0, + "step": 1026 + }, + { + "entropy": 1.3790385127067566, + "epoch": 3.709502262443439, + "grad_norm": 0.4316764175891876, + "learning_rate": 0.00020625151974097022, + "loss": 0.0641, + "mean_token_accuracy": 0.97920823097229, + "num_tokens": 9057678.0, + "step": 1027 + }, + { + "entropy": 1.4287641942501068, + "epoch": 3.7131221719457015, + "grad_norm": 0.4056229591369629, + "learning_rate": 0.00020582788872106842, + "loss": 0.036, + "mean_token_accuracy": 0.9899342656135559, + "num_tokens": 9066521.0, + "step": 1028 + }, + { + "entropy": 1.454606294631958, + "epoch": 3.716742081447964, + "grad_norm": 0.7525569200515747, + "learning_rate": 0.0002054045434043316, + "loss": 0.1423, + "mean_token_accuracy": 0.9605622440576553, + "num_tokens": 9075595.0, + "step": 1029 + }, + { + "entropy": 1.3942890167236328, + "epoch": 3.7203619909502263, + "grad_norm": 0.4933125078678131, + "learning_rate": 0.00020498148539867944, + "loss": 0.0773, + "mean_token_accuracy": 0.970758929848671, + "num_tokens": 9084661.0, + "step": 1030 + }, + { + "entropy": 1.384048968553543, + "epoch": 3.723981900452489, + "grad_norm": 0.43627068400382996, + "learning_rate": 0.00020455871631094017, + "loss": 0.0678, + "mean_token_accuracy": 0.983132854104042, + "num_tokens": 9094062.0, + "step": 1031 + }, + { + "entropy": 1.4216719567775726, + "epoch": 3.727601809954751, + "grad_norm": 0.6412005424499512, + "learning_rate": 0.0002041362377468445, + "loss": 0.1097, + "mean_token_accuracy": 0.9793971478939056, + "num_tokens": 9103015.0, + "step": 1032 + }, + { + "entropy": 1.4771287441253662, + "epoch": 3.7312217194570136, + "grad_norm": 0.5385004281997681, + "learning_rate": 0.00020371405131102002, + "loss": 0.0553, + "mean_token_accuracy": 0.9826144278049469, + "num_tokens": 9111433.0, + "step": 1033 + }, + { + "entropy": 1.4442466795444489, + "epoch": 3.734841628959276, + "grad_norm": 0.5972802042961121, + "learning_rate": 0.00020329215860698458, + "loss": 0.0584, + "mean_token_accuracy": 0.984418511390686, + "num_tokens": 9120012.0, + "step": 1034 + }, + { + "entropy": 1.4893062710762024, + "epoch": 3.7384615384615385, + "grad_norm": 0.7473769783973694, + "learning_rate": 0.00020287056123714035, + "loss": 0.1091, + "mean_token_accuracy": 0.9683271646499634, + "num_tokens": 9128636.0, + "step": 1035 + }, + { + "entropy": 1.3519982993602753, + "epoch": 3.742081447963801, + "grad_norm": 0.47699517011642456, + "learning_rate": 0.00020244926080276794, + "loss": 0.0525, + "mean_token_accuracy": 0.9904675185680389, + "num_tokens": 9137968.0, + "step": 1036 + }, + { + "entropy": 1.4561591148376465, + "epoch": 3.7457013574660634, + "grad_norm": 0.42199093103408813, + "learning_rate": 0.00020202825890402003, + "loss": 0.0451, + "mean_token_accuracy": 0.9883602410554886, + "num_tokens": 9146589.0, + "step": 1037 + }, + { + "entropy": 1.408205658197403, + "epoch": 3.749321266968326, + "grad_norm": 0.4122658371925354, + "learning_rate": 0.0002016075571399157, + "loss": 0.0489, + "mean_token_accuracy": 0.9833643138408661, + "num_tokens": 9155443.0, + "step": 1038 + }, + { + "entropy": 1.3683985471725464, + "epoch": 3.7529411764705882, + "grad_norm": 0.43202081322669983, + "learning_rate": 0.0002011871571083336, + "loss": 0.0424, + "mean_token_accuracy": 0.9905680269002914, + "num_tokens": 9164792.0, + "step": 1039 + }, + { + "entropy": 1.3635350167751312, + "epoch": 3.7565610859728507, + "grad_norm": 0.606826663017273, + "learning_rate": 0.00020076706040600672, + "loss": 0.0883, + "mean_token_accuracy": 0.9747144728899002, + "num_tokens": 9174060.0, + "step": 1040 + }, + { + "entropy": 1.3948090970516205, + "epoch": 3.760180995475113, + "grad_norm": 0.561805009841919, + "learning_rate": 0.00020034726862851594, + "loss": 0.1131, + "mean_token_accuracy": 0.9722562730312347, + "num_tokens": 9183245.0, + "step": 1041 + }, + { + "entropy": 1.4282923936843872, + "epoch": 3.7638009049773755, + "grad_norm": 0.5546544790267944, + "learning_rate": 0.00019992778337028384, + "loss": 0.0762, + "mean_token_accuracy": 0.9801356643438339, + "num_tokens": 9191879.0, + "step": 1042 + }, + { + "entropy": 1.3675439953804016, + "epoch": 3.767420814479638, + "grad_norm": 0.5167890787124634, + "learning_rate": 0.0001995086062245689, + "loss": 0.0804, + "mean_token_accuracy": 0.98072350025177, + "num_tokens": 9201014.0, + "step": 1043 + }, + { + "entropy": 1.424451231956482, + "epoch": 3.7710407239819004, + "grad_norm": 0.44696182012557983, + "learning_rate": 0.00019908973878345943, + "loss": 0.0583, + "mean_token_accuracy": 0.9818143099546432, + "num_tokens": 9209954.0, + "step": 1044 + }, + { + "entropy": 1.3515954911708832, + "epoch": 3.774660633484163, + "grad_norm": 0.619891345500946, + "learning_rate": 0.0001986711826378673, + "loss": 0.0949, + "mean_token_accuracy": 0.9688181430101395, + "num_tokens": 9219157.0, + "step": 1045 + }, + { + "entropy": 1.3176933526992798, + "epoch": 3.7782805429864252, + "grad_norm": 0.43150845170021057, + "learning_rate": 0.00019825293937752203, + "loss": 0.0459, + "mean_token_accuracy": 0.9851347357034683, + "num_tokens": 9228415.0, + "step": 1046 + }, + { + "entropy": 1.3939999043941498, + "epoch": 3.7819004524886877, + "grad_norm": 0.6242758631706238, + "learning_rate": 0.00019783501059096495, + "loss": 0.0703, + "mean_token_accuracy": 0.9822264909744263, + "num_tokens": 9237479.0, + "step": 1047 + }, + { + "entropy": 1.430876463651657, + "epoch": 3.78552036199095, + "grad_norm": 0.6107195019721985, + "learning_rate": 0.00019741739786554273, + "loss": 0.0758, + "mean_token_accuracy": 0.9829006642103195, + "num_tokens": 9245975.0, + "step": 1048 + }, + { + "entropy": 1.4005264639854431, + "epoch": 3.7891402714932125, + "grad_norm": 0.5321121215820312, + "learning_rate": 0.00019700010278740174, + "loss": 0.0636, + "mean_token_accuracy": 0.9827183485031128, + "num_tokens": 9254487.0, + "step": 1049 + }, + { + "entropy": 1.4024662375450134, + "epoch": 3.792760180995475, + "grad_norm": 0.5756775140762329, + "learning_rate": 0.00019658312694148191, + "loss": 0.0702, + "mean_token_accuracy": 0.9786443412303925, + "num_tokens": 9263345.0, + "step": 1050 + }, + { + "entropy": 1.3956109881401062, + "epoch": 3.7963800904977374, + "grad_norm": 0.5821980834007263, + "learning_rate": 0.00019616647191151077, + "loss": 0.0715, + "mean_token_accuracy": 0.97563835978508, + "num_tokens": 9271916.0, + "step": 1051 + }, + { + "entropy": 1.4688811898231506, + "epoch": 3.8, + "grad_norm": 0.3763403594493866, + "learning_rate": 0.00019575013927999692, + "loss": 0.0399, + "mean_token_accuracy": 0.9858106821775436, + "num_tokens": 9280577.0, + "step": 1052 + }, + { + "entropy": 1.482151448726654, + "epoch": 3.8036199095022623, + "grad_norm": 0.4746648371219635, + "learning_rate": 0.00019533413062822495, + "loss": 0.0338, + "mean_token_accuracy": 0.9888868033885956, + "num_tokens": 9289036.0, + "step": 1053 + }, + { + "entropy": 1.39437335729599, + "epoch": 3.8072398190045247, + "grad_norm": 0.34683090448379517, + "learning_rate": 0.00019491844753624884, + "loss": 0.0411, + "mean_token_accuracy": 0.98799167573452, + "num_tokens": 9297968.0, + "step": 1054 + }, + { + "entropy": 1.4089177548885345, + "epoch": 3.810859728506787, + "grad_norm": 0.6755173802375793, + "learning_rate": 0.00019450309158288562, + "loss": 0.08, + "mean_token_accuracy": 0.975567102432251, + "num_tokens": 9306399.0, + "step": 1055 + }, + { + "entropy": 1.4395931661128998, + "epoch": 3.8144796380090495, + "grad_norm": 0.6466923356056213, + "learning_rate": 0.00019408806434571043, + "loss": 0.0962, + "mean_token_accuracy": 0.9790873825550079, + "num_tokens": 9315067.0, + "step": 1056 + }, + { + "entropy": 1.463386446237564, + "epoch": 3.818099547511312, + "grad_norm": 0.7904548645019531, + "learning_rate": 0.0001936733674010496, + "loss": 0.0982, + "mean_token_accuracy": 0.9723253399133682, + "num_tokens": 9323453.0, + "step": 1057 + }, + { + "entropy": 1.4089987576007843, + "epoch": 3.8217194570135744, + "grad_norm": 0.5947125554084778, + "learning_rate": 0.00019325900232397477, + "loss": 0.0558, + "mean_token_accuracy": 0.985149696469307, + "num_tokens": 9332220.0, + "step": 1058 + }, + { + "entropy": 1.424832284450531, + "epoch": 3.825339366515837, + "grad_norm": 0.6046349406242371, + "learning_rate": 0.00019284497068829747, + "loss": 0.103, + "mean_token_accuracy": 0.9751139581203461, + "num_tokens": 9341074.0, + "step": 1059 + }, + { + "entropy": 1.3354915082454681, + "epoch": 3.8289592760180997, + "grad_norm": 0.20708034932613373, + "learning_rate": 0.00019243127406656248, + "loss": 0.0117, + "mean_token_accuracy": 0.9978606253862381, + "num_tokens": 9350232.0, + "step": 1060 + }, + { + "entropy": 1.335442215204239, + "epoch": 3.832579185520362, + "grad_norm": 0.7157071232795715, + "learning_rate": 0.00019201791403004257, + "loss": 0.0915, + "mean_token_accuracy": 0.9730544090270996, + "num_tokens": 9359486.0, + "step": 1061 + }, + { + "entropy": 1.4432708621025085, + "epoch": 3.8361990950226246, + "grad_norm": 0.7831724286079407, + "learning_rate": 0.00019160489214873155, + "loss": 0.1163, + "mean_token_accuracy": 0.9673851430416107, + "num_tokens": 9368235.0, + "step": 1062 + }, + { + "entropy": 1.440447449684143, + "epoch": 3.839819004524887, + "grad_norm": 0.6418187022209167, + "learning_rate": 0.00019119220999133923, + "loss": 0.0587, + "mean_token_accuracy": 0.9853110462427139, + "num_tokens": 9376622.0, + "step": 1063 + }, + { + "entropy": 1.3945489525794983, + "epoch": 3.8434389140271494, + "grad_norm": 0.5115446448326111, + "learning_rate": 0.0001907798691252852, + "loss": 0.0627, + "mean_token_accuracy": 0.9849587380886078, + "num_tokens": 9385653.0, + "step": 1064 + }, + { + "entropy": 1.4040117859840393, + "epoch": 3.847058823529412, + "grad_norm": 0.40980765223503113, + "learning_rate": 0.0001903678711166924, + "loss": 0.0319, + "mean_token_accuracy": 0.990267813205719, + "num_tokens": 9394335.0, + "step": 1065 + }, + { + "entropy": 1.440806269645691, + "epoch": 3.8506787330316743, + "grad_norm": 0.7762898206710815, + "learning_rate": 0.00018995621753038183, + "loss": 0.1477, + "mean_token_accuracy": 0.9675359576940536, + "num_tokens": 9402789.0, + "step": 1066 + }, + { + "entropy": 1.3455476462841034, + "epoch": 3.8542986425339367, + "grad_norm": 0.4371282458305359, + "learning_rate": 0.00018954490992986644, + "loss": 0.047, + "mean_token_accuracy": 0.9871475845575333, + "num_tokens": 9411665.0, + "step": 1067 + }, + { + "entropy": 1.3937756717205048, + "epoch": 3.857918552036199, + "grad_norm": 0.8712350726127625, + "learning_rate": 0.0001891339498773447, + "loss": 0.143, + "mean_token_accuracy": 0.9606377333402634, + "num_tokens": 9420475.0, + "step": 1068 + }, + { + "entropy": 1.4569672644138336, + "epoch": 3.8615384615384616, + "grad_norm": 0.6399717926979065, + "learning_rate": 0.00018872333893369536, + "loss": 0.0625, + "mean_token_accuracy": 0.9822671264410019, + "num_tokens": 9429062.0, + "step": 1069 + }, + { + "entropy": 1.397208034992218, + "epoch": 3.865158371040724, + "grad_norm": 0.41650331020355225, + "learning_rate": 0.00018831307865847108, + "loss": 0.0565, + "mean_token_accuracy": 0.9822796285152435, + "num_tokens": 9437938.0, + "step": 1070 + }, + { + "entropy": 1.3304217159748077, + "epoch": 3.8687782805429864, + "grad_norm": 0.34858253598213196, + "learning_rate": 0.00018790317060989273, + "loss": 0.0355, + "mean_token_accuracy": 0.9889863580465317, + "num_tokens": 9446897.0, + "step": 1071 + }, + { + "entropy": 1.4303509891033173, + "epoch": 3.872398190045249, + "grad_norm": 0.5634018182754517, + "learning_rate": 0.00018749361634484325, + "loss": 0.0999, + "mean_token_accuracy": 0.9707607924938202, + "num_tokens": 9455618.0, + "step": 1072 + }, + { + "entropy": 1.4059797525405884, + "epoch": 3.8760180995475113, + "grad_norm": 0.4992756247520447, + "learning_rate": 0.00018708441741886194, + "loss": 0.062, + "mean_token_accuracy": 0.9801923334598541, + "num_tokens": 9464254.0, + "step": 1073 + }, + { + "entropy": 1.329335242509842, + "epoch": 3.8796380090497737, + "grad_norm": 0.43501394987106323, + "learning_rate": 0.00018667557538613863, + "loss": 0.0474, + "mean_token_accuracy": 0.987145259976387, + "num_tokens": 9473340.0, + "step": 1074 + }, + { + "entropy": 1.3786957263946533, + "epoch": 3.883257918552036, + "grad_norm": 0.640612781047821, + "learning_rate": 0.00018626709179950717, + "loss": 0.1196, + "mean_token_accuracy": 0.9668680727481842, + "num_tokens": 9482286.0, + "step": 1075 + }, + { + "entropy": 1.3925435245037079, + "epoch": 3.8868778280542986, + "grad_norm": 0.6338940262794495, + "learning_rate": 0.0001858589682104405, + "loss": 0.0643, + "mean_token_accuracy": 0.982734814286232, + "num_tokens": 9490868.0, + "step": 1076 + }, + { + "entropy": 1.3901928961277008, + "epoch": 3.890497737556561, + "grad_norm": 0.5943475365638733, + "learning_rate": 0.000185451206169044, + "loss": 0.0636, + "mean_token_accuracy": 0.9796071499586105, + "num_tokens": 9499551.0, + "step": 1077 + }, + { + "entropy": 1.3408487439155579, + "epoch": 3.8941176470588235, + "grad_norm": 0.47063320875167847, + "learning_rate": 0.00018504380722404975, + "loss": 0.059, + "mean_token_accuracy": 0.98704494535923, + "num_tokens": 9508605.0, + "step": 1078 + }, + { + "entropy": 1.3606760799884796, + "epoch": 3.897737556561086, + "grad_norm": 0.5077705383300781, + "learning_rate": 0.00018463677292281092, + "loss": 0.0586, + "mean_token_accuracy": 0.9849795997142792, + "num_tokens": 9517376.0, + "step": 1079 + }, + { + "entropy": 1.389219492673874, + "epoch": 3.9013574660633483, + "grad_norm": 0.451435923576355, + "learning_rate": 0.00018423010481129584, + "loss": 0.0414, + "mean_token_accuracy": 0.9872728437185287, + "num_tokens": 9525724.0, + "step": 1080 + }, + { + "entropy": 1.3427990972995758, + "epoch": 3.9049773755656108, + "grad_norm": 0.4996180236339569, + "learning_rate": 0.00018382380443408158, + "loss": 0.0519, + "mean_token_accuracy": 0.9842040240764618, + "num_tokens": 9534581.0, + "step": 1081 + }, + { + "entropy": 1.3136717081069946, + "epoch": 3.908597285067873, + "grad_norm": 0.31684455275535583, + "learning_rate": 0.00018341787333434872, + "loss": 0.0367, + "mean_token_accuracy": 0.986624076962471, + "num_tokens": 9543780.0, + "step": 1082 + }, + { + "entropy": 1.357143759727478, + "epoch": 3.9122171945701356, + "grad_norm": 0.392623633146286, + "learning_rate": 0.00018301231305387552, + "loss": 0.0361, + "mean_token_accuracy": 0.9899974465370178, + "num_tokens": 9552316.0, + "step": 1083 + }, + { + "entropy": 1.4222826957702637, + "epoch": 3.915837104072398, + "grad_norm": 0.6109654903411865, + "learning_rate": 0.00018260712513303167, + "loss": 0.0801, + "mean_token_accuracy": 0.9758190959692001, + "num_tokens": 9560547.0, + "step": 1084 + }, + { + "entropy": 1.387441635131836, + "epoch": 3.9194570135746605, + "grad_norm": 0.611193060874939, + "learning_rate": 0.00018220231111077217, + "loss": 0.0627, + "mean_token_accuracy": 0.9828397631645203, + "num_tokens": 9569112.0, + "step": 1085 + }, + { + "entropy": 1.3265759348869324, + "epoch": 3.9230769230769234, + "grad_norm": 0.35626664757728577, + "learning_rate": 0.0001817978725246326, + "loss": 0.0347, + "mean_token_accuracy": 0.9868002831935883, + "num_tokens": 9577936.0, + "step": 1086 + }, + { + "entropy": 1.3036309480667114, + "epoch": 3.926696832579186, + "grad_norm": 0.9055293202400208, + "learning_rate": 0.00018139381091072213, + "loss": 0.0869, + "mean_token_accuracy": 0.976190984249115, + "num_tokens": 9586725.0, + "step": 1087 + }, + { + "entropy": 1.317764014005661, + "epoch": 3.930316742081448, + "grad_norm": 0.34299445152282715, + "learning_rate": 0.00018099012780371814, + "loss": 0.0193, + "mean_token_accuracy": 0.9950294345617294, + "num_tokens": 9595580.0, + "step": 1088 + }, + { + "entropy": 1.4780614078044891, + "epoch": 3.9339366515837106, + "grad_norm": 0.45136016607284546, + "learning_rate": 0.00018058682473686075, + "loss": 0.03, + "mean_token_accuracy": 0.9902182072401047, + "num_tokens": 9603693.0, + "step": 1089 + }, + { + "entropy": 1.3006681501865387, + "epoch": 3.937556561085973, + "grad_norm": 0.8072985410690308, + "learning_rate": 0.00018018390324194637, + "loss": 0.1406, + "mean_token_accuracy": 0.9719722718000412, + "num_tokens": 9613517.0, + "step": 1090 + }, + { + "entropy": 1.2329892814159393, + "epoch": 3.9411764705882355, + "grad_norm": 0.7343161702156067, + "learning_rate": 0.00017978136484932198, + "loss": 0.1221, + "mean_token_accuracy": 0.9715431183576584, + "num_tokens": 9623002.0, + "step": 1091 + }, + { + "entropy": 1.339203268289566, + "epoch": 3.944796380090498, + "grad_norm": 0.422463595867157, + "learning_rate": 0.00017937921108787986, + "loss": 0.0366, + "mean_token_accuracy": 0.9875014275312424, + "num_tokens": 9631474.0, + "step": 1092 + }, + { + "entropy": 1.3312835395336151, + "epoch": 3.9484162895927604, + "grad_norm": 0.5072442889213562, + "learning_rate": 0.00017897744348505123, + "loss": 0.0561, + "mean_token_accuracy": 0.9836284965276718, + "num_tokens": 9640156.0, + "step": 1093 + }, + { + "entropy": 1.4004390239715576, + "epoch": 3.952036199095023, + "grad_norm": 0.48746341466903687, + "learning_rate": 0.0001785760635668007, + "loss": 0.044, + "mean_token_accuracy": 0.9849557876586914, + "num_tokens": 9648458.0, + "step": 1094 + }, + { + "entropy": 1.350889652967453, + "epoch": 3.9556561085972852, + "grad_norm": 0.41746893525123596, + "learning_rate": 0.00017817507285762023, + "loss": 0.0532, + "mean_token_accuracy": 0.9855844229459763, + "num_tokens": 9657136.0, + "step": 1095 + }, + { + "entropy": 1.237421602010727, + "epoch": 3.9592760180995477, + "grad_norm": 0.46493780612945557, + "learning_rate": 0.00017777447288052373, + "loss": 0.0759, + "mean_token_accuracy": 0.9721266627311707, + "num_tokens": 9667058.0, + "step": 1096 + }, + { + "entropy": 1.3534648716449738, + "epoch": 3.96289592760181, + "grad_norm": 0.406345933675766, + "learning_rate": 0.000177374265157041, + "loss": 0.0537, + "mean_token_accuracy": 0.9838696867227554, + "num_tokens": 9675627.0, + "step": 1097 + }, + { + "entropy": 1.237879753112793, + "epoch": 3.9665158371040725, + "grad_norm": 0.527837872505188, + "learning_rate": 0.00017697445120721175, + "loss": 0.0737, + "mean_token_accuracy": 0.9752355068922043, + "num_tokens": 9685091.0, + "step": 1098 + }, + { + "entropy": 1.2298554480075836, + "epoch": 3.970135746606335, + "grad_norm": 0.45402729511260986, + "learning_rate": 0.00017657503254958054, + "loss": 0.0556, + "mean_token_accuracy": 0.9843012988567352, + "num_tokens": 9694688.0, + "step": 1099 + }, + { + "entropy": 1.270505130290985, + "epoch": 3.9737556561085974, + "grad_norm": 0.6557897329330444, + "learning_rate": 0.00017617601070119037, + "loss": 0.0918, + "mean_token_accuracy": 0.9786079078912735, + "num_tokens": 9704286.0, + "step": 1100 + }, + { + "entropy": 1.3577526807785034, + "epoch": 3.97737556561086, + "grad_norm": 0.48044729232788086, + "learning_rate": 0.0001757773871775768, + "loss": 0.0564, + "mean_token_accuracy": 0.9776984602212906, + "num_tokens": 9712668.0, + "step": 1101 + }, + { + "entropy": 1.309740036725998, + "epoch": 3.9809954751131222, + "grad_norm": 0.8556230664253235, + "learning_rate": 0.00017537916349276303, + "loss": 0.2013, + "mean_token_accuracy": 0.9610435962677002, + "num_tokens": 9722042.0, + "step": 1102 + }, + { + "entropy": 1.375863939523697, + "epoch": 3.9846153846153847, + "grad_norm": 0.4123291075229645, + "learning_rate": 0.00017498134115925327, + "loss": 0.0208, + "mean_token_accuracy": 0.9937012493610382, + "num_tokens": 9730420.0, + "step": 1103 + }, + { + "entropy": 1.3407017588615417, + "epoch": 3.988235294117647, + "grad_norm": 0.3886757493019104, + "learning_rate": 0.0001745839216880275, + "loss": 0.0223, + "mean_token_accuracy": 0.9922950863838196, + "num_tokens": 9739292.0, + "step": 1104 + }, + { + "entropy": 1.2783922851085663, + "epoch": 3.9918552036199095, + "grad_norm": 0.39245131611824036, + "learning_rate": 0.00017418690658853542, + "loss": 0.0607, + "mean_token_accuracy": 0.9823390543460846, + "num_tokens": 9748635.0, + "step": 1105 + }, + { + "entropy": 1.3240907490253448, + "epoch": 3.995475113122172, + "grad_norm": 0.925537645816803, + "learning_rate": 0.00017379029736869103, + "loss": 0.1301, + "mean_token_accuracy": 0.9688823968172073, + "num_tokens": 9757450.0, + "step": 1106 + }, + { + "entropy": 1.2996585667133331, + "epoch": 3.9990950226244344, + "grad_norm": 0.5589770674705505, + "learning_rate": 0.00017339409553486675, + "loss": 0.0833, + "mean_token_accuracy": 0.9765840470790863, + "num_tokens": 9766204.0, + "step": 1107 + }, + { + "entropy": 1.3336817026138306, + "epoch": 4.0, + "grad_norm": 1.8711317777633667, + "learning_rate": 0.00017299830259188753, + "loss": 0.0647, + "mean_token_accuracy": 0.9789473414421082, + "num_tokens": 9766900.0, + "step": 1108 + }, + { + "epoch": 4.0, + "eval_entropy": 1.3279241662684496, + "eval_loss": 0.13373936712741852, + "eval_mean_token_accuracy": 0.9691996351490176, + "eval_num_tokens": 9766900.0, + "eval_runtime": 116.1625, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 1.059, + "step": 1108 + }, + { + "entropy": 1.293619453907013, + "epoch": 4.003619909502262, + "grad_norm": 0.3031173050403595, + "learning_rate": 0.0001726029200430255, + "loss": 0.0413, + "mean_token_accuracy": 0.9907345324754715, + "num_tokens": 9775499.0, + "step": 1109 + }, + { + "entropy": 1.278919130563736, + "epoch": 4.007239819004525, + "grad_norm": 0.4092402160167694, + "learning_rate": 0.00017220794938999388, + "loss": 0.0471, + "mean_token_accuracy": 0.9856289625167847, + "num_tokens": 9784864.0, + "step": 1110 + }, + { + "entropy": 1.2768179178237915, + "epoch": 4.010859728506787, + "grad_norm": 0.4138973355293274, + "learning_rate": 0.0001718133921329416, + "loss": 0.0487, + "mean_token_accuracy": 0.9872913360595703, + "num_tokens": 9794077.0, + "step": 1111 + }, + { + "entropy": 1.2907516956329346, + "epoch": 4.01447963800905, + "grad_norm": 0.433987021446228, + "learning_rate": 0.0001714192497704474, + "loss": 0.0458, + "mean_token_accuracy": 0.9867971241474152, + "num_tokens": 9802781.0, + "step": 1112 + }, + { + "entropy": 1.2788557410240173, + "epoch": 4.018099547511312, + "grad_norm": 0.456443727016449, + "learning_rate": 0.0001710255237995142, + "loss": 0.054, + "mean_token_accuracy": 0.9820699095726013, + "num_tokens": 9811440.0, + "step": 1113 + }, + { + "entropy": 1.2931778728961945, + "epoch": 4.021719457013575, + "grad_norm": 0.5304563641548157, + "learning_rate": 0.00017063221571556348, + "loss": 0.0663, + "mean_token_accuracy": 0.9788531064987183, + "num_tokens": 9820394.0, + "step": 1114 + }, + { + "entropy": 1.251348078250885, + "epoch": 4.025339366515837, + "grad_norm": 0.32978400588035583, + "learning_rate": 0.00017023932701242932, + "loss": 0.0201, + "mean_token_accuracy": 0.9935282766819, + "num_tokens": 9829196.0, + "step": 1115 + }, + { + "entropy": 1.3026653826236725, + "epoch": 4.0289592760180994, + "grad_norm": 0.4994417726993561, + "learning_rate": 0.0001698468591823532, + "loss": 0.0546, + "mean_token_accuracy": 0.9885783791542053, + "num_tokens": 9838079.0, + "step": 1116 + }, + { + "entropy": 1.2028016149997711, + "epoch": 4.032579185520362, + "grad_norm": 0.3469240963459015, + "learning_rate": 0.00016945481371597793, + "loss": 0.0414, + "mean_token_accuracy": 0.9887481927871704, + "num_tokens": 9847654.0, + "step": 1117 + }, + { + "entropy": 1.234688937664032, + "epoch": 4.036199095022624, + "grad_norm": 0.35243600606918335, + "learning_rate": 0.00016906319210234218, + "loss": 0.037, + "mean_token_accuracy": 0.9897292852401733, + "num_tokens": 9856599.0, + "step": 1118 + }, + { + "entropy": 1.2084547579288483, + "epoch": 4.039819004524887, + "grad_norm": 0.31716811656951904, + "learning_rate": 0.00016867199582887464, + "loss": 0.0385, + "mean_token_accuracy": 0.9857838749885559, + "num_tokens": 9865697.0, + "step": 1119 + }, + { + "entropy": 1.2915122210979462, + "epoch": 4.043438914027149, + "grad_norm": 0.38569268584251404, + "learning_rate": 0.00016828122638138876, + "loss": 0.0208, + "mean_token_accuracy": 0.9932724088430405, + "num_tokens": 9874265.0, + "step": 1120 + }, + { + "entropy": 1.2242113649845123, + "epoch": 4.047058823529412, + "grad_norm": 0.5431196689605713, + "learning_rate": 0.0001678908852440766, + "loss": 0.0641, + "mean_token_accuracy": 0.9854831397533417, + "num_tokens": 9883597.0, + "step": 1121 + }, + { + "entropy": 1.256748080253601, + "epoch": 4.050678733031674, + "grad_norm": 0.5003210306167603, + "learning_rate": 0.00016750097389950358, + "loss": 0.0391, + "mean_token_accuracy": 0.9875418394804001, + "num_tokens": 9892067.0, + "step": 1122 + }, + { + "entropy": 1.1860899925231934, + "epoch": 4.0542986425339365, + "grad_norm": 0.35398924350738525, + "learning_rate": 0.00016711149382860266, + "loss": 0.0287, + "mean_token_accuracy": 0.9883342385292053, + "num_tokens": 9901411.0, + "step": 1123 + }, + { + "entropy": 1.347067803144455, + "epoch": 4.057918552036199, + "grad_norm": 0.5562195777893066, + "learning_rate": 0.00016672244651066883, + "loss": 0.045, + "mean_token_accuracy": 0.987530916929245, + "num_tokens": 9909638.0, + "step": 1124 + }, + { + "entropy": 1.2423038482666016, + "epoch": 4.061538461538461, + "grad_norm": 0.38534653186798096, + "learning_rate": 0.00016633383342335331, + "loss": 0.0301, + "mean_token_accuracy": 0.991975411772728, + "num_tokens": 9918264.0, + "step": 1125 + }, + { + "entropy": 1.2479912340641022, + "epoch": 4.065158371040724, + "grad_norm": 0.5592287182807922, + "learning_rate": 0.00016594565604265816, + "loss": 0.0598, + "mean_token_accuracy": 0.9825469106435776, + "num_tokens": 9926721.0, + "step": 1126 + }, + { + "entropy": 1.176967740058899, + "epoch": 4.068778280542986, + "grad_norm": 0.4851750433444977, + "learning_rate": 0.0001655579158429307, + "loss": 0.0481, + "mean_token_accuracy": 0.9884417057037354, + "num_tokens": 9936006.0, + "step": 1127 + }, + { + "entropy": 1.2568339705467224, + "epoch": 4.072398190045249, + "grad_norm": 0.3245483934879303, + "learning_rate": 0.00016517061429685738, + "loss": 0.0183, + "mean_token_accuracy": 0.9941931664943695, + "num_tokens": 9944541.0, + "step": 1128 + }, + { + "entropy": 1.192540168762207, + "epoch": 4.076018099547511, + "grad_norm": 0.529319703578949, + "learning_rate": 0.00016478375287545886, + "loss": 0.0641, + "mean_token_accuracy": 0.9852915108203888, + "num_tokens": 9953279.0, + "step": 1129 + }, + { + "entropy": 1.216693490743637, + "epoch": 4.0796380090497735, + "grad_norm": 0.5261855721473694, + "learning_rate": 0.00016439733304808436, + "loss": 0.044, + "mean_token_accuracy": 0.9879221767187119, + "num_tokens": 9962236.0, + "step": 1130 + }, + { + "entropy": 1.2447461783885956, + "epoch": 4.083257918552036, + "grad_norm": 0.667833149433136, + "learning_rate": 0.0001640113562824054, + "loss": 0.0361, + "mean_token_accuracy": 0.987928569316864, + "num_tokens": 9970769.0, + "step": 1131 + }, + { + "entropy": 1.1653542816638947, + "epoch": 4.086877828054298, + "grad_norm": 0.42759764194488525, + "learning_rate": 0.00016362582404441084, + "loss": 0.0342, + "mean_token_accuracy": 0.9896238744258881, + "num_tokens": 9979647.0, + "step": 1132 + }, + { + "entropy": 1.2345812320709229, + "epoch": 4.090497737556561, + "grad_norm": 0.850532591342926, + "learning_rate": 0.00016324073779840165, + "loss": 0.0528, + "mean_token_accuracy": 0.9796405136585236, + "num_tokens": 9988323.0, + "step": 1133 + }, + { + "entropy": 1.2570472955703735, + "epoch": 4.094117647058823, + "grad_norm": 0.666858971118927, + "learning_rate": 0.00016285609900698413, + "loss": 0.0377, + "mean_token_accuracy": 0.9860774129629135, + "num_tokens": 9996967.0, + "step": 1134 + }, + { + "entropy": 1.1051703989505768, + "epoch": 4.097737556561086, + "grad_norm": 0.4458142817020416, + "learning_rate": 0.0001624719091310654, + "loss": 0.0504, + "mean_token_accuracy": 0.9836345016956329, + "num_tokens": 10006676.0, + "step": 1135 + }, + { + "entropy": 1.2148622274398804, + "epoch": 4.101357466063348, + "grad_norm": 0.7275047898292542, + "learning_rate": 0.0001620881696298478, + "loss": 0.0539, + "mean_token_accuracy": 0.9839228391647339, + "num_tokens": 10015437.0, + "step": 1136 + }, + { + "entropy": 1.2343271374702454, + "epoch": 4.1049773755656105, + "grad_norm": 0.4777041971683502, + "learning_rate": 0.00016170488196082285, + "loss": 0.0441, + "mean_token_accuracy": 0.9889576286077499, + "num_tokens": 10024075.0, + "step": 1137 + }, + { + "entropy": 1.2086644172668457, + "epoch": 4.108597285067873, + "grad_norm": 0.4287410080432892, + "learning_rate": 0.00016132204757976563, + "loss": 0.0327, + "mean_token_accuracy": 0.9914772063493729, + "num_tokens": 10032954.0, + "step": 1138 + }, + { + "entropy": 1.3105964958667755, + "epoch": 4.112217194570135, + "grad_norm": 0.627275288105011, + "learning_rate": 0.0001609396679407303, + "loss": 0.2148, + "mean_token_accuracy": 0.9677064567804337, + "num_tokens": 10041500.0, + "step": 1139 + }, + { + "entropy": 1.213149219751358, + "epoch": 4.115837104072398, + "grad_norm": 0.545829713344574, + "learning_rate": 0.00016055774449604337, + "loss": 0.0537, + "mean_token_accuracy": 0.9844522625207901, + "num_tokens": 10050143.0, + "step": 1140 + }, + { + "entropy": 1.2647437453269958, + "epoch": 4.11945701357466, + "grad_norm": 0.3859199285507202, + "learning_rate": 0.00016017627869629853, + "loss": 0.0211, + "mean_token_accuracy": 0.9922139197587967, + "num_tokens": 10058737.0, + "step": 1141 + }, + { + "entropy": 1.1950034499168396, + "epoch": 4.123076923076923, + "grad_norm": 0.5298987030982971, + "learning_rate": 0.00015979527199035172, + "loss": 0.0476, + "mean_token_accuracy": 0.9850315600633621, + "num_tokens": 10067294.0, + "step": 1142 + }, + { + "entropy": 1.2080510556697845, + "epoch": 4.126696832579185, + "grad_norm": 1.1342438459396362, + "learning_rate": 0.00015941472582531505, + "loss": 0.0548, + "mean_token_accuracy": 0.9844682365655899, + "num_tokens": 10076175.0, + "step": 1143 + }, + { + "entropy": 1.1458761394023895, + "epoch": 4.130316742081448, + "grad_norm": 0.4891660213470459, + "learning_rate": 0.00015903464164655103, + "loss": 0.0612, + "mean_token_accuracy": 0.9851740747690201, + "num_tokens": 10085549.0, + "step": 1144 + }, + { + "entropy": 1.1645109951496124, + "epoch": 4.133936651583711, + "grad_norm": 0.47334766387939453, + "learning_rate": 0.00015865502089766807, + "loss": 0.0464, + "mean_token_accuracy": 0.9830625057220459, + "num_tokens": 10095378.0, + "step": 1145 + }, + { + "entropy": 1.220601737499237, + "epoch": 4.137556561085973, + "grad_norm": 0.2870275378227234, + "learning_rate": 0.00015827586502051405, + "loss": 0.0117, + "mean_token_accuracy": 0.9957176744937897, + "num_tokens": 10104010.0, + "step": 1146 + }, + { + "entropy": 1.2811335325241089, + "epoch": 4.141176470588236, + "grad_norm": 0.80066978931427, + "learning_rate": 0.00015789717545517136, + "loss": 0.0834, + "mean_token_accuracy": 0.9741235673427582, + "num_tokens": 10112986.0, + "step": 1147 + }, + { + "entropy": 1.2601549923419952, + "epoch": 4.144796380090498, + "grad_norm": 0.5500946640968323, + "learning_rate": 0.00015751895363995118, + "loss": 0.0322, + "mean_token_accuracy": 0.9874281883239746, + "num_tokens": 10121704.0, + "step": 1148 + }, + { + "entropy": 1.1824783682823181, + "epoch": 4.1484162895927605, + "grad_norm": 0.33207061886787415, + "learning_rate": 0.00015714120101138824, + "loss": 0.0276, + "mean_token_accuracy": 0.9928829818964005, + "num_tokens": 10130838.0, + "step": 1149 + }, + { + "entropy": 1.2213251888751984, + "epoch": 4.152036199095023, + "grad_norm": 0.5984644889831543, + "learning_rate": 0.00015676391900423513, + "loss": 0.0432, + "mean_token_accuracy": 0.988958552479744, + "num_tokens": 10139651.0, + "step": 1150 + }, + { + "entropy": 1.233760267496109, + "epoch": 4.155656108597285, + "grad_norm": 0.38205328583717346, + "learning_rate": 0.00015638710905145693, + "loss": 0.0327, + "mean_token_accuracy": 0.9904181510210037, + "num_tokens": 10148702.0, + "step": 1151 + }, + { + "entropy": 1.2735399901866913, + "epoch": 4.159276018099548, + "grad_norm": 0.4505186080932617, + "learning_rate": 0.00015601077258422597, + "loss": 0.0343, + "mean_token_accuracy": 0.9863469153642654, + "num_tokens": 10157322.0, + "step": 1152 + }, + { + "entropy": 1.248571664094925, + "epoch": 4.16289592760181, + "grad_norm": 0.5458792448043823, + "learning_rate": 0.00015563491103191604, + "loss": 0.0586, + "mean_token_accuracy": 0.9839989989995956, + "num_tokens": 10166116.0, + "step": 1153 + }, + { + "entropy": 1.2250197231769562, + "epoch": 4.166515837104073, + "grad_norm": 0.4827875792980194, + "learning_rate": 0.00015525952582209725, + "loss": 0.0432, + "mean_token_accuracy": 0.9888117164373398, + "num_tokens": 10174949.0, + "step": 1154 + }, + { + "entropy": 1.2532283961772919, + "epoch": 4.170135746606335, + "grad_norm": 0.6164746284484863, + "learning_rate": 0.0001548846183805304, + "loss": 0.0489, + "mean_token_accuracy": 0.989223524928093, + "num_tokens": 10183649.0, + "step": 1155 + }, + { + "entropy": 1.3029357194900513, + "epoch": 4.173755656108598, + "grad_norm": 0.4397593140602112, + "learning_rate": 0.00015451019013116186, + "loss": 0.0239, + "mean_token_accuracy": 0.9899367988109589, + "num_tokens": 10192293.0, + "step": 1156 + }, + { + "entropy": 1.3099361062049866, + "epoch": 4.17737556561086, + "grad_norm": 0.5233256816864014, + "learning_rate": 0.00015413624249611773, + "loss": 0.0361, + "mean_token_accuracy": 0.9909869581460953, + "num_tokens": 10200972.0, + "step": 1157 + }, + { + "entropy": 1.332322210073471, + "epoch": 4.180995475113122, + "grad_norm": 0.40877828001976013, + "learning_rate": 0.00015376277689569884, + "loss": 0.0378, + "mean_token_accuracy": 0.9867051988840103, + "num_tokens": 10209383.0, + "step": 1158 + }, + { + "entropy": 1.2286882102489471, + "epoch": 4.184615384615385, + "grad_norm": 0.4264864921569824, + "learning_rate": 0.00015338979474837508, + "loss": 0.0349, + "mean_token_accuracy": 0.9909752011299133, + "num_tokens": 10218431.0, + "step": 1159 + }, + { + "entropy": 1.3134422600269318, + "epoch": 4.188235294117647, + "grad_norm": 0.5659143328666687, + "learning_rate": 0.00015301729747078027, + "loss": 0.0472, + "mean_token_accuracy": 0.9854199290275574, + "num_tokens": 10227234.0, + "step": 1160 + }, + { + "entropy": 1.2364612519741058, + "epoch": 4.19185520361991, + "grad_norm": 0.556576132774353, + "learning_rate": 0.00015264528647770644, + "loss": 0.0572, + "mean_token_accuracy": 0.9870803952217102, + "num_tokens": 10236143.0, + "step": 1161 + }, + { + "entropy": 1.3200373947620392, + "epoch": 4.195475113122172, + "grad_norm": 0.5832968950271606, + "learning_rate": 0.0001522737631820988, + "loss": 0.0596, + "mean_token_accuracy": 0.9803946912288666, + "num_tokens": 10244533.0, + "step": 1162 + }, + { + "entropy": 1.2263735830783844, + "epoch": 4.199095022624435, + "grad_norm": 0.5366385579109192, + "learning_rate": 0.00015190272899505024, + "loss": 0.0557, + "mean_token_accuracy": 0.9855044633150101, + "num_tokens": 10253397.0, + "step": 1163 + }, + { + "entropy": 1.2398187220096588, + "epoch": 4.202714932126697, + "grad_norm": 0.3928399384021759, + "learning_rate": 0.0001515321853257958, + "loss": 0.0353, + "mean_token_accuracy": 0.9881556481122971, + "num_tokens": 10262218.0, + "step": 1164 + }, + { + "entropy": 1.2799296081066132, + "epoch": 4.206334841628959, + "grad_norm": 0.34111514687538147, + "learning_rate": 0.00015116213358170756, + "loss": 0.0205, + "mean_token_accuracy": 0.9952614158391953, + "num_tokens": 10270643.0, + "step": 1165 + }, + { + "entropy": 1.2533689439296722, + "epoch": 4.209954751131222, + "grad_norm": 0.6038467884063721, + "learning_rate": 0.00015079257516828923, + "loss": 0.0364, + "mean_token_accuracy": 0.9885422587394714, + "num_tokens": 10279192.0, + "step": 1166 + }, + { + "entropy": 1.286198616027832, + "epoch": 4.213574660633484, + "grad_norm": 0.3359139561653137, + "learning_rate": 0.00015042351148917074, + "loss": 0.0169, + "mean_token_accuracy": 0.9941065609455109, + "num_tokens": 10287955.0, + "step": 1167 + }, + { + "entropy": 1.246151328086853, + "epoch": 4.217194570135747, + "grad_norm": 0.5997951030731201, + "learning_rate": 0.00015005494394610306, + "loss": 0.046, + "mean_token_accuracy": 0.98799067735672, + "num_tokens": 10296726.0, + "step": 1168 + }, + { + "entropy": 1.1993427872657776, + "epoch": 4.220814479638009, + "grad_norm": 0.48044100403785706, + "learning_rate": 0.00014968687393895243, + "loss": 0.037, + "mean_token_accuracy": 0.9872793555259705, + "num_tokens": 10306144.0, + "step": 1169 + }, + { + "entropy": 1.283030241727829, + "epoch": 4.224434389140272, + "grad_norm": 0.5447496175765991, + "learning_rate": 0.00014931930286569606, + "loss": 0.0512, + "mean_token_accuracy": 0.9839747399091721, + "num_tokens": 10314787.0, + "step": 1170 + }, + { + "entropy": 1.289113700389862, + "epoch": 4.228054298642534, + "grad_norm": 0.4241376221179962, + "learning_rate": 0.00014895223212241547, + "loss": 0.0337, + "mean_token_accuracy": 0.9877157956361771, + "num_tokens": 10323498.0, + "step": 1171 + }, + { + "entropy": 1.2173149585723877, + "epoch": 4.2316742081447964, + "grad_norm": 0.4567115604877472, + "learning_rate": 0.00014858566310329204, + "loss": 0.0308, + "mean_token_accuracy": 0.9885731935501099, + "num_tokens": 10332577.0, + "step": 1172 + }, + { + "entropy": 1.2338224053382874, + "epoch": 4.235294117647059, + "grad_norm": 0.7391862869262695, + "learning_rate": 0.00014821959720060196, + "loss": 0.0566, + "mean_token_accuracy": 0.9849485456943512, + "num_tokens": 10341145.0, + "step": 1173 + }, + { + "entropy": 1.2072623670101166, + "epoch": 4.238914027149321, + "grad_norm": 0.4814695715904236, + "learning_rate": 0.00014785403580470983, + "loss": 0.0337, + "mean_token_accuracy": 0.9873705059289932, + "num_tokens": 10349995.0, + "step": 1174 + }, + { + "entropy": 1.2393099963665009, + "epoch": 4.242533936651584, + "grad_norm": 0.9794650673866272, + "learning_rate": 0.0001474889803040645, + "loss": 0.1513, + "mean_token_accuracy": 0.970646321773529, + "num_tokens": 10358839.0, + "step": 1175 + }, + { + "entropy": 1.2389378249645233, + "epoch": 4.246153846153846, + "grad_norm": 0.5181885361671448, + "learning_rate": 0.00014712443208519352, + "loss": 0.0486, + "mean_token_accuracy": 0.9840831756591797, + "num_tokens": 10367605.0, + "step": 1176 + }, + { + "entropy": 1.179212898015976, + "epoch": 4.249773755656109, + "grad_norm": 0.36691412329673767, + "learning_rate": 0.0001467603925326972, + "loss": 0.0328, + "mean_token_accuracy": 0.9896590262651443, + "num_tokens": 10377166.0, + "step": 1177 + }, + { + "entropy": 1.1962746381759644, + "epoch": 4.253393665158371, + "grad_norm": 0.44420450925827026, + "learning_rate": 0.00014639686302924418, + "loss": 0.0347, + "mean_token_accuracy": 0.9883467555046082, + "num_tokens": 10386492.0, + "step": 1178 + }, + { + "entropy": 1.280052810907364, + "epoch": 4.2570135746606335, + "grad_norm": 0.7159825563430786, + "learning_rate": 0.000146033844955566, + "loss": 0.0478, + "mean_token_accuracy": 0.9879229664802551, + "num_tokens": 10395177.0, + "step": 1179 + }, + { + "entropy": 1.201603651046753, + "epoch": 4.260633484162896, + "grad_norm": 0.5321839451789856, + "learning_rate": 0.00014567133969045157, + "loss": 0.0549, + "mean_token_accuracy": 0.9802237451076508, + "num_tokens": 10404061.0, + "step": 1180 + }, + { + "entropy": 1.2947501838207245, + "epoch": 4.264253393665158, + "grad_norm": 0.4968154728412628, + "learning_rate": 0.00014530934861074193, + "loss": 0.0483, + "mean_token_accuracy": 0.9880443960428238, + "num_tokens": 10412500.0, + "step": 1181 + }, + { + "entropy": 1.3257293999195099, + "epoch": 4.267873303167421, + "grad_norm": 0.5405517220497131, + "learning_rate": 0.00014494787309132537, + "loss": 0.0334, + "mean_token_accuracy": 0.9880732297897339, + "num_tokens": 10420905.0, + "step": 1182 + }, + { + "entropy": 1.1579481065273285, + "epoch": 4.271493212669683, + "grad_norm": 0.4960266053676605, + "learning_rate": 0.00014458691450513212, + "loss": 0.0459, + "mean_token_accuracy": 0.9850940555334091, + "num_tokens": 10430442.0, + "step": 1183 + }, + { + "entropy": 1.259652554988861, + "epoch": 4.275113122171946, + "grad_norm": 0.5519370436668396, + "learning_rate": 0.00014422647422312874, + "loss": 0.0451, + "mean_token_accuracy": 0.9860022515058517, + "num_tokens": 10439289.0, + "step": 1184 + }, + { + "entropy": 1.1983542442321777, + "epoch": 4.278733031674208, + "grad_norm": 0.46854233741760254, + "learning_rate": 0.00014386655361431336, + "loss": 0.0378, + "mean_token_accuracy": 0.984845370054245, + "num_tokens": 10448374.0, + "step": 1185 + }, + { + "entropy": 1.2637499272823334, + "epoch": 4.2823529411764705, + "grad_norm": 0.41014033555984497, + "learning_rate": 0.00014350715404571045, + "loss": 0.0276, + "mean_token_accuracy": 0.9908969849348068, + "num_tokens": 10457150.0, + "step": 1186 + }, + { + "entropy": 1.185476541519165, + "epoch": 4.285972850678733, + "grad_norm": 0.36182934045791626, + "learning_rate": 0.00014314827688236527, + "loss": 0.0232, + "mean_token_accuracy": 0.9917554408311844, + "num_tokens": 10466114.0, + "step": 1187 + }, + { + "entropy": 1.2526941895484924, + "epoch": 4.289592760180995, + "grad_norm": 0.42682337760925293, + "learning_rate": 0.00014278992348733897, + "loss": 0.0258, + "mean_token_accuracy": 0.9892576783895493, + "num_tokens": 10474841.0, + "step": 1188 + }, + { + "entropy": 1.2896224856376648, + "epoch": 4.293212669683258, + "grad_norm": 0.6532279253005981, + "learning_rate": 0.00014243209522170366, + "loss": 0.0403, + "mean_token_accuracy": 0.9877428412437439, + "num_tokens": 10483179.0, + "step": 1189 + }, + { + "entropy": 1.2147268652915955, + "epoch": 4.29683257918552, + "grad_norm": 0.5174675583839417, + "learning_rate": 0.0001420747934445364, + "loss": 0.049, + "mean_token_accuracy": 0.9841071367263794, + "num_tokens": 10492012.0, + "step": 1190 + }, + { + "entropy": 1.2130130529403687, + "epoch": 4.300452488687783, + "grad_norm": 0.3487169146537781, + "learning_rate": 0.00014171801951291495, + "loss": 0.0379, + "mean_token_accuracy": 0.987470880150795, + "num_tokens": 10500781.0, + "step": 1191 + }, + { + "entropy": 1.2114128768444061, + "epoch": 4.304072398190045, + "grad_norm": 0.3877721428871155, + "learning_rate": 0.00014136177478191232, + "loss": 0.0366, + "mean_token_accuracy": 0.9868324846029282, + "num_tokens": 10509827.0, + "step": 1192 + }, + { + "entropy": 1.2727918028831482, + "epoch": 4.3076923076923075, + "grad_norm": 0.4810413122177124, + "learning_rate": 0.00014100606060459136, + "loss": 0.0264, + "mean_token_accuracy": 0.9923188835382462, + "num_tokens": 10518320.0, + "step": 1193 + }, + { + "entropy": 1.2287248075008392, + "epoch": 4.31131221719457, + "grad_norm": 0.4871313273906708, + "learning_rate": 0.0001406508783319996, + "loss": 0.0291, + "mean_token_accuracy": 0.9893288463354111, + "num_tokens": 10527372.0, + "step": 1194 + }, + { + "entropy": 1.2256539463996887, + "epoch": 4.314932126696832, + "grad_norm": 0.4660762846469879, + "learning_rate": 0.00014029622931316488, + "loss": 0.0442, + "mean_token_accuracy": 0.9856550842523575, + "num_tokens": 10536221.0, + "step": 1195 + }, + { + "entropy": 1.220929890871048, + "epoch": 4.318552036199095, + "grad_norm": 0.5266750454902649, + "learning_rate": 0.00013994211489508937, + "loss": 0.045, + "mean_token_accuracy": 0.9869575053453445, + "num_tokens": 10544855.0, + "step": 1196 + }, + { + "entropy": 1.2149785161018372, + "epoch": 4.322171945701357, + "grad_norm": 0.38891637325286865, + "learning_rate": 0.00013958853642274445, + "loss": 0.0279, + "mean_token_accuracy": 0.9925510138273239, + "num_tokens": 10553663.0, + "step": 1197 + }, + { + "entropy": 1.2534555792808533, + "epoch": 4.32579185520362, + "grad_norm": 0.504973828792572, + "learning_rate": 0.0001392354952390665, + "loss": 0.0265, + "mean_token_accuracy": 0.9886986464262009, + "num_tokens": 10562210.0, + "step": 1198 + }, + { + "entropy": 1.1534670293331146, + "epoch": 4.329411764705882, + "grad_norm": 0.3674943745136261, + "learning_rate": 0.00013888299268495095, + "loss": 0.0287, + "mean_token_accuracy": 0.9921697527170181, + "num_tokens": 10571681.0, + "step": 1199 + }, + { + "entropy": 1.1742701530456543, + "epoch": 4.3330316742081445, + "grad_norm": 0.5762848854064941, + "learning_rate": 0.0001385310300992471, + "loss": 0.0652, + "mean_token_accuracy": 0.9827972054481506, + "num_tokens": 10580931.0, + "step": 1200 + }, + { + "entropy": 1.224473923444748, + "epoch": 4.336651583710407, + "grad_norm": 0.3944694697856903, + "learning_rate": 0.00013817960881875406, + "loss": 0.0371, + "mean_token_accuracy": 0.9884412586688995, + "num_tokens": 10589820.0, + "step": 1201 + }, + { + "entropy": 1.1943216919898987, + "epoch": 4.340271493212669, + "grad_norm": 0.44515159726142883, + "learning_rate": 0.0001378287301782145, + "loss": 0.0354, + "mean_token_accuracy": 0.9919027835130692, + "num_tokens": 10599295.0, + "step": 1202 + }, + { + "entropy": 1.2744194567203522, + "epoch": 4.343891402714932, + "grad_norm": 0.5868192911148071, + "learning_rate": 0.0001374783955103102, + "loss": 0.0329, + "mean_token_accuracy": 0.990682065486908, + "num_tokens": 10607671.0, + "step": 1203 + }, + { + "entropy": 1.2263163924217224, + "epoch": 4.347511312217194, + "grad_norm": 0.7657507658004761, + "learning_rate": 0.00013712860614565687, + "loss": 0.0557, + "mean_token_accuracy": 0.9864864498376846, + "num_tokens": 10616269.0, + "step": 1204 + }, + { + "entropy": 1.1996283829212189, + "epoch": 4.351131221719457, + "grad_norm": 0.4356515407562256, + "learning_rate": 0.00013677936341279913, + "loss": 0.034, + "mean_token_accuracy": 0.9890493154525757, + "num_tokens": 10624896.0, + "step": 1205 + }, + { + "entropy": 1.2461954951286316, + "epoch": 4.354751131221719, + "grad_norm": 0.4709055721759796, + "learning_rate": 0.0001364306686382054, + "loss": 0.0372, + "mean_token_accuracy": 0.9871190786361694, + "num_tokens": 10633634.0, + "step": 1206 + }, + { + "entropy": 1.2780955731868744, + "epoch": 4.3583710407239815, + "grad_norm": 0.4301148056983948, + "learning_rate": 0.00013608252314626284, + "loss": 0.0302, + "mean_token_accuracy": 0.990274652838707, + "num_tokens": 10641899.0, + "step": 1207 + }, + { + "entropy": 1.1826459169387817, + "epoch": 4.361990950226244, + "grad_norm": 0.42350223660469055, + "learning_rate": 0.00013573492825927238, + "loss": 0.0359, + "mean_token_accuracy": 0.9923158586025238, + "num_tokens": 10650903.0, + "step": 1208 + }, + { + "entropy": 1.2023819386959076, + "epoch": 4.365610859728506, + "grad_norm": 0.5379685759544373, + "learning_rate": 0.00013538788529744375, + "loss": 0.0416, + "mean_token_accuracy": 0.990043118596077, + "num_tokens": 10659772.0, + "step": 1209 + }, + { + "entropy": 1.2026404440402985, + "epoch": 4.36923076923077, + "grad_norm": 0.4013594388961792, + "learning_rate": 0.00013504139557889033, + "loss": 0.0287, + "mean_token_accuracy": 0.9898358583450317, + "num_tokens": 10668661.0, + "step": 1210 + }, + { + "entropy": 1.1808496117591858, + "epoch": 4.372850678733032, + "grad_norm": 0.4315294623374939, + "learning_rate": 0.0001346954604196242, + "loss": 0.0345, + "mean_token_accuracy": 0.9906547367572784, + "num_tokens": 10677439.0, + "step": 1211 + }, + { + "entropy": 1.233126848936081, + "epoch": 4.376470588235295, + "grad_norm": 0.46303990483283997, + "learning_rate": 0.00013435008113355125, + "loss": 0.038, + "mean_token_accuracy": 0.9871664345264435, + "num_tokens": 10686193.0, + "step": 1212 + }, + { + "entropy": 1.1450912654399872, + "epoch": 4.380090497737557, + "grad_norm": 0.39165055751800537, + "learning_rate": 0.0001340052590324659, + "loss": 0.0268, + "mean_token_accuracy": 0.9913052469491959, + "num_tokens": 10695309.0, + "step": 1213 + }, + { + "entropy": 1.213069200515747, + "epoch": 4.383710407239819, + "grad_norm": 0.6870453357696533, + "learning_rate": 0.00013366099542604657, + "loss": 0.0928, + "mean_token_accuracy": 0.9731069356203079, + "num_tokens": 10704279.0, + "step": 1214 + }, + { + "entropy": 1.2083907425403595, + "epoch": 4.387330316742082, + "grad_norm": 0.5568958520889282, + "learning_rate": 0.00013331729162185021, + "loss": 0.0457, + "mean_token_accuracy": 0.9876127988100052, + "num_tokens": 10712689.0, + "step": 1215 + }, + { + "entropy": 1.1905820965766907, + "epoch": 4.390950226244344, + "grad_norm": 0.677911639213562, + "learning_rate": 0.00013297414892530775, + "loss": 0.0578, + "mean_token_accuracy": 0.9796578139066696, + "num_tokens": 10721429.0, + "step": 1216 + }, + { + "entropy": 1.1438561081886292, + "epoch": 4.394570135746607, + "grad_norm": 0.3332656919956207, + "learning_rate": 0.00013263156863971883, + "loss": 0.0211, + "mean_token_accuracy": 0.9961917698383331, + "num_tokens": 10729998.0, + "step": 1217 + }, + { + "entropy": 1.2401678264141083, + "epoch": 4.398190045248869, + "grad_norm": 0.4719739854335785, + "learning_rate": 0.00013228955206624703, + "loss": 0.0205, + "mean_token_accuracy": 0.9942127913236618, + "num_tokens": 10738536.0, + "step": 1218 + }, + { + "entropy": 1.1324664056301117, + "epoch": 4.401809954751132, + "grad_norm": 0.44815942645072937, + "learning_rate": 0.0001319481005039149, + "loss": 0.0343, + "mean_token_accuracy": 0.9881940931081772, + "num_tokens": 10747845.0, + "step": 1219 + }, + { + "entropy": 1.226366937160492, + "epoch": 4.405429864253394, + "grad_norm": 0.53164142370224, + "learning_rate": 0.00013160721524959904, + "loss": 0.0301, + "mean_token_accuracy": 0.9902307987213135, + "num_tokens": 10756478.0, + "step": 1220 + }, + { + "entropy": 1.1090387403964996, + "epoch": 4.409049773755656, + "grad_norm": 0.5124022364616394, + "learning_rate": 0.00013126689759802504, + "loss": 0.0475, + "mean_token_accuracy": 0.9851136803627014, + "num_tokens": 10765981.0, + "step": 1221 + }, + { + "entropy": 1.224695086479187, + "epoch": 4.412669683257919, + "grad_norm": 0.5596076250076294, + "learning_rate": 0.00013092714884176262, + "loss": 0.0398, + "mean_token_accuracy": 0.9881332963705063, + "num_tokens": 10774937.0, + "step": 1222 + }, + { + "entropy": 1.1444518268108368, + "epoch": 4.416289592760181, + "grad_norm": 0.593393087387085, + "learning_rate": 0.00013058797027122108, + "loss": 0.0582, + "mean_token_accuracy": 0.986479327082634, + "num_tokens": 10784549.0, + "step": 1223 + }, + { + "entropy": 1.2317917943000793, + "epoch": 4.419909502262444, + "grad_norm": 0.5458840727806091, + "learning_rate": 0.00013024936317464366, + "loss": 0.0435, + "mean_token_accuracy": 0.9879386126995087, + "num_tokens": 10792945.0, + "step": 1224 + }, + { + "entropy": 1.1805288195610046, + "epoch": 4.423529411764706, + "grad_norm": 1.2501155138015747, + "learning_rate": 0.00012991132883810328, + "loss": 0.0468, + "mean_token_accuracy": 0.9834897071123123, + "num_tokens": 10801754.0, + "step": 1225 + }, + { + "entropy": 1.267040103673935, + "epoch": 4.427149321266969, + "grad_norm": 0.371787428855896, + "learning_rate": 0.0001295738685454976, + "loss": 0.0276, + "mean_token_accuracy": 0.9908801317214966, + "num_tokens": 10810153.0, + "step": 1226 + }, + { + "entropy": 1.1796127259731293, + "epoch": 4.430769230769231, + "grad_norm": 0.5949414968490601, + "learning_rate": 0.00012923698357854367, + "loss": 0.065, + "mean_token_accuracy": 0.9796924442052841, + "num_tokens": 10819188.0, + "step": 1227 + }, + { + "entropy": 1.1894229352474213, + "epoch": 4.4343891402714934, + "grad_norm": 0.5610167980194092, + "learning_rate": 0.00012890067521677343, + "loss": 0.0579, + "mean_token_accuracy": 0.9834547340869904, + "num_tokens": 10827890.0, + "step": 1228 + }, + { + "entropy": 1.193329244852066, + "epoch": 4.438009049773756, + "grad_norm": 0.5496513247489929, + "learning_rate": 0.00012856494473752919, + "loss": 0.0699, + "mean_token_accuracy": 0.9785247892141342, + "num_tokens": 10836838.0, + "step": 1229 + }, + { + "entropy": 1.203848272562027, + "epoch": 4.441628959276018, + "grad_norm": 0.4252963364124298, + "learning_rate": 0.00012822979341595785, + "loss": 0.0298, + "mean_token_accuracy": 0.9896065294742584, + "num_tokens": 10845834.0, + "step": 1230 + }, + { + "entropy": 1.207732379436493, + "epoch": 4.445248868778281, + "grad_norm": 0.517036497592926, + "learning_rate": 0.00012789522252500685, + "loss": 0.0638, + "mean_token_accuracy": 0.9777253717184067, + "num_tokens": 10855063.0, + "step": 1231 + }, + { + "entropy": 1.1849088370800018, + "epoch": 4.448868778280543, + "grad_norm": 0.2902492582798004, + "learning_rate": 0.0001275612333354193, + "loss": 0.019, + "mean_token_accuracy": 0.9935135394334793, + "num_tokens": 10863916.0, + "step": 1232 + }, + { + "entropy": 1.2052322030067444, + "epoch": 4.452488687782806, + "grad_norm": 0.39002323150634766, + "learning_rate": 0.00012722782711572852, + "loss": 0.0338, + "mean_token_accuracy": 0.9920508861541748, + "num_tokens": 10872704.0, + "step": 1233 + }, + { + "entropy": 1.206953376531601, + "epoch": 4.456108597285068, + "grad_norm": 0.4019605815410614, + "learning_rate": 0.00012689500513225372, + "loss": 0.0334, + "mean_token_accuracy": 0.993528202176094, + "num_tokens": 10881430.0, + "step": 1234 + }, + { + "entropy": 1.1898608207702637, + "epoch": 4.4597285067873305, + "grad_norm": 0.4211233854293823, + "learning_rate": 0.00012656276864909545, + "loss": 0.0279, + "mean_token_accuracy": 0.9896760433912277, + "num_tokens": 10890291.0, + "step": 1235 + }, + { + "entropy": 1.1882571280002594, + "epoch": 4.463348416289593, + "grad_norm": 0.6270121335983276, + "learning_rate": 0.00012623111892813018, + "loss": 0.067, + "mean_token_accuracy": 0.9778329133987427, + "num_tokens": 10899339.0, + "step": 1236 + }, + { + "entropy": 1.2167966961860657, + "epoch": 4.466968325791855, + "grad_norm": 0.5115417242050171, + "learning_rate": 0.00012590005722900558, + "loss": 0.0438, + "mean_token_accuracy": 0.9870314300060272, + "num_tokens": 10908084.0, + "step": 1237 + }, + { + "entropy": 1.2357282936573029, + "epoch": 4.470588235294118, + "grad_norm": 0.4048464894294739, + "learning_rate": 0.00012556958480913644, + "loss": 0.0211, + "mean_token_accuracy": 0.9938371032476425, + "num_tokens": 10916729.0, + "step": 1238 + }, + { + "entropy": 1.1960048377513885, + "epoch": 4.47420814479638, + "grad_norm": 0.519778847694397, + "learning_rate": 0.00012523970292369906, + "loss": 0.0352, + "mean_token_accuracy": 0.9902931302785873, + "num_tokens": 10925469.0, + "step": 1239 + }, + { + "entropy": 1.115236908197403, + "epoch": 4.477828054298643, + "grad_norm": 1.1697970628738403, + "learning_rate": 0.00012491041282562673, + "loss": 0.0516, + "mean_token_accuracy": 0.9826432168483734, + "num_tokens": 10935029.0, + "step": 1240 + }, + { + "entropy": 1.1740309596061707, + "epoch": 4.481447963800905, + "grad_norm": 0.5328389406204224, + "learning_rate": 0.00012458171576560541, + "loss": 0.0369, + "mean_token_accuracy": 0.9888178706169128, + "num_tokens": 10943983.0, + "step": 1241 + }, + { + "entropy": 1.2090014219284058, + "epoch": 4.4850678733031675, + "grad_norm": 0.3985499441623688, + "learning_rate": 0.0001242536129920684, + "loss": 0.024, + "mean_token_accuracy": 0.9895037710666656, + "num_tokens": 10952723.0, + "step": 1242 + }, + { + "entropy": 1.1451536118984222, + "epoch": 4.48868778280543, + "grad_norm": 0.4824701249599457, + "learning_rate": 0.00012392610575119164, + "loss": 0.0366, + "mean_token_accuracy": 0.9898587912321091, + "num_tokens": 10961935.0, + "step": 1243 + }, + { + "entropy": 1.256010115146637, + "epoch": 4.492307692307692, + "grad_norm": 0.9043450951576233, + "learning_rate": 0.00012359919528688959, + "loss": 0.1034, + "mean_token_accuracy": 0.9813459366559982, + "num_tokens": 10970679.0, + "step": 1244 + }, + { + "entropy": 1.1992116272449493, + "epoch": 4.495927601809955, + "grad_norm": 0.359037309885025, + "learning_rate": 0.00012327288284080977, + "loss": 0.0262, + "mean_token_accuracy": 0.9912933856248856, + "num_tokens": 10979305.0, + "step": 1245 + }, + { + "entropy": 1.157044231891632, + "epoch": 4.499547511312217, + "grad_norm": 0.6599268913269043, + "learning_rate": 0.00012294716965232847, + "loss": 0.0492, + "mean_token_accuracy": 0.9798050671815872, + "num_tokens": 10988385.0, + "step": 1246 + }, + { + "entropy": 1.2378050088882446, + "epoch": 4.50316742081448, + "grad_norm": 0.28043484687805176, + "learning_rate": 0.00012262205695854584, + "loss": 0.0139, + "mean_token_accuracy": 0.9976954162120819, + "num_tokens": 10996897.0, + "step": 1247 + }, + { + "entropy": 1.2076781392097473, + "epoch": 4.506787330316742, + "grad_norm": 0.5077516436576843, + "learning_rate": 0.0001222975459942814, + "loss": 0.0409, + "mean_token_accuracy": 0.9874622672796249, + "num_tokens": 11005528.0, + "step": 1248 + }, + { + "entropy": 1.1962661147117615, + "epoch": 4.5104072398190045, + "grad_norm": 0.33547911047935486, + "learning_rate": 0.00012197363799206908, + "loss": 0.0161, + "mean_token_accuracy": 0.9955142885446548, + "num_tokens": 11014356.0, + "step": 1249 + }, + { + "entropy": 1.1652462780475616, + "epoch": 4.514027149321267, + "grad_norm": 0.4315963685512543, + "learning_rate": 0.00012165033418215278, + "loss": 0.038, + "mean_token_accuracy": 0.986273393034935, + "num_tokens": 11023148.0, + "step": 1250 + }, + { + "entropy": 1.1507481038570404, + "epoch": 4.517647058823529, + "grad_norm": 0.5866735577583313, + "learning_rate": 0.00012132763579248157, + "loss": 0.0709, + "mean_token_accuracy": 0.9747334122657776, + "num_tokens": 11032698.0, + "step": 1251 + }, + { + "entropy": 1.1324940025806427, + "epoch": 4.521266968325792, + "grad_norm": 0.6004504561424255, + "learning_rate": 0.00012100554404870504, + "loss": 0.062, + "mean_token_accuracy": 0.9802116751670837, + "num_tokens": 11042191.0, + "step": 1252 + }, + { + "entropy": 1.2644992768764496, + "epoch": 4.524886877828054, + "grad_norm": 0.4573010504245758, + "learning_rate": 0.00012068406017416869, + "loss": 0.0454, + "mean_token_accuracy": 0.9845332503318787, + "num_tokens": 11050560.0, + "step": 1253 + }, + { + "entropy": 1.1976861953735352, + "epoch": 4.528506787330317, + "grad_norm": 0.5563387870788574, + "learning_rate": 0.00012036318538990926, + "loss": 0.0461, + "mean_token_accuracy": 0.988158106803894, + "num_tokens": 11059522.0, + "step": 1254 + }, + { + "entropy": 1.1570810675621033, + "epoch": 4.532126696832579, + "grad_norm": 0.32090041041374207, + "learning_rate": 0.00012004292091465011, + "loss": 0.0271, + "mean_token_accuracy": 0.9932841211557388, + "num_tokens": 11068647.0, + "step": 1255 + }, + { + "entropy": 1.1704442203044891, + "epoch": 4.5357466063348415, + "grad_norm": 0.4426371455192566, + "learning_rate": 0.00011972326796479646, + "loss": 0.0382, + "mean_token_accuracy": 0.991062343120575, + "num_tokens": 11077626.0, + "step": 1256 + }, + { + "entropy": 1.1977724432945251, + "epoch": 4.539366515837104, + "grad_norm": 0.37808912992477417, + "learning_rate": 0.00011940422775443095, + "loss": 0.0294, + "mean_token_accuracy": 0.9922671616077423, + "num_tokens": 11086470.0, + "step": 1257 + }, + { + "entropy": 1.1857715249061584, + "epoch": 4.542986425339366, + "grad_norm": 0.5283500552177429, + "learning_rate": 0.00011908580149530903, + "loss": 0.0498, + "mean_token_accuracy": 0.983918771147728, + "num_tokens": 11095267.0, + "step": 1258 + }, + { + "entropy": 1.2057181596755981, + "epoch": 4.546606334841629, + "grad_norm": 0.6116226315498352, + "learning_rate": 0.00011876799039685415, + "loss": 0.0617, + "mean_token_accuracy": 0.9871800243854523, + "num_tokens": 11103982.0, + "step": 1259 + }, + { + "entropy": 1.1275395154953003, + "epoch": 4.550226244343891, + "grad_norm": 0.374963641166687, + "learning_rate": 0.0001184507956661534, + "loss": 0.0392, + "mean_token_accuracy": 0.9863047152757645, + "num_tokens": 11113139.0, + "step": 1260 + }, + { + "entropy": 1.2039974927902222, + "epoch": 4.553846153846154, + "grad_norm": 0.37970781326293945, + "learning_rate": 0.0001181342185079528, + "loss": 0.0316, + "mean_token_accuracy": 0.9907213151454926, + "num_tokens": 11121932.0, + "step": 1261 + }, + { + "entropy": 1.1615934669971466, + "epoch": 4.557466063348416, + "grad_norm": 0.5714005827903748, + "learning_rate": 0.00011781826012465267, + "loss": 0.076, + "mean_token_accuracy": 0.9788525849580765, + "num_tokens": 11131104.0, + "step": 1262 + }, + { + "entropy": 1.2253602147102356, + "epoch": 4.5610859728506785, + "grad_norm": 0.39638063311576843, + "learning_rate": 0.0001175029217163033, + "loss": 0.029, + "mean_token_accuracy": 0.9923384636640549, + "num_tokens": 11139439.0, + "step": 1263 + }, + { + "entropy": 1.226563960313797, + "epoch": 4.564705882352941, + "grad_norm": 0.6515656113624573, + "learning_rate": 0.00011718820448060013, + "loss": 0.039, + "mean_token_accuracy": 0.9874402731657028, + "num_tokens": 11148163.0, + "step": 1264 + }, + { + "entropy": 1.2068665623664856, + "epoch": 4.568325791855203, + "grad_norm": 0.4194830656051636, + "learning_rate": 0.00011687410961287929, + "loss": 0.0315, + "mean_token_accuracy": 0.9902697205543518, + "num_tokens": 11156698.0, + "step": 1265 + }, + { + "entropy": 1.185595840215683, + "epoch": 4.571945701357466, + "grad_norm": 0.3697413504123688, + "learning_rate": 0.00011656063830611315, + "loss": 0.0192, + "mean_token_accuracy": 0.9939739406108856, + "num_tokens": 11165563.0, + "step": 1266 + }, + { + "entropy": 1.146008312702179, + "epoch": 4.575565610859728, + "grad_norm": 0.6688143610954285, + "learning_rate": 0.0001162477917509057, + "loss": 0.0578, + "mean_token_accuracy": 0.9821225702762604, + "num_tokens": 11175007.0, + "step": 1267 + }, + { + "entropy": 1.1765292882919312, + "epoch": 4.579185520361991, + "grad_norm": 0.40471139550209045, + "learning_rate": 0.00011593557113548798, + "loss": 0.0424, + "mean_token_accuracy": 0.9884167313575745, + "num_tokens": 11184158.0, + "step": 1268 + }, + { + "entropy": 1.236164003610611, + "epoch": 4.582805429864253, + "grad_norm": 0.3746803104877472, + "learning_rate": 0.00011562397764571371, + "loss": 0.0214, + "mean_token_accuracy": 0.9929858446121216, + "num_tokens": 11192773.0, + "step": 1269 + }, + { + "entropy": 1.2086671888828278, + "epoch": 4.5864253393665155, + "grad_norm": 0.45422643423080444, + "learning_rate": 0.00011531301246505468, + "loss": 0.0348, + "mean_token_accuracy": 0.9905667155981064, + "num_tokens": 11201449.0, + "step": 1270 + }, + { + "entropy": 1.2117674052715302, + "epoch": 4.590045248868778, + "grad_norm": 0.6983218193054199, + "learning_rate": 0.00011500267677459625, + "loss": 0.0222, + "mean_token_accuracy": 0.9888782948255539, + "num_tokens": 11210311.0, + "step": 1271 + }, + { + "entropy": 1.2101182341575623, + "epoch": 4.59366515837104, + "grad_norm": 0.6044661998748779, + "learning_rate": 0.00011469297175303293, + "loss": 0.046, + "mean_token_accuracy": 0.9895824193954468, + "num_tokens": 11219016.0, + "step": 1272 + }, + { + "entropy": 1.1193795204162598, + "epoch": 4.597285067873303, + "grad_norm": 0.7723874449729919, + "learning_rate": 0.00011438389857666392, + "loss": 0.0758, + "mean_token_accuracy": 0.981050118803978, + "num_tokens": 11228429.0, + "step": 1273 + }, + { + "entropy": 1.240187257528305, + "epoch": 4.600904977375565, + "grad_norm": 0.680292010307312, + "learning_rate": 0.00011407545841938842, + "loss": 0.0418, + "mean_token_accuracy": 0.986311137676239, + "num_tokens": 11236929.0, + "step": 1274 + }, + { + "entropy": 1.2108842730522156, + "epoch": 4.604524886877828, + "grad_norm": 0.6665503978729248, + "learning_rate": 0.00011376765245270154, + "loss": 0.0679, + "mean_token_accuracy": 0.9780846238136292, + "num_tokens": 11245589.0, + "step": 1275 + }, + { + "entropy": 1.1833328604698181, + "epoch": 4.60814479638009, + "grad_norm": 0.4157174825668335, + "learning_rate": 0.00011346048184568953, + "loss": 0.0244, + "mean_token_accuracy": 0.9866289645433426, + "num_tokens": 11254225.0, + "step": 1276 + }, + { + "entropy": 1.1474134027957916, + "epoch": 4.6117647058823525, + "grad_norm": 0.388327032327652, + "learning_rate": 0.00011315394776502554, + "loss": 0.0305, + "mean_token_accuracy": 0.9905170947313309, + "num_tokens": 11262729.0, + "step": 1277 + }, + { + "entropy": 1.1579011976718903, + "epoch": 4.615384615384615, + "grad_norm": 0.5559958219528198, + "learning_rate": 0.00011284805137496494, + "loss": 0.0263, + "mean_token_accuracy": 0.9905281066894531, + "num_tokens": 11271565.0, + "step": 1278 + }, + { + "entropy": 1.1825282573699951, + "epoch": 4.619004524886877, + "grad_norm": 0.3398473858833313, + "learning_rate": 0.0001125427938373415, + "loss": 0.0175, + "mean_token_accuracy": 0.9951401799917221, + "num_tokens": 11280546.0, + "step": 1279 + }, + { + "entropy": 1.1961012184619904, + "epoch": 4.62262443438914, + "grad_norm": 0.3031075894832611, + "learning_rate": 0.00011223817631156197, + "loss": 0.0161, + "mean_token_accuracy": 0.9932055175304413, + "num_tokens": 11289253.0, + "step": 1280 + }, + { + "entropy": 1.1445344388484955, + "epoch": 4.626244343891402, + "grad_norm": 0.4055817127227783, + "learning_rate": 0.00011193419995460257, + "loss": 0.0305, + "mean_token_accuracy": 0.9872564822435379, + "num_tokens": 11298445.0, + "step": 1281 + }, + { + "entropy": 1.1870165169239044, + "epoch": 4.629864253393665, + "grad_norm": 0.5154533386230469, + "learning_rate": 0.00011163086592100444, + "loss": 0.0318, + "mean_token_accuracy": 0.9910313338041306, + "num_tokens": 11307495.0, + "step": 1282 + }, + { + "entropy": 1.2139601707458496, + "epoch": 4.633484162895927, + "grad_norm": 0.5279651880264282, + "learning_rate": 0.00011132817536286869, + "loss": 0.0293, + "mean_token_accuracy": 0.9919774979352951, + "num_tokens": 11315776.0, + "step": 1283 + }, + { + "entropy": 1.1790938973426819, + "epoch": 4.63710407239819, + "grad_norm": 0.3685486614704132, + "learning_rate": 0.00011102612942985265, + "loss": 0.0185, + "mean_token_accuracy": 0.9948403984308243, + "num_tokens": 11324516.0, + "step": 1284 + }, + { + "entropy": 1.090241402387619, + "epoch": 4.640723981900453, + "grad_norm": 0.35041630268096924, + "learning_rate": 0.00011072472926916545, + "loss": 0.0356, + "mean_token_accuracy": 0.9899317622184753, + "num_tokens": 11333970.0, + "step": 1285 + }, + { + "entropy": 1.113451361656189, + "epoch": 4.644343891402715, + "grad_norm": 0.2749859094619751, + "learning_rate": 0.00011042397602556312, + "loss": 0.0199, + "mean_token_accuracy": 0.9949481189250946, + "num_tokens": 11343096.0, + "step": 1286 + }, + { + "entropy": 1.1552523374557495, + "epoch": 4.647963800904978, + "grad_norm": 0.5669251084327698, + "learning_rate": 0.0001101238708413448, + "loss": 0.0656, + "mean_token_accuracy": 0.9822637885808945, + "num_tokens": 11351931.0, + "step": 1287 + }, + { + "entropy": 1.136662244796753, + "epoch": 4.65158371040724, + "grad_norm": 0.3863164484500885, + "learning_rate": 0.00010982441485634835, + "loss": 0.025, + "mean_token_accuracy": 0.9904049932956696, + "num_tokens": 11360920.0, + "step": 1288 + }, + { + "entropy": 1.1108895540237427, + "epoch": 4.655203619909503, + "grad_norm": 0.31534653902053833, + "learning_rate": 0.0001095256092079458, + "loss": 0.0204, + "mean_token_accuracy": 0.9938828349113464, + "num_tokens": 11369947.0, + "step": 1289 + }, + { + "entropy": 1.1851194500923157, + "epoch": 4.658823529411765, + "grad_norm": 0.5289696455001831, + "learning_rate": 0.00010922745503103884, + "loss": 0.0514, + "mean_token_accuracy": 0.9875812381505966, + "num_tokens": 11378552.0, + "step": 1290 + }, + { + "entropy": 1.0774552524089813, + "epoch": 4.6624434389140275, + "grad_norm": 0.40037772059440613, + "learning_rate": 0.00010892995345805528, + "loss": 0.0364, + "mean_token_accuracy": 0.9890797883272171, + "num_tokens": 11387967.0, + "step": 1291 + }, + { + "entropy": 1.2189326882362366, + "epoch": 4.66606334841629, + "grad_norm": 0.5457913279533386, + "learning_rate": 0.00010863310561894397, + "loss": 0.0478, + "mean_token_accuracy": 0.9867587238550186, + "num_tokens": 11396578.0, + "step": 1292 + }, + { + "entropy": 1.1517588794231415, + "epoch": 4.669683257918552, + "grad_norm": 0.796279788017273, + "learning_rate": 0.00010833691264117066, + "loss": 0.065, + "mean_token_accuracy": 0.9865336418151855, + "num_tokens": 11405363.0, + "step": 1293 + }, + { + "entropy": 1.1254721879959106, + "epoch": 4.673303167420815, + "grad_norm": 0.3716721832752228, + "learning_rate": 0.00010804137564971422, + "loss": 0.023, + "mean_token_accuracy": 0.9923341125249863, + "num_tokens": 11414795.0, + "step": 1294 + }, + { + "entropy": 1.0851550549268723, + "epoch": 4.676923076923077, + "grad_norm": 0.36168938875198364, + "learning_rate": 0.00010774649576706178, + "loss": 0.0284, + "mean_token_accuracy": 0.9884274005889893, + "num_tokens": 11424330.0, + "step": 1295 + }, + { + "entropy": 1.1117421388626099, + "epoch": 4.68054298642534, + "grad_norm": 0.36146998405456543, + "learning_rate": 0.0001074522741132045, + "loss": 0.0308, + "mean_token_accuracy": 0.9892974644899368, + "num_tokens": 11433315.0, + "step": 1296 + }, + { + "entropy": 1.1173599064350128, + "epoch": 4.684162895927602, + "grad_norm": 0.5733943581581116, + "learning_rate": 0.00010715871180563403, + "loss": 0.0595, + "mean_token_accuracy": 0.9814283847808838, + "num_tokens": 11442321.0, + "step": 1297 + }, + { + "entropy": 1.1422770619392395, + "epoch": 4.6877828054298645, + "grad_norm": 0.5190064907073975, + "learning_rate": 0.00010686580995933731, + "loss": 0.0527, + "mean_token_accuracy": 0.9888507723808289, + "num_tokens": 11451531.0, + "step": 1298 + }, + { + "entropy": 1.1887125968933105, + "epoch": 4.691402714932127, + "grad_norm": 0.43336015939712524, + "learning_rate": 0.00010657356968679273, + "loss": 0.0205, + "mean_token_accuracy": 0.9921245276927948, + "num_tokens": 11460372.0, + "step": 1299 + }, + { + "entropy": 1.1815907955169678, + "epoch": 4.695022624434389, + "grad_norm": 0.9948742389678955, + "learning_rate": 0.00010628199209796627, + "loss": 0.1664, + "mean_token_accuracy": 0.9725013822317123, + "num_tokens": 11469511.0, + "step": 1300 + }, + { + "entropy": 1.2282527089118958, + "epoch": 4.698642533936652, + "grad_norm": 0.24515807628631592, + "learning_rate": 0.00010599107830030672, + "loss": 0.0113, + "mean_token_accuracy": 0.9958342462778091, + "num_tokens": 11477768.0, + "step": 1301 + }, + { + "entropy": 1.1868560314178467, + "epoch": 4.702262443438914, + "grad_norm": 0.44611793756484985, + "learning_rate": 0.00010570082939874174, + "loss": 0.0294, + "mean_token_accuracy": 0.9905449897050858, + "num_tokens": 11486161.0, + "step": 1302 + }, + { + "entropy": 1.1398615539073944, + "epoch": 4.705882352941177, + "grad_norm": 0.4731442332267761, + "learning_rate": 0.00010541124649567368, + "loss": 0.0481, + "mean_token_accuracy": 0.9839041233062744, + "num_tokens": 11495424.0, + "step": 1303 + }, + { + "entropy": 1.1303377449512482, + "epoch": 4.709502262443439, + "grad_norm": 0.44781962037086487, + "learning_rate": 0.00010512233069097528, + "loss": 0.0157, + "mean_token_accuracy": 0.9963311403989792, + "num_tokens": 11504201.0, + "step": 1304 + }, + { + "entropy": 1.2015290260314941, + "epoch": 4.7131221719457015, + "grad_norm": 0.7324572801589966, + "learning_rate": 0.00010483408308198563, + "loss": 0.0383, + "mean_token_accuracy": 0.9885118752717972, + "num_tokens": 11513025.0, + "step": 1305 + }, + { + "entropy": 1.111644297838211, + "epoch": 4.716742081447964, + "grad_norm": 0.3878529369831085, + "learning_rate": 0.00010454650476350581, + "loss": 0.0365, + "mean_token_accuracy": 0.9912507086992264, + "num_tokens": 11522515.0, + "step": 1306 + }, + { + "entropy": 1.1407475769519806, + "epoch": 4.720361990950226, + "grad_norm": 0.4501092731952667, + "learning_rate": 0.000104259596827795, + "loss": 0.0506, + "mean_token_accuracy": 0.9859120547771454, + "num_tokens": 11531359.0, + "step": 1307 + }, + { + "entropy": 1.1798086017370224, + "epoch": 4.723981900452489, + "grad_norm": 0.5489926338195801, + "learning_rate": 0.00010397336036456606, + "loss": 0.0427, + "mean_token_accuracy": 0.9872020483016968, + "num_tokens": 11540219.0, + "step": 1308 + }, + { + "entropy": 1.1605547368526459, + "epoch": 4.727601809954751, + "grad_norm": 0.46645691990852356, + "learning_rate": 0.00010368779646098153, + "loss": 0.0341, + "mean_token_accuracy": 0.991564467549324, + "num_tokens": 11548703.0, + "step": 1309 + }, + { + "entropy": 1.160649299621582, + "epoch": 4.731221719457014, + "grad_norm": 0.41181880235671997, + "learning_rate": 0.00010340290620164959, + "loss": 0.0335, + "mean_token_accuracy": 0.9900417178869247, + "num_tokens": 11557884.0, + "step": 1310 + }, + { + "entropy": 1.2060245275497437, + "epoch": 4.734841628959276, + "grad_norm": 0.593100368976593, + "learning_rate": 0.00010311869066861967, + "loss": 0.0489, + "mean_token_accuracy": 0.9868187755346298, + "num_tokens": 11566192.0, + "step": 1311 + }, + { + "entropy": 1.1960155069828033, + "epoch": 4.7384615384615385, + "grad_norm": 0.6581326723098755, + "learning_rate": 0.00010283515094137866, + "loss": 0.0488, + "mean_token_accuracy": 0.9871827214956284, + "num_tokens": 11575309.0, + "step": 1312 + }, + { + "entropy": 1.1478987336158752, + "epoch": 4.742081447963801, + "grad_norm": 0.7423575520515442, + "learning_rate": 0.00010255228809684654, + "loss": 0.0442, + "mean_token_accuracy": 0.9883019477128983, + "num_tokens": 11584290.0, + "step": 1313 + }, + { + "entropy": 1.1630191802978516, + "epoch": 4.745701357466063, + "grad_norm": 0.4025084376335144, + "learning_rate": 0.00010227010320937243, + "loss": 0.0289, + "mean_token_accuracy": 0.9910438656806946, + "num_tokens": 11593101.0, + "step": 1314 + }, + { + "entropy": 1.201496183872223, + "epoch": 4.749321266968326, + "grad_norm": 0.2735002338886261, + "learning_rate": 0.0001019885973507305, + "loss": 0.0091, + "mean_token_accuracy": 0.997909814119339, + "num_tokens": 11601492.0, + "step": 1315 + }, + { + "entropy": 1.1884644627571106, + "epoch": 4.752941176470588, + "grad_norm": 0.36079543828964233, + "learning_rate": 0.00010170777159011589, + "loss": 0.0177, + "mean_token_accuracy": 0.9946515262126923, + "num_tokens": 11610063.0, + "step": 1316 + }, + { + "entropy": 1.1655499041080475, + "epoch": 4.756561085972851, + "grad_norm": 0.28955161571502686, + "learning_rate": 0.00010142762699414064, + "loss": 0.0213, + "mean_token_accuracy": 0.9961933195590973, + "num_tokens": 11618979.0, + "step": 1317 + }, + { + "entropy": 1.104924738407135, + "epoch": 4.760180995475113, + "grad_norm": 0.5413517355918884, + "learning_rate": 0.00010114816462682961, + "loss": 0.0523, + "mean_token_accuracy": 0.9876614063978195, + "num_tokens": 11628224.0, + "step": 1318 + }, + { + "entropy": 1.2001541256904602, + "epoch": 4.7638009049773755, + "grad_norm": 0.38713425397872925, + "learning_rate": 0.00010086938554961647, + "loss": 0.0289, + "mean_token_accuracy": 0.9902506768703461, + "num_tokens": 11636542.0, + "step": 1319 + }, + { + "entropy": 1.1230383515357971, + "epoch": 4.767420814479638, + "grad_norm": 0.42886537313461304, + "learning_rate": 0.00010059129082133972, + "loss": 0.0278, + "mean_token_accuracy": 0.991959884762764, + "num_tokens": 11645767.0, + "step": 1320 + }, + { + "entropy": 1.2156026661396027, + "epoch": 4.7710407239819, + "grad_norm": 0.5437722206115723, + "learning_rate": 0.00010031388149823848, + "loss": 0.0379, + "mean_token_accuracy": 0.9888860136270523, + "num_tokens": 11654434.0, + "step": 1321 + }, + { + "entropy": 1.1663733422756195, + "epoch": 4.774660633484163, + "grad_norm": 0.3484131693840027, + "learning_rate": 0.00010003715863394893, + "loss": 0.0179, + "mean_token_accuracy": 0.9933156818151474, + "num_tokens": 11663217.0, + "step": 1322 + }, + { + "entropy": 1.184800922870636, + "epoch": 4.778280542986425, + "grad_norm": 0.44003114104270935, + "learning_rate": 9.976112327949957e-05, + "loss": 0.0402, + "mean_token_accuracy": 0.9876485913991928, + "num_tokens": 11672149.0, + "step": 1323 + }, + { + "entropy": 1.0641057193279266, + "epoch": 4.781900452488688, + "grad_norm": 0.5698227882385254, + "learning_rate": 9.94857764833079e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9845984429121017, + "num_tokens": 11681304.0, + "step": 1324 + }, + { + "entropy": 1.2146041095256805, + "epoch": 4.78552036199095, + "grad_norm": 0.6113767623901367, + "learning_rate": 9.921111929117624e-05, + "loss": 0.0421, + "mean_token_accuracy": 0.9896378368139267, + "num_tokens": 11689662.0, + "step": 1325 + }, + { + "entropy": 1.1616670489311218, + "epoch": 4.7891402714932125, + "grad_norm": 0.41098496317863464, + "learning_rate": 9.893715274628749e-05, + "loss": 0.0372, + "mean_token_accuracy": 0.9886336326599121, + "num_tokens": 11698852.0, + "step": 1326 + }, + { + "entropy": 1.258986234664917, + "epoch": 4.792760180995475, + "grad_norm": 0.372736394405365, + "learning_rate": 9.866387788920149e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.9942413419485092, + "num_tokens": 11707118.0, + "step": 1327 + }, + { + "entropy": 1.181258738040924, + "epoch": 4.796380090497737, + "grad_norm": 0.5828834176063538, + "learning_rate": 9.839129575785117e-05, + "loss": 0.0445, + "mean_token_accuracy": 0.9854382127523422, + "num_tokens": 11715957.0, + "step": 1328 + }, + { + "entropy": 1.1307817101478577, + "epoch": 4.8, + "grad_norm": 0.48134756088256836, + "learning_rate": 9.811940738753796e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9897171407938004, + "num_tokens": 11725287.0, + "step": 1329 + }, + { + "entropy": 1.19740691781044, + "epoch": 4.803619909502262, + "grad_norm": 0.7992010712623596, + "learning_rate": 9.784821381092864e-05, + "loss": 0.0699, + "mean_token_accuracy": 0.9800475984811783, + "num_tokens": 11733857.0, + "step": 1330 + }, + { + "entropy": 1.2161820828914642, + "epoch": 4.807239819004525, + "grad_norm": 0.4690302908420563, + "learning_rate": 9.7577716058051e-05, + "loss": 0.0305, + "mean_token_accuracy": 0.9906628876924515, + "num_tokens": 11742243.0, + "step": 1331 + }, + { + "entropy": 1.194101721048355, + "epoch": 4.810859728506787, + "grad_norm": 0.382661908864975, + "learning_rate": 9.730791515629003e-05, + "loss": 0.0271, + "mean_token_accuracy": 0.9905851632356644, + "num_tokens": 11751039.0, + "step": 1332 + }, + { + "entropy": 1.2171461582183838, + "epoch": 4.8144796380090495, + "grad_norm": 0.41641414165496826, + "learning_rate": 9.703881213038375e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.991615504026413, + "num_tokens": 11759786.0, + "step": 1333 + }, + { + "entropy": 1.15550696849823, + "epoch": 4.818099547511312, + "grad_norm": 0.43042320013046265, + "learning_rate": 9.677040800241995e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.9883392006158829, + "num_tokens": 11768817.0, + "step": 1334 + }, + { + "entropy": 1.1094126105308533, + "epoch": 4.821719457013574, + "grad_norm": 0.4802487790584564, + "learning_rate": 9.650270379183166e-05, + "loss": 0.0357, + "mean_token_accuracy": 0.9853959828615189, + "num_tokens": 11778098.0, + "step": 1335 + }, + { + "entropy": 1.1772915720939636, + "epoch": 4.825339366515837, + "grad_norm": 0.47741183638572693, + "learning_rate": 9.623570051539347e-05, + "loss": 0.0488, + "mean_token_accuracy": 0.9828261733055115, + "num_tokens": 11787472.0, + "step": 1336 + }, + { + "entropy": 1.1935390532016754, + "epoch": 4.828959276018099, + "grad_norm": 0.49854576587677, + "learning_rate": 9.596939918721795e-05, + "loss": 0.0461, + "mean_token_accuracy": 0.9878488183021545, + "num_tokens": 11796269.0, + "step": 1337 + }, + { + "entropy": 1.2000310122966766, + "epoch": 4.832579185520362, + "grad_norm": 0.449413925409317, + "learning_rate": 9.570380081875159e-05, + "loss": 0.0326, + "mean_token_accuracy": 0.9885634779930115, + "num_tokens": 11804800.0, + "step": 1338 + }, + { + "entropy": 1.1534567177295685, + "epoch": 4.836199095022624, + "grad_norm": 0.4129447638988495, + "learning_rate": 9.543890641877057e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9913550019264221, + "num_tokens": 11813742.0, + "step": 1339 + }, + { + "entropy": 1.2593884468078613, + "epoch": 4.839819004524887, + "grad_norm": 0.5344672203063965, + "learning_rate": 9.51747169933778e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.993433803319931, + "num_tokens": 11821810.0, + "step": 1340 + }, + { + "entropy": 1.1583607494831085, + "epoch": 4.843438914027149, + "grad_norm": 0.5206009745597839, + "learning_rate": 9.491123354599839e-05, + "loss": 0.0404, + "mean_token_accuracy": 0.9856080114841461, + "num_tokens": 11830477.0, + "step": 1341 + }, + { + "entropy": 1.1516379415988922, + "epoch": 4.847058823529411, + "grad_norm": 0.40458643436431885, + "learning_rate": 9.464845707737593e-05, + "loss": 0.0457, + "mean_token_accuracy": 0.9855477511882782, + "num_tokens": 11839532.0, + "step": 1342 + }, + { + "entropy": 1.268717736005783, + "epoch": 4.850678733031674, + "grad_norm": 0.49775683879852295, + "learning_rate": 9.438638858556893e-05, + "loss": 0.0394, + "mean_token_accuracy": 0.9892454147338867, + "num_tokens": 11847796.0, + "step": 1343 + }, + { + "entropy": 1.204954743385315, + "epoch": 4.854298642533936, + "grad_norm": 0.5535430312156677, + "learning_rate": 9.412502906594703e-05, + "loss": 0.0279, + "mean_token_accuracy": 0.9906836897134781, + "num_tokens": 11856597.0, + "step": 1344 + }, + { + "entropy": 1.239096075296402, + "epoch": 4.857918552036199, + "grad_norm": 0.43477335572242737, + "learning_rate": 9.3864379511187e-05, + "loss": 0.0282, + "mean_token_accuracy": 0.9896904081106186, + "num_tokens": 11864932.0, + "step": 1345 + }, + { + "entropy": 1.181561678647995, + "epoch": 4.861538461538462, + "grad_norm": 0.7839770317077637, + "learning_rate": 9.360444091126893e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9811979532241821, + "num_tokens": 11873696.0, + "step": 1346 + }, + { + "entropy": 1.2574385702610016, + "epoch": 4.8651583710407245, + "grad_norm": 0.8974226117134094, + "learning_rate": 9.334521425347285e-05, + "loss": 0.052, + "mean_token_accuracy": 0.9802359342575073, + "num_tokens": 11881907.0, + "step": 1347 + }, + { + "entropy": 1.2081020176410675, + "epoch": 4.868778280542987, + "grad_norm": 0.5048856735229492, + "learning_rate": 9.30867005223747e-05, + "loss": 0.0405, + "mean_token_accuracy": 0.9917744398117065, + "num_tokens": 11890265.0, + "step": 1348 + }, + { + "entropy": 1.1894653141498566, + "epoch": 4.872398190045249, + "grad_norm": 0.37338632345199585, + "learning_rate": 9.282890069984239e-05, + "loss": 0.0295, + "mean_token_accuracy": 0.9903924912214279, + "num_tokens": 11899137.0, + "step": 1349 + }, + { + "entropy": 1.1611916571855545, + "epoch": 4.876018099547512, + "grad_norm": 0.48497217893600464, + "learning_rate": 9.257181576503266e-05, + "loss": 0.0399, + "mean_token_accuracy": 0.9874439090490341, + "num_tokens": 11908436.0, + "step": 1350 + }, + { + "entropy": 1.1653602123260498, + "epoch": 4.879638009049774, + "grad_norm": 0.4942433834075928, + "learning_rate": 9.231544669438686e-05, + "loss": 0.0444, + "mean_token_accuracy": 0.9836005568504333, + "num_tokens": 11917077.0, + "step": 1351 + }, + { + "entropy": 1.170697033405304, + "epoch": 4.883257918552037, + "grad_norm": 0.4725663959980011, + "learning_rate": 9.205979446162726e-05, + "loss": 0.0285, + "mean_token_accuracy": 0.9913990944623947, + "num_tokens": 11925805.0, + "step": 1352 + }, + { + "entropy": 1.159929096698761, + "epoch": 4.886877828054299, + "grad_norm": 0.40604647994041443, + "learning_rate": 9.180486003775372e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9921021014451981, + "num_tokens": 11934655.0, + "step": 1353 + }, + { + "entropy": 1.2127621173858643, + "epoch": 4.8904977375565615, + "grad_norm": 0.5436980724334717, + "learning_rate": 9.155064439103966e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9923296123743057, + "num_tokens": 11943642.0, + "step": 1354 + }, + { + "entropy": 1.191734939813614, + "epoch": 4.894117647058824, + "grad_norm": 0.6527659893035889, + "learning_rate": 9.12971484870285e-05, + "loss": 0.0626, + "mean_token_accuracy": 0.9802115112543106, + "num_tokens": 11952498.0, + "step": 1355 + }, + { + "entropy": 1.161993145942688, + "epoch": 4.897737556561086, + "grad_norm": 0.44556480646133423, + "learning_rate": 9.104437328852997e-05, + "loss": 0.0351, + "mean_token_accuracy": 0.9911103397607803, + "num_tokens": 11961424.0, + "step": 1356 + }, + { + "entropy": 1.1761930584907532, + "epoch": 4.901357466063349, + "grad_norm": 0.3631705343723297, + "learning_rate": 9.079231975561655e-05, + "loss": 0.0402, + "mean_token_accuracy": 0.9864005595445633, + "num_tokens": 11970226.0, + "step": 1357 + }, + { + "entropy": 1.127123236656189, + "epoch": 4.904977375565611, + "grad_norm": 0.3652251064777374, + "learning_rate": 9.054098884561962e-05, + "loss": 0.0319, + "mean_token_accuracy": 0.9879902452230453, + "num_tokens": 11978695.0, + "step": 1358 + }, + { + "entropy": 1.1227464973926544, + "epoch": 4.908597285067874, + "grad_norm": 0.4769943058490753, + "learning_rate": 9.029038151312601e-05, + "loss": 0.0447, + "mean_token_accuracy": 0.9882356822490692, + "num_tokens": 11987609.0, + "step": 1359 + }, + { + "entropy": 1.067475363612175, + "epoch": 4.912217194570136, + "grad_norm": 0.35615095496177673, + "learning_rate": 9.004049870997432e-05, + "loss": 0.0419, + "mean_token_accuracy": 0.9882748872041702, + "num_tokens": 11996892.0, + "step": 1360 + }, + { + "entropy": 1.1167294383049011, + "epoch": 4.9158371040723985, + "grad_norm": 0.2417716383934021, + "learning_rate": 8.979134138525127e-05, + "loss": 0.0146, + "mean_token_accuracy": 0.9959355145692825, + "num_tokens": 12005729.0, + "step": 1361 + }, + { + "entropy": 1.1310055255889893, + "epoch": 4.919457013574661, + "grad_norm": 0.612522542476654, + "learning_rate": 8.954291048528816e-05, + "loss": 0.04, + "mean_token_accuracy": 0.9909398257732391, + "num_tokens": 12014648.0, + "step": 1362 + }, + { + "entropy": 1.1927096545696259, + "epoch": 4.923076923076923, + "grad_norm": 0.30201828479766846, + "learning_rate": 8.929520695365718e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.9962878674268723, + "num_tokens": 12023243.0, + "step": 1363 + }, + { + "entropy": 1.1450499594211578, + "epoch": 4.926696832579186, + "grad_norm": 0.45451363921165466, + "learning_rate": 8.904823173116795e-05, + "loss": 0.0342, + "mean_token_accuracy": 0.9906298071146011, + "num_tokens": 12032220.0, + "step": 1364 + }, + { + "entropy": 1.1808405816555023, + "epoch": 4.930316742081448, + "grad_norm": 0.5427790284156799, + "learning_rate": 8.880198575586377e-05, + "loss": 0.037, + "mean_token_accuracy": 0.9875056445598602, + "num_tokens": 12041159.0, + "step": 1365 + }, + { + "entropy": 1.196801632642746, + "epoch": 4.933936651583711, + "grad_norm": 0.5426363348960876, + "learning_rate": 8.855646996301831e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9816399365663528, + "num_tokens": 12049774.0, + "step": 1366 + }, + { + "entropy": 1.1708534061908722, + "epoch": 4.937556561085973, + "grad_norm": 0.4115329384803772, + "learning_rate": 8.831168528513182e-05, + "loss": 0.0271, + "mean_token_accuracy": 0.9913832694292068, + "num_tokens": 12058589.0, + "step": 1367 + }, + { + "entropy": 1.2116675972938538, + "epoch": 4.9411764705882355, + "grad_norm": 0.6414517760276794, + "learning_rate": 8.806763265192764e-05, + "loss": 0.0665, + "mean_token_accuracy": 0.9832449555397034, + "num_tokens": 12067158.0, + "step": 1368 + }, + { + "entropy": 1.1851932406425476, + "epoch": 4.944796380090498, + "grad_norm": 0.5957241654396057, + "learning_rate": 8.782431299034888e-05, + "loss": 0.057, + "mean_token_accuracy": 0.9863469302654266, + "num_tokens": 12076142.0, + "step": 1369 + }, + { + "entropy": 1.1860441267490387, + "epoch": 4.94841628959276, + "grad_norm": 0.4882605969905853, + "learning_rate": 8.758172722455456e-05, + "loss": 0.0352, + "mean_token_accuracy": 0.9856973141431808, + "num_tokens": 12085136.0, + "step": 1370 + }, + { + "entropy": 1.1902669072151184, + "epoch": 4.952036199095023, + "grad_norm": 0.4293587803840637, + "learning_rate": 8.733987627591634e-05, + "loss": 0.0351, + "mean_token_accuracy": 0.9872110784053802, + "num_tokens": 12093958.0, + "step": 1371 + }, + { + "entropy": 1.1657917201519012, + "epoch": 4.955656108597285, + "grad_norm": 0.4974465072154999, + "learning_rate": 8.709876106301494e-05, + "loss": 0.0422, + "mean_token_accuracy": 0.9845677465200424, + "num_tokens": 12103052.0, + "step": 1372 + }, + { + "entropy": 1.1982821822166443, + "epoch": 4.959276018099548, + "grad_norm": 0.4955083131790161, + "learning_rate": 8.685838250163667e-05, + "loss": 0.034, + "mean_token_accuracy": 0.9909563511610031, + "num_tokens": 12111490.0, + "step": 1373 + }, + { + "entropy": 1.1515125632286072, + "epoch": 4.96289592760181, + "grad_norm": 0.3922666907310486, + "learning_rate": 8.661874150476996e-05, + "loss": 0.0295, + "mean_token_accuracy": 0.9927395433187485, + "num_tokens": 12119907.0, + "step": 1374 + }, + { + "entropy": 1.1215294301509857, + "epoch": 4.9665158371040725, + "grad_norm": 0.4663509130477905, + "learning_rate": 8.637983898260185e-05, + "loss": 0.0462, + "mean_token_accuracy": 0.9870833456516266, + "num_tokens": 12128753.0, + "step": 1375 + }, + { + "entropy": 1.2042448818683624, + "epoch": 4.970135746606335, + "grad_norm": 0.37983185052871704, + "learning_rate": 8.614167584251458e-05, + "loss": 0.0183, + "mean_token_accuracy": 0.9940395504236221, + "num_tokens": 12137481.0, + "step": 1376 + }, + { + "entropy": 1.1789166033267975, + "epoch": 4.973755656108597, + "grad_norm": 0.6218724250793457, + "learning_rate": 8.5904252989082e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9865689128637314, + "num_tokens": 12146367.0, + "step": 1377 + }, + { + "entropy": 1.0459700524806976, + "epoch": 4.97737556561086, + "grad_norm": 0.3584859073162079, + "learning_rate": 8.566757132406655e-05, + "loss": 0.022, + "mean_token_accuracy": 0.9925664961338043, + "num_tokens": 12156304.0, + "step": 1378 + }, + { + "entropy": 1.1764253973960876, + "epoch": 4.980995475113122, + "grad_norm": 0.455917626619339, + "learning_rate": 8.543163174641523e-05, + "loss": 0.0306, + "mean_token_accuracy": 0.9904853105545044, + "num_tokens": 12165145.0, + "step": 1379 + }, + { + "entropy": 1.2412675023078918, + "epoch": 4.984615384615385, + "grad_norm": 0.5041368007659912, + "learning_rate": 8.519643515225658e-05, + "loss": 0.0365, + "mean_token_accuracy": 0.9898426681756973, + "num_tokens": 12173502.0, + "step": 1380 + }, + { + "entropy": 1.1146800816059113, + "epoch": 4.988235294117647, + "grad_norm": 0.30426302552223206, + "learning_rate": 8.496198243489743e-05, + "loss": 0.0205, + "mean_token_accuracy": 0.9935296326875687, + "num_tokens": 12182445.0, + "step": 1381 + }, + { + "entropy": 1.2658378779888153, + "epoch": 4.9918552036199095, + "grad_norm": 0.602274477481842, + "learning_rate": 8.472827448481894e-05, + "loss": 0.0435, + "mean_token_accuracy": 0.9829444736242294, + "num_tokens": 12191040.0, + "step": 1382 + }, + { + "entropy": 1.1924521923065186, + "epoch": 4.995475113122172, + "grad_norm": 0.5336406230926514, + "learning_rate": 8.449531218967363e-05, + "loss": 0.0237, + "mean_token_accuracy": 0.993032917380333, + "num_tokens": 12199468.0, + "step": 1383 + }, + { + "entropy": 1.2425891757011414, + "epoch": 4.999095022624434, + "grad_norm": 1.141570806503296, + "learning_rate": 8.426309643428217e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.9930049031972885, + "num_tokens": 12207979.0, + "step": 1384 + }, + { + "entropy": 1.0880640745162964, + "epoch": 5.0, + "grad_norm": 0.33857738971710205, + "learning_rate": 8.403162810062945e-05, + "loss": 0.0059, + "mean_token_accuracy": 1.0, + "num_tokens": 12208625.0, + "step": 1385 + }, + { + "epoch": 5.0, + "eval_entropy": 1.186980954030665, + "eval_loss": 0.13543975353240967, + "eval_mean_token_accuracy": 0.971454709526, + "eval_num_tokens": 12208625.0, + "eval_runtime": 116.009, + "eval_samples_per_second": 3.181, + "eval_steps_per_second": 1.06, + "step": 1385 + }, + { + "entropy": 1.1249819993972778, + "epoch": 5.003619909502262, + "grad_norm": 0.297380656003952, + "learning_rate": 8.380090806786169e-05, + "loss": 0.0202, + "mean_token_accuracy": 0.993257001042366, + "num_tokens": 12217830.0, + "step": 1386 + }, + { + "entropy": 1.2227768301963806, + "epoch": 5.007239819004525, + "grad_norm": 0.22833265364170074, + "learning_rate": 8.357093721228306e-05, + "loss": 0.0095, + "mean_token_accuracy": 0.9970414191484451, + "num_tokens": 12226226.0, + "step": 1387 + }, + { + "entropy": 1.2076982855796814, + "epoch": 5.010859728506787, + "grad_norm": 0.4217525124549866, + "learning_rate": 8.334171640735221e-05, + "loss": 0.0306, + "mean_token_accuracy": 0.9892292618751526, + "num_tokens": 12235068.0, + "step": 1388 + }, + { + "entropy": 1.1539611518383026, + "epoch": 5.01447963800905, + "grad_norm": 0.40077605843544006, + "learning_rate": 8.311324652367882e-05, + "loss": 0.049, + "mean_token_accuracy": 0.9891749173402786, + "num_tokens": 12244040.0, + "step": 1389 + }, + { + "entropy": 1.2254254519939423, + "epoch": 5.018099547511312, + "grad_norm": 0.3620734214782715, + "learning_rate": 8.288552842902076e-05, + "loss": 0.0236, + "mean_token_accuracy": 0.9929983466863632, + "num_tokens": 12252797.0, + "step": 1390 + }, + { + "entropy": 1.1726420223712921, + "epoch": 5.021719457013575, + "grad_norm": 0.367407888174057, + "learning_rate": 8.265856298828035e-05, + "loss": 0.0317, + "mean_token_accuracy": 0.9944701045751572, + "num_tokens": 12261698.0, + "step": 1391 + }, + { + "entropy": 1.1370590031147003, + "epoch": 5.025339366515837, + "grad_norm": 0.2235739678144455, + "learning_rate": 8.243235106350116e-05, + "loss": 0.0098, + "mean_token_accuracy": 0.9973361939191818, + "num_tokens": 12270798.0, + "step": 1392 + }, + { + "entropy": 1.1764906346797943, + "epoch": 5.0289592760180994, + "grad_norm": 0.3386642336845398, + "learning_rate": 8.220689351386502e-05, + "loss": 0.0146, + "mean_token_accuracy": 0.9945813864469528, + "num_tokens": 12279807.0, + "step": 1393 + }, + { + "entropy": 1.1405702531337738, + "epoch": 5.032579185520362, + "grad_norm": 0.23875506222248077, + "learning_rate": 8.198219119568846e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9940757155418396, + "num_tokens": 12288679.0, + "step": 1394 + }, + { + "entropy": 1.2103245854377747, + "epoch": 5.036199095022624, + "grad_norm": 0.33527252078056335, + "learning_rate": 8.175824496241934e-05, + "loss": 0.017, + "mean_token_accuracy": 0.9941600114107132, + "num_tokens": 12297410.0, + "step": 1395 + }, + { + "entropy": 1.2534641027450562, + "epoch": 5.039819004524887, + "grad_norm": 0.3719223439693451, + "learning_rate": 8.153505566463418e-05, + "loss": 0.0244, + "mean_token_accuracy": 0.9933225959539413, + "num_tokens": 12306004.0, + "step": 1396 + }, + { + "entropy": 1.230869710445404, + "epoch": 5.043438914027149, + "grad_norm": 0.3826448619365692, + "learning_rate": 8.131262415003435e-05, + "loss": 0.0332, + "mean_token_accuracy": 0.9893752932548523, + "num_tokens": 12314115.0, + "step": 1397 + }, + { + "entropy": 1.1246551871299744, + "epoch": 5.047058823529412, + "grad_norm": 0.3458666205406189, + "learning_rate": 8.1090951263443e-05, + "loss": 0.0229, + "mean_token_accuracy": 0.9947858601808548, + "num_tokens": 12323474.0, + "step": 1398 + }, + { + "entropy": 1.1537968516349792, + "epoch": 5.050678733031674, + "grad_norm": 0.4332570731639862, + "learning_rate": 8.087003784680207e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9933829456567764, + "num_tokens": 12332738.0, + "step": 1399 + }, + { + "entropy": 1.2828398048877716, + "epoch": 5.0542986425339365, + "grad_norm": 0.48059210181236267, + "learning_rate": 8.064988473916886e-05, + "loss": 0.0245, + "mean_token_accuracy": 0.988829717040062, + "num_tokens": 12341438.0, + "step": 1400 + }, + { + "entropy": 1.234535038471222, + "epoch": 5.057918552036199, + "grad_norm": 0.4489496648311615, + "learning_rate": 8.043049277671294e-05, + "loss": 0.0279, + "mean_token_accuracy": 0.9861601740121841, + "num_tokens": 12350028.0, + "step": 1401 + }, + { + "entropy": 1.2596010267734528, + "epoch": 5.061538461538461, + "grad_norm": 0.3683261573314667, + "learning_rate": 8.021186279271292e-05, + "loss": 0.0195, + "mean_token_accuracy": 0.995607316493988, + "num_tokens": 12358588.0, + "step": 1402 + }, + { + "entropy": 1.1976143419742584, + "epoch": 5.065158371040724, + "grad_norm": 0.33004674315452576, + "learning_rate": 7.999399561755334e-05, + "loss": 0.018, + "mean_token_accuracy": 0.9952428638935089, + "num_tokens": 12367955.0, + "step": 1403 + }, + { + "entropy": 1.1453632861375809, + "epoch": 5.068778280542986, + "grad_norm": 0.32939690351486206, + "learning_rate": 7.977689207872151e-05, + "loss": 0.0247, + "mean_token_accuracy": 0.9927968680858612, + "num_tokens": 12377649.0, + "step": 1404 + }, + { + "entropy": 1.1789866089820862, + "epoch": 5.072398190045249, + "grad_norm": 0.6219817399978638, + "learning_rate": 7.956055300080436e-05, + "loss": 0.0636, + "mean_token_accuracy": 0.9886423498392105, + "num_tokens": 12386441.0, + "step": 1405 + }, + { + "entropy": 1.1602259576320648, + "epoch": 5.076018099547511, + "grad_norm": 0.4582313895225525, + "learning_rate": 7.934497920548524e-05, + "loss": 0.0342, + "mean_token_accuracy": 0.9882922172546387, + "num_tokens": 12395465.0, + "step": 1406 + }, + { + "entropy": 1.2799539268016815, + "epoch": 5.0796380090497735, + "grad_norm": 0.38321343064308167, + "learning_rate": 7.913017151154091e-05, + "loss": 0.0142, + "mean_token_accuracy": 0.9936931282281876, + "num_tokens": 12404031.0, + "step": 1407 + }, + { + "entropy": 1.2249329090118408, + "epoch": 5.083257918552036, + "grad_norm": 0.3604588806629181, + "learning_rate": 7.891613073483839e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.9949173480272293, + "num_tokens": 12412701.0, + "step": 1408 + }, + { + "entropy": 1.2634833753108978, + "epoch": 5.086877828054298, + "grad_norm": 0.38502612709999084, + "learning_rate": 7.870285768833178e-05, + "loss": 0.0168, + "mean_token_accuracy": 0.9942783117294312, + "num_tokens": 12421236.0, + "step": 1409 + }, + { + "entropy": 1.205617368221283, + "epoch": 5.090497737556561, + "grad_norm": 0.382969468832016, + "learning_rate": 7.849035318205932e-05, + "loss": 0.0243, + "mean_token_accuracy": 0.9931115359067917, + "num_tokens": 12430362.0, + "step": 1410 + }, + { + "entropy": 1.262538194656372, + "epoch": 5.094117647058823, + "grad_norm": 0.46106478571891785, + "learning_rate": 7.827861802314022e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9948606938123703, + "num_tokens": 12439082.0, + "step": 1411 + }, + { + "entropy": 1.3220985233783722, + "epoch": 5.097737556561086, + "grad_norm": 0.4070530831813812, + "learning_rate": 7.80676530157716e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9932022243738174, + "num_tokens": 12447182.0, + "step": 1412 + }, + { + "entropy": 1.1942959129810333, + "epoch": 5.101357466063348, + "grad_norm": 0.29382020235061646, + "learning_rate": 7.785745896122541e-05, + "loss": 0.0141, + "mean_token_accuracy": 0.995876133441925, + "num_tokens": 12456302.0, + "step": 1413 + }, + { + "entropy": 1.2399134635925293, + "epoch": 5.1049773755656105, + "grad_norm": 0.4737322926521301, + "learning_rate": 7.764803665784547e-05, + "loss": 0.0285, + "mean_token_accuracy": 0.9911573827266693, + "num_tokens": 12465105.0, + "step": 1414 + }, + { + "entropy": 1.230169802904129, + "epoch": 5.108597285067873, + "grad_norm": 0.20496386289596558, + "learning_rate": 7.743938690104441e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9970543384552002, + "num_tokens": 12473888.0, + "step": 1415 + }, + { + "entropy": 1.1815564334392548, + "epoch": 5.112217194570135, + "grad_norm": 0.39645281434059143, + "learning_rate": 7.723151048330055e-05, + "loss": 0.0258, + "mean_token_accuracy": 0.9912184923887253, + "num_tokens": 12483017.0, + "step": 1416 + }, + { + "entropy": 1.2476356029510498, + "epoch": 5.115837104072398, + "grad_norm": 0.3806418180465698, + "learning_rate": 7.7024408194155e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9966550022363663, + "num_tokens": 12491876.0, + "step": 1417 + }, + { + "entropy": 1.2351654767990112, + "epoch": 5.11945701357466, + "grad_norm": 0.27004656195640564, + "learning_rate": 7.681808082020874e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.995867982506752, + "num_tokens": 12500951.0, + "step": 1418 + }, + { + "entropy": 1.3402520716190338, + "epoch": 5.123076923076923, + "grad_norm": 0.3575328290462494, + "learning_rate": 7.661252914511937e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.998654842376709, + "num_tokens": 12509377.0, + "step": 1419 + }, + { + "entropy": 1.3409844934940338, + "epoch": 5.126696832579185, + "grad_norm": 0.3506350815296173, + "learning_rate": 7.640775394959829e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.9954731464385986, + "num_tokens": 12518223.0, + "step": 1420 + }, + { + "entropy": 1.2145212590694427, + "epoch": 5.130316742081448, + "grad_norm": 0.5054464340209961, + "learning_rate": 7.620375601140794e-05, + "loss": 0.0306, + "mean_token_accuracy": 0.9916608929634094, + "num_tokens": 12527386.0, + "step": 1421 + }, + { + "entropy": 1.217678189277649, + "epoch": 5.133936651583711, + "grad_norm": 0.2966901659965515, + "learning_rate": 7.600053610535837e-05, + "loss": 0.0139, + "mean_token_accuracy": 0.9959810972213745, + "num_tokens": 12536270.0, + "step": 1422 + }, + { + "entropy": 1.2111278176307678, + "epoch": 5.137556561085973, + "grad_norm": 0.3617740571498871, + "learning_rate": 7.579809500330464e-05, + "loss": 0.0195, + "mean_token_accuracy": 0.9952263385057449, + "num_tokens": 12545417.0, + "step": 1423 + }, + { + "entropy": 1.3365572094917297, + "epoch": 5.141176470588236, + "grad_norm": 0.5943083167076111, + "learning_rate": 7.559643347414404e-05, + "loss": 0.0314, + "mean_token_accuracy": 0.9895547032356262, + "num_tokens": 12554019.0, + "step": 1424 + }, + { + "entropy": 1.2841481268405914, + "epoch": 5.144796380090498, + "grad_norm": 0.4198203682899475, + "learning_rate": 7.53955522838126e-05, + "loss": 0.0188, + "mean_token_accuracy": 0.9964423924684525, + "num_tokens": 12563056.0, + "step": 1425 + }, + { + "entropy": 1.3388768434524536, + "epoch": 5.1484162895927605, + "grad_norm": 0.6388291716575623, + "learning_rate": 7.519545219528275e-05, + "loss": 0.0255, + "mean_token_accuracy": 0.9938215464353561, + "num_tokens": 12571237.0, + "step": 1426 + }, + { + "entropy": 1.258266568183899, + "epoch": 5.152036199095023, + "grad_norm": 0.4118649959564209, + "learning_rate": 7.49961339685601e-05, + "loss": 0.0119, + "mean_token_accuracy": 0.9959937632083893, + "num_tokens": 12579940.0, + "step": 1427 + }, + { + "entropy": 1.2728670537471771, + "epoch": 5.155656108597285, + "grad_norm": 0.4799036383628845, + "learning_rate": 7.479759836068063e-05, + "loss": 0.0295, + "mean_token_accuracy": 0.9925651103258133, + "num_tokens": 12588911.0, + "step": 1428 + }, + { + "entropy": 1.3155244588851929, + "epoch": 5.159276018099548, + "grad_norm": 0.49331581592559814, + "learning_rate": 7.459984612570797e-05, + "loss": 0.0249, + "mean_token_accuracy": 0.9907650947570801, + "num_tokens": 12597204.0, + "step": 1429 + }, + { + "entropy": 1.263516515493393, + "epoch": 5.16289592760181, + "grad_norm": 0.33506688475608826, + "learning_rate": 7.440287801473023e-05, + "loss": 0.0172, + "mean_token_accuracy": 0.9954954087734222, + "num_tokens": 12606290.0, + "step": 1430 + }, + { + "entropy": 1.1920699924230576, + "epoch": 5.166515837104073, + "grad_norm": 0.28553274273872375, + "learning_rate": 7.420669477585753e-05, + "loss": 0.0146, + "mean_token_accuracy": 0.9961993992328644, + "num_tokens": 12615629.0, + "step": 1431 + }, + { + "entropy": 1.2724811434745789, + "epoch": 5.170135746606335, + "grad_norm": 0.48113831877708435, + "learning_rate": 7.401129715421866e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9933426231145859, + "num_tokens": 12623918.0, + "step": 1432 + }, + { + "entropy": 1.2462135553359985, + "epoch": 5.173755656108598, + "grad_norm": 0.395637571811676, + "learning_rate": 7.381668589195875e-05, + "loss": 0.0259, + "mean_token_accuracy": 0.9922471791505814, + "num_tokens": 12632906.0, + "step": 1433 + }, + { + "entropy": 1.2504163086414337, + "epoch": 5.17737556561086, + "grad_norm": 0.3723805546760559, + "learning_rate": 7.362286172823623e-05, + "loss": 0.0221, + "mean_token_accuracy": 0.9939638078212738, + "num_tokens": 12642022.0, + "step": 1434 + }, + { + "entropy": 1.3061585128307343, + "epoch": 5.180995475113122, + "grad_norm": 0.35139545798301697, + "learning_rate": 7.342982539921988e-05, + "loss": 0.0158, + "mean_token_accuracy": 0.9967697560787201, + "num_tokens": 12650592.0, + "step": 1435 + }, + { + "entropy": 1.2836159765720367, + "epoch": 5.184615384615385, + "grad_norm": 0.3337141275405884, + "learning_rate": 7.32375776380863e-05, + "loss": 0.0155, + "mean_token_accuracy": 0.997398167848587, + "num_tokens": 12658898.0, + "step": 1436 + }, + { + "entropy": 1.2803179025650024, + "epoch": 5.188235294117647, + "grad_norm": 0.5076804757118225, + "learning_rate": 7.304611917501707e-05, + "loss": 0.0371, + "mean_token_accuracy": 0.9879848212003708, + "num_tokens": 12667616.0, + "step": 1437 + }, + { + "entropy": 1.3708868622779846, + "epoch": 5.19185520361991, + "grad_norm": 0.4788109362125397, + "learning_rate": 7.285545073719577e-05, + "loss": 0.0206, + "mean_token_accuracy": 0.993600144982338, + "num_tokens": 12675877.0, + "step": 1438 + }, + { + "entropy": 1.2671563625335693, + "epoch": 5.195475113122172, + "grad_norm": 0.3305739462375641, + "learning_rate": 7.266557304880534e-05, + "loss": 0.0098, + "mean_token_accuracy": 0.9981791973114014, + "num_tokens": 12684592.0, + "step": 1439 + }, + { + "entropy": 1.1895796954631805, + "epoch": 5.199095022624435, + "grad_norm": 0.4455597698688507, + "learning_rate": 7.247648683102561e-05, + "loss": 0.014, + "mean_token_accuracy": 0.9961147755384445, + "num_tokens": 12693665.0, + "step": 1440 + }, + { + "entropy": 1.3177433907985687, + "epoch": 5.202714932126697, + "grad_norm": 0.30147814750671387, + "learning_rate": 7.228819280203009e-05, + "loss": 0.01, + "mean_token_accuracy": 0.996846467256546, + "num_tokens": 12702294.0, + "step": 1441 + }, + { + "entropy": 1.2993082702159882, + "epoch": 5.206334841628959, + "grad_norm": 0.419706791639328, + "learning_rate": 7.210069167698335e-05, + "loss": 0.0108, + "mean_token_accuracy": 0.9965634196996689, + "num_tokens": 12710776.0, + "step": 1442 + }, + { + "entropy": 1.2513138055801392, + "epoch": 5.209954751131222, + "grad_norm": 0.2982895076274872, + "learning_rate": 7.19139841680387e-05, + "loss": 0.0094, + "mean_token_accuracy": 0.9968519061803818, + "num_tokens": 12719738.0, + "step": 1443 + }, + { + "entropy": 1.3170332610607147, + "epoch": 5.213574660633484, + "grad_norm": 0.5569889545440674, + "learning_rate": 7.17280709843351e-05, + "loss": 0.0358, + "mean_token_accuracy": 0.9913972169160843, + "num_tokens": 12728199.0, + "step": 1444 + }, + { + "entropy": 1.2615925967693329, + "epoch": 5.217194570135747, + "grad_norm": 0.4605793058872223, + "learning_rate": 7.154295283199434e-05, + "loss": 0.0191, + "mean_token_accuracy": 0.9953597635030746, + "num_tokens": 12736938.0, + "step": 1445 + }, + { + "entropy": 1.3304391503334045, + "epoch": 5.220814479638009, + "grad_norm": 0.476976215839386, + "learning_rate": 7.135863041411887e-05, + "loss": 0.0292, + "mean_token_accuracy": 0.9946031868457794, + "num_tokens": 12745360.0, + "step": 1446 + }, + { + "entropy": 1.2853979766368866, + "epoch": 5.224434389140272, + "grad_norm": 0.4844076931476593, + "learning_rate": 7.117510443078877e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.988853320479393, + "num_tokens": 12753970.0, + "step": 1447 + }, + { + "entropy": 1.278106302022934, + "epoch": 5.228054298642534, + "grad_norm": 0.38697949051856995, + "learning_rate": 7.099237557905898e-05, + "loss": 0.0153, + "mean_token_accuracy": 0.9957392662763596, + "num_tokens": 12762463.0, + "step": 1448 + }, + { + "entropy": 1.2172533869743347, + "epoch": 5.2316742081447964, + "grad_norm": 0.2944023609161377, + "learning_rate": 7.081044455295704e-05, + "loss": 0.0122, + "mean_token_accuracy": 0.9978005886077881, + "num_tokens": 12771302.0, + "step": 1449 + }, + { + "entropy": 1.3056898713111877, + "epoch": 5.235294117647059, + "grad_norm": 0.4920123219490051, + "learning_rate": 7.062931204348022e-05, + "loss": 0.0312, + "mean_token_accuracy": 0.9902955293655396, + "num_tokens": 12779836.0, + "step": 1450 + }, + { + "entropy": 1.229508250951767, + "epoch": 5.238914027149321, + "grad_norm": 0.47162678837776184, + "learning_rate": 7.044897873859276e-05, + "loss": 0.0295, + "mean_token_accuracy": 0.9908081144094467, + "num_tokens": 12789231.0, + "step": 1451 + }, + { + "entropy": 1.2454079985618591, + "epoch": 5.242533936651584, + "grad_norm": 0.47576668858528137, + "learning_rate": 7.026944532322361e-05, + "loss": 0.0193, + "mean_token_accuracy": 0.9938995093107224, + "num_tokens": 12798207.0, + "step": 1452 + }, + { + "entropy": 1.2500143647193909, + "epoch": 5.246153846153846, + "grad_norm": 0.5368858575820923, + "learning_rate": 7.009071247926356e-05, + "loss": 0.0269, + "mean_token_accuracy": 0.990957960486412, + "num_tokens": 12807320.0, + "step": 1453 + }, + { + "entropy": 1.2124861180782318, + "epoch": 5.249773755656109, + "grad_norm": 0.44579559564590454, + "learning_rate": 6.991278088556272e-05, + "loss": 0.0215, + "mean_token_accuracy": 0.9927420020103455, + "num_tokens": 12816701.0, + "step": 1454 + }, + { + "entropy": 1.2064794450998306, + "epoch": 5.253393665158371, + "grad_norm": 0.3836108446121216, + "learning_rate": 6.973565121792798e-05, + "loss": 0.0222, + "mean_token_accuracy": 0.9931780993938446, + "num_tokens": 12826016.0, + "step": 1455 + }, + { + "entropy": 1.2310892939567566, + "epoch": 5.2570135746606335, + "grad_norm": 0.3940860331058502, + "learning_rate": 6.955932414912034e-05, + "loss": 0.0297, + "mean_token_accuracy": 0.9916985780000687, + "num_tokens": 12834924.0, + "step": 1456 + }, + { + "entropy": 1.2955662608146667, + "epoch": 5.260633484162896, + "grad_norm": 0.3385579586029053, + "learning_rate": 6.93838003488525e-05, + "loss": 0.0239, + "mean_token_accuracy": 0.9944149702787399, + "num_tokens": 12843506.0, + "step": 1457 + }, + { + "entropy": 1.2246078848838806, + "epoch": 5.264253393665158, + "grad_norm": 0.5769771933555603, + "learning_rate": 6.920908048378626e-05, + "loss": 0.0289, + "mean_token_accuracy": 0.9907724410295486, + "num_tokens": 12852379.0, + "step": 1458 + }, + { + "entropy": 1.2576108872890472, + "epoch": 5.267873303167421, + "grad_norm": 0.5868719816207886, + "learning_rate": 6.903516521752989e-05, + "loss": 0.0203, + "mean_token_accuracy": 0.9925303757190704, + "num_tokens": 12861475.0, + "step": 1459 + }, + { + "entropy": 1.2694770097732544, + "epoch": 5.271493212669683, + "grad_norm": 0.4030245840549469, + "learning_rate": 6.886205521063574e-05, + "loss": 0.0203, + "mean_token_accuracy": 0.9939717352390289, + "num_tokens": 12870174.0, + "step": 1460 + }, + { + "entropy": 1.2955704927444458, + "epoch": 5.275113122171946, + "grad_norm": 0.3584703803062439, + "learning_rate": 6.868975112059769e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9957002848386765, + "num_tokens": 12878946.0, + "step": 1461 + }, + { + "entropy": 1.3232930898666382, + "epoch": 5.278733031674208, + "grad_norm": 0.5757815837860107, + "learning_rate": 6.851825360184862e-05, + "loss": 0.0222, + "mean_token_accuracy": 0.9917564988136292, + "num_tokens": 12887529.0, + "step": 1462 + }, + { + "entropy": 1.1817227005958557, + "epoch": 5.2823529411764705, + "grad_norm": 0.5483627319335938, + "learning_rate": 6.834756330575801e-05, + "loss": 0.0407, + "mean_token_accuracy": 0.9860571324825287, + "num_tokens": 12897022.0, + "step": 1463 + }, + { + "entropy": 1.2956542074680328, + "epoch": 5.285972850678733, + "grad_norm": 0.3427274525165558, + "learning_rate": 6.81776808806293e-05, + "loss": 0.0149, + "mean_token_accuracy": 0.9977661073207855, + "num_tokens": 12905790.0, + "step": 1464 + }, + { + "entropy": 1.2946183681488037, + "epoch": 5.289592760180995, + "grad_norm": 0.3707132041454315, + "learning_rate": 6.80086069716976e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.9944005459547043, + "num_tokens": 12914222.0, + "step": 1465 + }, + { + "entropy": 1.226729691028595, + "epoch": 5.293212669683258, + "grad_norm": 0.49931633472442627, + "learning_rate": 6.78403422211272e-05, + "loss": 0.0379, + "mean_token_accuracy": 0.9895394891500473, + "num_tokens": 12923394.0, + "step": 1466 + }, + { + "entropy": 1.3508270978927612, + "epoch": 5.29683257918552, + "grad_norm": 0.5119744539260864, + "learning_rate": 6.767288726800901e-05, + "loss": 0.0335, + "mean_token_accuracy": 0.9892595261335373, + "num_tokens": 12931830.0, + "step": 1467 + }, + { + "entropy": 1.2204712629318237, + "epoch": 5.300452488687783, + "grad_norm": 0.4239683151245117, + "learning_rate": 6.750624274835833e-05, + "loss": 0.0198, + "mean_token_accuracy": 0.9962838441133499, + "num_tokens": 12940947.0, + "step": 1468 + }, + { + "entropy": 1.2589700818061829, + "epoch": 5.304072398190045, + "grad_norm": 0.2344283014535904, + "learning_rate": 6.734040929511228e-05, + "loss": 0.0102, + "mean_token_accuracy": 0.9973642379045486, + "num_tokens": 12949673.0, + "step": 1469 + }, + { + "entropy": 1.272909700870514, + "epoch": 5.3076923076923075, + "grad_norm": 0.6968575716018677, + "learning_rate": 6.717538753812741e-05, + "loss": 0.0414, + "mean_token_accuracy": 0.9880207628011703, + "num_tokens": 12958465.0, + "step": 1470 + }, + { + "entropy": 1.2807061076164246, + "epoch": 5.31131221719457, + "grad_norm": 0.45930564403533936, + "learning_rate": 6.701117810417743e-05, + "loss": 0.0216, + "mean_token_accuracy": 0.9957028478384018, + "num_tokens": 12967332.0, + "step": 1471 + }, + { + "entropy": 1.2394128739833832, + "epoch": 5.314932126696832, + "grad_norm": 0.30973997712135315, + "learning_rate": 6.684778161695067e-05, + "loss": 0.0149, + "mean_token_accuracy": 0.9978993535041809, + "num_tokens": 12976163.0, + "step": 1472 + }, + { + "entropy": 1.2903764247894287, + "epoch": 5.318552036199095, + "grad_norm": 0.4434456527233124, + "learning_rate": 6.668519869704778e-05, + "loss": 0.023, + "mean_token_accuracy": 0.9926130026578903, + "num_tokens": 12985080.0, + "step": 1473 + }, + { + "entropy": 1.2964073717594147, + "epoch": 5.322171945701357, + "grad_norm": 0.433030366897583, + "learning_rate": 6.652342996197953e-05, + "loss": 0.0217, + "mean_token_accuracy": 0.9954771995544434, + "num_tokens": 12993887.0, + "step": 1474 + }, + { + "entropy": 1.2503498494625092, + "epoch": 5.32579185520362, + "grad_norm": 0.3653145730495453, + "learning_rate": 6.636247602616407e-05, + "loss": 0.0253, + "mean_token_accuracy": 0.9930839687585831, + "num_tokens": 13002518.0, + "step": 1475 + }, + { + "entropy": 1.3162983059883118, + "epoch": 5.329411764705882, + "grad_norm": 0.4129994213581085, + "learning_rate": 6.620233750092497e-05, + "loss": 0.023, + "mean_token_accuracy": 0.9931599944829941, + "num_tokens": 13011117.0, + "step": 1476 + }, + { + "entropy": 1.2917849123477936, + "epoch": 5.3330316742081445, + "grad_norm": 0.40094584226608276, + "learning_rate": 6.604301499448889e-05, + "loss": 0.0169, + "mean_token_accuracy": 0.994608148932457, + "num_tokens": 13020085.0, + "step": 1477 + }, + { + "entropy": 1.282648503780365, + "epoch": 5.336651583710407, + "grad_norm": 0.2548517882823944, + "learning_rate": 6.588450911198285e-05, + "loss": 0.011, + "mean_token_accuracy": 0.9977367669343948, + "num_tokens": 13028983.0, + "step": 1478 + }, + { + "entropy": 1.3429225087165833, + "epoch": 5.340271493212669, + "grad_norm": 0.39588239789009094, + "learning_rate": 6.572682045543242e-05, + "loss": 0.02, + "mean_token_accuracy": 0.9964097440242767, + "num_tokens": 13037480.0, + "step": 1479 + }, + { + "entropy": 1.2354435324668884, + "epoch": 5.343891402714932, + "grad_norm": 0.4548267126083374, + "learning_rate": 6.556994962375932e-05, + "loss": 0.028, + "mean_token_accuracy": 0.9886217266321182, + "num_tokens": 13046680.0, + "step": 1480 + }, + { + "entropy": 1.3022720217704773, + "epoch": 5.347511312217194, + "grad_norm": 0.47686082124710083, + "learning_rate": 6.541389721277888e-05, + "loss": 0.0193, + "mean_token_accuracy": 0.9936110824346542, + "num_tokens": 13055167.0, + "step": 1481 + }, + { + "entropy": 1.2595111727714539, + "epoch": 5.351131221719457, + "grad_norm": 0.34837445616722107, + "learning_rate": 6.5258663815198e-05, + "loss": 0.0219, + "mean_token_accuracy": 0.990663543343544, + "num_tokens": 13063986.0, + "step": 1482 + }, + { + "entropy": 1.2352654337882996, + "epoch": 5.354751131221719, + "grad_norm": 0.5913805365562439, + "learning_rate": 6.510425002061304e-05, + "loss": 0.0228, + "mean_token_accuracy": 0.9937742054462433, + "num_tokens": 13072998.0, + "step": 1483 + }, + { + "entropy": 1.2092313766479492, + "epoch": 5.3583710407239815, + "grad_norm": 0.21999365091323853, + "learning_rate": 6.495065641550713e-05, + "loss": 0.0056, + "mean_token_accuracy": 0.9981000274419785, + "num_tokens": 13082279.0, + "step": 1484 + }, + { + "entropy": 1.2771494686603546, + "epoch": 5.361990950226244, + "grad_norm": 0.4143531322479248, + "learning_rate": 6.479788358324842e-05, + "loss": 0.0292, + "mean_token_accuracy": 0.9926983714103699, + "num_tokens": 13090928.0, + "step": 1485 + }, + { + "entropy": 1.3478175699710846, + "epoch": 5.365610859728506, + "grad_norm": 0.39420145750045776, + "learning_rate": 6.464593210408761e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9937123507261276, + "num_tokens": 13099630.0, + "step": 1486 + }, + { + "entropy": 1.2226761877536774, + "epoch": 5.36923076923077, + "grad_norm": 1.5592964887619019, + "learning_rate": 6.449480255515585e-05, + "loss": 0.0163, + "mean_token_accuracy": 0.9949090629816055, + "num_tokens": 13108689.0, + "step": 1487 + }, + { + "entropy": 1.2781298756599426, + "epoch": 5.372850678733032, + "grad_norm": 0.28722354769706726, + "learning_rate": 6.434449551046223e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9971500188112259, + "num_tokens": 13117282.0, + "step": 1488 + }, + { + "entropy": 1.2164996266365051, + "epoch": 5.376470588235295, + "grad_norm": 0.5297361612319946, + "learning_rate": 6.419501154089222e-05, + "loss": 0.0366, + "mean_token_accuracy": 0.9908336997032166, + "num_tokens": 13126318.0, + "step": 1489 + }, + { + "entropy": 1.2871930003166199, + "epoch": 5.380090497737557, + "grad_norm": 0.433758020401001, + "learning_rate": 6.404635121420498e-05, + "loss": 0.0339, + "mean_token_accuracy": 0.9900172799825668, + "num_tokens": 13134831.0, + "step": 1490 + }, + { + "entropy": 1.2622864246368408, + "epoch": 5.383710407239819, + "grad_norm": 0.3331410884857178, + "learning_rate": 6.389851509503129e-05, + "loss": 0.0194, + "mean_token_accuracy": 0.9945416748523712, + "num_tokens": 13143469.0, + "step": 1491 + }, + { + "entropy": 1.3061146140098572, + "epoch": 5.387330316742082, + "grad_norm": 0.4276162385940552, + "learning_rate": 6.375150374487164e-05, + "loss": 0.0307, + "mean_token_accuracy": 0.9938482642173767, + "num_tokens": 13152342.0, + "step": 1492 + }, + { + "entropy": 1.2685127556324005, + "epoch": 5.390950226244344, + "grad_norm": 0.6231507658958435, + "learning_rate": 6.36053177220939e-05, + "loss": 0.014, + "mean_token_accuracy": 0.9966719746589661, + "num_tokens": 13161066.0, + "step": 1493 + }, + { + "entropy": 1.2424243688583374, + "epoch": 5.394570135746607, + "grad_norm": 0.36967551708221436, + "learning_rate": 6.345995758193111e-05, + "loss": 0.0207, + "mean_token_accuracy": 0.990947961807251, + "num_tokens": 13170552.0, + "step": 1494 + }, + { + "entropy": 1.2724092900753021, + "epoch": 5.398190045248869, + "grad_norm": 0.42010676860809326, + "learning_rate": 6.331542387647969e-05, + "loss": 0.0221, + "mean_token_accuracy": 0.9927983731031418, + "num_tokens": 13179406.0, + "step": 1495 + }, + { + "entropy": 1.2592349648475647, + "epoch": 5.401809954751132, + "grad_norm": 1.1237038373947144, + "learning_rate": 6.317171715469706e-05, + "loss": 0.11, + "mean_token_accuracy": 0.9830374866724014, + "num_tokens": 13188547.0, + "step": 1496 + }, + { + "entropy": 1.2375436127185822, + "epoch": 5.405429864253394, + "grad_norm": 0.5965372323989868, + "learning_rate": 6.302883796239966e-05, + "loss": 0.032, + "mean_token_accuracy": 0.9887288808822632, + "num_tokens": 13197393.0, + "step": 1497 + }, + { + "entropy": 1.208868384361267, + "epoch": 5.409049773755656, + "grad_norm": 0.40183204412460327, + "learning_rate": 6.288678684226084e-05, + "loss": 0.0286, + "mean_token_accuracy": 0.9926381707191467, + "num_tokens": 13206574.0, + "step": 1498 + }, + { + "entropy": 1.1976245045661926, + "epoch": 5.412669683257919, + "grad_norm": 0.28087249398231506, + "learning_rate": 6.27455643338089e-05, + "loss": 0.014, + "mean_token_accuracy": 0.9949794262647629, + "num_tokens": 13216215.0, + "step": 1499 + }, + { + "entropy": 1.2002365589141846, + "epoch": 5.416289592760181, + "grad_norm": 0.6782754063606262, + "learning_rate": 6.260517097342497e-05, + "loss": 0.0733, + "mean_token_accuracy": 0.983364999294281, + "num_tokens": 13225655.0, + "step": 1500 + }, + { + "entropy": 1.3070992529392242, + "epoch": 5.419909502262444, + "grad_norm": 0.48341765999794006, + "learning_rate": 6.246560729434076e-05, + "loss": 0.0293, + "mean_token_accuracy": 0.9945407956838608, + "num_tokens": 13234476.0, + "step": 1501 + }, + { + "entropy": 1.2892843186855316, + "epoch": 5.423529411764706, + "grad_norm": 0.6314505338668823, + "learning_rate": 6.232687382663708e-05, + "loss": 0.0366, + "mean_token_accuracy": 0.9923235774040222, + "num_tokens": 13242609.0, + "step": 1502 + }, + { + "entropy": 1.223000854253769, + "epoch": 5.427149321266969, + "grad_norm": 0.2504178583621979, + "learning_rate": 6.218897109724126e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9974544644355774, + "num_tokens": 13251477.0, + "step": 1503 + }, + { + "entropy": 1.267005443572998, + "epoch": 5.430769230769231, + "grad_norm": 0.2562965750694275, + "learning_rate": 6.205189962992538e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9974716156721115, + "num_tokens": 13260566.0, + "step": 1504 + }, + { + "entropy": 1.2632325291633606, + "epoch": 5.4343891402714934, + "grad_norm": 0.34495604038238525, + "learning_rate": 6.191565994530442e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9941398203372955, + "num_tokens": 13269284.0, + "step": 1505 + }, + { + "entropy": 1.2822768688201904, + "epoch": 5.438009049773756, + "grad_norm": 0.5192990899085999, + "learning_rate": 6.178025256083404e-05, + "loss": 0.0296, + "mean_token_accuracy": 0.9927681684494019, + "num_tokens": 13277886.0, + "step": 1506 + }, + { + "entropy": 1.2891391515731812, + "epoch": 5.441628959276018, + "grad_norm": 0.44157925248146057, + "learning_rate": 6.164567799080877e-05, + "loss": 0.0356, + "mean_token_accuracy": 0.9884617477655411, + "num_tokens": 13286840.0, + "step": 1507 + }, + { + "entropy": 1.2129878401756287, + "epoch": 5.445248868778281, + "grad_norm": 0.20710670948028564, + "learning_rate": 6.151193674635994e-05, + "loss": 0.008, + "mean_token_accuracy": 0.998381495475769, + "num_tokens": 13296167.0, + "step": 1508 + }, + { + "entropy": 1.2723053097724915, + "epoch": 5.448868778280543, + "grad_norm": 0.6072095632553101, + "learning_rate": 6.137902933545386e-05, + "loss": 0.0543, + "mean_token_accuracy": 0.9872492700815201, + "num_tokens": 13305103.0, + "step": 1509 + }, + { + "entropy": 1.2852924466133118, + "epoch": 5.452488687782806, + "grad_norm": 0.5609065890312195, + "learning_rate": 6.124695626288979e-05, + "loss": 0.0224, + "mean_token_accuracy": 0.9923731684684753, + "num_tokens": 13313728.0, + "step": 1510 + }, + { + "entropy": 1.3181545436382294, + "epoch": 5.456108597285068, + "grad_norm": 0.4461587369441986, + "learning_rate": 6.111571803029812e-05, + "loss": 0.0343, + "mean_token_accuracy": 0.9911801367998123, + "num_tokens": 13322684.0, + "step": 1511 + }, + { + "entropy": 1.2889766991138458, + "epoch": 5.4597285067873305, + "grad_norm": 0.41750776767730713, + "learning_rate": 6.098531513613835e-05, + "loss": 0.0145, + "mean_token_accuracy": 0.9957853406667709, + "num_tokens": 13331418.0, + "step": 1512 + }, + { + "entropy": 1.266262024641037, + "epoch": 5.463348416289593, + "grad_norm": 0.7255335450172424, + "learning_rate": 6.085574807569735e-05, + "loss": 0.0351, + "mean_token_accuracy": 0.9877503961324692, + "num_tokens": 13340240.0, + "step": 1513 + }, + { + "entropy": 1.256599485874176, + "epoch": 5.466968325791855, + "grad_norm": 0.2573866844177246, + "learning_rate": 6.0727017341087255e-05, + "loss": 0.0126, + "mean_token_accuracy": 0.9980450570583344, + "num_tokens": 13349288.0, + "step": 1514 + }, + { + "entropy": 1.340590089559555, + "epoch": 5.470588235294118, + "grad_norm": 0.44159454107284546, + "learning_rate": 6.059912342124387e-05, + "loss": 0.0118, + "mean_token_accuracy": 0.9938922077417374, + "num_tokens": 13357944.0, + "step": 1515 + }, + { + "entropy": 1.295377254486084, + "epoch": 5.47420814479638, + "grad_norm": 0.5403456687927246, + "learning_rate": 6.047206680192455e-05, + "loss": 0.0303, + "mean_token_accuracy": 0.9915453940629959, + "num_tokens": 13367003.0, + "step": 1516 + }, + { + "entropy": 1.3225692212581635, + "epoch": 5.477828054298643, + "grad_norm": 0.3978511095046997, + "learning_rate": 6.034584796570654e-05, + "loss": 0.011, + "mean_token_accuracy": 0.9969775825738907, + "num_tokens": 13375217.0, + "step": 1517 + }, + { + "entropy": 1.2819705605506897, + "epoch": 5.481447963800905, + "grad_norm": 0.8367284536361694, + "learning_rate": 6.0220467391985096e-05, + "loss": 0.0433, + "mean_token_accuracy": 0.9867484122514725, + "num_tokens": 13384269.0, + "step": 1518 + }, + { + "entropy": 1.3063046634197235, + "epoch": 5.4850678733031675, + "grad_norm": 0.33846932649612427, + "learning_rate": 6.009592555697155e-05, + "loss": 0.0186, + "mean_token_accuracy": 0.9932365864515305, + "num_tokens": 13392987.0, + "step": 1519 + }, + { + "entropy": 1.340195208787918, + "epoch": 5.48868778280543, + "grad_norm": 0.3321581184864044, + "learning_rate": 5.997222293369176e-05, + "loss": 0.0096, + "mean_token_accuracy": 0.9972854256629944, + "num_tokens": 13401614.0, + "step": 1520 + }, + { + "entropy": 1.2862593233585358, + "epoch": 5.492307692307692, + "grad_norm": 0.3259969651699066, + "learning_rate": 5.9849359991983945e-05, + "loss": 0.016, + "mean_token_accuracy": 0.99678173661232, + "num_tokens": 13410952.0, + "step": 1521 + }, + { + "entropy": 1.2998616695404053, + "epoch": 5.495927601809955, + "grad_norm": 0.3443238139152527, + "learning_rate": 5.9727337198497314e-05, + "loss": 0.0172, + "mean_token_accuracy": 0.9940863698720932, + "num_tokens": 13419611.0, + "step": 1522 + }, + { + "entropy": 1.3606862127780914, + "epoch": 5.499547511312217, + "grad_norm": 0.5264281034469604, + "learning_rate": 5.9606155016689906e-05, + "loss": 0.0302, + "mean_token_accuracy": 0.9862578064203262, + "num_tokens": 13428122.0, + "step": 1523 + }, + { + "entropy": 1.2580232620239258, + "epoch": 5.50316742081448, + "grad_norm": 0.3240465521812439, + "learning_rate": 5.948581390682716e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.9952738732099533, + "num_tokens": 13437203.0, + "step": 1524 + }, + { + "entropy": 1.3060366213321686, + "epoch": 5.506787330316742, + "grad_norm": 0.43022865056991577, + "learning_rate": 5.9366314325979904e-05, + "loss": 0.0216, + "mean_token_accuracy": 0.9942560791969299, + "num_tokens": 13446214.0, + "step": 1525 + }, + { + "entropy": 1.3073553144931793, + "epoch": 5.5104072398190045, + "grad_norm": 0.5377596020698547, + "learning_rate": 5.924765672802276e-05, + "loss": 0.0157, + "mean_token_accuracy": 0.9965550154447556, + "num_tokens": 13455286.0, + "step": 1526 + }, + { + "entropy": 1.390531301498413, + "epoch": 5.514027149321267, + "grad_norm": 0.38995346426963806, + "learning_rate": 5.912984156363248e-05, + "loss": 0.0257, + "mean_token_accuracy": 0.9911804348230362, + "num_tokens": 13464196.0, + "step": 1527 + }, + { + "entropy": 1.3397209644317627, + "epoch": 5.517647058823529, + "grad_norm": 0.3952063024044037, + "learning_rate": 5.9012869280285995e-05, + "loss": 0.0172, + "mean_token_accuracy": 0.9945816844701767, + "num_tokens": 13472944.0, + "step": 1528 + }, + { + "entropy": 1.362727791070938, + "epoch": 5.521266968325792, + "grad_norm": 0.5274761915206909, + "learning_rate": 5.889674032225896e-05, + "loss": 0.0234, + "mean_token_accuracy": 0.991330161690712, + "num_tokens": 13481522.0, + "step": 1529 + }, + { + "entropy": 1.23201984167099, + "epoch": 5.524886877828054, + "grad_norm": 0.4607956111431122, + "learning_rate": 5.878145513062404e-05, + "loss": 0.0146, + "mean_token_accuracy": 0.9957283586263657, + "num_tokens": 13491005.0, + "step": 1530 + }, + { + "entropy": 1.316679060459137, + "epoch": 5.528506787330317, + "grad_norm": 0.2274518609046936, + "learning_rate": 5.866701414324897e-05, + "loss": 0.0102, + "mean_token_accuracy": 0.997354120016098, + "num_tokens": 13500075.0, + "step": 1531 + }, + { + "entropy": 1.3667320013046265, + "epoch": 5.532126696832579, + "grad_norm": 0.4060635268688202, + "learning_rate": 5.855341779479524e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.998924732208252, + "num_tokens": 13508479.0, + "step": 1532 + }, + { + "entropy": 1.3088455200195312, + "epoch": 5.5357466063348415, + "grad_norm": 0.41733798384666443, + "learning_rate": 5.844066651671635e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9942925572395325, + "num_tokens": 13517621.0, + "step": 1533 + }, + { + "entropy": 1.3226538300514221, + "epoch": 5.539366515837104, + "grad_norm": 0.2734561562538147, + "learning_rate": 5.832876073725588e-05, + "loss": 0.0098, + "mean_token_accuracy": 0.9975619912147522, + "num_tokens": 13526783.0, + "step": 1534 + }, + { + "entropy": 1.341841697692871, + "epoch": 5.542986425339366, + "grad_norm": 0.515436053276062, + "learning_rate": 5.8217700881446287e-05, + "loss": 0.035, + "mean_token_accuracy": 0.9904588460922241, + "num_tokens": 13535605.0, + "step": 1535 + }, + { + "entropy": 1.2958484888076782, + "epoch": 5.546606334841629, + "grad_norm": 0.42809170484542847, + "learning_rate": 5.810748737110716e-05, + "loss": 0.0318, + "mean_token_accuracy": 0.9916644841432571, + "num_tokens": 13544957.0, + "step": 1536 + }, + { + "entropy": 1.3874789476394653, + "epoch": 5.550226244343891, + "grad_norm": 0.47145822644233704, + "learning_rate": 5.799812062484332e-05, + "loss": 0.0253, + "mean_token_accuracy": 0.9926209449768066, + "num_tokens": 13553396.0, + "step": 1537 + }, + { + "entropy": 1.2924224734306335, + "epoch": 5.553846153846154, + "grad_norm": 0.4338943064212799, + "learning_rate": 5.788960105804365e-05, + "loss": 0.0202, + "mean_token_accuracy": 0.994803860783577, + "num_tokens": 13562780.0, + "step": 1538 + }, + { + "entropy": 1.2481121718883514, + "epoch": 5.557466063348416, + "grad_norm": 0.4761129915714264, + "learning_rate": 5.778192908287934e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.991680458188057, + "num_tokens": 13571598.0, + "step": 1539 + }, + { + "entropy": 1.3122504651546478, + "epoch": 5.5610859728506785, + "grad_norm": 0.36068788170814514, + "learning_rate": 5.767510510830225e-05, + "loss": 0.0152, + "mean_token_accuracy": 0.9938983470201492, + "num_tokens": 13580081.0, + "step": 1540 + }, + { + "entropy": 1.281277447938919, + "epoch": 5.564705882352941, + "grad_norm": 0.32679954171180725, + "learning_rate": 5.756912954004339e-05, + "loss": 0.0164, + "mean_token_accuracy": 0.9923573285341263, + "num_tokens": 13588940.0, + "step": 1541 + }, + { + "entropy": 1.325427383184433, + "epoch": 5.568325791855203, + "grad_norm": 0.31074655055999756, + "learning_rate": 5.7464002780611554e-05, + "loss": 0.0152, + "mean_token_accuracy": 0.9945929795503616, + "num_tokens": 13597490.0, + "step": 1542 + }, + { + "entropy": 1.3484923243522644, + "epoch": 5.571945701357466, + "grad_norm": 0.32313072681427, + "learning_rate": 5.735972522929157e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.9976039379835129, + "num_tokens": 13606205.0, + "step": 1543 + }, + { + "entropy": 1.2962106764316559, + "epoch": 5.575565610859728, + "grad_norm": 0.608600378036499, + "learning_rate": 5.72562972821428e-05, + "loss": 0.0465, + "mean_token_accuracy": 0.9877003580331802, + "num_tokens": 13615410.0, + "step": 1544 + }, + { + "entropy": 1.3085224330425262, + "epoch": 5.579185520361991, + "grad_norm": 0.3315836489200592, + "learning_rate": 5.71537193319978e-05, + "loss": 0.011, + "mean_token_accuracy": 0.9945812225341797, + "num_tokens": 13624071.0, + "step": 1545 + }, + { + "entropy": 1.3625003397464752, + "epoch": 5.582805429864253, + "grad_norm": 0.6188799738883972, + "learning_rate": 5.705199176846077e-05, + "loss": 0.0251, + "mean_token_accuracy": 0.9927790015935898, + "num_tokens": 13632941.0, + "step": 1546 + }, + { + "entropy": 1.2403265237808228, + "epoch": 5.5864253393665155, + "grad_norm": 0.41247501969337463, + "learning_rate": 5.695111497790583e-05, + "loss": 0.025, + "mean_token_accuracy": 0.9947519451379776, + "num_tokens": 13642094.0, + "step": 1547 + }, + { + "entropy": 1.3287761509418488, + "epoch": 5.590045248868778, + "grad_norm": 0.46822625398635864, + "learning_rate": 5.6851089343475965e-05, + "loss": 0.0244, + "mean_token_accuracy": 0.9927321225404739, + "num_tokens": 13650868.0, + "step": 1548 + }, + { + "entropy": 1.320343554019928, + "epoch": 5.59366515837104, + "grad_norm": 0.22526142001152039, + "learning_rate": 5.6751915245081295e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.9986630976200104, + "num_tokens": 13659770.0, + "step": 1549 + }, + { + "entropy": 1.3084074258804321, + "epoch": 5.597285067873303, + "grad_norm": 0.33844485878944397, + "learning_rate": 5.665359305939765e-05, + "loss": 0.015, + "mean_token_accuracy": 0.9950332492589951, + "num_tokens": 13668345.0, + "step": 1550 + }, + { + "entropy": 1.259777694940567, + "epoch": 5.600904977375565, + "grad_norm": 0.5575007796287537, + "learning_rate": 5.655612315986521e-05, + "loss": 0.0293, + "mean_token_accuracy": 0.99033622443676, + "num_tokens": 13677894.0, + "step": 1551 + }, + { + "entropy": 1.2944768965244293, + "epoch": 5.604524886877828, + "grad_norm": 0.5677646994590759, + "learning_rate": 5.645950591668713e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9926212728023529, + "num_tokens": 13686486.0, + "step": 1552 + }, + { + "entropy": 1.3571326434612274, + "epoch": 5.60814479638009, + "grad_norm": 0.5276592969894409, + "learning_rate": 5.6363741696827954e-05, + "loss": 0.0285, + "mean_token_accuracy": 0.9894280284643173, + "num_tokens": 13695238.0, + "step": 1553 + }, + { + "entropy": 1.2772742807865143, + "epoch": 5.6117647058823525, + "grad_norm": 0.4687597453594208, + "learning_rate": 5.626883086401243e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9940439760684967, + "num_tokens": 13704125.0, + "step": 1554 + }, + { + "entropy": 1.4407296478748322, + "epoch": 5.615384615384615, + "grad_norm": 0.46527454257011414, + "learning_rate": 5.6174773778723984e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9920376837253571, + "num_tokens": 13712398.0, + "step": 1555 + }, + { + "entropy": 1.258561372756958, + "epoch": 5.619004524886877, + "grad_norm": 0.5362602472305298, + "learning_rate": 5.6081570798203395e-05, + "loss": 0.0362, + "mean_token_accuracy": 0.9891482442617416, + "num_tokens": 13721323.0, + "step": 1556 + }, + { + "entropy": 1.4052426517009735, + "epoch": 5.62262443438914, + "grad_norm": 0.8489402532577515, + "learning_rate": 5.598922227644752e-05, + "loss": 0.0306, + "mean_token_accuracy": 0.9931427240371704, + "num_tokens": 13729550.0, + "step": 1557 + }, + { + "entropy": 1.2838713228702545, + "epoch": 5.626244343891402, + "grad_norm": 0.530430257320404, + "learning_rate": 5.5897728564207753e-05, + "loss": 0.0267, + "mean_token_accuracy": 0.9914616346359253, + "num_tokens": 13738387.0, + "step": 1558 + }, + { + "entropy": 1.273788958787918, + "epoch": 5.629864253393665, + "grad_norm": 0.7338762283325195, + "learning_rate": 5.580709000898889e-05, + "loss": 0.0388, + "mean_token_accuracy": 0.9882805198431015, + "num_tokens": 13747392.0, + "step": 1559 + }, + { + "entropy": 1.2639087438583374, + "epoch": 5.633484162895927, + "grad_norm": 0.5340273976325989, + "learning_rate": 5.5717306955047726e-05, + "loss": 0.0312, + "mean_token_accuracy": 0.9922775477170944, + "num_tokens": 13756123.0, + "step": 1560 + }, + { + "entropy": 1.3404139280319214, + "epoch": 5.63710407239819, + "grad_norm": 0.4714674651622772, + "learning_rate": 5.5628379743391724e-05, + "loss": 0.0257, + "mean_token_accuracy": 0.9931610226631165, + "num_tokens": 13764723.0, + "step": 1561 + }, + { + "entropy": 1.1893364191055298, + "epoch": 5.640723981900453, + "grad_norm": 0.39249178767204285, + "learning_rate": 5.55403087117778e-05, + "loss": 0.0222, + "mean_token_accuracy": 0.9925505220890045, + "num_tokens": 13773758.0, + "step": 1562 + }, + { + "entropy": 1.2712668776512146, + "epoch": 5.644343891402715, + "grad_norm": 0.26875993609428406, + "learning_rate": 5.545309419471092e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9961008727550507, + "num_tokens": 13782731.0, + "step": 1563 + }, + { + "entropy": 1.342617392539978, + "epoch": 5.647963800904978, + "grad_norm": 0.28937265276908875, + "learning_rate": 5.536673652344296e-05, + "loss": 0.0108, + "mean_token_accuracy": 0.9980555474758148, + "num_tokens": 13791666.0, + "step": 1564 + }, + { + "entropy": 1.2662743031978607, + "epoch": 5.65158371040724, + "grad_norm": 0.3205937445163727, + "learning_rate": 5.528123602597134e-05, + "loss": 0.0153, + "mean_token_accuracy": 0.9951831251382828, + "num_tokens": 13800594.0, + "step": 1565 + }, + { + "entropy": 1.2666921317577362, + "epoch": 5.655203619909503, + "grad_norm": 0.42374908924102783, + "learning_rate": 5.519659302703785e-05, + "loss": 0.0242, + "mean_token_accuracy": 0.9919591844081879, + "num_tokens": 13809854.0, + "step": 1566 + }, + { + "entropy": 1.2738026678562164, + "epoch": 5.658823529411765, + "grad_norm": 0.3403335213661194, + "learning_rate": 5.511280784812739e-05, + "loss": 0.0101, + "mean_token_accuracy": 0.9977398216724396, + "num_tokens": 13819112.0, + "step": 1567 + }, + { + "entropy": 1.365299105644226, + "epoch": 5.6624434389140275, + "grad_norm": 0.44894078373908997, + "learning_rate": 5.502988080746677e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.993926540017128, + "num_tokens": 13827414.0, + "step": 1568 + }, + { + "entropy": 1.3010685741901398, + "epoch": 5.66606334841629, + "grad_norm": 0.47789883613586426, + "learning_rate": 5.494781222002344e-05, + "loss": 0.0329, + "mean_token_accuracy": 0.9890218079090118, + "num_tokens": 13836114.0, + "step": 1569 + }, + { + "entropy": 1.2826255857944489, + "epoch": 5.669683257918552, + "grad_norm": 0.46785131096839905, + "learning_rate": 5.486660239750434e-05, + "loss": 0.0191, + "mean_token_accuracy": 0.9962319582700729, + "num_tokens": 13845024.0, + "step": 1570 + }, + { + "entropy": 1.3346929252147675, + "epoch": 5.673303167420815, + "grad_norm": 0.47035253047943115, + "learning_rate": 5.478625164835473e-05, + "loss": 0.0195, + "mean_token_accuracy": 0.993961974978447, + "num_tokens": 13853532.0, + "step": 1571 + }, + { + "entropy": 1.3151374459266663, + "epoch": 5.676923076923077, + "grad_norm": 0.5711143016815186, + "learning_rate": 5.4706760277757004e-05, + "loss": 0.0245, + "mean_token_accuracy": 0.9915094375610352, + "num_tokens": 13862088.0, + "step": 1572 + }, + { + "entropy": 1.2389680445194244, + "epoch": 5.68054298642534, + "grad_norm": 0.4790339171886444, + "learning_rate": 5.462812858762956e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.993121325969696, + "num_tokens": 13871182.0, + "step": 1573 + }, + { + "entropy": 1.2746427357196808, + "epoch": 5.684162895927602, + "grad_norm": 0.355680912733078, + "learning_rate": 5.455035687662549e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9955466389656067, + "num_tokens": 13880410.0, + "step": 1574 + }, + { + "entropy": 1.350427269935608, + "epoch": 5.6877828054298645, + "grad_norm": 0.5827673077583313, + "learning_rate": 5.4473445440131725e-05, + "loss": 0.0391, + "mean_token_accuracy": 0.9856820851564407, + "num_tokens": 13889031.0, + "step": 1575 + }, + { + "entropy": 1.1718063652515411, + "epoch": 5.691402714932127, + "grad_norm": 0.6021273136138916, + "learning_rate": 5.439739457026771e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9901014715433121, + "num_tokens": 13898403.0, + "step": 1576 + }, + { + "entropy": 1.2004620283842087, + "epoch": 5.695022624434389, + "grad_norm": 0.5180173516273499, + "learning_rate": 5.432220455588429e-05, + "loss": 0.0172, + "mean_token_accuracy": 0.9942009299993515, + "num_tokens": 13907735.0, + "step": 1577 + }, + { + "entropy": 1.3614135086536407, + "epoch": 5.698642533936652, + "grad_norm": 0.5046451091766357, + "learning_rate": 5.424787568256274e-05, + "loss": 0.0347, + "mean_token_accuracy": 0.9900737702846527, + "num_tokens": 13916137.0, + "step": 1578 + }, + { + "entropy": 1.249350756406784, + "epoch": 5.702262443438914, + "grad_norm": 0.43841880559921265, + "learning_rate": 5.4174408232613654e-05, + "loss": 0.0228, + "mean_token_accuracy": 0.9945852309465408, + "num_tokens": 13925064.0, + "step": 1579 + }, + { + "entropy": 1.3201919496059418, + "epoch": 5.705882352941177, + "grad_norm": 0.4697517454624176, + "learning_rate": 5.4101802485075694e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9947197586297989, + "num_tokens": 13933477.0, + "step": 1580 + }, + { + "entropy": 1.3263331651687622, + "epoch": 5.709502262443439, + "grad_norm": 0.5381503701210022, + "learning_rate": 5.403005871571475e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.992905929684639, + "num_tokens": 13941597.0, + "step": 1581 + }, + { + "entropy": 1.1689225137233734, + "epoch": 5.7131221719457015, + "grad_norm": 0.3916611969470978, + "learning_rate": 5.395917719702284e-05, + "loss": 0.0247, + "mean_token_accuracy": 0.9947749674320221, + "num_tokens": 13950777.0, + "step": 1582 + }, + { + "entropy": 1.3147439360618591, + "epoch": 5.716742081447964, + "grad_norm": 0.42797765135765076, + "learning_rate": 5.388915819821699e-05, + "loss": 0.018, + "mean_token_accuracy": 0.9924831241369247, + "num_tokens": 13959343.0, + "step": 1583 + }, + { + "entropy": 1.272243618965149, + "epoch": 5.720361990950226, + "grad_norm": 1.4143792390823364, + "learning_rate": 5.3820001985238264e-05, + "loss": 0.163, + "mean_token_accuracy": 0.9764289855957031, + "num_tokens": 13968065.0, + "step": 1584 + }, + { + "entropy": 1.2489393651485443, + "epoch": 5.723981900452489, + "grad_norm": 0.6652867197990417, + "learning_rate": 5.3751708820750786e-05, + "loss": 0.0448, + "mean_token_accuracy": 0.9833552837371826, + "num_tokens": 13976528.0, + "step": 1585 + }, + { + "entropy": 1.292122721672058, + "epoch": 5.727601809954751, + "grad_norm": 0.4392347037792206, + "learning_rate": 5.3684278964140704e-05, + "loss": 0.0226, + "mean_token_accuracy": 0.9931960105895996, + "num_tokens": 13985064.0, + "step": 1586 + }, + { + "entropy": 1.2156145870685577, + "epoch": 5.731221719457014, + "grad_norm": 0.21446168422698975, + "learning_rate": 5.361771267151519e-05, + "loss": 0.0094, + "mean_token_accuracy": 0.997468113899231, + "num_tokens": 13994305.0, + "step": 1587 + }, + { + "entropy": 1.2955162227153778, + "epoch": 5.734841628959276, + "grad_norm": 0.4201910197734833, + "learning_rate": 5.355201019570151e-05, + "loss": 0.0124, + "mean_token_accuracy": 0.9944577515125275, + "num_tokens": 14002998.0, + "step": 1588 + }, + { + "entropy": 1.235168069601059, + "epoch": 5.7384615384615385, + "grad_norm": 0.4309161603450775, + "learning_rate": 5.348717178624608e-05, + "loss": 0.0267, + "mean_token_accuracy": 0.9907435774803162, + "num_tokens": 14012266.0, + "step": 1589 + }, + { + "entropy": 1.2108790278434753, + "epoch": 5.742081447963801, + "grad_norm": 0.40157178044319153, + "learning_rate": 5.3423197689413376e-05, + "loss": 0.0094, + "mean_token_accuracy": 0.9969953000545502, + "num_tokens": 14021437.0, + "step": 1590 + }, + { + "entropy": 1.2842464447021484, + "epoch": 5.745701357466063, + "grad_norm": 0.2772553861141205, + "learning_rate": 5.33600881481852e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9935310631990433, + "num_tokens": 14030226.0, + "step": 1591 + }, + { + "entropy": 1.2977232038974762, + "epoch": 5.749321266968326, + "grad_norm": 0.3554794490337372, + "learning_rate": 5.3297843402259654e-05, + "loss": 0.0138, + "mean_token_accuracy": 0.9951223134994507, + "num_tokens": 14038729.0, + "step": 1592 + }, + { + "entropy": 1.323443442583084, + "epoch": 5.752941176470588, + "grad_norm": 0.35086503624916077, + "learning_rate": 5.3236463688050186e-05, + "loss": 0.0181, + "mean_token_accuracy": 0.9949861466884613, + "num_tokens": 14047075.0, + "step": 1593 + }, + { + "entropy": 1.2722989916801453, + "epoch": 5.756561085972851, + "grad_norm": 0.5069295167922974, + "learning_rate": 5.317594923868479e-05, + "loss": 0.026, + "mean_token_accuracy": 0.9922992587089539, + "num_tokens": 14055589.0, + "step": 1594 + }, + { + "entropy": 1.1814655661582947, + "epoch": 5.760180995475113, + "grad_norm": 0.4090133011341095, + "learning_rate": 5.3116300284005036e-05, + "loss": 0.0199, + "mean_token_accuracy": 0.9940384775400162, + "num_tokens": 14064413.0, + "step": 1595 + }, + { + "entropy": 1.2628917694091797, + "epoch": 5.7638009049773755, + "grad_norm": 0.6415342092514038, + "learning_rate": 5.30575170505653e-05, + "loss": 0.0188, + "mean_token_accuracy": 0.994137778878212, + "num_tokens": 14073385.0, + "step": 1596 + }, + { + "entropy": 1.2798434495925903, + "epoch": 5.767420814479638, + "grad_norm": 0.48153477907180786, + "learning_rate": 5.299959976163175e-05, + "loss": 0.0249, + "mean_token_accuracy": 0.9935984015464783, + "num_tokens": 14081896.0, + "step": 1597 + }, + { + "entropy": 1.3011021316051483, + "epoch": 5.7710407239819, + "grad_norm": 0.5275992155075073, + "learning_rate": 5.294254863718164e-05, + "loss": 0.0389, + "mean_token_accuracy": 0.9880504906177521, + "num_tokens": 14090396.0, + "step": 1598 + }, + { + "entropy": 1.2897521257400513, + "epoch": 5.774660633484163, + "grad_norm": 0.6837610602378845, + "learning_rate": 5.288636389390245e-05, + "loss": 0.0363, + "mean_token_accuracy": 0.990441769361496, + "num_tokens": 14099409.0, + "step": 1599 + }, + { + "entropy": 1.280575543642044, + "epoch": 5.778280542986425, + "grad_norm": 0.5229384303092957, + "learning_rate": 5.2831045745190916e-05, + "loss": 0.032, + "mean_token_accuracy": 0.9928101450204849, + "num_tokens": 14107842.0, + "step": 1600 + }, + { + "entropy": 1.1509940326213837, + "epoch": 5.781900452488688, + "grad_norm": 0.4012407660484314, + "learning_rate": 5.277659440115249e-05, + "loss": 0.0271, + "mean_token_accuracy": 0.9918580055236816, + "num_tokens": 14117610.0, + "step": 1601 + }, + { + "entropy": 1.2652796804904938, + "epoch": 5.78552036199095, + "grad_norm": 0.5040006637573242, + "learning_rate": 5.272301006860028e-05, + "loss": 0.0374, + "mean_token_accuracy": 0.987891674041748, + "num_tokens": 14126445.0, + "step": 1602 + }, + { + "entropy": 1.188992828130722, + "epoch": 5.7891402714932125, + "grad_norm": 0.3621242642402649, + "learning_rate": 5.267029295105442e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.992416262626648, + "num_tokens": 14135968.0, + "step": 1603 + }, + { + "entropy": 1.3317647278308868, + "epoch": 5.792760180995475, + "grad_norm": 0.5425340533256531, + "learning_rate": 5.261844324874125e-05, + "loss": 0.024, + "mean_token_accuracy": 0.9912209361791611, + "num_tokens": 14144225.0, + "step": 1604 + }, + { + "entropy": 1.2885065078735352, + "epoch": 5.796380090497737, + "grad_norm": 0.42595869302749634, + "learning_rate": 5.256746115859252e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9963643103837967, + "num_tokens": 14152862.0, + "step": 1605 + }, + { + "entropy": 1.2519879043102264, + "epoch": 5.8, + "grad_norm": 0.4666123390197754, + "learning_rate": 5.251734687424474e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9921929985284805, + "num_tokens": 14161695.0, + "step": 1606 + }, + { + "entropy": 1.29306098818779, + "epoch": 5.803619909502262, + "grad_norm": 0.5787078142166138, + "learning_rate": 5.246810058603832e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9933550506830215, + "num_tokens": 14170254.0, + "step": 1607 + }, + { + "entropy": 1.2271457612514496, + "epoch": 5.807239819004525, + "grad_norm": 0.5058806538581848, + "learning_rate": 5.241972248101695e-05, + "loss": 0.0406, + "mean_token_accuracy": 0.9916260987520218, + "num_tokens": 14179585.0, + "step": 1608 + }, + { + "entropy": 1.2162154912948608, + "epoch": 5.810859728506787, + "grad_norm": 0.2700723111629486, + "learning_rate": 5.237221274292685e-05, + "loss": 0.0103, + "mean_token_accuracy": 0.9945412278175354, + "num_tokens": 14188639.0, + "step": 1609 + }, + { + "entropy": 1.277609944343567, + "epoch": 5.8144796380090495, + "grad_norm": 0.24579264223575592, + "learning_rate": 5.232557155221604e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9991496652364731, + "num_tokens": 14197139.0, + "step": 1610 + }, + { + "entropy": 1.2556023597717285, + "epoch": 5.818099547511312, + "grad_norm": 0.34210360050201416, + "learning_rate": 5.22797990860337e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9959723651409149, + "num_tokens": 14206297.0, + "step": 1611 + }, + { + "entropy": 1.227512389421463, + "epoch": 5.821719457013574, + "grad_norm": 0.3752354085445404, + "learning_rate": 5.223489551822953e-05, + "loss": 0.0225, + "mean_token_accuracy": 0.9938993155956268, + "num_tokens": 14215415.0, + "step": 1612 + }, + { + "entropy": 1.2725732028484344, + "epoch": 5.825339366515837, + "grad_norm": 0.3974504768848419, + "learning_rate": 5.219086101935298e-05, + "loss": 0.0253, + "mean_token_accuracy": 0.9943836182355881, + "num_tokens": 14224716.0, + "step": 1613 + }, + { + "entropy": 1.3655243217945099, + "epoch": 5.828959276018099, + "grad_norm": 0.479657381772995, + "learning_rate": 5.214769575665267e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9938498884439468, + "num_tokens": 14232768.0, + "step": 1614 + }, + { + "entropy": 1.229171484708786, + "epoch": 5.832579185520362, + "grad_norm": 0.46097689867019653, + "learning_rate": 5.21053998940758e-05, + "loss": 0.0316, + "mean_token_accuracy": 0.9871274530887604, + "num_tokens": 14241731.0, + "step": 1615 + }, + { + "entropy": 1.344991534948349, + "epoch": 5.836199095022624, + "grad_norm": 0.7656559348106384, + "learning_rate": 5.206397359226741e-05, + "loss": 0.0385, + "mean_token_accuracy": 0.9900447279214859, + "num_tokens": 14249869.0, + "step": 1616 + }, + { + "entropy": 1.2098518013954163, + "epoch": 5.839819004524887, + "grad_norm": 0.2345188558101654, + "learning_rate": 5.202341700856991e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.996493473649025, + "num_tokens": 14258895.0, + "step": 1617 + }, + { + "entropy": 1.198757380247116, + "epoch": 5.843438914027149, + "grad_norm": 0.3825550675392151, + "learning_rate": 5.198373029702236e-05, + "loss": 0.0231, + "mean_token_accuracy": 0.9951962381601334, + "num_tokens": 14267847.0, + "step": 1618 + }, + { + "entropy": 1.2725798189640045, + "epoch": 5.847058823529411, + "grad_norm": 0.4243534505367279, + "learning_rate": 5.194491360835993e-05, + "loss": 0.0092, + "mean_token_accuracy": 0.9970068633556366, + "num_tokens": 14276527.0, + "step": 1619 + }, + { + "entropy": 1.293479561805725, + "epoch": 5.850678733031674, + "grad_norm": 0.4285407066345215, + "learning_rate": 5.190696709001339e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.9920125603675842, + "num_tokens": 14284981.0, + "step": 1620 + }, + { + "entropy": 1.3100707828998566, + "epoch": 5.854298642533936, + "grad_norm": 0.3621552288532257, + "learning_rate": 5.1869890886108436e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9961602836847305, + "num_tokens": 14293630.0, + "step": 1621 + }, + { + "entropy": 1.226471483707428, + "epoch": 5.857918552036199, + "grad_norm": 0.23275220394134521, + "learning_rate": 5.18336851374652e-05, + "loss": 0.0093, + "mean_token_accuracy": 0.9981594979763031, + "num_tokens": 14302749.0, + "step": 1622 + }, + { + "entropy": 1.2625525295734406, + "epoch": 5.861538461538462, + "grad_norm": 0.5472415089607239, + "learning_rate": 5.179834998159773e-05, + "loss": 0.0286, + "mean_token_accuracy": 0.9924817234277725, + "num_tokens": 14311804.0, + "step": 1623 + }, + { + "entropy": 1.2301190197467804, + "epoch": 5.8651583710407245, + "grad_norm": 0.35090526938438416, + "learning_rate": 5.176388555271348e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9961193799972534, + "num_tokens": 14320919.0, + "step": 1624 + }, + { + "entropy": 1.1986662447452545, + "epoch": 5.868778280542987, + "grad_norm": 0.6269108653068542, + "learning_rate": 5.1730291981712694e-05, + "loss": 0.0349, + "mean_token_accuracy": 0.993238553404808, + "num_tokens": 14330104.0, + "step": 1625 + }, + { + "entropy": 1.2636993825435638, + "epoch": 5.872398190045249, + "grad_norm": 0.3052563965320587, + "learning_rate": 5.1697569396188094e-05, + "loss": 0.0104, + "mean_token_accuracy": 0.9975957870483398, + "num_tokens": 14338756.0, + "step": 1626 + }, + { + "entropy": 1.3073358237743378, + "epoch": 5.876018099547512, + "grad_norm": 0.32827746868133545, + "learning_rate": 5.166571792042419e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9968355000019073, + "num_tokens": 14347190.0, + "step": 1627 + }, + { + "entropy": 1.2305921018123627, + "epoch": 5.879638009049774, + "grad_norm": 0.49389222264289856, + "learning_rate": 5.163473767539694e-05, + "loss": 0.0203, + "mean_token_accuracy": 0.9923485219478607, + "num_tokens": 14355834.0, + "step": 1628 + }, + { + "entropy": 1.1803602278232574, + "epoch": 5.883257918552037, + "grad_norm": 0.316609263420105, + "learning_rate": 5.1604628778773314e-05, + "loss": 0.013, + "mean_token_accuracy": 0.9963434785604477, + "num_tokens": 14365067.0, + "step": 1629 + }, + { + "entropy": 1.2325558066368103, + "epoch": 5.886877828054299, + "grad_norm": 0.4338141083717346, + "learning_rate": 5.1575391344910694e-05, + "loss": 0.0322, + "mean_token_accuracy": 0.9935078620910645, + "num_tokens": 14374350.0, + "step": 1630 + }, + { + "entropy": 1.217078536748886, + "epoch": 5.8904977375565615, + "grad_norm": 0.47768285870552063, + "learning_rate": 5.1547025484856575e-05, + "loss": 0.0317, + "mean_token_accuracy": 0.9924156069755554, + "num_tokens": 14383202.0, + "step": 1631 + }, + { + "entropy": 1.248268723487854, + "epoch": 5.894117647058824, + "grad_norm": 0.3552463948726654, + "learning_rate": 5.151953130634814e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.9952526688575745, + "num_tokens": 14391667.0, + "step": 1632 + }, + { + "entropy": 1.2855578064918518, + "epoch": 5.897737556561086, + "grad_norm": 0.39224013686180115, + "learning_rate": 5.149290891381178e-05, + "loss": 0.0187, + "mean_token_accuracy": 0.9942717403173447, + "num_tokens": 14400702.0, + "step": 1633 + }, + { + "entropy": 1.2877880334854126, + "epoch": 5.901357466063349, + "grad_norm": 0.6350893974304199, + "learning_rate": 5.146715840836271e-05, + "loss": 0.0254, + "mean_token_accuracy": 0.9916457682847977, + "num_tokens": 14409243.0, + "step": 1634 + }, + { + "entropy": 1.26042240858078, + "epoch": 5.904977375565611, + "grad_norm": 0.38120755553245544, + "learning_rate": 5.144227988780463e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.9972520917654037, + "num_tokens": 14418441.0, + "step": 1635 + }, + { + "entropy": 1.2049488723278046, + "epoch": 5.908597285067874, + "grad_norm": 0.35644638538360596, + "learning_rate": 5.141827344662937e-05, + "loss": 0.0137, + "mean_token_accuracy": 0.9954682439565659, + "num_tokens": 14427803.0, + "step": 1636 + }, + { + "entropy": 1.3032657206058502, + "epoch": 5.912217194570136, + "grad_norm": 0.5313483476638794, + "learning_rate": 5.139513917601641e-05, + "loss": 0.029, + "mean_token_accuracy": 0.9916610568761826, + "num_tokens": 14436392.0, + "step": 1637 + }, + { + "entropy": 1.1812008321285248, + "epoch": 5.9158371040723985, + "grad_norm": 0.6506596803665161, + "learning_rate": 5.137287716383269e-05, + "loss": 0.0453, + "mean_token_accuracy": 0.9844148457050323, + "num_tokens": 14445558.0, + "step": 1638 + }, + { + "entropy": 1.2481446266174316, + "epoch": 5.919457013574661, + "grad_norm": 0.2924315333366394, + "learning_rate": 5.135148749463215e-05, + "loss": 0.0092, + "mean_token_accuracy": 0.9970064610242844, + "num_tokens": 14454341.0, + "step": 1639 + }, + { + "entropy": 1.2447481751441956, + "epoch": 5.923076923076923, + "grad_norm": 0.4878501892089844, + "learning_rate": 5.133097024965547e-05, + "loss": 0.0228, + "mean_token_accuracy": 0.9914904683828354, + "num_tokens": 14462934.0, + "step": 1640 + }, + { + "entropy": 1.278237909078598, + "epoch": 5.926696832579186, + "grad_norm": 0.4631216824054718, + "learning_rate": 5.13113255068298e-05, + "loss": 0.0173, + "mean_token_accuracy": 0.9932174235582352, + "num_tokens": 14471582.0, + "step": 1641 + }, + { + "entropy": 1.2570799589157104, + "epoch": 5.930316742081448, + "grad_norm": 0.6310830116271973, + "learning_rate": 5.129255334076834e-05, + "loss": 0.0332, + "mean_token_accuracy": 0.9933324307203293, + "num_tokens": 14480528.0, + "step": 1642 + }, + { + "entropy": 1.1861405670642853, + "epoch": 5.933936651583711, + "grad_norm": 0.2527785003185272, + "learning_rate": 5.127465382277018e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.99566450715065, + "num_tokens": 14490337.0, + "step": 1643 + }, + { + "entropy": 1.3150043785572052, + "epoch": 5.937556561085973, + "grad_norm": 0.5266876816749573, + "learning_rate": 5.125762702081997e-05, + "loss": 0.032, + "mean_token_accuracy": 0.990280419588089, + "num_tokens": 14498480.0, + "step": 1644 + }, + { + "entropy": 1.2823132276535034, + "epoch": 5.9411764705882355, + "grad_norm": 0.31202566623687744, + "learning_rate": 5.1241472999587705e-05, + "loss": 0.0122, + "mean_token_accuracy": 0.9953040480613708, + "num_tokens": 14507224.0, + "step": 1645 + }, + { + "entropy": 1.3113637864589691, + "epoch": 5.944796380090498, + "grad_norm": 0.48932045698165894, + "learning_rate": 5.122619182042835e-05, + "loss": 0.0269, + "mean_token_accuracy": 0.9913112223148346, + "num_tokens": 14516000.0, + "step": 1646 + }, + { + "entropy": 1.2275946736335754, + "epoch": 5.94841628959276, + "grad_norm": 0.5408899188041687, + "learning_rate": 5.121178354138185e-05, + "loss": 0.0291, + "mean_token_accuracy": 0.9919410198926926, + "num_tokens": 14525176.0, + "step": 1647 + }, + { + "entropy": 1.304937094449997, + "epoch": 5.952036199095023, + "grad_norm": 0.46893346309661865, + "learning_rate": 5.119824821717262e-05, + "loss": 0.0278, + "mean_token_accuracy": 0.9904455691576004, + "num_tokens": 14533857.0, + "step": 1648 + }, + { + "entropy": 1.2776793241500854, + "epoch": 5.955656108597285, + "grad_norm": 0.3307070732116699, + "learning_rate": 5.1185585899209566e-05, + "loss": 0.0193, + "mean_token_accuracy": 0.996076837182045, + "num_tokens": 14542389.0, + "step": 1649 + }, + { + "entropy": 1.2690793871879578, + "epoch": 5.959276018099548, + "grad_norm": 0.3710049092769623, + "learning_rate": 5.11737966355858e-05, + "loss": 0.0234, + "mean_token_accuracy": 0.9957760125398636, + "num_tokens": 14551348.0, + "step": 1650 + }, + { + "entropy": 1.3152281641960144, + "epoch": 5.96289592760181, + "grad_norm": 0.5174146294593811, + "learning_rate": 5.116288047107844e-05, + "loss": 0.0276, + "mean_token_accuracy": 0.9932683706283569, + "num_tokens": 14559744.0, + "step": 1651 + }, + { + "entropy": 1.1746317148208618, + "epoch": 5.9665158371040725, + "grad_norm": 0.5684933066368103, + "learning_rate": 5.115283744714846e-05, + "loss": 0.0398, + "mean_token_accuracy": 0.983399361371994, + "num_tokens": 14569026.0, + "step": 1652 + }, + { + "entropy": 1.232101172208786, + "epoch": 5.970135746606335, + "grad_norm": 0.39715829491615295, + "learning_rate": 5.114366760194055e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.9951004087924957, + "num_tokens": 14577579.0, + "step": 1653 + }, + { + "entropy": 1.2994600534439087, + "epoch": 5.973755656108597, + "grad_norm": 0.4262140989303589, + "learning_rate": 5.113537097028295e-05, + "loss": 0.014, + "mean_token_accuracy": 0.9980425387620926, + "num_tokens": 14586149.0, + "step": 1654 + }, + { + "entropy": 1.2196675837039948, + "epoch": 5.97737556561086, + "grad_norm": 0.3029596507549286, + "learning_rate": 5.112794758368734e-05, + "loss": 0.0127, + "mean_token_accuracy": 0.9962086975574493, + "num_tokens": 14594926.0, + "step": 1655 + }, + { + "entropy": 1.1897935271263123, + "epoch": 5.980995475113122, + "grad_norm": 0.45404985547065735, + "learning_rate": 5.112139747034868e-05, + "loss": 0.0263, + "mean_token_accuracy": 0.9904349595308304, + "num_tokens": 14604026.0, + "step": 1656 + }, + { + "entropy": 1.1457158923149109, + "epoch": 5.984615384615385, + "grad_norm": 0.41211384534835815, + "learning_rate": 5.111572065514511e-05, + "loss": 0.0281, + "mean_token_accuracy": 0.9915729612112045, + "num_tokens": 14613949.0, + "step": 1657 + }, + { + "entropy": 1.2796459197998047, + "epoch": 5.988235294117647, + "grad_norm": 0.49562034010887146, + "learning_rate": 5.1110917159637954e-05, + "loss": 0.0229, + "mean_token_accuracy": 0.9938651770353317, + "num_tokens": 14623087.0, + "step": 1658 + }, + { + "entropy": 1.270674616098404, + "epoch": 5.9918552036199095, + "grad_norm": 0.3609181046485901, + "learning_rate": 5.110698700207144e-05, + "loss": 0.0108, + "mean_token_accuracy": 0.9950906783342361, + "num_tokens": 14631854.0, + "step": 1659 + }, + { + "entropy": 1.217082679271698, + "epoch": 5.995475113122172, + "grad_norm": 0.5136594176292419, + "learning_rate": 5.110393019737281e-05, + "loss": 0.0318, + "mean_token_accuracy": 0.992085412144661, + "num_tokens": 14641042.0, + "step": 1660 + }, + { + "entropy": 1.2773634493350983, + "epoch": 5.999095022624434, + "grad_norm": 0.4825091063976288, + "learning_rate": 5.110174675715225e-05, + "loss": 0.0219, + "mean_token_accuracy": 0.9937254637479782, + "num_tokens": 14649634.0, + "step": 1661 + }, + { + "entropy": 1.49472177028656, + "epoch": 6.0, + "grad_norm": 1.9729365110397339, + "learning_rate": 5.1100436689702704e-05, + "loss": 0.0153, + "mean_token_accuracy": 0.9913793206214905, + "num_tokens": 14650350.0, + "step": 1662 + }, + { + "epoch": 6.0, + "eval_entropy": 1.2684073583866522, + "eval_loss": 0.14712285995483398, + "eval_mean_token_accuracy": 0.97200608253479, + "eval_num_tokens": 14650350.0, + "eval_runtime": 116.1766, + "eval_samples_per_second": 3.176, + "eval_steps_per_second": 1.059, + "step": 1662 + } + ], + "logging_steps": 1, + "max_steps": 1662, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.9885354458052244e+18, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1662/training_args.bin b/checkpoint-1662/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..070a2de135e794840c49e066215a1c9f2e550d1f --- /dev/null +++ b/checkpoint-1662/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc271f94ce32216bd6f2ee9866fb7d62a0583bc7ee0c7fa953fa57c302729c6c +size 6289 diff --git a/checkpoint-277/README.md b/checkpoint-277/README.md new file mode 100644 index 0000000000000000000000000000000000000000..58a4061707bcc32db3b543936f6b650c01f3dccb --- /dev/null +++ b/checkpoint-277/README.md @@ -0,0 +1,208 @@ +--- +base_model: openai/gpt-oss-20b +library_name: peft +tags: +- base_model:adapter:openai/gpt-oss-20b +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-277/adapter_config.json b/checkpoint-277/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..076480eaf349cc658de2eb00b26c7360a85f8f56 --- /dev/null +++ b/checkpoint-277/adapter_config.json @@ -0,0 +1,53 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "openai/gpt-oss-20b", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "o_proj", + "q_proj" + ], + "target_parameters": [ + "7.mlp.experts.gate_up_proj", + "7.mlp.experts.down_proj", + "15.mlp.experts.gate_up_proj", + "15.mlp.experts.down_proj", + "23.mlp.experts.gate_up_proj", + "23.mlp.experts.down_proj" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-277/adapter_model.safetensors b/checkpoint-277/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae8f01714e87d71562b38a8bb4d994e99ba48c3f --- /dev/null +++ b/checkpoint-277/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cc157e17c2ffc1fef8d9bdae0802d741b86a51b13c7014b6d54050b2d932806 +size 60189176 diff --git a/checkpoint-277/chat_template.jinja b/checkpoint-277/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc7bb11927d29f653ba2740f2db2c688fd77592f --- /dev/null +++ b/checkpoint-277/chat_template.jinja @@ -0,0 +1,331 @@ +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". + #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties %} + {{- "(_: {\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {%- if param_spec.description %} + {{- "// " + param_spec.description + "\n" }} + {%- endif %} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- ",\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %} + {%- endif %} + {{- model_identity + "\n" }} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {{- "\n\n" }} + {%- endif %} + {%- if tools -%} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {#- Checks to ensure the messages are being passed in the format we expect #} + {%- if "content" in message %} + {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "thinking" in message %} + {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "tool_calls" in message %} + {#- We need very careful handling here - we want to drop the tool call analysis message if the model #} + {#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #} + {#- when we render CoT/analysis messages in inference. #} + {%- set future_final_message = namespace(found=false) %} + {%- for future_message in loop_messages[loop.index:] %} + {%- if future_message.role == 'assistant' and "tool_calls" not in future_message %} + {%- set future_final_message.found = true %} + {%- endif %} + {%- endfor %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content and message.thinking %} + {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }} + {%- elif message.content and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- elif message.thinking and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." + tool_call.name + "<|channel|>commentary " }} + {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {%- if "thinking" in message %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- else %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." + last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- elif message.role == 'user' -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-277/optimizer.pt b/checkpoint-277/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b6dde886cb42fd93ffdeb3ff2a9d42b7553b1ce --- /dev/null +++ b/checkpoint-277/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:832af62e78df49d695db6b48ae8f08caa80678ec4c7e76922a10fa2bc95b2fb6 +size 120498699 diff --git a/checkpoint-277/rng_state.pth b/checkpoint-277/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-277/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-277/scheduler.pt b/checkpoint-277/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a07536f0bd1b98860899ab50e7bb683d26184feb --- /dev/null +++ b/checkpoint-277/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a118aba730ae39b9b4005b28ca0ff44166522e1c08d8d2279fed35a06e5e37a +size 1465 diff --git a/checkpoint-277/special_tokens_map.json b/checkpoint-277/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c47e282982a9c6856832947a72ded329fad2e8c --- /dev/null +++ b/checkpoint-277/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|return|>" +} diff --git a/checkpoint-277/tokenizer.json b/checkpoint-277/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/checkpoint-277/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/checkpoint-277/tokenizer_config.json b/checkpoint-277/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e86f6faa71de0fc3afe47ea8984da9e6138c031c --- /dev/null +++ b/checkpoint-277/tokenizer_config.json @@ -0,0 +1,183 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|return|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-277/trainer_state.json b/checkpoint-277/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6d5b8d8fe3a9f1ea7f0a6251987a20f8bda88599 --- /dev/null +++ b/checkpoint-277/trainer_state.json @@ -0,0 +1,2815 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 277, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.358862280845642, + "epoch": 0.0036199095022624436, + "grad_norm": 2.292628288269043, + "learning_rate": 0.0, + "loss": 0.7311, + "mean_token_accuracy": 0.8534883409738541, + "num_tokens": 9316.0, + "step": 1 + }, + { + "entropy": 2.674945294857025, + "epoch": 0.007239819004524887, + "grad_norm": 3.8950836658477783, + "learning_rate": 1.0219999999999999e-05, + "loss": 1.0621, + "mean_token_accuracy": 0.8183160275220871, + "num_tokens": 17707.0, + "step": 2 + }, + { + "entropy": 2.4915525913238525, + "epoch": 0.01085972850678733, + "grad_norm": 2.792142868041992, + "learning_rate": 2.0439999999999997e-05, + "loss": 0.8448, + "mean_token_accuracy": 0.8489587754011154, + "num_tokens": 26783.0, + "step": 3 + }, + { + "entropy": 2.525622010231018, + "epoch": 0.014479638009049774, + "grad_norm": 2.7071900367736816, + "learning_rate": 3.0659999999999994e-05, + "loss": 0.8847, + "mean_token_accuracy": 0.8486668318510056, + "num_tokens": 35947.0, + "step": 4 + }, + { + "entropy": 2.588509976863861, + "epoch": 0.01809954751131222, + "grad_norm": 2.981574773788452, + "learning_rate": 4.0879999999999995e-05, + "loss": 1.0783, + "mean_token_accuracy": 0.8135111033916473, + "num_tokens": 44505.0, + "step": 5 + }, + { + "entropy": 2.662865400314331, + "epoch": 0.02171945701357466, + "grad_norm": 2.629283905029297, + "learning_rate": 5.1099999999999995e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.8152717798948288, + "num_tokens": 53140.0, + "step": 6 + }, + { + "entropy": 2.6662243604660034, + "epoch": 0.025339366515837104, + "grad_norm": 2.730058431625366, + "learning_rate": 6.131999999999999e-05, + "loss": 0.6982, + "mean_token_accuracy": 0.8552135527133942, + "num_tokens": 61932.0, + "step": 7 + }, + { + "entropy": 2.661384105682373, + "epoch": 0.02895927601809955, + "grad_norm": 2.562839984893799, + "learning_rate": 7.154e-05, + "loss": 0.7296, + "mean_token_accuracy": 0.8579540699720383, + "num_tokens": 70973.0, + "step": 8 + }, + { + "entropy": 2.7889368534088135, + "epoch": 0.03257918552036199, + "grad_norm": 2.8640544414520264, + "learning_rate": 8.175999999999999e-05, + "loss": 0.5965, + "mean_token_accuracy": 0.8638457208871841, + "num_tokens": 79977.0, + "step": 9 + }, + { + "entropy": 2.811532199382782, + "epoch": 0.03619909502262444, + "grad_norm": 2.6199426651000977, + "learning_rate": 9.197999999999998e-05, + "loss": 0.4819, + "mean_token_accuracy": 0.8786454051733017, + "num_tokens": 88915.0, + "step": 10 + }, + { + "entropy": 2.941167712211609, + "epoch": 0.039819004524886875, + "grad_norm": 1.2497272491455078, + "learning_rate": 0.00010219999999999999, + "loss": 0.7192, + "mean_token_accuracy": 0.841494083404541, + "num_tokens": 97749.0, + "step": 11 + }, + { + "entropy": 3.0547962188720703, + "epoch": 0.04343891402714932, + "grad_norm": 1.436136245727539, + "learning_rate": 0.00011241999999999998, + "loss": 0.5908, + "mean_token_accuracy": 0.8657624870538712, + "num_tokens": 106048.0, + "step": 12 + }, + { + "entropy": 2.9914053082466125, + "epoch": 0.047058823529411764, + "grad_norm": 0.9903654456138611, + "learning_rate": 0.00012263999999999998, + "loss": 0.4008, + "mean_token_accuracy": 0.8985499292612076, + "num_tokens": 115216.0, + "step": 13 + }, + { + "entropy": 3.1867465376853943, + "epoch": 0.05067873303167421, + "grad_norm": 1.019572377204895, + "learning_rate": 0.00013286, + "loss": 0.5062, + "mean_token_accuracy": 0.8893097043037415, + "num_tokens": 124040.0, + "step": 14 + }, + { + "entropy": 3.2431325912475586, + "epoch": 0.05429864253393665, + "grad_norm": 1.2394084930419922, + "learning_rate": 0.00014308, + "loss": 0.361, + "mean_token_accuracy": 0.9009967148303986, + "num_tokens": 132447.0, + "step": 15 + }, + { + "entropy": 3.1858643889427185, + "epoch": 0.0579185520361991, + "grad_norm": 0.9859603643417358, + "learning_rate": 0.00015329999999999999, + "loss": 0.4498, + "mean_token_accuracy": 0.887280747294426, + "num_tokens": 141228.0, + "step": 16 + }, + { + "entropy": 3.5029141902923584, + "epoch": 0.06153846153846154, + "grad_norm": 1.453957438468933, + "learning_rate": 0.00016351999999999998, + "loss": 0.4949, + "mean_token_accuracy": 0.888081505894661, + "num_tokens": 149789.0, + "step": 17 + }, + { + "entropy": 3.4572895765304565, + "epoch": 0.06515837104072399, + "grad_norm": 1.390377402305603, + "learning_rate": 0.00017374, + "loss": 0.5449, + "mean_token_accuracy": 0.8745045810937881, + "num_tokens": 157813.0, + "step": 18 + }, + { + "entropy": 3.3081750869750977, + "epoch": 0.06877828054298643, + "grad_norm": 1.1171791553497314, + "learning_rate": 0.00018395999999999997, + "loss": 0.4786, + "mean_token_accuracy": 0.8893420845270157, + "num_tokens": 166315.0, + "step": 19 + }, + { + "entropy": 3.3776715993881226, + "epoch": 0.07239819004524888, + "grad_norm": 1.5567998886108398, + "learning_rate": 0.00019418, + "loss": 0.3669, + "mean_token_accuracy": 0.9146632701158524, + "num_tokens": 175207.0, + "step": 20 + }, + { + "entropy": 3.2677870988845825, + "epoch": 0.0760180995475113, + "grad_norm": 1.7404611110687256, + "learning_rate": 0.00020439999999999998, + "loss": 0.5287, + "mean_token_accuracy": 0.8777483552694321, + "num_tokens": 183833.0, + "step": 21 + }, + { + "entropy": 3.313201069831848, + "epoch": 0.07963800904977375, + "grad_norm": 1.0836979150772095, + "learning_rate": 0.00021461999999999997, + "loss": 0.3014, + "mean_token_accuracy": 0.9215261936187744, + "num_tokens": 192591.0, + "step": 22 + }, + { + "entropy": 3.208672881126404, + "epoch": 0.0832579185520362, + "grad_norm": 1.2197301387786865, + "learning_rate": 0.00022483999999999997, + "loss": 0.4401, + "mean_token_accuracy": 0.9031257778406143, + "num_tokens": 201372.0, + "step": 23 + }, + { + "entropy": 3.1830995082855225, + "epoch": 0.08687782805429864, + "grad_norm": 1.2422229051589966, + "learning_rate": 0.00023506, + "loss": 0.5144, + "mean_token_accuracy": 0.8915928155183792, + "num_tokens": 210348.0, + "step": 24 + }, + { + "entropy": 3.085207223892212, + "epoch": 0.09049773755656108, + "grad_norm": 0.8987624049186707, + "learning_rate": 0.00024527999999999996, + "loss": 0.3253, + "mean_token_accuracy": 0.9221627116203308, + "num_tokens": 219131.0, + "step": 25 + }, + { + "entropy": 3.026031017303467, + "epoch": 0.09411764705882353, + "grad_norm": 1.0273475646972656, + "learning_rate": 0.0002555, + "loss": 0.3495, + "mean_token_accuracy": 0.9147634357213974, + "num_tokens": 228292.0, + "step": 26 + }, + { + "entropy": 3.0420032739639282, + "epoch": 0.09773755656108597, + "grad_norm": 1.0590945482254028, + "learning_rate": 0.00026572, + "loss": 0.4495, + "mean_token_accuracy": 0.9019353687763214, + "num_tokens": 236942.0, + "step": 27 + }, + { + "entropy": 3.0469263792037964, + "epoch": 0.10135746606334842, + "grad_norm": 0.9584959745407104, + "learning_rate": 0.00027594, + "loss": 0.405, + "mean_token_accuracy": 0.9216890782117844, + "num_tokens": 245543.0, + "step": 28 + }, + { + "entropy": 2.92683744430542, + "epoch": 0.10497737556561086, + "grad_norm": 0.8826628923416138, + "learning_rate": 0.00028616, + "loss": 0.4004, + "mean_token_accuracy": 0.9173285663127899, + "num_tokens": 254264.0, + "step": 29 + }, + { + "entropy": 3.0086968541145325, + "epoch": 0.1085972850678733, + "grad_norm": 0.8521863222122192, + "learning_rate": 0.00029637999999999995, + "loss": 0.2876, + "mean_token_accuracy": 0.9335231184959412, + "num_tokens": 263143.0, + "step": 30 + }, + { + "entropy": 2.9086623191833496, + "epoch": 0.11221719457013575, + "grad_norm": 0.7830919623374939, + "learning_rate": 0.00030659999999999997, + "loss": 0.548, + "mean_token_accuracy": 0.8831343650817871, + "num_tokens": 272055.0, + "step": 31 + }, + { + "entropy": 2.9730575680732727, + "epoch": 0.1158371040723982, + "grad_norm": 0.7217472195625305, + "learning_rate": 0.00031682, + "loss": 0.3564, + "mean_token_accuracy": 0.9119151830673218, + "num_tokens": 280971.0, + "step": 32 + }, + { + "entropy": 3.081720530986786, + "epoch": 0.11945701357466064, + "grad_norm": 0.8697704076766968, + "learning_rate": 0.00032703999999999996, + "loss": 0.334, + "mean_token_accuracy": 0.9234935492277145, + "num_tokens": 289449.0, + "step": 33 + }, + { + "entropy": 3.1043431162834167, + "epoch": 0.12307692307692308, + "grad_norm": 0.7962514758110046, + "learning_rate": 0.00033726, + "loss": 0.1602, + "mean_token_accuracy": 0.9554370939731598, + "num_tokens": 297804.0, + "step": 34 + }, + { + "entropy": 3.0275490283966064, + "epoch": 0.12669683257918551, + "grad_norm": 0.5887104272842407, + "learning_rate": 0.00034748, + "loss": 0.2254, + "mean_token_accuracy": 0.9491932094097137, + "num_tokens": 306589.0, + "step": 35 + }, + { + "entropy": 3.099652886390686, + "epoch": 0.13031674208144797, + "grad_norm": 0.894397497177124, + "learning_rate": 0.00035769999999999997, + "loss": 0.6397, + "mean_token_accuracy": 0.8802188038825989, + "num_tokens": 315534.0, + "step": 36 + }, + { + "entropy": 3.0312134623527527, + "epoch": 0.1339366515837104, + "grad_norm": 0.6374682188034058, + "learning_rate": 0.00036791999999999993, + "loss": 0.2183, + "mean_token_accuracy": 0.9478497952222824, + "num_tokens": 324492.0, + "step": 37 + }, + { + "entropy": 3.28497713804245, + "epoch": 0.13755656108597286, + "grad_norm": 0.6740968823432922, + "learning_rate": 0.00037813999999999995, + "loss": 0.3619, + "mean_token_accuracy": 0.9288723170757294, + "num_tokens": 333195.0, + "step": 38 + }, + { + "entropy": 3.1478323340415955, + "epoch": 0.1411764705882353, + "grad_norm": 0.7235494256019592, + "learning_rate": 0.00038836, + "loss": 0.324, + "mean_token_accuracy": 0.9179254025220871, + "num_tokens": 342028.0, + "step": 39 + }, + { + "entropy": 3.279879152774811, + "epoch": 0.14479638009049775, + "grad_norm": 0.7512595653533936, + "learning_rate": 0.00039858, + "loss": 0.4804, + "mean_token_accuracy": 0.889826312661171, + "num_tokens": 350902.0, + "step": 40 + }, + { + "entropy": 3.173546612262726, + "epoch": 0.14841628959276018, + "grad_norm": 0.6978861689567566, + "learning_rate": 0.00040879999999999996, + "loss": 0.3442, + "mean_token_accuracy": 0.9205169230699539, + "num_tokens": 359787.0, + "step": 41 + }, + { + "entropy": 3.2385765314102173, + "epoch": 0.1520361990950226, + "grad_norm": 0.8108944892883301, + "learning_rate": 0.00041901999999999993, + "loss": 0.4223, + "mean_token_accuracy": 0.8979178965091705, + "num_tokens": 368426.0, + "step": 42 + }, + { + "entropy": 3.146568477153778, + "epoch": 0.15565610859728507, + "grad_norm": 0.5847787261009216, + "learning_rate": 0.00042923999999999995, + "loss": 0.1953, + "mean_token_accuracy": 0.9556037336587906, + "num_tokens": 377349.0, + "step": 43 + }, + { + "entropy": 3.066233277320862, + "epoch": 0.1592760180995475, + "grad_norm": 0.7887329459190369, + "learning_rate": 0.00043945999999999997, + "loss": 0.6815, + "mean_token_accuracy": 0.8654293268918991, + "num_tokens": 386603.0, + "step": 44 + }, + { + "entropy": 3.1745981574058533, + "epoch": 0.16289592760180996, + "grad_norm": 0.7280165553092957, + "learning_rate": 0.00044967999999999994, + "loss": 0.1932, + "mean_token_accuracy": 0.9479279220104218, + "num_tokens": 395070.0, + "step": 45 + }, + { + "entropy": 3.1094446182250977, + "epoch": 0.1665158371040724, + "grad_norm": 0.6453448534011841, + "learning_rate": 0.00045989999999999996, + "loss": 0.2608, + "mean_token_accuracy": 0.9249396026134491, + "num_tokens": 403651.0, + "step": 46 + }, + { + "entropy": 2.9050925970077515, + "epoch": 0.17013574660633485, + "grad_norm": 0.6689278483390808, + "learning_rate": 0.00047012, + "loss": 0.4489, + "mean_token_accuracy": 0.898686870932579, + "num_tokens": 412898.0, + "step": 47 + }, + { + "entropy": 3.2239145040512085, + "epoch": 0.17375565610859728, + "grad_norm": 1.0014020204544067, + "learning_rate": 0.00048033999999999994, + "loss": 0.3234, + "mean_token_accuracy": 0.9231891483068466, + "num_tokens": 421420.0, + "step": 48 + }, + { + "entropy": 3.035899817943573, + "epoch": 0.17737556561085974, + "grad_norm": 0.6415768265724182, + "learning_rate": 0.0004905599999999999, + "loss": 0.2259, + "mean_token_accuracy": 0.9447792917490005, + "num_tokens": 430258.0, + "step": 49 + }, + { + "entropy": 3.057477653026581, + "epoch": 0.18099547511312217, + "grad_norm": 0.6042271256446838, + "learning_rate": 0.0005007799999999999, + "loss": 0.2228, + "mean_token_accuracy": 0.9473378211259842, + "num_tokens": 439593.0, + "step": 50 + }, + { + "entropy": 2.8375911116600037, + "epoch": 0.18461538461538463, + "grad_norm": 0.739811897277832, + "learning_rate": 0.000511, + "loss": 0.3623, + "mean_token_accuracy": 0.9050924181938171, + "num_tokens": 449056.0, + "step": 51 + }, + { + "entropy": 2.9926682114601135, + "epoch": 0.18823529411764706, + "grad_norm": 0.6637321710586548, + "learning_rate": 0.0005109995633102972, + "loss": 0.2924, + "mean_token_accuracy": 0.9397273659706116, + "num_tokens": 457677.0, + "step": 52 + }, + { + "entropy": 2.7932987809181213, + "epoch": 0.19185520361990951, + "grad_norm": 0.5666584372520447, + "learning_rate": 0.0005109982532428477, + "loss": 0.2055, + "mean_token_accuracy": 0.9385408014059067, + "num_tokens": 466969.0, + "step": 53 + }, + { + "entropy": 2.765812337398529, + "epoch": 0.19547511312217195, + "grad_norm": 0.7875120639801025, + "learning_rate": 0.0005109960698026271, + "loss": 0.4549, + "mean_token_accuracy": 0.9052814990282059, + "num_tokens": 476285.0, + "step": 54 + }, + { + "entropy": 2.884207248687744, + "epoch": 0.19909502262443438, + "grad_norm": 0.7538661956787109, + "learning_rate": 0.0005109930129979285, + "loss": 0.3751, + "mean_token_accuracy": 0.9210246652364731, + "num_tokens": 484668.0, + "step": 55 + }, + { + "entropy": 2.779718518257141, + "epoch": 0.20271493212669683, + "grad_norm": 0.8069296479225159, + "learning_rate": 0.0005109890828403621, + "loss": 0.3664, + "mean_token_accuracy": 0.9219843596220016, + "num_tokens": 493292.0, + "step": 56 + }, + { + "entropy": 2.841543674468994, + "epoch": 0.20633484162895926, + "grad_norm": 0.5545904636383057, + "learning_rate": 0.0005109842793448548, + "loss": 0.1973, + "mean_token_accuracy": 0.9547395706176758, + "num_tokens": 501973.0, + "step": 57 + }, + { + "entropy": 2.8180030584335327, + "epoch": 0.20995475113122172, + "grad_norm": 1.015456199645996, + "learning_rate": 0.0005109786025296513, + "loss": 0.6019, + "mean_token_accuracy": 0.88613361120224, + "num_tokens": 510840.0, + "step": 58 + }, + { + "entropy": 2.7450912594795227, + "epoch": 0.21357466063348415, + "grad_norm": 0.6784740686416626, + "learning_rate": 0.0005109720524163127, + "loss": 0.2868, + "mean_token_accuracy": 0.9295425117015839, + "num_tokens": 519656.0, + "step": 59 + }, + { + "entropy": 2.822400987148285, + "epoch": 0.2171945701357466, + "grad_norm": 0.8780149817466736, + "learning_rate": 0.000510964629029717, + "loss": 0.4371, + "mean_token_accuracy": 0.9089596569538116, + "num_tokens": 528105.0, + "step": 60 + }, + { + "entropy": 2.522100865840912, + "epoch": 0.22081447963800904, + "grad_norm": 0.51394122838974, + "learning_rate": 0.0005109563323980594, + "loss": 0.2509, + "mean_token_accuracy": 0.941976860165596, + "num_tokens": 537707.0, + "step": 61 + }, + { + "entropy": 2.6596657633781433, + "epoch": 0.2244343891402715, + "grad_norm": 0.6359816789627075, + "learning_rate": 0.0005109471625528516, + "loss": 0.3685, + "mean_token_accuracy": 0.9191890209913254, + "num_tokens": 546517.0, + "step": 62 + }, + { + "entropy": 2.800311803817749, + "epoch": 0.22805429864253393, + "grad_norm": 0.6862941980361938, + "learning_rate": 0.0005109371195289215, + "loss": 0.2457, + "mean_token_accuracy": 0.9330879002809525, + "num_tokens": 555493.0, + "step": 63 + }, + { + "entropy": 2.7235344648361206, + "epoch": 0.2316742081447964, + "grad_norm": 1.0464682579040527, + "learning_rate": 0.0005109262033644142, + "loss": 0.4417, + "mean_token_accuracy": 0.8957678377628326, + "num_tokens": 564255.0, + "step": 64 + }, + { + "entropy": 2.6643534302711487, + "epoch": 0.23529411764705882, + "grad_norm": 1.0790019035339355, + "learning_rate": 0.0005109144141007903, + "loss": 0.4947, + "mean_token_accuracy": 0.8889007717370987, + "num_tokens": 573401.0, + "step": 65 + }, + { + "entropy": 2.760925054550171, + "epoch": 0.23891402714932128, + "grad_norm": 0.7957189679145813, + "learning_rate": 0.0005109017517828273, + "loss": 0.2259, + "mean_token_accuracy": 0.944578230381012, + "num_tokens": 581905.0, + "step": 66 + }, + { + "entropy": 2.7048792839050293, + "epoch": 0.2425339366515837, + "grad_norm": 0.9530714750289917, + "learning_rate": 0.0005108882164586181, + "loss": 0.3122, + "mean_token_accuracy": 0.9257418513298035, + "num_tokens": 590802.0, + "step": 67 + }, + { + "entropy": 2.6733291149139404, + "epoch": 0.24615384615384617, + "grad_norm": 0.8295993208885193, + "learning_rate": 0.0005108738081795716, + "loss": 0.3701, + "mean_token_accuracy": 0.898589238524437, + "num_tokens": 599279.0, + "step": 68 + }, + { + "entropy": 2.5613606572151184, + "epoch": 0.2497737556561086, + "grad_norm": 0.6205935478210449, + "learning_rate": 0.0005108585270004123, + "loss": 0.4372, + "mean_token_accuracy": 0.9116007685661316, + "num_tokens": 608107.0, + "step": 69 + }, + { + "entropy": 2.458296835422516, + "epoch": 0.25339366515837103, + "grad_norm": 0.7629838585853577, + "learning_rate": 0.0005108423729791799, + "loss": 0.2307, + "mean_token_accuracy": 0.9386163502931595, + "num_tokens": 616881.0, + "step": 70 + }, + { + "entropy": 2.4176695346832275, + "epoch": 0.25701357466063346, + "grad_norm": 0.902400016784668, + "learning_rate": 0.0005108253461772298, + "loss": 0.2853, + "mean_token_accuracy": 0.9237343072891235, + "num_tokens": 625323.0, + "step": 71 + }, + { + "entropy": 2.2265281677246094, + "epoch": 0.26063348416289595, + "grad_norm": 0.7744383811950684, + "learning_rate": 0.0005108074466592316, + "loss": 0.2435, + "mean_token_accuracy": 0.9508260935544968, + "num_tokens": 634260.0, + "step": 72 + }, + { + "entropy": 2.1855952441692352, + "epoch": 0.2642533936651584, + "grad_norm": 0.8615190386772156, + "learning_rate": 0.0005107886744931702, + "loss": 0.3323, + "mean_token_accuracy": 0.9276078194379807, + "num_tokens": 643235.0, + "step": 73 + }, + { + "entropy": 2.179121494293213, + "epoch": 0.2678733031674208, + "grad_norm": 0.8953279256820679, + "learning_rate": 0.0005107690297503444, + "loss": 0.2384, + "mean_token_accuracy": 0.9425230622291565, + "num_tokens": 652032.0, + "step": 74 + }, + { + "entropy": 2.1565526127815247, + "epoch": 0.27149321266968324, + "grad_norm": 0.6830486059188843, + "learning_rate": 0.0005107485125053678, + "loss": 0.2759, + "mean_token_accuracy": 0.9360661953687668, + "num_tokens": 660978.0, + "step": 75 + }, + { + "entropy": 2.0900665521621704, + "epoch": 0.2751131221719457, + "grad_norm": 0.786665141582489, + "learning_rate": 0.0005107271228361672, + "loss": 0.4061, + "mean_token_accuracy": 0.910009115934372, + "num_tokens": 669817.0, + "step": 76 + }, + { + "entropy": 2.1311859488487244, + "epoch": 0.27873303167420815, + "grad_norm": 0.6399909853935242, + "learning_rate": 0.0005107048608239836, + "loss": 0.272, + "mean_token_accuracy": 0.9424714297056198, + "num_tokens": 678469.0, + "step": 77 + }, + { + "entropy": 2.059997320175171, + "epoch": 0.2823529411764706, + "grad_norm": 0.8114754557609558, + "learning_rate": 0.0005106817265533706, + "loss": 0.4029, + "mean_token_accuracy": 0.9037660360336304, + "num_tokens": 687261.0, + "step": 78 + }, + { + "entropy": 1.9725019037723541, + "epoch": 0.285972850678733, + "grad_norm": 0.9420941472053528, + "learning_rate": 0.0005106577201121952, + "loss": 0.535, + "mean_token_accuracy": 0.8996377140283585, + "num_tokens": 695941.0, + "step": 79 + }, + { + "entropy": 1.9951164424419403, + "epoch": 0.2895927601809955, + "grad_norm": 0.6476142406463623, + "learning_rate": 0.0005106328415916372, + "loss": 0.2242, + "mean_token_accuracy": 0.941379725933075, + "num_tokens": 704643.0, + "step": 80 + }, + { + "entropy": 1.8962564170360565, + "epoch": 0.29321266968325793, + "grad_norm": 0.5974630117416382, + "learning_rate": 0.0005106070910861881, + "loss": 0.2934, + "mean_token_accuracy": 0.9217697530984879, + "num_tokens": 713605.0, + "step": 81 + }, + { + "entropy": 1.9781515896320343, + "epoch": 0.29683257918552036, + "grad_norm": 0.8755478262901306, + "learning_rate": 0.0005105804686936518, + "loss": 0.4551, + "mean_token_accuracy": 0.9051328897476196, + "num_tokens": 722385.0, + "step": 82 + }, + { + "entropy": 1.9892418384552002, + "epoch": 0.3004524886877828, + "grad_norm": 0.6887345314025879, + "learning_rate": 0.0005105529745151433, + "loss": 0.244, + "mean_token_accuracy": 0.9261117279529572, + "num_tokens": 730962.0, + "step": 83 + }, + { + "entropy": 2.0053181648254395, + "epoch": 0.3040723981900452, + "grad_norm": 0.6930885910987854, + "learning_rate": 0.0005105246086550893, + "loss": 0.3155, + "mean_token_accuracy": 0.9206147193908691, + "num_tokens": 739499.0, + "step": 84 + }, + { + "entropy": 1.9716475903987885, + "epoch": 0.3076923076923077, + "grad_norm": 0.5049461722373962, + "learning_rate": 0.0005104953712212266, + "loss": 0.2215, + "mean_token_accuracy": 0.9608763605356216, + "num_tokens": 748604.0, + "step": 85 + }, + { + "entropy": 1.9186978042125702, + "epoch": 0.31131221719457014, + "grad_norm": 0.5756685733795166, + "learning_rate": 0.000510465262324603, + "loss": 0.2658, + "mean_token_accuracy": 0.9372887462377548, + "num_tokens": 757919.0, + "step": 86 + }, + { + "entropy": 1.9738290905952454, + "epoch": 0.31493212669683257, + "grad_norm": 0.6163789629936218, + "learning_rate": 0.0005104342820795758, + "loss": 0.2472, + "mean_token_accuracy": 0.9430449157953262, + "num_tokens": 766708.0, + "step": 87 + }, + { + "entropy": 2.1927571892738342, + "epoch": 0.318552036199095, + "grad_norm": 0.7953162789344788, + "learning_rate": 0.0005104024306038119, + "loss": 0.261, + "mean_token_accuracy": 0.9425829648971558, + "num_tokens": 774601.0, + "step": 88 + }, + { + "entropy": 2.043731451034546, + "epoch": 0.3221719457013575, + "grad_norm": 0.8098088502883911, + "learning_rate": 0.0005103697080182872, + "loss": 0.3126, + "mean_token_accuracy": 0.9158089309930801, + "num_tokens": 783170.0, + "step": 89 + }, + { + "entropy": 1.9801572561264038, + "epoch": 0.3257918552036199, + "grad_norm": 0.5227240920066833, + "learning_rate": 0.0005103361144472864, + "loss": 0.1291, + "mean_token_accuracy": 0.9666071832180023, + "num_tokens": 791769.0, + "step": 90 + }, + { + "entropy": 1.9553790986537933, + "epoch": 0.32941176470588235, + "grad_norm": 0.7819464206695557, + "learning_rate": 0.0005103016500184022, + "loss": 0.531, + "mean_token_accuracy": 0.8817111849784851, + "num_tokens": 800824.0, + "step": 91 + }, + { + "entropy": 1.9291303753852844, + "epoch": 0.3330316742081448, + "grad_norm": 0.7178757190704346, + "learning_rate": 0.0005102663148625347, + "loss": 0.3301, + "mean_token_accuracy": 0.9357631802558899, + "num_tokens": 809347.0, + "step": 92 + }, + { + "entropy": 1.9846041798591614, + "epoch": 0.33665158371040727, + "grad_norm": 1.316636085510254, + "learning_rate": 0.0005102301091138916, + "loss": 0.4241, + "mean_token_accuracy": 0.8993304669857025, + "num_tokens": 817174.0, + "step": 93 + }, + { + "entropy": 1.814637303352356, + "epoch": 0.3402714932126697, + "grad_norm": 0.5486414432525635, + "learning_rate": 0.0005101930329099865, + "loss": 0.116, + "mean_token_accuracy": 0.9674727618694305, + "num_tokens": 826177.0, + "step": 94 + }, + { + "entropy": 1.9128066003322601, + "epoch": 0.3438914027149321, + "grad_norm": 0.620303750038147, + "learning_rate": 0.00051015508639164, + "loss": 0.1833, + "mean_token_accuracy": 0.9569521993398666, + "num_tokens": 835409.0, + "step": 95 + }, + { + "entropy": 1.7541870176792145, + "epoch": 0.34751131221719456, + "grad_norm": 0.8337438702583313, + "learning_rate": 0.0005101162697029776, + "loss": 0.3327, + "mean_token_accuracy": 0.9193180054426193, + "num_tokens": 844692.0, + "step": 96 + }, + { + "entropy": 1.8255240619182587, + "epoch": 0.351131221719457, + "grad_norm": 0.877780556678772, + "learning_rate": 0.00051007658299143, + "loss": 0.2106, + "mean_token_accuracy": 0.9527023881673813, + "num_tokens": 853309.0, + "step": 97 + }, + { + "entropy": 1.8611579239368439, + "epoch": 0.3547511312217195, + "grad_norm": 1.0667716264724731, + "learning_rate": 0.0005100360264077325, + "loss": 0.3196, + "mean_token_accuracy": 0.9195879399776459, + "num_tokens": 861859.0, + "step": 98 + }, + { + "entropy": 1.821915864944458, + "epoch": 0.3583710407239819, + "grad_norm": 0.8400309681892395, + "learning_rate": 0.0005099946001059241, + "loss": 0.4036, + "mean_token_accuracy": 0.8951036781072617, + "num_tokens": 871060.0, + "step": 99 + }, + { + "entropy": 1.7648265063762665, + "epoch": 0.36199095022624433, + "grad_norm": 1.1391404867172241, + "learning_rate": 0.0005099523042433472, + "loss": 0.389, + "mean_token_accuracy": 0.901309460401535, + "num_tokens": 880593.0, + "step": 100 + }, + { + "entropy": 1.8506875336170197, + "epoch": 0.36561085972850677, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.000509909138980647, + "loss": 0.2504, + "mean_token_accuracy": 0.9384842216968536, + "num_tokens": 889739.0, + "step": 101 + }, + { + "entropy": 1.9311015605926514, + "epoch": 0.36923076923076925, + "grad_norm": 0.9677391052246094, + "learning_rate": 0.0005098651044817704, + "loss": 0.6953, + "mean_token_accuracy": 0.8752655684947968, + "num_tokens": 898992.0, + "step": 102 + }, + { + "entropy": 1.9590983986854553, + "epoch": 0.3728506787330317, + "grad_norm": 0.6364567279815674, + "learning_rate": 0.0005098202009139663, + "loss": 0.4318, + "mean_token_accuracy": 0.9056479930877686, + "num_tokens": 908225.0, + "step": 103 + }, + { + "entropy": 1.9455370008945465, + "epoch": 0.3764705882352941, + "grad_norm": 0.6747863292694092, + "learning_rate": 0.0005097744284477839, + "loss": 0.244, + "mean_token_accuracy": 0.9428392052650452, + "num_tokens": 917134.0, + "step": 104 + }, + { + "entropy": 1.8632825911045074, + "epoch": 0.38009049773755654, + "grad_norm": 0.5705651044845581, + "learning_rate": 0.0005097277872570731, + "loss": 0.2508, + "mean_token_accuracy": 0.9325222969055176, + "num_tokens": 926573.0, + "step": 105 + }, + { + "entropy": 1.9370323717594147, + "epoch": 0.38371040723981903, + "grad_norm": 0.6298627853393555, + "learning_rate": 0.000509680277518983, + "loss": 0.2481, + "mean_token_accuracy": 0.9281332045793533, + "num_tokens": 935853.0, + "step": 106 + }, + { + "entropy": 2.0217572450637817, + "epoch": 0.38733031674208146, + "grad_norm": 0.5434353947639465, + "learning_rate": 0.0005096318994139617, + "loss": 0.1809, + "mean_token_accuracy": 0.9592084139585495, + "num_tokens": 944279.0, + "step": 107 + }, + { + "entropy": 1.9619770646095276, + "epoch": 0.3909502262443439, + "grad_norm": 0.6959638595581055, + "learning_rate": 0.0005095826531257552, + "loss": 0.1376, + "mean_token_accuracy": 0.9608310014009476, + "num_tokens": 953336.0, + "step": 108 + }, + { + "entropy": 2.12511146068573, + "epoch": 0.3945701357466063, + "grad_norm": 1.0152848958969116, + "learning_rate": 0.0005095325388414074, + "loss": 0.4382, + "mean_token_accuracy": 0.915201798081398, + "num_tokens": 962002.0, + "step": 109 + }, + { + "entropy": 2.0171878039836884, + "epoch": 0.39819004524886875, + "grad_norm": 0.8337467312812805, + "learning_rate": 0.0005094815567512587, + "loss": 0.2672, + "mean_token_accuracy": 0.9313560128211975, + "num_tokens": 970954.0, + "step": 110 + }, + { + "entropy": 2.1024146378040314, + "epoch": 0.40180995475113124, + "grad_norm": 0.8214333057403564, + "learning_rate": 0.0005094297070489455, + "loss": 0.3146, + "mean_token_accuracy": 0.9289091974496841, + "num_tokens": 979929.0, + "step": 111 + }, + { + "entropy": 2.260519325733185, + "epoch": 0.40542986425339367, + "grad_norm": 1.1298810243606567, + "learning_rate": 0.0005093769899313996, + "loss": 0.3055, + "mean_token_accuracy": 0.9213490188121796, + "num_tokens": 988477.0, + "step": 112 + }, + { + "entropy": 2.2228699326515198, + "epoch": 0.4090497737556561, + "grad_norm": 0.8601953983306885, + "learning_rate": 0.0005093234055988475, + "loss": 0.2738, + "mean_token_accuracy": 0.920888364315033, + "num_tokens": 997091.0, + "step": 113 + }, + { + "entropy": 2.2165185809135437, + "epoch": 0.41266968325791853, + "grad_norm": 0.6331561803817749, + "learning_rate": 0.0005092689542548091, + "loss": 0.2241, + "mean_token_accuracy": 0.9408514499664307, + "num_tokens": 1005866.0, + "step": 114 + }, + { + "entropy": 2.324040472507477, + "epoch": 0.416289592760181, + "grad_norm": 0.680496096611023, + "learning_rate": 0.0005092136361060975, + "loss": 0.2454, + "mean_token_accuracy": 0.9433349967002869, + "num_tokens": 1014277.0, + "step": 115 + }, + { + "entropy": 2.413789749145508, + "epoch": 0.41990950226244345, + "grad_norm": 0.7489557862281799, + "learning_rate": 0.0005091574513628183, + "loss": 0.2856, + "mean_token_accuracy": 0.934124082326889, + "num_tokens": 1023032.0, + "step": 116 + }, + { + "entropy": 2.4693005681037903, + "epoch": 0.4235294117647059, + "grad_norm": 0.6842612624168396, + "learning_rate": 0.0005091004002383682, + "loss": 0.2778, + "mean_token_accuracy": 0.9386793673038483, + "num_tokens": 1031883.0, + "step": 117 + }, + { + "entropy": 2.4351969361305237, + "epoch": 0.4271493212669683, + "grad_norm": 0.9150674343109131, + "learning_rate": 0.0005090424829494347, + "loss": 0.3151, + "mean_token_accuracy": 0.9177709072828293, + "num_tokens": 1040985.0, + "step": 118 + }, + { + "entropy": 2.5141562819480896, + "epoch": 0.4307692307692308, + "grad_norm": 1.0200655460357666, + "learning_rate": 0.000508983699715995, + "loss": 0.5134, + "mean_token_accuracy": 0.8835459351539612, + "num_tokens": 1049949.0, + "step": 119 + }, + { + "entropy": 2.479240596294403, + "epoch": 0.4343891402714932, + "grad_norm": 0.783278226852417, + "learning_rate": 0.0005089240507613151, + "loss": 0.2745, + "mean_token_accuracy": 0.9389322698116302, + "num_tokens": 1058953.0, + "step": 120 + }, + { + "entropy": 2.457803785800934, + "epoch": 0.43800904977375565, + "grad_norm": 0.7620834112167358, + "learning_rate": 0.0005088635363119497, + "loss": 0.3394, + "mean_token_accuracy": 0.9145695865154266, + "num_tokens": 1068624.0, + "step": 121 + }, + { + "entropy": 2.4909247756004333, + "epoch": 0.4416289592760181, + "grad_norm": 0.5868712067604065, + "learning_rate": 0.0005088021565977403, + "loss": 0.1726, + "mean_token_accuracy": 0.9567564129829407, + "num_tokens": 1077686.0, + "step": 122 + }, + { + "entropy": 2.5540462732315063, + "epoch": 0.4452488687782805, + "grad_norm": 1.1467291116714478, + "learning_rate": 0.0005087399118518148, + "loss": 0.2617, + "mean_token_accuracy": 0.9329706132411957, + "num_tokens": 1086230.0, + "step": 123 + }, + { + "entropy": 2.377680242061615, + "epoch": 0.448868778280543, + "grad_norm": 0.7021825909614563, + "learning_rate": 0.0005086768023105866, + "loss": 0.4124, + "mean_token_accuracy": 0.9093360006809235, + "num_tokens": 1095867.0, + "step": 124 + }, + { + "entropy": 2.55239599943161, + "epoch": 0.45248868778280543, + "grad_norm": 0.5947801470756531, + "learning_rate": 0.0005086128282137538, + "loss": 0.2752, + "mean_token_accuracy": 0.9248816668987274, + "num_tokens": 1105003.0, + "step": 125 + }, + { + "entropy": 2.4695483446121216, + "epoch": 0.45610859728506786, + "grad_norm": 1.345604658126831, + "learning_rate": 0.0005085479898042985, + "loss": 0.2577, + "mean_token_accuracy": 0.9318550229072571, + "num_tokens": 1114162.0, + "step": 126 + }, + { + "entropy": 2.4898732900619507, + "epoch": 0.4597285067873303, + "grad_norm": 0.8534179329872131, + "learning_rate": 0.0005084822873284848, + "loss": 0.3013, + "mean_token_accuracy": 0.9195661097764969, + "num_tokens": 1123457.0, + "step": 127 + }, + { + "entropy": 2.5951223969459534, + "epoch": 0.4633484162895928, + "grad_norm": 1.1677368879318237, + "learning_rate": 0.0005084157210358592, + "loss": 0.1612, + "mean_token_accuracy": 0.9599333852529526, + "num_tokens": 1131774.0, + "step": 128 + }, + { + "entropy": 2.7315847873687744, + "epoch": 0.4669683257918552, + "grad_norm": 0.7633224129676819, + "learning_rate": 0.0005083482911792492, + "loss": 0.2437, + "mean_token_accuracy": 0.9487509876489639, + "num_tokens": 1140301.0, + "step": 129 + }, + { + "entropy": 2.6348633766174316, + "epoch": 0.47058823529411764, + "grad_norm": 0.7573317885398865, + "learning_rate": 0.0005082799980147617, + "loss": 0.2426, + "mean_token_accuracy": 0.947308748960495, + "num_tokens": 1148929.0, + "step": 130 + }, + { + "entropy": 2.60002738237381, + "epoch": 0.47420814479638007, + "grad_norm": 1.8195319175720215, + "learning_rate": 0.0005082108418017829, + "loss": 0.1792, + "mean_token_accuracy": 0.9512491375207901, + "num_tokens": 1157682.0, + "step": 131 + }, + { + "entropy": 2.5319923162460327, + "epoch": 0.47782805429864256, + "grad_norm": 0.6342993378639221, + "learning_rate": 0.0005081408228029771, + "loss": 0.1843, + "mean_token_accuracy": 0.9440758228302002, + "num_tokens": 1166687.0, + "step": 132 + }, + { + "entropy": 2.5666881799697876, + "epoch": 0.481447963800905, + "grad_norm": 0.8979415893554688, + "learning_rate": 0.0005080699412842852, + "loss": 0.4824, + "mean_token_accuracy": 0.8837443292140961, + "num_tokens": 1175746.0, + "step": 133 + }, + { + "entropy": 2.6854636669158936, + "epoch": 0.4850678733031674, + "grad_norm": 0.8302125334739685, + "learning_rate": 0.0005079981975149243, + "loss": 0.267, + "mean_token_accuracy": 0.9279022663831711, + "num_tokens": 1184196.0, + "step": 134 + }, + { + "entropy": 2.564552128314972, + "epoch": 0.48868778280542985, + "grad_norm": 0.6785959005355835, + "learning_rate": 0.0005079255917673863, + "loss": 0.2031, + "mean_token_accuracy": 0.9463823586702347, + "num_tokens": 1192982.0, + "step": 135 + }, + { + "entropy": 2.673682928085327, + "epoch": 0.49230769230769234, + "grad_norm": 1.4760410785675049, + "learning_rate": 0.0005078521243174371, + "loss": 0.4791, + "mean_token_accuracy": 0.8969505727291107, + "num_tokens": 1201454.0, + "step": 136 + }, + { + "entropy": 2.6232714653015137, + "epoch": 0.49592760180995477, + "grad_norm": 0.7845668792724609, + "learning_rate": 0.0005077777954441157, + "loss": 0.2472, + "mean_token_accuracy": 0.9404618591070175, + "num_tokens": 1210182.0, + "step": 137 + }, + { + "entropy": 2.5614060163497925, + "epoch": 0.4995475113122172, + "grad_norm": 0.725419819355011, + "learning_rate": 0.0005077026054297322, + "loss": 0.3643, + "mean_token_accuracy": 0.9193316847085953, + "num_tokens": 1219487.0, + "step": 138 + }, + { + "entropy": 2.5907246470451355, + "epoch": 0.5031674208144796, + "grad_norm": 0.7741782665252686, + "learning_rate": 0.0005076265545598682, + "loss": 0.276, + "mean_token_accuracy": 0.9447730481624603, + "num_tokens": 1228066.0, + "step": 139 + }, + { + "entropy": 2.531104028224945, + "epoch": 0.5067873303167421, + "grad_norm": 0.680992603302002, + "learning_rate": 0.0005075496431233745, + "loss": 0.2004, + "mean_token_accuracy": 0.9470729678869247, + "num_tokens": 1236980.0, + "step": 140 + }, + { + "entropy": 2.590231478214264, + "epoch": 0.5104072398190045, + "grad_norm": 0.8260406255722046, + "learning_rate": 0.0005074718714123704, + "loss": 0.2756, + "mean_token_accuracy": 0.9301882535219193, + "num_tokens": 1245565.0, + "step": 141 + }, + { + "entropy": 2.4858668446540833, + "epoch": 0.5140271493212669, + "grad_norm": 0.8085922598838806, + "learning_rate": 0.0005073932397222429, + "loss": 0.2314, + "mean_token_accuracy": 0.9449103325605392, + "num_tokens": 1254366.0, + "step": 142 + }, + { + "entropy": 2.5374304056167603, + "epoch": 0.5176470588235295, + "grad_norm": 0.7858129143714905, + "learning_rate": 0.0005073137483516452, + "loss": 0.1622, + "mean_token_accuracy": 0.9510673582553864, + "num_tokens": 1263197.0, + "step": 143 + }, + { + "entropy": 2.608425199985504, + "epoch": 0.5212669683257919, + "grad_norm": 1.2698506116867065, + "learning_rate": 0.0005072333976024957, + "loss": 0.1729, + "mean_token_accuracy": 0.9509973376989365, + "num_tokens": 1271725.0, + "step": 144 + }, + { + "entropy": 2.437038242816925, + "epoch": 0.5248868778280543, + "grad_norm": 1.0788538455963135, + "learning_rate": 0.0005071521877799765, + "loss": 0.3344, + "mean_token_accuracy": 0.9166721999645233, + "num_tokens": 1280963.0, + "step": 145 + }, + { + "entropy": 2.589951515197754, + "epoch": 0.5285067873303168, + "grad_norm": 0.9228294491767883, + "learning_rate": 0.0005070701191925332, + "loss": 0.3095, + "mean_token_accuracy": 0.9239777624607086, + "num_tokens": 1289683.0, + "step": 146 + }, + { + "entropy": 2.575794994831085, + "epoch": 0.5321266968325792, + "grad_norm": 1.359767198562622, + "learning_rate": 0.0005069871921518726, + "loss": 0.2447, + "mean_token_accuracy": 0.9374738186597824, + "num_tokens": 1298397.0, + "step": 147 + }, + { + "entropy": 2.5628358721733093, + "epoch": 0.5357466063348416, + "grad_norm": 0.9870713353157043, + "learning_rate": 0.000506903406972962, + "loss": 0.4824, + "mean_token_accuracy": 0.9027767181396484, + "num_tokens": 1307191.0, + "step": 148 + }, + { + "entropy": 2.5513240098953247, + "epoch": 0.539366515837104, + "grad_norm": 0.7921387553215027, + "learning_rate": 0.0005068187639740286, + "loss": 0.3278, + "mean_token_accuracy": 0.9161934554576874, + "num_tokens": 1315878.0, + "step": 149 + }, + { + "entropy": 2.526439070701599, + "epoch": 0.5429864253393665, + "grad_norm": 0.6320391297340393, + "learning_rate": 0.000506733263476557, + "loss": 0.1701, + "mean_token_accuracy": 0.9575318098068237, + "num_tokens": 1324786.0, + "step": 150 + }, + { + "entropy": 2.4837265014648438, + "epoch": 0.5466063348416289, + "grad_norm": 0.5369354486465454, + "learning_rate": 0.000506646905805289, + "loss": 0.1328, + "mean_token_accuracy": 0.9636050164699554, + "num_tokens": 1333766.0, + "step": 151 + }, + { + "entropy": 2.5264737010002136, + "epoch": 0.5502262443438914, + "grad_norm": 0.7346852421760559, + "learning_rate": 0.0005065596912882222, + "loss": 0.2012, + "mean_token_accuracy": 0.9448132663965225, + "num_tokens": 1343004.0, + "step": 152 + }, + { + "entropy": 2.569309651851654, + "epoch": 0.5538461538461539, + "grad_norm": 0.9926508069038391, + "learning_rate": 0.0005064716202566082, + "loss": 0.2831, + "mean_token_accuracy": 0.9332023113965988, + "num_tokens": 1351561.0, + "step": 153 + }, + { + "entropy": 2.3148274421691895, + "epoch": 0.5574660633484163, + "grad_norm": 0.6301954984664917, + "learning_rate": 0.0005063826930449523, + "loss": 0.3622, + "mean_token_accuracy": 0.9349419325590134, + "num_tokens": 1360997.0, + "step": 154 + }, + { + "entropy": 2.497675657272339, + "epoch": 0.5610859728506787, + "grad_norm": 0.8846175670623779, + "learning_rate": 0.000506292909991011, + "loss": 0.2314, + "mean_token_accuracy": 0.9468862265348434, + "num_tokens": 1369600.0, + "step": 155 + }, + { + "entropy": 2.313987612724304, + "epoch": 0.5647058823529412, + "grad_norm": 0.5701894164085388, + "learning_rate": 0.0005062022714357922, + "loss": 0.2154, + "mean_token_accuracy": 0.945093959569931, + "num_tokens": 1379125.0, + "step": 156 + }, + { + "entropy": 2.4019755125045776, + "epoch": 0.5683257918552036, + "grad_norm": 0.8769335746765137, + "learning_rate": 0.0005061107777235524, + "loss": 0.3565, + "mean_token_accuracy": 0.9133864492177963, + "num_tokens": 1388111.0, + "step": 157 + }, + { + "entropy": 2.3127577900886536, + "epoch": 0.571945701357466, + "grad_norm": 1.1026453971862793, + "learning_rate": 0.0005060184292017965, + "loss": 0.2897, + "mean_token_accuracy": 0.899736076593399, + "num_tokens": 1397528.0, + "step": 158 + }, + { + "entropy": 2.2682697772979736, + "epoch": 0.5755656108597285, + "grad_norm": 0.5426591038703918, + "learning_rate": 0.000505925226221276, + "loss": 0.167, + "mean_token_accuracy": 0.9609879851341248, + "num_tokens": 1406809.0, + "step": 159 + }, + { + "entropy": 2.4639336466789246, + "epoch": 0.579185520361991, + "grad_norm": 0.6552363038063049, + "learning_rate": 0.0005058311691359875, + "loss": 0.2511, + "mean_token_accuracy": 0.9355164766311646, + "num_tokens": 1415498.0, + "step": 160 + }, + { + "entropy": 2.467900663614273, + "epoch": 0.5828054298642534, + "grad_norm": 0.7168154120445251, + "learning_rate": 0.000505736258303172, + "loss": 0.234, + "mean_token_accuracy": 0.9450509995222092, + "num_tokens": 1424524.0, + "step": 161 + }, + { + "entropy": 2.3683157563209534, + "epoch": 0.5864253393665159, + "grad_norm": 0.6433501839637756, + "learning_rate": 0.0005056404940833128, + "loss": 0.3441, + "mean_token_accuracy": 0.9261108189821243, + "num_tokens": 1434194.0, + "step": 162 + }, + { + "entropy": 2.4686295986175537, + "epoch": 0.5900452488687783, + "grad_norm": 0.9615177512168884, + "learning_rate": 0.0005055438768401348, + "loss": 0.1492, + "mean_token_accuracy": 0.966903567314148, + "num_tokens": 1442972.0, + "step": 163 + }, + { + "entropy": 2.5551892518997192, + "epoch": 0.5936651583710407, + "grad_norm": 0.4957484006881714, + "learning_rate": 0.0005054464069406023, + "loss": 0.1242, + "mean_token_accuracy": 0.969713419675827, + "num_tokens": 1451324.0, + "step": 164 + }, + { + "entropy": 2.554121434688568, + "epoch": 0.5972850678733032, + "grad_norm": 0.7399498224258423, + "learning_rate": 0.0005053480847549187, + "loss": 0.206, + "mean_token_accuracy": 0.9498797357082367, + "num_tokens": 1459698.0, + "step": 165 + }, + { + "entropy": 2.5181015729904175, + "epoch": 0.6009049773755656, + "grad_norm": 0.7433251142501831, + "learning_rate": 0.0005052489106565241, + "loss": 0.2883, + "mean_token_accuracy": 0.9419967085123062, + "num_tokens": 1468460.0, + "step": 166 + }, + { + "entropy": 2.3073930144309998, + "epoch": 0.604524886877828, + "grad_norm": 0.5920398831367493, + "learning_rate": 0.0005051488850220941, + "loss": 0.197, + "mean_token_accuracy": 0.952111005783081, + "num_tokens": 1477579.0, + "step": 167 + }, + { + "entropy": 2.532376289367676, + "epoch": 0.6081447963800904, + "grad_norm": 0.7033098936080933, + "learning_rate": 0.0005050480082315392, + "loss": 0.2122, + "mean_token_accuracy": 0.9488633275032043, + "num_tokens": 1486307.0, + "step": 168 + }, + { + "entropy": 2.397290349006653, + "epoch": 0.611764705882353, + "grad_norm": 0.8026869893074036, + "learning_rate": 0.0005049462806680021, + "loss": 0.2541, + "mean_token_accuracy": 0.9427233040332794, + "num_tokens": 1495152.0, + "step": 169 + }, + { + "entropy": 2.464823842048645, + "epoch": 0.6153846153846154, + "grad_norm": 0.6508225798606873, + "learning_rate": 0.0005048437027178571, + "loss": 0.2639, + "mean_token_accuracy": 0.9391255974769592, + "num_tokens": 1503903.0, + "step": 170 + }, + { + "entropy": 2.520734131336212, + "epoch": 0.6190045248868778, + "grad_norm": 0.8373616337776184, + "learning_rate": 0.0005047402747707084, + "loss": 0.3078, + "mean_token_accuracy": 0.9302930980920792, + "num_tokens": 1512588.0, + "step": 171 + }, + { + "entropy": 2.388108015060425, + "epoch": 0.6226244343891403, + "grad_norm": 0.6334089636802673, + "learning_rate": 0.0005046359972193884, + "loss": 0.1372, + "mean_token_accuracy": 0.9666119515895844, + "num_tokens": 1522011.0, + "step": 172 + }, + { + "entropy": 2.537126660346985, + "epoch": 0.6262443438914027, + "grad_norm": 0.7665116190910339, + "learning_rate": 0.0005045308704599566, + "loss": 0.2603, + "mean_token_accuracy": 0.9350012242794037, + "num_tokens": 1530767.0, + "step": 173 + }, + { + "entropy": 2.567205488681793, + "epoch": 0.6298642533936651, + "grad_norm": 0.8043875098228455, + "learning_rate": 0.0005044248948916977, + "loss": 0.2497, + "mean_token_accuracy": 0.9400482773780823, + "num_tokens": 1539971.0, + "step": 174 + }, + { + "entropy": 2.585887610912323, + "epoch": 0.6334841628959276, + "grad_norm": 0.5282150506973267, + "learning_rate": 0.0005043180709171206, + "loss": 0.1126, + "mean_token_accuracy": 0.9680279046297073, + "num_tokens": 1548971.0, + "step": 175 + }, + { + "entropy": 2.4289392232894897, + "epoch": 0.63710407239819, + "grad_norm": 0.6838382482528687, + "learning_rate": 0.0005042103989419563, + "loss": 0.2076, + "mean_token_accuracy": 0.9468046277761459, + "num_tokens": 1558403.0, + "step": 176 + }, + { + "entropy": 2.6080575585365295, + "epoch": 0.6407239819004525, + "grad_norm": 0.9058650732040405, + "learning_rate": 0.0005041018793751566, + "loss": 0.1781, + "mean_token_accuracy": 0.9432647377252579, + "num_tokens": 1567209.0, + "step": 177 + }, + { + "entropy": 2.5212480425834656, + "epoch": 0.644343891402715, + "grad_norm": 0.796381950378418, + "learning_rate": 0.0005039925126288929, + "loss": 0.2286, + "mean_token_accuracy": 0.9305787235498428, + "num_tokens": 1576255.0, + "step": 178 + }, + { + "entropy": 2.588195264339447, + "epoch": 0.6479638009049774, + "grad_norm": 0.6489388942718506, + "learning_rate": 0.0005038822991185536, + "loss": 0.1717, + "mean_token_accuracy": 0.9572225511074066, + "num_tokens": 1585335.0, + "step": 179 + }, + { + "entropy": 2.609215259552002, + "epoch": 0.6515837104072398, + "grad_norm": 0.8551130294799805, + "learning_rate": 0.0005037712392627441, + "loss": 0.2358, + "mean_token_accuracy": 0.9529621452093124, + "num_tokens": 1594354.0, + "step": 180 + }, + { + "entropy": 2.4199504256248474, + "epoch": 0.6552036199095023, + "grad_norm": 0.5775637030601501, + "learning_rate": 0.0005036593334832836, + "loss": 0.2402, + "mean_token_accuracy": 0.9437069743871689, + "num_tokens": 1603750.0, + "step": 181 + }, + { + "entropy": 2.516424596309662, + "epoch": 0.6588235294117647, + "grad_norm": 0.6967942118644714, + "learning_rate": 0.0005035465822052047, + "loss": 0.1624, + "mean_token_accuracy": 0.9518167823553085, + "num_tokens": 1612474.0, + "step": 182 + }, + { + "entropy": 2.463354170322418, + "epoch": 0.6624434389140271, + "grad_norm": 0.49672600626945496, + "learning_rate": 0.000503432985856751, + "loss": 0.1654, + "mean_token_accuracy": 0.9564716964960098, + "num_tokens": 1621563.0, + "step": 183 + }, + { + "entropy": 2.4456416964530945, + "epoch": 0.6660633484162896, + "grad_norm": 0.6207183003425598, + "learning_rate": 0.000503318544869376, + "loss": 0.1918, + "mean_token_accuracy": 0.9476529806852341, + "num_tokens": 1630801.0, + "step": 184 + }, + { + "entropy": 2.641440451145172, + "epoch": 0.669683257918552, + "grad_norm": 1.220821499824524, + "learning_rate": 0.000503203259677741, + "loss": 0.4019, + "mean_token_accuracy": 0.9172120243310928, + "num_tokens": 1639522.0, + "step": 185 + }, + { + "entropy": 2.6447275280952454, + "epoch": 0.6733031674208145, + "grad_norm": 0.7546490430831909, + "learning_rate": 0.000503087130719714, + "loss": 0.2484, + "mean_token_accuracy": 0.9387800246477127, + "num_tokens": 1647964.0, + "step": 186 + }, + { + "entropy": 2.4657886028289795, + "epoch": 0.676923076923077, + "grad_norm": 0.7679230570793152, + "learning_rate": 0.0005029701584363675, + "loss": 0.2659, + "mean_token_accuracy": 0.930300235748291, + "num_tokens": 1657181.0, + "step": 187 + }, + { + "entropy": 2.37973552942276, + "epoch": 0.6805429864253394, + "grad_norm": 0.7473414540290833, + "learning_rate": 0.0005028523432719772, + "loss": 0.32, + "mean_token_accuracy": 0.9233052879571915, + "num_tokens": 1666477.0, + "step": 188 + }, + { + "entropy": 2.5238219499588013, + "epoch": 0.6841628959276018, + "grad_norm": 0.5573673248291016, + "learning_rate": 0.0005027336856740201, + "loss": 0.1846, + "mean_token_accuracy": 0.9445535093545914, + "num_tokens": 1675002.0, + "step": 189 + }, + { + "entropy": 2.456815242767334, + "epoch": 0.6877828054298643, + "grad_norm": 0.47237634658813477, + "learning_rate": 0.0005026141860931728, + "loss": 0.1065, + "mean_token_accuracy": 0.964375838637352, + "num_tokens": 1683623.0, + "step": 190 + }, + { + "entropy": 2.548456132411957, + "epoch": 0.6914027149321267, + "grad_norm": 0.7699162364006042, + "learning_rate": 0.00050249384498331, + "loss": 0.1985, + "mean_token_accuracy": 0.9438774734735489, + "num_tokens": 1691718.0, + "step": 191 + }, + { + "entropy": 2.4514941573143005, + "epoch": 0.6950226244343891, + "grad_norm": 1.4113538265228271, + "learning_rate": 0.0005023726628015027, + "loss": 0.4541, + "mean_token_accuracy": 0.9207872897386551, + "num_tokens": 1699824.0, + "step": 192 + }, + { + "entropy": 2.2560824751853943, + "epoch": 0.6986425339366515, + "grad_norm": 0.6007948517799377, + "learning_rate": 0.0005022506400080161, + "loss": 0.1871, + "mean_token_accuracy": 0.9502484053373337, + "num_tokens": 1708722.0, + "step": 193 + }, + { + "entropy": 2.1833614110946655, + "epoch": 0.702262443438914, + "grad_norm": 0.7005489468574524, + "learning_rate": 0.0005021277770663082, + "loss": 0.2222, + "mean_token_accuracy": 0.9386974722146988, + "num_tokens": 1717592.0, + "step": 194 + }, + { + "entropy": 2.2031923830509186, + "epoch": 0.7058823529411765, + "grad_norm": 0.5830584764480591, + "learning_rate": 0.0005020040744430284, + "loss": 0.1106, + "mean_token_accuracy": 0.9719562232494354, + "num_tokens": 1726149.0, + "step": 195 + }, + { + "entropy": 2.199785351753235, + "epoch": 0.709502262443439, + "grad_norm": 0.7465847134590149, + "learning_rate": 0.0005018795326080149, + "loss": 0.1935, + "mean_token_accuracy": 0.9497270882129669, + "num_tokens": 1734541.0, + "step": 196 + }, + { + "entropy": 2.1103186309337616, + "epoch": 0.7131221719457014, + "grad_norm": 1.0782264471054077, + "learning_rate": 0.0005017541520342934, + "loss": 0.2895, + "mean_token_accuracy": 0.9274258464574814, + "num_tokens": 1743722.0, + "step": 197 + }, + { + "entropy": 2.2248528599739075, + "epoch": 0.7167420814479638, + "grad_norm": 0.6409780979156494, + "learning_rate": 0.0005016279331980754, + "loss": 0.1425, + "mean_token_accuracy": 0.96550352871418, + "num_tokens": 1752156.0, + "step": 198 + }, + { + "entropy": 2.19924658536911, + "epoch": 0.7203619909502262, + "grad_norm": 0.7019934058189392, + "learning_rate": 0.0005015008765787561, + "loss": 0.1969, + "mean_token_accuracy": 0.9429282248020172, + "num_tokens": 1760978.0, + "step": 199 + }, + { + "entropy": 2.297484815120697, + "epoch": 0.7239819004524887, + "grad_norm": 0.7826490998268127, + "learning_rate": 0.0005013729826589127, + "loss": 0.2399, + "mean_token_accuracy": 0.9416657984256744, + "num_tokens": 1769533.0, + "step": 200 + }, + { + "entropy": 2.2471498548984528, + "epoch": 0.7276018099547511, + "grad_norm": 0.621566891670227, + "learning_rate": 0.0005012442519243027, + "loss": 0.1876, + "mean_token_accuracy": 0.9460793286561966, + "num_tokens": 1778286.0, + "step": 201 + }, + { + "entropy": 2.2212815284729004, + "epoch": 0.7312217194570135, + "grad_norm": 0.622283935546875, + "learning_rate": 0.0005011146848638616, + "loss": 0.1617, + "mean_token_accuracy": 0.9482609927654266, + "num_tokens": 1787392.0, + "step": 202 + }, + { + "entropy": 2.308752655982971, + "epoch": 0.7348416289592761, + "grad_norm": 0.7263973355293274, + "learning_rate": 0.0005009842819697018, + "loss": 0.2043, + "mean_token_accuracy": 0.9378403723239899, + "num_tokens": 1796133.0, + "step": 203 + }, + { + "entropy": 2.3376497626304626, + "epoch": 0.7384615384615385, + "grad_norm": 0.5493630766868591, + "learning_rate": 0.0005008530437371101, + "loss": 0.1145, + "mean_token_accuracy": 0.970586434006691, + "num_tokens": 1804769.0, + "step": 204 + }, + { + "entropy": 2.373005509376526, + "epoch": 0.7420814479638009, + "grad_norm": 0.6313483119010925, + "learning_rate": 0.0005007209706645461, + "loss": 0.2183, + "mean_token_accuracy": 0.9472708404064178, + "num_tokens": 1813364.0, + "step": 205 + }, + { + "entropy": 2.468949854373932, + "epoch": 0.7457013574660634, + "grad_norm": 1.0125588178634644, + "learning_rate": 0.00050058806325364, + "loss": 0.2225, + "mean_token_accuracy": 0.9351322948932648, + "num_tokens": 1822149.0, + "step": 206 + }, + { + "entropy": 2.2420623898506165, + "epoch": 0.7493212669683258, + "grad_norm": 0.913761556148529, + "learning_rate": 0.0005004543220091911, + "loss": 0.2386, + "mean_token_accuracy": 0.9453927427530289, + "num_tokens": 1831533.0, + "step": 207 + }, + { + "entropy": 2.2966006994247437, + "epoch": 0.7529411764705882, + "grad_norm": 0.7386876940727234, + "learning_rate": 0.0005003197474391658, + "loss": 0.1768, + "mean_token_accuracy": 0.949826255440712, + "num_tokens": 1840157.0, + "step": 208 + }, + { + "entropy": 2.306001305580139, + "epoch": 0.7565610859728507, + "grad_norm": 0.8900741338729858, + "learning_rate": 0.0005001843400546955, + "loss": 0.2899, + "mean_token_accuracy": 0.9241485595703125, + "num_tokens": 1848898.0, + "step": 209 + }, + { + "entropy": 2.117514967918396, + "epoch": 0.7601809954751131, + "grad_norm": 0.644622802734375, + "learning_rate": 0.0005000481003700746, + "loss": 0.2714, + "mean_token_accuracy": 0.9299416691064835, + "num_tokens": 1858330.0, + "step": 210 + }, + { + "entropy": 2.3768392205238342, + "epoch": 0.7638009049773755, + "grad_norm": 0.9724471569061279, + "learning_rate": 0.0004999110289027587, + "loss": 0.1633, + "mean_token_accuracy": 0.9550061523914337, + "num_tokens": 1866806.0, + "step": 211 + }, + { + "entropy": 2.090679556131363, + "epoch": 0.7674208144796381, + "grad_norm": 0.5419518351554871, + "learning_rate": 0.0004997731261733628, + "loss": 0.1369, + "mean_token_accuracy": 0.9619670957326889, + "num_tokens": 1875937.0, + "step": 212 + }, + { + "entropy": 2.099909245967865, + "epoch": 0.7710407239819005, + "grad_norm": 0.6858121752738953, + "learning_rate": 0.0004996343927056592, + "loss": 0.1633, + "mean_token_accuracy": 0.9528832882642746, + "num_tokens": 1885145.0, + "step": 213 + }, + { + "entropy": 2.130059242248535, + "epoch": 0.7746606334841629, + "grad_norm": 0.7691065073013306, + "learning_rate": 0.000499494829026575, + "loss": 0.348, + "mean_token_accuracy": 0.9162366837263107, + "num_tokens": 1894255.0, + "step": 214 + }, + { + "entropy": 2.191373586654663, + "epoch": 0.7782805429864253, + "grad_norm": 0.7427324652671814, + "learning_rate": 0.000499354435666191, + "loss": 0.3373, + "mean_token_accuracy": 0.9311849176883698, + "num_tokens": 1902981.0, + "step": 215 + }, + { + "entropy": 2.1425398886203766, + "epoch": 0.7819004524886878, + "grad_norm": 0.6410383582115173, + "learning_rate": 0.0004992132131577392, + "loss": 0.2079, + "mean_token_accuracy": 0.949742391705513, + "num_tokens": 1912253.0, + "step": 216 + }, + { + "entropy": 2.1396586298942566, + "epoch": 0.7855203619909502, + "grad_norm": 0.5689850449562073, + "learning_rate": 0.0004990711620376003, + "loss": 0.1999, + "mean_token_accuracy": 0.946034774184227, + "num_tokens": 1921409.0, + "step": 217 + }, + { + "entropy": 2.2237865328788757, + "epoch": 0.7891402714932126, + "grad_norm": 0.6408923864364624, + "learning_rate": 0.0004989282828453029, + "loss": 0.2452, + "mean_token_accuracy": 0.9510752111673355, + "num_tokens": 1930397.0, + "step": 218 + }, + { + "entropy": 2.234771251678467, + "epoch": 0.7927601809954751, + "grad_norm": 0.751447856426239, + "learning_rate": 0.0004987845761235203, + "loss": 0.3057, + "mean_token_accuracy": 0.9217256307601929, + "num_tokens": 1939172.0, + "step": 219 + }, + { + "entropy": 2.2653815746307373, + "epoch": 0.7963800904977375, + "grad_norm": 0.751455545425415, + "learning_rate": 0.0004986400424180688, + "loss": 0.3245, + "mean_token_accuracy": 0.9256318956613541, + "num_tokens": 1947979.0, + "step": 220 + }, + { + "entropy": 2.3123483061790466, + "epoch": 0.8, + "grad_norm": 0.5939492583274841, + "learning_rate": 0.0004984946822779061, + "loss": 0.2429, + "mean_token_accuracy": 0.9333402067422867, + "num_tokens": 1956814.0, + "step": 221 + }, + { + "entropy": 2.3289234042167664, + "epoch": 0.8036199095022625, + "grad_norm": 0.5591994524002075, + "learning_rate": 0.0004983484962551284, + "loss": 0.1507, + "mean_token_accuracy": 0.96376833319664, + "num_tokens": 1965641.0, + "step": 222 + }, + { + "entropy": 2.4314023852348328, + "epoch": 0.8072398190045249, + "grad_norm": 0.5805783271789551, + "learning_rate": 0.0004982014849049687, + "loss": 0.2049, + "mean_token_accuracy": 0.9586948156356812, + "num_tokens": 1974180.0, + "step": 223 + }, + { + "entropy": 2.3639765977859497, + "epoch": 0.8108597285067873, + "grad_norm": 0.6924490332603455, + "learning_rate": 0.0004980536487857951, + "loss": 0.2137, + "mean_token_accuracy": 0.9441423565149307, + "num_tokens": 1982744.0, + "step": 224 + }, + { + "entropy": 2.3361759781837463, + "epoch": 0.8144796380090498, + "grad_norm": 0.4579620361328125, + "learning_rate": 0.0004979049884591077, + "loss": 0.1041, + "mean_token_accuracy": 0.9753208309412003, + "num_tokens": 1991583.0, + "step": 225 + }, + { + "entropy": 2.286989688873291, + "epoch": 0.8180995475113122, + "grad_norm": 0.6489312052726746, + "learning_rate": 0.0004977555044895377, + "loss": 0.2131, + "mean_token_accuracy": 0.9520440250635147, + "num_tokens": 2000193.0, + "step": 226 + }, + { + "entropy": 2.288672834634781, + "epoch": 0.8217194570135746, + "grad_norm": 0.7738961577415466, + "learning_rate": 0.0004976051974448441, + "loss": 0.325, + "mean_token_accuracy": 0.9060750156641006, + "num_tokens": 2009233.0, + "step": 227 + }, + { + "entropy": 2.288076102733612, + "epoch": 0.8253393665158371, + "grad_norm": 0.7042292356491089, + "learning_rate": 0.0004974540678959123, + "loss": 0.2206, + "mean_token_accuracy": 0.94980289041996, + "num_tokens": 2018417.0, + "step": 228 + }, + { + "entropy": 2.217707335948944, + "epoch": 0.8289592760180996, + "grad_norm": 0.6834208369255066, + "learning_rate": 0.0004973021164167515, + "loss": 0.2907, + "mean_token_accuracy": 0.951058641076088, + "num_tokens": 2027822.0, + "step": 229 + }, + { + "entropy": 2.1610691249370575, + "epoch": 0.832579185520362, + "grad_norm": 0.665044903755188, + "learning_rate": 0.0004971493435844928, + "loss": 0.2387, + "mean_token_accuracy": 0.9506549835205078, + "num_tokens": 2036983.0, + "step": 230 + }, + { + "entropy": 2.321135401725769, + "epoch": 0.8361990950226245, + "grad_norm": 0.8208273649215698, + "learning_rate": 0.0004969957499793869, + "loss": 0.2399, + "mean_token_accuracy": 0.9435176253318787, + "num_tokens": 2045574.0, + "step": 231 + }, + { + "entropy": 2.1943611800670624, + "epoch": 0.8398190045248869, + "grad_norm": 0.6293840408325195, + "learning_rate": 0.0004968413361848019, + "loss": 0.1784, + "mean_token_accuracy": 0.9559669345617294, + "num_tokens": 2054336.0, + "step": 232 + }, + { + "entropy": 2.2722273468971252, + "epoch": 0.8434389140271493, + "grad_norm": 0.6535817980766296, + "learning_rate": 0.0004966861027872211, + "loss": 0.1675, + "mean_token_accuracy": 0.9532535970211029, + "num_tokens": 2063225.0, + "step": 233 + }, + { + "entropy": 2.3278334736824036, + "epoch": 0.8470588235294118, + "grad_norm": 1.1610206365585327, + "learning_rate": 0.0004965300503762406, + "loss": 0.1588, + "mean_token_accuracy": 0.9641145765781403, + "num_tokens": 2071738.0, + "step": 234 + }, + { + "entropy": 2.202972888946533, + "epoch": 0.8506787330316742, + "grad_norm": 0.4811885356903076, + "learning_rate": 0.0004963731795445675, + "loss": 0.0813, + "mean_token_accuracy": 0.9766911715269089, + "num_tokens": 2080375.0, + "step": 235 + }, + { + "entropy": 2.2433705925941467, + "epoch": 0.8542986425339366, + "grad_norm": 0.8113318681716919, + "learning_rate": 0.0004962154908880171, + "loss": 0.2965, + "mean_token_accuracy": 0.9290606826543808, + "num_tokens": 2089522.0, + "step": 236 + }, + { + "entropy": 2.2168884873390198, + "epoch": 0.857918552036199, + "grad_norm": 0.6128959655761719, + "learning_rate": 0.0004960569850055111, + "loss": 0.1724, + "mean_token_accuracy": 0.9603384286165237, + "num_tokens": 2098162.0, + "step": 237 + }, + { + "entropy": 2.2738255858421326, + "epoch": 0.8615384615384616, + "grad_norm": 0.8557195663452148, + "learning_rate": 0.0004958976624990749, + "loss": 0.2596, + "mean_token_accuracy": 0.9487071484327316, + "num_tokens": 2106984.0, + "step": 238 + }, + { + "entropy": 2.2031425833702087, + "epoch": 0.865158371040724, + "grad_norm": 0.6621816158294678, + "learning_rate": 0.0004957375239738359, + "loss": 0.232, + "mean_token_accuracy": 0.9525040090084076, + "num_tokens": 2116040.0, + "step": 239 + }, + { + "entropy": 2.374737858772278, + "epoch": 0.8687782805429864, + "grad_norm": 0.8481062054634094, + "learning_rate": 0.0004955765700380204, + "loss": 0.2516, + "mean_token_accuracy": 0.9396061599254608, + "num_tokens": 2124862.0, + "step": 240 + }, + { + "entropy": 2.266704559326172, + "epoch": 0.8723981900452489, + "grad_norm": 0.6284282803535461, + "learning_rate": 0.0004954148013029521, + "loss": 0.3244, + "mean_token_accuracy": 0.9381244331598282, + "num_tokens": 2134018.0, + "step": 241 + }, + { + "entropy": 2.3935859203338623, + "epoch": 0.8760180995475113, + "grad_norm": 1.1564176082611084, + "learning_rate": 0.0004952522183830493, + "loss": 0.2706, + "mean_token_accuracy": 0.9297053664922714, + "num_tokens": 2142745.0, + "step": 242 + }, + { + "entropy": 2.281618118286133, + "epoch": 0.8796380090497737, + "grad_norm": 0.5324040055274963, + "learning_rate": 0.0004950888218958225, + "loss": 0.1573, + "mean_token_accuracy": 0.9568462073802948, + "num_tokens": 2151607.0, + "step": 243 + }, + { + "entropy": 2.230749189853668, + "epoch": 0.8832579185520362, + "grad_norm": 0.680780291557312, + "learning_rate": 0.0004949246124618726, + "loss": 0.1956, + "mean_token_accuracy": 0.9479999989271164, + "num_tokens": 2160904.0, + "step": 244 + }, + { + "entropy": 2.21382600069046, + "epoch": 0.8868778280542986, + "grad_norm": 0.6321626305580139, + "learning_rate": 0.0004947595907048877, + "loss": 0.2444, + "mean_token_accuracy": 0.9376699328422546, + "num_tokens": 2170021.0, + "step": 245 + }, + { + "entropy": 2.3659472465515137, + "epoch": 0.890497737556561, + "grad_norm": 0.9778954982757568, + "learning_rate": 0.0004945937572516417, + "loss": 0.3783, + "mean_token_accuracy": 0.9104805737733841, + "num_tokens": 2178995.0, + "step": 246 + }, + { + "entropy": 2.3233078718185425, + "epoch": 0.8941176470588236, + "grad_norm": 0.53229820728302, + "learning_rate": 0.0004944271127319909, + "loss": 0.0759, + "mean_token_accuracy": 0.9791453778743744, + "num_tokens": 2187823.0, + "step": 247 + }, + { + "entropy": 2.2469444274902344, + "epoch": 0.897737556561086, + "grad_norm": 0.6367197632789612, + "learning_rate": 0.0004942596577788728, + "loss": 0.2677, + "mean_token_accuracy": 0.9392691254615784, + "num_tokens": 2196923.0, + "step": 248 + }, + { + "entropy": 2.4508965611457825, + "epoch": 0.9013574660633484, + "grad_norm": 0.6042234897613525, + "learning_rate": 0.0004940913930283024, + "loss": 0.1102, + "mean_token_accuracy": 0.9762090593576431, + "num_tokens": 2205400.0, + "step": 249 + }, + { + "entropy": 2.365670144557953, + "epoch": 0.9049773755656109, + "grad_norm": 0.6490639448165894, + "learning_rate": 0.0004939223191193707, + "loss": 0.1532, + "mean_token_accuracy": 0.9489114433526993, + "num_tokens": 2214201.0, + "step": 250 + }, + { + "entropy": 2.4013625383377075, + "epoch": 0.9085972850678733, + "grad_norm": 0.5969854593276978, + "learning_rate": 0.0004937524366942419, + "loss": 0.1273, + "mean_token_accuracy": 0.9682519882917404, + "num_tokens": 2222979.0, + "step": 251 + }, + { + "entropy": 2.4402357935905457, + "epoch": 0.9122171945701357, + "grad_norm": 0.7559595704078674, + "learning_rate": 0.0004935817463981513, + "loss": 0.1979, + "mean_token_accuracy": 0.9483373910188675, + "num_tokens": 2231169.0, + "step": 252 + }, + { + "entropy": 2.4673256874084473, + "epoch": 0.9158371040723982, + "grad_norm": 0.8663308620452881, + "learning_rate": 0.0004934102488794023, + "loss": 0.2453, + "mean_token_accuracy": 0.9408974200487137, + "num_tokens": 2240099.0, + "step": 253 + }, + { + "entropy": 2.426262080669403, + "epoch": 0.9194570135746606, + "grad_norm": 0.7920467257499695, + "learning_rate": 0.0004932379447893643, + "loss": 0.2828, + "mean_token_accuracy": 0.9319239109754562, + "num_tokens": 2249088.0, + "step": 254 + }, + { + "entropy": 2.5018852949142456, + "epoch": 0.9230769230769231, + "grad_norm": 0.7216617465019226, + "learning_rate": 0.0004930648347824701, + "loss": 0.1647, + "mean_token_accuracy": 0.9551804810762405, + "num_tokens": 2257710.0, + "step": 255 + }, + { + "entropy": 2.43031644821167, + "epoch": 0.9266968325791856, + "grad_norm": 0.646794319152832, + "learning_rate": 0.0004928909195162138, + "loss": 0.1328, + "mean_token_accuracy": 0.9663553237915039, + "num_tokens": 2266883.0, + "step": 256 + }, + { + "entropy": 2.5406370759010315, + "epoch": 0.930316742081448, + "grad_norm": 0.5482825040817261, + "learning_rate": 0.0004927161996511474, + "loss": 0.1872, + "mean_token_accuracy": 0.9557004272937775, + "num_tokens": 2275728.0, + "step": 257 + }, + { + "entropy": 2.636320471763611, + "epoch": 0.9339366515837104, + "grad_norm": 0.7454632520675659, + "learning_rate": 0.0004925406758508797, + "loss": 0.1461, + "mean_token_accuracy": 0.9578974395990372, + "num_tokens": 2284319.0, + "step": 258 + }, + { + "entropy": 2.6067575812339783, + "epoch": 0.9375565610859729, + "grad_norm": 0.8695769309997559, + "learning_rate": 0.000492364348782072, + "loss": 0.1712, + "mean_token_accuracy": 0.9652896523475647, + "num_tokens": 2293035.0, + "step": 259 + }, + { + "entropy": 2.5837162137031555, + "epoch": 0.9411764705882353, + "grad_norm": 0.5752995014190674, + "learning_rate": 0.0004921872191144371, + "loss": 0.1398, + "mean_token_accuracy": 0.9553333520889282, + "num_tokens": 2301802.0, + "step": 260 + }, + { + "entropy": 2.713033616542816, + "epoch": 0.9447963800904977, + "grad_norm": 0.85626620054245, + "learning_rate": 0.0004920092875207363, + "loss": 0.2207, + "mean_token_accuracy": 0.9468346834182739, + "num_tokens": 2309981.0, + "step": 261 + }, + { + "entropy": 2.400112509727478, + "epoch": 0.9484162895927601, + "grad_norm": 0.6766608953475952, + "learning_rate": 0.0004918305546767764, + "loss": 0.1644, + "mean_token_accuracy": 0.9502440094947815, + "num_tokens": 2319212.0, + "step": 262 + }, + { + "entropy": 2.503827154636383, + "epoch": 0.9520361990950226, + "grad_norm": 0.789470911026001, + "learning_rate": 0.0004916510212614072, + "loss": 0.2117, + "mean_token_accuracy": 0.9454390555620193, + "num_tokens": 2328234.0, + "step": 263 + }, + { + "entropy": 2.669040560722351, + "epoch": 0.9556561085972851, + "grad_norm": 0.9579212069511414, + "learning_rate": 0.0004914706879565197, + "loss": 0.2193, + "mean_token_accuracy": 0.9321542829275131, + "num_tokens": 2336543.0, + "step": 264 + }, + { + "entropy": 2.507073998451233, + "epoch": 0.9592760180995475, + "grad_norm": 0.5315744876861572, + "learning_rate": 0.000491289555447043, + "loss": 0.0851, + "mean_token_accuracy": 0.9771326780319214, + "num_tokens": 2345292.0, + "step": 265 + }, + { + "entropy": 2.4205283522605896, + "epoch": 0.96289592760181, + "grad_norm": 0.5441373586654663, + "learning_rate": 0.000491107624420941, + "loss": 0.1323, + "mean_token_accuracy": 0.9541790336370468, + "num_tokens": 2354242.0, + "step": 266 + }, + { + "entropy": 2.3817258477211, + "epoch": 0.9665158371040724, + "grad_norm": 0.5946238040924072, + "learning_rate": 0.0004909248955692111, + "loss": 0.1708, + "mean_token_accuracy": 0.947738841176033, + "num_tokens": 2363183.0, + "step": 267 + }, + { + "entropy": 2.5073485374450684, + "epoch": 0.9701357466063348, + "grad_norm": 0.6979324817657471, + "learning_rate": 0.0004907413695858812, + "loss": 0.2099, + "mean_token_accuracy": 0.9423733651638031, + "num_tokens": 2371885.0, + "step": 268 + }, + { + "entropy": 2.5705007910728455, + "epoch": 0.9737556561085973, + "grad_norm": 0.8203943967819214, + "learning_rate": 0.0004905570471680057, + "loss": 0.217, + "mean_token_accuracy": 0.9511639326810837, + "num_tokens": 2380316.0, + "step": 269 + }, + { + "entropy": 2.2677993774414062, + "epoch": 0.9773755656108597, + "grad_norm": 0.5840432047843933, + "learning_rate": 0.0004903719290156649, + "loss": 0.2364, + "mean_token_accuracy": 0.9407180696725845, + "num_tokens": 2389723.0, + "step": 270 + }, + { + "entropy": 2.477886915206909, + "epoch": 0.9809954751131221, + "grad_norm": 0.818929135799408, + "learning_rate": 0.0004901860158319612, + "loss": 0.1707, + "mean_token_accuracy": 0.9579566866159439, + "num_tokens": 2398388.0, + "step": 271 + }, + { + "entropy": 2.549662232398987, + "epoch": 0.9846153846153847, + "grad_norm": 0.7804781198501587, + "learning_rate": 0.0004899993083230166, + "loss": 0.2944, + "mean_token_accuracy": 0.9381812512874603, + "num_tokens": 2406929.0, + "step": 272 + }, + { + "entropy": 2.4465304017066956, + "epoch": 0.9882352941176471, + "grad_norm": 0.5218799114227295, + "learning_rate": 0.0004898118071979699, + "loss": 0.1661, + "mean_token_accuracy": 0.9500218778848648, + "num_tokens": 2415631.0, + "step": 273 + }, + { + "entropy": 2.5852283239364624, + "epoch": 0.9918552036199095, + "grad_norm": 0.591163158416748, + "learning_rate": 0.0004896235131689743, + "loss": 0.2005, + "mean_token_accuracy": 0.9455285370349884, + "num_tokens": 2424091.0, + "step": 274 + }, + { + "entropy": 2.478701651096344, + "epoch": 0.995475113122172, + "grad_norm": 1.0615383386611938, + "learning_rate": 0.0004894344269511945, + "loss": 0.2864, + "mean_token_accuracy": 0.9306265562772751, + "num_tokens": 2432705.0, + "step": 275 + }, + { + "entropy": 2.600062847137451, + "epoch": 0.9990950226244344, + "grad_norm": 0.7011683583259583, + "learning_rate": 0.0004892445492628043, + "loss": 0.1664, + "mean_token_accuracy": 0.9547821134328842, + "num_tokens": 2440992.0, + "step": 276 + }, + { + "entropy": 2.3411240577697754, + "epoch": 1.0, + "grad_norm": 0.4944029450416565, + "learning_rate": 0.000489053880824983, + "loss": 0.022, + "mean_token_accuracy": 0.9929078221321106, + "num_tokens": 2441725.0, + "step": 277 + }, + { + "epoch": 1.0, + "eval_entropy": 2.5467925265552553, + "eval_loss": 0.21274714171886444, + "eval_mean_token_accuracy": 0.9444630068492114, + "eval_num_tokens": 2441725.0, + "eval_runtime": 116.0434, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 1.06, + "step": 277 + } + ], + "logging_steps": 1, + "max_steps": 1662, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.317575055331963e+17, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-277/training_args.bin b/checkpoint-277/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..070a2de135e794840c49e066215a1c9f2e550d1f --- /dev/null +++ b/checkpoint-277/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc271f94ce32216bd6f2ee9866fb7d62a0583bc7ee0c7fa953fa57c302729c6c +size 6289 diff --git a/checkpoint-554/README.md b/checkpoint-554/README.md new file mode 100644 index 0000000000000000000000000000000000000000..58a4061707bcc32db3b543936f6b650c01f3dccb --- /dev/null +++ b/checkpoint-554/README.md @@ -0,0 +1,208 @@ +--- +base_model: openai/gpt-oss-20b +library_name: peft +tags: +- base_model:adapter:openai/gpt-oss-20b +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-554/adapter_config.json b/checkpoint-554/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..076480eaf349cc658de2eb00b26c7360a85f8f56 --- /dev/null +++ b/checkpoint-554/adapter_config.json @@ -0,0 +1,53 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "openai/gpt-oss-20b", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "o_proj", + "q_proj" + ], + "target_parameters": [ + "7.mlp.experts.gate_up_proj", + "7.mlp.experts.down_proj", + "15.mlp.experts.gate_up_proj", + "15.mlp.experts.down_proj", + "23.mlp.experts.gate_up_proj", + "23.mlp.experts.down_proj" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-554/adapter_model.safetensors b/checkpoint-554/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..24088c98fde266c60b3df79a008a0cce4c1009fb --- /dev/null +++ b/checkpoint-554/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08be5413fd76f7c5ddd4d3fb2f098ea528697a75d5c0f4d1fd8114d552ca9968 +size 60189176 diff --git a/checkpoint-554/chat_template.jinja b/checkpoint-554/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc7bb11927d29f653ba2740f2db2c688fd77592f --- /dev/null +++ b/checkpoint-554/chat_template.jinja @@ -0,0 +1,331 @@ +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". + #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties %} + {{- "(_: {\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {%- if param_spec.description %} + {{- "// " + param_spec.description + "\n" }} + {%- endif %} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- ",\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %} + {%- endif %} + {{- model_identity + "\n" }} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {{- "\n\n" }} + {%- endif %} + {%- if tools -%} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {#- Checks to ensure the messages are being passed in the format we expect #} + {%- if "content" in message %} + {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "thinking" in message %} + {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "tool_calls" in message %} + {#- We need very careful handling here - we want to drop the tool call analysis message if the model #} + {#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #} + {#- when we render CoT/analysis messages in inference. #} + {%- set future_final_message = namespace(found=false) %} + {%- for future_message in loop_messages[loop.index:] %} + {%- if future_message.role == 'assistant' and "tool_calls" not in future_message %} + {%- set future_final_message.found = true %} + {%- endif %} + {%- endfor %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content and message.thinking %} + {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }} + {%- elif message.content and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- elif message.thinking and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." + tool_call.name + "<|channel|>commentary " }} + {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {%- if "thinking" in message %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- else %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." + last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- elif message.role == 'user' -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-554/optimizer.pt b/checkpoint-554/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..199e0f9f4c53d7b6808b99bf5ae7381576aec083 --- /dev/null +++ b/checkpoint-554/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d3149c227ee5659bb5e22737e3468b04ad7b68ab2efb91807f392851f5aeb8a +size 120498699 diff --git a/checkpoint-554/rng_state.pth b/checkpoint-554/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..435e005883bf4440218c894822b086abf80abfc0 --- /dev/null +++ b/checkpoint-554/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399 +size 14645 diff --git a/checkpoint-554/scheduler.pt b/checkpoint-554/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d470f67a1b0d60098eba8a54b6929cf3171f4b85 --- /dev/null +++ b/checkpoint-554/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d322dfc4f66861b0919370db5617fb4fea8855e34f6a67b848c7dcda9eaf750b +size 1465 diff --git a/checkpoint-554/special_tokens_map.json b/checkpoint-554/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c47e282982a9c6856832947a72ded329fad2e8c --- /dev/null +++ b/checkpoint-554/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|return|>" +} diff --git a/checkpoint-554/tokenizer.json b/checkpoint-554/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/checkpoint-554/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/checkpoint-554/tokenizer_config.json b/checkpoint-554/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e86f6faa71de0fc3afe47ea8984da9e6138c031c --- /dev/null +++ b/checkpoint-554/tokenizer_config.json @@ -0,0 +1,183 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|return|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-554/trainer_state.json b/checkpoint-554/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1fae1f9b080378c2ed18e9cef1ab638444813e9a --- /dev/null +++ b/checkpoint-554/trainer_state.json @@ -0,0 +1,5596 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 554, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.358862280845642, + "epoch": 0.0036199095022624436, + "grad_norm": 2.292628288269043, + "learning_rate": 0.0, + "loss": 0.7311, + "mean_token_accuracy": 0.8534883409738541, + "num_tokens": 9316.0, + "step": 1 + }, + { + "entropy": 2.674945294857025, + "epoch": 0.007239819004524887, + "grad_norm": 3.8950836658477783, + "learning_rate": 1.0219999999999999e-05, + "loss": 1.0621, + "mean_token_accuracy": 0.8183160275220871, + "num_tokens": 17707.0, + "step": 2 + }, + { + "entropy": 2.4915525913238525, + "epoch": 0.01085972850678733, + "grad_norm": 2.792142868041992, + "learning_rate": 2.0439999999999997e-05, + "loss": 0.8448, + "mean_token_accuracy": 0.8489587754011154, + "num_tokens": 26783.0, + "step": 3 + }, + { + "entropy": 2.525622010231018, + "epoch": 0.014479638009049774, + "grad_norm": 2.7071900367736816, + "learning_rate": 3.0659999999999994e-05, + "loss": 0.8847, + "mean_token_accuracy": 0.8486668318510056, + "num_tokens": 35947.0, + "step": 4 + }, + { + "entropy": 2.588509976863861, + "epoch": 0.01809954751131222, + "grad_norm": 2.981574773788452, + "learning_rate": 4.0879999999999995e-05, + "loss": 1.0783, + "mean_token_accuracy": 0.8135111033916473, + "num_tokens": 44505.0, + "step": 5 + }, + { + "entropy": 2.662865400314331, + "epoch": 0.02171945701357466, + "grad_norm": 2.629283905029297, + "learning_rate": 5.1099999999999995e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.8152717798948288, + "num_tokens": 53140.0, + "step": 6 + }, + { + "entropy": 2.6662243604660034, + "epoch": 0.025339366515837104, + "grad_norm": 2.730058431625366, + "learning_rate": 6.131999999999999e-05, + "loss": 0.6982, + "mean_token_accuracy": 0.8552135527133942, + "num_tokens": 61932.0, + "step": 7 + }, + { + "entropy": 2.661384105682373, + "epoch": 0.02895927601809955, + "grad_norm": 2.562839984893799, + "learning_rate": 7.154e-05, + "loss": 0.7296, + "mean_token_accuracy": 0.8579540699720383, + "num_tokens": 70973.0, + "step": 8 + }, + { + "entropy": 2.7889368534088135, + "epoch": 0.03257918552036199, + "grad_norm": 2.8640544414520264, + "learning_rate": 8.175999999999999e-05, + "loss": 0.5965, + "mean_token_accuracy": 0.8638457208871841, + "num_tokens": 79977.0, + "step": 9 + }, + { + "entropy": 2.811532199382782, + "epoch": 0.03619909502262444, + "grad_norm": 2.6199426651000977, + "learning_rate": 9.197999999999998e-05, + "loss": 0.4819, + "mean_token_accuracy": 0.8786454051733017, + "num_tokens": 88915.0, + "step": 10 + }, + { + "entropy": 2.941167712211609, + "epoch": 0.039819004524886875, + "grad_norm": 1.2497272491455078, + "learning_rate": 0.00010219999999999999, + "loss": 0.7192, + "mean_token_accuracy": 0.841494083404541, + "num_tokens": 97749.0, + "step": 11 + }, + { + "entropy": 3.0547962188720703, + "epoch": 0.04343891402714932, + "grad_norm": 1.436136245727539, + "learning_rate": 0.00011241999999999998, + "loss": 0.5908, + "mean_token_accuracy": 0.8657624870538712, + "num_tokens": 106048.0, + "step": 12 + }, + { + "entropy": 2.9914053082466125, + "epoch": 0.047058823529411764, + "grad_norm": 0.9903654456138611, + "learning_rate": 0.00012263999999999998, + "loss": 0.4008, + "mean_token_accuracy": 0.8985499292612076, + "num_tokens": 115216.0, + "step": 13 + }, + { + "entropy": 3.1867465376853943, + "epoch": 0.05067873303167421, + "grad_norm": 1.019572377204895, + "learning_rate": 0.00013286, + "loss": 0.5062, + "mean_token_accuracy": 0.8893097043037415, + "num_tokens": 124040.0, + "step": 14 + }, + { + "entropy": 3.2431325912475586, + "epoch": 0.05429864253393665, + "grad_norm": 1.2394084930419922, + "learning_rate": 0.00014308, + "loss": 0.361, + "mean_token_accuracy": 0.9009967148303986, + "num_tokens": 132447.0, + "step": 15 + }, + { + "entropy": 3.1858643889427185, + "epoch": 0.0579185520361991, + "grad_norm": 0.9859603643417358, + "learning_rate": 0.00015329999999999999, + "loss": 0.4498, + "mean_token_accuracy": 0.887280747294426, + "num_tokens": 141228.0, + "step": 16 + }, + { + "entropy": 3.5029141902923584, + "epoch": 0.06153846153846154, + "grad_norm": 1.453957438468933, + "learning_rate": 0.00016351999999999998, + "loss": 0.4949, + "mean_token_accuracy": 0.888081505894661, + "num_tokens": 149789.0, + "step": 17 + }, + { + "entropy": 3.4572895765304565, + "epoch": 0.06515837104072399, + "grad_norm": 1.390377402305603, + "learning_rate": 0.00017374, + "loss": 0.5449, + "mean_token_accuracy": 0.8745045810937881, + "num_tokens": 157813.0, + "step": 18 + }, + { + "entropy": 3.3081750869750977, + "epoch": 0.06877828054298643, + "grad_norm": 1.1171791553497314, + "learning_rate": 0.00018395999999999997, + "loss": 0.4786, + "mean_token_accuracy": 0.8893420845270157, + "num_tokens": 166315.0, + "step": 19 + }, + { + "entropy": 3.3776715993881226, + "epoch": 0.07239819004524888, + "grad_norm": 1.5567998886108398, + "learning_rate": 0.00019418, + "loss": 0.3669, + "mean_token_accuracy": 0.9146632701158524, + "num_tokens": 175207.0, + "step": 20 + }, + { + "entropy": 3.2677870988845825, + "epoch": 0.0760180995475113, + "grad_norm": 1.7404611110687256, + "learning_rate": 0.00020439999999999998, + "loss": 0.5287, + "mean_token_accuracy": 0.8777483552694321, + "num_tokens": 183833.0, + "step": 21 + }, + { + "entropy": 3.313201069831848, + "epoch": 0.07963800904977375, + "grad_norm": 1.0836979150772095, + "learning_rate": 0.00021461999999999997, + "loss": 0.3014, + "mean_token_accuracy": 0.9215261936187744, + "num_tokens": 192591.0, + "step": 22 + }, + { + "entropy": 3.208672881126404, + "epoch": 0.0832579185520362, + "grad_norm": 1.2197301387786865, + "learning_rate": 0.00022483999999999997, + "loss": 0.4401, + "mean_token_accuracy": 0.9031257778406143, + "num_tokens": 201372.0, + "step": 23 + }, + { + "entropy": 3.1830995082855225, + "epoch": 0.08687782805429864, + "grad_norm": 1.2422229051589966, + "learning_rate": 0.00023506, + "loss": 0.5144, + "mean_token_accuracy": 0.8915928155183792, + "num_tokens": 210348.0, + "step": 24 + }, + { + "entropy": 3.085207223892212, + "epoch": 0.09049773755656108, + "grad_norm": 0.8987624049186707, + "learning_rate": 0.00024527999999999996, + "loss": 0.3253, + "mean_token_accuracy": 0.9221627116203308, + "num_tokens": 219131.0, + "step": 25 + }, + { + "entropy": 3.026031017303467, + "epoch": 0.09411764705882353, + "grad_norm": 1.0273475646972656, + "learning_rate": 0.0002555, + "loss": 0.3495, + "mean_token_accuracy": 0.9147634357213974, + "num_tokens": 228292.0, + "step": 26 + }, + { + "entropy": 3.0420032739639282, + "epoch": 0.09773755656108597, + "grad_norm": 1.0590945482254028, + "learning_rate": 0.00026572, + "loss": 0.4495, + "mean_token_accuracy": 0.9019353687763214, + "num_tokens": 236942.0, + "step": 27 + }, + { + "entropy": 3.0469263792037964, + "epoch": 0.10135746606334842, + "grad_norm": 0.9584959745407104, + "learning_rate": 0.00027594, + "loss": 0.405, + "mean_token_accuracy": 0.9216890782117844, + "num_tokens": 245543.0, + "step": 28 + }, + { + "entropy": 2.92683744430542, + "epoch": 0.10497737556561086, + "grad_norm": 0.8826628923416138, + "learning_rate": 0.00028616, + "loss": 0.4004, + "mean_token_accuracy": 0.9173285663127899, + "num_tokens": 254264.0, + "step": 29 + }, + { + "entropy": 3.0086968541145325, + "epoch": 0.1085972850678733, + "grad_norm": 0.8521863222122192, + "learning_rate": 0.00029637999999999995, + "loss": 0.2876, + "mean_token_accuracy": 0.9335231184959412, + "num_tokens": 263143.0, + "step": 30 + }, + { + "entropy": 2.9086623191833496, + "epoch": 0.11221719457013575, + "grad_norm": 0.7830919623374939, + "learning_rate": 0.00030659999999999997, + "loss": 0.548, + "mean_token_accuracy": 0.8831343650817871, + "num_tokens": 272055.0, + "step": 31 + }, + { + "entropy": 2.9730575680732727, + "epoch": 0.1158371040723982, + "grad_norm": 0.7217472195625305, + "learning_rate": 0.00031682, + "loss": 0.3564, + "mean_token_accuracy": 0.9119151830673218, + "num_tokens": 280971.0, + "step": 32 + }, + { + "entropy": 3.081720530986786, + "epoch": 0.11945701357466064, + "grad_norm": 0.8697704076766968, + "learning_rate": 0.00032703999999999996, + "loss": 0.334, + "mean_token_accuracy": 0.9234935492277145, + "num_tokens": 289449.0, + "step": 33 + }, + { + "entropy": 3.1043431162834167, + "epoch": 0.12307692307692308, + "grad_norm": 0.7962514758110046, + "learning_rate": 0.00033726, + "loss": 0.1602, + "mean_token_accuracy": 0.9554370939731598, + "num_tokens": 297804.0, + "step": 34 + }, + { + "entropy": 3.0275490283966064, + "epoch": 0.12669683257918551, + "grad_norm": 0.5887104272842407, + "learning_rate": 0.00034748, + "loss": 0.2254, + "mean_token_accuracy": 0.9491932094097137, + "num_tokens": 306589.0, + "step": 35 + }, + { + "entropy": 3.099652886390686, + "epoch": 0.13031674208144797, + "grad_norm": 0.894397497177124, + "learning_rate": 0.00035769999999999997, + "loss": 0.6397, + "mean_token_accuracy": 0.8802188038825989, + "num_tokens": 315534.0, + "step": 36 + }, + { + "entropy": 3.0312134623527527, + "epoch": 0.1339366515837104, + "grad_norm": 0.6374682188034058, + "learning_rate": 0.00036791999999999993, + "loss": 0.2183, + "mean_token_accuracy": 0.9478497952222824, + "num_tokens": 324492.0, + "step": 37 + }, + { + "entropy": 3.28497713804245, + "epoch": 0.13755656108597286, + "grad_norm": 0.6740968823432922, + "learning_rate": 0.00037813999999999995, + "loss": 0.3619, + "mean_token_accuracy": 0.9288723170757294, + "num_tokens": 333195.0, + "step": 38 + }, + { + "entropy": 3.1478323340415955, + "epoch": 0.1411764705882353, + "grad_norm": 0.7235494256019592, + "learning_rate": 0.00038836, + "loss": 0.324, + "mean_token_accuracy": 0.9179254025220871, + "num_tokens": 342028.0, + "step": 39 + }, + { + "entropy": 3.279879152774811, + "epoch": 0.14479638009049775, + "grad_norm": 0.7512595653533936, + "learning_rate": 0.00039858, + "loss": 0.4804, + "mean_token_accuracy": 0.889826312661171, + "num_tokens": 350902.0, + "step": 40 + }, + { + "entropy": 3.173546612262726, + "epoch": 0.14841628959276018, + "grad_norm": 0.6978861689567566, + "learning_rate": 0.00040879999999999996, + "loss": 0.3442, + "mean_token_accuracy": 0.9205169230699539, + "num_tokens": 359787.0, + "step": 41 + }, + { + "entropy": 3.2385765314102173, + "epoch": 0.1520361990950226, + "grad_norm": 0.8108944892883301, + "learning_rate": 0.00041901999999999993, + "loss": 0.4223, + "mean_token_accuracy": 0.8979178965091705, + "num_tokens": 368426.0, + "step": 42 + }, + { + "entropy": 3.146568477153778, + "epoch": 0.15565610859728507, + "grad_norm": 0.5847787261009216, + "learning_rate": 0.00042923999999999995, + "loss": 0.1953, + "mean_token_accuracy": 0.9556037336587906, + "num_tokens": 377349.0, + "step": 43 + }, + { + "entropy": 3.066233277320862, + "epoch": 0.1592760180995475, + "grad_norm": 0.7887329459190369, + "learning_rate": 0.00043945999999999997, + "loss": 0.6815, + "mean_token_accuracy": 0.8654293268918991, + "num_tokens": 386603.0, + "step": 44 + }, + { + "entropy": 3.1745981574058533, + "epoch": 0.16289592760180996, + "grad_norm": 0.7280165553092957, + "learning_rate": 0.00044967999999999994, + "loss": 0.1932, + "mean_token_accuracy": 0.9479279220104218, + "num_tokens": 395070.0, + "step": 45 + }, + { + "entropy": 3.1094446182250977, + "epoch": 0.1665158371040724, + "grad_norm": 0.6453448534011841, + "learning_rate": 0.00045989999999999996, + "loss": 0.2608, + "mean_token_accuracy": 0.9249396026134491, + "num_tokens": 403651.0, + "step": 46 + }, + { + "entropy": 2.9050925970077515, + "epoch": 0.17013574660633485, + "grad_norm": 0.6689278483390808, + "learning_rate": 0.00047012, + "loss": 0.4489, + "mean_token_accuracy": 0.898686870932579, + "num_tokens": 412898.0, + "step": 47 + }, + { + "entropy": 3.2239145040512085, + "epoch": 0.17375565610859728, + "grad_norm": 1.0014020204544067, + "learning_rate": 0.00048033999999999994, + "loss": 0.3234, + "mean_token_accuracy": 0.9231891483068466, + "num_tokens": 421420.0, + "step": 48 + }, + { + "entropy": 3.035899817943573, + "epoch": 0.17737556561085974, + "grad_norm": 0.6415768265724182, + "learning_rate": 0.0004905599999999999, + "loss": 0.2259, + "mean_token_accuracy": 0.9447792917490005, + "num_tokens": 430258.0, + "step": 49 + }, + { + "entropy": 3.057477653026581, + "epoch": 0.18099547511312217, + "grad_norm": 0.6042271256446838, + "learning_rate": 0.0005007799999999999, + "loss": 0.2228, + "mean_token_accuracy": 0.9473378211259842, + "num_tokens": 439593.0, + "step": 50 + }, + { + "entropy": 2.8375911116600037, + "epoch": 0.18461538461538463, + "grad_norm": 0.739811897277832, + "learning_rate": 0.000511, + "loss": 0.3623, + "mean_token_accuracy": 0.9050924181938171, + "num_tokens": 449056.0, + "step": 51 + }, + { + "entropy": 2.9926682114601135, + "epoch": 0.18823529411764706, + "grad_norm": 0.6637321710586548, + "learning_rate": 0.0005109995633102972, + "loss": 0.2924, + "mean_token_accuracy": 0.9397273659706116, + "num_tokens": 457677.0, + "step": 52 + }, + { + "entropy": 2.7932987809181213, + "epoch": 0.19185520361990951, + "grad_norm": 0.5666584372520447, + "learning_rate": 0.0005109982532428477, + "loss": 0.2055, + "mean_token_accuracy": 0.9385408014059067, + "num_tokens": 466969.0, + "step": 53 + }, + { + "entropy": 2.765812337398529, + "epoch": 0.19547511312217195, + "grad_norm": 0.7875120639801025, + "learning_rate": 0.0005109960698026271, + "loss": 0.4549, + "mean_token_accuracy": 0.9052814990282059, + "num_tokens": 476285.0, + "step": 54 + }, + { + "entropy": 2.884207248687744, + "epoch": 0.19909502262443438, + "grad_norm": 0.7538661956787109, + "learning_rate": 0.0005109930129979285, + "loss": 0.3751, + "mean_token_accuracy": 0.9210246652364731, + "num_tokens": 484668.0, + "step": 55 + }, + { + "entropy": 2.779718518257141, + "epoch": 0.20271493212669683, + "grad_norm": 0.8069296479225159, + "learning_rate": 0.0005109890828403621, + "loss": 0.3664, + "mean_token_accuracy": 0.9219843596220016, + "num_tokens": 493292.0, + "step": 56 + }, + { + "entropy": 2.841543674468994, + "epoch": 0.20633484162895926, + "grad_norm": 0.5545904636383057, + "learning_rate": 0.0005109842793448548, + "loss": 0.1973, + "mean_token_accuracy": 0.9547395706176758, + "num_tokens": 501973.0, + "step": 57 + }, + { + "entropy": 2.8180030584335327, + "epoch": 0.20995475113122172, + "grad_norm": 1.015456199645996, + "learning_rate": 0.0005109786025296513, + "loss": 0.6019, + "mean_token_accuracy": 0.88613361120224, + "num_tokens": 510840.0, + "step": 58 + }, + { + "entropy": 2.7450912594795227, + "epoch": 0.21357466063348415, + "grad_norm": 0.6784740686416626, + "learning_rate": 0.0005109720524163127, + "loss": 0.2868, + "mean_token_accuracy": 0.9295425117015839, + "num_tokens": 519656.0, + "step": 59 + }, + { + "entropy": 2.822400987148285, + "epoch": 0.2171945701357466, + "grad_norm": 0.8780149817466736, + "learning_rate": 0.000510964629029717, + "loss": 0.4371, + "mean_token_accuracy": 0.9089596569538116, + "num_tokens": 528105.0, + "step": 60 + }, + { + "entropy": 2.522100865840912, + "epoch": 0.22081447963800904, + "grad_norm": 0.51394122838974, + "learning_rate": 0.0005109563323980594, + "loss": 0.2509, + "mean_token_accuracy": 0.941976860165596, + "num_tokens": 537707.0, + "step": 61 + }, + { + "entropy": 2.6596657633781433, + "epoch": 0.2244343891402715, + "grad_norm": 0.6359816789627075, + "learning_rate": 0.0005109471625528516, + "loss": 0.3685, + "mean_token_accuracy": 0.9191890209913254, + "num_tokens": 546517.0, + "step": 62 + }, + { + "entropy": 2.800311803817749, + "epoch": 0.22805429864253393, + "grad_norm": 0.6862941980361938, + "learning_rate": 0.0005109371195289215, + "loss": 0.2457, + "mean_token_accuracy": 0.9330879002809525, + "num_tokens": 555493.0, + "step": 63 + }, + { + "entropy": 2.7235344648361206, + "epoch": 0.2316742081447964, + "grad_norm": 1.0464682579040527, + "learning_rate": 0.0005109262033644142, + "loss": 0.4417, + "mean_token_accuracy": 0.8957678377628326, + "num_tokens": 564255.0, + "step": 64 + }, + { + "entropy": 2.6643534302711487, + "epoch": 0.23529411764705882, + "grad_norm": 1.0790019035339355, + "learning_rate": 0.0005109144141007903, + "loss": 0.4947, + "mean_token_accuracy": 0.8889007717370987, + "num_tokens": 573401.0, + "step": 65 + }, + { + "entropy": 2.760925054550171, + "epoch": 0.23891402714932128, + "grad_norm": 0.7957189679145813, + "learning_rate": 0.0005109017517828273, + "loss": 0.2259, + "mean_token_accuracy": 0.944578230381012, + "num_tokens": 581905.0, + "step": 66 + }, + { + "entropy": 2.7048792839050293, + "epoch": 0.2425339366515837, + "grad_norm": 0.9530714750289917, + "learning_rate": 0.0005108882164586181, + "loss": 0.3122, + "mean_token_accuracy": 0.9257418513298035, + "num_tokens": 590802.0, + "step": 67 + }, + { + "entropy": 2.6733291149139404, + "epoch": 0.24615384615384617, + "grad_norm": 0.8295993208885193, + "learning_rate": 0.0005108738081795716, + "loss": 0.3701, + "mean_token_accuracy": 0.898589238524437, + "num_tokens": 599279.0, + "step": 68 + }, + { + "entropy": 2.5613606572151184, + "epoch": 0.2497737556561086, + "grad_norm": 0.6205935478210449, + "learning_rate": 0.0005108585270004123, + "loss": 0.4372, + "mean_token_accuracy": 0.9116007685661316, + "num_tokens": 608107.0, + "step": 69 + }, + { + "entropy": 2.458296835422516, + "epoch": 0.25339366515837103, + "grad_norm": 0.7629838585853577, + "learning_rate": 0.0005108423729791799, + "loss": 0.2307, + "mean_token_accuracy": 0.9386163502931595, + "num_tokens": 616881.0, + "step": 70 + }, + { + "entropy": 2.4176695346832275, + "epoch": 0.25701357466063346, + "grad_norm": 0.902400016784668, + "learning_rate": 0.0005108253461772298, + "loss": 0.2853, + "mean_token_accuracy": 0.9237343072891235, + "num_tokens": 625323.0, + "step": 71 + }, + { + "entropy": 2.2265281677246094, + "epoch": 0.26063348416289595, + "grad_norm": 0.7744383811950684, + "learning_rate": 0.0005108074466592316, + "loss": 0.2435, + "mean_token_accuracy": 0.9508260935544968, + "num_tokens": 634260.0, + "step": 72 + }, + { + "entropy": 2.1855952441692352, + "epoch": 0.2642533936651584, + "grad_norm": 0.8615190386772156, + "learning_rate": 0.0005107886744931702, + "loss": 0.3323, + "mean_token_accuracy": 0.9276078194379807, + "num_tokens": 643235.0, + "step": 73 + }, + { + "entropy": 2.179121494293213, + "epoch": 0.2678733031674208, + "grad_norm": 0.8953279256820679, + "learning_rate": 0.0005107690297503444, + "loss": 0.2384, + "mean_token_accuracy": 0.9425230622291565, + "num_tokens": 652032.0, + "step": 74 + }, + { + "entropy": 2.1565526127815247, + "epoch": 0.27149321266968324, + "grad_norm": 0.6830486059188843, + "learning_rate": 0.0005107485125053678, + "loss": 0.2759, + "mean_token_accuracy": 0.9360661953687668, + "num_tokens": 660978.0, + "step": 75 + }, + { + "entropy": 2.0900665521621704, + "epoch": 0.2751131221719457, + "grad_norm": 0.786665141582489, + "learning_rate": 0.0005107271228361672, + "loss": 0.4061, + "mean_token_accuracy": 0.910009115934372, + "num_tokens": 669817.0, + "step": 76 + }, + { + "entropy": 2.1311859488487244, + "epoch": 0.27873303167420815, + "grad_norm": 0.6399909853935242, + "learning_rate": 0.0005107048608239836, + "loss": 0.272, + "mean_token_accuracy": 0.9424714297056198, + "num_tokens": 678469.0, + "step": 77 + }, + { + "entropy": 2.059997320175171, + "epoch": 0.2823529411764706, + "grad_norm": 0.8114754557609558, + "learning_rate": 0.0005106817265533706, + "loss": 0.4029, + "mean_token_accuracy": 0.9037660360336304, + "num_tokens": 687261.0, + "step": 78 + }, + { + "entropy": 1.9725019037723541, + "epoch": 0.285972850678733, + "grad_norm": 0.9420941472053528, + "learning_rate": 0.0005106577201121952, + "loss": 0.535, + "mean_token_accuracy": 0.8996377140283585, + "num_tokens": 695941.0, + "step": 79 + }, + { + "entropy": 1.9951164424419403, + "epoch": 0.2895927601809955, + "grad_norm": 0.6476142406463623, + "learning_rate": 0.0005106328415916372, + "loss": 0.2242, + "mean_token_accuracy": 0.941379725933075, + "num_tokens": 704643.0, + "step": 80 + }, + { + "entropy": 1.8962564170360565, + "epoch": 0.29321266968325793, + "grad_norm": 0.5974630117416382, + "learning_rate": 0.0005106070910861881, + "loss": 0.2934, + "mean_token_accuracy": 0.9217697530984879, + "num_tokens": 713605.0, + "step": 81 + }, + { + "entropy": 1.9781515896320343, + "epoch": 0.29683257918552036, + "grad_norm": 0.8755478262901306, + "learning_rate": 0.0005105804686936518, + "loss": 0.4551, + "mean_token_accuracy": 0.9051328897476196, + "num_tokens": 722385.0, + "step": 82 + }, + { + "entropy": 1.9892418384552002, + "epoch": 0.3004524886877828, + "grad_norm": 0.6887345314025879, + "learning_rate": 0.0005105529745151433, + "loss": 0.244, + "mean_token_accuracy": 0.9261117279529572, + "num_tokens": 730962.0, + "step": 83 + }, + { + "entropy": 2.0053181648254395, + "epoch": 0.3040723981900452, + "grad_norm": 0.6930885910987854, + "learning_rate": 0.0005105246086550893, + "loss": 0.3155, + "mean_token_accuracy": 0.9206147193908691, + "num_tokens": 739499.0, + "step": 84 + }, + { + "entropy": 1.9716475903987885, + "epoch": 0.3076923076923077, + "grad_norm": 0.5049461722373962, + "learning_rate": 0.0005104953712212266, + "loss": 0.2215, + "mean_token_accuracy": 0.9608763605356216, + "num_tokens": 748604.0, + "step": 85 + }, + { + "entropy": 1.9186978042125702, + "epoch": 0.31131221719457014, + "grad_norm": 0.5756685733795166, + "learning_rate": 0.000510465262324603, + "loss": 0.2658, + "mean_token_accuracy": 0.9372887462377548, + "num_tokens": 757919.0, + "step": 86 + }, + { + "entropy": 1.9738290905952454, + "epoch": 0.31493212669683257, + "grad_norm": 0.6163789629936218, + "learning_rate": 0.0005104342820795758, + "loss": 0.2472, + "mean_token_accuracy": 0.9430449157953262, + "num_tokens": 766708.0, + "step": 87 + }, + { + "entropy": 2.1927571892738342, + "epoch": 0.318552036199095, + "grad_norm": 0.7953162789344788, + "learning_rate": 0.0005104024306038119, + "loss": 0.261, + "mean_token_accuracy": 0.9425829648971558, + "num_tokens": 774601.0, + "step": 88 + }, + { + "entropy": 2.043731451034546, + "epoch": 0.3221719457013575, + "grad_norm": 0.8098088502883911, + "learning_rate": 0.0005103697080182872, + "loss": 0.3126, + "mean_token_accuracy": 0.9158089309930801, + "num_tokens": 783170.0, + "step": 89 + }, + { + "entropy": 1.9801572561264038, + "epoch": 0.3257918552036199, + "grad_norm": 0.5227240920066833, + "learning_rate": 0.0005103361144472864, + "loss": 0.1291, + "mean_token_accuracy": 0.9666071832180023, + "num_tokens": 791769.0, + "step": 90 + }, + { + "entropy": 1.9553790986537933, + "epoch": 0.32941176470588235, + "grad_norm": 0.7819464206695557, + "learning_rate": 0.0005103016500184022, + "loss": 0.531, + "mean_token_accuracy": 0.8817111849784851, + "num_tokens": 800824.0, + "step": 91 + }, + { + "entropy": 1.9291303753852844, + "epoch": 0.3330316742081448, + "grad_norm": 0.7178757190704346, + "learning_rate": 0.0005102663148625347, + "loss": 0.3301, + "mean_token_accuracy": 0.9357631802558899, + "num_tokens": 809347.0, + "step": 92 + }, + { + "entropy": 1.9846041798591614, + "epoch": 0.33665158371040727, + "grad_norm": 1.316636085510254, + "learning_rate": 0.0005102301091138916, + "loss": 0.4241, + "mean_token_accuracy": 0.8993304669857025, + "num_tokens": 817174.0, + "step": 93 + }, + { + "entropy": 1.814637303352356, + "epoch": 0.3402714932126697, + "grad_norm": 0.5486414432525635, + "learning_rate": 0.0005101930329099865, + "loss": 0.116, + "mean_token_accuracy": 0.9674727618694305, + "num_tokens": 826177.0, + "step": 94 + }, + { + "entropy": 1.9128066003322601, + "epoch": 0.3438914027149321, + "grad_norm": 0.620303750038147, + "learning_rate": 0.00051015508639164, + "loss": 0.1833, + "mean_token_accuracy": 0.9569521993398666, + "num_tokens": 835409.0, + "step": 95 + }, + { + "entropy": 1.7541870176792145, + "epoch": 0.34751131221719456, + "grad_norm": 0.8337438702583313, + "learning_rate": 0.0005101162697029776, + "loss": 0.3327, + "mean_token_accuracy": 0.9193180054426193, + "num_tokens": 844692.0, + "step": 96 + }, + { + "entropy": 1.8255240619182587, + "epoch": 0.351131221719457, + "grad_norm": 0.877780556678772, + "learning_rate": 0.00051007658299143, + "loss": 0.2106, + "mean_token_accuracy": 0.9527023881673813, + "num_tokens": 853309.0, + "step": 97 + }, + { + "entropy": 1.8611579239368439, + "epoch": 0.3547511312217195, + "grad_norm": 1.0667716264724731, + "learning_rate": 0.0005100360264077325, + "loss": 0.3196, + "mean_token_accuracy": 0.9195879399776459, + "num_tokens": 861859.0, + "step": 98 + }, + { + "entropy": 1.821915864944458, + "epoch": 0.3583710407239819, + "grad_norm": 0.8400309681892395, + "learning_rate": 0.0005099946001059241, + "loss": 0.4036, + "mean_token_accuracy": 0.8951036781072617, + "num_tokens": 871060.0, + "step": 99 + }, + { + "entropy": 1.7648265063762665, + "epoch": 0.36199095022624433, + "grad_norm": 1.1391404867172241, + "learning_rate": 0.0005099523042433472, + "loss": 0.389, + "mean_token_accuracy": 0.901309460401535, + "num_tokens": 880593.0, + "step": 100 + }, + { + "entropy": 1.8506875336170197, + "epoch": 0.36561085972850677, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.000509909138980647, + "loss": 0.2504, + "mean_token_accuracy": 0.9384842216968536, + "num_tokens": 889739.0, + "step": 101 + }, + { + "entropy": 1.9311015605926514, + "epoch": 0.36923076923076925, + "grad_norm": 0.9677391052246094, + "learning_rate": 0.0005098651044817704, + "loss": 0.6953, + "mean_token_accuracy": 0.8752655684947968, + "num_tokens": 898992.0, + "step": 102 + }, + { + "entropy": 1.9590983986854553, + "epoch": 0.3728506787330317, + "grad_norm": 0.6364567279815674, + "learning_rate": 0.0005098202009139663, + "loss": 0.4318, + "mean_token_accuracy": 0.9056479930877686, + "num_tokens": 908225.0, + "step": 103 + }, + { + "entropy": 1.9455370008945465, + "epoch": 0.3764705882352941, + "grad_norm": 0.6747863292694092, + "learning_rate": 0.0005097744284477839, + "loss": 0.244, + "mean_token_accuracy": 0.9428392052650452, + "num_tokens": 917134.0, + "step": 104 + }, + { + "entropy": 1.8632825911045074, + "epoch": 0.38009049773755654, + "grad_norm": 0.5705651044845581, + "learning_rate": 0.0005097277872570731, + "loss": 0.2508, + "mean_token_accuracy": 0.9325222969055176, + "num_tokens": 926573.0, + "step": 105 + }, + { + "entropy": 1.9370323717594147, + "epoch": 0.38371040723981903, + "grad_norm": 0.6298627853393555, + "learning_rate": 0.000509680277518983, + "loss": 0.2481, + "mean_token_accuracy": 0.9281332045793533, + "num_tokens": 935853.0, + "step": 106 + }, + { + "entropy": 2.0217572450637817, + "epoch": 0.38733031674208146, + "grad_norm": 0.5434353947639465, + "learning_rate": 0.0005096318994139617, + "loss": 0.1809, + "mean_token_accuracy": 0.9592084139585495, + "num_tokens": 944279.0, + "step": 107 + }, + { + "entropy": 1.9619770646095276, + "epoch": 0.3909502262443439, + "grad_norm": 0.6959638595581055, + "learning_rate": 0.0005095826531257552, + "loss": 0.1376, + "mean_token_accuracy": 0.9608310014009476, + "num_tokens": 953336.0, + "step": 108 + }, + { + "entropy": 2.12511146068573, + "epoch": 0.3945701357466063, + "grad_norm": 1.0152848958969116, + "learning_rate": 0.0005095325388414074, + "loss": 0.4382, + "mean_token_accuracy": 0.915201798081398, + "num_tokens": 962002.0, + "step": 109 + }, + { + "entropy": 2.0171878039836884, + "epoch": 0.39819004524886875, + "grad_norm": 0.8337467312812805, + "learning_rate": 0.0005094815567512587, + "loss": 0.2672, + "mean_token_accuracy": 0.9313560128211975, + "num_tokens": 970954.0, + "step": 110 + }, + { + "entropy": 2.1024146378040314, + "epoch": 0.40180995475113124, + "grad_norm": 0.8214333057403564, + "learning_rate": 0.0005094297070489455, + "loss": 0.3146, + "mean_token_accuracy": 0.9289091974496841, + "num_tokens": 979929.0, + "step": 111 + }, + { + "entropy": 2.260519325733185, + "epoch": 0.40542986425339367, + "grad_norm": 1.1298810243606567, + "learning_rate": 0.0005093769899313996, + "loss": 0.3055, + "mean_token_accuracy": 0.9213490188121796, + "num_tokens": 988477.0, + "step": 112 + }, + { + "entropy": 2.2228699326515198, + "epoch": 0.4090497737556561, + "grad_norm": 0.8601953983306885, + "learning_rate": 0.0005093234055988475, + "loss": 0.2738, + "mean_token_accuracy": 0.920888364315033, + "num_tokens": 997091.0, + "step": 113 + }, + { + "entropy": 2.2165185809135437, + "epoch": 0.41266968325791853, + "grad_norm": 0.6331561803817749, + "learning_rate": 0.0005092689542548091, + "loss": 0.2241, + "mean_token_accuracy": 0.9408514499664307, + "num_tokens": 1005866.0, + "step": 114 + }, + { + "entropy": 2.324040472507477, + "epoch": 0.416289592760181, + "grad_norm": 0.680496096611023, + "learning_rate": 0.0005092136361060975, + "loss": 0.2454, + "mean_token_accuracy": 0.9433349967002869, + "num_tokens": 1014277.0, + "step": 115 + }, + { + "entropy": 2.413789749145508, + "epoch": 0.41990950226244345, + "grad_norm": 0.7489557862281799, + "learning_rate": 0.0005091574513628183, + "loss": 0.2856, + "mean_token_accuracy": 0.934124082326889, + "num_tokens": 1023032.0, + "step": 116 + }, + { + "entropy": 2.4693005681037903, + "epoch": 0.4235294117647059, + "grad_norm": 0.6842612624168396, + "learning_rate": 0.0005091004002383682, + "loss": 0.2778, + "mean_token_accuracy": 0.9386793673038483, + "num_tokens": 1031883.0, + "step": 117 + }, + { + "entropy": 2.4351969361305237, + "epoch": 0.4271493212669683, + "grad_norm": 0.9150674343109131, + "learning_rate": 0.0005090424829494347, + "loss": 0.3151, + "mean_token_accuracy": 0.9177709072828293, + "num_tokens": 1040985.0, + "step": 118 + }, + { + "entropy": 2.5141562819480896, + "epoch": 0.4307692307692308, + "grad_norm": 1.0200655460357666, + "learning_rate": 0.000508983699715995, + "loss": 0.5134, + "mean_token_accuracy": 0.8835459351539612, + "num_tokens": 1049949.0, + "step": 119 + }, + { + "entropy": 2.479240596294403, + "epoch": 0.4343891402714932, + "grad_norm": 0.783278226852417, + "learning_rate": 0.0005089240507613151, + "loss": 0.2745, + "mean_token_accuracy": 0.9389322698116302, + "num_tokens": 1058953.0, + "step": 120 + }, + { + "entropy": 2.457803785800934, + "epoch": 0.43800904977375565, + "grad_norm": 0.7620834112167358, + "learning_rate": 0.0005088635363119497, + "loss": 0.3394, + "mean_token_accuracy": 0.9145695865154266, + "num_tokens": 1068624.0, + "step": 121 + }, + { + "entropy": 2.4909247756004333, + "epoch": 0.4416289592760181, + "grad_norm": 0.5868712067604065, + "learning_rate": 0.0005088021565977403, + "loss": 0.1726, + "mean_token_accuracy": 0.9567564129829407, + "num_tokens": 1077686.0, + "step": 122 + }, + { + "entropy": 2.5540462732315063, + "epoch": 0.4452488687782805, + "grad_norm": 1.1467291116714478, + "learning_rate": 0.0005087399118518148, + "loss": 0.2617, + "mean_token_accuracy": 0.9329706132411957, + "num_tokens": 1086230.0, + "step": 123 + }, + { + "entropy": 2.377680242061615, + "epoch": 0.448868778280543, + "grad_norm": 0.7021825909614563, + "learning_rate": 0.0005086768023105866, + "loss": 0.4124, + "mean_token_accuracy": 0.9093360006809235, + "num_tokens": 1095867.0, + "step": 124 + }, + { + "entropy": 2.55239599943161, + "epoch": 0.45248868778280543, + "grad_norm": 0.5947801470756531, + "learning_rate": 0.0005086128282137538, + "loss": 0.2752, + "mean_token_accuracy": 0.9248816668987274, + "num_tokens": 1105003.0, + "step": 125 + }, + { + "entropy": 2.4695483446121216, + "epoch": 0.45610859728506786, + "grad_norm": 1.345604658126831, + "learning_rate": 0.0005085479898042985, + "loss": 0.2577, + "mean_token_accuracy": 0.9318550229072571, + "num_tokens": 1114162.0, + "step": 126 + }, + { + "entropy": 2.4898732900619507, + "epoch": 0.4597285067873303, + "grad_norm": 0.8534179329872131, + "learning_rate": 0.0005084822873284848, + "loss": 0.3013, + "mean_token_accuracy": 0.9195661097764969, + "num_tokens": 1123457.0, + "step": 127 + }, + { + "entropy": 2.5951223969459534, + "epoch": 0.4633484162895928, + "grad_norm": 1.1677368879318237, + "learning_rate": 0.0005084157210358592, + "loss": 0.1612, + "mean_token_accuracy": 0.9599333852529526, + "num_tokens": 1131774.0, + "step": 128 + }, + { + "entropy": 2.7315847873687744, + "epoch": 0.4669683257918552, + "grad_norm": 0.7633224129676819, + "learning_rate": 0.0005083482911792492, + "loss": 0.2437, + "mean_token_accuracy": 0.9487509876489639, + "num_tokens": 1140301.0, + "step": 129 + }, + { + "entropy": 2.6348633766174316, + "epoch": 0.47058823529411764, + "grad_norm": 0.7573317885398865, + "learning_rate": 0.0005082799980147617, + "loss": 0.2426, + "mean_token_accuracy": 0.947308748960495, + "num_tokens": 1148929.0, + "step": 130 + }, + { + "entropy": 2.60002738237381, + "epoch": 0.47420814479638007, + "grad_norm": 1.8195319175720215, + "learning_rate": 0.0005082108418017829, + "loss": 0.1792, + "mean_token_accuracy": 0.9512491375207901, + "num_tokens": 1157682.0, + "step": 131 + }, + { + "entropy": 2.5319923162460327, + "epoch": 0.47782805429864256, + "grad_norm": 0.6342993378639221, + "learning_rate": 0.0005081408228029771, + "loss": 0.1843, + "mean_token_accuracy": 0.9440758228302002, + "num_tokens": 1166687.0, + "step": 132 + }, + { + "entropy": 2.5666881799697876, + "epoch": 0.481447963800905, + "grad_norm": 0.8979415893554688, + "learning_rate": 0.0005080699412842852, + "loss": 0.4824, + "mean_token_accuracy": 0.8837443292140961, + "num_tokens": 1175746.0, + "step": 133 + }, + { + "entropy": 2.6854636669158936, + "epoch": 0.4850678733031674, + "grad_norm": 0.8302125334739685, + "learning_rate": 0.0005079981975149243, + "loss": 0.267, + "mean_token_accuracy": 0.9279022663831711, + "num_tokens": 1184196.0, + "step": 134 + }, + { + "entropy": 2.564552128314972, + "epoch": 0.48868778280542985, + "grad_norm": 0.6785959005355835, + "learning_rate": 0.0005079255917673863, + "loss": 0.2031, + "mean_token_accuracy": 0.9463823586702347, + "num_tokens": 1192982.0, + "step": 135 + }, + { + "entropy": 2.673682928085327, + "epoch": 0.49230769230769234, + "grad_norm": 1.4760410785675049, + "learning_rate": 0.0005078521243174371, + "loss": 0.4791, + "mean_token_accuracy": 0.8969505727291107, + "num_tokens": 1201454.0, + "step": 136 + }, + { + "entropy": 2.6232714653015137, + "epoch": 0.49592760180995477, + "grad_norm": 0.7845668792724609, + "learning_rate": 0.0005077777954441157, + "loss": 0.2472, + "mean_token_accuracy": 0.9404618591070175, + "num_tokens": 1210182.0, + "step": 137 + }, + { + "entropy": 2.5614060163497925, + "epoch": 0.4995475113122172, + "grad_norm": 0.725419819355011, + "learning_rate": 0.0005077026054297322, + "loss": 0.3643, + "mean_token_accuracy": 0.9193316847085953, + "num_tokens": 1219487.0, + "step": 138 + }, + { + "entropy": 2.5907246470451355, + "epoch": 0.5031674208144796, + "grad_norm": 0.7741782665252686, + "learning_rate": 0.0005076265545598682, + "loss": 0.276, + "mean_token_accuracy": 0.9447730481624603, + "num_tokens": 1228066.0, + "step": 139 + }, + { + "entropy": 2.531104028224945, + "epoch": 0.5067873303167421, + "grad_norm": 0.680992603302002, + "learning_rate": 0.0005075496431233745, + "loss": 0.2004, + "mean_token_accuracy": 0.9470729678869247, + "num_tokens": 1236980.0, + "step": 140 + }, + { + "entropy": 2.590231478214264, + "epoch": 0.5104072398190045, + "grad_norm": 0.8260406255722046, + "learning_rate": 0.0005074718714123704, + "loss": 0.2756, + "mean_token_accuracy": 0.9301882535219193, + "num_tokens": 1245565.0, + "step": 141 + }, + { + "entropy": 2.4858668446540833, + "epoch": 0.5140271493212669, + "grad_norm": 0.8085922598838806, + "learning_rate": 0.0005073932397222429, + "loss": 0.2314, + "mean_token_accuracy": 0.9449103325605392, + "num_tokens": 1254366.0, + "step": 142 + }, + { + "entropy": 2.5374304056167603, + "epoch": 0.5176470588235295, + "grad_norm": 0.7858129143714905, + "learning_rate": 0.0005073137483516452, + "loss": 0.1622, + "mean_token_accuracy": 0.9510673582553864, + "num_tokens": 1263197.0, + "step": 143 + }, + { + "entropy": 2.608425199985504, + "epoch": 0.5212669683257919, + "grad_norm": 1.2698506116867065, + "learning_rate": 0.0005072333976024957, + "loss": 0.1729, + "mean_token_accuracy": 0.9509973376989365, + "num_tokens": 1271725.0, + "step": 144 + }, + { + "entropy": 2.437038242816925, + "epoch": 0.5248868778280543, + "grad_norm": 1.0788538455963135, + "learning_rate": 0.0005071521877799765, + "loss": 0.3344, + "mean_token_accuracy": 0.9166721999645233, + "num_tokens": 1280963.0, + "step": 145 + }, + { + "entropy": 2.589951515197754, + "epoch": 0.5285067873303168, + "grad_norm": 0.9228294491767883, + "learning_rate": 0.0005070701191925332, + "loss": 0.3095, + "mean_token_accuracy": 0.9239777624607086, + "num_tokens": 1289683.0, + "step": 146 + }, + { + "entropy": 2.575794994831085, + "epoch": 0.5321266968325792, + "grad_norm": 1.359767198562622, + "learning_rate": 0.0005069871921518726, + "loss": 0.2447, + "mean_token_accuracy": 0.9374738186597824, + "num_tokens": 1298397.0, + "step": 147 + }, + { + "entropy": 2.5628358721733093, + "epoch": 0.5357466063348416, + "grad_norm": 0.9870713353157043, + "learning_rate": 0.000506903406972962, + "loss": 0.4824, + "mean_token_accuracy": 0.9027767181396484, + "num_tokens": 1307191.0, + "step": 148 + }, + { + "entropy": 2.5513240098953247, + "epoch": 0.539366515837104, + "grad_norm": 0.7921387553215027, + "learning_rate": 0.0005068187639740286, + "loss": 0.3278, + "mean_token_accuracy": 0.9161934554576874, + "num_tokens": 1315878.0, + "step": 149 + }, + { + "entropy": 2.526439070701599, + "epoch": 0.5429864253393665, + "grad_norm": 0.6320391297340393, + "learning_rate": 0.000506733263476557, + "loss": 0.1701, + "mean_token_accuracy": 0.9575318098068237, + "num_tokens": 1324786.0, + "step": 150 + }, + { + "entropy": 2.4837265014648438, + "epoch": 0.5466063348416289, + "grad_norm": 0.5369354486465454, + "learning_rate": 0.000506646905805289, + "loss": 0.1328, + "mean_token_accuracy": 0.9636050164699554, + "num_tokens": 1333766.0, + "step": 151 + }, + { + "entropy": 2.5264737010002136, + "epoch": 0.5502262443438914, + "grad_norm": 0.7346852421760559, + "learning_rate": 0.0005065596912882222, + "loss": 0.2012, + "mean_token_accuracy": 0.9448132663965225, + "num_tokens": 1343004.0, + "step": 152 + }, + { + "entropy": 2.569309651851654, + "epoch": 0.5538461538461539, + "grad_norm": 0.9926508069038391, + "learning_rate": 0.0005064716202566082, + "loss": 0.2831, + "mean_token_accuracy": 0.9332023113965988, + "num_tokens": 1351561.0, + "step": 153 + }, + { + "entropy": 2.3148274421691895, + "epoch": 0.5574660633484163, + "grad_norm": 0.6301954984664917, + "learning_rate": 0.0005063826930449523, + "loss": 0.3622, + "mean_token_accuracy": 0.9349419325590134, + "num_tokens": 1360997.0, + "step": 154 + }, + { + "entropy": 2.497675657272339, + "epoch": 0.5610859728506787, + "grad_norm": 0.8846175670623779, + "learning_rate": 0.000506292909991011, + "loss": 0.2314, + "mean_token_accuracy": 0.9468862265348434, + "num_tokens": 1369600.0, + "step": 155 + }, + { + "entropy": 2.313987612724304, + "epoch": 0.5647058823529412, + "grad_norm": 0.5701894164085388, + "learning_rate": 0.0005062022714357922, + "loss": 0.2154, + "mean_token_accuracy": 0.945093959569931, + "num_tokens": 1379125.0, + "step": 156 + }, + { + "entropy": 2.4019755125045776, + "epoch": 0.5683257918552036, + "grad_norm": 0.8769335746765137, + "learning_rate": 0.0005061107777235524, + "loss": 0.3565, + "mean_token_accuracy": 0.9133864492177963, + "num_tokens": 1388111.0, + "step": 157 + }, + { + "entropy": 2.3127577900886536, + "epoch": 0.571945701357466, + "grad_norm": 1.1026453971862793, + "learning_rate": 0.0005060184292017965, + "loss": 0.2897, + "mean_token_accuracy": 0.899736076593399, + "num_tokens": 1397528.0, + "step": 158 + }, + { + "entropy": 2.2682697772979736, + "epoch": 0.5755656108597285, + "grad_norm": 0.5426591038703918, + "learning_rate": 0.000505925226221276, + "loss": 0.167, + "mean_token_accuracy": 0.9609879851341248, + "num_tokens": 1406809.0, + "step": 159 + }, + { + "entropy": 2.4639336466789246, + "epoch": 0.579185520361991, + "grad_norm": 0.6552363038063049, + "learning_rate": 0.0005058311691359875, + "loss": 0.2511, + "mean_token_accuracy": 0.9355164766311646, + "num_tokens": 1415498.0, + "step": 160 + }, + { + "entropy": 2.467900663614273, + "epoch": 0.5828054298642534, + "grad_norm": 0.7168154120445251, + "learning_rate": 0.000505736258303172, + "loss": 0.234, + "mean_token_accuracy": 0.9450509995222092, + "num_tokens": 1424524.0, + "step": 161 + }, + { + "entropy": 2.3683157563209534, + "epoch": 0.5864253393665159, + "grad_norm": 0.6433501839637756, + "learning_rate": 0.0005056404940833128, + "loss": 0.3441, + "mean_token_accuracy": 0.9261108189821243, + "num_tokens": 1434194.0, + "step": 162 + }, + { + "entropy": 2.4686295986175537, + "epoch": 0.5900452488687783, + "grad_norm": 0.9615177512168884, + "learning_rate": 0.0005055438768401348, + "loss": 0.1492, + "mean_token_accuracy": 0.966903567314148, + "num_tokens": 1442972.0, + "step": 163 + }, + { + "entropy": 2.5551892518997192, + "epoch": 0.5936651583710407, + "grad_norm": 0.4957484006881714, + "learning_rate": 0.0005054464069406023, + "loss": 0.1242, + "mean_token_accuracy": 0.969713419675827, + "num_tokens": 1451324.0, + "step": 164 + }, + { + "entropy": 2.554121434688568, + "epoch": 0.5972850678733032, + "grad_norm": 0.7399498224258423, + "learning_rate": 0.0005053480847549187, + "loss": 0.206, + "mean_token_accuracy": 0.9498797357082367, + "num_tokens": 1459698.0, + "step": 165 + }, + { + "entropy": 2.5181015729904175, + "epoch": 0.6009049773755656, + "grad_norm": 0.7433251142501831, + "learning_rate": 0.0005052489106565241, + "loss": 0.2883, + "mean_token_accuracy": 0.9419967085123062, + "num_tokens": 1468460.0, + "step": 166 + }, + { + "entropy": 2.3073930144309998, + "epoch": 0.604524886877828, + "grad_norm": 0.5920398831367493, + "learning_rate": 0.0005051488850220941, + "loss": 0.197, + "mean_token_accuracy": 0.952111005783081, + "num_tokens": 1477579.0, + "step": 167 + }, + { + "entropy": 2.532376289367676, + "epoch": 0.6081447963800904, + "grad_norm": 0.7033098936080933, + "learning_rate": 0.0005050480082315392, + "loss": 0.2122, + "mean_token_accuracy": 0.9488633275032043, + "num_tokens": 1486307.0, + "step": 168 + }, + { + "entropy": 2.397290349006653, + "epoch": 0.611764705882353, + "grad_norm": 0.8026869893074036, + "learning_rate": 0.0005049462806680021, + "loss": 0.2541, + "mean_token_accuracy": 0.9427233040332794, + "num_tokens": 1495152.0, + "step": 169 + }, + { + "entropy": 2.464823842048645, + "epoch": 0.6153846153846154, + "grad_norm": 0.6508225798606873, + "learning_rate": 0.0005048437027178571, + "loss": 0.2639, + "mean_token_accuracy": 0.9391255974769592, + "num_tokens": 1503903.0, + "step": 170 + }, + { + "entropy": 2.520734131336212, + "epoch": 0.6190045248868778, + "grad_norm": 0.8373616337776184, + "learning_rate": 0.0005047402747707084, + "loss": 0.3078, + "mean_token_accuracy": 0.9302930980920792, + "num_tokens": 1512588.0, + "step": 171 + }, + { + "entropy": 2.388108015060425, + "epoch": 0.6226244343891403, + "grad_norm": 0.6334089636802673, + "learning_rate": 0.0005046359972193884, + "loss": 0.1372, + "mean_token_accuracy": 0.9666119515895844, + "num_tokens": 1522011.0, + "step": 172 + }, + { + "entropy": 2.537126660346985, + "epoch": 0.6262443438914027, + "grad_norm": 0.7665116190910339, + "learning_rate": 0.0005045308704599566, + "loss": 0.2603, + "mean_token_accuracy": 0.9350012242794037, + "num_tokens": 1530767.0, + "step": 173 + }, + { + "entropy": 2.567205488681793, + "epoch": 0.6298642533936651, + "grad_norm": 0.8043875098228455, + "learning_rate": 0.0005044248948916977, + "loss": 0.2497, + "mean_token_accuracy": 0.9400482773780823, + "num_tokens": 1539971.0, + "step": 174 + }, + { + "entropy": 2.585887610912323, + "epoch": 0.6334841628959276, + "grad_norm": 0.5282150506973267, + "learning_rate": 0.0005043180709171206, + "loss": 0.1126, + "mean_token_accuracy": 0.9680279046297073, + "num_tokens": 1548971.0, + "step": 175 + }, + { + "entropy": 2.4289392232894897, + "epoch": 0.63710407239819, + "grad_norm": 0.6838382482528687, + "learning_rate": 0.0005042103989419563, + "loss": 0.2076, + "mean_token_accuracy": 0.9468046277761459, + "num_tokens": 1558403.0, + "step": 176 + }, + { + "entropy": 2.6080575585365295, + "epoch": 0.6407239819004525, + "grad_norm": 0.9058650732040405, + "learning_rate": 0.0005041018793751566, + "loss": 0.1781, + "mean_token_accuracy": 0.9432647377252579, + "num_tokens": 1567209.0, + "step": 177 + }, + { + "entropy": 2.5212480425834656, + "epoch": 0.644343891402715, + "grad_norm": 0.796381950378418, + "learning_rate": 0.0005039925126288929, + "loss": 0.2286, + "mean_token_accuracy": 0.9305787235498428, + "num_tokens": 1576255.0, + "step": 178 + }, + { + "entropy": 2.588195264339447, + "epoch": 0.6479638009049774, + "grad_norm": 0.6489388942718506, + "learning_rate": 0.0005038822991185536, + "loss": 0.1717, + "mean_token_accuracy": 0.9572225511074066, + "num_tokens": 1585335.0, + "step": 179 + }, + { + "entropy": 2.609215259552002, + "epoch": 0.6515837104072398, + "grad_norm": 0.8551130294799805, + "learning_rate": 0.0005037712392627441, + "loss": 0.2358, + "mean_token_accuracy": 0.9529621452093124, + "num_tokens": 1594354.0, + "step": 180 + }, + { + "entropy": 2.4199504256248474, + "epoch": 0.6552036199095023, + "grad_norm": 0.5775637030601501, + "learning_rate": 0.0005036593334832836, + "loss": 0.2402, + "mean_token_accuracy": 0.9437069743871689, + "num_tokens": 1603750.0, + "step": 181 + }, + { + "entropy": 2.516424596309662, + "epoch": 0.6588235294117647, + "grad_norm": 0.6967942118644714, + "learning_rate": 0.0005035465822052047, + "loss": 0.1624, + "mean_token_accuracy": 0.9518167823553085, + "num_tokens": 1612474.0, + "step": 182 + }, + { + "entropy": 2.463354170322418, + "epoch": 0.6624434389140271, + "grad_norm": 0.49672600626945496, + "learning_rate": 0.000503432985856751, + "loss": 0.1654, + "mean_token_accuracy": 0.9564716964960098, + "num_tokens": 1621563.0, + "step": 183 + }, + { + "entropy": 2.4456416964530945, + "epoch": 0.6660633484162896, + "grad_norm": 0.6207183003425598, + "learning_rate": 0.000503318544869376, + "loss": 0.1918, + "mean_token_accuracy": 0.9476529806852341, + "num_tokens": 1630801.0, + "step": 184 + }, + { + "entropy": 2.641440451145172, + "epoch": 0.669683257918552, + "grad_norm": 1.220821499824524, + "learning_rate": 0.000503203259677741, + "loss": 0.4019, + "mean_token_accuracy": 0.9172120243310928, + "num_tokens": 1639522.0, + "step": 185 + }, + { + "entropy": 2.6447275280952454, + "epoch": 0.6733031674208145, + "grad_norm": 0.7546490430831909, + "learning_rate": 0.000503087130719714, + "loss": 0.2484, + "mean_token_accuracy": 0.9387800246477127, + "num_tokens": 1647964.0, + "step": 186 + }, + { + "entropy": 2.4657886028289795, + "epoch": 0.676923076923077, + "grad_norm": 0.7679230570793152, + "learning_rate": 0.0005029701584363675, + "loss": 0.2659, + "mean_token_accuracy": 0.930300235748291, + "num_tokens": 1657181.0, + "step": 187 + }, + { + "entropy": 2.37973552942276, + "epoch": 0.6805429864253394, + "grad_norm": 0.7473414540290833, + "learning_rate": 0.0005028523432719772, + "loss": 0.32, + "mean_token_accuracy": 0.9233052879571915, + "num_tokens": 1666477.0, + "step": 188 + }, + { + "entropy": 2.5238219499588013, + "epoch": 0.6841628959276018, + "grad_norm": 0.5573673248291016, + "learning_rate": 0.0005027336856740201, + "loss": 0.1846, + "mean_token_accuracy": 0.9445535093545914, + "num_tokens": 1675002.0, + "step": 189 + }, + { + "entropy": 2.456815242767334, + "epoch": 0.6877828054298643, + "grad_norm": 0.47237634658813477, + "learning_rate": 0.0005026141860931728, + "loss": 0.1065, + "mean_token_accuracy": 0.964375838637352, + "num_tokens": 1683623.0, + "step": 190 + }, + { + "entropy": 2.548456132411957, + "epoch": 0.6914027149321267, + "grad_norm": 0.7699162364006042, + "learning_rate": 0.00050249384498331, + "loss": 0.1985, + "mean_token_accuracy": 0.9438774734735489, + "num_tokens": 1691718.0, + "step": 191 + }, + { + "entropy": 2.4514941573143005, + "epoch": 0.6950226244343891, + "grad_norm": 1.4113538265228271, + "learning_rate": 0.0005023726628015027, + "loss": 0.4541, + "mean_token_accuracy": 0.9207872897386551, + "num_tokens": 1699824.0, + "step": 192 + }, + { + "entropy": 2.2560824751853943, + "epoch": 0.6986425339366515, + "grad_norm": 0.6007948517799377, + "learning_rate": 0.0005022506400080161, + "loss": 0.1871, + "mean_token_accuracy": 0.9502484053373337, + "num_tokens": 1708722.0, + "step": 193 + }, + { + "entropy": 2.1833614110946655, + "epoch": 0.702262443438914, + "grad_norm": 0.7005489468574524, + "learning_rate": 0.0005021277770663082, + "loss": 0.2222, + "mean_token_accuracy": 0.9386974722146988, + "num_tokens": 1717592.0, + "step": 194 + }, + { + "entropy": 2.2031923830509186, + "epoch": 0.7058823529411765, + "grad_norm": 0.5830584764480591, + "learning_rate": 0.0005020040744430284, + "loss": 0.1106, + "mean_token_accuracy": 0.9719562232494354, + "num_tokens": 1726149.0, + "step": 195 + }, + { + "entropy": 2.199785351753235, + "epoch": 0.709502262443439, + "grad_norm": 0.7465847134590149, + "learning_rate": 0.0005018795326080149, + "loss": 0.1935, + "mean_token_accuracy": 0.9497270882129669, + "num_tokens": 1734541.0, + "step": 196 + }, + { + "entropy": 2.1103186309337616, + "epoch": 0.7131221719457014, + "grad_norm": 1.0782264471054077, + "learning_rate": 0.0005017541520342934, + "loss": 0.2895, + "mean_token_accuracy": 0.9274258464574814, + "num_tokens": 1743722.0, + "step": 197 + }, + { + "entropy": 2.2248528599739075, + "epoch": 0.7167420814479638, + "grad_norm": 0.6409780979156494, + "learning_rate": 0.0005016279331980754, + "loss": 0.1425, + "mean_token_accuracy": 0.96550352871418, + "num_tokens": 1752156.0, + "step": 198 + }, + { + "entropy": 2.19924658536911, + "epoch": 0.7203619909502262, + "grad_norm": 0.7019934058189392, + "learning_rate": 0.0005015008765787561, + "loss": 0.1969, + "mean_token_accuracy": 0.9429282248020172, + "num_tokens": 1760978.0, + "step": 199 + }, + { + "entropy": 2.297484815120697, + "epoch": 0.7239819004524887, + "grad_norm": 0.7826490998268127, + "learning_rate": 0.0005013729826589127, + "loss": 0.2399, + "mean_token_accuracy": 0.9416657984256744, + "num_tokens": 1769533.0, + "step": 200 + }, + { + "entropy": 2.2471498548984528, + "epoch": 0.7276018099547511, + "grad_norm": 0.621566891670227, + "learning_rate": 0.0005012442519243027, + "loss": 0.1876, + "mean_token_accuracy": 0.9460793286561966, + "num_tokens": 1778286.0, + "step": 201 + }, + { + "entropy": 2.2212815284729004, + "epoch": 0.7312217194570135, + "grad_norm": 0.622283935546875, + "learning_rate": 0.0005011146848638616, + "loss": 0.1617, + "mean_token_accuracy": 0.9482609927654266, + "num_tokens": 1787392.0, + "step": 202 + }, + { + "entropy": 2.308752655982971, + "epoch": 0.7348416289592761, + "grad_norm": 0.7263973355293274, + "learning_rate": 0.0005009842819697018, + "loss": 0.2043, + "mean_token_accuracy": 0.9378403723239899, + "num_tokens": 1796133.0, + "step": 203 + }, + { + "entropy": 2.3376497626304626, + "epoch": 0.7384615384615385, + "grad_norm": 0.5493630766868591, + "learning_rate": 0.0005008530437371101, + "loss": 0.1145, + "mean_token_accuracy": 0.970586434006691, + "num_tokens": 1804769.0, + "step": 204 + }, + { + "entropy": 2.373005509376526, + "epoch": 0.7420814479638009, + "grad_norm": 0.6313483119010925, + "learning_rate": 0.0005007209706645461, + "loss": 0.2183, + "mean_token_accuracy": 0.9472708404064178, + "num_tokens": 1813364.0, + "step": 205 + }, + { + "entropy": 2.468949854373932, + "epoch": 0.7457013574660634, + "grad_norm": 1.0125588178634644, + "learning_rate": 0.00050058806325364, + "loss": 0.2225, + "mean_token_accuracy": 0.9351322948932648, + "num_tokens": 1822149.0, + "step": 206 + }, + { + "entropy": 2.2420623898506165, + "epoch": 0.7493212669683258, + "grad_norm": 0.913761556148529, + "learning_rate": 0.0005004543220091911, + "loss": 0.2386, + "mean_token_accuracy": 0.9453927427530289, + "num_tokens": 1831533.0, + "step": 207 + }, + { + "entropy": 2.2966006994247437, + "epoch": 0.7529411764705882, + "grad_norm": 0.7386876940727234, + "learning_rate": 0.0005003197474391658, + "loss": 0.1768, + "mean_token_accuracy": 0.949826255440712, + "num_tokens": 1840157.0, + "step": 208 + }, + { + "entropy": 2.306001305580139, + "epoch": 0.7565610859728507, + "grad_norm": 0.8900741338729858, + "learning_rate": 0.0005001843400546955, + "loss": 0.2899, + "mean_token_accuracy": 0.9241485595703125, + "num_tokens": 1848898.0, + "step": 209 + }, + { + "entropy": 2.117514967918396, + "epoch": 0.7601809954751131, + "grad_norm": 0.644622802734375, + "learning_rate": 0.0005000481003700746, + "loss": 0.2714, + "mean_token_accuracy": 0.9299416691064835, + "num_tokens": 1858330.0, + "step": 210 + }, + { + "entropy": 2.3768392205238342, + "epoch": 0.7638009049773755, + "grad_norm": 0.9724471569061279, + "learning_rate": 0.0004999110289027587, + "loss": 0.1633, + "mean_token_accuracy": 0.9550061523914337, + "num_tokens": 1866806.0, + "step": 211 + }, + { + "entropy": 2.090679556131363, + "epoch": 0.7674208144796381, + "grad_norm": 0.5419518351554871, + "learning_rate": 0.0004997731261733628, + "loss": 0.1369, + "mean_token_accuracy": 0.9619670957326889, + "num_tokens": 1875937.0, + "step": 212 + }, + { + "entropy": 2.099909245967865, + "epoch": 0.7710407239819005, + "grad_norm": 0.6858121752738953, + "learning_rate": 0.0004996343927056592, + "loss": 0.1633, + "mean_token_accuracy": 0.9528832882642746, + "num_tokens": 1885145.0, + "step": 213 + }, + { + "entropy": 2.130059242248535, + "epoch": 0.7746606334841629, + "grad_norm": 0.7691065073013306, + "learning_rate": 0.000499494829026575, + "loss": 0.348, + "mean_token_accuracy": 0.9162366837263107, + "num_tokens": 1894255.0, + "step": 214 + }, + { + "entropy": 2.191373586654663, + "epoch": 0.7782805429864253, + "grad_norm": 0.7427324652671814, + "learning_rate": 0.000499354435666191, + "loss": 0.3373, + "mean_token_accuracy": 0.9311849176883698, + "num_tokens": 1902981.0, + "step": 215 + }, + { + "entropy": 2.1425398886203766, + "epoch": 0.7819004524886878, + "grad_norm": 0.6410383582115173, + "learning_rate": 0.0004992132131577392, + "loss": 0.2079, + "mean_token_accuracy": 0.949742391705513, + "num_tokens": 1912253.0, + "step": 216 + }, + { + "entropy": 2.1396586298942566, + "epoch": 0.7855203619909502, + "grad_norm": 0.5689850449562073, + "learning_rate": 0.0004990711620376003, + "loss": 0.1999, + "mean_token_accuracy": 0.946034774184227, + "num_tokens": 1921409.0, + "step": 217 + }, + { + "entropy": 2.2237865328788757, + "epoch": 0.7891402714932126, + "grad_norm": 0.6408923864364624, + "learning_rate": 0.0004989282828453029, + "loss": 0.2452, + "mean_token_accuracy": 0.9510752111673355, + "num_tokens": 1930397.0, + "step": 218 + }, + { + "entropy": 2.234771251678467, + "epoch": 0.7927601809954751, + "grad_norm": 0.751447856426239, + "learning_rate": 0.0004987845761235203, + "loss": 0.3057, + "mean_token_accuracy": 0.9217256307601929, + "num_tokens": 1939172.0, + "step": 219 + }, + { + "entropy": 2.2653815746307373, + "epoch": 0.7963800904977375, + "grad_norm": 0.751455545425415, + "learning_rate": 0.0004986400424180688, + "loss": 0.3245, + "mean_token_accuracy": 0.9256318956613541, + "num_tokens": 1947979.0, + "step": 220 + }, + { + "entropy": 2.3123483061790466, + "epoch": 0.8, + "grad_norm": 0.5939492583274841, + "learning_rate": 0.0004984946822779061, + "loss": 0.2429, + "mean_token_accuracy": 0.9333402067422867, + "num_tokens": 1956814.0, + "step": 221 + }, + { + "entropy": 2.3289234042167664, + "epoch": 0.8036199095022625, + "grad_norm": 0.5591994524002075, + "learning_rate": 0.0004983484962551284, + "loss": 0.1507, + "mean_token_accuracy": 0.96376833319664, + "num_tokens": 1965641.0, + "step": 222 + }, + { + "entropy": 2.4314023852348328, + "epoch": 0.8072398190045249, + "grad_norm": 0.5805783271789551, + "learning_rate": 0.0004982014849049687, + "loss": 0.2049, + "mean_token_accuracy": 0.9586948156356812, + "num_tokens": 1974180.0, + "step": 223 + }, + { + "entropy": 2.3639765977859497, + "epoch": 0.8108597285067873, + "grad_norm": 0.6924490332603455, + "learning_rate": 0.0004980536487857951, + "loss": 0.2137, + "mean_token_accuracy": 0.9441423565149307, + "num_tokens": 1982744.0, + "step": 224 + }, + { + "entropy": 2.3361759781837463, + "epoch": 0.8144796380090498, + "grad_norm": 0.4579620361328125, + "learning_rate": 0.0004979049884591077, + "loss": 0.1041, + "mean_token_accuracy": 0.9753208309412003, + "num_tokens": 1991583.0, + "step": 225 + }, + { + "entropy": 2.286989688873291, + "epoch": 0.8180995475113122, + "grad_norm": 0.6489312052726746, + "learning_rate": 0.0004977555044895377, + "loss": 0.2131, + "mean_token_accuracy": 0.9520440250635147, + "num_tokens": 2000193.0, + "step": 226 + }, + { + "entropy": 2.288672834634781, + "epoch": 0.8217194570135746, + "grad_norm": 0.7738961577415466, + "learning_rate": 0.0004976051974448441, + "loss": 0.325, + "mean_token_accuracy": 0.9060750156641006, + "num_tokens": 2009233.0, + "step": 227 + }, + { + "entropy": 2.288076102733612, + "epoch": 0.8253393665158371, + "grad_norm": 0.7042292356491089, + "learning_rate": 0.0004974540678959123, + "loss": 0.2206, + "mean_token_accuracy": 0.94980289041996, + "num_tokens": 2018417.0, + "step": 228 + }, + { + "entropy": 2.217707335948944, + "epoch": 0.8289592760180996, + "grad_norm": 0.6834208369255066, + "learning_rate": 0.0004973021164167515, + "loss": 0.2907, + "mean_token_accuracy": 0.951058641076088, + "num_tokens": 2027822.0, + "step": 229 + }, + { + "entropy": 2.1610691249370575, + "epoch": 0.832579185520362, + "grad_norm": 0.665044903755188, + "learning_rate": 0.0004971493435844928, + "loss": 0.2387, + "mean_token_accuracy": 0.9506549835205078, + "num_tokens": 2036983.0, + "step": 230 + }, + { + "entropy": 2.321135401725769, + "epoch": 0.8361990950226245, + "grad_norm": 0.8208273649215698, + "learning_rate": 0.0004969957499793869, + "loss": 0.2399, + "mean_token_accuracy": 0.9435176253318787, + "num_tokens": 2045574.0, + "step": 231 + }, + { + "entropy": 2.1943611800670624, + "epoch": 0.8398190045248869, + "grad_norm": 0.6293840408325195, + "learning_rate": 0.0004968413361848019, + "loss": 0.1784, + "mean_token_accuracy": 0.9559669345617294, + "num_tokens": 2054336.0, + "step": 232 + }, + { + "entropy": 2.2722273468971252, + "epoch": 0.8434389140271493, + "grad_norm": 0.6535817980766296, + "learning_rate": 0.0004966861027872211, + "loss": 0.1675, + "mean_token_accuracy": 0.9532535970211029, + "num_tokens": 2063225.0, + "step": 233 + }, + { + "entropy": 2.3278334736824036, + "epoch": 0.8470588235294118, + "grad_norm": 1.1610206365585327, + "learning_rate": 0.0004965300503762406, + "loss": 0.1588, + "mean_token_accuracy": 0.9641145765781403, + "num_tokens": 2071738.0, + "step": 234 + }, + { + "entropy": 2.202972888946533, + "epoch": 0.8506787330316742, + "grad_norm": 0.4811885356903076, + "learning_rate": 0.0004963731795445675, + "loss": 0.0813, + "mean_token_accuracy": 0.9766911715269089, + "num_tokens": 2080375.0, + "step": 235 + }, + { + "entropy": 2.2433705925941467, + "epoch": 0.8542986425339366, + "grad_norm": 0.8113318681716919, + "learning_rate": 0.0004962154908880171, + "loss": 0.2965, + "mean_token_accuracy": 0.9290606826543808, + "num_tokens": 2089522.0, + "step": 236 + }, + { + "entropy": 2.2168884873390198, + "epoch": 0.857918552036199, + "grad_norm": 0.6128959655761719, + "learning_rate": 0.0004960569850055111, + "loss": 0.1724, + "mean_token_accuracy": 0.9603384286165237, + "num_tokens": 2098162.0, + "step": 237 + }, + { + "entropy": 2.2738255858421326, + "epoch": 0.8615384615384616, + "grad_norm": 0.8557195663452148, + "learning_rate": 0.0004958976624990749, + "loss": 0.2596, + "mean_token_accuracy": 0.9487071484327316, + "num_tokens": 2106984.0, + "step": 238 + }, + { + "entropy": 2.2031425833702087, + "epoch": 0.865158371040724, + "grad_norm": 0.6621816158294678, + "learning_rate": 0.0004957375239738359, + "loss": 0.232, + "mean_token_accuracy": 0.9525040090084076, + "num_tokens": 2116040.0, + "step": 239 + }, + { + "entropy": 2.374737858772278, + "epoch": 0.8687782805429864, + "grad_norm": 0.8481062054634094, + "learning_rate": 0.0004955765700380204, + "loss": 0.2516, + "mean_token_accuracy": 0.9396061599254608, + "num_tokens": 2124862.0, + "step": 240 + }, + { + "entropy": 2.266704559326172, + "epoch": 0.8723981900452489, + "grad_norm": 0.6284282803535461, + "learning_rate": 0.0004954148013029521, + "loss": 0.3244, + "mean_token_accuracy": 0.9381244331598282, + "num_tokens": 2134018.0, + "step": 241 + }, + { + "entropy": 2.3935859203338623, + "epoch": 0.8760180995475113, + "grad_norm": 1.1564176082611084, + "learning_rate": 0.0004952522183830493, + "loss": 0.2706, + "mean_token_accuracy": 0.9297053664922714, + "num_tokens": 2142745.0, + "step": 242 + }, + { + "entropy": 2.281618118286133, + "epoch": 0.8796380090497737, + "grad_norm": 0.5324040055274963, + "learning_rate": 0.0004950888218958225, + "loss": 0.1573, + "mean_token_accuracy": 0.9568462073802948, + "num_tokens": 2151607.0, + "step": 243 + }, + { + "entropy": 2.230749189853668, + "epoch": 0.8832579185520362, + "grad_norm": 0.680780291557312, + "learning_rate": 0.0004949246124618726, + "loss": 0.1956, + "mean_token_accuracy": 0.9479999989271164, + "num_tokens": 2160904.0, + "step": 244 + }, + { + "entropy": 2.21382600069046, + "epoch": 0.8868778280542986, + "grad_norm": 0.6321626305580139, + "learning_rate": 0.0004947595907048877, + "loss": 0.2444, + "mean_token_accuracy": 0.9376699328422546, + "num_tokens": 2170021.0, + "step": 245 + }, + { + "entropy": 2.3659472465515137, + "epoch": 0.890497737556561, + "grad_norm": 0.9778954982757568, + "learning_rate": 0.0004945937572516417, + "loss": 0.3783, + "mean_token_accuracy": 0.9104805737733841, + "num_tokens": 2178995.0, + "step": 246 + }, + { + "entropy": 2.3233078718185425, + "epoch": 0.8941176470588236, + "grad_norm": 0.53229820728302, + "learning_rate": 0.0004944271127319909, + "loss": 0.0759, + "mean_token_accuracy": 0.9791453778743744, + "num_tokens": 2187823.0, + "step": 247 + }, + { + "entropy": 2.2469444274902344, + "epoch": 0.897737556561086, + "grad_norm": 0.6367197632789612, + "learning_rate": 0.0004942596577788728, + "loss": 0.2677, + "mean_token_accuracy": 0.9392691254615784, + "num_tokens": 2196923.0, + "step": 248 + }, + { + "entropy": 2.4508965611457825, + "epoch": 0.9013574660633484, + "grad_norm": 0.6042234897613525, + "learning_rate": 0.0004940913930283024, + "loss": 0.1102, + "mean_token_accuracy": 0.9762090593576431, + "num_tokens": 2205400.0, + "step": 249 + }, + { + "entropy": 2.365670144557953, + "epoch": 0.9049773755656109, + "grad_norm": 0.6490639448165894, + "learning_rate": 0.0004939223191193707, + "loss": 0.1532, + "mean_token_accuracy": 0.9489114433526993, + "num_tokens": 2214201.0, + "step": 250 + }, + { + "entropy": 2.4013625383377075, + "epoch": 0.9085972850678733, + "grad_norm": 0.5969854593276978, + "learning_rate": 0.0004937524366942419, + "loss": 0.1273, + "mean_token_accuracy": 0.9682519882917404, + "num_tokens": 2222979.0, + "step": 251 + }, + { + "entropy": 2.4402357935905457, + "epoch": 0.9122171945701357, + "grad_norm": 0.7559595704078674, + "learning_rate": 0.0004935817463981513, + "loss": 0.1979, + "mean_token_accuracy": 0.9483373910188675, + "num_tokens": 2231169.0, + "step": 252 + }, + { + "entropy": 2.4673256874084473, + "epoch": 0.9158371040723982, + "grad_norm": 0.8663308620452881, + "learning_rate": 0.0004934102488794023, + "loss": 0.2453, + "mean_token_accuracy": 0.9408974200487137, + "num_tokens": 2240099.0, + "step": 253 + }, + { + "entropy": 2.426262080669403, + "epoch": 0.9194570135746606, + "grad_norm": 0.7920467257499695, + "learning_rate": 0.0004932379447893643, + "loss": 0.2828, + "mean_token_accuracy": 0.9319239109754562, + "num_tokens": 2249088.0, + "step": 254 + }, + { + "entropy": 2.5018852949142456, + "epoch": 0.9230769230769231, + "grad_norm": 0.7216617465019226, + "learning_rate": 0.0004930648347824701, + "loss": 0.1647, + "mean_token_accuracy": 0.9551804810762405, + "num_tokens": 2257710.0, + "step": 255 + }, + { + "entropy": 2.43031644821167, + "epoch": 0.9266968325791856, + "grad_norm": 0.646794319152832, + "learning_rate": 0.0004928909195162138, + "loss": 0.1328, + "mean_token_accuracy": 0.9663553237915039, + "num_tokens": 2266883.0, + "step": 256 + }, + { + "entropy": 2.5406370759010315, + "epoch": 0.930316742081448, + "grad_norm": 0.5482825040817261, + "learning_rate": 0.0004927161996511474, + "loss": 0.1872, + "mean_token_accuracy": 0.9557004272937775, + "num_tokens": 2275728.0, + "step": 257 + }, + { + "entropy": 2.636320471763611, + "epoch": 0.9339366515837104, + "grad_norm": 0.7454632520675659, + "learning_rate": 0.0004925406758508797, + "loss": 0.1461, + "mean_token_accuracy": 0.9578974395990372, + "num_tokens": 2284319.0, + "step": 258 + }, + { + "entropy": 2.6067575812339783, + "epoch": 0.9375565610859729, + "grad_norm": 0.8695769309997559, + "learning_rate": 0.000492364348782072, + "loss": 0.1712, + "mean_token_accuracy": 0.9652896523475647, + "num_tokens": 2293035.0, + "step": 259 + }, + { + "entropy": 2.5837162137031555, + "epoch": 0.9411764705882353, + "grad_norm": 0.5752995014190674, + "learning_rate": 0.0004921872191144371, + "loss": 0.1398, + "mean_token_accuracy": 0.9553333520889282, + "num_tokens": 2301802.0, + "step": 260 + }, + { + "entropy": 2.713033616542816, + "epoch": 0.9447963800904977, + "grad_norm": 0.85626620054245, + "learning_rate": 0.0004920092875207363, + "loss": 0.2207, + "mean_token_accuracy": 0.9468346834182739, + "num_tokens": 2309981.0, + "step": 261 + }, + { + "entropy": 2.400112509727478, + "epoch": 0.9484162895927601, + "grad_norm": 0.6766608953475952, + "learning_rate": 0.0004918305546767764, + "loss": 0.1644, + "mean_token_accuracy": 0.9502440094947815, + "num_tokens": 2319212.0, + "step": 262 + }, + { + "entropy": 2.503827154636383, + "epoch": 0.9520361990950226, + "grad_norm": 0.789470911026001, + "learning_rate": 0.0004916510212614072, + "loss": 0.2117, + "mean_token_accuracy": 0.9454390555620193, + "num_tokens": 2328234.0, + "step": 263 + }, + { + "entropy": 2.669040560722351, + "epoch": 0.9556561085972851, + "grad_norm": 0.9579212069511414, + "learning_rate": 0.0004914706879565197, + "loss": 0.2193, + "mean_token_accuracy": 0.9321542829275131, + "num_tokens": 2336543.0, + "step": 264 + }, + { + "entropy": 2.507073998451233, + "epoch": 0.9592760180995475, + "grad_norm": 0.5315744876861572, + "learning_rate": 0.000491289555447043, + "loss": 0.0851, + "mean_token_accuracy": 0.9771326780319214, + "num_tokens": 2345292.0, + "step": 265 + }, + { + "entropy": 2.4205283522605896, + "epoch": 0.96289592760181, + "grad_norm": 0.5441373586654663, + "learning_rate": 0.000491107624420941, + "loss": 0.1323, + "mean_token_accuracy": 0.9541790336370468, + "num_tokens": 2354242.0, + "step": 266 + }, + { + "entropy": 2.3817258477211, + "epoch": 0.9665158371040724, + "grad_norm": 0.5946238040924072, + "learning_rate": 0.0004909248955692111, + "loss": 0.1708, + "mean_token_accuracy": 0.947738841176033, + "num_tokens": 2363183.0, + "step": 267 + }, + { + "entropy": 2.5073485374450684, + "epoch": 0.9701357466063348, + "grad_norm": 0.6979324817657471, + "learning_rate": 0.0004907413695858812, + "loss": 0.2099, + "mean_token_accuracy": 0.9423733651638031, + "num_tokens": 2371885.0, + "step": 268 + }, + { + "entropy": 2.5705007910728455, + "epoch": 0.9737556561085973, + "grad_norm": 0.8203943967819214, + "learning_rate": 0.0004905570471680057, + "loss": 0.217, + "mean_token_accuracy": 0.9511639326810837, + "num_tokens": 2380316.0, + "step": 269 + }, + { + "entropy": 2.2677993774414062, + "epoch": 0.9773755656108597, + "grad_norm": 0.5840432047843933, + "learning_rate": 0.0004903719290156649, + "loss": 0.2364, + "mean_token_accuracy": 0.9407180696725845, + "num_tokens": 2389723.0, + "step": 270 + }, + { + "entropy": 2.477886915206909, + "epoch": 0.9809954751131221, + "grad_norm": 0.818929135799408, + "learning_rate": 0.0004901860158319612, + "loss": 0.1707, + "mean_token_accuracy": 0.9579566866159439, + "num_tokens": 2398388.0, + "step": 271 + }, + { + "entropy": 2.549662232398987, + "epoch": 0.9846153846153847, + "grad_norm": 0.7804781198501587, + "learning_rate": 0.0004899993083230166, + "loss": 0.2944, + "mean_token_accuracy": 0.9381812512874603, + "num_tokens": 2406929.0, + "step": 272 + }, + { + "entropy": 2.4465304017066956, + "epoch": 0.9882352941176471, + "grad_norm": 0.5218799114227295, + "learning_rate": 0.0004898118071979699, + "loss": 0.1661, + "mean_token_accuracy": 0.9500218778848648, + "num_tokens": 2415631.0, + "step": 273 + }, + { + "entropy": 2.5852283239364624, + "epoch": 0.9918552036199095, + "grad_norm": 0.591163158416748, + "learning_rate": 0.0004896235131689743, + "loss": 0.2005, + "mean_token_accuracy": 0.9455285370349884, + "num_tokens": 2424091.0, + "step": 274 + }, + { + "entropy": 2.478701651096344, + "epoch": 0.995475113122172, + "grad_norm": 1.0615383386611938, + "learning_rate": 0.0004894344269511945, + "loss": 0.2864, + "mean_token_accuracy": 0.9306265562772751, + "num_tokens": 2432705.0, + "step": 275 + }, + { + "entropy": 2.600062847137451, + "epoch": 0.9990950226244344, + "grad_norm": 0.7011683583259583, + "learning_rate": 0.0004892445492628043, + "loss": 0.1664, + "mean_token_accuracy": 0.9547821134328842, + "num_tokens": 2440992.0, + "step": 276 + }, + { + "entropy": 2.3411240577697754, + "epoch": 1.0, + "grad_norm": 0.4944029450416565, + "learning_rate": 0.000489053880824983, + "loss": 0.022, + "mean_token_accuracy": 0.9929078221321106, + "num_tokens": 2441725.0, + "step": 277 + }, + { + "epoch": 1.0, + "eval_entropy": 2.5467925265552553, + "eval_loss": 0.21274714171886444, + "eval_mean_token_accuracy": 0.9444630068492114, + "eval_num_tokens": 2441725.0, + "eval_runtime": 116.0434, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 1.06, + "step": 277 + }, + { + "entropy": 2.609170138835907, + "epoch": 1.0036199095022624, + "grad_norm": 1.0785081386566162, + "learning_rate": 0.0004888624223619136, + "loss": 0.3167, + "mean_token_accuracy": 0.9296800643205643, + "num_tokens": 2450193.0, + "step": 278 + }, + { + "entropy": 2.497025430202484, + "epoch": 1.0072398190045249, + "grad_norm": 0.5221985578536987, + "learning_rate": 0.0004886701746007801, + "loss": 0.0854, + "mean_token_accuracy": 0.9753399342298508, + "num_tokens": 2459309.0, + "step": 279 + }, + { + "entropy": 2.5487362146377563, + "epoch": 1.0108597285067873, + "grad_norm": 0.5161958336830139, + "learning_rate": 0.0004884771382717638, + "loss": 0.0819, + "mean_token_accuracy": 0.9748431146144867, + "num_tokens": 2467844.0, + "step": 280 + }, + { + "entropy": 2.5276209115982056, + "epoch": 1.0144796380090497, + "grad_norm": 0.5731730461120605, + "learning_rate": 0.0004882833141080412, + "loss": 0.1541, + "mean_token_accuracy": 0.9567564427852631, + "num_tokens": 2476894.0, + "step": 281 + }, + { + "entropy": 2.4442760348320007, + "epoch": 1.0180995475113122, + "grad_norm": 0.7120366096496582, + "learning_rate": 0.0004880887028457813, + "loss": 0.1945, + "mean_token_accuracy": 0.9465379565954208, + "num_tokens": 2485971.0, + "step": 282 + }, + { + "entropy": 2.4069360494613647, + "epoch": 1.0217194570135746, + "grad_norm": 0.7468647360801697, + "learning_rate": 0.00048789330522414244, + "loss": 0.2345, + "mean_token_accuracy": 0.9446765780448914, + "num_tokens": 2495043.0, + "step": 283 + }, + { + "entropy": 2.468382716178894, + "epoch": 1.025339366515837, + "grad_norm": 0.666231632232666, + "learning_rate": 0.0004876971219852697, + "loss": 0.1779, + "mean_token_accuracy": 0.9534575343132019, + "num_tokens": 2503672.0, + "step": 284 + }, + { + "entropy": 2.4362316727638245, + "epoch": 1.0289592760180994, + "grad_norm": 0.8445858955383301, + "learning_rate": 0.000487500153874292, + "loss": 0.1698, + "mean_token_accuracy": 0.953661322593689, + "num_tokens": 2512322.0, + "step": 285 + }, + { + "entropy": 2.364333391189575, + "epoch": 1.032579185520362, + "grad_norm": 0.4805246591567993, + "learning_rate": 0.0004873024016393193, + "loss": 0.0778, + "mean_token_accuracy": 0.9824571758508682, + "num_tokens": 2520791.0, + "step": 286 + }, + { + "entropy": 2.223461151123047, + "epoch": 1.0361990950226245, + "grad_norm": 0.648465096950531, + "learning_rate": 0.0004871038660314399, + "loss": 0.2593, + "mean_token_accuracy": 0.9419913589954376, + "num_tokens": 2530082.0, + "step": 287 + }, + { + "entropy": 2.3313387036323547, + "epoch": 1.039819004524887, + "grad_norm": 0.6912294626235962, + "learning_rate": 0.00048690454780471725, + "loss": 0.1354, + "mean_token_accuracy": 0.9561934620141983, + "num_tokens": 2538728.0, + "step": 288 + }, + { + "entropy": 2.191806375980377, + "epoch": 1.0434389140271494, + "grad_norm": 0.8620694279670715, + "learning_rate": 0.0004867044477161874, + "loss": 0.1103, + "mean_token_accuracy": 0.968692272901535, + "num_tokens": 2547219.0, + "step": 289 + }, + { + "entropy": 2.167125165462494, + "epoch": 1.0470588235294118, + "grad_norm": 0.6192149519920349, + "learning_rate": 0.0004865035665258559, + "loss": 0.1288, + "mean_token_accuracy": 0.9643534421920776, + "num_tokens": 2555940.0, + "step": 290 + }, + { + "entropy": 2.2750985622406006, + "epoch": 1.0506787330316743, + "grad_norm": 1.7459602355957031, + "learning_rate": 0.0004863019049966953, + "loss": 0.393, + "mean_token_accuracy": 0.9146681725978851, + "num_tokens": 2564362.0, + "step": 291 + }, + { + "entropy": 2.236129105091095, + "epoch": 1.0542986425339367, + "grad_norm": 0.6311184167861938, + "learning_rate": 0.0004860994638946416, + "loss": 0.1536, + "mean_token_accuracy": 0.9636097103357315, + "num_tokens": 2573316.0, + "step": 292 + }, + { + "entropy": 2.2642418146133423, + "epoch": 1.0579185520361991, + "grad_norm": 0.6023411154747009, + "learning_rate": 0.000485896243988592, + "loss": 0.191, + "mean_token_accuracy": 0.9476015418767929, + "num_tokens": 2581835.0, + "step": 293 + }, + { + "entropy": 2.3589024543762207, + "epoch": 1.0615384615384615, + "grad_norm": 0.48049232363700867, + "learning_rate": 0.0004856922460504016, + "loss": 0.1017, + "mean_token_accuracy": 0.9713075459003448, + "num_tokens": 2590317.0, + "step": 294 + }, + { + "entropy": 2.4141315817832947, + "epoch": 1.065158371040724, + "grad_norm": 0.8456616997718811, + "learning_rate": 0.0004854874708548806, + "loss": 0.1422, + "mean_token_accuracy": 0.9622762501239777, + "num_tokens": 2598538.0, + "step": 295 + }, + { + "entropy": 2.069903999567032, + "epoch": 1.0687782805429864, + "grad_norm": 0.7641116380691528, + "learning_rate": 0.0004852819191797912, + "loss": 0.2185, + "mean_token_accuracy": 0.9464851468801498, + "num_tokens": 2608219.0, + "step": 296 + }, + { + "entropy": 2.163217008113861, + "epoch": 1.0723981900452488, + "grad_norm": 0.546085000038147, + "learning_rate": 0.0004850755918058449, + "loss": 0.1035, + "mean_token_accuracy": 0.9708487540483475, + "num_tokens": 2617261.0, + "step": 297 + }, + { + "entropy": 2.2678662836551666, + "epoch": 1.0760180995475113, + "grad_norm": 0.8699386119842529, + "learning_rate": 0.0004848684895166994, + "loss": 0.2384, + "mean_token_accuracy": 0.9486480504274368, + "num_tokens": 2626144.0, + "step": 298 + }, + { + "entropy": 2.13065105676651, + "epoch": 1.0796380090497737, + "grad_norm": 0.44323107600212097, + "learning_rate": 0.00048466061309895554, + "loss": 0.0818, + "mean_token_accuracy": 0.9722468554973602, + "num_tokens": 2635626.0, + "step": 299 + }, + { + "entropy": 2.184772551059723, + "epoch": 1.0832579185520361, + "grad_norm": 0.7928256988525391, + "learning_rate": 0.0004844519633421545, + "loss": 0.2378, + "mean_token_accuracy": 0.9477885961532593, + "num_tokens": 2644674.0, + "step": 300 + }, + { + "entropy": 2.1669145822525024, + "epoch": 1.0868778280542986, + "grad_norm": 0.5570158362388611, + "learning_rate": 0.00048424254103877456, + "loss": 0.1434, + "mean_token_accuracy": 0.9587411731481552, + "num_tokens": 2653658.0, + "step": 301 + }, + { + "entropy": 2.3057579398155212, + "epoch": 1.090497737556561, + "grad_norm": 0.9084392189979553, + "learning_rate": 0.00048403234698422837, + "loss": 0.3831, + "mean_token_accuracy": 0.8896283358335495, + "num_tokens": 2662350.0, + "step": 302 + }, + { + "entropy": 2.1741657853126526, + "epoch": 1.0941176470588236, + "grad_norm": 0.6791238784790039, + "learning_rate": 0.0004838213819768597, + "loss": 0.1648, + "mean_token_accuracy": 0.9576362520456314, + "num_tokens": 2671450.0, + "step": 303 + }, + { + "entropy": 2.089864045381546, + "epoch": 1.097737556561086, + "grad_norm": 0.5696312189102173, + "learning_rate": 0.0004836096468179406, + "loss": 0.1269, + "mean_token_accuracy": 0.9658148884773254, + "num_tokens": 2680581.0, + "step": 304 + }, + { + "entropy": 2.2657605409622192, + "epoch": 1.1013574660633485, + "grad_norm": 1.605503797531128, + "learning_rate": 0.0004833971423116682, + "loss": 0.1027, + "mean_token_accuracy": 0.9762597978115082, + "num_tokens": 2689001.0, + "step": 305 + }, + { + "entropy": 2.079287111759186, + "epoch": 1.104977375565611, + "grad_norm": 0.5804780721664429, + "learning_rate": 0.00048318386926516157, + "loss": 0.1137, + "mean_token_accuracy": 0.9633719325065613, + "num_tokens": 2698050.0, + "step": 306 + }, + { + "entropy": 2.201345145702362, + "epoch": 1.1085972850678734, + "grad_norm": 0.8606241941452026, + "learning_rate": 0.000482969828488459, + "loss": 0.2124, + "mean_token_accuracy": 0.9472681730985641, + "num_tokens": 2706704.0, + "step": 307 + }, + { + "entropy": 2.095236599445343, + "epoch": 1.1122171945701358, + "grad_norm": 0.7078782320022583, + "learning_rate": 0.0004827550207945147, + "loss": 0.1957, + "mean_token_accuracy": 0.9564679116010666, + "num_tokens": 2715745.0, + "step": 308 + }, + { + "entropy": 2.186302363872528, + "epoch": 1.1158371040723982, + "grad_norm": 0.7166503667831421, + "learning_rate": 0.0004825394469991956, + "loss": 0.1539, + "mean_token_accuracy": 0.9662427455186844, + "num_tokens": 2724296.0, + "step": 309 + }, + { + "entropy": 2.052559405565262, + "epoch": 1.1194570135746607, + "grad_norm": 0.6510501503944397, + "learning_rate": 0.00048232310792127846, + "loss": 0.1831, + "mean_token_accuracy": 0.9533994495868683, + "num_tokens": 2733482.0, + "step": 310 + }, + { + "entropy": 2.093154102563858, + "epoch": 1.123076923076923, + "grad_norm": 0.711121678352356, + "learning_rate": 0.0004821060043824466, + "loss": 0.2315, + "mean_token_accuracy": 0.9381555914878845, + "num_tokens": 2742912.0, + "step": 311 + }, + { + "entropy": 2.188497006893158, + "epoch": 1.1266968325791855, + "grad_norm": 0.6782490015029907, + "learning_rate": 0.00048188813720728707, + "loss": 0.2, + "mean_token_accuracy": 0.9501812607049942, + "num_tokens": 2751808.0, + "step": 312 + }, + { + "entropy": 2.0495824217796326, + "epoch": 1.130316742081448, + "grad_norm": 0.7644634246826172, + "learning_rate": 0.00048166950722328697, + "loss": 0.2152, + "mean_token_accuracy": 0.9440928995609283, + "num_tokens": 2761066.0, + "step": 313 + }, + { + "entropy": 2.1707025468349457, + "epoch": 1.1339366515837104, + "grad_norm": 0.655131459236145, + "learning_rate": 0.00048145011526083106, + "loss": 0.1637, + "mean_token_accuracy": 0.9500558227300644, + "num_tokens": 2769870.0, + "step": 314 + }, + { + "entropy": 2.1047372221946716, + "epoch": 1.1375565610859728, + "grad_norm": 0.5353516936302185, + "learning_rate": 0.0004812299621531979, + "loss": 0.1705, + "mean_token_accuracy": 0.9455999433994293, + "num_tokens": 2779383.0, + "step": 315 + }, + { + "entropy": 2.1921610236167908, + "epoch": 1.1411764705882352, + "grad_norm": 0.8998016119003296, + "learning_rate": 0.00048100904873655696, + "loss": 0.3918, + "mean_token_accuracy": 0.9382697492837906, + "num_tokens": 2788386.0, + "step": 316 + }, + { + "entropy": 2.0850723683834076, + "epoch": 1.1447963800904977, + "grad_norm": 0.867432713508606, + "learning_rate": 0.0004807873758499656, + "loss": 0.2196, + "mean_token_accuracy": 0.9498324394226074, + "num_tokens": 2797496.0, + "step": 317 + }, + { + "entropy": 2.1980925798416138, + "epoch": 1.14841628959276, + "grad_norm": 0.6076980233192444, + "learning_rate": 0.00048056494433536577, + "loss": 0.1086, + "mean_token_accuracy": 0.9642161130905151, + "num_tokens": 2805836.0, + "step": 318 + }, + { + "entropy": 2.15611070394516, + "epoch": 1.1520361990950225, + "grad_norm": 0.6276211738586426, + "learning_rate": 0.0004803417550375806, + "loss": 0.1463, + "mean_token_accuracy": 0.9622830748558044, + "num_tokens": 2814404.0, + "step": 319 + }, + { + "entropy": 2.0017230808734894, + "epoch": 1.155656108597285, + "grad_norm": 0.5840948820114136, + "learning_rate": 0.0004801178088043115, + "loss": 0.1869, + "mean_token_accuracy": 0.9506777077913284, + "num_tokens": 2823786.0, + "step": 320 + }, + { + "entropy": 2.1539418697357178, + "epoch": 1.1592760180995474, + "grad_norm": 1.074331283569336, + "learning_rate": 0.0004798931064861349, + "loss": 0.2797, + "mean_token_accuracy": 0.9271649420261383, + "num_tokens": 2832374.0, + "step": 321 + }, + { + "entropy": 1.930726408958435, + "epoch": 1.16289592760181, + "grad_norm": 0.5121958255767822, + "learning_rate": 0.0004796676489364988, + "loss": 0.1579, + "mean_token_accuracy": 0.9582571685314178, + "num_tokens": 2841561.0, + "step": 322 + }, + { + "entropy": 2.0205810368061066, + "epoch": 1.1665158371040725, + "grad_norm": 0.6360969543457031, + "learning_rate": 0.00047944143701171966, + "loss": 0.1582, + "mean_token_accuracy": 0.9620308429002762, + "num_tokens": 2850171.0, + "step": 323 + }, + { + "entropy": 1.9655758142471313, + "epoch": 1.170135746606335, + "grad_norm": 0.6647385358810425, + "learning_rate": 0.0004792144715709792, + "loss": 0.1594, + "mean_token_accuracy": 0.954497441649437, + "num_tokens": 2858905.0, + "step": 324 + }, + { + "entropy": 1.9725223183631897, + "epoch": 1.1737556561085973, + "grad_norm": 0.6429229974746704, + "learning_rate": 0.0004789867534763211, + "loss": 0.1407, + "mean_token_accuracy": 0.9645214527845383, + "num_tokens": 2867533.0, + "step": 325 + }, + { + "entropy": 1.9473685026168823, + "epoch": 1.1773755656108598, + "grad_norm": 0.811651349067688, + "learning_rate": 0.0004787582835926477, + "loss": 0.1608, + "mean_token_accuracy": 0.9479968994855881, + "num_tokens": 2876286.0, + "step": 326 + }, + { + "entropy": 1.8863109350204468, + "epoch": 1.1809954751131222, + "grad_norm": 0.5587059855461121, + "learning_rate": 0.00047852906278771686, + "loss": 0.131, + "mean_token_accuracy": 0.9684520065784454, + "num_tokens": 2885667.0, + "step": 327 + }, + { + "entropy": 1.8288891315460205, + "epoch": 1.1846153846153846, + "grad_norm": 0.8450536131858826, + "learning_rate": 0.0004782990919321383, + "loss": 0.2224, + "mean_token_accuracy": 0.9377491921186447, + "num_tokens": 2894765.0, + "step": 328 + }, + { + "entropy": 1.9347718358039856, + "epoch": 1.188235294117647, + "grad_norm": 0.7665867209434509, + "learning_rate": 0.0004780683718993705, + "loss": 0.167, + "mean_token_accuracy": 0.9583602845668793, + "num_tokens": 2903551.0, + "step": 329 + }, + { + "entropy": 1.9097798764705658, + "epoch": 1.1918552036199095, + "grad_norm": 0.7705667018890381, + "learning_rate": 0.00047783690356571784, + "loss": 0.2115, + "mean_token_accuracy": 0.9526428133249283, + "num_tokens": 2912197.0, + "step": 330 + }, + { + "entropy": 1.9174850285053253, + "epoch": 1.195475113122172, + "grad_norm": 0.5695499181747437, + "learning_rate": 0.00047760468781032634, + "loss": 0.1033, + "mean_token_accuracy": 0.969958484172821, + "num_tokens": 2920579.0, + "step": 331 + }, + { + "entropy": 1.8578442931175232, + "epoch": 1.1990950226244343, + "grad_norm": 0.7843735814094543, + "learning_rate": 0.000477371725515181, + "loss": 0.1664, + "mean_token_accuracy": 0.9545005410909653, + "num_tokens": 2929352.0, + "step": 332 + }, + { + "entropy": 1.8509328961372375, + "epoch": 1.2027149321266968, + "grad_norm": 0.5951048135757446, + "learning_rate": 0.0004771380175651026, + "loss": 0.1566, + "mean_token_accuracy": 0.9551403075456619, + "num_tokens": 2938387.0, + "step": 333 + }, + { + "entropy": 1.8236390948295593, + "epoch": 1.2063348416289592, + "grad_norm": 0.4988223910331726, + "learning_rate": 0.0004769035648477434, + "loss": 0.1242, + "mean_token_accuracy": 0.966319814324379, + "num_tokens": 2947741.0, + "step": 334 + }, + { + "entropy": 1.9594822525978088, + "epoch": 1.2099547511312216, + "grad_norm": 0.7550755143165588, + "learning_rate": 0.00047666836825358477, + "loss": 0.1591, + "mean_token_accuracy": 0.9666347652673721, + "num_tokens": 2956313.0, + "step": 335 + }, + { + "entropy": 1.9148444533348083, + "epoch": 1.213574660633484, + "grad_norm": 0.5889077186584473, + "learning_rate": 0.00047643242867593345, + "loss": 0.1343, + "mean_token_accuracy": 0.9611433297395706, + "num_tokens": 2964928.0, + "step": 336 + }, + { + "entropy": 1.8126957714557648, + "epoch": 1.2171945701357467, + "grad_norm": 0.5447750091552734, + "learning_rate": 0.0004761957470109179, + "loss": 0.1659, + "mean_token_accuracy": 0.9552300125360489, + "num_tokens": 2974160.0, + "step": 337 + }, + { + "entropy": 1.7981431782245636, + "epoch": 1.2208144796380092, + "grad_norm": 0.5400761365890503, + "learning_rate": 0.0004759583241574854, + "loss": 0.1339, + "mean_token_accuracy": 0.9620136916637421, + "num_tokens": 2982900.0, + "step": 338 + }, + { + "entropy": 1.8613979518413544, + "epoch": 1.2244343891402716, + "grad_norm": 0.7452914714813232, + "learning_rate": 0.0004757201610173981, + "loss": 0.4, + "mean_token_accuracy": 0.9068266004323959, + "num_tokens": 2991783.0, + "step": 339 + }, + { + "entropy": 1.8654026687145233, + "epoch": 1.228054298642534, + "grad_norm": 1.7142685651779175, + "learning_rate": 0.00047548125849523, + "loss": 0.3168, + "mean_token_accuracy": 0.9308896362781525, + "num_tokens": 3000530.0, + "step": 340 + }, + { + "entropy": 1.7702704071998596, + "epoch": 1.2316742081447964, + "grad_norm": 0.6687431931495667, + "learning_rate": 0.0004752416174983633, + "loss": 0.1697, + "mean_token_accuracy": 0.9530515670776367, + "num_tokens": 3009355.0, + "step": 341 + }, + { + "entropy": 1.735857516527176, + "epoch": 1.2352941176470589, + "grad_norm": 0.6127599477767944, + "learning_rate": 0.00047500123893698507, + "loss": 0.1706, + "mean_token_accuracy": 0.9593266248703003, + "num_tokens": 3018518.0, + "step": 342 + }, + { + "entropy": 1.7076368927955627, + "epoch": 1.2389140271493213, + "grad_norm": 0.6973987817764282, + "learning_rate": 0.0004747601237240836, + "loss": 0.1615, + "mean_token_accuracy": 0.9539438933134079, + "num_tokens": 3027752.0, + "step": 343 + }, + { + "entropy": 1.7353227138519287, + "epoch": 1.2425339366515837, + "grad_norm": 0.8406392335891724, + "learning_rate": 0.00047451827277544546, + "loss": 0.2063, + "mean_token_accuracy": 0.9488435834646225, + "num_tokens": 3036383.0, + "step": 344 + }, + { + "entropy": 1.6597246527671814, + "epoch": 1.2461538461538462, + "grad_norm": 0.5971431732177734, + "learning_rate": 0.00047427568700965107, + "loss": 0.1013, + "mean_token_accuracy": 0.9721864312887192, + "num_tokens": 3045375.0, + "step": 345 + }, + { + "entropy": 1.7100033462047577, + "epoch": 1.2497737556561086, + "grad_norm": 0.5883470773696899, + "learning_rate": 0.00047403236734807225, + "loss": 0.1164, + "mean_token_accuracy": 0.9664830714464188, + "num_tokens": 3054084.0, + "step": 346 + }, + { + "entropy": 1.7402609288692474, + "epoch": 1.253393665158371, + "grad_norm": 0.7355862855911255, + "learning_rate": 0.00047378831471486815, + "loss": 0.2007, + "mean_token_accuracy": 0.9560511559247971, + "num_tokens": 3062727.0, + "step": 347 + }, + { + "entropy": 1.79518261551857, + "epoch": 1.2570135746606335, + "grad_norm": 0.6006518006324768, + "learning_rate": 0.00047354353003698163, + "loss": 0.1085, + "mean_token_accuracy": 0.9598321914672852, + "num_tokens": 3071178.0, + "step": 348 + }, + { + "entropy": 1.7328391373157501, + "epoch": 1.260633484162896, + "grad_norm": 0.560342013835907, + "learning_rate": 0.0004732980142441362, + "loss": 0.1593, + "mean_token_accuracy": 0.9579409211874008, + "num_tokens": 3079927.0, + "step": 349 + }, + { + "entropy": 1.7356511652469635, + "epoch": 1.2642533936651583, + "grad_norm": 0.9149975776672363, + "learning_rate": 0.00047305176826883206, + "loss": 0.4064, + "mean_token_accuracy": 0.9265118837356567, + "num_tokens": 3089314.0, + "step": 350 + }, + { + "entropy": 1.8573569357395172, + "epoch": 1.2678733031674208, + "grad_norm": 0.8300670981407166, + "learning_rate": 0.0004728047930463428, + "loss": 0.195, + "mean_token_accuracy": 0.9453776180744171, + "num_tokens": 3097702.0, + "step": 351 + }, + { + "entropy": 1.7906217575073242, + "epoch": 1.2714932126696832, + "grad_norm": 0.5668906569480896, + "learning_rate": 0.0004725570895147118, + "loss": 0.1572, + "mean_token_accuracy": 0.962067037820816, + "num_tokens": 3106379.0, + "step": 352 + }, + { + "entropy": 1.6957395374774933, + "epoch": 1.2751131221719456, + "grad_norm": 0.4048328399658203, + "learning_rate": 0.0004723086586147487, + "loss": 0.0944, + "mean_token_accuracy": 0.9716819673776627, + "num_tokens": 3115622.0, + "step": 353 + }, + { + "entropy": 1.8158144056797028, + "epoch": 1.278733031674208, + "grad_norm": 0.6396092772483826, + "learning_rate": 0.00047205950129002564, + "loss": 0.1011, + "mean_token_accuracy": 0.9698463827371597, + "num_tokens": 3124016.0, + "step": 354 + }, + { + "entropy": 1.730194479227066, + "epoch": 1.2823529411764705, + "grad_norm": 0.662876307964325, + "learning_rate": 0.000471809618486874, + "loss": 0.1641, + "mean_token_accuracy": 0.9520179778337479, + "num_tokens": 3132712.0, + "step": 355 + }, + { + "entropy": 1.6776110529899597, + "epoch": 1.285972850678733, + "grad_norm": 0.868507981300354, + "learning_rate": 0.0004715590111543804, + "loss": 0.3374, + "mean_token_accuracy": 0.9303739666938782, + "num_tokens": 3142103.0, + "step": 356 + }, + { + "entropy": 1.6501678824424744, + "epoch": 1.2895927601809956, + "grad_norm": 0.5433686971664429, + "learning_rate": 0.0004713076802443834, + "loss": 0.1237, + "mean_token_accuracy": 0.9653612226247787, + "num_tokens": 3151192.0, + "step": 357 + }, + { + "entropy": 1.6524465382099152, + "epoch": 1.293212669683258, + "grad_norm": 0.6145523190498352, + "learning_rate": 0.00047105562671147, + "loss": 0.1204, + "mean_token_accuracy": 0.9690534323453903, + "num_tokens": 3159839.0, + "step": 358 + }, + { + "entropy": 1.5339214205741882, + "epoch": 1.2968325791855204, + "grad_norm": 0.500477135181427, + "learning_rate": 0.00047080285151297144, + "loss": 0.1295, + "mean_token_accuracy": 0.9571033865213394, + "num_tokens": 3169047.0, + "step": 359 + }, + { + "entropy": 1.6765435338020325, + "epoch": 1.3004524886877828, + "grad_norm": 0.6697553396224976, + "learning_rate": 0.00047054935560896026, + "loss": 0.135, + "mean_token_accuracy": 0.9672541171312332, + "num_tokens": 3177062.0, + "step": 360 + }, + { + "entropy": 1.5932062566280365, + "epoch": 1.3040723981900453, + "grad_norm": 0.706957221031189, + "learning_rate": 0.0004702951399622462, + "loss": 0.1229, + "mean_token_accuracy": 0.9634416699409485, + "num_tokens": 3185829.0, + "step": 361 + }, + { + "entropy": 1.5623145997524261, + "epoch": 1.3076923076923077, + "grad_norm": 0.6199461221694946, + "learning_rate": 0.00047004020553837275, + "loss": 0.1449, + "mean_token_accuracy": 0.9620065689086914, + "num_tokens": 3194426.0, + "step": 362 + }, + { + "entropy": 1.5226828753948212, + "epoch": 1.3113122171945701, + "grad_norm": 0.8962509036064148, + "learning_rate": 0.0004697845533056132, + "loss": 0.2207, + "mean_token_accuracy": 0.9403344839811325, + "num_tokens": 3203655.0, + "step": 363 + }, + { + "entropy": 1.5395641326904297, + "epoch": 1.3149321266968326, + "grad_norm": 0.5993619561195374, + "learning_rate": 0.00046952818423496727, + "loss": 0.1486, + "mean_token_accuracy": 0.9614185988903046, + "num_tokens": 3212069.0, + "step": 364 + }, + { + "entropy": 1.5738630294799805, + "epoch": 1.318552036199095, + "grad_norm": 0.7393983602523804, + "learning_rate": 0.00046927109930015756, + "loss": 0.1812, + "mean_token_accuracy": 0.9535021334886551, + "num_tokens": 3220482.0, + "step": 365 + }, + { + "entropy": 1.5462632775306702, + "epoch": 1.3221719457013574, + "grad_norm": 0.7453555464744568, + "learning_rate": 0.0004690132994776253, + "loss": 0.164, + "mean_token_accuracy": 0.9585814625024796, + "num_tokens": 3229505.0, + "step": 366 + }, + { + "entropy": 1.5241961777210236, + "epoch": 1.3257918552036199, + "grad_norm": 0.7553415298461914, + "learning_rate": 0.00046875478574652713, + "loss": 0.1445, + "mean_token_accuracy": 0.9682841598987579, + "num_tokens": 3238326.0, + "step": 367 + }, + { + "entropy": 1.5344699025154114, + "epoch": 1.3294117647058823, + "grad_norm": 0.8565949201583862, + "learning_rate": 0.0004684955590887311, + "loss": 0.2521, + "mean_token_accuracy": 0.920401468873024, + "num_tokens": 3247482.0, + "step": 368 + }, + { + "entropy": 1.5109277665615082, + "epoch": 1.3330316742081447, + "grad_norm": 0.5170580148696899, + "learning_rate": 0.00046823562048881295, + "loss": 0.1393, + "mean_token_accuracy": 0.9584086239337921, + "num_tokens": 3256464.0, + "step": 369 + }, + { + "entropy": 1.4666939079761505, + "epoch": 1.3366515837104074, + "grad_norm": 0.6995373368263245, + "learning_rate": 0.0004679749709340529, + "loss": 0.1726, + "mean_token_accuracy": 0.9477890431880951, + "num_tokens": 3265853.0, + "step": 370 + }, + { + "entropy": 1.4208430051803589, + "epoch": 1.3402714932126698, + "grad_norm": 1.1363991498947144, + "learning_rate": 0.000467713611414431, + "loss": 0.196, + "mean_token_accuracy": 0.9495431333780289, + "num_tokens": 3275367.0, + "step": 371 + }, + { + "entropy": 1.5009459853172302, + "epoch": 1.3438914027149322, + "grad_norm": 0.7883325219154358, + "learning_rate": 0.00046745154292262414, + "loss": 0.2526, + "mean_token_accuracy": 0.9334618002176285, + "num_tokens": 3284772.0, + "step": 372 + }, + { + "entropy": 1.5485479533672333, + "epoch": 1.3475113122171947, + "grad_norm": 0.6516429781913757, + "learning_rate": 0.00046718876645400156, + "loss": 0.2057, + "mean_token_accuracy": 0.9546459317207336, + "num_tokens": 3293493.0, + "step": 373 + }, + { + "entropy": 1.6237249970436096, + "epoch": 1.351131221719457, + "grad_norm": 0.8916263580322266, + "learning_rate": 0.00046692528300662213, + "loss": 0.2123, + "mean_token_accuracy": 0.9456845372915268, + "num_tokens": 3302063.0, + "step": 374 + }, + { + "entropy": 1.561572015285492, + "epoch": 1.3547511312217195, + "grad_norm": 0.7527791857719421, + "learning_rate": 0.00046666109358122935, + "loss": 0.2113, + "mean_token_accuracy": 0.9537477940320969, + "num_tokens": 3311037.0, + "step": 375 + }, + { + "entropy": 1.5594256818294525, + "epoch": 1.358371040723982, + "grad_norm": 1.25638747215271, + "learning_rate": 0.0004663961991812485, + "loss": 0.1629, + "mean_token_accuracy": 0.9508458077907562, + "num_tokens": 3319635.0, + "step": 376 + }, + { + "entropy": 1.6909976303577423, + "epoch": 1.3619909502262444, + "grad_norm": 0.7627813220024109, + "learning_rate": 0.00046613060081278194, + "loss": 0.2303, + "mean_token_accuracy": 0.9425801336765289, + "num_tokens": 3328043.0, + "step": 377 + }, + { + "entropy": 1.6074829697608948, + "epoch": 1.3656108597285068, + "grad_norm": 0.6584346294403076, + "learning_rate": 0.00046586429948460646, + "loss": 0.1815, + "mean_token_accuracy": 0.9536214470863342, + "num_tokens": 3337143.0, + "step": 378 + }, + { + "entropy": 1.7382183969020844, + "epoch": 1.3692307692307693, + "grad_norm": 1.37154221534729, + "learning_rate": 0.0004655972962081684, + "loss": 0.1849, + "mean_token_accuracy": 0.948440819978714, + "num_tokens": 3346033.0, + "step": 379 + }, + { + "entropy": 1.7148900926113129, + "epoch": 1.3728506787330317, + "grad_norm": 0.9487980604171753, + "learning_rate": 0.00046532959199758, + "loss": 0.2521, + "mean_token_accuracy": 0.9344504028558731, + "num_tokens": 3354849.0, + "step": 380 + }, + { + "entropy": 1.7164019346237183, + "epoch": 1.3764705882352941, + "grad_norm": 0.5609025359153748, + "learning_rate": 0.00046506118786961614, + "loss": 0.1425, + "mean_token_accuracy": 0.9571309834718704, + "num_tokens": 3363674.0, + "step": 381 + }, + { + "entropy": 1.894619107246399, + "epoch": 1.3800904977375565, + "grad_norm": 0.9811336994171143, + "learning_rate": 0.00046479208484370997, + "loss": 0.2522, + "mean_token_accuracy": 0.9424156546592712, + "num_tokens": 3372325.0, + "step": 382 + }, + { + "entropy": 1.78870290517807, + "epoch": 1.383710407239819, + "grad_norm": 0.5707085132598877, + "learning_rate": 0.00046452228394194893, + "loss": 0.1354, + "mean_token_accuracy": 0.9613165706396103, + "num_tokens": 3381270.0, + "step": 383 + }, + { + "entropy": 1.803922712802887, + "epoch": 1.3873303167420814, + "grad_norm": 0.5655364394187927, + "learning_rate": 0.0004642517861890713, + "loss": 0.0818, + "mean_token_accuracy": 0.9776160269975662, + "num_tokens": 3390363.0, + "step": 384 + }, + { + "entropy": 1.8172507882118225, + "epoch": 1.3909502262443438, + "grad_norm": 0.6950513124465942, + "learning_rate": 0.00046398059261246205, + "loss": 0.1145, + "mean_token_accuracy": 0.963288351893425, + "num_tokens": 3399176.0, + "step": 385 + }, + { + "entropy": 1.9182518422603607, + "epoch": 1.3945701357466063, + "grad_norm": 0.5900619029998779, + "learning_rate": 0.0004637087042421489, + "loss": 0.108, + "mean_token_accuracy": 0.9723307639360428, + "num_tokens": 3407978.0, + "step": 386 + }, + { + "entropy": 1.8558574616909027, + "epoch": 1.3981900452488687, + "grad_norm": 0.6279832124710083, + "learning_rate": 0.00046343612211079843, + "loss": 0.1471, + "mean_token_accuracy": 0.9603912532329559, + "num_tokens": 3416856.0, + "step": 387 + }, + { + "entropy": 1.8146779537200928, + "epoch": 1.4018099547511311, + "grad_norm": 0.6171274781227112, + "learning_rate": 0.0004631628472537125, + "loss": 0.1872, + "mean_token_accuracy": 0.9447146654129028, + "num_tokens": 3426044.0, + "step": 388 + }, + { + "entropy": 1.9342225790023804, + "epoch": 1.4054298642533936, + "grad_norm": 0.9947887659072876, + "learning_rate": 0.00046288888070882374, + "loss": 0.2966, + "mean_token_accuracy": 0.9279204607009888, + "num_tokens": 3435154.0, + "step": 389 + }, + { + "entropy": 1.9391801953315735, + "epoch": 1.409049773755656, + "grad_norm": 0.7155653834342957, + "learning_rate": 0.000462614223516692, + "loss": 0.1847, + "mean_token_accuracy": 0.9475171864032745, + "num_tokens": 3444563.0, + "step": 390 + }, + { + "entropy": 2.0716978013515472, + "epoch": 1.4126696832579184, + "grad_norm": 0.8198989629745483, + "learning_rate": 0.0004623388767205004, + "loss": 0.1317, + "mean_token_accuracy": 0.9608721435070038, + "num_tokens": 3453410.0, + "step": 391 + }, + { + "entropy": 2.1060431599617004, + "epoch": 1.416289592760181, + "grad_norm": 1.025406002998352, + "learning_rate": 0.00046206284136605106, + "loss": 0.2146, + "mean_token_accuracy": 0.9414294511079788, + "num_tokens": 3461958.0, + "step": 392 + }, + { + "entropy": 2.1459922194480896, + "epoch": 1.4199095022624435, + "grad_norm": 0.9209627509117126, + "learning_rate": 0.00046178611850176146, + "loss": 0.2137, + "mean_token_accuracy": 0.956874743103981, + "num_tokens": 3470547.0, + "step": 393 + }, + { + "entropy": 2.0233450531959534, + "epoch": 1.423529411764706, + "grad_norm": 0.5777944922447205, + "learning_rate": 0.00046150870917866025, + "loss": 0.122, + "mean_token_accuracy": 0.9672323018312454, + "num_tokens": 3479618.0, + "step": 394 + }, + { + "entropy": 2.035937190055847, + "epoch": 1.4271493212669684, + "grad_norm": 0.7945542931556702, + "learning_rate": 0.0004612306144503835, + "loss": 0.2879, + "mean_token_accuracy": 0.946587473154068, + "num_tokens": 3488533.0, + "step": 395 + }, + { + "entropy": 2.155315637588501, + "epoch": 1.4307692307692308, + "grad_norm": 0.6385292410850525, + "learning_rate": 0.00046095183537317035, + "loss": 0.1008, + "mean_token_accuracy": 0.9655124247074127, + "num_tokens": 3496686.0, + "step": 396 + }, + { + "entropy": 2.186827063560486, + "epoch": 1.4343891402714932, + "grad_norm": 0.4759826958179474, + "learning_rate": 0.0004606723730058593, + "loss": 0.0768, + "mean_token_accuracy": 0.9783597737550735, + "num_tokens": 3504958.0, + "step": 397 + }, + { + "entropy": 1.974392294883728, + "epoch": 1.4380090497737557, + "grad_norm": 0.6250292062759399, + "learning_rate": 0.00046039222840988406, + "loss": 0.1381, + "mean_token_accuracy": 0.9586146324872971, + "num_tokens": 3513694.0, + "step": 398 + }, + { + "entropy": 2.045738846063614, + "epoch": 1.441628959276018, + "grad_norm": 0.5517769455909729, + "learning_rate": 0.0004601114026492695, + "loss": 0.1312, + "mean_token_accuracy": 0.9682512134313583, + "num_tokens": 3522395.0, + "step": 399 + }, + { + "entropy": 2.105030357837677, + "epoch": 1.4452488687782805, + "grad_norm": 0.6748242974281311, + "learning_rate": 0.0004598298967906276, + "loss": 0.1056, + "mean_token_accuracy": 0.9701305478811264, + "num_tokens": 3530838.0, + "step": 400 + }, + { + "entropy": 2.024325281381607, + "epoch": 1.448868778280543, + "grad_norm": 0.6320233941078186, + "learning_rate": 0.00045954771190315344, + "loss": 0.1129, + "mean_token_accuracy": 0.9633017927408218, + "num_tokens": 3540184.0, + "step": 401 + }, + { + "entropy": 2.1561593413352966, + "epoch": 1.4524886877828054, + "grad_norm": 0.7380363941192627, + "learning_rate": 0.0004592648490586213, + "loss": 0.1304, + "mean_token_accuracy": 0.9599586874246597, + "num_tokens": 3548727.0, + "step": 402 + }, + { + "entropy": 2.2986454367637634, + "epoch": 1.4561085972850678, + "grad_norm": 0.669114351272583, + "learning_rate": 0.00045898130933138024, + "loss": 0.1005, + "mean_token_accuracy": 0.9724964797496796, + "num_tokens": 3556780.0, + "step": 403 + }, + { + "entropy": 2.103136509656906, + "epoch": 1.4597285067873302, + "grad_norm": 0.6677402853965759, + "learning_rate": 0.0004586970937983504, + "loss": 0.1177, + "mean_token_accuracy": 0.9597653448581696, + "num_tokens": 3565427.0, + "step": 404 + }, + { + "entropy": 2.112696200609207, + "epoch": 1.463348416289593, + "grad_norm": 0.4597342014312744, + "learning_rate": 0.0004584122035390185, + "loss": 0.0695, + "mean_token_accuracy": 0.9763098359107971, + "num_tokens": 3573902.0, + "step": 405 + }, + { + "entropy": 2.0472628474235535, + "epoch": 1.4669683257918553, + "grad_norm": 0.7842056751251221, + "learning_rate": 0.0004581266396354339, + "loss": 0.1981, + "mean_token_accuracy": 0.9521032422780991, + "num_tokens": 3582913.0, + "step": 406 + }, + { + "entropy": 2.236558735370636, + "epoch": 1.4705882352941178, + "grad_norm": 0.7634767293930054, + "learning_rate": 0.000457840403172205, + "loss": 0.1956, + "mean_token_accuracy": 0.9602932929992676, + "num_tokens": 3591197.0, + "step": 407 + }, + { + "entropy": 2.182949125766754, + "epoch": 1.4742081447963802, + "grad_norm": 0.7084661722183228, + "learning_rate": 0.00045755349523649415, + "loss": 0.2463, + "mean_token_accuracy": 0.9392582327127457, + "num_tokens": 3600134.0, + "step": 408 + }, + { + "entropy": 2.135133147239685, + "epoch": 1.4778280542986426, + "grad_norm": 0.8172940015792847, + "learning_rate": 0.00045726591691801433, + "loss": 0.2375, + "mean_token_accuracy": 0.9458330571651459, + "num_tokens": 3608945.0, + "step": 409 + }, + { + "entropy": 2.157473146915436, + "epoch": 1.481447963800905, + "grad_norm": 0.6165594458580017, + "learning_rate": 0.0004569776693090246, + "loss": 0.1628, + "mean_token_accuracy": 0.9586529731750488, + "num_tokens": 3617790.0, + "step": 410 + }, + { + "entropy": 2.15165376663208, + "epoch": 1.4850678733031675, + "grad_norm": 0.6619407534599304, + "learning_rate": 0.0004566887535043263, + "loss": 0.1866, + "mean_token_accuracy": 0.9545126557350159, + "num_tokens": 3626937.0, + "step": 411 + }, + { + "entropy": 2.271161735057831, + "epoch": 1.48868778280543, + "grad_norm": 0.5861835479736328, + "learning_rate": 0.0004563991706012582, + "loss": 0.1409, + "mean_token_accuracy": 0.9595955163240433, + "num_tokens": 3636025.0, + "step": 412 + }, + { + "entropy": 2.277799427509308, + "epoch": 1.4923076923076923, + "grad_norm": 0.6464956402778625, + "learning_rate": 0.00045610892169969323, + "loss": 0.0792, + "mean_token_accuracy": 0.9806316941976547, + "num_tokens": 3644746.0, + "step": 413 + }, + { + "entropy": 2.2143171429634094, + "epoch": 1.4959276018099548, + "grad_norm": 0.7531687021255493, + "learning_rate": 0.00045581800790203366, + "loss": 0.2584, + "mean_token_accuracy": 0.9225966930389404, + "num_tokens": 3654064.0, + "step": 414 + }, + { + "entropy": 2.231681764125824, + "epoch": 1.4995475113122172, + "grad_norm": 0.6902768015861511, + "learning_rate": 0.00045552643031320726, + "loss": 0.232, + "mean_token_accuracy": 0.9433842301368713, + "num_tokens": 3663130.0, + "step": 415 + }, + { + "entropy": 2.2672717571258545, + "epoch": 1.5031674208144796, + "grad_norm": 0.5134314894676208, + "learning_rate": 0.00045523419004066273, + "loss": 0.0874, + "mean_token_accuracy": 0.9708191752433777, + "num_tokens": 3671981.0, + "step": 416 + }, + { + "entropy": 2.3302834033966064, + "epoch": 1.506787330316742, + "grad_norm": 0.885969340801239, + "learning_rate": 0.0004549412881943659, + "loss": 0.0723, + "mean_token_accuracy": 0.9791463166475296, + "num_tokens": 3680525.0, + "step": 417 + }, + { + "entropy": 2.2693899869918823, + "epoch": 1.5104072398190045, + "grad_norm": 0.7424856424331665, + "learning_rate": 0.00045464772588679547, + "loss": 0.1509, + "mean_token_accuracy": 0.9600907415151596, + "num_tokens": 3689430.0, + "step": 418 + }, + { + "entropy": 2.4042725563049316, + "epoch": 1.514027149321267, + "grad_norm": 0.8968034982681274, + "learning_rate": 0.0004543535042329382, + "loss": 0.1984, + "mean_token_accuracy": 0.9488537162542343, + "num_tokens": 3697836.0, + "step": 419 + }, + { + "entropy": 2.2518428564071655, + "epoch": 1.5176470588235293, + "grad_norm": 0.5963534712791443, + "learning_rate": 0.0004540586243502858, + "loss": 0.1214, + "mean_token_accuracy": 0.9711381644010544, + "num_tokens": 3706675.0, + "step": 420 + }, + { + "entropy": 2.275522291660309, + "epoch": 1.5212669683257918, + "grad_norm": 1.0797090530395508, + "learning_rate": 0.0004537630873588293, + "loss": 0.2508, + "mean_token_accuracy": 0.9247037768363953, + "num_tokens": 3715631.0, + "step": 421 + }, + { + "entropy": 2.249617278575897, + "epoch": 1.5248868778280542, + "grad_norm": 0.7636313438415527, + "learning_rate": 0.000453466894381056, + "loss": 0.1112, + "mean_token_accuracy": 0.9681926071643829, + "num_tokens": 3724579.0, + "step": 422 + }, + { + "entropy": 2.280571699142456, + "epoch": 1.5285067873303166, + "grad_norm": 0.9915648698806763, + "learning_rate": 0.00045317004654194464, + "loss": 0.3532, + "mean_token_accuracy": 0.9360047876834869, + "num_tokens": 3733607.0, + "step": 423 + }, + { + "entropy": 2.241512656211853, + "epoch": 1.532126696832579, + "grad_norm": 0.924977719783783, + "learning_rate": 0.0004528725449689611, + "loss": 0.1997, + "mean_token_accuracy": 0.9475428760051727, + "num_tokens": 3742611.0, + "step": 424 + }, + { + "entropy": 2.201731503009796, + "epoch": 1.5357466063348415, + "grad_norm": 0.7018861770629883, + "learning_rate": 0.0004525743907920542, + "loss": 0.1683, + "mean_token_accuracy": 0.9465018659830093, + "num_tokens": 3751737.0, + "step": 425 + }, + { + "entropy": 2.28944593667984, + "epoch": 1.539366515837104, + "grad_norm": 0.5893452763557434, + "learning_rate": 0.00045227558514365166, + "loss": 0.0969, + "mean_token_accuracy": 0.9711766839027405, + "num_tokens": 3761245.0, + "step": 426 + }, + { + "entropy": 2.3497202396392822, + "epoch": 1.5429864253393664, + "grad_norm": 0.685279130935669, + "learning_rate": 0.0004519761291586551, + "loss": 0.106, + "mean_token_accuracy": 0.9663016647100449, + "num_tokens": 3769854.0, + "step": 427 + }, + { + "entropy": 2.308362066745758, + "epoch": 1.5466063348416288, + "grad_norm": 0.5116177797317505, + "learning_rate": 0.00045167602397443694, + "loss": 0.1132, + "mean_token_accuracy": 0.9700013697147369, + "num_tokens": 3778996.0, + "step": 428 + }, + { + "entropy": 2.238637685775757, + "epoch": 1.5502262443438914, + "grad_norm": 0.8374833464622498, + "learning_rate": 0.00045137527073083457, + "loss": 0.2539, + "mean_token_accuracy": 0.9407305717468262, + "num_tokens": 3787835.0, + "step": 429 + }, + { + "entropy": 2.3406758308410645, + "epoch": 1.5538461538461539, + "grad_norm": 0.5140913724899292, + "learning_rate": 0.0004510738705701473, + "loss": 0.1113, + "mean_token_accuracy": 0.9635641574859619, + "num_tokens": 3796498.0, + "step": 430 + }, + { + "entropy": 2.2642539143562317, + "epoch": 1.5574660633484163, + "grad_norm": 0.5750702023506165, + "learning_rate": 0.0004507718246371313, + "loss": 0.1127, + "mean_token_accuracy": 0.9660817235708237, + "num_tokens": 3805464.0, + "step": 431 + }, + { + "entropy": 2.2058264315128326, + "epoch": 1.5610859728506787, + "grad_norm": 0.6448659300804138, + "learning_rate": 0.0004504691340789955, + "loss": 0.0994, + "mean_token_accuracy": 0.96739861369133, + "num_tokens": 3814309.0, + "step": 432 + }, + { + "entropy": 2.330399215221405, + "epoch": 1.5647058823529412, + "grad_norm": 0.8432528376579285, + "learning_rate": 0.0004501658000453973, + "loss": 0.1999, + "mean_token_accuracy": 0.9510775059461594, + "num_tokens": 3823126.0, + "step": 433 + }, + { + "entropy": 2.4211326837539673, + "epoch": 1.5683257918552036, + "grad_norm": 0.8101194500923157, + "learning_rate": 0.00044986182368843806, + "loss": 0.144, + "mean_token_accuracy": 0.9656328558921814, + "num_tokens": 3831274.0, + "step": 434 + }, + { + "entropy": 2.2594956755638123, + "epoch": 1.571945701357466, + "grad_norm": 0.6753663420677185, + "learning_rate": 0.0004495572061626585, + "loss": 0.1433, + "mean_token_accuracy": 0.9572386592626572, + "num_tokens": 3840206.0, + "step": 435 + }, + { + "entropy": 2.1233682930469513, + "epoch": 1.5755656108597285, + "grad_norm": 0.48616713285446167, + "learning_rate": 0.000449251948625035, + "loss": 0.0934, + "mean_token_accuracy": 0.9740773588418961, + "num_tokens": 3849363.0, + "step": 436 + }, + { + "entropy": 2.325556695461273, + "epoch": 1.5791855203619911, + "grad_norm": 0.7744045853614807, + "learning_rate": 0.00044894605223497446, + "loss": 0.127, + "mean_token_accuracy": 0.9687052518129349, + "num_tokens": 3857733.0, + "step": 437 + }, + { + "entropy": 2.266542673110962, + "epoch": 1.5828054298642535, + "grad_norm": 2.373530387878418, + "learning_rate": 0.00044863951815431045, + "loss": 0.2404, + "mean_token_accuracy": 0.9437267184257507, + "num_tokens": 3866374.0, + "step": 438 + }, + { + "entropy": 2.1757248640060425, + "epoch": 1.586425339366516, + "grad_norm": 0.5588560700416565, + "learning_rate": 0.00044833234754729847, + "loss": 0.142, + "mean_token_accuracy": 0.9601300358772278, + "num_tokens": 3875520.0, + "step": 439 + }, + { + "entropy": 2.124377518892288, + "epoch": 1.5900452488687784, + "grad_norm": 0.5602438449859619, + "learning_rate": 0.0004480245415806116, + "loss": 0.1556, + "mean_token_accuracy": 0.9561446160078049, + "num_tokens": 3884345.0, + "step": 440 + }, + { + "entropy": 2.1571075320243835, + "epoch": 1.5936651583710408, + "grad_norm": 0.472598671913147, + "learning_rate": 0.0004477161014233361, + "loss": 0.0848, + "mean_token_accuracy": 0.9742853343486786, + "num_tokens": 3893129.0, + "step": 441 + }, + { + "entropy": 2.0434057414531708, + "epoch": 1.5972850678733033, + "grad_norm": 0.7104448676109314, + "learning_rate": 0.00044740702824696703, + "loss": 0.1524, + "mean_token_accuracy": 0.9542464315891266, + "num_tokens": 3902120.0, + "step": 442 + }, + { + "entropy": 2.1118403673171997, + "epoch": 1.6009049773755657, + "grad_norm": 0.6632394194602966, + "learning_rate": 0.0004470973232254037, + "loss": 0.3001, + "mean_token_accuracy": 0.928197592496872, + "num_tokens": 3910974.0, + "step": 443 + }, + { + "entropy": 2.0292475819587708, + "epoch": 1.6045248868778281, + "grad_norm": 1.050956130027771, + "learning_rate": 0.00044678698753494527, + "loss": 0.2226, + "mean_token_accuracy": 0.9448522627353668, + "num_tokens": 3920005.0, + "step": 444 + }, + { + "entropy": 1.991033524274826, + "epoch": 1.6081447963800906, + "grad_norm": 0.670244038105011, + "learning_rate": 0.00044647602235428624, + "loss": 0.2158, + "mean_token_accuracy": 0.9551118016242981, + "num_tokens": 3929334.0, + "step": 445 + }, + { + "entropy": 2.04949289560318, + "epoch": 1.611764705882353, + "grad_norm": 0.6321494579315186, + "learning_rate": 0.00044616442886451197, + "loss": 0.1743, + "mean_token_accuracy": 0.9494802355766296, + "num_tokens": 3938211.0, + "step": 446 + }, + { + "entropy": 2.1101951897144318, + "epoch": 1.6153846153846154, + "grad_norm": 0.6970012187957764, + "learning_rate": 0.0004458522082490943, + "loss": 0.1228, + "mean_token_accuracy": 0.9624926447868347, + "num_tokens": 3946534.0, + "step": 447 + }, + { + "entropy": 1.9337081909179688, + "epoch": 1.6190045248868778, + "grad_norm": 0.5971657633781433, + "learning_rate": 0.0004455393616938868, + "loss": 0.1431, + "mean_token_accuracy": 0.9635348320007324, + "num_tokens": 3955694.0, + "step": 448 + }, + { + "entropy": 1.9635128676891327, + "epoch": 1.6226244343891403, + "grad_norm": 0.8510827422142029, + "learning_rate": 0.00044522589038712074, + "loss": 0.2446, + "mean_token_accuracy": 0.9457641988992691, + "num_tokens": 3964907.0, + "step": 449 + }, + { + "entropy": 2.0336360335350037, + "epoch": 1.6262443438914027, + "grad_norm": 0.5803818106651306, + "learning_rate": 0.00044491179551939985, + "loss": 0.0872, + "mean_token_accuracy": 0.9734505414962769, + "num_tokens": 3973584.0, + "step": 450 + }, + { + "entropy": 2.0668878853321075, + "epoch": 1.6298642533936651, + "grad_norm": 0.6990496516227722, + "learning_rate": 0.0004445970782836967, + "loss": 0.1138, + "mean_token_accuracy": 0.9702571034431458, + "num_tokens": 3982632.0, + "step": 451 + }, + { + "entropy": 2.1481760144233704, + "epoch": 1.6334841628959276, + "grad_norm": 0.6156729459762573, + "learning_rate": 0.00044428173987534733, + "loss": 0.0936, + "mean_token_accuracy": 0.9739355593919754, + "num_tokens": 3991147.0, + "step": 452 + }, + { + "entropy": 2.0678701996803284, + "epoch": 1.63710407239819, + "grad_norm": 0.5441684126853943, + "learning_rate": 0.0004439657814920472, + "loss": 0.123, + "mean_token_accuracy": 0.9693446308374405, + "num_tokens": 3999990.0, + "step": 453 + }, + { + "entropy": 1.9867055118083954, + "epoch": 1.6407239819004524, + "grad_norm": 0.9218093156814575, + "learning_rate": 0.00044364920433384656, + "loss": 0.1997, + "mean_token_accuracy": 0.9564195573329926, + "num_tokens": 4009097.0, + "step": 454 + }, + { + "entropy": 2.145586997270584, + "epoch": 1.6443438914027149, + "grad_norm": 0.77643883228302, + "learning_rate": 0.0004433320096031458, + "loss": 0.1491, + "mean_token_accuracy": 0.9602408111095428, + "num_tokens": 4018059.0, + "step": 455 + }, + { + "entropy": 2.071108251810074, + "epoch": 1.6479638009049773, + "grad_norm": 0.5267088413238525, + "learning_rate": 0.0004430141985046909, + "loss": 0.0875, + "mean_token_accuracy": 0.9764399826526642, + "num_tokens": 4027089.0, + "step": 456 + }, + { + "entropy": 2.1659318804740906, + "epoch": 1.6515837104072397, + "grad_norm": 1.0642318725585938, + "learning_rate": 0.000442695772245569, + "loss": 0.2623, + "mean_token_accuracy": 0.9307756721973419, + "num_tokens": 4035719.0, + "step": 457 + }, + { + "entropy": 2.0232724249362946, + "epoch": 1.6552036199095022, + "grad_norm": 0.6213289499282837, + "learning_rate": 0.0004423767320352035, + "loss": 0.1597, + "mean_token_accuracy": 0.9599647223949432, + "num_tokens": 4045088.0, + "step": 458 + }, + { + "entropy": 2.047410547733307, + "epoch": 1.6588235294117646, + "grad_norm": 0.6346105933189392, + "learning_rate": 0.0004420570790853498, + "loss": 0.1422, + "mean_token_accuracy": 0.9649711549282074, + "num_tokens": 4054262.0, + "step": 459 + }, + { + "entropy": 2.0923012793064117, + "epoch": 1.662443438914027, + "grad_norm": 0.46477749943733215, + "learning_rate": 0.0004417368146100907, + "loss": 0.079, + "mean_token_accuracy": 0.9777993708848953, + "num_tokens": 4063107.0, + "step": 460 + }, + { + "entropy": 2.168913394212723, + "epoch": 1.6660633484162894, + "grad_norm": 0.5164734721183777, + "learning_rate": 0.0004414159398258312, + "loss": 0.0941, + "mean_token_accuracy": 0.9725133627653122, + "num_tokens": 4071656.0, + "step": 461 + }, + { + "entropy": 2.152670443058014, + "epoch": 1.6696832579185519, + "grad_norm": 0.8985757231712341, + "learning_rate": 0.00044109445595129495, + "loss": 0.2142, + "mean_token_accuracy": 0.9387252777814865, + "num_tokens": 4080023.0, + "step": 462 + }, + { + "entropy": 2.111784875392914, + "epoch": 1.6733031674208145, + "grad_norm": 0.47521084547042847, + "learning_rate": 0.0004407723642075184, + "loss": 0.0581, + "mean_token_accuracy": 0.9821985810995102, + "num_tokens": 4088469.0, + "step": 463 + }, + { + "entropy": 1.9784683287143707, + "epoch": 1.676923076923077, + "grad_norm": 0.5552536249160767, + "learning_rate": 0.0004404496658178472, + "loss": 0.1353, + "mean_token_accuracy": 0.9619844257831573, + "num_tokens": 4097737.0, + "step": 464 + }, + { + "entropy": 2.015674114227295, + "epoch": 1.6805429864253394, + "grad_norm": 0.6078305244445801, + "learning_rate": 0.0004401263620079309, + "loss": 0.1916, + "mean_token_accuracy": 0.9506707191467285, + "num_tokens": 4107156.0, + "step": 465 + }, + { + "entropy": 2.0832217931747437, + "epoch": 1.6841628959276018, + "grad_norm": 0.6618755459785461, + "learning_rate": 0.0004398024540057186, + "loss": 0.1671, + "mean_token_accuracy": 0.9617152661085129, + "num_tokens": 4116019.0, + "step": 466 + }, + { + "entropy": 2.0383114516735077, + "epoch": 1.6877828054298643, + "grad_norm": 0.5774693489074707, + "learning_rate": 0.0004394779430414541, + "loss": 0.2647, + "mean_token_accuracy": 0.9387127161026001, + "num_tokens": 4125001.0, + "step": 467 + }, + { + "entropy": 2.201409190893173, + "epoch": 1.6914027149321267, + "grad_norm": 0.7600311636924744, + "learning_rate": 0.0004391528303476715, + "loss": 0.073, + "mean_token_accuracy": 0.979825034737587, + "num_tokens": 4133467.0, + "step": 468 + }, + { + "entropy": 2.168666422367096, + "epoch": 1.6950226244343891, + "grad_norm": 0.7801902294158936, + "learning_rate": 0.00043882711715919015, + "loss": 0.2406, + "mean_token_accuracy": 0.9451306313276291, + "num_tokens": 4141765.0, + "step": 469 + }, + { + "entropy": 2.1429262161254883, + "epoch": 1.6986425339366515, + "grad_norm": 0.5192358493804932, + "learning_rate": 0.0004385008047131104, + "loss": 0.1052, + "mean_token_accuracy": 0.9749262481927872, + "num_tokens": 4150732.0, + "step": 470 + }, + { + "entropy": 2.1387495696544647, + "epoch": 1.702262443438914, + "grad_norm": 0.6219777464866638, + "learning_rate": 0.0004381738942488083, + "loss": 0.2127, + "mean_token_accuracy": 0.9398418068885803, + "num_tokens": 4159715.0, + "step": 471 + }, + { + "entropy": 2.1718398332595825, + "epoch": 1.7058823529411766, + "grad_norm": 0.5738123655319214, + "learning_rate": 0.0004378463870079316, + "loss": 0.1703, + "mean_token_accuracy": 0.9520847648382187, + "num_tokens": 4168526.0, + "step": 472 + }, + { + "entropy": 2.2768235206604004, + "epoch": 1.709502262443439, + "grad_norm": 0.662564754486084, + "learning_rate": 0.00043751828423439456, + "loss": 0.138, + "mean_token_accuracy": 0.9581841826438904, + "num_tokens": 4177189.0, + "step": 473 + }, + { + "entropy": 2.29143089056015, + "epoch": 1.7131221719457015, + "grad_norm": 0.8638074398040771, + "learning_rate": 0.00043718958717437324, + "loss": 0.1432, + "mean_token_accuracy": 0.9645630270242691, + "num_tokens": 4185367.0, + "step": 474 + }, + { + "entropy": 2.2810245156288147, + "epoch": 1.716742081447964, + "grad_norm": 0.6139346957206726, + "learning_rate": 0.00043686029707630097, + "loss": 0.173, + "mean_token_accuracy": 0.9592728316783905, + "num_tokens": 4194418.0, + "step": 475 + }, + { + "entropy": 2.1307725310325623, + "epoch": 1.7203619909502263, + "grad_norm": 0.5192779302597046, + "learning_rate": 0.00043653041519086354, + "loss": 0.1025, + "mean_token_accuracy": 0.970764696598053, + "num_tokens": 4203705.0, + "step": 476 + }, + { + "entropy": 2.160595118999481, + "epoch": 1.7239819004524888, + "grad_norm": 0.7398526668548584, + "learning_rate": 0.0004361999427709943, + "loss": 0.229, + "mean_token_accuracy": 0.9352773874998093, + "num_tokens": 4212648.0, + "step": 477 + }, + { + "entropy": 2.1865442991256714, + "epoch": 1.7276018099547512, + "grad_norm": 0.6227203011512756, + "learning_rate": 0.0004358688810718699, + "loss": 0.1118, + "mean_token_accuracy": 0.9689576476812363, + "num_tokens": 4221208.0, + "step": 478 + }, + { + "entropy": 2.086527943611145, + "epoch": 1.7312217194570136, + "grad_norm": 0.722144603729248, + "learning_rate": 0.00043553723135090447, + "loss": 0.1656, + "mean_token_accuracy": 0.9537550210952759, + "num_tokens": 4230810.0, + "step": 479 + }, + { + "entropy": 2.068355441093445, + "epoch": 1.734841628959276, + "grad_norm": 0.5781517028808594, + "learning_rate": 0.0004352049948677462, + "loss": 0.1497, + "mean_token_accuracy": 0.9600837379693985, + "num_tokens": 4240394.0, + "step": 480 + }, + { + "entropy": 2.185140371322632, + "epoch": 1.7384615384615385, + "grad_norm": 0.7261873483657837, + "learning_rate": 0.0004348721728842715, + "loss": 0.1582, + "mean_token_accuracy": 0.9584025889635086, + "num_tokens": 4249205.0, + "step": 481 + }, + { + "entropy": 2.21835720539093, + "epoch": 1.742081447963801, + "grad_norm": 0.5321667194366455, + "learning_rate": 0.0004345387666645807, + "loss": 0.1344, + "mean_token_accuracy": 0.9659005403518677, + "num_tokens": 4257808.0, + "step": 482 + }, + { + "entropy": 2.078131854534149, + "epoch": 1.7457013574660634, + "grad_norm": 0.5598498582839966, + "learning_rate": 0.00043420477747499307, + "loss": 0.1347, + "mean_token_accuracy": 0.9678008407354355, + "num_tokens": 4266728.0, + "step": 483 + }, + { + "entropy": 2.060504525899887, + "epoch": 1.7493212669683258, + "grad_norm": 0.5017166137695312, + "learning_rate": 0.0004338702065840422, + "loss": 0.0722, + "mean_token_accuracy": 0.9762782007455826, + "num_tokens": 4275514.0, + "step": 484 + }, + { + "entropy": 2.165244698524475, + "epoch": 1.7529411764705882, + "grad_norm": 0.4664002060890198, + "learning_rate": 0.00043353505526247084, + "loss": 0.1206, + "mean_token_accuracy": 0.9696767777204514, + "num_tokens": 4284013.0, + "step": 485 + }, + { + "entropy": 2.103049159049988, + "epoch": 1.7565610859728507, + "grad_norm": 0.6669000387191772, + "learning_rate": 0.0004331993247832265, + "loss": 0.1052, + "mean_token_accuracy": 0.9665459096431732, + "num_tokens": 4293011.0, + "step": 486 + }, + { + "entropy": 2.1286613941192627, + "epoch": 1.760180995475113, + "grad_norm": 0.7821269631385803, + "learning_rate": 0.00043286301642145634, + "loss": 0.3669, + "mean_token_accuracy": 0.9062697291374207, + "num_tokens": 4301965.0, + "step": 487 + }, + { + "entropy": 2.098009169101715, + "epoch": 1.7638009049773755, + "grad_norm": 0.5720731616020203, + "learning_rate": 0.0004325261314545024, + "loss": 0.1324, + "mean_token_accuracy": 0.9650943875312805, + "num_tokens": 4310914.0, + "step": 488 + }, + { + "entropy": 2.164614498615265, + "epoch": 1.767420814479638, + "grad_norm": 1.0500473976135254, + "learning_rate": 0.0004321886711618967, + "loss": 0.1182, + "mean_token_accuracy": 0.9720661342144012, + "num_tokens": 4319072.0, + "step": 489 + }, + { + "entropy": 2.2015402913093567, + "epoch": 1.7710407239819004, + "grad_norm": 0.5770253539085388, + "learning_rate": 0.00043185063682535634, + "loss": 0.1226, + "mean_token_accuracy": 0.9615659862756729, + "num_tokens": 4327539.0, + "step": 490 + }, + { + "entropy": 2.075456440448761, + "epoch": 1.7746606334841628, + "grad_norm": 0.6456925272941589, + "learning_rate": 0.0004315120297287789, + "loss": 0.1123, + "mean_token_accuracy": 0.9628709554672241, + "num_tokens": 4336523.0, + "step": 491 + }, + { + "entropy": 2.158169150352478, + "epoch": 1.7782805429864252, + "grad_norm": 0.8282069563865662, + "learning_rate": 0.00043117285115823733, + "loss": 0.2146, + "mean_token_accuracy": 0.9413971602916718, + "num_tokens": 4345294.0, + "step": 492 + }, + { + "entropy": 2.02735897898674, + "epoch": 1.7819004524886877, + "grad_norm": 0.783597469329834, + "learning_rate": 0.000430833102401975, + "loss": 0.1376, + "mean_token_accuracy": 0.964630737900734, + "num_tokens": 4354107.0, + "step": 493 + }, + { + "entropy": 2.138492166996002, + "epoch": 1.78552036199095, + "grad_norm": 0.6317175030708313, + "learning_rate": 0.000430492784750401, + "loss": 0.1005, + "mean_token_accuracy": 0.9734214246273041, + "num_tokens": 4362560.0, + "step": 494 + }, + { + "entropy": 2.0253217220306396, + "epoch": 1.7891402714932125, + "grad_norm": 0.5523395538330078, + "learning_rate": 0.000430151899496085, + "loss": 0.1633, + "mean_token_accuracy": 0.9558031558990479, + "num_tokens": 4371698.0, + "step": 495 + }, + { + "entropy": 2.160472810268402, + "epoch": 1.792760180995475, + "grad_norm": 0.6557935476303101, + "learning_rate": 0.00042981044793375295, + "loss": 0.1154, + "mean_token_accuracy": 0.9722230583429337, + "num_tokens": 4380612.0, + "step": 496 + }, + { + "entropy": 2.0284159183502197, + "epoch": 1.7963800904977374, + "grad_norm": 0.7357863187789917, + "learning_rate": 0.00042946843136028117, + "loss": 0.1166, + "mean_token_accuracy": 0.9629471153020859, + "num_tokens": 4389521.0, + "step": 497 + }, + { + "entropy": 2.1544791162014008, + "epoch": 1.8, + "grad_norm": 0.5604898929595947, + "learning_rate": 0.00042912585107469226, + "loss": 0.0834, + "mean_token_accuracy": 0.9783036410808563, + "num_tokens": 4398059.0, + "step": 498 + }, + { + "entropy": 2.1051094830036163, + "epoch": 1.8036199095022625, + "grad_norm": 0.4598539173603058, + "learning_rate": 0.0004287827083781497, + "loss": 0.0411, + "mean_token_accuracy": 0.9868490546941757, + "num_tokens": 4406453.0, + "step": 499 + }, + { + "entropy": 2.0219272077083588, + "epoch": 1.807239819004525, + "grad_norm": 0.8164628744125366, + "learning_rate": 0.00042843900457395343, + "loss": 0.1988, + "mean_token_accuracy": 0.9502352625131607, + "num_tokens": 4415440.0, + "step": 500 + }, + { + "entropy": 1.980013906955719, + "epoch": 1.8108597285067873, + "grad_norm": 0.572798490524292, + "learning_rate": 0.0004280947409675341, + "loss": 0.1148, + "mean_token_accuracy": 0.966580331325531, + "num_tokens": 4424532.0, + "step": 501 + }, + { + "entropy": 2.0646563172340393, + "epoch": 1.8144796380090498, + "grad_norm": 0.769386351108551, + "learning_rate": 0.00042774991886644875, + "loss": 0.1592, + "mean_token_accuracy": 0.9553463608026505, + "num_tokens": 4432913.0, + "step": 502 + }, + { + "entropy": 2.040877491235733, + "epoch": 1.8180995475113122, + "grad_norm": 0.7467371821403503, + "learning_rate": 0.0004274045395803758, + "loss": 0.2247, + "mean_token_accuracy": 0.9526964277029037, + "num_tokens": 4441425.0, + "step": 503 + }, + { + "entropy": 1.9934698939323425, + "epoch": 1.8217194570135746, + "grad_norm": 0.6602952480316162, + "learning_rate": 0.00042705860442110964, + "loss": 0.1681, + "mean_token_accuracy": 0.9594631940126419, + "num_tokens": 4450383.0, + "step": 504 + }, + { + "entropy": 2.0858289897441864, + "epoch": 1.825339366515837, + "grad_norm": 0.684380829334259, + "learning_rate": 0.0004267121147025562, + "loss": 0.1154, + "mean_token_accuracy": 0.9638111293315887, + "num_tokens": 4458862.0, + "step": 505 + }, + { + "entropy": 2.0886995792388916, + "epoch": 1.8289592760180997, + "grad_norm": 0.5784837007522583, + "learning_rate": 0.00042636507174072756, + "loss": 0.1026, + "mean_token_accuracy": 0.9676834791898727, + "num_tokens": 4467386.0, + "step": 506 + }, + { + "entropy": 2.0236063301563263, + "epoch": 1.8325791855203621, + "grad_norm": 0.5101180672645569, + "learning_rate": 0.00042601747685373716, + "loss": 0.1031, + "mean_token_accuracy": 0.9734093993902206, + "num_tokens": 4476054.0, + "step": 507 + }, + { + "entropy": 1.9801031053066254, + "epoch": 1.8361990950226246, + "grad_norm": 0.6581607460975647, + "learning_rate": 0.00042566933136179455, + "loss": 0.1548, + "mean_token_accuracy": 0.9581006914377213, + "num_tokens": 4484895.0, + "step": 508 + }, + { + "entropy": 2.0244787633419037, + "epoch": 1.839819004524887, + "grad_norm": 0.8100608587265015, + "learning_rate": 0.0004253206365872008, + "loss": 0.196, + "mean_token_accuracy": 0.9532899260520935, + "num_tokens": 4493737.0, + "step": 509 + }, + { + "entropy": 1.9108119010925293, + "epoch": 1.8434389140271494, + "grad_norm": 0.4903942048549652, + "learning_rate": 0.00042497139385434314, + "loss": 0.1313, + "mean_token_accuracy": 0.9667337089776993, + "num_tokens": 4502840.0, + "step": 510 + }, + { + "entropy": 2.009468197822571, + "epoch": 1.8470588235294119, + "grad_norm": 0.6010113954544067, + "learning_rate": 0.0004246216044896897, + "loss": 0.1013, + "mean_token_accuracy": 0.9692314714193344, + "num_tokens": 4511407.0, + "step": 511 + }, + { + "entropy": 2.0337170362472534, + "epoch": 1.8506787330316743, + "grad_norm": 0.7906802892684937, + "learning_rate": 0.00042427126982178546, + "loss": 0.1682, + "mean_token_accuracy": 0.9550099819898605, + "num_tokens": 4520018.0, + "step": 512 + }, + { + "entropy": 1.8813888728618622, + "epoch": 1.8542986425339367, + "grad_norm": 0.5353080034255981, + "learning_rate": 0.00042392039118124586, + "loss": 0.1228, + "mean_token_accuracy": 0.9624074995517731, + "num_tokens": 4529270.0, + "step": 513 + }, + { + "entropy": 2.012698233127594, + "epoch": 1.8579185520361992, + "grad_norm": 0.6713843941688538, + "learning_rate": 0.00042356896990075285, + "loss": 0.2225, + "mean_token_accuracy": 0.9417333751916885, + "num_tokens": 4538008.0, + "step": 514 + }, + { + "entropy": 1.880586564540863, + "epoch": 1.8615384615384616, + "grad_norm": 0.5821724534034729, + "learning_rate": 0.00042321700731504916, + "loss": 0.1144, + "mean_token_accuracy": 0.9677341282367706, + "num_tokens": 4546950.0, + "step": 515 + }, + { + "entropy": 2.0066279470920563, + "epoch": 1.865158371040724, + "grad_norm": 0.4095056354999542, + "learning_rate": 0.0004228645047609335, + "loss": 0.0424, + "mean_token_accuracy": 0.9854962974786758, + "num_tokens": 4555452.0, + "step": 516 + }, + { + "entropy": 2.042815536260605, + "epoch": 1.8687782805429864, + "grad_norm": 0.5398769974708557, + "learning_rate": 0.0004225114635772555, + "loss": 0.1343, + "mean_token_accuracy": 0.9615450948476791, + "num_tokens": 4564386.0, + "step": 517 + }, + { + "entropy": 2.0948933362960815, + "epoch": 1.8723981900452489, + "grad_norm": 0.6738974452018738, + "learning_rate": 0.0004221578851049107, + "loss": 0.1541, + "mean_token_accuracy": 0.9526563137769699, + "num_tokens": 4573041.0, + "step": 518 + }, + { + "entropy": 2.102545380592346, + "epoch": 1.8760180995475113, + "grad_norm": 0.7769943475723267, + "learning_rate": 0.00042180377068683504, + "loss": 0.2362, + "mean_token_accuracy": 0.9472651779651642, + "num_tokens": 4581666.0, + "step": 519 + }, + { + "entropy": 2.087820291519165, + "epoch": 1.8796380090497737, + "grad_norm": 0.5722424983978271, + "learning_rate": 0.0004214491216680004, + "loss": 0.1657, + "mean_token_accuracy": 0.9537082612514496, + "num_tokens": 4590238.0, + "step": 520 + }, + { + "entropy": 2.0093430876731873, + "epoch": 1.8832579185520362, + "grad_norm": 0.5844932198524475, + "learning_rate": 0.00042109393939540867, + "loss": 0.1485, + "mean_token_accuracy": 0.9624215811491013, + "num_tokens": 4599352.0, + "step": 521 + }, + { + "entropy": 1.9117147326469421, + "epoch": 1.8868778280542986, + "grad_norm": 0.46085676550865173, + "learning_rate": 0.0004207382252180876, + "loss": 0.0853, + "mean_token_accuracy": 0.9769327491521835, + "num_tokens": 4608571.0, + "step": 522 + }, + { + "entropy": 2.0205602943897247, + "epoch": 1.890497737556561, + "grad_norm": 0.5571608543395996, + "learning_rate": 0.000420381980487085, + "loss": 0.1517, + "mean_token_accuracy": 0.9646699875593185, + "num_tokens": 4617445.0, + "step": 523 + }, + { + "entropy": 1.9571953415870667, + "epoch": 1.8941176470588235, + "grad_norm": 0.470630943775177, + "learning_rate": 0.0004200252065554636, + "loss": 0.1005, + "mean_token_accuracy": 0.9750025719404221, + "num_tokens": 4626756.0, + "step": 524 + }, + { + "entropy": 2.063209116458893, + "epoch": 1.897737556561086, + "grad_norm": 0.6447069644927979, + "learning_rate": 0.00041966790477829637, + "loss": 0.113, + "mean_token_accuracy": 0.9695079624652863, + "num_tokens": 4635378.0, + "step": 525 + }, + { + "entropy": 1.9232109785079956, + "epoch": 1.9013574660633483, + "grad_norm": 0.5114295482635498, + "learning_rate": 0.000419310076512661, + "loss": 0.1492, + "mean_token_accuracy": 0.9653338938951492, + "num_tokens": 4644769.0, + "step": 526 + }, + { + "entropy": 2.1691197752952576, + "epoch": 1.9049773755656108, + "grad_norm": 0.7630137205123901, + "learning_rate": 0.00041895172311763476, + "loss": 0.212, + "mean_token_accuracy": 0.9533941894769669, + "num_tokens": 4652857.0, + "step": 527 + }, + { + "entropy": 2.04753240942955, + "epoch": 1.9085972850678732, + "grad_norm": 0.6423042416572571, + "learning_rate": 0.00041859284595428955, + "loss": 0.1455, + "mean_token_accuracy": 0.956505224108696, + "num_tokens": 4661591.0, + "step": 528 + }, + { + "entropy": 1.9440338611602783, + "epoch": 1.9122171945701356, + "grad_norm": 0.5011327266693115, + "learning_rate": 0.00041823344638568656, + "loss": 0.1255, + "mean_token_accuracy": 0.965131089091301, + "num_tokens": 4670594.0, + "step": 529 + }, + { + "entropy": 2.0554805397987366, + "epoch": 1.915837104072398, + "grad_norm": 0.5821590423583984, + "learning_rate": 0.0004178735257768713, + "loss": 0.0486, + "mean_token_accuracy": 0.9875282496213913, + "num_tokens": 4679344.0, + "step": 530 + }, + { + "entropy": 2.130349576473236, + "epoch": 1.9194570135746605, + "grad_norm": 0.5332052111625671, + "learning_rate": 0.0004175130854948679, + "loss": 0.0915, + "mean_token_accuracy": 0.9737034440040588, + "num_tokens": 4687922.0, + "step": 531 + }, + { + "entropy": 2.146788775920868, + "epoch": 1.9230769230769231, + "grad_norm": 0.5016877055168152, + "learning_rate": 0.00041715212690867455, + "loss": 0.1281, + "mean_token_accuracy": 0.9681432545185089, + "num_tokens": 4696593.0, + "step": 532 + }, + { + "entropy": 2.041268438100815, + "epoch": 1.9266968325791856, + "grad_norm": 0.5257729887962341, + "learning_rate": 0.00041679065138925807, + "loss": 0.1272, + "mean_token_accuracy": 0.9649266451597214, + "num_tokens": 4705792.0, + "step": 533 + }, + { + "entropy": 2.114819645881653, + "epoch": 1.930316742081448, + "grad_norm": 0.7085135579109192, + "learning_rate": 0.0004164286603095484, + "loss": 0.1545, + "mean_token_accuracy": 0.9581228941679001, + "num_tokens": 4714599.0, + "step": 534 + }, + { + "entropy": 2.022280514240265, + "epoch": 1.9339366515837104, + "grad_norm": 0.5309014320373535, + "learning_rate": 0.00041606615504443387, + "loss": 0.1933, + "mean_token_accuracy": 0.9562340676784515, + "num_tokens": 4724062.0, + "step": 535 + }, + { + "entropy": 2.0959260165691376, + "epoch": 1.9375565610859729, + "grad_norm": 0.6528061628341675, + "learning_rate": 0.0004157031369707557, + "loss": 0.1306, + "mean_token_accuracy": 0.9612343460321426, + "num_tokens": 4733077.0, + "step": 536 + }, + { + "entropy": 2.2772948145866394, + "epoch": 1.9411764705882353, + "grad_norm": 0.7351471185684204, + "learning_rate": 0.0004153396074673028, + "loss": 0.1494, + "mean_token_accuracy": 0.9608108699321747, + "num_tokens": 4741201.0, + "step": 537 + }, + { + "entropy": 2.0935052037239075, + "epoch": 1.9447963800904977, + "grad_norm": 0.5435840487480164, + "learning_rate": 0.0004149755679148065, + "loss": 0.0884, + "mean_token_accuracy": 0.9745689779520035, + "num_tokens": 4750306.0, + "step": 538 + }, + { + "entropy": 2.2082818746566772, + "epoch": 1.9484162895927601, + "grad_norm": 0.3780331611633301, + "learning_rate": 0.00041461101969593537, + "loss": 0.0739, + "mean_token_accuracy": 0.9777179658412933, + "num_tokens": 4758954.0, + "step": 539 + }, + { + "entropy": 2.1683040261268616, + "epoch": 1.9520361990950226, + "grad_norm": 0.4637961685657501, + "learning_rate": 0.00041424596419529017, + "loss": 0.0632, + "mean_token_accuracy": 0.9834533184766769, + "num_tokens": 4767615.0, + "step": 540 + }, + { + "entropy": 2.075555235147476, + "epoch": 1.9556561085972852, + "grad_norm": 0.7603118419647217, + "learning_rate": 0.00041388040279939804, + "loss": 0.2835, + "mean_token_accuracy": 0.9364205300807953, + "num_tokens": 4776714.0, + "step": 541 + }, + { + "entropy": 2.18926739692688, + "epoch": 1.9592760180995477, + "grad_norm": 0.8895708918571472, + "learning_rate": 0.0004135143368967079, + "loss": 0.2514, + "mean_token_accuracy": 0.9361050724983215, + "num_tokens": 4785402.0, + "step": 542 + }, + { + "entropy": 2.2387169003486633, + "epoch": 1.96289592760181, + "grad_norm": 0.6013544797897339, + "learning_rate": 0.00041314776787758454, + "loss": 0.1502, + "mean_token_accuracy": 0.9594238847494125, + "num_tokens": 4793928.0, + "step": 543 + }, + { + "entropy": 2.208383619785309, + "epoch": 1.9665158371040725, + "grad_norm": 0.6934756636619568, + "learning_rate": 0.00041278069713430386, + "loss": 0.1777, + "mean_token_accuracy": 0.9619583487510681, + "num_tokens": 4802612.0, + "step": 544 + }, + { + "entropy": 2.2621757984161377, + "epoch": 1.970135746606335, + "grad_norm": 0.6920077800750732, + "learning_rate": 0.00041241312606104743, + "loss": 0.1689, + "mean_token_accuracy": 0.9594835937023163, + "num_tokens": 4811332.0, + "step": 545 + }, + { + "entropy": 2.2654454112052917, + "epoch": 1.9737556561085974, + "grad_norm": 0.6259592771530151, + "learning_rate": 0.000412045056053897, + "loss": 0.142, + "mean_token_accuracy": 0.9648078680038452, + "num_tokens": 4820441.0, + "step": 546 + }, + { + "entropy": 2.218056857585907, + "epoch": 1.9773755656108598, + "grad_norm": 0.5390617847442627, + "learning_rate": 0.0004116764885108292, + "loss": 0.1737, + "mean_token_accuracy": 0.9595656991004944, + "num_tokens": 4829437.0, + "step": 547 + }, + { + "entropy": 2.2571592330932617, + "epoch": 1.9809954751131222, + "grad_norm": 0.3656528890132904, + "learning_rate": 0.0004113074248317108, + "loss": 0.0545, + "mean_token_accuracy": 0.9825418293476105, + "num_tokens": 4838118.0, + "step": 548 + }, + { + "entropy": 2.1890549659729004, + "epoch": 1.9846153846153847, + "grad_norm": 0.5716155767440796, + "learning_rate": 0.00041093786641829247, + "loss": 0.0997, + "mean_token_accuracy": 0.9715700745582581, + "num_tokens": 4847073.0, + "step": 549 + }, + { + "entropy": 2.2726192474365234, + "epoch": 1.988235294117647, + "grad_norm": 0.4709530770778656, + "learning_rate": 0.0004105678146742042, + "loss": 0.0746, + "mean_token_accuracy": 0.9799739569425583, + "num_tokens": 4855755.0, + "step": 550 + }, + { + "entropy": 2.2328362464904785, + "epoch": 1.9918552036199095, + "grad_norm": 0.6773779392242432, + "learning_rate": 0.0004101972710049498, + "loss": 0.1418, + "mean_token_accuracy": 0.9629421681165695, + "num_tokens": 4864601.0, + "step": 551 + }, + { + "entropy": 2.199812740087509, + "epoch": 1.995475113122172, + "grad_norm": 0.717012882232666, + "learning_rate": 0.00040982623681790113, + "loss": 0.2948, + "mean_token_accuracy": 0.9432803690433502, + "num_tokens": 4873630.0, + "step": 552 + }, + { + "entropy": 2.2102787494659424, + "epoch": 1.9990950226244344, + "grad_norm": 0.6925314664840698, + "learning_rate": 0.00040945471352229346, + "loss": 0.2579, + "mean_token_accuracy": 0.9435124397277832, + "num_tokens": 4882714.0, + "step": 553 + }, + { + "entropy": 2.3318979740142822, + "epoch": 2.0, + "grad_norm": 2.688188314437866, + "learning_rate": 0.0004090827025292197, + "loss": 0.0283, + "mean_token_accuracy": 0.9918032884597778, + "num_tokens": 4883450.0, + "step": 554 + }, + { + "epoch": 2.0, + "eval_entropy": 2.2165925522160723, + "eval_loss": 0.16817161440849304, + "eval_mean_token_accuracy": 0.9567220133494555, + "eval_num_tokens": 4883450.0, + "eval_runtime": 116.1556, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 1.059, + "step": 554 + } + ], + "logging_steps": 1, + "max_steps": 1662, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.634384518674615e+17, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-554/training_args.bin b/checkpoint-554/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..070a2de135e794840c49e066215a1c9f2e550d1f --- /dev/null +++ b/checkpoint-554/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc271f94ce32216bd6f2ee9866fb7d62a0583bc7ee0c7fa953fa57c302729c6c +size 6289 diff --git a/checkpoint-831/README.md b/checkpoint-831/README.md new file mode 100644 index 0000000000000000000000000000000000000000..58a4061707bcc32db3b543936f6b650c01f3dccb --- /dev/null +++ b/checkpoint-831/README.md @@ -0,0 +1,208 @@ +--- +base_model: openai/gpt-oss-20b +library_name: peft +tags: +- base_model:adapter:openai/gpt-oss-20b +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/checkpoint-831/adapter_config.json b/checkpoint-831/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..076480eaf349cc658de2eb00b26c7360a85f8f56 --- /dev/null +++ b/checkpoint-831/adapter_config.json @@ -0,0 +1,53 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "openai/gpt-oss-20b", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "o_proj", + "q_proj" + ], + "target_parameters": [ + "7.mlp.experts.gate_up_proj", + "7.mlp.experts.down_proj", + "15.mlp.experts.gate_up_proj", + "15.mlp.experts.down_proj", + "23.mlp.experts.gate_up_proj", + "23.mlp.experts.down_proj" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-831/adapter_model.safetensors b/checkpoint-831/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ac580fe569cd71a8bed11b4d04a598d088722b2 --- /dev/null +++ b/checkpoint-831/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cfe71b327a3694c136a8911cfa25b9e95b8231d2caef4f410faf1c618d7b2e8 +size 60189176 diff --git a/checkpoint-831/chat_template.jinja b/checkpoint-831/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc7bb11927d29f653ba2740f2db2c688fd77592f --- /dev/null +++ b/checkpoint-831/chat_template.jinja @@ -0,0 +1,331 @@ +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". + #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties %} + {{- "(_: {\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {%- if param_spec.description %} + {{- "// " + param_spec.description + "\n" }} + {%- endif %} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- ",\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %} + {%- endif %} + {{- model_identity + "\n" }} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {{- "\n\n" }} + {%- endif %} + {%- if tools -%} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {#- Checks to ensure the messages are being passed in the format we expect #} + {%- if "content" in message %} + {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "thinking" in message %} + {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "tool_calls" in message %} + {#- We need very careful handling here - we want to drop the tool call analysis message if the model #} + {#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #} + {#- when we render CoT/analysis messages in inference. #} + {%- set future_final_message = namespace(found=false) %} + {%- for future_message in loop_messages[loop.index:] %} + {%- if future_message.role == 'assistant' and "tool_calls" not in future_message %} + {%- set future_final_message.found = true %} + {%- endif %} + {%- endfor %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content and message.thinking %} + {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }} + {%- elif message.content and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- elif message.thinking and not future_final_message.found %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." + tool_call.name + "<|channel|>commentary " }} + {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {%- if "thinking" in message %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- else %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." + last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- elif message.role == 'user' -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-831/optimizer.pt b/checkpoint-831/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f924f9d1396d3017e3b69868b9ff8d1964541b3d --- /dev/null +++ b/checkpoint-831/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:743bbd61345e198aa10231b3b325dca94a70e3d4761ed03f61ccfc5a59900649 +size 120498699 diff --git a/checkpoint-831/rng_state.pth b/checkpoint-831/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ab68364cdd28cbfdc34c4fcc2e4416efba5ff2a --- /dev/null +++ b/checkpoint-831/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2 +size 14645 diff --git a/checkpoint-831/scheduler.pt b/checkpoint-831/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..712539f459bb6fc6d327475192a0556983a67ac1 --- /dev/null +++ b/checkpoint-831/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2469d195c1e9e54fadde572de5106954f0c6c1e11599b6eee5707e39dbd875c5 +size 1465 diff --git a/checkpoint-831/special_tokens_map.json b/checkpoint-831/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1c47e282982a9c6856832947a72ded329fad2e8c --- /dev/null +++ b/checkpoint-831/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|return|>" +} diff --git a/checkpoint-831/tokenizer.json b/checkpoint-831/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/checkpoint-831/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/checkpoint-831/tokenizer_config.json b/checkpoint-831/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e86f6faa71de0fc3afe47ea8984da9e6138c031c --- /dev/null +++ b/checkpoint-831/tokenizer_config.json @@ -0,0 +1,183 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|return|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-831/trainer_state.json b/checkpoint-831/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..232778781f02dc648cd32bb2a3915bba7575ee98 --- /dev/null +++ b/checkpoint-831/trainer_state.json @@ -0,0 +1,8377 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 831, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.358862280845642, + "epoch": 0.0036199095022624436, + "grad_norm": 2.292628288269043, + "learning_rate": 0.0, + "loss": 0.7311, + "mean_token_accuracy": 0.8534883409738541, + "num_tokens": 9316.0, + "step": 1 + }, + { + "entropy": 2.674945294857025, + "epoch": 0.007239819004524887, + "grad_norm": 3.8950836658477783, + "learning_rate": 1.0219999999999999e-05, + "loss": 1.0621, + "mean_token_accuracy": 0.8183160275220871, + "num_tokens": 17707.0, + "step": 2 + }, + { + "entropy": 2.4915525913238525, + "epoch": 0.01085972850678733, + "grad_norm": 2.792142868041992, + "learning_rate": 2.0439999999999997e-05, + "loss": 0.8448, + "mean_token_accuracy": 0.8489587754011154, + "num_tokens": 26783.0, + "step": 3 + }, + { + "entropy": 2.525622010231018, + "epoch": 0.014479638009049774, + "grad_norm": 2.7071900367736816, + "learning_rate": 3.0659999999999994e-05, + "loss": 0.8847, + "mean_token_accuracy": 0.8486668318510056, + "num_tokens": 35947.0, + "step": 4 + }, + { + "entropy": 2.588509976863861, + "epoch": 0.01809954751131222, + "grad_norm": 2.981574773788452, + "learning_rate": 4.0879999999999995e-05, + "loss": 1.0783, + "mean_token_accuracy": 0.8135111033916473, + "num_tokens": 44505.0, + "step": 5 + }, + { + "entropy": 2.662865400314331, + "epoch": 0.02171945701357466, + "grad_norm": 2.629283905029297, + "learning_rate": 5.1099999999999995e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.8152717798948288, + "num_tokens": 53140.0, + "step": 6 + }, + { + "entropy": 2.6662243604660034, + "epoch": 0.025339366515837104, + "grad_norm": 2.730058431625366, + "learning_rate": 6.131999999999999e-05, + "loss": 0.6982, + "mean_token_accuracy": 0.8552135527133942, + "num_tokens": 61932.0, + "step": 7 + }, + { + "entropy": 2.661384105682373, + "epoch": 0.02895927601809955, + "grad_norm": 2.562839984893799, + "learning_rate": 7.154e-05, + "loss": 0.7296, + "mean_token_accuracy": 0.8579540699720383, + "num_tokens": 70973.0, + "step": 8 + }, + { + "entropy": 2.7889368534088135, + "epoch": 0.03257918552036199, + "grad_norm": 2.8640544414520264, + "learning_rate": 8.175999999999999e-05, + "loss": 0.5965, + "mean_token_accuracy": 0.8638457208871841, + "num_tokens": 79977.0, + "step": 9 + }, + { + "entropy": 2.811532199382782, + "epoch": 0.03619909502262444, + "grad_norm": 2.6199426651000977, + "learning_rate": 9.197999999999998e-05, + "loss": 0.4819, + "mean_token_accuracy": 0.8786454051733017, + "num_tokens": 88915.0, + "step": 10 + }, + { + "entropy": 2.941167712211609, + "epoch": 0.039819004524886875, + "grad_norm": 1.2497272491455078, + "learning_rate": 0.00010219999999999999, + "loss": 0.7192, + "mean_token_accuracy": 0.841494083404541, + "num_tokens": 97749.0, + "step": 11 + }, + { + "entropy": 3.0547962188720703, + "epoch": 0.04343891402714932, + "grad_norm": 1.436136245727539, + "learning_rate": 0.00011241999999999998, + "loss": 0.5908, + "mean_token_accuracy": 0.8657624870538712, + "num_tokens": 106048.0, + "step": 12 + }, + { + "entropy": 2.9914053082466125, + "epoch": 0.047058823529411764, + "grad_norm": 0.9903654456138611, + "learning_rate": 0.00012263999999999998, + "loss": 0.4008, + "mean_token_accuracy": 0.8985499292612076, + "num_tokens": 115216.0, + "step": 13 + }, + { + "entropy": 3.1867465376853943, + "epoch": 0.05067873303167421, + "grad_norm": 1.019572377204895, + "learning_rate": 0.00013286, + "loss": 0.5062, + "mean_token_accuracy": 0.8893097043037415, + "num_tokens": 124040.0, + "step": 14 + }, + { + "entropy": 3.2431325912475586, + "epoch": 0.05429864253393665, + "grad_norm": 1.2394084930419922, + "learning_rate": 0.00014308, + "loss": 0.361, + "mean_token_accuracy": 0.9009967148303986, + "num_tokens": 132447.0, + "step": 15 + }, + { + "entropy": 3.1858643889427185, + "epoch": 0.0579185520361991, + "grad_norm": 0.9859603643417358, + "learning_rate": 0.00015329999999999999, + "loss": 0.4498, + "mean_token_accuracy": 0.887280747294426, + "num_tokens": 141228.0, + "step": 16 + }, + { + "entropy": 3.5029141902923584, + "epoch": 0.06153846153846154, + "grad_norm": 1.453957438468933, + "learning_rate": 0.00016351999999999998, + "loss": 0.4949, + "mean_token_accuracy": 0.888081505894661, + "num_tokens": 149789.0, + "step": 17 + }, + { + "entropy": 3.4572895765304565, + "epoch": 0.06515837104072399, + "grad_norm": 1.390377402305603, + "learning_rate": 0.00017374, + "loss": 0.5449, + "mean_token_accuracy": 0.8745045810937881, + "num_tokens": 157813.0, + "step": 18 + }, + { + "entropy": 3.3081750869750977, + "epoch": 0.06877828054298643, + "grad_norm": 1.1171791553497314, + "learning_rate": 0.00018395999999999997, + "loss": 0.4786, + "mean_token_accuracy": 0.8893420845270157, + "num_tokens": 166315.0, + "step": 19 + }, + { + "entropy": 3.3776715993881226, + "epoch": 0.07239819004524888, + "grad_norm": 1.5567998886108398, + "learning_rate": 0.00019418, + "loss": 0.3669, + "mean_token_accuracy": 0.9146632701158524, + "num_tokens": 175207.0, + "step": 20 + }, + { + "entropy": 3.2677870988845825, + "epoch": 0.0760180995475113, + "grad_norm": 1.7404611110687256, + "learning_rate": 0.00020439999999999998, + "loss": 0.5287, + "mean_token_accuracy": 0.8777483552694321, + "num_tokens": 183833.0, + "step": 21 + }, + { + "entropy": 3.313201069831848, + "epoch": 0.07963800904977375, + "grad_norm": 1.0836979150772095, + "learning_rate": 0.00021461999999999997, + "loss": 0.3014, + "mean_token_accuracy": 0.9215261936187744, + "num_tokens": 192591.0, + "step": 22 + }, + { + "entropy": 3.208672881126404, + "epoch": 0.0832579185520362, + "grad_norm": 1.2197301387786865, + "learning_rate": 0.00022483999999999997, + "loss": 0.4401, + "mean_token_accuracy": 0.9031257778406143, + "num_tokens": 201372.0, + "step": 23 + }, + { + "entropy": 3.1830995082855225, + "epoch": 0.08687782805429864, + "grad_norm": 1.2422229051589966, + "learning_rate": 0.00023506, + "loss": 0.5144, + "mean_token_accuracy": 0.8915928155183792, + "num_tokens": 210348.0, + "step": 24 + }, + { + "entropy": 3.085207223892212, + "epoch": 0.09049773755656108, + "grad_norm": 0.8987624049186707, + "learning_rate": 0.00024527999999999996, + "loss": 0.3253, + "mean_token_accuracy": 0.9221627116203308, + "num_tokens": 219131.0, + "step": 25 + }, + { + "entropy": 3.026031017303467, + "epoch": 0.09411764705882353, + "grad_norm": 1.0273475646972656, + "learning_rate": 0.0002555, + "loss": 0.3495, + "mean_token_accuracy": 0.9147634357213974, + "num_tokens": 228292.0, + "step": 26 + }, + { + "entropy": 3.0420032739639282, + "epoch": 0.09773755656108597, + "grad_norm": 1.0590945482254028, + "learning_rate": 0.00026572, + "loss": 0.4495, + "mean_token_accuracy": 0.9019353687763214, + "num_tokens": 236942.0, + "step": 27 + }, + { + "entropy": 3.0469263792037964, + "epoch": 0.10135746606334842, + "grad_norm": 0.9584959745407104, + "learning_rate": 0.00027594, + "loss": 0.405, + "mean_token_accuracy": 0.9216890782117844, + "num_tokens": 245543.0, + "step": 28 + }, + { + "entropy": 2.92683744430542, + "epoch": 0.10497737556561086, + "grad_norm": 0.8826628923416138, + "learning_rate": 0.00028616, + "loss": 0.4004, + "mean_token_accuracy": 0.9173285663127899, + "num_tokens": 254264.0, + "step": 29 + }, + { + "entropy": 3.0086968541145325, + "epoch": 0.1085972850678733, + "grad_norm": 0.8521863222122192, + "learning_rate": 0.00029637999999999995, + "loss": 0.2876, + "mean_token_accuracy": 0.9335231184959412, + "num_tokens": 263143.0, + "step": 30 + }, + { + "entropy": 2.9086623191833496, + "epoch": 0.11221719457013575, + "grad_norm": 0.7830919623374939, + "learning_rate": 0.00030659999999999997, + "loss": 0.548, + "mean_token_accuracy": 0.8831343650817871, + "num_tokens": 272055.0, + "step": 31 + }, + { + "entropy": 2.9730575680732727, + "epoch": 0.1158371040723982, + "grad_norm": 0.7217472195625305, + "learning_rate": 0.00031682, + "loss": 0.3564, + "mean_token_accuracy": 0.9119151830673218, + "num_tokens": 280971.0, + "step": 32 + }, + { + "entropy": 3.081720530986786, + "epoch": 0.11945701357466064, + "grad_norm": 0.8697704076766968, + "learning_rate": 0.00032703999999999996, + "loss": 0.334, + "mean_token_accuracy": 0.9234935492277145, + "num_tokens": 289449.0, + "step": 33 + }, + { + "entropy": 3.1043431162834167, + "epoch": 0.12307692307692308, + "grad_norm": 0.7962514758110046, + "learning_rate": 0.00033726, + "loss": 0.1602, + "mean_token_accuracy": 0.9554370939731598, + "num_tokens": 297804.0, + "step": 34 + }, + { + "entropy": 3.0275490283966064, + "epoch": 0.12669683257918551, + "grad_norm": 0.5887104272842407, + "learning_rate": 0.00034748, + "loss": 0.2254, + "mean_token_accuracy": 0.9491932094097137, + "num_tokens": 306589.0, + "step": 35 + }, + { + "entropy": 3.099652886390686, + "epoch": 0.13031674208144797, + "grad_norm": 0.894397497177124, + "learning_rate": 0.00035769999999999997, + "loss": 0.6397, + "mean_token_accuracy": 0.8802188038825989, + "num_tokens": 315534.0, + "step": 36 + }, + { + "entropy": 3.0312134623527527, + "epoch": 0.1339366515837104, + "grad_norm": 0.6374682188034058, + "learning_rate": 0.00036791999999999993, + "loss": 0.2183, + "mean_token_accuracy": 0.9478497952222824, + "num_tokens": 324492.0, + "step": 37 + }, + { + "entropy": 3.28497713804245, + "epoch": 0.13755656108597286, + "grad_norm": 0.6740968823432922, + "learning_rate": 0.00037813999999999995, + "loss": 0.3619, + "mean_token_accuracy": 0.9288723170757294, + "num_tokens": 333195.0, + "step": 38 + }, + { + "entropy": 3.1478323340415955, + "epoch": 0.1411764705882353, + "grad_norm": 0.7235494256019592, + "learning_rate": 0.00038836, + "loss": 0.324, + "mean_token_accuracy": 0.9179254025220871, + "num_tokens": 342028.0, + "step": 39 + }, + { + "entropy": 3.279879152774811, + "epoch": 0.14479638009049775, + "grad_norm": 0.7512595653533936, + "learning_rate": 0.00039858, + "loss": 0.4804, + "mean_token_accuracy": 0.889826312661171, + "num_tokens": 350902.0, + "step": 40 + }, + { + "entropy": 3.173546612262726, + "epoch": 0.14841628959276018, + "grad_norm": 0.6978861689567566, + "learning_rate": 0.00040879999999999996, + "loss": 0.3442, + "mean_token_accuracy": 0.9205169230699539, + "num_tokens": 359787.0, + "step": 41 + }, + { + "entropy": 3.2385765314102173, + "epoch": 0.1520361990950226, + "grad_norm": 0.8108944892883301, + "learning_rate": 0.00041901999999999993, + "loss": 0.4223, + "mean_token_accuracy": 0.8979178965091705, + "num_tokens": 368426.0, + "step": 42 + }, + { + "entropy": 3.146568477153778, + "epoch": 0.15565610859728507, + "grad_norm": 0.5847787261009216, + "learning_rate": 0.00042923999999999995, + "loss": 0.1953, + "mean_token_accuracy": 0.9556037336587906, + "num_tokens": 377349.0, + "step": 43 + }, + { + "entropy": 3.066233277320862, + "epoch": 0.1592760180995475, + "grad_norm": 0.7887329459190369, + "learning_rate": 0.00043945999999999997, + "loss": 0.6815, + "mean_token_accuracy": 0.8654293268918991, + "num_tokens": 386603.0, + "step": 44 + }, + { + "entropy": 3.1745981574058533, + "epoch": 0.16289592760180996, + "grad_norm": 0.7280165553092957, + "learning_rate": 0.00044967999999999994, + "loss": 0.1932, + "mean_token_accuracy": 0.9479279220104218, + "num_tokens": 395070.0, + "step": 45 + }, + { + "entropy": 3.1094446182250977, + "epoch": 0.1665158371040724, + "grad_norm": 0.6453448534011841, + "learning_rate": 0.00045989999999999996, + "loss": 0.2608, + "mean_token_accuracy": 0.9249396026134491, + "num_tokens": 403651.0, + "step": 46 + }, + { + "entropy": 2.9050925970077515, + "epoch": 0.17013574660633485, + "grad_norm": 0.6689278483390808, + "learning_rate": 0.00047012, + "loss": 0.4489, + "mean_token_accuracy": 0.898686870932579, + "num_tokens": 412898.0, + "step": 47 + }, + { + "entropy": 3.2239145040512085, + "epoch": 0.17375565610859728, + "grad_norm": 1.0014020204544067, + "learning_rate": 0.00048033999999999994, + "loss": 0.3234, + "mean_token_accuracy": 0.9231891483068466, + "num_tokens": 421420.0, + "step": 48 + }, + { + "entropy": 3.035899817943573, + "epoch": 0.17737556561085974, + "grad_norm": 0.6415768265724182, + "learning_rate": 0.0004905599999999999, + "loss": 0.2259, + "mean_token_accuracy": 0.9447792917490005, + "num_tokens": 430258.0, + "step": 49 + }, + { + "entropy": 3.057477653026581, + "epoch": 0.18099547511312217, + "grad_norm": 0.6042271256446838, + "learning_rate": 0.0005007799999999999, + "loss": 0.2228, + "mean_token_accuracy": 0.9473378211259842, + "num_tokens": 439593.0, + "step": 50 + }, + { + "entropy": 2.8375911116600037, + "epoch": 0.18461538461538463, + "grad_norm": 0.739811897277832, + "learning_rate": 0.000511, + "loss": 0.3623, + "mean_token_accuracy": 0.9050924181938171, + "num_tokens": 449056.0, + "step": 51 + }, + { + "entropy": 2.9926682114601135, + "epoch": 0.18823529411764706, + "grad_norm": 0.6637321710586548, + "learning_rate": 0.0005109995633102972, + "loss": 0.2924, + "mean_token_accuracy": 0.9397273659706116, + "num_tokens": 457677.0, + "step": 52 + }, + { + "entropy": 2.7932987809181213, + "epoch": 0.19185520361990951, + "grad_norm": 0.5666584372520447, + "learning_rate": 0.0005109982532428477, + "loss": 0.2055, + "mean_token_accuracy": 0.9385408014059067, + "num_tokens": 466969.0, + "step": 53 + }, + { + "entropy": 2.765812337398529, + "epoch": 0.19547511312217195, + "grad_norm": 0.7875120639801025, + "learning_rate": 0.0005109960698026271, + "loss": 0.4549, + "mean_token_accuracy": 0.9052814990282059, + "num_tokens": 476285.0, + "step": 54 + }, + { + "entropy": 2.884207248687744, + "epoch": 0.19909502262443438, + "grad_norm": 0.7538661956787109, + "learning_rate": 0.0005109930129979285, + "loss": 0.3751, + "mean_token_accuracy": 0.9210246652364731, + "num_tokens": 484668.0, + "step": 55 + }, + { + "entropy": 2.779718518257141, + "epoch": 0.20271493212669683, + "grad_norm": 0.8069296479225159, + "learning_rate": 0.0005109890828403621, + "loss": 0.3664, + "mean_token_accuracy": 0.9219843596220016, + "num_tokens": 493292.0, + "step": 56 + }, + { + "entropy": 2.841543674468994, + "epoch": 0.20633484162895926, + "grad_norm": 0.5545904636383057, + "learning_rate": 0.0005109842793448548, + "loss": 0.1973, + "mean_token_accuracy": 0.9547395706176758, + "num_tokens": 501973.0, + "step": 57 + }, + { + "entropy": 2.8180030584335327, + "epoch": 0.20995475113122172, + "grad_norm": 1.015456199645996, + "learning_rate": 0.0005109786025296513, + "loss": 0.6019, + "mean_token_accuracy": 0.88613361120224, + "num_tokens": 510840.0, + "step": 58 + }, + { + "entropy": 2.7450912594795227, + "epoch": 0.21357466063348415, + "grad_norm": 0.6784740686416626, + "learning_rate": 0.0005109720524163127, + "loss": 0.2868, + "mean_token_accuracy": 0.9295425117015839, + "num_tokens": 519656.0, + "step": 59 + }, + { + "entropy": 2.822400987148285, + "epoch": 0.2171945701357466, + "grad_norm": 0.8780149817466736, + "learning_rate": 0.000510964629029717, + "loss": 0.4371, + "mean_token_accuracy": 0.9089596569538116, + "num_tokens": 528105.0, + "step": 60 + }, + { + "entropy": 2.522100865840912, + "epoch": 0.22081447963800904, + "grad_norm": 0.51394122838974, + "learning_rate": 0.0005109563323980594, + "loss": 0.2509, + "mean_token_accuracy": 0.941976860165596, + "num_tokens": 537707.0, + "step": 61 + }, + { + "entropy": 2.6596657633781433, + "epoch": 0.2244343891402715, + "grad_norm": 0.6359816789627075, + "learning_rate": 0.0005109471625528516, + "loss": 0.3685, + "mean_token_accuracy": 0.9191890209913254, + "num_tokens": 546517.0, + "step": 62 + }, + { + "entropy": 2.800311803817749, + "epoch": 0.22805429864253393, + "grad_norm": 0.6862941980361938, + "learning_rate": 0.0005109371195289215, + "loss": 0.2457, + "mean_token_accuracy": 0.9330879002809525, + "num_tokens": 555493.0, + "step": 63 + }, + { + "entropy": 2.7235344648361206, + "epoch": 0.2316742081447964, + "grad_norm": 1.0464682579040527, + "learning_rate": 0.0005109262033644142, + "loss": 0.4417, + "mean_token_accuracy": 0.8957678377628326, + "num_tokens": 564255.0, + "step": 64 + }, + { + "entropy": 2.6643534302711487, + "epoch": 0.23529411764705882, + "grad_norm": 1.0790019035339355, + "learning_rate": 0.0005109144141007903, + "loss": 0.4947, + "mean_token_accuracy": 0.8889007717370987, + "num_tokens": 573401.0, + "step": 65 + }, + { + "entropy": 2.760925054550171, + "epoch": 0.23891402714932128, + "grad_norm": 0.7957189679145813, + "learning_rate": 0.0005109017517828273, + "loss": 0.2259, + "mean_token_accuracy": 0.944578230381012, + "num_tokens": 581905.0, + "step": 66 + }, + { + "entropy": 2.7048792839050293, + "epoch": 0.2425339366515837, + "grad_norm": 0.9530714750289917, + "learning_rate": 0.0005108882164586181, + "loss": 0.3122, + "mean_token_accuracy": 0.9257418513298035, + "num_tokens": 590802.0, + "step": 67 + }, + { + "entropy": 2.6733291149139404, + "epoch": 0.24615384615384617, + "grad_norm": 0.8295993208885193, + "learning_rate": 0.0005108738081795716, + "loss": 0.3701, + "mean_token_accuracy": 0.898589238524437, + "num_tokens": 599279.0, + "step": 68 + }, + { + "entropy": 2.5613606572151184, + "epoch": 0.2497737556561086, + "grad_norm": 0.6205935478210449, + "learning_rate": 0.0005108585270004123, + "loss": 0.4372, + "mean_token_accuracy": 0.9116007685661316, + "num_tokens": 608107.0, + "step": 69 + }, + { + "entropy": 2.458296835422516, + "epoch": 0.25339366515837103, + "grad_norm": 0.7629838585853577, + "learning_rate": 0.0005108423729791799, + "loss": 0.2307, + "mean_token_accuracy": 0.9386163502931595, + "num_tokens": 616881.0, + "step": 70 + }, + { + "entropy": 2.4176695346832275, + "epoch": 0.25701357466063346, + "grad_norm": 0.902400016784668, + "learning_rate": 0.0005108253461772298, + "loss": 0.2853, + "mean_token_accuracy": 0.9237343072891235, + "num_tokens": 625323.0, + "step": 71 + }, + { + "entropy": 2.2265281677246094, + "epoch": 0.26063348416289595, + "grad_norm": 0.7744383811950684, + "learning_rate": 0.0005108074466592316, + "loss": 0.2435, + "mean_token_accuracy": 0.9508260935544968, + "num_tokens": 634260.0, + "step": 72 + }, + { + "entropy": 2.1855952441692352, + "epoch": 0.2642533936651584, + "grad_norm": 0.8615190386772156, + "learning_rate": 0.0005107886744931702, + "loss": 0.3323, + "mean_token_accuracy": 0.9276078194379807, + "num_tokens": 643235.0, + "step": 73 + }, + { + "entropy": 2.179121494293213, + "epoch": 0.2678733031674208, + "grad_norm": 0.8953279256820679, + "learning_rate": 0.0005107690297503444, + "loss": 0.2384, + "mean_token_accuracy": 0.9425230622291565, + "num_tokens": 652032.0, + "step": 74 + }, + { + "entropy": 2.1565526127815247, + "epoch": 0.27149321266968324, + "grad_norm": 0.6830486059188843, + "learning_rate": 0.0005107485125053678, + "loss": 0.2759, + "mean_token_accuracy": 0.9360661953687668, + "num_tokens": 660978.0, + "step": 75 + }, + { + "entropy": 2.0900665521621704, + "epoch": 0.2751131221719457, + "grad_norm": 0.786665141582489, + "learning_rate": 0.0005107271228361672, + "loss": 0.4061, + "mean_token_accuracy": 0.910009115934372, + "num_tokens": 669817.0, + "step": 76 + }, + { + "entropy": 2.1311859488487244, + "epoch": 0.27873303167420815, + "grad_norm": 0.6399909853935242, + "learning_rate": 0.0005107048608239836, + "loss": 0.272, + "mean_token_accuracy": 0.9424714297056198, + "num_tokens": 678469.0, + "step": 77 + }, + { + "entropy": 2.059997320175171, + "epoch": 0.2823529411764706, + "grad_norm": 0.8114754557609558, + "learning_rate": 0.0005106817265533706, + "loss": 0.4029, + "mean_token_accuracy": 0.9037660360336304, + "num_tokens": 687261.0, + "step": 78 + }, + { + "entropy": 1.9725019037723541, + "epoch": 0.285972850678733, + "grad_norm": 0.9420941472053528, + "learning_rate": 0.0005106577201121952, + "loss": 0.535, + "mean_token_accuracy": 0.8996377140283585, + "num_tokens": 695941.0, + "step": 79 + }, + { + "entropy": 1.9951164424419403, + "epoch": 0.2895927601809955, + "grad_norm": 0.6476142406463623, + "learning_rate": 0.0005106328415916372, + "loss": 0.2242, + "mean_token_accuracy": 0.941379725933075, + "num_tokens": 704643.0, + "step": 80 + }, + { + "entropy": 1.8962564170360565, + "epoch": 0.29321266968325793, + "grad_norm": 0.5974630117416382, + "learning_rate": 0.0005106070910861881, + "loss": 0.2934, + "mean_token_accuracy": 0.9217697530984879, + "num_tokens": 713605.0, + "step": 81 + }, + { + "entropy": 1.9781515896320343, + "epoch": 0.29683257918552036, + "grad_norm": 0.8755478262901306, + "learning_rate": 0.0005105804686936518, + "loss": 0.4551, + "mean_token_accuracy": 0.9051328897476196, + "num_tokens": 722385.0, + "step": 82 + }, + { + "entropy": 1.9892418384552002, + "epoch": 0.3004524886877828, + "grad_norm": 0.6887345314025879, + "learning_rate": 0.0005105529745151433, + "loss": 0.244, + "mean_token_accuracy": 0.9261117279529572, + "num_tokens": 730962.0, + "step": 83 + }, + { + "entropy": 2.0053181648254395, + "epoch": 0.3040723981900452, + "grad_norm": 0.6930885910987854, + "learning_rate": 0.0005105246086550893, + "loss": 0.3155, + "mean_token_accuracy": 0.9206147193908691, + "num_tokens": 739499.0, + "step": 84 + }, + { + "entropy": 1.9716475903987885, + "epoch": 0.3076923076923077, + "grad_norm": 0.5049461722373962, + "learning_rate": 0.0005104953712212266, + "loss": 0.2215, + "mean_token_accuracy": 0.9608763605356216, + "num_tokens": 748604.0, + "step": 85 + }, + { + "entropy": 1.9186978042125702, + "epoch": 0.31131221719457014, + "grad_norm": 0.5756685733795166, + "learning_rate": 0.000510465262324603, + "loss": 0.2658, + "mean_token_accuracy": 0.9372887462377548, + "num_tokens": 757919.0, + "step": 86 + }, + { + "entropy": 1.9738290905952454, + "epoch": 0.31493212669683257, + "grad_norm": 0.6163789629936218, + "learning_rate": 0.0005104342820795758, + "loss": 0.2472, + "mean_token_accuracy": 0.9430449157953262, + "num_tokens": 766708.0, + "step": 87 + }, + { + "entropy": 2.1927571892738342, + "epoch": 0.318552036199095, + "grad_norm": 0.7953162789344788, + "learning_rate": 0.0005104024306038119, + "loss": 0.261, + "mean_token_accuracy": 0.9425829648971558, + "num_tokens": 774601.0, + "step": 88 + }, + { + "entropy": 2.043731451034546, + "epoch": 0.3221719457013575, + "grad_norm": 0.8098088502883911, + "learning_rate": 0.0005103697080182872, + "loss": 0.3126, + "mean_token_accuracy": 0.9158089309930801, + "num_tokens": 783170.0, + "step": 89 + }, + { + "entropy": 1.9801572561264038, + "epoch": 0.3257918552036199, + "grad_norm": 0.5227240920066833, + "learning_rate": 0.0005103361144472864, + "loss": 0.1291, + "mean_token_accuracy": 0.9666071832180023, + "num_tokens": 791769.0, + "step": 90 + }, + { + "entropy": 1.9553790986537933, + "epoch": 0.32941176470588235, + "grad_norm": 0.7819464206695557, + "learning_rate": 0.0005103016500184022, + "loss": 0.531, + "mean_token_accuracy": 0.8817111849784851, + "num_tokens": 800824.0, + "step": 91 + }, + { + "entropy": 1.9291303753852844, + "epoch": 0.3330316742081448, + "grad_norm": 0.7178757190704346, + "learning_rate": 0.0005102663148625347, + "loss": 0.3301, + "mean_token_accuracy": 0.9357631802558899, + "num_tokens": 809347.0, + "step": 92 + }, + { + "entropy": 1.9846041798591614, + "epoch": 0.33665158371040727, + "grad_norm": 1.316636085510254, + "learning_rate": 0.0005102301091138916, + "loss": 0.4241, + "mean_token_accuracy": 0.8993304669857025, + "num_tokens": 817174.0, + "step": 93 + }, + { + "entropy": 1.814637303352356, + "epoch": 0.3402714932126697, + "grad_norm": 0.5486414432525635, + "learning_rate": 0.0005101930329099865, + "loss": 0.116, + "mean_token_accuracy": 0.9674727618694305, + "num_tokens": 826177.0, + "step": 94 + }, + { + "entropy": 1.9128066003322601, + "epoch": 0.3438914027149321, + "grad_norm": 0.620303750038147, + "learning_rate": 0.00051015508639164, + "loss": 0.1833, + "mean_token_accuracy": 0.9569521993398666, + "num_tokens": 835409.0, + "step": 95 + }, + { + "entropy": 1.7541870176792145, + "epoch": 0.34751131221719456, + "grad_norm": 0.8337438702583313, + "learning_rate": 0.0005101162697029776, + "loss": 0.3327, + "mean_token_accuracy": 0.9193180054426193, + "num_tokens": 844692.0, + "step": 96 + }, + { + "entropy": 1.8255240619182587, + "epoch": 0.351131221719457, + "grad_norm": 0.877780556678772, + "learning_rate": 0.00051007658299143, + "loss": 0.2106, + "mean_token_accuracy": 0.9527023881673813, + "num_tokens": 853309.0, + "step": 97 + }, + { + "entropy": 1.8611579239368439, + "epoch": 0.3547511312217195, + "grad_norm": 1.0667716264724731, + "learning_rate": 0.0005100360264077325, + "loss": 0.3196, + "mean_token_accuracy": 0.9195879399776459, + "num_tokens": 861859.0, + "step": 98 + }, + { + "entropy": 1.821915864944458, + "epoch": 0.3583710407239819, + "grad_norm": 0.8400309681892395, + "learning_rate": 0.0005099946001059241, + "loss": 0.4036, + "mean_token_accuracy": 0.8951036781072617, + "num_tokens": 871060.0, + "step": 99 + }, + { + "entropy": 1.7648265063762665, + "epoch": 0.36199095022624433, + "grad_norm": 1.1391404867172241, + "learning_rate": 0.0005099523042433472, + "loss": 0.389, + "mean_token_accuracy": 0.901309460401535, + "num_tokens": 880593.0, + "step": 100 + }, + { + "entropy": 1.8506875336170197, + "epoch": 0.36561085972850677, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.000509909138980647, + "loss": 0.2504, + "mean_token_accuracy": 0.9384842216968536, + "num_tokens": 889739.0, + "step": 101 + }, + { + "entropy": 1.9311015605926514, + "epoch": 0.36923076923076925, + "grad_norm": 0.9677391052246094, + "learning_rate": 0.0005098651044817704, + "loss": 0.6953, + "mean_token_accuracy": 0.8752655684947968, + "num_tokens": 898992.0, + "step": 102 + }, + { + "entropy": 1.9590983986854553, + "epoch": 0.3728506787330317, + "grad_norm": 0.6364567279815674, + "learning_rate": 0.0005098202009139663, + "loss": 0.4318, + "mean_token_accuracy": 0.9056479930877686, + "num_tokens": 908225.0, + "step": 103 + }, + { + "entropy": 1.9455370008945465, + "epoch": 0.3764705882352941, + "grad_norm": 0.6747863292694092, + "learning_rate": 0.0005097744284477839, + "loss": 0.244, + "mean_token_accuracy": 0.9428392052650452, + "num_tokens": 917134.0, + "step": 104 + }, + { + "entropy": 1.8632825911045074, + "epoch": 0.38009049773755654, + "grad_norm": 0.5705651044845581, + "learning_rate": 0.0005097277872570731, + "loss": 0.2508, + "mean_token_accuracy": 0.9325222969055176, + "num_tokens": 926573.0, + "step": 105 + }, + { + "entropy": 1.9370323717594147, + "epoch": 0.38371040723981903, + "grad_norm": 0.6298627853393555, + "learning_rate": 0.000509680277518983, + "loss": 0.2481, + "mean_token_accuracy": 0.9281332045793533, + "num_tokens": 935853.0, + "step": 106 + }, + { + "entropy": 2.0217572450637817, + "epoch": 0.38733031674208146, + "grad_norm": 0.5434353947639465, + "learning_rate": 0.0005096318994139617, + "loss": 0.1809, + "mean_token_accuracy": 0.9592084139585495, + "num_tokens": 944279.0, + "step": 107 + }, + { + "entropy": 1.9619770646095276, + "epoch": 0.3909502262443439, + "grad_norm": 0.6959638595581055, + "learning_rate": 0.0005095826531257552, + "loss": 0.1376, + "mean_token_accuracy": 0.9608310014009476, + "num_tokens": 953336.0, + "step": 108 + }, + { + "entropy": 2.12511146068573, + "epoch": 0.3945701357466063, + "grad_norm": 1.0152848958969116, + "learning_rate": 0.0005095325388414074, + "loss": 0.4382, + "mean_token_accuracy": 0.915201798081398, + "num_tokens": 962002.0, + "step": 109 + }, + { + "entropy": 2.0171878039836884, + "epoch": 0.39819004524886875, + "grad_norm": 0.8337467312812805, + "learning_rate": 0.0005094815567512587, + "loss": 0.2672, + "mean_token_accuracy": 0.9313560128211975, + "num_tokens": 970954.0, + "step": 110 + }, + { + "entropy": 2.1024146378040314, + "epoch": 0.40180995475113124, + "grad_norm": 0.8214333057403564, + "learning_rate": 0.0005094297070489455, + "loss": 0.3146, + "mean_token_accuracy": 0.9289091974496841, + "num_tokens": 979929.0, + "step": 111 + }, + { + "entropy": 2.260519325733185, + "epoch": 0.40542986425339367, + "grad_norm": 1.1298810243606567, + "learning_rate": 0.0005093769899313996, + "loss": 0.3055, + "mean_token_accuracy": 0.9213490188121796, + "num_tokens": 988477.0, + "step": 112 + }, + { + "entropy": 2.2228699326515198, + "epoch": 0.4090497737556561, + "grad_norm": 0.8601953983306885, + "learning_rate": 0.0005093234055988475, + "loss": 0.2738, + "mean_token_accuracy": 0.920888364315033, + "num_tokens": 997091.0, + "step": 113 + }, + { + "entropy": 2.2165185809135437, + "epoch": 0.41266968325791853, + "grad_norm": 0.6331561803817749, + "learning_rate": 0.0005092689542548091, + "loss": 0.2241, + "mean_token_accuracy": 0.9408514499664307, + "num_tokens": 1005866.0, + "step": 114 + }, + { + "entropy": 2.324040472507477, + "epoch": 0.416289592760181, + "grad_norm": 0.680496096611023, + "learning_rate": 0.0005092136361060975, + "loss": 0.2454, + "mean_token_accuracy": 0.9433349967002869, + "num_tokens": 1014277.0, + "step": 115 + }, + { + "entropy": 2.413789749145508, + "epoch": 0.41990950226244345, + "grad_norm": 0.7489557862281799, + "learning_rate": 0.0005091574513628183, + "loss": 0.2856, + "mean_token_accuracy": 0.934124082326889, + "num_tokens": 1023032.0, + "step": 116 + }, + { + "entropy": 2.4693005681037903, + "epoch": 0.4235294117647059, + "grad_norm": 0.6842612624168396, + "learning_rate": 0.0005091004002383682, + "loss": 0.2778, + "mean_token_accuracy": 0.9386793673038483, + "num_tokens": 1031883.0, + "step": 117 + }, + { + "entropy": 2.4351969361305237, + "epoch": 0.4271493212669683, + "grad_norm": 0.9150674343109131, + "learning_rate": 0.0005090424829494347, + "loss": 0.3151, + "mean_token_accuracy": 0.9177709072828293, + "num_tokens": 1040985.0, + "step": 118 + }, + { + "entropy": 2.5141562819480896, + "epoch": 0.4307692307692308, + "grad_norm": 1.0200655460357666, + "learning_rate": 0.000508983699715995, + "loss": 0.5134, + "mean_token_accuracy": 0.8835459351539612, + "num_tokens": 1049949.0, + "step": 119 + }, + { + "entropy": 2.479240596294403, + "epoch": 0.4343891402714932, + "grad_norm": 0.783278226852417, + "learning_rate": 0.0005089240507613151, + "loss": 0.2745, + "mean_token_accuracy": 0.9389322698116302, + "num_tokens": 1058953.0, + "step": 120 + }, + { + "entropy": 2.457803785800934, + "epoch": 0.43800904977375565, + "grad_norm": 0.7620834112167358, + "learning_rate": 0.0005088635363119497, + "loss": 0.3394, + "mean_token_accuracy": 0.9145695865154266, + "num_tokens": 1068624.0, + "step": 121 + }, + { + "entropy": 2.4909247756004333, + "epoch": 0.4416289592760181, + "grad_norm": 0.5868712067604065, + "learning_rate": 0.0005088021565977403, + "loss": 0.1726, + "mean_token_accuracy": 0.9567564129829407, + "num_tokens": 1077686.0, + "step": 122 + }, + { + "entropy": 2.5540462732315063, + "epoch": 0.4452488687782805, + "grad_norm": 1.1467291116714478, + "learning_rate": 0.0005087399118518148, + "loss": 0.2617, + "mean_token_accuracy": 0.9329706132411957, + "num_tokens": 1086230.0, + "step": 123 + }, + { + "entropy": 2.377680242061615, + "epoch": 0.448868778280543, + "grad_norm": 0.7021825909614563, + "learning_rate": 0.0005086768023105866, + "loss": 0.4124, + "mean_token_accuracy": 0.9093360006809235, + "num_tokens": 1095867.0, + "step": 124 + }, + { + "entropy": 2.55239599943161, + "epoch": 0.45248868778280543, + "grad_norm": 0.5947801470756531, + "learning_rate": 0.0005086128282137538, + "loss": 0.2752, + "mean_token_accuracy": 0.9248816668987274, + "num_tokens": 1105003.0, + "step": 125 + }, + { + "entropy": 2.4695483446121216, + "epoch": 0.45610859728506786, + "grad_norm": 1.345604658126831, + "learning_rate": 0.0005085479898042985, + "loss": 0.2577, + "mean_token_accuracy": 0.9318550229072571, + "num_tokens": 1114162.0, + "step": 126 + }, + { + "entropy": 2.4898732900619507, + "epoch": 0.4597285067873303, + "grad_norm": 0.8534179329872131, + "learning_rate": 0.0005084822873284848, + "loss": 0.3013, + "mean_token_accuracy": 0.9195661097764969, + "num_tokens": 1123457.0, + "step": 127 + }, + { + "entropy": 2.5951223969459534, + "epoch": 0.4633484162895928, + "grad_norm": 1.1677368879318237, + "learning_rate": 0.0005084157210358592, + "loss": 0.1612, + "mean_token_accuracy": 0.9599333852529526, + "num_tokens": 1131774.0, + "step": 128 + }, + { + "entropy": 2.7315847873687744, + "epoch": 0.4669683257918552, + "grad_norm": 0.7633224129676819, + "learning_rate": 0.0005083482911792492, + "loss": 0.2437, + "mean_token_accuracy": 0.9487509876489639, + "num_tokens": 1140301.0, + "step": 129 + }, + { + "entropy": 2.6348633766174316, + "epoch": 0.47058823529411764, + "grad_norm": 0.7573317885398865, + "learning_rate": 0.0005082799980147617, + "loss": 0.2426, + "mean_token_accuracy": 0.947308748960495, + "num_tokens": 1148929.0, + "step": 130 + }, + { + "entropy": 2.60002738237381, + "epoch": 0.47420814479638007, + "grad_norm": 1.8195319175720215, + "learning_rate": 0.0005082108418017829, + "loss": 0.1792, + "mean_token_accuracy": 0.9512491375207901, + "num_tokens": 1157682.0, + "step": 131 + }, + { + "entropy": 2.5319923162460327, + "epoch": 0.47782805429864256, + "grad_norm": 0.6342993378639221, + "learning_rate": 0.0005081408228029771, + "loss": 0.1843, + "mean_token_accuracy": 0.9440758228302002, + "num_tokens": 1166687.0, + "step": 132 + }, + { + "entropy": 2.5666881799697876, + "epoch": 0.481447963800905, + "grad_norm": 0.8979415893554688, + "learning_rate": 0.0005080699412842852, + "loss": 0.4824, + "mean_token_accuracy": 0.8837443292140961, + "num_tokens": 1175746.0, + "step": 133 + }, + { + "entropy": 2.6854636669158936, + "epoch": 0.4850678733031674, + "grad_norm": 0.8302125334739685, + "learning_rate": 0.0005079981975149243, + "loss": 0.267, + "mean_token_accuracy": 0.9279022663831711, + "num_tokens": 1184196.0, + "step": 134 + }, + { + "entropy": 2.564552128314972, + "epoch": 0.48868778280542985, + "grad_norm": 0.6785959005355835, + "learning_rate": 0.0005079255917673863, + "loss": 0.2031, + "mean_token_accuracy": 0.9463823586702347, + "num_tokens": 1192982.0, + "step": 135 + }, + { + "entropy": 2.673682928085327, + "epoch": 0.49230769230769234, + "grad_norm": 1.4760410785675049, + "learning_rate": 0.0005078521243174371, + "loss": 0.4791, + "mean_token_accuracy": 0.8969505727291107, + "num_tokens": 1201454.0, + "step": 136 + }, + { + "entropy": 2.6232714653015137, + "epoch": 0.49592760180995477, + "grad_norm": 0.7845668792724609, + "learning_rate": 0.0005077777954441157, + "loss": 0.2472, + "mean_token_accuracy": 0.9404618591070175, + "num_tokens": 1210182.0, + "step": 137 + }, + { + "entropy": 2.5614060163497925, + "epoch": 0.4995475113122172, + "grad_norm": 0.725419819355011, + "learning_rate": 0.0005077026054297322, + "loss": 0.3643, + "mean_token_accuracy": 0.9193316847085953, + "num_tokens": 1219487.0, + "step": 138 + }, + { + "entropy": 2.5907246470451355, + "epoch": 0.5031674208144796, + "grad_norm": 0.7741782665252686, + "learning_rate": 0.0005076265545598682, + "loss": 0.276, + "mean_token_accuracy": 0.9447730481624603, + "num_tokens": 1228066.0, + "step": 139 + }, + { + "entropy": 2.531104028224945, + "epoch": 0.5067873303167421, + "grad_norm": 0.680992603302002, + "learning_rate": 0.0005075496431233745, + "loss": 0.2004, + "mean_token_accuracy": 0.9470729678869247, + "num_tokens": 1236980.0, + "step": 140 + }, + { + "entropy": 2.590231478214264, + "epoch": 0.5104072398190045, + "grad_norm": 0.8260406255722046, + "learning_rate": 0.0005074718714123704, + "loss": 0.2756, + "mean_token_accuracy": 0.9301882535219193, + "num_tokens": 1245565.0, + "step": 141 + }, + { + "entropy": 2.4858668446540833, + "epoch": 0.5140271493212669, + "grad_norm": 0.8085922598838806, + "learning_rate": 0.0005073932397222429, + "loss": 0.2314, + "mean_token_accuracy": 0.9449103325605392, + "num_tokens": 1254366.0, + "step": 142 + }, + { + "entropy": 2.5374304056167603, + "epoch": 0.5176470588235295, + "grad_norm": 0.7858129143714905, + "learning_rate": 0.0005073137483516452, + "loss": 0.1622, + "mean_token_accuracy": 0.9510673582553864, + "num_tokens": 1263197.0, + "step": 143 + }, + { + "entropy": 2.608425199985504, + "epoch": 0.5212669683257919, + "grad_norm": 1.2698506116867065, + "learning_rate": 0.0005072333976024957, + "loss": 0.1729, + "mean_token_accuracy": 0.9509973376989365, + "num_tokens": 1271725.0, + "step": 144 + }, + { + "entropy": 2.437038242816925, + "epoch": 0.5248868778280543, + "grad_norm": 1.0788538455963135, + "learning_rate": 0.0005071521877799765, + "loss": 0.3344, + "mean_token_accuracy": 0.9166721999645233, + "num_tokens": 1280963.0, + "step": 145 + }, + { + "entropy": 2.589951515197754, + "epoch": 0.5285067873303168, + "grad_norm": 0.9228294491767883, + "learning_rate": 0.0005070701191925332, + "loss": 0.3095, + "mean_token_accuracy": 0.9239777624607086, + "num_tokens": 1289683.0, + "step": 146 + }, + { + "entropy": 2.575794994831085, + "epoch": 0.5321266968325792, + "grad_norm": 1.359767198562622, + "learning_rate": 0.0005069871921518726, + "loss": 0.2447, + "mean_token_accuracy": 0.9374738186597824, + "num_tokens": 1298397.0, + "step": 147 + }, + { + "entropy": 2.5628358721733093, + "epoch": 0.5357466063348416, + "grad_norm": 0.9870713353157043, + "learning_rate": 0.000506903406972962, + "loss": 0.4824, + "mean_token_accuracy": 0.9027767181396484, + "num_tokens": 1307191.0, + "step": 148 + }, + { + "entropy": 2.5513240098953247, + "epoch": 0.539366515837104, + "grad_norm": 0.7921387553215027, + "learning_rate": 0.0005068187639740286, + "loss": 0.3278, + "mean_token_accuracy": 0.9161934554576874, + "num_tokens": 1315878.0, + "step": 149 + }, + { + "entropy": 2.526439070701599, + "epoch": 0.5429864253393665, + "grad_norm": 0.6320391297340393, + "learning_rate": 0.000506733263476557, + "loss": 0.1701, + "mean_token_accuracy": 0.9575318098068237, + "num_tokens": 1324786.0, + "step": 150 + }, + { + "entropy": 2.4837265014648438, + "epoch": 0.5466063348416289, + "grad_norm": 0.5369354486465454, + "learning_rate": 0.000506646905805289, + "loss": 0.1328, + "mean_token_accuracy": 0.9636050164699554, + "num_tokens": 1333766.0, + "step": 151 + }, + { + "entropy": 2.5264737010002136, + "epoch": 0.5502262443438914, + "grad_norm": 0.7346852421760559, + "learning_rate": 0.0005065596912882222, + "loss": 0.2012, + "mean_token_accuracy": 0.9448132663965225, + "num_tokens": 1343004.0, + "step": 152 + }, + { + "entropy": 2.569309651851654, + "epoch": 0.5538461538461539, + "grad_norm": 0.9926508069038391, + "learning_rate": 0.0005064716202566082, + "loss": 0.2831, + "mean_token_accuracy": 0.9332023113965988, + "num_tokens": 1351561.0, + "step": 153 + }, + { + "entropy": 2.3148274421691895, + "epoch": 0.5574660633484163, + "grad_norm": 0.6301954984664917, + "learning_rate": 0.0005063826930449523, + "loss": 0.3622, + "mean_token_accuracy": 0.9349419325590134, + "num_tokens": 1360997.0, + "step": 154 + }, + { + "entropy": 2.497675657272339, + "epoch": 0.5610859728506787, + "grad_norm": 0.8846175670623779, + "learning_rate": 0.000506292909991011, + "loss": 0.2314, + "mean_token_accuracy": 0.9468862265348434, + "num_tokens": 1369600.0, + "step": 155 + }, + { + "entropy": 2.313987612724304, + "epoch": 0.5647058823529412, + "grad_norm": 0.5701894164085388, + "learning_rate": 0.0005062022714357922, + "loss": 0.2154, + "mean_token_accuracy": 0.945093959569931, + "num_tokens": 1379125.0, + "step": 156 + }, + { + "entropy": 2.4019755125045776, + "epoch": 0.5683257918552036, + "grad_norm": 0.8769335746765137, + "learning_rate": 0.0005061107777235524, + "loss": 0.3565, + "mean_token_accuracy": 0.9133864492177963, + "num_tokens": 1388111.0, + "step": 157 + }, + { + "entropy": 2.3127577900886536, + "epoch": 0.571945701357466, + "grad_norm": 1.1026453971862793, + "learning_rate": 0.0005060184292017965, + "loss": 0.2897, + "mean_token_accuracy": 0.899736076593399, + "num_tokens": 1397528.0, + "step": 158 + }, + { + "entropy": 2.2682697772979736, + "epoch": 0.5755656108597285, + "grad_norm": 0.5426591038703918, + "learning_rate": 0.000505925226221276, + "loss": 0.167, + "mean_token_accuracy": 0.9609879851341248, + "num_tokens": 1406809.0, + "step": 159 + }, + { + "entropy": 2.4639336466789246, + "epoch": 0.579185520361991, + "grad_norm": 0.6552363038063049, + "learning_rate": 0.0005058311691359875, + "loss": 0.2511, + "mean_token_accuracy": 0.9355164766311646, + "num_tokens": 1415498.0, + "step": 160 + }, + { + "entropy": 2.467900663614273, + "epoch": 0.5828054298642534, + "grad_norm": 0.7168154120445251, + "learning_rate": 0.000505736258303172, + "loss": 0.234, + "mean_token_accuracy": 0.9450509995222092, + "num_tokens": 1424524.0, + "step": 161 + }, + { + "entropy": 2.3683157563209534, + "epoch": 0.5864253393665159, + "grad_norm": 0.6433501839637756, + "learning_rate": 0.0005056404940833128, + "loss": 0.3441, + "mean_token_accuracy": 0.9261108189821243, + "num_tokens": 1434194.0, + "step": 162 + }, + { + "entropy": 2.4686295986175537, + "epoch": 0.5900452488687783, + "grad_norm": 0.9615177512168884, + "learning_rate": 0.0005055438768401348, + "loss": 0.1492, + "mean_token_accuracy": 0.966903567314148, + "num_tokens": 1442972.0, + "step": 163 + }, + { + "entropy": 2.5551892518997192, + "epoch": 0.5936651583710407, + "grad_norm": 0.4957484006881714, + "learning_rate": 0.0005054464069406023, + "loss": 0.1242, + "mean_token_accuracy": 0.969713419675827, + "num_tokens": 1451324.0, + "step": 164 + }, + { + "entropy": 2.554121434688568, + "epoch": 0.5972850678733032, + "grad_norm": 0.7399498224258423, + "learning_rate": 0.0005053480847549187, + "loss": 0.206, + "mean_token_accuracy": 0.9498797357082367, + "num_tokens": 1459698.0, + "step": 165 + }, + { + "entropy": 2.5181015729904175, + "epoch": 0.6009049773755656, + "grad_norm": 0.7433251142501831, + "learning_rate": 0.0005052489106565241, + "loss": 0.2883, + "mean_token_accuracy": 0.9419967085123062, + "num_tokens": 1468460.0, + "step": 166 + }, + { + "entropy": 2.3073930144309998, + "epoch": 0.604524886877828, + "grad_norm": 0.5920398831367493, + "learning_rate": 0.0005051488850220941, + "loss": 0.197, + "mean_token_accuracy": 0.952111005783081, + "num_tokens": 1477579.0, + "step": 167 + }, + { + "entropy": 2.532376289367676, + "epoch": 0.6081447963800904, + "grad_norm": 0.7033098936080933, + "learning_rate": 0.0005050480082315392, + "loss": 0.2122, + "mean_token_accuracy": 0.9488633275032043, + "num_tokens": 1486307.0, + "step": 168 + }, + { + "entropy": 2.397290349006653, + "epoch": 0.611764705882353, + "grad_norm": 0.8026869893074036, + "learning_rate": 0.0005049462806680021, + "loss": 0.2541, + "mean_token_accuracy": 0.9427233040332794, + "num_tokens": 1495152.0, + "step": 169 + }, + { + "entropy": 2.464823842048645, + "epoch": 0.6153846153846154, + "grad_norm": 0.6508225798606873, + "learning_rate": 0.0005048437027178571, + "loss": 0.2639, + "mean_token_accuracy": 0.9391255974769592, + "num_tokens": 1503903.0, + "step": 170 + }, + { + "entropy": 2.520734131336212, + "epoch": 0.6190045248868778, + "grad_norm": 0.8373616337776184, + "learning_rate": 0.0005047402747707084, + "loss": 0.3078, + "mean_token_accuracy": 0.9302930980920792, + "num_tokens": 1512588.0, + "step": 171 + }, + { + "entropy": 2.388108015060425, + "epoch": 0.6226244343891403, + "grad_norm": 0.6334089636802673, + "learning_rate": 0.0005046359972193884, + "loss": 0.1372, + "mean_token_accuracy": 0.9666119515895844, + "num_tokens": 1522011.0, + "step": 172 + }, + { + "entropy": 2.537126660346985, + "epoch": 0.6262443438914027, + "grad_norm": 0.7665116190910339, + "learning_rate": 0.0005045308704599566, + "loss": 0.2603, + "mean_token_accuracy": 0.9350012242794037, + "num_tokens": 1530767.0, + "step": 173 + }, + { + "entropy": 2.567205488681793, + "epoch": 0.6298642533936651, + "grad_norm": 0.8043875098228455, + "learning_rate": 0.0005044248948916977, + "loss": 0.2497, + "mean_token_accuracy": 0.9400482773780823, + "num_tokens": 1539971.0, + "step": 174 + }, + { + "entropy": 2.585887610912323, + "epoch": 0.6334841628959276, + "grad_norm": 0.5282150506973267, + "learning_rate": 0.0005043180709171206, + "loss": 0.1126, + "mean_token_accuracy": 0.9680279046297073, + "num_tokens": 1548971.0, + "step": 175 + }, + { + "entropy": 2.4289392232894897, + "epoch": 0.63710407239819, + "grad_norm": 0.6838382482528687, + "learning_rate": 0.0005042103989419563, + "loss": 0.2076, + "mean_token_accuracy": 0.9468046277761459, + "num_tokens": 1558403.0, + "step": 176 + }, + { + "entropy": 2.6080575585365295, + "epoch": 0.6407239819004525, + "grad_norm": 0.9058650732040405, + "learning_rate": 0.0005041018793751566, + "loss": 0.1781, + "mean_token_accuracy": 0.9432647377252579, + "num_tokens": 1567209.0, + "step": 177 + }, + { + "entropy": 2.5212480425834656, + "epoch": 0.644343891402715, + "grad_norm": 0.796381950378418, + "learning_rate": 0.0005039925126288929, + "loss": 0.2286, + "mean_token_accuracy": 0.9305787235498428, + "num_tokens": 1576255.0, + "step": 178 + }, + { + "entropy": 2.588195264339447, + "epoch": 0.6479638009049774, + "grad_norm": 0.6489388942718506, + "learning_rate": 0.0005038822991185536, + "loss": 0.1717, + "mean_token_accuracy": 0.9572225511074066, + "num_tokens": 1585335.0, + "step": 179 + }, + { + "entropy": 2.609215259552002, + "epoch": 0.6515837104072398, + "grad_norm": 0.8551130294799805, + "learning_rate": 0.0005037712392627441, + "loss": 0.2358, + "mean_token_accuracy": 0.9529621452093124, + "num_tokens": 1594354.0, + "step": 180 + }, + { + "entropy": 2.4199504256248474, + "epoch": 0.6552036199095023, + "grad_norm": 0.5775637030601501, + "learning_rate": 0.0005036593334832836, + "loss": 0.2402, + "mean_token_accuracy": 0.9437069743871689, + "num_tokens": 1603750.0, + "step": 181 + }, + { + "entropy": 2.516424596309662, + "epoch": 0.6588235294117647, + "grad_norm": 0.6967942118644714, + "learning_rate": 0.0005035465822052047, + "loss": 0.1624, + "mean_token_accuracy": 0.9518167823553085, + "num_tokens": 1612474.0, + "step": 182 + }, + { + "entropy": 2.463354170322418, + "epoch": 0.6624434389140271, + "grad_norm": 0.49672600626945496, + "learning_rate": 0.000503432985856751, + "loss": 0.1654, + "mean_token_accuracy": 0.9564716964960098, + "num_tokens": 1621563.0, + "step": 183 + }, + { + "entropy": 2.4456416964530945, + "epoch": 0.6660633484162896, + "grad_norm": 0.6207183003425598, + "learning_rate": 0.000503318544869376, + "loss": 0.1918, + "mean_token_accuracy": 0.9476529806852341, + "num_tokens": 1630801.0, + "step": 184 + }, + { + "entropy": 2.641440451145172, + "epoch": 0.669683257918552, + "grad_norm": 1.220821499824524, + "learning_rate": 0.000503203259677741, + "loss": 0.4019, + "mean_token_accuracy": 0.9172120243310928, + "num_tokens": 1639522.0, + "step": 185 + }, + { + "entropy": 2.6447275280952454, + "epoch": 0.6733031674208145, + "grad_norm": 0.7546490430831909, + "learning_rate": 0.000503087130719714, + "loss": 0.2484, + "mean_token_accuracy": 0.9387800246477127, + "num_tokens": 1647964.0, + "step": 186 + }, + { + "entropy": 2.4657886028289795, + "epoch": 0.676923076923077, + "grad_norm": 0.7679230570793152, + "learning_rate": 0.0005029701584363675, + "loss": 0.2659, + "mean_token_accuracy": 0.930300235748291, + "num_tokens": 1657181.0, + "step": 187 + }, + { + "entropy": 2.37973552942276, + "epoch": 0.6805429864253394, + "grad_norm": 0.7473414540290833, + "learning_rate": 0.0005028523432719772, + "loss": 0.32, + "mean_token_accuracy": 0.9233052879571915, + "num_tokens": 1666477.0, + "step": 188 + }, + { + "entropy": 2.5238219499588013, + "epoch": 0.6841628959276018, + "grad_norm": 0.5573673248291016, + "learning_rate": 0.0005027336856740201, + "loss": 0.1846, + "mean_token_accuracy": 0.9445535093545914, + "num_tokens": 1675002.0, + "step": 189 + }, + { + "entropy": 2.456815242767334, + "epoch": 0.6877828054298643, + "grad_norm": 0.47237634658813477, + "learning_rate": 0.0005026141860931728, + "loss": 0.1065, + "mean_token_accuracy": 0.964375838637352, + "num_tokens": 1683623.0, + "step": 190 + }, + { + "entropy": 2.548456132411957, + "epoch": 0.6914027149321267, + "grad_norm": 0.7699162364006042, + "learning_rate": 0.00050249384498331, + "loss": 0.1985, + "mean_token_accuracy": 0.9438774734735489, + "num_tokens": 1691718.0, + "step": 191 + }, + { + "entropy": 2.4514941573143005, + "epoch": 0.6950226244343891, + "grad_norm": 1.4113538265228271, + "learning_rate": 0.0005023726628015027, + "loss": 0.4541, + "mean_token_accuracy": 0.9207872897386551, + "num_tokens": 1699824.0, + "step": 192 + }, + { + "entropy": 2.2560824751853943, + "epoch": 0.6986425339366515, + "grad_norm": 0.6007948517799377, + "learning_rate": 0.0005022506400080161, + "loss": 0.1871, + "mean_token_accuracy": 0.9502484053373337, + "num_tokens": 1708722.0, + "step": 193 + }, + { + "entropy": 2.1833614110946655, + "epoch": 0.702262443438914, + "grad_norm": 0.7005489468574524, + "learning_rate": 0.0005021277770663082, + "loss": 0.2222, + "mean_token_accuracy": 0.9386974722146988, + "num_tokens": 1717592.0, + "step": 194 + }, + { + "entropy": 2.2031923830509186, + "epoch": 0.7058823529411765, + "grad_norm": 0.5830584764480591, + "learning_rate": 0.0005020040744430284, + "loss": 0.1106, + "mean_token_accuracy": 0.9719562232494354, + "num_tokens": 1726149.0, + "step": 195 + }, + { + "entropy": 2.199785351753235, + "epoch": 0.709502262443439, + "grad_norm": 0.7465847134590149, + "learning_rate": 0.0005018795326080149, + "loss": 0.1935, + "mean_token_accuracy": 0.9497270882129669, + "num_tokens": 1734541.0, + "step": 196 + }, + { + "entropy": 2.1103186309337616, + "epoch": 0.7131221719457014, + "grad_norm": 1.0782264471054077, + "learning_rate": 0.0005017541520342934, + "loss": 0.2895, + "mean_token_accuracy": 0.9274258464574814, + "num_tokens": 1743722.0, + "step": 197 + }, + { + "entropy": 2.2248528599739075, + "epoch": 0.7167420814479638, + "grad_norm": 0.6409780979156494, + "learning_rate": 0.0005016279331980754, + "loss": 0.1425, + "mean_token_accuracy": 0.96550352871418, + "num_tokens": 1752156.0, + "step": 198 + }, + { + "entropy": 2.19924658536911, + "epoch": 0.7203619909502262, + "grad_norm": 0.7019934058189392, + "learning_rate": 0.0005015008765787561, + "loss": 0.1969, + "mean_token_accuracy": 0.9429282248020172, + "num_tokens": 1760978.0, + "step": 199 + }, + { + "entropy": 2.297484815120697, + "epoch": 0.7239819004524887, + "grad_norm": 0.7826490998268127, + "learning_rate": 0.0005013729826589127, + "loss": 0.2399, + "mean_token_accuracy": 0.9416657984256744, + "num_tokens": 1769533.0, + "step": 200 + }, + { + "entropy": 2.2471498548984528, + "epoch": 0.7276018099547511, + "grad_norm": 0.621566891670227, + "learning_rate": 0.0005012442519243027, + "loss": 0.1876, + "mean_token_accuracy": 0.9460793286561966, + "num_tokens": 1778286.0, + "step": 201 + }, + { + "entropy": 2.2212815284729004, + "epoch": 0.7312217194570135, + "grad_norm": 0.622283935546875, + "learning_rate": 0.0005011146848638616, + "loss": 0.1617, + "mean_token_accuracy": 0.9482609927654266, + "num_tokens": 1787392.0, + "step": 202 + }, + { + "entropy": 2.308752655982971, + "epoch": 0.7348416289592761, + "grad_norm": 0.7263973355293274, + "learning_rate": 0.0005009842819697018, + "loss": 0.2043, + "mean_token_accuracy": 0.9378403723239899, + "num_tokens": 1796133.0, + "step": 203 + }, + { + "entropy": 2.3376497626304626, + "epoch": 0.7384615384615385, + "grad_norm": 0.5493630766868591, + "learning_rate": 0.0005008530437371101, + "loss": 0.1145, + "mean_token_accuracy": 0.970586434006691, + "num_tokens": 1804769.0, + "step": 204 + }, + { + "entropy": 2.373005509376526, + "epoch": 0.7420814479638009, + "grad_norm": 0.6313483119010925, + "learning_rate": 0.0005007209706645461, + "loss": 0.2183, + "mean_token_accuracy": 0.9472708404064178, + "num_tokens": 1813364.0, + "step": 205 + }, + { + "entropy": 2.468949854373932, + "epoch": 0.7457013574660634, + "grad_norm": 1.0125588178634644, + "learning_rate": 0.00050058806325364, + "loss": 0.2225, + "mean_token_accuracy": 0.9351322948932648, + "num_tokens": 1822149.0, + "step": 206 + }, + { + "entropy": 2.2420623898506165, + "epoch": 0.7493212669683258, + "grad_norm": 0.913761556148529, + "learning_rate": 0.0005004543220091911, + "loss": 0.2386, + "mean_token_accuracy": 0.9453927427530289, + "num_tokens": 1831533.0, + "step": 207 + }, + { + "entropy": 2.2966006994247437, + "epoch": 0.7529411764705882, + "grad_norm": 0.7386876940727234, + "learning_rate": 0.0005003197474391658, + "loss": 0.1768, + "mean_token_accuracy": 0.949826255440712, + "num_tokens": 1840157.0, + "step": 208 + }, + { + "entropy": 2.306001305580139, + "epoch": 0.7565610859728507, + "grad_norm": 0.8900741338729858, + "learning_rate": 0.0005001843400546955, + "loss": 0.2899, + "mean_token_accuracy": 0.9241485595703125, + "num_tokens": 1848898.0, + "step": 209 + }, + { + "entropy": 2.117514967918396, + "epoch": 0.7601809954751131, + "grad_norm": 0.644622802734375, + "learning_rate": 0.0005000481003700746, + "loss": 0.2714, + "mean_token_accuracy": 0.9299416691064835, + "num_tokens": 1858330.0, + "step": 210 + }, + { + "entropy": 2.3768392205238342, + "epoch": 0.7638009049773755, + "grad_norm": 0.9724471569061279, + "learning_rate": 0.0004999110289027587, + "loss": 0.1633, + "mean_token_accuracy": 0.9550061523914337, + "num_tokens": 1866806.0, + "step": 211 + }, + { + "entropy": 2.090679556131363, + "epoch": 0.7674208144796381, + "grad_norm": 0.5419518351554871, + "learning_rate": 0.0004997731261733628, + "loss": 0.1369, + "mean_token_accuracy": 0.9619670957326889, + "num_tokens": 1875937.0, + "step": 212 + }, + { + "entropy": 2.099909245967865, + "epoch": 0.7710407239819005, + "grad_norm": 0.6858121752738953, + "learning_rate": 0.0004996343927056592, + "loss": 0.1633, + "mean_token_accuracy": 0.9528832882642746, + "num_tokens": 1885145.0, + "step": 213 + }, + { + "entropy": 2.130059242248535, + "epoch": 0.7746606334841629, + "grad_norm": 0.7691065073013306, + "learning_rate": 0.000499494829026575, + "loss": 0.348, + "mean_token_accuracy": 0.9162366837263107, + "num_tokens": 1894255.0, + "step": 214 + }, + { + "entropy": 2.191373586654663, + "epoch": 0.7782805429864253, + "grad_norm": 0.7427324652671814, + "learning_rate": 0.000499354435666191, + "loss": 0.3373, + "mean_token_accuracy": 0.9311849176883698, + "num_tokens": 1902981.0, + "step": 215 + }, + { + "entropy": 2.1425398886203766, + "epoch": 0.7819004524886878, + "grad_norm": 0.6410383582115173, + "learning_rate": 0.0004992132131577392, + "loss": 0.2079, + "mean_token_accuracy": 0.949742391705513, + "num_tokens": 1912253.0, + "step": 216 + }, + { + "entropy": 2.1396586298942566, + "epoch": 0.7855203619909502, + "grad_norm": 0.5689850449562073, + "learning_rate": 0.0004990711620376003, + "loss": 0.1999, + "mean_token_accuracy": 0.946034774184227, + "num_tokens": 1921409.0, + "step": 217 + }, + { + "entropy": 2.2237865328788757, + "epoch": 0.7891402714932126, + "grad_norm": 0.6408923864364624, + "learning_rate": 0.0004989282828453029, + "loss": 0.2452, + "mean_token_accuracy": 0.9510752111673355, + "num_tokens": 1930397.0, + "step": 218 + }, + { + "entropy": 2.234771251678467, + "epoch": 0.7927601809954751, + "grad_norm": 0.751447856426239, + "learning_rate": 0.0004987845761235203, + "loss": 0.3057, + "mean_token_accuracy": 0.9217256307601929, + "num_tokens": 1939172.0, + "step": 219 + }, + { + "entropy": 2.2653815746307373, + "epoch": 0.7963800904977375, + "grad_norm": 0.751455545425415, + "learning_rate": 0.0004986400424180688, + "loss": 0.3245, + "mean_token_accuracy": 0.9256318956613541, + "num_tokens": 1947979.0, + "step": 220 + }, + { + "entropy": 2.3123483061790466, + "epoch": 0.8, + "grad_norm": 0.5939492583274841, + "learning_rate": 0.0004984946822779061, + "loss": 0.2429, + "mean_token_accuracy": 0.9333402067422867, + "num_tokens": 1956814.0, + "step": 221 + }, + { + "entropy": 2.3289234042167664, + "epoch": 0.8036199095022625, + "grad_norm": 0.5591994524002075, + "learning_rate": 0.0004983484962551284, + "loss": 0.1507, + "mean_token_accuracy": 0.96376833319664, + "num_tokens": 1965641.0, + "step": 222 + }, + { + "entropy": 2.4314023852348328, + "epoch": 0.8072398190045249, + "grad_norm": 0.5805783271789551, + "learning_rate": 0.0004982014849049687, + "loss": 0.2049, + "mean_token_accuracy": 0.9586948156356812, + "num_tokens": 1974180.0, + "step": 223 + }, + { + "entropy": 2.3639765977859497, + "epoch": 0.8108597285067873, + "grad_norm": 0.6924490332603455, + "learning_rate": 0.0004980536487857951, + "loss": 0.2137, + "mean_token_accuracy": 0.9441423565149307, + "num_tokens": 1982744.0, + "step": 224 + }, + { + "entropy": 2.3361759781837463, + "epoch": 0.8144796380090498, + "grad_norm": 0.4579620361328125, + "learning_rate": 0.0004979049884591077, + "loss": 0.1041, + "mean_token_accuracy": 0.9753208309412003, + "num_tokens": 1991583.0, + "step": 225 + }, + { + "entropy": 2.286989688873291, + "epoch": 0.8180995475113122, + "grad_norm": 0.6489312052726746, + "learning_rate": 0.0004977555044895377, + "loss": 0.2131, + "mean_token_accuracy": 0.9520440250635147, + "num_tokens": 2000193.0, + "step": 226 + }, + { + "entropy": 2.288672834634781, + "epoch": 0.8217194570135746, + "grad_norm": 0.7738961577415466, + "learning_rate": 0.0004976051974448441, + "loss": 0.325, + "mean_token_accuracy": 0.9060750156641006, + "num_tokens": 2009233.0, + "step": 227 + }, + { + "entropy": 2.288076102733612, + "epoch": 0.8253393665158371, + "grad_norm": 0.7042292356491089, + "learning_rate": 0.0004974540678959123, + "loss": 0.2206, + "mean_token_accuracy": 0.94980289041996, + "num_tokens": 2018417.0, + "step": 228 + }, + { + "entropy": 2.217707335948944, + "epoch": 0.8289592760180996, + "grad_norm": 0.6834208369255066, + "learning_rate": 0.0004973021164167515, + "loss": 0.2907, + "mean_token_accuracy": 0.951058641076088, + "num_tokens": 2027822.0, + "step": 229 + }, + { + "entropy": 2.1610691249370575, + "epoch": 0.832579185520362, + "grad_norm": 0.665044903755188, + "learning_rate": 0.0004971493435844928, + "loss": 0.2387, + "mean_token_accuracy": 0.9506549835205078, + "num_tokens": 2036983.0, + "step": 230 + }, + { + "entropy": 2.321135401725769, + "epoch": 0.8361990950226245, + "grad_norm": 0.8208273649215698, + "learning_rate": 0.0004969957499793869, + "loss": 0.2399, + "mean_token_accuracy": 0.9435176253318787, + "num_tokens": 2045574.0, + "step": 231 + }, + { + "entropy": 2.1943611800670624, + "epoch": 0.8398190045248869, + "grad_norm": 0.6293840408325195, + "learning_rate": 0.0004968413361848019, + "loss": 0.1784, + "mean_token_accuracy": 0.9559669345617294, + "num_tokens": 2054336.0, + "step": 232 + }, + { + "entropy": 2.2722273468971252, + "epoch": 0.8434389140271493, + "grad_norm": 0.6535817980766296, + "learning_rate": 0.0004966861027872211, + "loss": 0.1675, + "mean_token_accuracy": 0.9532535970211029, + "num_tokens": 2063225.0, + "step": 233 + }, + { + "entropy": 2.3278334736824036, + "epoch": 0.8470588235294118, + "grad_norm": 1.1610206365585327, + "learning_rate": 0.0004965300503762406, + "loss": 0.1588, + "mean_token_accuracy": 0.9641145765781403, + "num_tokens": 2071738.0, + "step": 234 + }, + { + "entropy": 2.202972888946533, + "epoch": 0.8506787330316742, + "grad_norm": 0.4811885356903076, + "learning_rate": 0.0004963731795445675, + "loss": 0.0813, + "mean_token_accuracy": 0.9766911715269089, + "num_tokens": 2080375.0, + "step": 235 + }, + { + "entropy": 2.2433705925941467, + "epoch": 0.8542986425339366, + "grad_norm": 0.8113318681716919, + "learning_rate": 0.0004962154908880171, + "loss": 0.2965, + "mean_token_accuracy": 0.9290606826543808, + "num_tokens": 2089522.0, + "step": 236 + }, + { + "entropy": 2.2168884873390198, + "epoch": 0.857918552036199, + "grad_norm": 0.6128959655761719, + "learning_rate": 0.0004960569850055111, + "loss": 0.1724, + "mean_token_accuracy": 0.9603384286165237, + "num_tokens": 2098162.0, + "step": 237 + }, + { + "entropy": 2.2738255858421326, + "epoch": 0.8615384615384616, + "grad_norm": 0.8557195663452148, + "learning_rate": 0.0004958976624990749, + "loss": 0.2596, + "mean_token_accuracy": 0.9487071484327316, + "num_tokens": 2106984.0, + "step": 238 + }, + { + "entropy": 2.2031425833702087, + "epoch": 0.865158371040724, + "grad_norm": 0.6621816158294678, + "learning_rate": 0.0004957375239738359, + "loss": 0.232, + "mean_token_accuracy": 0.9525040090084076, + "num_tokens": 2116040.0, + "step": 239 + }, + { + "entropy": 2.374737858772278, + "epoch": 0.8687782805429864, + "grad_norm": 0.8481062054634094, + "learning_rate": 0.0004955765700380204, + "loss": 0.2516, + "mean_token_accuracy": 0.9396061599254608, + "num_tokens": 2124862.0, + "step": 240 + }, + { + "entropy": 2.266704559326172, + "epoch": 0.8723981900452489, + "grad_norm": 0.6284282803535461, + "learning_rate": 0.0004954148013029521, + "loss": 0.3244, + "mean_token_accuracy": 0.9381244331598282, + "num_tokens": 2134018.0, + "step": 241 + }, + { + "entropy": 2.3935859203338623, + "epoch": 0.8760180995475113, + "grad_norm": 1.1564176082611084, + "learning_rate": 0.0004952522183830493, + "loss": 0.2706, + "mean_token_accuracy": 0.9297053664922714, + "num_tokens": 2142745.0, + "step": 242 + }, + { + "entropy": 2.281618118286133, + "epoch": 0.8796380090497737, + "grad_norm": 0.5324040055274963, + "learning_rate": 0.0004950888218958225, + "loss": 0.1573, + "mean_token_accuracy": 0.9568462073802948, + "num_tokens": 2151607.0, + "step": 243 + }, + { + "entropy": 2.230749189853668, + "epoch": 0.8832579185520362, + "grad_norm": 0.680780291557312, + "learning_rate": 0.0004949246124618726, + "loss": 0.1956, + "mean_token_accuracy": 0.9479999989271164, + "num_tokens": 2160904.0, + "step": 244 + }, + { + "entropy": 2.21382600069046, + "epoch": 0.8868778280542986, + "grad_norm": 0.6321626305580139, + "learning_rate": 0.0004947595907048877, + "loss": 0.2444, + "mean_token_accuracy": 0.9376699328422546, + "num_tokens": 2170021.0, + "step": 245 + }, + { + "entropy": 2.3659472465515137, + "epoch": 0.890497737556561, + "grad_norm": 0.9778954982757568, + "learning_rate": 0.0004945937572516417, + "loss": 0.3783, + "mean_token_accuracy": 0.9104805737733841, + "num_tokens": 2178995.0, + "step": 246 + }, + { + "entropy": 2.3233078718185425, + "epoch": 0.8941176470588236, + "grad_norm": 0.53229820728302, + "learning_rate": 0.0004944271127319909, + "loss": 0.0759, + "mean_token_accuracy": 0.9791453778743744, + "num_tokens": 2187823.0, + "step": 247 + }, + { + "entropy": 2.2469444274902344, + "epoch": 0.897737556561086, + "grad_norm": 0.6367197632789612, + "learning_rate": 0.0004942596577788728, + "loss": 0.2677, + "mean_token_accuracy": 0.9392691254615784, + "num_tokens": 2196923.0, + "step": 248 + }, + { + "entropy": 2.4508965611457825, + "epoch": 0.9013574660633484, + "grad_norm": 0.6042234897613525, + "learning_rate": 0.0004940913930283024, + "loss": 0.1102, + "mean_token_accuracy": 0.9762090593576431, + "num_tokens": 2205400.0, + "step": 249 + }, + { + "entropy": 2.365670144557953, + "epoch": 0.9049773755656109, + "grad_norm": 0.6490639448165894, + "learning_rate": 0.0004939223191193707, + "loss": 0.1532, + "mean_token_accuracy": 0.9489114433526993, + "num_tokens": 2214201.0, + "step": 250 + }, + { + "entropy": 2.4013625383377075, + "epoch": 0.9085972850678733, + "grad_norm": 0.5969854593276978, + "learning_rate": 0.0004937524366942419, + "loss": 0.1273, + "mean_token_accuracy": 0.9682519882917404, + "num_tokens": 2222979.0, + "step": 251 + }, + { + "entropy": 2.4402357935905457, + "epoch": 0.9122171945701357, + "grad_norm": 0.7559595704078674, + "learning_rate": 0.0004935817463981513, + "loss": 0.1979, + "mean_token_accuracy": 0.9483373910188675, + "num_tokens": 2231169.0, + "step": 252 + }, + { + "entropy": 2.4673256874084473, + "epoch": 0.9158371040723982, + "grad_norm": 0.8663308620452881, + "learning_rate": 0.0004934102488794023, + "loss": 0.2453, + "mean_token_accuracy": 0.9408974200487137, + "num_tokens": 2240099.0, + "step": 253 + }, + { + "entropy": 2.426262080669403, + "epoch": 0.9194570135746606, + "grad_norm": 0.7920467257499695, + "learning_rate": 0.0004932379447893643, + "loss": 0.2828, + "mean_token_accuracy": 0.9319239109754562, + "num_tokens": 2249088.0, + "step": 254 + }, + { + "entropy": 2.5018852949142456, + "epoch": 0.9230769230769231, + "grad_norm": 0.7216617465019226, + "learning_rate": 0.0004930648347824701, + "loss": 0.1647, + "mean_token_accuracy": 0.9551804810762405, + "num_tokens": 2257710.0, + "step": 255 + }, + { + "entropy": 2.43031644821167, + "epoch": 0.9266968325791856, + "grad_norm": 0.646794319152832, + "learning_rate": 0.0004928909195162138, + "loss": 0.1328, + "mean_token_accuracy": 0.9663553237915039, + "num_tokens": 2266883.0, + "step": 256 + }, + { + "entropy": 2.5406370759010315, + "epoch": 0.930316742081448, + "grad_norm": 0.5482825040817261, + "learning_rate": 0.0004927161996511474, + "loss": 0.1872, + "mean_token_accuracy": 0.9557004272937775, + "num_tokens": 2275728.0, + "step": 257 + }, + { + "entropy": 2.636320471763611, + "epoch": 0.9339366515837104, + "grad_norm": 0.7454632520675659, + "learning_rate": 0.0004925406758508797, + "loss": 0.1461, + "mean_token_accuracy": 0.9578974395990372, + "num_tokens": 2284319.0, + "step": 258 + }, + { + "entropy": 2.6067575812339783, + "epoch": 0.9375565610859729, + "grad_norm": 0.8695769309997559, + "learning_rate": 0.000492364348782072, + "loss": 0.1712, + "mean_token_accuracy": 0.9652896523475647, + "num_tokens": 2293035.0, + "step": 259 + }, + { + "entropy": 2.5837162137031555, + "epoch": 0.9411764705882353, + "grad_norm": 0.5752995014190674, + "learning_rate": 0.0004921872191144371, + "loss": 0.1398, + "mean_token_accuracy": 0.9553333520889282, + "num_tokens": 2301802.0, + "step": 260 + }, + { + "entropy": 2.713033616542816, + "epoch": 0.9447963800904977, + "grad_norm": 0.85626620054245, + "learning_rate": 0.0004920092875207363, + "loss": 0.2207, + "mean_token_accuracy": 0.9468346834182739, + "num_tokens": 2309981.0, + "step": 261 + }, + { + "entropy": 2.400112509727478, + "epoch": 0.9484162895927601, + "grad_norm": 0.6766608953475952, + "learning_rate": 0.0004918305546767764, + "loss": 0.1644, + "mean_token_accuracy": 0.9502440094947815, + "num_tokens": 2319212.0, + "step": 262 + }, + { + "entropy": 2.503827154636383, + "epoch": 0.9520361990950226, + "grad_norm": 0.789470911026001, + "learning_rate": 0.0004916510212614072, + "loss": 0.2117, + "mean_token_accuracy": 0.9454390555620193, + "num_tokens": 2328234.0, + "step": 263 + }, + { + "entropy": 2.669040560722351, + "epoch": 0.9556561085972851, + "grad_norm": 0.9579212069511414, + "learning_rate": 0.0004914706879565197, + "loss": 0.2193, + "mean_token_accuracy": 0.9321542829275131, + "num_tokens": 2336543.0, + "step": 264 + }, + { + "entropy": 2.507073998451233, + "epoch": 0.9592760180995475, + "grad_norm": 0.5315744876861572, + "learning_rate": 0.000491289555447043, + "loss": 0.0851, + "mean_token_accuracy": 0.9771326780319214, + "num_tokens": 2345292.0, + "step": 265 + }, + { + "entropy": 2.4205283522605896, + "epoch": 0.96289592760181, + "grad_norm": 0.5441373586654663, + "learning_rate": 0.000491107624420941, + "loss": 0.1323, + "mean_token_accuracy": 0.9541790336370468, + "num_tokens": 2354242.0, + "step": 266 + }, + { + "entropy": 2.3817258477211, + "epoch": 0.9665158371040724, + "grad_norm": 0.5946238040924072, + "learning_rate": 0.0004909248955692111, + "loss": 0.1708, + "mean_token_accuracy": 0.947738841176033, + "num_tokens": 2363183.0, + "step": 267 + }, + { + "entropy": 2.5073485374450684, + "epoch": 0.9701357466063348, + "grad_norm": 0.6979324817657471, + "learning_rate": 0.0004907413695858812, + "loss": 0.2099, + "mean_token_accuracy": 0.9423733651638031, + "num_tokens": 2371885.0, + "step": 268 + }, + { + "entropy": 2.5705007910728455, + "epoch": 0.9737556561085973, + "grad_norm": 0.8203943967819214, + "learning_rate": 0.0004905570471680057, + "loss": 0.217, + "mean_token_accuracy": 0.9511639326810837, + "num_tokens": 2380316.0, + "step": 269 + }, + { + "entropy": 2.2677993774414062, + "epoch": 0.9773755656108597, + "grad_norm": 0.5840432047843933, + "learning_rate": 0.0004903719290156649, + "loss": 0.2364, + "mean_token_accuracy": 0.9407180696725845, + "num_tokens": 2389723.0, + "step": 270 + }, + { + "entropy": 2.477886915206909, + "epoch": 0.9809954751131221, + "grad_norm": 0.818929135799408, + "learning_rate": 0.0004901860158319612, + "loss": 0.1707, + "mean_token_accuracy": 0.9579566866159439, + "num_tokens": 2398388.0, + "step": 271 + }, + { + "entropy": 2.549662232398987, + "epoch": 0.9846153846153847, + "grad_norm": 0.7804781198501587, + "learning_rate": 0.0004899993083230166, + "loss": 0.2944, + "mean_token_accuracy": 0.9381812512874603, + "num_tokens": 2406929.0, + "step": 272 + }, + { + "entropy": 2.4465304017066956, + "epoch": 0.9882352941176471, + "grad_norm": 0.5218799114227295, + "learning_rate": 0.0004898118071979699, + "loss": 0.1661, + "mean_token_accuracy": 0.9500218778848648, + "num_tokens": 2415631.0, + "step": 273 + }, + { + "entropy": 2.5852283239364624, + "epoch": 0.9918552036199095, + "grad_norm": 0.591163158416748, + "learning_rate": 0.0004896235131689743, + "loss": 0.2005, + "mean_token_accuracy": 0.9455285370349884, + "num_tokens": 2424091.0, + "step": 274 + }, + { + "entropy": 2.478701651096344, + "epoch": 0.995475113122172, + "grad_norm": 1.0615383386611938, + "learning_rate": 0.0004894344269511945, + "loss": 0.2864, + "mean_token_accuracy": 0.9306265562772751, + "num_tokens": 2432705.0, + "step": 275 + }, + { + "entropy": 2.600062847137451, + "epoch": 0.9990950226244344, + "grad_norm": 0.7011683583259583, + "learning_rate": 0.0004892445492628043, + "loss": 0.1664, + "mean_token_accuracy": 0.9547821134328842, + "num_tokens": 2440992.0, + "step": 276 + }, + { + "entropy": 2.3411240577697754, + "epoch": 1.0, + "grad_norm": 0.4944029450416565, + "learning_rate": 0.000489053880824983, + "loss": 0.022, + "mean_token_accuracy": 0.9929078221321106, + "num_tokens": 2441725.0, + "step": 277 + }, + { + "epoch": 1.0, + "eval_entropy": 2.5467925265552553, + "eval_loss": 0.21274714171886444, + "eval_mean_token_accuracy": 0.9444630068492114, + "eval_num_tokens": 2441725.0, + "eval_runtime": 116.0434, + "eval_samples_per_second": 3.18, + "eval_steps_per_second": 1.06, + "step": 277 + }, + { + "entropy": 2.609170138835907, + "epoch": 1.0036199095022624, + "grad_norm": 1.0785081386566162, + "learning_rate": 0.0004888624223619136, + "loss": 0.3167, + "mean_token_accuracy": 0.9296800643205643, + "num_tokens": 2450193.0, + "step": 278 + }, + { + "entropy": 2.497025430202484, + "epoch": 1.0072398190045249, + "grad_norm": 0.5221985578536987, + "learning_rate": 0.0004886701746007801, + "loss": 0.0854, + "mean_token_accuracy": 0.9753399342298508, + "num_tokens": 2459309.0, + "step": 279 + }, + { + "entropy": 2.5487362146377563, + "epoch": 1.0108597285067873, + "grad_norm": 0.5161958336830139, + "learning_rate": 0.0004884771382717638, + "loss": 0.0819, + "mean_token_accuracy": 0.9748431146144867, + "num_tokens": 2467844.0, + "step": 280 + }, + { + "entropy": 2.5276209115982056, + "epoch": 1.0144796380090497, + "grad_norm": 0.5731730461120605, + "learning_rate": 0.0004882833141080412, + "loss": 0.1541, + "mean_token_accuracy": 0.9567564427852631, + "num_tokens": 2476894.0, + "step": 281 + }, + { + "entropy": 2.4442760348320007, + "epoch": 1.0180995475113122, + "grad_norm": 0.7120366096496582, + "learning_rate": 0.0004880887028457813, + "loss": 0.1945, + "mean_token_accuracy": 0.9465379565954208, + "num_tokens": 2485971.0, + "step": 282 + }, + { + "entropy": 2.4069360494613647, + "epoch": 1.0217194570135746, + "grad_norm": 0.7468647360801697, + "learning_rate": 0.00048789330522414244, + "loss": 0.2345, + "mean_token_accuracy": 0.9446765780448914, + "num_tokens": 2495043.0, + "step": 283 + }, + { + "entropy": 2.468382716178894, + "epoch": 1.025339366515837, + "grad_norm": 0.666231632232666, + "learning_rate": 0.0004876971219852697, + "loss": 0.1779, + "mean_token_accuracy": 0.9534575343132019, + "num_tokens": 2503672.0, + "step": 284 + }, + { + "entropy": 2.4362316727638245, + "epoch": 1.0289592760180994, + "grad_norm": 0.8445858955383301, + "learning_rate": 0.000487500153874292, + "loss": 0.1698, + "mean_token_accuracy": 0.953661322593689, + "num_tokens": 2512322.0, + "step": 285 + }, + { + "entropy": 2.364333391189575, + "epoch": 1.032579185520362, + "grad_norm": 0.4805246591567993, + "learning_rate": 0.0004873024016393193, + "loss": 0.0778, + "mean_token_accuracy": 0.9824571758508682, + "num_tokens": 2520791.0, + "step": 286 + }, + { + "entropy": 2.223461151123047, + "epoch": 1.0361990950226245, + "grad_norm": 0.648465096950531, + "learning_rate": 0.0004871038660314399, + "loss": 0.2593, + "mean_token_accuracy": 0.9419913589954376, + "num_tokens": 2530082.0, + "step": 287 + }, + { + "entropy": 2.3313387036323547, + "epoch": 1.039819004524887, + "grad_norm": 0.6912294626235962, + "learning_rate": 0.00048690454780471725, + "loss": 0.1354, + "mean_token_accuracy": 0.9561934620141983, + "num_tokens": 2538728.0, + "step": 288 + }, + { + "entropy": 2.191806375980377, + "epoch": 1.0434389140271494, + "grad_norm": 0.8620694279670715, + "learning_rate": 0.0004867044477161874, + "loss": 0.1103, + "mean_token_accuracy": 0.968692272901535, + "num_tokens": 2547219.0, + "step": 289 + }, + { + "entropy": 2.167125165462494, + "epoch": 1.0470588235294118, + "grad_norm": 0.6192149519920349, + "learning_rate": 0.0004865035665258559, + "loss": 0.1288, + "mean_token_accuracy": 0.9643534421920776, + "num_tokens": 2555940.0, + "step": 290 + }, + { + "entropy": 2.2750985622406006, + "epoch": 1.0506787330316743, + "grad_norm": 1.7459602355957031, + "learning_rate": 0.0004863019049966953, + "loss": 0.393, + "mean_token_accuracy": 0.9146681725978851, + "num_tokens": 2564362.0, + "step": 291 + }, + { + "entropy": 2.236129105091095, + "epoch": 1.0542986425339367, + "grad_norm": 0.6311184167861938, + "learning_rate": 0.0004860994638946416, + "loss": 0.1536, + "mean_token_accuracy": 0.9636097103357315, + "num_tokens": 2573316.0, + "step": 292 + }, + { + "entropy": 2.2642418146133423, + "epoch": 1.0579185520361991, + "grad_norm": 0.6023411154747009, + "learning_rate": 0.000485896243988592, + "loss": 0.191, + "mean_token_accuracy": 0.9476015418767929, + "num_tokens": 2581835.0, + "step": 293 + }, + { + "entropy": 2.3589024543762207, + "epoch": 1.0615384615384615, + "grad_norm": 0.48049232363700867, + "learning_rate": 0.0004856922460504016, + "loss": 0.1017, + "mean_token_accuracy": 0.9713075459003448, + "num_tokens": 2590317.0, + "step": 294 + }, + { + "entropy": 2.4141315817832947, + "epoch": 1.065158371040724, + "grad_norm": 0.8456616997718811, + "learning_rate": 0.0004854874708548806, + "loss": 0.1422, + "mean_token_accuracy": 0.9622762501239777, + "num_tokens": 2598538.0, + "step": 295 + }, + { + "entropy": 2.069903999567032, + "epoch": 1.0687782805429864, + "grad_norm": 0.7641116380691528, + "learning_rate": 0.0004852819191797912, + "loss": 0.2185, + "mean_token_accuracy": 0.9464851468801498, + "num_tokens": 2608219.0, + "step": 296 + }, + { + "entropy": 2.163217008113861, + "epoch": 1.0723981900452488, + "grad_norm": 0.546085000038147, + "learning_rate": 0.0004850755918058449, + "loss": 0.1035, + "mean_token_accuracy": 0.9708487540483475, + "num_tokens": 2617261.0, + "step": 297 + }, + { + "entropy": 2.2678662836551666, + "epoch": 1.0760180995475113, + "grad_norm": 0.8699386119842529, + "learning_rate": 0.0004848684895166994, + "loss": 0.2384, + "mean_token_accuracy": 0.9486480504274368, + "num_tokens": 2626144.0, + "step": 298 + }, + { + "entropy": 2.13065105676651, + "epoch": 1.0796380090497737, + "grad_norm": 0.44323107600212097, + "learning_rate": 0.00048466061309895554, + "loss": 0.0818, + "mean_token_accuracy": 0.9722468554973602, + "num_tokens": 2635626.0, + "step": 299 + }, + { + "entropy": 2.184772551059723, + "epoch": 1.0832579185520361, + "grad_norm": 0.7928256988525391, + "learning_rate": 0.0004844519633421545, + "loss": 0.2378, + "mean_token_accuracy": 0.9477885961532593, + "num_tokens": 2644674.0, + "step": 300 + }, + { + "entropy": 2.1669145822525024, + "epoch": 1.0868778280542986, + "grad_norm": 0.5570158362388611, + "learning_rate": 0.00048424254103877456, + "loss": 0.1434, + "mean_token_accuracy": 0.9587411731481552, + "num_tokens": 2653658.0, + "step": 301 + }, + { + "entropy": 2.3057579398155212, + "epoch": 1.090497737556561, + "grad_norm": 0.9084392189979553, + "learning_rate": 0.00048403234698422837, + "loss": 0.3831, + "mean_token_accuracy": 0.8896283358335495, + "num_tokens": 2662350.0, + "step": 302 + }, + { + "entropy": 2.1741657853126526, + "epoch": 1.0941176470588236, + "grad_norm": 0.6791238784790039, + "learning_rate": 0.0004838213819768597, + "loss": 0.1648, + "mean_token_accuracy": 0.9576362520456314, + "num_tokens": 2671450.0, + "step": 303 + }, + { + "entropy": 2.089864045381546, + "epoch": 1.097737556561086, + "grad_norm": 0.5696312189102173, + "learning_rate": 0.0004836096468179406, + "loss": 0.1269, + "mean_token_accuracy": 0.9658148884773254, + "num_tokens": 2680581.0, + "step": 304 + }, + { + "entropy": 2.2657605409622192, + "epoch": 1.1013574660633485, + "grad_norm": 1.605503797531128, + "learning_rate": 0.0004833971423116682, + "loss": 0.1027, + "mean_token_accuracy": 0.9762597978115082, + "num_tokens": 2689001.0, + "step": 305 + }, + { + "entropy": 2.079287111759186, + "epoch": 1.104977375565611, + "grad_norm": 0.5804780721664429, + "learning_rate": 0.00048318386926516157, + "loss": 0.1137, + "mean_token_accuracy": 0.9633719325065613, + "num_tokens": 2698050.0, + "step": 306 + }, + { + "entropy": 2.201345145702362, + "epoch": 1.1085972850678734, + "grad_norm": 0.8606241941452026, + "learning_rate": 0.000482969828488459, + "loss": 0.2124, + "mean_token_accuracy": 0.9472681730985641, + "num_tokens": 2706704.0, + "step": 307 + }, + { + "entropy": 2.095236599445343, + "epoch": 1.1122171945701358, + "grad_norm": 0.7078782320022583, + "learning_rate": 0.0004827550207945147, + "loss": 0.1957, + "mean_token_accuracy": 0.9564679116010666, + "num_tokens": 2715745.0, + "step": 308 + }, + { + "entropy": 2.186302363872528, + "epoch": 1.1158371040723982, + "grad_norm": 0.7166503667831421, + "learning_rate": 0.0004825394469991956, + "loss": 0.1539, + "mean_token_accuracy": 0.9662427455186844, + "num_tokens": 2724296.0, + "step": 309 + }, + { + "entropy": 2.052559405565262, + "epoch": 1.1194570135746607, + "grad_norm": 0.6510501503944397, + "learning_rate": 0.00048232310792127846, + "loss": 0.1831, + "mean_token_accuracy": 0.9533994495868683, + "num_tokens": 2733482.0, + "step": 310 + }, + { + "entropy": 2.093154102563858, + "epoch": 1.123076923076923, + "grad_norm": 0.711121678352356, + "learning_rate": 0.0004821060043824466, + "loss": 0.2315, + "mean_token_accuracy": 0.9381555914878845, + "num_tokens": 2742912.0, + "step": 311 + }, + { + "entropy": 2.188497006893158, + "epoch": 1.1266968325791855, + "grad_norm": 0.6782490015029907, + "learning_rate": 0.00048188813720728707, + "loss": 0.2, + "mean_token_accuracy": 0.9501812607049942, + "num_tokens": 2751808.0, + "step": 312 + }, + { + "entropy": 2.0495824217796326, + "epoch": 1.130316742081448, + "grad_norm": 0.7644634246826172, + "learning_rate": 0.00048166950722328697, + "loss": 0.2152, + "mean_token_accuracy": 0.9440928995609283, + "num_tokens": 2761066.0, + "step": 313 + }, + { + "entropy": 2.1707025468349457, + "epoch": 1.1339366515837104, + "grad_norm": 0.655131459236145, + "learning_rate": 0.00048145011526083106, + "loss": 0.1637, + "mean_token_accuracy": 0.9500558227300644, + "num_tokens": 2769870.0, + "step": 314 + }, + { + "entropy": 2.1047372221946716, + "epoch": 1.1375565610859728, + "grad_norm": 0.5353516936302185, + "learning_rate": 0.0004812299621531979, + "loss": 0.1705, + "mean_token_accuracy": 0.9455999433994293, + "num_tokens": 2779383.0, + "step": 315 + }, + { + "entropy": 2.1921610236167908, + "epoch": 1.1411764705882352, + "grad_norm": 0.8998016119003296, + "learning_rate": 0.00048100904873655696, + "loss": 0.3918, + "mean_token_accuracy": 0.9382697492837906, + "num_tokens": 2788386.0, + "step": 316 + }, + { + "entropy": 2.0850723683834076, + "epoch": 1.1447963800904977, + "grad_norm": 0.867432713508606, + "learning_rate": 0.0004807873758499656, + "loss": 0.2196, + "mean_token_accuracy": 0.9498324394226074, + "num_tokens": 2797496.0, + "step": 317 + }, + { + "entropy": 2.1980925798416138, + "epoch": 1.14841628959276, + "grad_norm": 0.6076980233192444, + "learning_rate": 0.00048056494433536577, + "loss": 0.1086, + "mean_token_accuracy": 0.9642161130905151, + "num_tokens": 2805836.0, + "step": 318 + }, + { + "entropy": 2.15611070394516, + "epoch": 1.1520361990950225, + "grad_norm": 0.6276211738586426, + "learning_rate": 0.0004803417550375806, + "loss": 0.1463, + "mean_token_accuracy": 0.9622830748558044, + "num_tokens": 2814404.0, + "step": 319 + }, + { + "entropy": 2.0017230808734894, + "epoch": 1.155656108597285, + "grad_norm": 0.5840948820114136, + "learning_rate": 0.0004801178088043115, + "loss": 0.1869, + "mean_token_accuracy": 0.9506777077913284, + "num_tokens": 2823786.0, + "step": 320 + }, + { + "entropy": 2.1539418697357178, + "epoch": 1.1592760180995474, + "grad_norm": 1.074331283569336, + "learning_rate": 0.0004798931064861349, + "loss": 0.2797, + "mean_token_accuracy": 0.9271649420261383, + "num_tokens": 2832374.0, + "step": 321 + }, + { + "entropy": 1.930726408958435, + "epoch": 1.16289592760181, + "grad_norm": 0.5121958255767822, + "learning_rate": 0.0004796676489364988, + "loss": 0.1579, + "mean_token_accuracy": 0.9582571685314178, + "num_tokens": 2841561.0, + "step": 322 + }, + { + "entropy": 2.0205810368061066, + "epoch": 1.1665158371040725, + "grad_norm": 0.6360969543457031, + "learning_rate": 0.00047944143701171966, + "loss": 0.1582, + "mean_token_accuracy": 0.9620308429002762, + "num_tokens": 2850171.0, + "step": 323 + }, + { + "entropy": 1.9655758142471313, + "epoch": 1.170135746606335, + "grad_norm": 0.6647385358810425, + "learning_rate": 0.0004792144715709792, + "loss": 0.1594, + "mean_token_accuracy": 0.954497441649437, + "num_tokens": 2858905.0, + "step": 324 + }, + { + "entropy": 1.9725223183631897, + "epoch": 1.1737556561085973, + "grad_norm": 0.6429229974746704, + "learning_rate": 0.0004789867534763211, + "loss": 0.1407, + "mean_token_accuracy": 0.9645214527845383, + "num_tokens": 2867533.0, + "step": 325 + }, + { + "entropy": 1.9473685026168823, + "epoch": 1.1773755656108598, + "grad_norm": 0.811651349067688, + "learning_rate": 0.0004787582835926477, + "loss": 0.1608, + "mean_token_accuracy": 0.9479968994855881, + "num_tokens": 2876286.0, + "step": 326 + }, + { + "entropy": 1.8863109350204468, + "epoch": 1.1809954751131222, + "grad_norm": 0.5587059855461121, + "learning_rate": 0.00047852906278771686, + "loss": 0.131, + "mean_token_accuracy": 0.9684520065784454, + "num_tokens": 2885667.0, + "step": 327 + }, + { + "entropy": 1.8288891315460205, + "epoch": 1.1846153846153846, + "grad_norm": 0.8450536131858826, + "learning_rate": 0.0004782990919321383, + "loss": 0.2224, + "mean_token_accuracy": 0.9377491921186447, + "num_tokens": 2894765.0, + "step": 328 + }, + { + "entropy": 1.9347718358039856, + "epoch": 1.188235294117647, + "grad_norm": 0.7665867209434509, + "learning_rate": 0.0004780683718993705, + "loss": 0.167, + "mean_token_accuracy": 0.9583602845668793, + "num_tokens": 2903551.0, + "step": 329 + }, + { + "entropy": 1.9097798764705658, + "epoch": 1.1918552036199095, + "grad_norm": 0.7705667018890381, + "learning_rate": 0.00047783690356571784, + "loss": 0.2115, + "mean_token_accuracy": 0.9526428133249283, + "num_tokens": 2912197.0, + "step": 330 + }, + { + "entropy": 1.9174850285053253, + "epoch": 1.195475113122172, + "grad_norm": 0.5695499181747437, + "learning_rate": 0.00047760468781032634, + "loss": 0.1033, + "mean_token_accuracy": 0.969958484172821, + "num_tokens": 2920579.0, + "step": 331 + }, + { + "entropy": 1.8578442931175232, + "epoch": 1.1990950226244343, + "grad_norm": 0.7843735814094543, + "learning_rate": 0.000477371725515181, + "loss": 0.1664, + "mean_token_accuracy": 0.9545005410909653, + "num_tokens": 2929352.0, + "step": 332 + }, + { + "entropy": 1.8509328961372375, + "epoch": 1.2027149321266968, + "grad_norm": 0.5951048135757446, + "learning_rate": 0.0004771380175651026, + "loss": 0.1566, + "mean_token_accuracy": 0.9551403075456619, + "num_tokens": 2938387.0, + "step": 333 + }, + { + "entropy": 1.8236390948295593, + "epoch": 1.2063348416289592, + "grad_norm": 0.4988223910331726, + "learning_rate": 0.0004769035648477434, + "loss": 0.1242, + "mean_token_accuracy": 0.966319814324379, + "num_tokens": 2947741.0, + "step": 334 + }, + { + "entropy": 1.9594822525978088, + "epoch": 1.2099547511312216, + "grad_norm": 0.7550755143165588, + "learning_rate": 0.00047666836825358477, + "loss": 0.1591, + "mean_token_accuracy": 0.9666347652673721, + "num_tokens": 2956313.0, + "step": 335 + }, + { + "entropy": 1.9148444533348083, + "epoch": 1.213574660633484, + "grad_norm": 0.5889077186584473, + "learning_rate": 0.00047643242867593345, + "loss": 0.1343, + "mean_token_accuracy": 0.9611433297395706, + "num_tokens": 2964928.0, + "step": 336 + }, + { + "entropy": 1.8126957714557648, + "epoch": 1.2171945701357467, + "grad_norm": 0.5447750091552734, + "learning_rate": 0.0004761957470109179, + "loss": 0.1659, + "mean_token_accuracy": 0.9552300125360489, + "num_tokens": 2974160.0, + "step": 337 + }, + { + "entropy": 1.7981431782245636, + "epoch": 1.2208144796380092, + "grad_norm": 0.5400761365890503, + "learning_rate": 0.0004759583241574854, + "loss": 0.1339, + "mean_token_accuracy": 0.9620136916637421, + "num_tokens": 2982900.0, + "step": 338 + }, + { + "entropy": 1.8613979518413544, + "epoch": 1.2244343891402716, + "grad_norm": 0.7452914714813232, + "learning_rate": 0.0004757201610173981, + "loss": 0.4, + "mean_token_accuracy": 0.9068266004323959, + "num_tokens": 2991783.0, + "step": 339 + }, + { + "entropy": 1.8654026687145233, + "epoch": 1.228054298642534, + "grad_norm": 1.7142685651779175, + "learning_rate": 0.00047548125849523, + "loss": 0.3168, + "mean_token_accuracy": 0.9308896362781525, + "num_tokens": 3000530.0, + "step": 340 + }, + { + "entropy": 1.7702704071998596, + "epoch": 1.2316742081447964, + "grad_norm": 0.6687431931495667, + "learning_rate": 0.0004752416174983633, + "loss": 0.1697, + "mean_token_accuracy": 0.9530515670776367, + "num_tokens": 3009355.0, + "step": 341 + }, + { + "entropy": 1.735857516527176, + "epoch": 1.2352941176470589, + "grad_norm": 0.6127599477767944, + "learning_rate": 0.00047500123893698507, + "loss": 0.1706, + "mean_token_accuracy": 0.9593266248703003, + "num_tokens": 3018518.0, + "step": 342 + }, + { + "entropy": 1.7076368927955627, + "epoch": 1.2389140271493213, + "grad_norm": 0.6973987817764282, + "learning_rate": 0.0004747601237240836, + "loss": 0.1615, + "mean_token_accuracy": 0.9539438933134079, + "num_tokens": 3027752.0, + "step": 343 + }, + { + "entropy": 1.7353227138519287, + "epoch": 1.2425339366515837, + "grad_norm": 0.8406392335891724, + "learning_rate": 0.00047451827277544546, + "loss": 0.2063, + "mean_token_accuracy": 0.9488435834646225, + "num_tokens": 3036383.0, + "step": 344 + }, + { + "entropy": 1.6597246527671814, + "epoch": 1.2461538461538462, + "grad_norm": 0.5971431732177734, + "learning_rate": 0.00047427568700965107, + "loss": 0.1013, + "mean_token_accuracy": 0.9721864312887192, + "num_tokens": 3045375.0, + "step": 345 + }, + { + "entropy": 1.7100033462047577, + "epoch": 1.2497737556561086, + "grad_norm": 0.5883470773696899, + "learning_rate": 0.00047403236734807225, + "loss": 0.1164, + "mean_token_accuracy": 0.9664830714464188, + "num_tokens": 3054084.0, + "step": 346 + }, + { + "entropy": 1.7402609288692474, + "epoch": 1.253393665158371, + "grad_norm": 0.7355862855911255, + "learning_rate": 0.00047378831471486815, + "loss": 0.2007, + "mean_token_accuracy": 0.9560511559247971, + "num_tokens": 3062727.0, + "step": 347 + }, + { + "entropy": 1.79518261551857, + "epoch": 1.2570135746606335, + "grad_norm": 0.6006518006324768, + "learning_rate": 0.00047354353003698163, + "loss": 0.1085, + "mean_token_accuracy": 0.9598321914672852, + "num_tokens": 3071178.0, + "step": 348 + }, + { + "entropy": 1.7328391373157501, + "epoch": 1.260633484162896, + "grad_norm": 0.560342013835907, + "learning_rate": 0.0004732980142441362, + "loss": 0.1593, + "mean_token_accuracy": 0.9579409211874008, + "num_tokens": 3079927.0, + "step": 349 + }, + { + "entropy": 1.7356511652469635, + "epoch": 1.2642533936651583, + "grad_norm": 0.9149975776672363, + "learning_rate": 0.00047305176826883206, + "loss": 0.4064, + "mean_token_accuracy": 0.9265118837356567, + "num_tokens": 3089314.0, + "step": 350 + }, + { + "entropy": 1.8573569357395172, + "epoch": 1.2678733031674208, + "grad_norm": 0.8300670981407166, + "learning_rate": 0.0004728047930463428, + "loss": 0.195, + "mean_token_accuracy": 0.9453776180744171, + "num_tokens": 3097702.0, + "step": 351 + }, + { + "entropy": 1.7906217575073242, + "epoch": 1.2714932126696832, + "grad_norm": 0.5668906569480896, + "learning_rate": 0.0004725570895147118, + "loss": 0.1572, + "mean_token_accuracy": 0.962067037820816, + "num_tokens": 3106379.0, + "step": 352 + }, + { + "entropy": 1.6957395374774933, + "epoch": 1.2751131221719456, + "grad_norm": 0.4048328399658203, + "learning_rate": 0.0004723086586147487, + "loss": 0.0944, + "mean_token_accuracy": 0.9716819673776627, + "num_tokens": 3115622.0, + "step": 353 + }, + { + "entropy": 1.8158144056797028, + "epoch": 1.278733031674208, + "grad_norm": 0.6396092772483826, + "learning_rate": 0.00047205950129002564, + "loss": 0.1011, + "mean_token_accuracy": 0.9698463827371597, + "num_tokens": 3124016.0, + "step": 354 + }, + { + "entropy": 1.730194479227066, + "epoch": 1.2823529411764705, + "grad_norm": 0.662876307964325, + "learning_rate": 0.000471809618486874, + "loss": 0.1641, + "mean_token_accuracy": 0.9520179778337479, + "num_tokens": 3132712.0, + "step": 355 + }, + { + "entropy": 1.6776110529899597, + "epoch": 1.285972850678733, + "grad_norm": 0.868507981300354, + "learning_rate": 0.0004715590111543804, + "loss": 0.3374, + "mean_token_accuracy": 0.9303739666938782, + "num_tokens": 3142103.0, + "step": 356 + }, + { + "entropy": 1.6501678824424744, + "epoch": 1.2895927601809956, + "grad_norm": 0.5433686971664429, + "learning_rate": 0.0004713076802443834, + "loss": 0.1237, + "mean_token_accuracy": 0.9653612226247787, + "num_tokens": 3151192.0, + "step": 357 + }, + { + "entropy": 1.6524465382099152, + "epoch": 1.293212669683258, + "grad_norm": 0.6145523190498352, + "learning_rate": 0.00047105562671147, + "loss": 0.1204, + "mean_token_accuracy": 0.9690534323453903, + "num_tokens": 3159839.0, + "step": 358 + }, + { + "entropy": 1.5339214205741882, + "epoch": 1.2968325791855204, + "grad_norm": 0.500477135181427, + "learning_rate": 0.00047080285151297144, + "loss": 0.1295, + "mean_token_accuracy": 0.9571033865213394, + "num_tokens": 3169047.0, + "step": 359 + }, + { + "entropy": 1.6765435338020325, + "epoch": 1.3004524886877828, + "grad_norm": 0.6697553396224976, + "learning_rate": 0.00047054935560896026, + "loss": 0.135, + "mean_token_accuracy": 0.9672541171312332, + "num_tokens": 3177062.0, + "step": 360 + }, + { + "entropy": 1.5932062566280365, + "epoch": 1.3040723981900453, + "grad_norm": 0.706957221031189, + "learning_rate": 0.0004702951399622462, + "loss": 0.1229, + "mean_token_accuracy": 0.9634416699409485, + "num_tokens": 3185829.0, + "step": 361 + }, + { + "entropy": 1.5623145997524261, + "epoch": 1.3076923076923077, + "grad_norm": 0.6199461221694946, + "learning_rate": 0.00047004020553837275, + "loss": 0.1449, + "mean_token_accuracy": 0.9620065689086914, + "num_tokens": 3194426.0, + "step": 362 + }, + { + "entropy": 1.5226828753948212, + "epoch": 1.3113122171945701, + "grad_norm": 0.8962509036064148, + "learning_rate": 0.0004697845533056132, + "loss": 0.2207, + "mean_token_accuracy": 0.9403344839811325, + "num_tokens": 3203655.0, + "step": 363 + }, + { + "entropy": 1.5395641326904297, + "epoch": 1.3149321266968326, + "grad_norm": 0.5993619561195374, + "learning_rate": 0.00046952818423496727, + "loss": 0.1486, + "mean_token_accuracy": 0.9614185988903046, + "num_tokens": 3212069.0, + "step": 364 + }, + { + "entropy": 1.5738630294799805, + "epoch": 1.318552036199095, + "grad_norm": 0.7393983602523804, + "learning_rate": 0.00046927109930015756, + "loss": 0.1812, + "mean_token_accuracy": 0.9535021334886551, + "num_tokens": 3220482.0, + "step": 365 + }, + { + "entropy": 1.5462632775306702, + "epoch": 1.3221719457013574, + "grad_norm": 0.7453555464744568, + "learning_rate": 0.0004690132994776253, + "loss": 0.164, + "mean_token_accuracy": 0.9585814625024796, + "num_tokens": 3229505.0, + "step": 366 + }, + { + "entropy": 1.5241961777210236, + "epoch": 1.3257918552036199, + "grad_norm": 0.7553415298461914, + "learning_rate": 0.00046875478574652713, + "loss": 0.1445, + "mean_token_accuracy": 0.9682841598987579, + "num_tokens": 3238326.0, + "step": 367 + }, + { + "entropy": 1.5344699025154114, + "epoch": 1.3294117647058823, + "grad_norm": 0.8565949201583862, + "learning_rate": 0.0004684955590887311, + "loss": 0.2521, + "mean_token_accuracy": 0.920401468873024, + "num_tokens": 3247482.0, + "step": 368 + }, + { + "entropy": 1.5109277665615082, + "epoch": 1.3330316742081447, + "grad_norm": 0.5170580148696899, + "learning_rate": 0.00046823562048881295, + "loss": 0.1393, + "mean_token_accuracy": 0.9584086239337921, + "num_tokens": 3256464.0, + "step": 369 + }, + { + "entropy": 1.4666939079761505, + "epoch": 1.3366515837104074, + "grad_norm": 0.6995373368263245, + "learning_rate": 0.0004679749709340529, + "loss": 0.1726, + "mean_token_accuracy": 0.9477890431880951, + "num_tokens": 3265853.0, + "step": 370 + }, + { + "entropy": 1.4208430051803589, + "epoch": 1.3402714932126698, + "grad_norm": 1.1363991498947144, + "learning_rate": 0.000467713611414431, + "loss": 0.196, + "mean_token_accuracy": 0.9495431333780289, + "num_tokens": 3275367.0, + "step": 371 + }, + { + "entropy": 1.5009459853172302, + "epoch": 1.3438914027149322, + "grad_norm": 0.7883325219154358, + "learning_rate": 0.00046745154292262414, + "loss": 0.2526, + "mean_token_accuracy": 0.9334618002176285, + "num_tokens": 3284772.0, + "step": 372 + }, + { + "entropy": 1.5485479533672333, + "epoch": 1.3475113122171947, + "grad_norm": 0.6516429781913757, + "learning_rate": 0.00046718876645400156, + "loss": 0.2057, + "mean_token_accuracy": 0.9546459317207336, + "num_tokens": 3293493.0, + "step": 373 + }, + { + "entropy": 1.6237249970436096, + "epoch": 1.351131221719457, + "grad_norm": 0.8916263580322266, + "learning_rate": 0.00046692528300662213, + "loss": 0.2123, + "mean_token_accuracy": 0.9456845372915268, + "num_tokens": 3302063.0, + "step": 374 + }, + { + "entropy": 1.561572015285492, + "epoch": 1.3547511312217195, + "grad_norm": 0.7527791857719421, + "learning_rate": 0.00046666109358122935, + "loss": 0.2113, + "mean_token_accuracy": 0.9537477940320969, + "num_tokens": 3311037.0, + "step": 375 + }, + { + "entropy": 1.5594256818294525, + "epoch": 1.358371040723982, + "grad_norm": 1.25638747215271, + "learning_rate": 0.0004663961991812485, + "loss": 0.1629, + "mean_token_accuracy": 0.9508458077907562, + "num_tokens": 3319635.0, + "step": 376 + }, + { + "entropy": 1.6909976303577423, + "epoch": 1.3619909502262444, + "grad_norm": 0.7627813220024109, + "learning_rate": 0.00046613060081278194, + "loss": 0.2303, + "mean_token_accuracy": 0.9425801336765289, + "num_tokens": 3328043.0, + "step": 377 + }, + { + "entropy": 1.6074829697608948, + "epoch": 1.3656108597285068, + "grad_norm": 0.6584346294403076, + "learning_rate": 0.00046586429948460646, + "loss": 0.1815, + "mean_token_accuracy": 0.9536214470863342, + "num_tokens": 3337143.0, + "step": 378 + }, + { + "entropy": 1.7382183969020844, + "epoch": 1.3692307692307693, + "grad_norm": 1.37154221534729, + "learning_rate": 0.0004655972962081684, + "loss": 0.1849, + "mean_token_accuracy": 0.948440819978714, + "num_tokens": 3346033.0, + "step": 379 + }, + { + "entropy": 1.7148900926113129, + "epoch": 1.3728506787330317, + "grad_norm": 0.9487980604171753, + "learning_rate": 0.00046532959199758, + "loss": 0.2521, + "mean_token_accuracy": 0.9344504028558731, + "num_tokens": 3354849.0, + "step": 380 + }, + { + "entropy": 1.7164019346237183, + "epoch": 1.3764705882352941, + "grad_norm": 0.5609025359153748, + "learning_rate": 0.00046506118786961614, + "loss": 0.1425, + "mean_token_accuracy": 0.9571309834718704, + "num_tokens": 3363674.0, + "step": 381 + }, + { + "entropy": 1.894619107246399, + "epoch": 1.3800904977375565, + "grad_norm": 0.9811336994171143, + "learning_rate": 0.00046479208484370997, + "loss": 0.2522, + "mean_token_accuracy": 0.9424156546592712, + "num_tokens": 3372325.0, + "step": 382 + }, + { + "entropy": 1.78870290517807, + "epoch": 1.383710407239819, + "grad_norm": 0.5707085132598877, + "learning_rate": 0.00046452228394194893, + "loss": 0.1354, + "mean_token_accuracy": 0.9613165706396103, + "num_tokens": 3381270.0, + "step": 383 + }, + { + "entropy": 1.803922712802887, + "epoch": 1.3873303167420814, + "grad_norm": 0.5655364394187927, + "learning_rate": 0.0004642517861890713, + "loss": 0.0818, + "mean_token_accuracy": 0.9776160269975662, + "num_tokens": 3390363.0, + "step": 384 + }, + { + "entropy": 1.8172507882118225, + "epoch": 1.3909502262443438, + "grad_norm": 0.6950513124465942, + "learning_rate": 0.00046398059261246205, + "loss": 0.1145, + "mean_token_accuracy": 0.963288351893425, + "num_tokens": 3399176.0, + "step": 385 + }, + { + "entropy": 1.9182518422603607, + "epoch": 1.3945701357466063, + "grad_norm": 0.5900619029998779, + "learning_rate": 0.0004637087042421489, + "loss": 0.108, + "mean_token_accuracy": 0.9723307639360428, + "num_tokens": 3407978.0, + "step": 386 + }, + { + "entropy": 1.8558574616909027, + "epoch": 1.3981900452488687, + "grad_norm": 0.6279832124710083, + "learning_rate": 0.00046343612211079843, + "loss": 0.1471, + "mean_token_accuracy": 0.9603912532329559, + "num_tokens": 3416856.0, + "step": 387 + }, + { + "entropy": 1.8146779537200928, + "epoch": 1.4018099547511311, + "grad_norm": 0.6171274781227112, + "learning_rate": 0.0004631628472537125, + "loss": 0.1872, + "mean_token_accuracy": 0.9447146654129028, + "num_tokens": 3426044.0, + "step": 388 + }, + { + "entropy": 1.9342225790023804, + "epoch": 1.4054298642533936, + "grad_norm": 0.9947887659072876, + "learning_rate": 0.00046288888070882374, + "loss": 0.2966, + "mean_token_accuracy": 0.9279204607009888, + "num_tokens": 3435154.0, + "step": 389 + }, + { + "entropy": 1.9391801953315735, + "epoch": 1.409049773755656, + "grad_norm": 0.7155653834342957, + "learning_rate": 0.000462614223516692, + "loss": 0.1847, + "mean_token_accuracy": 0.9475171864032745, + "num_tokens": 3444563.0, + "step": 390 + }, + { + "entropy": 2.0716978013515472, + "epoch": 1.4126696832579184, + "grad_norm": 0.8198989629745483, + "learning_rate": 0.0004623388767205004, + "loss": 0.1317, + "mean_token_accuracy": 0.9608721435070038, + "num_tokens": 3453410.0, + "step": 391 + }, + { + "entropy": 2.1060431599617004, + "epoch": 1.416289592760181, + "grad_norm": 1.025406002998352, + "learning_rate": 0.00046206284136605106, + "loss": 0.2146, + "mean_token_accuracy": 0.9414294511079788, + "num_tokens": 3461958.0, + "step": 392 + }, + { + "entropy": 2.1459922194480896, + "epoch": 1.4199095022624435, + "grad_norm": 0.9209627509117126, + "learning_rate": 0.00046178611850176146, + "loss": 0.2137, + "mean_token_accuracy": 0.956874743103981, + "num_tokens": 3470547.0, + "step": 393 + }, + { + "entropy": 2.0233450531959534, + "epoch": 1.423529411764706, + "grad_norm": 0.5777944922447205, + "learning_rate": 0.00046150870917866025, + "loss": 0.122, + "mean_token_accuracy": 0.9672323018312454, + "num_tokens": 3479618.0, + "step": 394 + }, + { + "entropy": 2.035937190055847, + "epoch": 1.4271493212669684, + "grad_norm": 0.7945542931556702, + "learning_rate": 0.0004612306144503835, + "loss": 0.2879, + "mean_token_accuracy": 0.946587473154068, + "num_tokens": 3488533.0, + "step": 395 + }, + { + "entropy": 2.155315637588501, + "epoch": 1.4307692307692308, + "grad_norm": 0.6385292410850525, + "learning_rate": 0.00046095183537317035, + "loss": 0.1008, + "mean_token_accuracy": 0.9655124247074127, + "num_tokens": 3496686.0, + "step": 396 + }, + { + "entropy": 2.186827063560486, + "epoch": 1.4343891402714932, + "grad_norm": 0.4759826958179474, + "learning_rate": 0.0004606723730058593, + "loss": 0.0768, + "mean_token_accuracy": 0.9783597737550735, + "num_tokens": 3504958.0, + "step": 397 + }, + { + "entropy": 1.974392294883728, + "epoch": 1.4380090497737557, + "grad_norm": 0.6250292062759399, + "learning_rate": 0.00046039222840988406, + "loss": 0.1381, + "mean_token_accuracy": 0.9586146324872971, + "num_tokens": 3513694.0, + "step": 398 + }, + { + "entropy": 2.045738846063614, + "epoch": 1.441628959276018, + "grad_norm": 0.5517769455909729, + "learning_rate": 0.0004601114026492695, + "loss": 0.1312, + "mean_token_accuracy": 0.9682512134313583, + "num_tokens": 3522395.0, + "step": 399 + }, + { + "entropy": 2.105030357837677, + "epoch": 1.4452488687782805, + "grad_norm": 0.6748242974281311, + "learning_rate": 0.0004598298967906276, + "loss": 0.1056, + "mean_token_accuracy": 0.9701305478811264, + "num_tokens": 3530838.0, + "step": 400 + }, + { + "entropy": 2.024325281381607, + "epoch": 1.448868778280543, + "grad_norm": 0.6320233941078186, + "learning_rate": 0.00045954771190315344, + "loss": 0.1129, + "mean_token_accuracy": 0.9633017927408218, + "num_tokens": 3540184.0, + "step": 401 + }, + { + "entropy": 2.1561593413352966, + "epoch": 1.4524886877828054, + "grad_norm": 0.7380363941192627, + "learning_rate": 0.0004592648490586213, + "loss": 0.1304, + "mean_token_accuracy": 0.9599586874246597, + "num_tokens": 3548727.0, + "step": 402 + }, + { + "entropy": 2.2986454367637634, + "epoch": 1.4561085972850678, + "grad_norm": 0.669114351272583, + "learning_rate": 0.00045898130933138024, + "loss": 0.1005, + "mean_token_accuracy": 0.9724964797496796, + "num_tokens": 3556780.0, + "step": 403 + }, + { + "entropy": 2.103136509656906, + "epoch": 1.4597285067873302, + "grad_norm": 0.6677402853965759, + "learning_rate": 0.0004586970937983504, + "loss": 0.1177, + "mean_token_accuracy": 0.9597653448581696, + "num_tokens": 3565427.0, + "step": 404 + }, + { + "entropy": 2.112696200609207, + "epoch": 1.463348416289593, + "grad_norm": 0.4597342014312744, + "learning_rate": 0.0004584122035390185, + "loss": 0.0695, + "mean_token_accuracy": 0.9763098359107971, + "num_tokens": 3573902.0, + "step": 405 + }, + { + "entropy": 2.0472628474235535, + "epoch": 1.4669683257918553, + "grad_norm": 0.7842056751251221, + "learning_rate": 0.0004581266396354339, + "loss": 0.1981, + "mean_token_accuracy": 0.9521032422780991, + "num_tokens": 3582913.0, + "step": 406 + }, + { + "entropy": 2.236558735370636, + "epoch": 1.4705882352941178, + "grad_norm": 0.7634767293930054, + "learning_rate": 0.000457840403172205, + "loss": 0.1956, + "mean_token_accuracy": 0.9602932929992676, + "num_tokens": 3591197.0, + "step": 407 + }, + { + "entropy": 2.182949125766754, + "epoch": 1.4742081447963802, + "grad_norm": 0.7084661722183228, + "learning_rate": 0.00045755349523649415, + "loss": 0.2463, + "mean_token_accuracy": 0.9392582327127457, + "num_tokens": 3600134.0, + "step": 408 + }, + { + "entropy": 2.135133147239685, + "epoch": 1.4778280542986426, + "grad_norm": 0.8172940015792847, + "learning_rate": 0.00045726591691801433, + "loss": 0.2375, + "mean_token_accuracy": 0.9458330571651459, + "num_tokens": 3608945.0, + "step": 409 + }, + { + "entropy": 2.157473146915436, + "epoch": 1.481447963800905, + "grad_norm": 0.6165594458580017, + "learning_rate": 0.0004569776693090246, + "loss": 0.1628, + "mean_token_accuracy": 0.9586529731750488, + "num_tokens": 3617790.0, + "step": 410 + }, + { + "entropy": 2.15165376663208, + "epoch": 1.4850678733031675, + "grad_norm": 0.6619407534599304, + "learning_rate": 0.0004566887535043263, + "loss": 0.1866, + "mean_token_accuracy": 0.9545126557350159, + "num_tokens": 3626937.0, + "step": 411 + }, + { + "entropy": 2.271161735057831, + "epoch": 1.48868778280543, + "grad_norm": 0.5861835479736328, + "learning_rate": 0.0004563991706012582, + "loss": 0.1409, + "mean_token_accuracy": 0.9595955163240433, + "num_tokens": 3636025.0, + "step": 412 + }, + { + "entropy": 2.277799427509308, + "epoch": 1.4923076923076923, + "grad_norm": 0.6464956402778625, + "learning_rate": 0.00045610892169969323, + "loss": 0.0792, + "mean_token_accuracy": 0.9806316941976547, + "num_tokens": 3644746.0, + "step": 413 + }, + { + "entropy": 2.2143171429634094, + "epoch": 1.4959276018099548, + "grad_norm": 0.7531687021255493, + "learning_rate": 0.00045581800790203366, + "loss": 0.2584, + "mean_token_accuracy": 0.9225966930389404, + "num_tokens": 3654064.0, + "step": 414 + }, + { + "entropy": 2.231681764125824, + "epoch": 1.4995475113122172, + "grad_norm": 0.6902768015861511, + "learning_rate": 0.00045552643031320726, + "loss": 0.232, + "mean_token_accuracy": 0.9433842301368713, + "num_tokens": 3663130.0, + "step": 415 + }, + { + "entropy": 2.2672717571258545, + "epoch": 1.5031674208144796, + "grad_norm": 0.5134314894676208, + "learning_rate": 0.00045523419004066273, + "loss": 0.0874, + "mean_token_accuracy": 0.9708191752433777, + "num_tokens": 3671981.0, + "step": 416 + }, + { + "entropy": 2.3302834033966064, + "epoch": 1.506787330316742, + "grad_norm": 0.885969340801239, + "learning_rate": 0.0004549412881943659, + "loss": 0.0723, + "mean_token_accuracy": 0.9791463166475296, + "num_tokens": 3680525.0, + "step": 417 + }, + { + "entropy": 2.2693899869918823, + "epoch": 1.5104072398190045, + "grad_norm": 0.7424856424331665, + "learning_rate": 0.00045464772588679547, + "loss": 0.1509, + "mean_token_accuracy": 0.9600907415151596, + "num_tokens": 3689430.0, + "step": 418 + }, + { + "entropy": 2.4042725563049316, + "epoch": 1.514027149321267, + "grad_norm": 0.8968034982681274, + "learning_rate": 0.0004543535042329382, + "loss": 0.1984, + "mean_token_accuracy": 0.9488537162542343, + "num_tokens": 3697836.0, + "step": 419 + }, + { + "entropy": 2.2518428564071655, + "epoch": 1.5176470588235293, + "grad_norm": 0.5963534712791443, + "learning_rate": 0.0004540586243502858, + "loss": 0.1214, + "mean_token_accuracy": 0.9711381644010544, + "num_tokens": 3706675.0, + "step": 420 + }, + { + "entropy": 2.275522291660309, + "epoch": 1.5212669683257918, + "grad_norm": 1.0797090530395508, + "learning_rate": 0.0004537630873588293, + "loss": 0.2508, + "mean_token_accuracy": 0.9247037768363953, + "num_tokens": 3715631.0, + "step": 421 + }, + { + "entropy": 2.249617278575897, + "epoch": 1.5248868778280542, + "grad_norm": 0.7636313438415527, + "learning_rate": 0.000453466894381056, + "loss": 0.1112, + "mean_token_accuracy": 0.9681926071643829, + "num_tokens": 3724579.0, + "step": 422 + }, + { + "entropy": 2.280571699142456, + "epoch": 1.5285067873303166, + "grad_norm": 0.9915648698806763, + "learning_rate": 0.00045317004654194464, + "loss": 0.3532, + "mean_token_accuracy": 0.9360047876834869, + "num_tokens": 3733607.0, + "step": 423 + }, + { + "entropy": 2.241512656211853, + "epoch": 1.532126696832579, + "grad_norm": 0.924977719783783, + "learning_rate": 0.0004528725449689611, + "loss": 0.1997, + "mean_token_accuracy": 0.9475428760051727, + "num_tokens": 3742611.0, + "step": 424 + }, + { + "entropy": 2.201731503009796, + "epoch": 1.5357466063348415, + "grad_norm": 0.7018861770629883, + "learning_rate": 0.0004525743907920542, + "loss": 0.1683, + "mean_token_accuracy": 0.9465018659830093, + "num_tokens": 3751737.0, + "step": 425 + }, + { + "entropy": 2.28944593667984, + "epoch": 1.539366515837104, + "grad_norm": 0.5893452763557434, + "learning_rate": 0.00045227558514365166, + "loss": 0.0969, + "mean_token_accuracy": 0.9711766839027405, + "num_tokens": 3761245.0, + "step": 426 + }, + { + "entropy": 2.3497202396392822, + "epoch": 1.5429864253393664, + "grad_norm": 0.685279130935669, + "learning_rate": 0.0004519761291586551, + "loss": 0.106, + "mean_token_accuracy": 0.9663016647100449, + "num_tokens": 3769854.0, + "step": 427 + }, + { + "entropy": 2.308362066745758, + "epoch": 1.5466063348416288, + "grad_norm": 0.5116177797317505, + "learning_rate": 0.00045167602397443694, + "loss": 0.1132, + "mean_token_accuracy": 0.9700013697147369, + "num_tokens": 3778996.0, + "step": 428 + }, + { + "entropy": 2.238637685775757, + "epoch": 1.5502262443438914, + "grad_norm": 0.8374833464622498, + "learning_rate": 0.00045137527073083457, + "loss": 0.2539, + "mean_token_accuracy": 0.9407305717468262, + "num_tokens": 3787835.0, + "step": 429 + }, + { + "entropy": 2.3406758308410645, + "epoch": 1.5538461538461539, + "grad_norm": 0.5140913724899292, + "learning_rate": 0.0004510738705701473, + "loss": 0.1113, + "mean_token_accuracy": 0.9635641574859619, + "num_tokens": 3796498.0, + "step": 430 + }, + { + "entropy": 2.2642539143562317, + "epoch": 1.5574660633484163, + "grad_norm": 0.5750702023506165, + "learning_rate": 0.0004507718246371313, + "loss": 0.1127, + "mean_token_accuracy": 0.9660817235708237, + "num_tokens": 3805464.0, + "step": 431 + }, + { + "entropy": 2.2058264315128326, + "epoch": 1.5610859728506787, + "grad_norm": 0.6448659300804138, + "learning_rate": 0.0004504691340789955, + "loss": 0.0994, + "mean_token_accuracy": 0.96739861369133, + "num_tokens": 3814309.0, + "step": 432 + }, + { + "entropy": 2.330399215221405, + "epoch": 1.5647058823529412, + "grad_norm": 0.8432528376579285, + "learning_rate": 0.0004501658000453973, + "loss": 0.1999, + "mean_token_accuracy": 0.9510775059461594, + "num_tokens": 3823126.0, + "step": 433 + }, + { + "entropy": 2.4211326837539673, + "epoch": 1.5683257918552036, + "grad_norm": 0.8101194500923157, + "learning_rate": 0.00044986182368843806, + "loss": 0.144, + "mean_token_accuracy": 0.9656328558921814, + "num_tokens": 3831274.0, + "step": 434 + }, + { + "entropy": 2.2594956755638123, + "epoch": 1.571945701357466, + "grad_norm": 0.6753663420677185, + "learning_rate": 0.0004495572061626585, + "loss": 0.1433, + "mean_token_accuracy": 0.9572386592626572, + "num_tokens": 3840206.0, + "step": 435 + }, + { + "entropy": 2.1233682930469513, + "epoch": 1.5755656108597285, + "grad_norm": 0.48616713285446167, + "learning_rate": 0.000449251948625035, + "loss": 0.0934, + "mean_token_accuracy": 0.9740773588418961, + "num_tokens": 3849363.0, + "step": 436 + }, + { + "entropy": 2.325556695461273, + "epoch": 1.5791855203619911, + "grad_norm": 0.7744045853614807, + "learning_rate": 0.00044894605223497446, + "loss": 0.127, + "mean_token_accuracy": 0.9687052518129349, + "num_tokens": 3857733.0, + "step": 437 + }, + { + "entropy": 2.266542673110962, + "epoch": 1.5828054298642535, + "grad_norm": 2.373530387878418, + "learning_rate": 0.00044863951815431045, + "loss": 0.2404, + "mean_token_accuracy": 0.9437267184257507, + "num_tokens": 3866374.0, + "step": 438 + }, + { + "entropy": 2.1757248640060425, + "epoch": 1.586425339366516, + "grad_norm": 0.5588560700416565, + "learning_rate": 0.00044833234754729847, + "loss": 0.142, + "mean_token_accuracy": 0.9601300358772278, + "num_tokens": 3875520.0, + "step": 439 + }, + { + "entropy": 2.124377518892288, + "epoch": 1.5900452488687784, + "grad_norm": 0.5602438449859619, + "learning_rate": 0.0004480245415806116, + "loss": 0.1556, + "mean_token_accuracy": 0.9561446160078049, + "num_tokens": 3884345.0, + "step": 440 + }, + { + "entropy": 2.1571075320243835, + "epoch": 1.5936651583710408, + "grad_norm": 0.472598671913147, + "learning_rate": 0.0004477161014233361, + "loss": 0.0848, + "mean_token_accuracy": 0.9742853343486786, + "num_tokens": 3893129.0, + "step": 441 + }, + { + "entropy": 2.0434057414531708, + "epoch": 1.5972850678733033, + "grad_norm": 0.7104448676109314, + "learning_rate": 0.00044740702824696703, + "loss": 0.1524, + "mean_token_accuracy": 0.9542464315891266, + "num_tokens": 3902120.0, + "step": 442 + }, + { + "entropy": 2.1118403673171997, + "epoch": 1.6009049773755657, + "grad_norm": 0.6632394194602966, + "learning_rate": 0.0004470973232254037, + "loss": 0.3001, + "mean_token_accuracy": 0.928197592496872, + "num_tokens": 3910974.0, + "step": 443 + }, + { + "entropy": 2.0292475819587708, + "epoch": 1.6045248868778281, + "grad_norm": 1.050956130027771, + "learning_rate": 0.00044678698753494527, + "loss": 0.2226, + "mean_token_accuracy": 0.9448522627353668, + "num_tokens": 3920005.0, + "step": 444 + }, + { + "entropy": 1.991033524274826, + "epoch": 1.6081447963800906, + "grad_norm": 0.670244038105011, + "learning_rate": 0.00044647602235428624, + "loss": 0.2158, + "mean_token_accuracy": 0.9551118016242981, + "num_tokens": 3929334.0, + "step": 445 + }, + { + "entropy": 2.04949289560318, + "epoch": 1.611764705882353, + "grad_norm": 0.6321494579315186, + "learning_rate": 0.00044616442886451197, + "loss": 0.1743, + "mean_token_accuracy": 0.9494802355766296, + "num_tokens": 3938211.0, + "step": 446 + }, + { + "entropy": 2.1101951897144318, + "epoch": 1.6153846153846154, + "grad_norm": 0.6970012187957764, + "learning_rate": 0.0004458522082490943, + "loss": 0.1228, + "mean_token_accuracy": 0.9624926447868347, + "num_tokens": 3946534.0, + "step": 447 + }, + { + "entropy": 1.9337081909179688, + "epoch": 1.6190045248868778, + "grad_norm": 0.5971657633781433, + "learning_rate": 0.0004455393616938868, + "loss": 0.1431, + "mean_token_accuracy": 0.9635348320007324, + "num_tokens": 3955694.0, + "step": 448 + }, + { + "entropy": 1.9635128676891327, + "epoch": 1.6226244343891403, + "grad_norm": 0.8510827422142029, + "learning_rate": 0.00044522589038712074, + "loss": 0.2446, + "mean_token_accuracy": 0.9457641988992691, + "num_tokens": 3964907.0, + "step": 449 + }, + { + "entropy": 2.0336360335350037, + "epoch": 1.6262443438914027, + "grad_norm": 0.5803818106651306, + "learning_rate": 0.00044491179551939985, + "loss": 0.0872, + "mean_token_accuracy": 0.9734505414962769, + "num_tokens": 3973584.0, + "step": 450 + }, + { + "entropy": 2.0668878853321075, + "epoch": 1.6298642533936651, + "grad_norm": 0.6990496516227722, + "learning_rate": 0.0004445970782836967, + "loss": 0.1138, + "mean_token_accuracy": 0.9702571034431458, + "num_tokens": 3982632.0, + "step": 451 + }, + { + "entropy": 2.1481760144233704, + "epoch": 1.6334841628959276, + "grad_norm": 0.6156729459762573, + "learning_rate": 0.00044428173987534733, + "loss": 0.0936, + "mean_token_accuracy": 0.9739355593919754, + "num_tokens": 3991147.0, + "step": 452 + }, + { + "entropy": 2.0678701996803284, + "epoch": 1.63710407239819, + "grad_norm": 0.5441684126853943, + "learning_rate": 0.0004439657814920472, + "loss": 0.123, + "mean_token_accuracy": 0.9693446308374405, + "num_tokens": 3999990.0, + "step": 453 + }, + { + "entropy": 1.9867055118083954, + "epoch": 1.6407239819004524, + "grad_norm": 0.9218093156814575, + "learning_rate": 0.00044364920433384656, + "loss": 0.1997, + "mean_token_accuracy": 0.9564195573329926, + "num_tokens": 4009097.0, + "step": 454 + }, + { + "entropy": 2.145586997270584, + "epoch": 1.6443438914027149, + "grad_norm": 0.77643883228302, + "learning_rate": 0.0004433320096031458, + "loss": 0.1491, + "mean_token_accuracy": 0.9602408111095428, + "num_tokens": 4018059.0, + "step": 455 + }, + { + "entropy": 2.071108251810074, + "epoch": 1.6479638009049773, + "grad_norm": 0.5267088413238525, + "learning_rate": 0.0004430141985046909, + "loss": 0.0875, + "mean_token_accuracy": 0.9764399826526642, + "num_tokens": 4027089.0, + "step": 456 + }, + { + "entropy": 2.1659318804740906, + "epoch": 1.6515837104072397, + "grad_norm": 1.0642318725585938, + "learning_rate": 0.000442695772245569, + "loss": 0.2623, + "mean_token_accuracy": 0.9307756721973419, + "num_tokens": 4035719.0, + "step": 457 + }, + { + "entropy": 2.0232724249362946, + "epoch": 1.6552036199095022, + "grad_norm": 0.6213289499282837, + "learning_rate": 0.0004423767320352035, + "loss": 0.1597, + "mean_token_accuracy": 0.9599647223949432, + "num_tokens": 4045088.0, + "step": 458 + }, + { + "entropy": 2.047410547733307, + "epoch": 1.6588235294117646, + "grad_norm": 0.6346105933189392, + "learning_rate": 0.0004420570790853498, + "loss": 0.1422, + "mean_token_accuracy": 0.9649711549282074, + "num_tokens": 4054262.0, + "step": 459 + }, + { + "entropy": 2.0923012793064117, + "epoch": 1.662443438914027, + "grad_norm": 0.46477749943733215, + "learning_rate": 0.0004417368146100907, + "loss": 0.079, + "mean_token_accuracy": 0.9777993708848953, + "num_tokens": 4063107.0, + "step": 460 + }, + { + "entropy": 2.168913394212723, + "epoch": 1.6660633484162894, + "grad_norm": 0.5164734721183777, + "learning_rate": 0.0004414159398258312, + "loss": 0.0941, + "mean_token_accuracy": 0.9725133627653122, + "num_tokens": 4071656.0, + "step": 461 + }, + { + "entropy": 2.152670443058014, + "epoch": 1.6696832579185519, + "grad_norm": 0.8985757231712341, + "learning_rate": 0.00044109445595129495, + "loss": 0.2142, + "mean_token_accuracy": 0.9387252777814865, + "num_tokens": 4080023.0, + "step": 462 + }, + { + "entropy": 2.111784875392914, + "epoch": 1.6733031674208145, + "grad_norm": 0.47521084547042847, + "learning_rate": 0.0004407723642075184, + "loss": 0.0581, + "mean_token_accuracy": 0.9821985810995102, + "num_tokens": 4088469.0, + "step": 463 + }, + { + "entropy": 1.9784683287143707, + "epoch": 1.676923076923077, + "grad_norm": 0.5552536249160767, + "learning_rate": 0.0004404496658178472, + "loss": 0.1353, + "mean_token_accuracy": 0.9619844257831573, + "num_tokens": 4097737.0, + "step": 464 + }, + { + "entropy": 2.015674114227295, + "epoch": 1.6805429864253394, + "grad_norm": 0.6078305244445801, + "learning_rate": 0.0004401263620079309, + "loss": 0.1916, + "mean_token_accuracy": 0.9506707191467285, + "num_tokens": 4107156.0, + "step": 465 + }, + { + "entropy": 2.0832217931747437, + "epoch": 1.6841628959276018, + "grad_norm": 0.6618755459785461, + "learning_rate": 0.0004398024540057186, + "loss": 0.1671, + "mean_token_accuracy": 0.9617152661085129, + "num_tokens": 4116019.0, + "step": 466 + }, + { + "entropy": 2.0383114516735077, + "epoch": 1.6877828054298643, + "grad_norm": 0.5774693489074707, + "learning_rate": 0.0004394779430414541, + "loss": 0.2647, + "mean_token_accuracy": 0.9387127161026001, + "num_tokens": 4125001.0, + "step": 467 + }, + { + "entropy": 2.201409190893173, + "epoch": 1.6914027149321267, + "grad_norm": 0.7600311636924744, + "learning_rate": 0.0004391528303476715, + "loss": 0.073, + "mean_token_accuracy": 0.979825034737587, + "num_tokens": 4133467.0, + "step": 468 + }, + { + "entropy": 2.168666422367096, + "epoch": 1.6950226244343891, + "grad_norm": 0.7801902294158936, + "learning_rate": 0.00043882711715919015, + "loss": 0.2406, + "mean_token_accuracy": 0.9451306313276291, + "num_tokens": 4141765.0, + "step": 469 + }, + { + "entropy": 2.1429262161254883, + "epoch": 1.6986425339366515, + "grad_norm": 0.5192358493804932, + "learning_rate": 0.0004385008047131104, + "loss": 0.1052, + "mean_token_accuracy": 0.9749262481927872, + "num_tokens": 4150732.0, + "step": 470 + }, + { + "entropy": 2.1387495696544647, + "epoch": 1.702262443438914, + "grad_norm": 0.6219777464866638, + "learning_rate": 0.0004381738942488083, + "loss": 0.2127, + "mean_token_accuracy": 0.9398418068885803, + "num_tokens": 4159715.0, + "step": 471 + }, + { + "entropy": 2.1718398332595825, + "epoch": 1.7058823529411766, + "grad_norm": 0.5738123655319214, + "learning_rate": 0.0004378463870079316, + "loss": 0.1703, + "mean_token_accuracy": 0.9520847648382187, + "num_tokens": 4168526.0, + "step": 472 + }, + { + "entropy": 2.2768235206604004, + "epoch": 1.709502262443439, + "grad_norm": 0.662564754486084, + "learning_rate": 0.00043751828423439456, + "loss": 0.138, + "mean_token_accuracy": 0.9581841826438904, + "num_tokens": 4177189.0, + "step": 473 + }, + { + "entropy": 2.29143089056015, + "epoch": 1.7131221719457015, + "grad_norm": 0.8638074398040771, + "learning_rate": 0.00043718958717437324, + "loss": 0.1432, + "mean_token_accuracy": 0.9645630270242691, + "num_tokens": 4185367.0, + "step": 474 + }, + { + "entropy": 2.2810245156288147, + "epoch": 1.716742081447964, + "grad_norm": 0.6139346957206726, + "learning_rate": 0.00043686029707630097, + "loss": 0.173, + "mean_token_accuracy": 0.9592728316783905, + "num_tokens": 4194418.0, + "step": 475 + }, + { + "entropy": 2.1307725310325623, + "epoch": 1.7203619909502263, + "grad_norm": 0.5192779302597046, + "learning_rate": 0.00043653041519086354, + "loss": 0.1025, + "mean_token_accuracy": 0.970764696598053, + "num_tokens": 4203705.0, + "step": 476 + }, + { + "entropy": 2.160595118999481, + "epoch": 1.7239819004524888, + "grad_norm": 0.7398526668548584, + "learning_rate": 0.0004361999427709943, + "loss": 0.229, + "mean_token_accuracy": 0.9352773874998093, + "num_tokens": 4212648.0, + "step": 477 + }, + { + "entropy": 2.1865442991256714, + "epoch": 1.7276018099547512, + "grad_norm": 0.6227203011512756, + "learning_rate": 0.0004358688810718699, + "loss": 0.1118, + "mean_token_accuracy": 0.9689576476812363, + "num_tokens": 4221208.0, + "step": 478 + }, + { + "entropy": 2.086527943611145, + "epoch": 1.7312217194570136, + "grad_norm": 0.722144603729248, + "learning_rate": 0.00043553723135090447, + "loss": 0.1656, + "mean_token_accuracy": 0.9537550210952759, + "num_tokens": 4230810.0, + "step": 479 + }, + { + "entropy": 2.068355441093445, + "epoch": 1.734841628959276, + "grad_norm": 0.5781517028808594, + "learning_rate": 0.0004352049948677462, + "loss": 0.1497, + "mean_token_accuracy": 0.9600837379693985, + "num_tokens": 4240394.0, + "step": 480 + }, + { + "entropy": 2.185140371322632, + "epoch": 1.7384615384615385, + "grad_norm": 0.7261873483657837, + "learning_rate": 0.0004348721728842715, + "loss": 0.1582, + "mean_token_accuracy": 0.9584025889635086, + "num_tokens": 4249205.0, + "step": 481 + }, + { + "entropy": 2.21835720539093, + "epoch": 1.742081447963801, + "grad_norm": 0.5321667194366455, + "learning_rate": 0.0004345387666645807, + "loss": 0.1344, + "mean_token_accuracy": 0.9659005403518677, + "num_tokens": 4257808.0, + "step": 482 + }, + { + "entropy": 2.078131854534149, + "epoch": 1.7457013574660634, + "grad_norm": 0.5598498582839966, + "learning_rate": 0.00043420477747499307, + "loss": 0.1347, + "mean_token_accuracy": 0.9678008407354355, + "num_tokens": 4266728.0, + "step": 483 + }, + { + "entropy": 2.060504525899887, + "epoch": 1.7493212669683258, + "grad_norm": 0.5017166137695312, + "learning_rate": 0.0004338702065840422, + "loss": 0.0722, + "mean_token_accuracy": 0.9762782007455826, + "num_tokens": 4275514.0, + "step": 484 + }, + { + "entropy": 2.165244698524475, + "epoch": 1.7529411764705882, + "grad_norm": 0.4664002060890198, + "learning_rate": 0.00043353505526247084, + "loss": 0.1206, + "mean_token_accuracy": 0.9696767777204514, + "num_tokens": 4284013.0, + "step": 485 + }, + { + "entropy": 2.103049159049988, + "epoch": 1.7565610859728507, + "grad_norm": 0.6669000387191772, + "learning_rate": 0.0004331993247832265, + "loss": 0.1052, + "mean_token_accuracy": 0.9665459096431732, + "num_tokens": 4293011.0, + "step": 486 + }, + { + "entropy": 2.1286613941192627, + "epoch": 1.760180995475113, + "grad_norm": 0.7821269631385803, + "learning_rate": 0.00043286301642145634, + "loss": 0.3669, + "mean_token_accuracy": 0.9062697291374207, + "num_tokens": 4301965.0, + "step": 487 + }, + { + "entropy": 2.098009169101715, + "epoch": 1.7638009049773755, + "grad_norm": 0.5720731616020203, + "learning_rate": 0.0004325261314545024, + "loss": 0.1324, + "mean_token_accuracy": 0.9650943875312805, + "num_tokens": 4310914.0, + "step": 488 + }, + { + "entropy": 2.164614498615265, + "epoch": 1.767420814479638, + "grad_norm": 1.0500473976135254, + "learning_rate": 0.0004321886711618967, + "loss": 0.1182, + "mean_token_accuracy": 0.9720661342144012, + "num_tokens": 4319072.0, + "step": 489 + }, + { + "entropy": 2.2015402913093567, + "epoch": 1.7710407239819004, + "grad_norm": 0.5770253539085388, + "learning_rate": 0.00043185063682535634, + "loss": 0.1226, + "mean_token_accuracy": 0.9615659862756729, + "num_tokens": 4327539.0, + "step": 490 + }, + { + "entropy": 2.075456440448761, + "epoch": 1.7746606334841628, + "grad_norm": 0.6456925272941589, + "learning_rate": 0.0004315120297287789, + "loss": 0.1123, + "mean_token_accuracy": 0.9628709554672241, + "num_tokens": 4336523.0, + "step": 491 + }, + { + "entropy": 2.158169150352478, + "epoch": 1.7782805429864252, + "grad_norm": 0.8282069563865662, + "learning_rate": 0.00043117285115823733, + "loss": 0.2146, + "mean_token_accuracy": 0.9413971602916718, + "num_tokens": 4345294.0, + "step": 492 + }, + { + "entropy": 2.02735897898674, + "epoch": 1.7819004524886877, + "grad_norm": 0.783597469329834, + "learning_rate": 0.000430833102401975, + "loss": 0.1376, + "mean_token_accuracy": 0.964630737900734, + "num_tokens": 4354107.0, + "step": 493 + }, + { + "entropy": 2.138492166996002, + "epoch": 1.78552036199095, + "grad_norm": 0.6317175030708313, + "learning_rate": 0.000430492784750401, + "loss": 0.1005, + "mean_token_accuracy": 0.9734214246273041, + "num_tokens": 4362560.0, + "step": 494 + }, + { + "entropy": 2.0253217220306396, + "epoch": 1.7891402714932125, + "grad_norm": 0.5523395538330078, + "learning_rate": 0.000430151899496085, + "loss": 0.1633, + "mean_token_accuracy": 0.9558031558990479, + "num_tokens": 4371698.0, + "step": 495 + }, + { + "entropy": 2.160472810268402, + "epoch": 1.792760180995475, + "grad_norm": 0.6557935476303101, + "learning_rate": 0.00042981044793375295, + "loss": 0.1154, + "mean_token_accuracy": 0.9722230583429337, + "num_tokens": 4380612.0, + "step": 496 + }, + { + "entropy": 2.0284159183502197, + "epoch": 1.7963800904977374, + "grad_norm": 0.7357863187789917, + "learning_rate": 0.00042946843136028117, + "loss": 0.1166, + "mean_token_accuracy": 0.9629471153020859, + "num_tokens": 4389521.0, + "step": 497 + }, + { + "entropy": 2.1544791162014008, + "epoch": 1.8, + "grad_norm": 0.5604898929595947, + "learning_rate": 0.00042912585107469226, + "loss": 0.0834, + "mean_token_accuracy": 0.9783036410808563, + "num_tokens": 4398059.0, + "step": 498 + }, + { + "entropy": 2.1051094830036163, + "epoch": 1.8036199095022625, + "grad_norm": 0.4598539173603058, + "learning_rate": 0.0004287827083781497, + "loss": 0.0411, + "mean_token_accuracy": 0.9868490546941757, + "num_tokens": 4406453.0, + "step": 499 + }, + { + "entropy": 2.0219272077083588, + "epoch": 1.807239819004525, + "grad_norm": 0.8164628744125366, + "learning_rate": 0.00042843900457395343, + "loss": 0.1988, + "mean_token_accuracy": 0.9502352625131607, + "num_tokens": 4415440.0, + "step": 500 + }, + { + "entropy": 1.980013906955719, + "epoch": 1.8108597285067873, + "grad_norm": 0.572798490524292, + "learning_rate": 0.0004280947409675341, + "loss": 0.1148, + "mean_token_accuracy": 0.966580331325531, + "num_tokens": 4424532.0, + "step": 501 + }, + { + "entropy": 2.0646563172340393, + "epoch": 1.8144796380090498, + "grad_norm": 0.769386351108551, + "learning_rate": 0.00042774991886644875, + "loss": 0.1592, + "mean_token_accuracy": 0.9553463608026505, + "num_tokens": 4432913.0, + "step": 502 + }, + { + "entropy": 2.040877491235733, + "epoch": 1.8180995475113122, + "grad_norm": 0.7467371821403503, + "learning_rate": 0.0004274045395803758, + "loss": 0.2247, + "mean_token_accuracy": 0.9526964277029037, + "num_tokens": 4441425.0, + "step": 503 + }, + { + "entropy": 1.9934698939323425, + "epoch": 1.8217194570135746, + "grad_norm": 0.6602952480316162, + "learning_rate": 0.00042705860442110964, + "loss": 0.1681, + "mean_token_accuracy": 0.9594631940126419, + "num_tokens": 4450383.0, + "step": 504 + }, + { + "entropy": 2.0858289897441864, + "epoch": 1.825339366515837, + "grad_norm": 0.684380829334259, + "learning_rate": 0.0004267121147025562, + "loss": 0.1154, + "mean_token_accuracy": 0.9638111293315887, + "num_tokens": 4458862.0, + "step": 505 + }, + { + "entropy": 2.0886995792388916, + "epoch": 1.8289592760180997, + "grad_norm": 0.5784837007522583, + "learning_rate": 0.00042636507174072756, + "loss": 0.1026, + "mean_token_accuracy": 0.9676834791898727, + "num_tokens": 4467386.0, + "step": 506 + }, + { + "entropy": 2.0236063301563263, + "epoch": 1.8325791855203621, + "grad_norm": 0.5101180672645569, + "learning_rate": 0.00042601747685373716, + "loss": 0.1031, + "mean_token_accuracy": 0.9734093993902206, + "num_tokens": 4476054.0, + "step": 507 + }, + { + "entropy": 1.9801031053066254, + "epoch": 1.8361990950226246, + "grad_norm": 0.6581607460975647, + "learning_rate": 0.00042566933136179455, + "loss": 0.1548, + "mean_token_accuracy": 0.9581006914377213, + "num_tokens": 4484895.0, + "step": 508 + }, + { + "entropy": 2.0244787633419037, + "epoch": 1.839819004524887, + "grad_norm": 0.8100608587265015, + "learning_rate": 0.0004253206365872008, + "loss": 0.196, + "mean_token_accuracy": 0.9532899260520935, + "num_tokens": 4493737.0, + "step": 509 + }, + { + "entropy": 1.9108119010925293, + "epoch": 1.8434389140271494, + "grad_norm": 0.4903942048549652, + "learning_rate": 0.00042497139385434314, + "loss": 0.1313, + "mean_token_accuracy": 0.9667337089776993, + "num_tokens": 4502840.0, + "step": 510 + }, + { + "entropy": 2.009468197822571, + "epoch": 1.8470588235294119, + "grad_norm": 0.6010113954544067, + "learning_rate": 0.0004246216044896897, + "loss": 0.1013, + "mean_token_accuracy": 0.9692314714193344, + "num_tokens": 4511407.0, + "step": 511 + }, + { + "entropy": 2.0337170362472534, + "epoch": 1.8506787330316743, + "grad_norm": 0.7906802892684937, + "learning_rate": 0.00042427126982178546, + "loss": 0.1682, + "mean_token_accuracy": 0.9550099819898605, + "num_tokens": 4520018.0, + "step": 512 + }, + { + "entropy": 1.8813888728618622, + "epoch": 1.8542986425339367, + "grad_norm": 0.5353080034255981, + "learning_rate": 0.00042392039118124586, + "loss": 0.1228, + "mean_token_accuracy": 0.9624074995517731, + "num_tokens": 4529270.0, + "step": 513 + }, + { + "entropy": 2.012698233127594, + "epoch": 1.8579185520361992, + "grad_norm": 0.6713843941688538, + "learning_rate": 0.00042356896990075285, + "loss": 0.2225, + "mean_token_accuracy": 0.9417333751916885, + "num_tokens": 4538008.0, + "step": 514 + }, + { + "entropy": 1.880586564540863, + "epoch": 1.8615384615384616, + "grad_norm": 0.5821724534034729, + "learning_rate": 0.00042321700731504916, + "loss": 0.1144, + "mean_token_accuracy": 0.9677341282367706, + "num_tokens": 4546950.0, + "step": 515 + }, + { + "entropy": 2.0066279470920563, + "epoch": 1.865158371040724, + "grad_norm": 0.4095056354999542, + "learning_rate": 0.0004228645047609335, + "loss": 0.0424, + "mean_token_accuracy": 0.9854962974786758, + "num_tokens": 4555452.0, + "step": 516 + }, + { + "entropy": 2.042815536260605, + "epoch": 1.8687782805429864, + "grad_norm": 0.5398769974708557, + "learning_rate": 0.0004225114635772555, + "loss": 0.1343, + "mean_token_accuracy": 0.9615450948476791, + "num_tokens": 4564386.0, + "step": 517 + }, + { + "entropy": 2.0948933362960815, + "epoch": 1.8723981900452489, + "grad_norm": 0.6738974452018738, + "learning_rate": 0.0004221578851049107, + "loss": 0.1541, + "mean_token_accuracy": 0.9526563137769699, + "num_tokens": 4573041.0, + "step": 518 + }, + { + "entropy": 2.102545380592346, + "epoch": 1.8760180995475113, + "grad_norm": 0.7769943475723267, + "learning_rate": 0.00042180377068683504, + "loss": 0.2362, + "mean_token_accuracy": 0.9472651779651642, + "num_tokens": 4581666.0, + "step": 519 + }, + { + "entropy": 2.087820291519165, + "epoch": 1.8796380090497737, + "grad_norm": 0.5722424983978271, + "learning_rate": 0.0004214491216680004, + "loss": 0.1657, + "mean_token_accuracy": 0.9537082612514496, + "num_tokens": 4590238.0, + "step": 520 + }, + { + "entropy": 2.0093430876731873, + "epoch": 1.8832579185520362, + "grad_norm": 0.5844932198524475, + "learning_rate": 0.00042109393939540867, + "loss": 0.1485, + "mean_token_accuracy": 0.9624215811491013, + "num_tokens": 4599352.0, + "step": 521 + }, + { + "entropy": 1.9117147326469421, + "epoch": 1.8868778280542986, + "grad_norm": 0.46085676550865173, + "learning_rate": 0.0004207382252180876, + "loss": 0.0853, + "mean_token_accuracy": 0.9769327491521835, + "num_tokens": 4608571.0, + "step": 522 + }, + { + "entropy": 2.0205602943897247, + "epoch": 1.890497737556561, + "grad_norm": 0.5571608543395996, + "learning_rate": 0.000420381980487085, + "loss": 0.1517, + "mean_token_accuracy": 0.9646699875593185, + "num_tokens": 4617445.0, + "step": 523 + }, + { + "entropy": 1.9571953415870667, + "epoch": 1.8941176470588235, + "grad_norm": 0.470630943775177, + "learning_rate": 0.0004200252065554636, + "loss": 0.1005, + "mean_token_accuracy": 0.9750025719404221, + "num_tokens": 4626756.0, + "step": 524 + }, + { + "entropy": 2.063209116458893, + "epoch": 1.897737556561086, + "grad_norm": 0.6447069644927979, + "learning_rate": 0.00041966790477829637, + "loss": 0.113, + "mean_token_accuracy": 0.9695079624652863, + "num_tokens": 4635378.0, + "step": 525 + }, + { + "entropy": 1.9232109785079956, + "epoch": 1.9013574660633483, + "grad_norm": 0.5114295482635498, + "learning_rate": 0.000419310076512661, + "loss": 0.1492, + "mean_token_accuracy": 0.9653338938951492, + "num_tokens": 4644769.0, + "step": 526 + }, + { + "entropy": 2.1691197752952576, + "epoch": 1.9049773755656108, + "grad_norm": 0.7630137205123901, + "learning_rate": 0.00041895172311763476, + "loss": 0.212, + "mean_token_accuracy": 0.9533941894769669, + "num_tokens": 4652857.0, + "step": 527 + }, + { + "entropy": 2.04753240942955, + "epoch": 1.9085972850678732, + "grad_norm": 0.6423042416572571, + "learning_rate": 0.00041859284595428955, + "loss": 0.1455, + "mean_token_accuracy": 0.956505224108696, + "num_tokens": 4661591.0, + "step": 528 + }, + { + "entropy": 1.9440338611602783, + "epoch": 1.9122171945701356, + "grad_norm": 0.5011327266693115, + "learning_rate": 0.00041823344638568656, + "loss": 0.1255, + "mean_token_accuracy": 0.965131089091301, + "num_tokens": 4670594.0, + "step": 529 + }, + { + "entropy": 2.0554805397987366, + "epoch": 1.915837104072398, + "grad_norm": 0.5821590423583984, + "learning_rate": 0.0004178735257768713, + "loss": 0.0486, + "mean_token_accuracy": 0.9875282496213913, + "num_tokens": 4679344.0, + "step": 530 + }, + { + "entropy": 2.130349576473236, + "epoch": 1.9194570135746605, + "grad_norm": 0.5332052111625671, + "learning_rate": 0.0004175130854948679, + "loss": 0.0915, + "mean_token_accuracy": 0.9737034440040588, + "num_tokens": 4687922.0, + "step": 531 + }, + { + "entropy": 2.146788775920868, + "epoch": 1.9230769230769231, + "grad_norm": 0.5016877055168152, + "learning_rate": 0.00041715212690867455, + "loss": 0.1281, + "mean_token_accuracy": 0.9681432545185089, + "num_tokens": 4696593.0, + "step": 532 + }, + { + "entropy": 2.041268438100815, + "epoch": 1.9266968325791856, + "grad_norm": 0.5257729887962341, + "learning_rate": 0.00041679065138925807, + "loss": 0.1272, + "mean_token_accuracy": 0.9649266451597214, + "num_tokens": 4705792.0, + "step": 533 + }, + { + "entropy": 2.114819645881653, + "epoch": 1.930316742081448, + "grad_norm": 0.7085135579109192, + "learning_rate": 0.0004164286603095484, + "loss": 0.1545, + "mean_token_accuracy": 0.9581228941679001, + "num_tokens": 4714599.0, + "step": 534 + }, + { + "entropy": 2.022280514240265, + "epoch": 1.9339366515837104, + "grad_norm": 0.5309014320373535, + "learning_rate": 0.00041606615504443387, + "loss": 0.1933, + "mean_token_accuracy": 0.9562340676784515, + "num_tokens": 4724062.0, + "step": 535 + }, + { + "entropy": 2.0959260165691376, + "epoch": 1.9375565610859729, + "grad_norm": 0.6528061628341675, + "learning_rate": 0.0004157031369707557, + "loss": 0.1306, + "mean_token_accuracy": 0.9612343460321426, + "num_tokens": 4733077.0, + "step": 536 + }, + { + "entropy": 2.2772948145866394, + "epoch": 1.9411764705882353, + "grad_norm": 0.7351471185684204, + "learning_rate": 0.0004153396074673028, + "loss": 0.1494, + "mean_token_accuracy": 0.9608108699321747, + "num_tokens": 4741201.0, + "step": 537 + }, + { + "entropy": 2.0935052037239075, + "epoch": 1.9447963800904977, + "grad_norm": 0.5435840487480164, + "learning_rate": 0.0004149755679148065, + "loss": 0.0884, + "mean_token_accuracy": 0.9745689779520035, + "num_tokens": 4750306.0, + "step": 538 + }, + { + "entropy": 2.2082818746566772, + "epoch": 1.9484162895927601, + "grad_norm": 0.3780331611633301, + "learning_rate": 0.00041461101969593537, + "loss": 0.0739, + "mean_token_accuracy": 0.9777179658412933, + "num_tokens": 4758954.0, + "step": 539 + }, + { + "entropy": 2.1683040261268616, + "epoch": 1.9520361990950226, + "grad_norm": 0.4637961685657501, + "learning_rate": 0.00041424596419529017, + "loss": 0.0632, + "mean_token_accuracy": 0.9834533184766769, + "num_tokens": 4767615.0, + "step": 540 + }, + { + "entropy": 2.075555235147476, + "epoch": 1.9556561085972852, + "grad_norm": 0.7603118419647217, + "learning_rate": 0.00041388040279939804, + "loss": 0.2835, + "mean_token_accuracy": 0.9364205300807953, + "num_tokens": 4776714.0, + "step": 541 + }, + { + "entropy": 2.18926739692688, + "epoch": 1.9592760180995477, + "grad_norm": 0.8895708918571472, + "learning_rate": 0.0004135143368967079, + "loss": 0.2514, + "mean_token_accuracy": 0.9361050724983215, + "num_tokens": 4785402.0, + "step": 542 + }, + { + "entropy": 2.2387169003486633, + "epoch": 1.96289592760181, + "grad_norm": 0.6013544797897339, + "learning_rate": 0.00041314776787758454, + "loss": 0.1502, + "mean_token_accuracy": 0.9594238847494125, + "num_tokens": 4793928.0, + "step": 543 + }, + { + "entropy": 2.208383619785309, + "epoch": 1.9665158371040725, + "grad_norm": 0.6934756636619568, + "learning_rate": 0.00041278069713430386, + "loss": 0.1777, + "mean_token_accuracy": 0.9619583487510681, + "num_tokens": 4802612.0, + "step": 544 + }, + { + "entropy": 2.2621757984161377, + "epoch": 1.970135746606335, + "grad_norm": 0.6920077800750732, + "learning_rate": 0.00041241312606104743, + "loss": 0.1689, + "mean_token_accuracy": 0.9594835937023163, + "num_tokens": 4811332.0, + "step": 545 + }, + { + "entropy": 2.2654454112052917, + "epoch": 1.9737556561085974, + "grad_norm": 0.6259592771530151, + "learning_rate": 0.000412045056053897, + "loss": 0.142, + "mean_token_accuracy": 0.9648078680038452, + "num_tokens": 4820441.0, + "step": 546 + }, + { + "entropy": 2.218056857585907, + "epoch": 1.9773755656108598, + "grad_norm": 0.5390617847442627, + "learning_rate": 0.0004116764885108292, + "loss": 0.1737, + "mean_token_accuracy": 0.9595656991004944, + "num_tokens": 4829437.0, + "step": 547 + }, + { + "entropy": 2.2571592330932617, + "epoch": 1.9809954751131222, + "grad_norm": 0.3656528890132904, + "learning_rate": 0.0004113074248317108, + "loss": 0.0545, + "mean_token_accuracy": 0.9825418293476105, + "num_tokens": 4838118.0, + "step": 548 + }, + { + "entropy": 2.1890549659729004, + "epoch": 1.9846153846153847, + "grad_norm": 0.5716155767440796, + "learning_rate": 0.00041093786641829247, + "loss": 0.0997, + "mean_token_accuracy": 0.9715700745582581, + "num_tokens": 4847073.0, + "step": 549 + }, + { + "entropy": 2.2726192474365234, + "epoch": 1.988235294117647, + "grad_norm": 0.4709530770778656, + "learning_rate": 0.0004105678146742042, + "loss": 0.0746, + "mean_token_accuracy": 0.9799739569425583, + "num_tokens": 4855755.0, + "step": 550 + }, + { + "entropy": 2.2328362464904785, + "epoch": 1.9918552036199095, + "grad_norm": 0.6773779392242432, + "learning_rate": 0.0004101972710049498, + "loss": 0.1418, + "mean_token_accuracy": 0.9629421681165695, + "num_tokens": 4864601.0, + "step": 551 + }, + { + "entropy": 2.199812740087509, + "epoch": 1.995475113122172, + "grad_norm": 0.717012882232666, + "learning_rate": 0.00040982623681790113, + "loss": 0.2948, + "mean_token_accuracy": 0.9432803690433502, + "num_tokens": 4873630.0, + "step": 552 + }, + { + "entropy": 2.2102787494659424, + "epoch": 1.9990950226244344, + "grad_norm": 0.6925314664840698, + "learning_rate": 0.00040945471352229346, + "loss": 0.2579, + "mean_token_accuracy": 0.9435124397277832, + "num_tokens": 4882714.0, + "step": 553 + }, + { + "entropy": 2.3318979740142822, + "epoch": 2.0, + "grad_norm": 2.688188314437866, + "learning_rate": 0.0004090827025292197, + "loss": 0.0283, + "mean_token_accuracy": 0.9918032884597778, + "num_tokens": 4883450.0, + "step": 554 + }, + { + "epoch": 2.0, + "eval_entropy": 2.2165925522160723, + "eval_loss": 0.16817161440849304, + "eval_mean_token_accuracy": 0.9567220133494555, + "eval_num_tokens": 4883450.0, + "eval_runtime": 116.1556, + "eval_samples_per_second": 3.177, + "eval_steps_per_second": 1.059, + "step": 554 + }, + { + "entropy": 2.0389976799488068, + "epoch": 2.0036199095022624, + "grad_norm": 0.8596204519271851, + "learning_rate": 0.00040871020525162484, + "loss": 0.1341, + "mean_token_accuracy": 0.9626202881336212, + "num_tokens": 4893236.0, + "step": 555 + }, + { + "entropy": 2.245832860469818, + "epoch": 2.007239819004525, + "grad_norm": 0.39707237482070923, + "learning_rate": 0.00040833722310430114, + "loss": 0.0564, + "mean_token_accuracy": 0.9868980199098587, + "num_tokens": 4901819.0, + "step": 556 + }, + { + "entropy": 2.169717162847519, + "epoch": 2.0108597285067873, + "grad_norm": 0.46584129333496094, + "learning_rate": 0.0004079637575038822, + "loss": 0.0792, + "mean_token_accuracy": 0.9758767485618591, + "num_tokens": 4910892.0, + "step": 557 + }, + { + "entropy": 2.27083820104599, + "epoch": 2.0144796380090497, + "grad_norm": 0.8394352197647095, + "learning_rate": 0.0004075898098688381, + "loss": 0.0962, + "mean_token_accuracy": 0.9723308384418488, + "num_tokens": 4919510.0, + "step": 558 + }, + { + "entropy": 2.1067663431167603, + "epoch": 2.018099547511312, + "grad_norm": 0.4951268434524536, + "learning_rate": 0.0004072153816194696, + "loss": 0.1195, + "mean_token_accuracy": 0.9703402817249298, + "num_tokens": 4928439.0, + "step": 559 + }, + { + "entropy": 2.016420066356659, + "epoch": 2.0217194570135746, + "grad_norm": 0.5574740171432495, + "learning_rate": 0.00040684047417790273, + "loss": 0.1037, + "mean_token_accuracy": 0.9727325141429901, + "num_tokens": 4938061.0, + "step": 560 + }, + { + "entropy": 2.1843727231025696, + "epoch": 2.025339366515837, + "grad_norm": 0.786014199256897, + "learning_rate": 0.00040646508896808394, + "loss": 0.155, + "mean_token_accuracy": 0.9608975350856781, + "num_tokens": 4946619.0, + "step": 561 + }, + { + "entropy": 2.160427451133728, + "epoch": 2.0289592760180994, + "grad_norm": 0.5267161130905151, + "learning_rate": 0.000406089227415774, + "loss": 0.0632, + "mean_token_accuracy": 0.9791042655706406, + "num_tokens": 4955324.0, + "step": 562 + }, + { + "entropy": 2.0923200249671936, + "epoch": 2.032579185520362, + "grad_norm": 0.8306187987327576, + "learning_rate": 0.00040571289094854304, + "loss": 0.1976, + "mean_token_accuracy": 0.9538775235414505, + "num_tokens": 4964321.0, + "step": 563 + }, + { + "entropy": 2.0181354880332947, + "epoch": 2.0361990950226243, + "grad_norm": 0.6798867583274841, + "learning_rate": 0.0004053360809957649, + "loss": 0.1797, + "mean_token_accuracy": 0.9569422006607056, + "num_tokens": 4973937.0, + "step": 564 + }, + { + "entropy": 2.123030036687851, + "epoch": 2.0398190045248867, + "grad_norm": 0.4481683671474457, + "learning_rate": 0.00040495879898861173, + "loss": 0.0639, + "mean_token_accuracy": 0.9827965050935745, + "num_tokens": 4982779.0, + "step": 565 + }, + { + "entropy": 2.0797010362148285, + "epoch": 2.043438914027149, + "grad_norm": 0.7745859622955322, + "learning_rate": 0.00040458104636004877, + "loss": 0.1602, + "mean_token_accuracy": 0.9600242227315903, + "num_tokens": 4991793.0, + "step": 566 + }, + { + "entropy": 2.0320390164852142, + "epoch": 2.0470588235294116, + "grad_norm": 0.5792120695114136, + "learning_rate": 0.0004042028245448286, + "loss": 0.0816, + "mean_token_accuracy": 0.9757721722126007, + "num_tokens": 5000834.0, + "step": 567 + }, + { + "entropy": 2.1047743558883667, + "epoch": 2.050678733031674, + "grad_norm": 0.5770072937011719, + "learning_rate": 0.0004038241349794858, + "loss": 0.1367, + "mean_token_accuracy": 0.9598450362682343, + "num_tokens": 5010155.0, + "step": 568 + }, + { + "entropy": 2.022550255060196, + "epoch": 2.0542986425339365, + "grad_norm": 0.47085902094841003, + "learning_rate": 0.0004034449791023319, + "loss": 0.1005, + "mean_token_accuracy": 0.970214769244194, + "num_tokens": 5020010.0, + "step": 569 + }, + { + "entropy": 2.034317582845688, + "epoch": 2.057918552036199, + "grad_norm": 0.4816018044948578, + "learning_rate": 0.0004030653583534489, + "loss": 0.118, + "mean_token_accuracy": 0.9635649025440216, + "num_tokens": 5029205.0, + "step": 570 + }, + { + "entropy": 2.1142700910568237, + "epoch": 2.0615384615384613, + "grad_norm": 0.561765730381012, + "learning_rate": 0.0004026852741746849, + "loss": 0.0628, + "mean_token_accuracy": 0.9811093211174011, + "num_tokens": 5037830.0, + "step": 571 + }, + { + "entropy": 2.1506906747817993, + "epoch": 2.065158371040724, + "grad_norm": 0.9037840366363525, + "learning_rate": 0.0004023047280096482, + "loss": 0.1395, + "mean_token_accuracy": 0.9645196944475174, + "num_tokens": 5046618.0, + "step": 572 + }, + { + "entropy": 2.1811060309410095, + "epoch": 2.0687782805429866, + "grad_norm": 0.6224188208580017, + "learning_rate": 0.0004019237213037014, + "loss": 0.0766, + "mean_token_accuracy": 0.9752616137266159, + "num_tokens": 5055467.0, + "step": 573 + }, + { + "entropy": 2.0479070246219635, + "epoch": 2.072398190045249, + "grad_norm": 0.5052458643913269, + "learning_rate": 0.00040154225550395665, + "loss": 0.091, + "mean_token_accuracy": 0.9753529280424118, + "num_tokens": 5064518.0, + "step": 574 + }, + { + "entropy": 2.18623149394989, + "epoch": 2.0760180995475115, + "grad_norm": 0.49587905406951904, + "learning_rate": 0.00040116033205926964, + "loss": 0.0703, + "mean_token_accuracy": 0.979348823428154, + "num_tokens": 5072713.0, + "step": 575 + }, + { + "entropy": 2.131018817424774, + "epoch": 2.079638009049774, + "grad_norm": 0.607468843460083, + "learning_rate": 0.0004007779524202343, + "loss": 0.0988, + "mean_token_accuracy": 0.9756181836128235, + "num_tokens": 5081046.0, + "step": 576 + }, + { + "entropy": 2.0251292288303375, + "epoch": 2.0832579185520363, + "grad_norm": 0.867511510848999, + "learning_rate": 0.00040039511803917723, + "loss": 0.1672, + "mean_token_accuracy": 0.9638413190841675, + "num_tokens": 5089859.0, + "step": 577 + }, + { + "entropy": 2.0818732380867004, + "epoch": 2.086877828054299, + "grad_norm": 0.5915331840515137, + "learning_rate": 0.0004000118303701521, + "loss": 0.1103, + "mean_token_accuracy": 0.9715124219655991, + "num_tokens": 5098331.0, + "step": 578 + }, + { + "entropy": 1.9556698501110077, + "epoch": 2.090497737556561, + "grad_norm": 0.5216535329818726, + "learning_rate": 0.0003996280908689345, + "loss": 0.1481, + "mean_token_accuracy": 0.9601311087608337, + "num_tokens": 5107557.0, + "step": 579 + }, + { + "entropy": 2.015773117542267, + "epoch": 2.0941176470588236, + "grad_norm": 0.7138916254043579, + "learning_rate": 0.00039924390099301584, + "loss": 0.1173, + "mean_token_accuracy": 0.9670253992080688, + "num_tokens": 5116677.0, + "step": 580 + }, + { + "entropy": 2.0676984786987305, + "epoch": 2.097737556561086, + "grad_norm": 0.7776201963424683, + "learning_rate": 0.0003988592622015984, + "loss": 0.0668, + "mean_token_accuracy": 0.9766870141029358, + "num_tokens": 5125262.0, + "step": 581 + }, + { + "entropy": 2.0256679952144623, + "epoch": 2.1013574660633485, + "grad_norm": 0.5481430888175964, + "learning_rate": 0.00039847417595558903, + "loss": 0.0898, + "mean_token_accuracy": 0.9747780114412308, + "num_tokens": 5133848.0, + "step": 582 + }, + { + "entropy": 2.049301326274872, + "epoch": 2.104977375565611, + "grad_norm": 0.6634963154792786, + "learning_rate": 0.00039808864371759464, + "loss": 0.1012, + "mean_token_accuracy": 0.9695883542299271, + "num_tokens": 5142266.0, + "step": 583 + }, + { + "entropy": 1.8873322904109955, + "epoch": 2.1085972850678734, + "grad_norm": 0.6262965798377991, + "learning_rate": 0.0003977026669519156, + "loss": 0.1064, + "mean_token_accuracy": 0.9686857610940933, + "num_tokens": 5151297.0, + "step": 584 + }, + { + "entropy": 2.0208800733089447, + "epoch": 2.112217194570136, + "grad_norm": 0.6475429534912109, + "learning_rate": 0.0003973162471245411, + "loss": 0.126, + "mean_token_accuracy": 0.9671273976564407, + "num_tokens": 5159913.0, + "step": 585 + }, + { + "entropy": 2.0354510843753815, + "epoch": 2.1158371040723982, + "grad_norm": 0.6373077034950256, + "learning_rate": 0.0003969293857031426, + "loss": 0.1403, + "mean_token_accuracy": 0.9615094214677811, + "num_tokens": 5168392.0, + "step": 586 + }, + { + "entropy": 2.0489701330661774, + "epoch": 2.1194570135746607, + "grad_norm": 0.7459731698036194, + "learning_rate": 0.0003965420841570693, + "loss": 0.0847, + "mean_token_accuracy": 0.9742033332586288, + "num_tokens": 5176858.0, + "step": 587 + }, + { + "entropy": 2.0531455874443054, + "epoch": 2.123076923076923, + "grad_norm": 0.8357418179512024, + "learning_rate": 0.00039615434395734174, + "loss": 0.2558, + "mean_token_accuracy": 0.9348864704370499, + "num_tokens": 5185101.0, + "step": 588 + }, + { + "entropy": 1.9761857986450195, + "epoch": 2.1266968325791855, + "grad_norm": 0.4816463887691498, + "learning_rate": 0.00039576616657664666, + "loss": 0.0934, + "mean_token_accuracy": 0.9781179577112198, + "num_tokens": 5193987.0, + "step": 589 + }, + { + "entropy": 2.0150316655635834, + "epoch": 2.130316742081448, + "grad_norm": 0.7039950489997864, + "learning_rate": 0.0003953775534893311, + "loss": 0.1558, + "mean_token_accuracy": 0.9602096229791641, + "num_tokens": 5202598.0, + "step": 590 + }, + { + "entropy": 2.0542426705360413, + "epoch": 2.1339366515837104, + "grad_norm": 0.6318346858024597, + "learning_rate": 0.00039498850617139737, + "loss": 0.1277, + "mean_token_accuracy": 0.9658758789300919, + "num_tokens": 5211157.0, + "step": 591 + }, + { + "entropy": 2.0793416798114777, + "epoch": 2.137556561085973, + "grad_norm": 0.6513328552246094, + "learning_rate": 0.0003945990261004964, + "loss": 0.3452, + "mean_token_accuracy": 0.9376382231712341, + "num_tokens": 5220057.0, + "step": 592 + }, + { + "entropy": 1.834738850593567, + "epoch": 2.1411764705882352, + "grad_norm": 0.709550678730011, + "learning_rate": 0.0003942091147559234, + "loss": 0.1632, + "mean_token_accuracy": 0.9588025957345963, + "num_tokens": 5229649.0, + "step": 593 + }, + { + "entropy": 2.115740954875946, + "epoch": 2.1447963800904977, + "grad_norm": 0.6495632529258728, + "learning_rate": 0.00039381877361861127, + "loss": 0.0799, + "mean_token_accuracy": 0.9793208837509155, + "num_tokens": 5238060.0, + "step": 594 + }, + { + "entropy": 1.9325994551181793, + "epoch": 2.14841628959276, + "grad_norm": 0.3864371180534363, + "learning_rate": 0.0003934280041711253, + "loss": 0.0392, + "mean_token_accuracy": 0.9867032468318939, + "num_tokens": 5246715.0, + "step": 595 + }, + { + "entropy": 1.9573578834533691, + "epoch": 2.1520361990950225, + "grad_norm": 0.8978553414344788, + "learning_rate": 0.0003930368078976578, + "loss": 0.1043, + "mean_token_accuracy": 0.9700421690940857, + "num_tokens": 5255677.0, + "step": 596 + }, + { + "entropy": 2.017194092273712, + "epoch": 2.155656108597285, + "grad_norm": 0.8082290887832642, + "learning_rate": 0.0003926451862840221, + "loss": 0.193, + "mean_token_accuracy": 0.9494165182113647, + "num_tokens": 5264229.0, + "step": 597 + }, + { + "entropy": 1.8982190787792206, + "epoch": 2.1592760180995474, + "grad_norm": 0.7600063681602478, + "learning_rate": 0.00039225314081764673, + "loss": 0.2152, + "mean_token_accuracy": 0.9523166120052338, + "num_tokens": 5273397.0, + "step": 598 + }, + { + "entropy": 1.9896901845932007, + "epoch": 2.16289592760181, + "grad_norm": 0.45877528190612793, + "learning_rate": 0.0003918606729875706, + "loss": 0.0892, + "mean_token_accuracy": 0.9720247238874435, + "num_tokens": 5282376.0, + "step": 599 + }, + { + "entropy": 1.8235589861869812, + "epoch": 2.1665158371040723, + "grad_norm": 0.49329352378845215, + "learning_rate": 0.0003914677842844365, + "loss": 0.0803, + "mean_token_accuracy": 0.9721037000417709, + "num_tokens": 5291815.0, + "step": 600 + }, + { + "entropy": 1.9400377571582794, + "epoch": 2.1701357466063347, + "grad_norm": 0.5306346416473389, + "learning_rate": 0.0003910744762004857, + "loss": 0.0602, + "mean_token_accuracy": 0.9762802571058273, + "num_tokens": 5300394.0, + "step": 601 + }, + { + "entropy": 1.7808023691177368, + "epoch": 2.173755656108597, + "grad_norm": 0.5050559043884277, + "learning_rate": 0.00039068075022955255, + "loss": 0.0862, + "mean_token_accuracy": 0.9724314510822296, + "num_tokens": 5309685.0, + "step": 602 + }, + { + "entropy": 1.9939678311347961, + "epoch": 2.1773755656108595, + "grad_norm": 0.6879346966743469, + "learning_rate": 0.0003902866078670584, + "loss": 0.0936, + "mean_token_accuracy": 0.9765703976154327, + "num_tokens": 5318020.0, + "step": 603 + }, + { + "entropy": 1.9384137690067291, + "epoch": 2.180995475113122, + "grad_norm": 0.6881359219551086, + "learning_rate": 0.0003898920506100061, + "loss": 0.1303, + "mean_token_accuracy": 0.9615567773580551, + "num_tokens": 5326895.0, + "step": 604 + }, + { + "entropy": 1.9919665455818176, + "epoch": 2.184615384615385, + "grad_norm": 0.6181508302688599, + "learning_rate": 0.00038949707995697446, + "loss": 0.0745, + "mean_token_accuracy": 0.9808734804391861, + "num_tokens": 5335355.0, + "step": 605 + }, + { + "entropy": 1.9376583397388458, + "epoch": 2.1882352941176473, + "grad_norm": 0.46525871753692627, + "learning_rate": 0.0003891016974081125, + "loss": 0.0826, + "mean_token_accuracy": 0.9753947854042053, + "num_tokens": 5343879.0, + "step": 606 + }, + { + "entropy": 1.8252979516983032, + "epoch": 2.1918552036199097, + "grad_norm": 0.5332593321800232, + "learning_rate": 0.00038870590446513325, + "loss": 0.1218, + "mean_token_accuracy": 0.9644111543893814, + "num_tokens": 5352980.0, + "step": 607 + }, + { + "entropy": 1.8981524407863617, + "epoch": 2.195475113122172, + "grad_norm": 0.5849556922912598, + "learning_rate": 0.0003883097026313089, + "loss": 0.0854, + "mean_token_accuracy": 0.9766328930854797, + "num_tokens": 5361576.0, + "step": 608 + }, + { + "entropy": 1.9466857016086578, + "epoch": 2.1990950226244346, + "grad_norm": 1.0213185548782349, + "learning_rate": 0.00038791309341146453, + "loss": 0.1282, + "mean_token_accuracy": 0.975858062505722, + "num_tokens": 5369947.0, + "step": 609 + }, + { + "entropy": 1.9219308197498322, + "epoch": 2.202714932126697, + "grad_norm": 0.7259594798088074, + "learning_rate": 0.00038751607831197243, + "loss": 0.0986, + "mean_token_accuracy": 0.9709735363721848, + "num_tokens": 5378429.0, + "step": 610 + }, + { + "entropy": 1.934881567955017, + "epoch": 2.2063348416289594, + "grad_norm": 0.6190217137336731, + "learning_rate": 0.0003871186588407467, + "loss": 0.1259, + "mean_token_accuracy": 0.9606761038303375, + "num_tokens": 5386986.0, + "step": 611 + }, + { + "entropy": 1.9234256446361542, + "epoch": 2.209954751131222, + "grad_norm": 1.1731759309768677, + "learning_rate": 0.00038672083650723697, + "loss": 0.3705, + "mean_token_accuracy": 0.9448409974575043, + "num_tokens": 5395623.0, + "step": 612 + }, + { + "entropy": 1.9198957085609436, + "epoch": 2.2135746606334843, + "grad_norm": 0.38831791281700134, + "learning_rate": 0.00038632261282242316, + "loss": 0.0405, + "mean_token_accuracy": 0.9884084165096283, + "num_tokens": 5403964.0, + "step": 613 + }, + { + "entropy": 1.9401849210262299, + "epoch": 2.2171945701357467, + "grad_norm": 0.6391944885253906, + "learning_rate": 0.0003859239892988097, + "loss": 0.0803, + "mean_token_accuracy": 0.9763080179691315, + "num_tokens": 5412601.0, + "step": 614 + }, + { + "entropy": 1.906328171491623, + "epoch": 2.220814479638009, + "grad_norm": 0.5495765805244446, + "learning_rate": 0.00038552496745041935, + "loss": 0.0919, + "mean_token_accuracy": 0.9796502739191055, + "num_tokens": 5421112.0, + "step": 615 + }, + { + "entropy": 1.9130763709545135, + "epoch": 2.2244343891402716, + "grad_norm": 0.8233397006988525, + "learning_rate": 0.0003851255487927883, + "loss": 0.1246, + "mean_token_accuracy": 0.9621723592281342, + "num_tokens": 5429851.0, + "step": 616 + }, + { + "entropy": 1.8408336341381073, + "epoch": 2.228054298642534, + "grad_norm": 0.8857082724571228, + "learning_rate": 0.00038472573484295904, + "loss": 0.1061, + "mean_token_accuracy": 0.9664444029331207, + "num_tokens": 5438983.0, + "step": 617 + }, + { + "entropy": 1.8644142150878906, + "epoch": 2.2316742081447964, + "grad_norm": 0.6762974262237549, + "learning_rate": 0.0003843255271194762, + "loss": 0.1532, + "mean_token_accuracy": 0.952915757894516, + "num_tokens": 5447922.0, + "step": 618 + }, + { + "entropy": 1.7125722169876099, + "epoch": 2.235294117647059, + "grad_norm": 0.44111478328704834, + "learning_rate": 0.00038392492714237975, + "loss": 0.0819, + "mean_token_accuracy": 0.9738304615020752, + "num_tokens": 5457128.0, + "step": 619 + }, + { + "entropy": 1.7900195717811584, + "epoch": 2.2389140271493213, + "grad_norm": 0.5224407911300659, + "learning_rate": 0.0003835239364331993, + "loss": 0.1023, + "mean_token_accuracy": 0.975239485502243, + "num_tokens": 5465760.0, + "step": 620 + }, + { + "entropy": 1.715638667345047, + "epoch": 2.2425339366515837, + "grad_norm": 0.6327251195907593, + "learning_rate": 0.00038312255651494866, + "loss": 0.154, + "mean_token_accuracy": 0.9579339027404785, + "num_tokens": 5475190.0, + "step": 621 + }, + { + "entropy": 1.8499042093753815, + "epoch": 2.246153846153846, + "grad_norm": 0.6490166187286377, + "learning_rate": 0.00038272078891212017, + "loss": 0.1248, + "mean_token_accuracy": 0.9679877310991287, + "num_tokens": 5484011.0, + "step": 622 + }, + { + "entropy": 1.7533331513404846, + "epoch": 2.2497737556561086, + "grad_norm": 0.6320033073425293, + "learning_rate": 0.000382318635150678, + "loss": 0.1588, + "mean_token_accuracy": 0.9576389044523239, + "num_tokens": 5493123.0, + "step": 623 + }, + { + "entropy": 1.8554400503635406, + "epoch": 2.253393665158371, + "grad_norm": 0.7169481515884399, + "learning_rate": 0.0003819160967580536, + "loss": 0.1316, + "mean_token_accuracy": 0.966967299580574, + "num_tokens": 5501923.0, + "step": 624 + }, + { + "entropy": 1.9283805191516876, + "epoch": 2.2570135746606335, + "grad_norm": 0.599856436252594, + "learning_rate": 0.00038151317526313917, + "loss": 0.1326, + "mean_token_accuracy": 0.961080014705658, + "num_tokens": 5510356.0, + "step": 625 + }, + { + "entropy": 1.7921342253684998, + "epoch": 2.260633484162896, + "grad_norm": 0.7019768357276917, + "learning_rate": 0.0003811098721962818, + "loss": 0.0976, + "mean_token_accuracy": 0.970125287771225, + "num_tokens": 5519016.0, + "step": 626 + }, + { + "entropy": 1.7646876573562622, + "epoch": 2.2642533936651583, + "grad_norm": 0.7311795949935913, + "learning_rate": 0.00038070618908927784, + "loss": 0.0908, + "mean_token_accuracy": 0.9719386845827103, + "num_tokens": 5528139.0, + "step": 627 + }, + { + "entropy": 1.8233769237995148, + "epoch": 2.2678733031674208, + "grad_norm": 0.6742154955863953, + "learning_rate": 0.0003803021274753674, + "loss": 0.1348, + "mean_token_accuracy": 0.9619691967964172, + "num_tokens": 5537036.0, + "step": 628 + }, + { + "entropy": 1.7711736857891083, + "epoch": 2.271493212669683, + "grad_norm": 0.6000869274139404, + "learning_rate": 0.00037989768888922775, + "loss": 0.1086, + "mean_token_accuracy": 0.9672373533248901, + "num_tokens": 5545932.0, + "step": 629 + }, + { + "entropy": 1.8396382629871368, + "epoch": 2.2751131221719456, + "grad_norm": 0.541504979133606, + "learning_rate": 0.0003794928748669683, + "loss": 0.0775, + "mean_token_accuracy": 0.977355495095253, + "num_tokens": 5554403.0, + "step": 630 + }, + { + "entropy": 1.890054315328598, + "epoch": 2.278733031674208, + "grad_norm": 0.5629594326019287, + "learning_rate": 0.00037908768694612434, + "loss": 0.0711, + "mean_token_accuracy": 0.9779117107391357, + "num_tokens": 5563156.0, + "step": 631 + }, + { + "entropy": 1.9505741894245148, + "epoch": 2.2823529411764705, + "grad_norm": 0.6717761754989624, + "learning_rate": 0.0003786821266656512, + "loss": 0.1077, + "mean_token_accuracy": 0.9674138873815536, + "num_tokens": 5571618.0, + "step": 632 + }, + { + "entropy": 1.8377742171287537, + "epoch": 2.285972850678733, + "grad_norm": 0.6176472902297974, + "learning_rate": 0.0003782761955659185, + "loss": 0.1106, + "mean_token_accuracy": 0.9669957906007767, + "num_tokens": 5580668.0, + "step": 633 + }, + { + "entropy": 1.8336479365825653, + "epoch": 2.2895927601809953, + "grad_norm": 0.5120813846588135, + "learning_rate": 0.0003778698951887042, + "loss": 0.0732, + "mean_token_accuracy": 0.9774532318115234, + "num_tokens": 5589491.0, + "step": 634 + }, + { + "entropy": 1.9576656222343445, + "epoch": 2.2932126696832578, + "grad_norm": 0.9347079396247864, + "learning_rate": 0.00037746322707718895, + "loss": 0.2275, + "mean_token_accuracy": 0.9512088149785995, + "num_tokens": 5598327.0, + "step": 635 + }, + { + "entropy": 1.9309991896152496, + "epoch": 2.29683257918552, + "grad_norm": 0.506108283996582, + "learning_rate": 0.0003770561927759502, + "loss": 0.1046, + "mean_token_accuracy": 0.9633967131376266, + "num_tokens": 5606948.0, + "step": 636 + }, + { + "entropy": 1.963425725698471, + "epoch": 2.3004524886877826, + "grad_norm": 0.5499919056892395, + "learning_rate": 0.0003766487938309561, + "loss": 0.0804, + "mean_token_accuracy": 0.9783825874328613, + "num_tokens": 5615342.0, + "step": 637 + }, + { + "entropy": 1.8853708505630493, + "epoch": 2.304072398190045, + "grad_norm": 0.5846657156944275, + "learning_rate": 0.00037624103178955946, + "loss": 0.0904, + "mean_token_accuracy": 0.9774703830480576, + "num_tokens": 5624449.0, + "step": 638 + }, + { + "entropy": 1.928403079509735, + "epoch": 2.3076923076923075, + "grad_norm": 0.5203971266746521, + "learning_rate": 0.0003758329082004928, + "loss": 0.0917, + "mean_token_accuracy": 0.9723261743783951, + "num_tokens": 5633273.0, + "step": 639 + }, + { + "entropy": 1.8914157152175903, + "epoch": 2.31131221719457, + "grad_norm": 0.5215239524841309, + "learning_rate": 0.00037542442461386145, + "loss": 0.1072, + "mean_token_accuracy": 0.9704900681972504, + "num_tokens": 5642357.0, + "step": 640 + }, + { + "entropy": 1.9754666090011597, + "epoch": 2.3149321266968323, + "grad_norm": 0.6710624694824219, + "learning_rate": 0.0003750155825811379, + "loss": 0.1344, + "mean_token_accuracy": 0.9615458548069, + "num_tokens": 5651409.0, + "step": 641 + }, + { + "entropy": 1.97001314163208, + "epoch": 2.318552036199095, + "grad_norm": 0.6511638164520264, + "learning_rate": 0.00037460638365515673, + "loss": 0.0502, + "mean_token_accuracy": 0.9829420000314713, + "num_tokens": 5660362.0, + "step": 642 + }, + { + "entropy": 1.9473612904548645, + "epoch": 2.3221719457013577, + "grad_norm": 0.5315663814544678, + "learning_rate": 0.00037419682939010725, + "loss": 0.1004, + "mean_token_accuracy": 0.9741797298192978, + "num_tokens": 5669386.0, + "step": 643 + }, + { + "entropy": 1.9136508405208588, + "epoch": 2.32579185520362, + "grad_norm": 0.6636398434638977, + "learning_rate": 0.00037378692134152887, + "loss": 0.0928, + "mean_token_accuracy": 0.9753085225820541, + "num_tokens": 5678226.0, + "step": 644 + }, + { + "entropy": 2.0870893597602844, + "epoch": 2.3294117647058825, + "grad_norm": 0.45003074407577515, + "learning_rate": 0.00037337666106630464, + "loss": 0.0937, + "mean_token_accuracy": 0.9742898046970367, + "num_tokens": 5687017.0, + "step": 645 + }, + { + "entropy": 2.084017276763916, + "epoch": 2.333031674208145, + "grad_norm": 0.6305840611457825, + "learning_rate": 0.0003729660501226553, + "loss": 0.1085, + "mean_token_accuracy": 0.9696957617998123, + "num_tokens": 5695585.0, + "step": 646 + }, + { + "entropy": 2.0916273295879364, + "epoch": 2.3366515837104074, + "grad_norm": 0.6674802303314209, + "learning_rate": 0.00037255509007013353, + "loss": 0.1214, + "mean_token_accuracy": 0.9657080322504044, + "num_tokens": 5704167.0, + "step": 647 + }, + { + "entropy": 2.0445155799388885, + "epoch": 2.34027149321267, + "grad_norm": 0.9245135188102722, + "learning_rate": 0.0003721437824696181, + "loss": 0.124, + "mean_token_accuracy": 0.9668982475996017, + "num_tokens": 5712896.0, + "step": 648 + }, + { + "entropy": 2.040050685405731, + "epoch": 2.3438914027149322, + "grad_norm": 0.558266818523407, + "learning_rate": 0.00037173212888330756, + "loss": 0.103, + "mean_token_accuracy": 0.9663692861795425, + "num_tokens": 5721568.0, + "step": 649 + }, + { + "entropy": 2.078313887119293, + "epoch": 2.3475113122171947, + "grad_norm": 0.6157237887382507, + "learning_rate": 0.0003713201308747148, + "loss": 0.1247, + "mean_token_accuracy": 0.9645204842090607, + "num_tokens": 5730097.0, + "step": 650 + }, + { + "entropy": 1.9473297894001007, + "epoch": 2.351131221719457, + "grad_norm": 0.6460309028625488, + "learning_rate": 0.0003709077900086607, + "loss": 0.193, + "mean_token_accuracy": 0.9537071883678436, + "num_tokens": 5738953.0, + "step": 651 + }, + { + "entropy": 1.9319245219230652, + "epoch": 2.3547511312217195, + "grad_norm": 0.826302170753479, + "learning_rate": 0.0003704951078512684, + "loss": 0.2072, + "mean_token_accuracy": 0.9553762674331665, + "num_tokens": 5748421.0, + "step": 652 + }, + { + "entropy": 2.000667005777359, + "epoch": 2.358371040723982, + "grad_norm": 0.508975625038147, + "learning_rate": 0.00037008208596995743, + "loss": 0.1124, + "mean_token_accuracy": 0.9674097448587418, + "num_tokens": 5757333.0, + "step": 653 + }, + { + "entropy": 1.9692010879516602, + "epoch": 2.3619909502262444, + "grad_norm": 0.597391664981842, + "learning_rate": 0.00036966872593343747, + "loss": 0.0958, + "mean_token_accuracy": 0.9727880656719208, + "num_tokens": 5766427.0, + "step": 654 + }, + { + "entropy": 1.9356706142425537, + "epoch": 2.365610859728507, + "grad_norm": 0.6264978051185608, + "learning_rate": 0.0003692550293117025, + "loss": 0.0925, + "mean_token_accuracy": 0.9736592024564743, + "num_tokens": 5775578.0, + "step": 655 + }, + { + "entropy": 2.086688846349716, + "epoch": 2.3692307692307693, + "grad_norm": 0.926537811756134, + "learning_rate": 0.00036884099767602523, + "loss": 0.1772, + "mean_token_accuracy": 0.9588586837053299, + "num_tokens": 5783754.0, + "step": 656 + }, + { + "entropy": 1.8272685706615448, + "epoch": 2.3728506787330317, + "grad_norm": 0.5276276469230652, + "learning_rate": 0.0003684266325989504, + "loss": 0.106, + "mean_token_accuracy": 0.9692760407924652, + "num_tokens": 5793159.0, + "step": 657 + }, + { + "entropy": 1.8490014672279358, + "epoch": 2.376470588235294, + "grad_norm": 0.6970511078834534, + "learning_rate": 0.0003680119356542895, + "loss": 0.0849, + "mean_token_accuracy": 0.9812656790018082, + "num_tokens": 5802503.0, + "step": 658 + }, + { + "entropy": 1.8577990531921387, + "epoch": 2.3800904977375565, + "grad_norm": 0.49535682797431946, + "learning_rate": 0.00036759690841711435, + "loss": 0.0965, + "mean_token_accuracy": 0.9723764955997467, + "num_tokens": 5811839.0, + "step": 659 + }, + { + "entropy": 1.785957396030426, + "epoch": 2.383710407239819, + "grad_norm": 0.7373266220092773, + "learning_rate": 0.00036718155246375124, + "loss": 0.103, + "mean_token_accuracy": 0.9659082442522049, + "num_tokens": 5821076.0, + "step": 660 + }, + { + "entropy": 1.8944315016269684, + "epoch": 2.3873303167420814, + "grad_norm": 0.4784161448478699, + "learning_rate": 0.000366765869371775, + "loss": 0.0899, + "mean_token_accuracy": 0.9731316566467285, + "num_tokens": 5830098.0, + "step": 661 + }, + { + "entropy": 1.8901143372058868, + "epoch": 2.390950226244344, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.00036634986072000305, + "loss": 0.078, + "mean_token_accuracy": 0.9769923985004425, + "num_tokens": 5839149.0, + "step": 662 + }, + { + "entropy": 1.8183043003082275, + "epoch": 2.3945701357466063, + "grad_norm": 0.48431649804115295, + "learning_rate": 0.0003659335280884893, + "loss": 0.0669, + "mean_token_accuracy": 0.978607714176178, + "num_tokens": 5848064.0, + "step": 663 + }, + { + "entropy": 1.7216700911521912, + "epoch": 2.3981900452488687, + "grad_norm": 0.5597919821739197, + "learning_rate": 0.00036551687305851803, + "loss": 0.1026, + "mean_token_accuracy": 0.9733614027500153, + "num_tokens": 5857075.0, + "step": 664 + }, + { + "entropy": 1.7788107991218567, + "epoch": 2.401809954751131, + "grad_norm": 0.6780642867088318, + "learning_rate": 0.00036509989721259824, + "loss": 0.0895, + "mean_token_accuracy": 0.9711848199367523, + "num_tokens": 5866029.0, + "step": 665 + }, + { + "entropy": 1.8354471325874329, + "epoch": 2.4054298642533936, + "grad_norm": 0.6284046769142151, + "learning_rate": 0.0003646826021344573, + "loss": 0.1153, + "mean_token_accuracy": 0.9645407199859619, + "num_tokens": 5874523.0, + "step": 666 + }, + { + "entropy": 1.829980492591858, + "epoch": 2.409049773755656, + "grad_norm": 0.6398605704307556, + "learning_rate": 0.00036426498940903506, + "loss": 0.0605, + "mean_token_accuracy": 0.9823256582021713, + "num_tokens": 5883067.0, + "step": 667 + }, + { + "entropy": 1.839373379945755, + "epoch": 2.4126696832579184, + "grad_norm": 0.6254173517227173, + "learning_rate": 0.000363847060622478, + "loss": 0.0708, + "mean_token_accuracy": 0.978134423494339, + "num_tokens": 5891921.0, + "step": 668 + }, + { + "entropy": 1.7790280282497406, + "epoch": 2.416289592760181, + "grad_norm": 0.5987306833267212, + "learning_rate": 0.0003634288173621326, + "loss": 0.0888, + "mean_token_accuracy": 0.9814571887254715, + "num_tokens": 5900603.0, + "step": 669 + }, + { + "entropy": 1.6918425559997559, + "epoch": 2.4199095022624433, + "grad_norm": 0.784694492816925, + "learning_rate": 0.00036301026121654057, + "loss": 0.1353, + "mean_token_accuracy": 0.9646909832954407, + "num_tokens": 5910028.0, + "step": 670 + }, + { + "entropy": 1.726965218782425, + "epoch": 2.4235294117647057, + "grad_norm": 0.7017857432365417, + "learning_rate": 0.00036259139377543104, + "loss": 0.1531, + "mean_token_accuracy": 0.9617924690246582, + "num_tokens": 5919145.0, + "step": 671 + }, + { + "entropy": 1.7354467511177063, + "epoch": 2.427149321266968, + "grad_norm": 0.49217918515205383, + "learning_rate": 0.00036217221662971613, + "loss": 0.1217, + "mean_token_accuracy": 0.96451136469841, + "num_tokens": 5928203.0, + "step": 672 + }, + { + "entropy": 1.827672392129898, + "epoch": 2.430769230769231, + "grad_norm": 0.5875037312507629, + "learning_rate": 0.0003617527313714841, + "loss": 0.1151, + "mean_token_accuracy": 0.9714375436306, + "num_tokens": 5936876.0, + "step": 673 + }, + { + "entropy": 1.787518948316574, + "epoch": 2.4343891402714934, + "grad_norm": 0.5444310307502747, + "learning_rate": 0.0003613329395939933, + "loss": 0.1096, + "mean_token_accuracy": 0.9701481461524963, + "num_tokens": 5946025.0, + "step": 674 + }, + { + "entropy": 1.832441657781601, + "epoch": 2.438009049773756, + "grad_norm": 0.6885861754417419, + "learning_rate": 0.00036091284289166637, + "loss": 0.1409, + "mean_token_accuracy": 0.9587968736886978, + "num_tokens": 5954406.0, + "step": 675 + }, + { + "entropy": 1.7488494515419006, + "epoch": 2.4416289592760183, + "grad_norm": 0.4765988290309906, + "learning_rate": 0.0003604924428600843, + "loss": 0.1183, + "mean_token_accuracy": 0.9581810384988785, + "num_tokens": 5963472.0, + "step": 676 + }, + { + "entropy": 1.885668009519577, + "epoch": 2.4452488687782807, + "grad_norm": 0.7310354113578796, + "learning_rate": 0.00036007174109597983, + "loss": 0.1248, + "mean_token_accuracy": 0.9588694721460342, + "num_tokens": 5971771.0, + "step": 677 + }, + { + "entropy": 1.8329627513885498, + "epoch": 2.448868778280543, + "grad_norm": 0.37075191736221313, + "learning_rate": 0.00035965073919723206, + "loss": 0.0694, + "mean_token_accuracy": 0.9812011271715164, + "num_tokens": 5980536.0, + "step": 678 + }, + { + "entropy": 1.8218618333339691, + "epoch": 2.4524886877828056, + "grad_norm": 0.5196499228477478, + "learning_rate": 0.0003592294387628597, + "loss": 0.0833, + "mean_token_accuracy": 0.9765996187925339, + "num_tokens": 5989462.0, + "step": 679 + }, + { + "entropy": 1.7702144086360931, + "epoch": 2.456108597285068, + "grad_norm": 0.68550044298172, + "learning_rate": 0.0003588078413930155, + "loss": 0.1395, + "mean_token_accuracy": 0.9701545089483261, + "num_tokens": 5998702.0, + "step": 680 + }, + { + "entropy": 1.729397028684616, + "epoch": 2.4597285067873305, + "grad_norm": 0.6107930541038513, + "learning_rate": 0.00035838594868898004, + "loss": 0.1009, + "mean_token_accuracy": 0.9712544083595276, + "num_tokens": 6007594.0, + "step": 681 + }, + { + "entropy": 1.6558150053024292, + "epoch": 2.463348416289593, + "grad_norm": 0.45058509707450867, + "learning_rate": 0.0003579637622531555, + "loss": 0.0747, + "mean_token_accuracy": 0.9791784882545471, + "num_tokens": 6016874.0, + "step": 682 + }, + { + "entropy": 1.7209869921207428, + "epoch": 2.4669683257918553, + "grad_norm": 0.6103800535202026, + "learning_rate": 0.0003575412836890599, + "loss": 0.1096, + "mean_token_accuracy": 0.9665796160697937, + "num_tokens": 6026056.0, + "step": 683 + }, + { + "entropy": 1.790249615907669, + "epoch": 2.4705882352941178, + "grad_norm": 0.67525315284729, + "learning_rate": 0.0003571185146013205, + "loss": 0.0811, + "mean_token_accuracy": 0.9776998162269592, + "num_tokens": 6034624.0, + "step": 684 + }, + { + "entropy": 1.735906183719635, + "epoch": 2.47420814479638, + "grad_norm": 0.884986162185669, + "learning_rate": 0.00035669545659566836, + "loss": 0.2324, + "mean_token_accuracy": 0.9448857754468918, + "num_tokens": 6043557.0, + "step": 685 + }, + { + "entropy": 1.673194944858551, + "epoch": 2.4778280542986426, + "grad_norm": 0.7441328763961792, + "learning_rate": 0.0003562721112789316, + "loss": 0.1661, + "mean_token_accuracy": 0.9566781520843506, + "num_tokens": 6052623.0, + "step": 686 + }, + { + "entropy": 1.736072987318039, + "epoch": 2.481447963800905, + "grad_norm": 0.5674424767494202, + "learning_rate": 0.00035584848025902973, + "loss": 0.0751, + "mean_token_accuracy": 0.9750215858221054, + "num_tokens": 6061347.0, + "step": 687 + }, + { + "entropy": 1.625234305858612, + "epoch": 2.4850678733031675, + "grad_norm": 0.6596720218658447, + "learning_rate": 0.00035542456514496725, + "loss": 0.0796, + "mean_token_accuracy": 0.9773041009902954, + "num_tokens": 6070396.0, + "step": 688 + }, + { + "entropy": 1.6548752784729004, + "epoch": 2.48868778280543, + "grad_norm": 0.5798892378807068, + "learning_rate": 0.00035500036754682794, + "loss": 0.1412, + "mean_token_accuracy": 0.9653023481369019, + "num_tokens": 6079757.0, + "step": 689 + }, + { + "entropy": 1.6213977932929993, + "epoch": 2.4923076923076923, + "grad_norm": 0.44931474328041077, + "learning_rate": 0.00035457588907576823, + "loss": 0.0724, + "mean_token_accuracy": 0.9800422787666321, + "num_tokens": 6088646.0, + "step": 690 + }, + { + "entropy": 1.6762541830539703, + "epoch": 2.4959276018099548, + "grad_norm": 0.6818104386329651, + "learning_rate": 0.0003541511313440114, + "loss": 0.1217, + "mean_token_accuracy": 0.9675028026103973, + "num_tokens": 6097441.0, + "step": 691 + }, + { + "entropy": 1.7241974771022797, + "epoch": 2.499547511312217, + "grad_norm": 0.4126259982585907, + "learning_rate": 0.00035372609596484166, + "loss": 0.0615, + "mean_token_accuracy": 0.9799284338951111, + "num_tokens": 6105578.0, + "step": 692 + }, + { + "entropy": 1.6379709541797638, + "epoch": 2.5031674208144796, + "grad_norm": 0.47291842103004456, + "learning_rate": 0.00035330078455259734, + "loss": 0.0858, + "mean_token_accuracy": 0.9744312763214111, + "num_tokens": 6114404.0, + "step": 693 + }, + { + "entropy": 1.6317658722400665, + "epoch": 2.506787330316742, + "grad_norm": 0.5747683048248291, + "learning_rate": 0.00035287519872266544, + "loss": 0.1344, + "mean_token_accuracy": 0.9632531553506851, + "num_tokens": 6123319.0, + "step": 694 + }, + { + "entropy": 1.6969698369503021, + "epoch": 2.5104072398190045, + "grad_norm": 0.5810018181800842, + "learning_rate": 0.00035244934009147523, + "loss": 0.0927, + "mean_token_accuracy": 0.9729650169610977, + "num_tokens": 6131814.0, + "step": 695 + }, + { + "entropy": 1.631262481212616, + "epoch": 2.514027149321267, + "grad_norm": 0.44387346506118774, + "learning_rate": 0.00035202321027649205, + "loss": 0.0657, + "mean_token_accuracy": 0.9802225232124329, + "num_tokens": 6140967.0, + "step": 696 + }, + { + "entropy": 1.610716551542282, + "epoch": 2.5176470588235293, + "grad_norm": 0.6546471118927002, + "learning_rate": 0.0003515968108962112, + "loss": 0.1114, + "mean_token_accuracy": 0.9671156108379364, + "num_tokens": 6149938.0, + "step": 697 + }, + { + "entropy": 1.598843276500702, + "epoch": 2.521266968325792, + "grad_norm": 0.541953444480896, + "learning_rate": 0.0003511701435701519, + "loss": 0.0504, + "mean_token_accuracy": 0.98616062104702, + "num_tokens": 6158686.0, + "step": 698 + }, + { + "entropy": 1.7793676853179932, + "epoch": 2.524886877828054, + "grad_norm": 0.6303162574768066, + "learning_rate": 0.00035074320991885106, + "loss": 0.0797, + "mean_token_accuracy": 0.9783169627189636, + "num_tokens": 6166835.0, + "step": 699 + }, + { + "entropy": 1.598317414522171, + "epoch": 2.5285067873303166, + "grad_norm": 0.4783090054988861, + "learning_rate": 0.000350316011563857, + "loss": 0.0693, + "mean_token_accuracy": 0.9740357846021652, + "num_tokens": 6175978.0, + "step": 700 + }, + { + "entropy": 1.6361595392227173, + "epoch": 2.532126696832579, + "grad_norm": 0.46353498101234436, + "learning_rate": 0.00034988855012772367, + "loss": 0.0543, + "mean_token_accuracy": 0.9821173399686813, + "num_tokens": 6185071.0, + "step": 701 + }, + { + "entropy": 1.6333596408367157, + "epoch": 2.5357466063348415, + "grad_norm": 0.4968421459197998, + "learning_rate": 0.0003494608272340039, + "loss": 0.1588, + "mean_token_accuracy": 0.9692430347204208, + "num_tokens": 6194279.0, + "step": 702 + }, + { + "entropy": 1.6701206266880035, + "epoch": 2.539366515837104, + "grad_norm": 0.7050784826278687, + "learning_rate": 0.00034903284450724385, + "loss": 0.1298, + "mean_token_accuracy": 0.9623726159334183, + "num_tokens": 6203017.0, + "step": 703 + }, + { + "entropy": 1.6594900786876678, + "epoch": 2.5429864253393664, + "grad_norm": 0.7955659031867981, + "learning_rate": 0.0003486046035729765, + "loss": 0.1695, + "mean_token_accuracy": 0.9616524875164032, + "num_tokens": 6212016.0, + "step": 704 + }, + { + "entropy": 1.7208792865276337, + "epoch": 2.546606334841629, + "grad_norm": 0.7105070352554321, + "learning_rate": 0.00034817610605771546, + "loss": 0.1655, + "mean_token_accuracy": 0.9637335985898972, + "num_tokens": 6220619.0, + "step": 705 + }, + { + "entropy": 1.668517529964447, + "epoch": 2.5502262443438912, + "grad_norm": 0.3955032527446747, + "learning_rate": 0.0003477473535889488, + "loss": 0.0502, + "mean_token_accuracy": 0.9823585599660873, + "num_tokens": 6229785.0, + "step": 706 + }, + { + "entropy": 1.7515103816986084, + "epoch": 2.5538461538461537, + "grad_norm": 0.6166616082191467, + "learning_rate": 0.00034731834779513313, + "loss": 0.1113, + "mean_token_accuracy": 0.9675650298595428, + "num_tokens": 6238724.0, + "step": 707 + }, + { + "entropy": 1.8460631668567657, + "epoch": 2.557466063348416, + "grad_norm": 0.8243921399116516, + "learning_rate": 0.0003468890903056872, + "loss": 0.1625, + "mean_token_accuracy": 0.9648249596357346, + "num_tokens": 6246939.0, + "step": 708 + }, + { + "entropy": 1.784417450428009, + "epoch": 2.5610859728506785, + "grad_norm": 0.5633116960525513, + "learning_rate": 0.00034645958275098557, + "loss": 0.1074, + "mean_token_accuracy": 0.9705483913421631, + "num_tokens": 6255686.0, + "step": 709 + }, + { + "entropy": 1.7208334505558014, + "epoch": 2.564705882352941, + "grad_norm": 0.8083389401435852, + "learning_rate": 0.0003460298267623526, + "loss": 0.1184, + "mean_token_accuracy": 0.9747882932424545, + "num_tokens": 6265047.0, + "step": 710 + }, + { + "entropy": 1.7345463037490845, + "epoch": 2.5683257918552034, + "grad_norm": 0.6094368100166321, + "learning_rate": 0.0003455998239720565, + "loss": 0.1689, + "mean_token_accuracy": 0.9613602459430695, + "num_tokens": 6274460.0, + "step": 711 + }, + { + "entropy": 1.9464713335037231, + "epoch": 2.571945701357466, + "grad_norm": 0.6025084853172302, + "learning_rate": 0.0003451695760133025, + "loss": 0.1477, + "mean_token_accuracy": 0.9618766456842422, + "num_tokens": 6282700.0, + "step": 712 + }, + { + "entropy": 1.8449675738811493, + "epoch": 2.5755656108597282, + "grad_norm": 0.43869853019714355, + "learning_rate": 0.0003447390845202272, + "loss": 0.0892, + "mean_token_accuracy": 0.974039301276207, + "num_tokens": 6291627.0, + "step": 713 + }, + { + "entropy": 1.9028298556804657, + "epoch": 2.579185520361991, + "grad_norm": 0.5455291271209717, + "learning_rate": 0.0003443083511278922, + "loss": 0.0939, + "mean_token_accuracy": 0.9729337990283966, + "num_tokens": 6300198.0, + "step": 714 + }, + { + "entropy": 1.8395194113254547, + "epoch": 2.5828054298642535, + "grad_norm": 0.48734748363494873, + "learning_rate": 0.00034387737747227786, + "loss": 0.0791, + "mean_token_accuracy": 0.9785804748535156, + "num_tokens": 6309362.0, + "step": 715 + }, + { + "entropy": 1.8357026278972626, + "epoch": 2.586425339366516, + "grad_norm": 0.4359396994113922, + "learning_rate": 0.000343446165190277, + "loss": 0.0752, + "mean_token_accuracy": 0.9807359129190445, + "num_tokens": 6318232.0, + "step": 716 + }, + { + "entropy": 1.7531521618366241, + "epoch": 2.5900452488687784, + "grad_norm": 0.7446436882019043, + "learning_rate": 0.0003430147159196887, + "loss": 0.1467, + "mean_token_accuracy": 0.9661064445972443, + "num_tokens": 6327607.0, + "step": 717 + }, + { + "entropy": 1.83816197514534, + "epoch": 2.593665158371041, + "grad_norm": 0.3669150173664093, + "learning_rate": 0.0003425830312992125, + "loss": 0.076, + "mean_token_accuracy": 0.9777591675519943, + "num_tokens": 6336991.0, + "step": 718 + }, + { + "entropy": 1.9396244585514069, + "epoch": 2.5972850678733033, + "grad_norm": 0.6049129962921143, + "learning_rate": 0.00034215111296844147, + "loss": 0.1001, + "mean_token_accuracy": 0.968943640589714, + "num_tokens": 6345381.0, + "step": 719 + }, + { + "entropy": 1.8745197057724, + "epoch": 2.6009049773755657, + "grad_norm": 0.8561233878135681, + "learning_rate": 0.00034171896256785645, + "loss": 0.2378, + "mean_token_accuracy": 0.9442594349384308, + "num_tokens": 6354290.0, + "step": 720 + }, + { + "entropy": 1.8199078440666199, + "epoch": 2.604524886877828, + "grad_norm": 0.4546636939048767, + "learning_rate": 0.00034128658173881993, + "loss": 0.0407, + "mean_token_accuracy": 0.9873656630516052, + "num_tokens": 6362826.0, + "step": 721 + }, + { + "entropy": 1.8066097497940063, + "epoch": 2.6081447963800906, + "grad_norm": 0.6496687531471252, + "learning_rate": 0.0003408539721235691, + "loss": 0.1279, + "mean_token_accuracy": 0.9674505293369293, + "num_tokens": 6371666.0, + "step": 722 + }, + { + "entropy": 1.8027856945991516, + "epoch": 2.611764705882353, + "grad_norm": 0.6001412272453308, + "learning_rate": 0.0003404211353652106, + "loss": 0.1144, + "mean_token_accuracy": 0.9672902077436447, + "num_tokens": 6380469.0, + "step": 723 + }, + { + "entropy": 1.7859437465667725, + "epoch": 2.6153846153846154, + "grad_norm": 0.4654795229434967, + "learning_rate": 0.0003399880731077136, + "loss": 0.0655, + "mean_token_accuracy": 0.9804074019193649, + "num_tokens": 6389485.0, + "step": 724 + }, + { + "entropy": 1.722127079963684, + "epoch": 2.619004524886878, + "grad_norm": 0.5452624559402466, + "learning_rate": 0.0003395547869959037, + "loss": 0.0827, + "mean_token_accuracy": 0.972189649939537, + "num_tokens": 6398523.0, + "step": 725 + }, + { + "entropy": 1.7406074404716492, + "epoch": 2.6226244343891403, + "grad_norm": 0.5524203181266785, + "learning_rate": 0.00033912127867545685, + "loss": 0.1279, + "mean_token_accuracy": 0.9688322842121124, + "num_tokens": 6407560.0, + "step": 726 + }, + { + "entropy": 1.7783840000629425, + "epoch": 2.6262443438914027, + "grad_norm": 0.6428073644638062, + "learning_rate": 0.00033868754979289275, + "loss": 0.1392, + "mean_token_accuracy": 0.9665655642747879, + "num_tokens": 6416230.0, + "step": 727 + }, + { + "entropy": 1.7406431436538696, + "epoch": 2.629864253393665, + "grad_norm": 0.6197221875190735, + "learning_rate": 0.0003382536019955691, + "loss": 0.2688, + "mean_token_accuracy": 0.9567561745643616, + "num_tokens": 6425158.0, + "step": 728 + }, + { + "entropy": 1.7054848670959473, + "epoch": 2.6334841628959276, + "grad_norm": 0.499615877866745, + "learning_rate": 0.0003378194369316749, + "loss": 0.0765, + "mean_token_accuracy": 0.9788558930158615, + "num_tokens": 6434219.0, + "step": 729 + }, + { + "entropy": 1.8623437583446503, + "epoch": 2.63710407239819, + "grad_norm": 0.428608775138855, + "learning_rate": 0.0003373850562502243, + "loss": 0.044, + "mean_token_accuracy": 0.9862259030342102, + "num_tokens": 6442657.0, + "step": 730 + }, + { + "entropy": 1.6827208995819092, + "epoch": 2.6407239819004524, + "grad_norm": 0.46222713589668274, + "learning_rate": 0.00033695046160105076, + "loss": 0.0687, + "mean_token_accuracy": 0.9762164503335953, + "num_tokens": 6451550.0, + "step": 731 + }, + { + "entropy": 1.707773894071579, + "epoch": 2.644343891402715, + "grad_norm": 0.4701695442199707, + "learning_rate": 0.0003365156546347998, + "loss": 0.0622, + "mean_token_accuracy": 0.9804075062274933, + "num_tokens": 6460494.0, + "step": 732 + }, + { + "entropy": 1.7011042833328247, + "epoch": 2.6479638009049773, + "grad_norm": 0.5986224412918091, + "learning_rate": 0.0003360806370029239, + "loss": 0.0954, + "mean_token_accuracy": 0.9730664491653442, + "num_tokens": 6469728.0, + "step": 733 + }, + { + "entropy": 1.810427963733673, + "epoch": 2.6515837104072397, + "grad_norm": 0.8224559426307678, + "learning_rate": 0.0003356454103576754, + "loss": 0.1218, + "mean_token_accuracy": 0.9742488712072372, + "num_tokens": 6478643.0, + "step": 734 + }, + { + "entropy": 1.773183435201645, + "epoch": 2.655203619909502, + "grad_norm": 0.609344482421875, + "learning_rate": 0.0003352099763521006, + "loss": 0.0955, + "mean_token_accuracy": 0.9747250378131866, + "num_tokens": 6487314.0, + "step": 735 + }, + { + "entropy": 1.7761066555976868, + "epoch": 2.6588235294117646, + "grad_norm": 0.6947258114814758, + "learning_rate": 0.0003347743366400333, + "loss": 0.1188, + "mean_token_accuracy": 0.9693178832530975, + "num_tokens": 6496074.0, + "step": 736 + }, + { + "entropy": 1.7725336253643036, + "epoch": 2.662443438914027, + "grad_norm": 0.6928444504737854, + "learning_rate": 0.0003343384928760887, + "loss": 0.1589, + "mean_token_accuracy": 0.9603369683027267, + "num_tokens": 6504997.0, + "step": 737 + }, + { + "entropy": 1.8763961493968964, + "epoch": 2.6660633484162894, + "grad_norm": 0.6204855442047119, + "learning_rate": 0.00033390244671565694, + "loss": 0.1115, + "mean_token_accuracy": 0.9727036952972412, + "num_tokens": 6513639.0, + "step": 738 + }, + { + "entropy": 1.8347080647945404, + "epoch": 2.669683257918552, + "grad_norm": 0.4470975697040558, + "learning_rate": 0.00033346619981489687, + "loss": 0.0707, + "mean_token_accuracy": 0.9816004037857056, + "num_tokens": 6522524.0, + "step": 739 + }, + { + "entropy": 1.8440867066383362, + "epoch": 2.6733031674208148, + "grad_norm": 0.6848122477531433, + "learning_rate": 0.0003330297538307298, + "loss": 0.1133, + "mean_token_accuracy": 0.966602012515068, + "num_tokens": 6531421.0, + "step": 740 + }, + { + "entropy": 1.829009771347046, + "epoch": 2.676923076923077, + "grad_norm": 0.37875643372535706, + "learning_rate": 0.0003325931104208333, + "loss": 0.0539, + "mean_token_accuracy": 0.9850967526435852, + "num_tokens": 6540304.0, + "step": 741 + }, + { + "entropy": 1.8256315886974335, + "epoch": 2.6805429864253396, + "grad_norm": 0.4970630407333374, + "learning_rate": 0.00033215627124363466, + "loss": 0.1195, + "mean_token_accuracy": 0.9662436544895172, + "num_tokens": 6549267.0, + "step": 742 + }, + { + "entropy": 1.823629915714264, + "epoch": 2.684162895927602, + "grad_norm": 0.659981906414032, + "learning_rate": 0.0003317192379583047, + "loss": 0.1368, + "mean_token_accuracy": 0.9655566364526749, + "num_tokens": 6558447.0, + "step": 743 + }, + { + "entropy": 1.8459455370903015, + "epoch": 2.6877828054298645, + "grad_norm": 0.620197057723999, + "learning_rate": 0.0003312820122247515, + "loss": 0.1766, + "mean_token_accuracy": 0.9569400995969772, + "num_tokens": 6567424.0, + "step": 744 + }, + { + "entropy": 1.7685991525650024, + "epoch": 2.691402714932127, + "grad_norm": 0.34498465061187744, + "learning_rate": 0.0003308445957036142, + "loss": 0.0615, + "mean_token_accuracy": 0.982216015458107, + "num_tokens": 6577071.0, + "step": 745 + }, + { + "entropy": 1.8037284910678864, + "epoch": 2.6950226244343893, + "grad_norm": 0.5550521016120911, + "learning_rate": 0.00033040699005625654, + "loss": 0.0701, + "mean_token_accuracy": 0.9795115292072296, + "num_tokens": 6586396.0, + "step": 746 + }, + { + "entropy": 1.813001424074173, + "epoch": 2.6986425339366518, + "grad_norm": 0.4117080271244049, + "learning_rate": 0.0003299691969447603, + "loss": 0.0657, + "mean_token_accuracy": 0.978747770190239, + "num_tokens": 6595189.0, + "step": 747 + }, + { + "entropy": 1.844575196504593, + "epoch": 2.702262443438914, + "grad_norm": 0.32197874784469604, + "learning_rate": 0.00032953121803191976, + "loss": 0.0342, + "mean_token_accuracy": 0.9904316365718842, + "num_tokens": 6604169.0, + "step": 748 + }, + { + "entropy": 1.9490505158901215, + "epoch": 2.7058823529411766, + "grad_norm": 0.5810762047767639, + "learning_rate": 0.00032909305498123465, + "loss": 0.1419, + "mean_token_accuracy": 0.9646100401878357, + "num_tokens": 6612744.0, + "step": 749 + }, + { + "entropy": 1.9927488267421722, + "epoch": 2.709502262443439, + "grad_norm": 0.7435065507888794, + "learning_rate": 0.0003286547094569039, + "loss": 0.1368, + "mean_token_accuracy": 0.9609140008687973, + "num_tokens": 6621000.0, + "step": 750 + }, + { + "entropy": 1.8266884088516235, + "epoch": 2.7131221719457015, + "grad_norm": 0.6717537045478821, + "learning_rate": 0.00032821618312381975, + "loss": 0.1449, + "mean_token_accuracy": 0.9694183021783829, + "num_tokens": 6629893.0, + "step": 751 + }, + { + "entropy": 1.850794643163681, + "epoch": 2.716742081447964, + "grad_norm": 0.44241195917129517, + "learning_rate": 0.00032777747764756117, + "loss": 0.0602, + "mean_token_accuracy": 0.9823136776685715, + "num_tokens": 6638696.0, + "step": 752 + }, + { + "entropy": 1.8408480882644653, + "epoch": 2.7203619909502263, + "grad_norm": 0.6299809217453003, + "learning_rate": 0.00032733859469438736, + "loss": 0.1408, + "mean_token_accuracy": 0.9629880636930466, + "num_tokens": 6647431.0, + "step": 753 + }, + { + "entropy": 1.7875444293022156, + "epoch": 2.723981900452489, + "grad_norm": 0.48492106795310974, + "learning_rate": 0.00032689953593123175, + "loss": 0.0806, + "mean_token_accuracy": 0.9798424690961838, + "num_tokens": 6656443.0, + "step": 754 + }, + { + "entropy": 1.778283566236496, + "epoch": 2.727601809954751, + "grad_norm": 0.46145930886268616, + "learning_rate": 0.0003264603030256955, + "loss": 0.0707, + "mean_token_accuracy": 0.9741399586200714, + "num_tokens": 6665465.0, + "step": 755 + }, + { + "entropy": 1.7340950965881348, + "epoch": 2.7312217194570136, + "grad_norm": 0.5734900236129761, + "learning_rate": 0.00032602089764604126, + "loss": 0.1443, + "mean_token_accuracy": 0.96195288002491, + "num_tokens": 6674797.0, + "step": 756 + }, + { + "entropy": 1.7791962027549744, + "epoch": 2.734841628959276, + "grad_norm": 0.5199477076530457, + "learning_rate": 0.00032558132146118636, + "loss": 0.0794, + "mean_token_accuracy": 0.975062221288681, + "num_tokens": 6683578.0, + "step": 757 + }, + { + "entropy": 1.825905591249466, + "epoch": 2.7384615384615385, + "grad_norm": 0.5944926738739014, + "learning_rate": 0.0003251415761406975, + "loss": 0.0909, + "mean_token_accuracy": 0.954865038394928, + "num_tokens": 6691818.0, + "step": 758 + }, + { + "entropy": 1.804949015378952, + "epoch": 2.742081447963801, + "grad_norm": 0.7065241932868958, + "learning_rate": 0.0003247016633547833, + "loss": 0.1511, + "mean_token_accuracy": 0.9687065333127975, + "num_tokens": 6700619.0, + "step": 759 + }, + { + "entropy": 1.7419202327728271, + "epoch": 2.7457013574660634, + "grad_norm": 0.49316564202308655, + "learning_rate": 0.00032426158477428857, + "loss": 0.0867, + "mean_token_accuracy": 0.9774050414562225, + "num_tokens": 6709635.0, + "step": 760 + }, + { + "entropy": 1.8934829235076904, + "epoch": 2.749321266968326, + "grad_norm": 0.9417999386787415, + "learning_rate": 0.00032382134207068787, + "loss": 0.1464, + "mean_token_accuracy": 0.9591032713651657, + "num_tokens": 6717657.0, + "step": 761 + }, + { + "entropy": 1.7354997992515564, + "epoch": 2.7529411764705882, + "grad_norm": 0.7240809798240662, + "learning_rate": 0.00032338093691607907, + "loss": 0.13, + "mean_token_accuracy": 0.9705345183610916, + "num_tokens": 6726671.0, + "step": 762 + }, + { + "entropy": 1.7620687186717987, + "epoch": 2.7565610859728507, + "grad_norm": 0.4986638128757477, + "learning_rate": 0.0003229403709831772, + "loss": 0.0963, + "mean_token_accuracy": 0.9756871312856674, + "num_tokens": 6735157.0, + "step": 763 + }, + { + "entropy": 1.7719130218029022, + "epoch": 2.760180995475113, + "grad_norm": 0.6204966902732849, + "learning_rate": 0.00032249964594530757, + "loss": 0.0578, + "mean_token_accuracy": 0.9815829247236252, + "num_tokens": 6743855.0, + "step": 764 + }, + { + "entropy": 1.7228702902793884, + "epoch": 2.7638009049773755, + "grad_norm": 0.5283492207527161, + "learning_rate": 0.0003220587634764003, + "loss": 0.069, + "mean_token_accuracy": 0.9851528853178024, + "num_tokens": 6753040.0, + "step": 765 + }, + { + "entropy": 1.7129736840724945, + "epoch": 2.767420814479638, + "grad_norm": 0.49026060104370117, + "learning_rate": 0.0003216177252509831, + "loss": 0.0672, + "mean_token_accuracy": 0.9857761710882187, + "num_tokens": 6762014.0, + "step": 766 + }, + { + "entropy": 1.7600707411766052, + "epoch": 2.7710407239819004, + "grad_norm": 0.5250128507614136, + "learning_rate": 0.00032117653294417523, + "loss": 0.1134, + "mean_token_accuracy": 0.9638848602771759, + "num_tokens": 6771012.0, + "step": 767 + }, + { + "entropy": 1.768298476934433, + "epoch": 2.774660633484163, + "grad_norm": 0.5671310424804688, + "learning_rate": 0.00032073518823168143, + "loss": 0.057, + "mean_token_accuracy": 0.9840837568044662, + "num_tokens": 6779601.0, + "step": 768 + }, + { + "entropy": 1.7464122474193573, + "epoch": 2.7782805429864252, + "grad_norm": 0.6007266044616699, + "learning_rate": 0.0003202936927897852, + "loss": 0.081, + "mean_token_accuracy": 0.9773043692111969, + "num_tokens": 6788518.0, + "step": 769 + }, + { + "entropy": 1.6484523713588715, + "epoch": 2.7819004524886877, + "grad_norm": 0.5163906812667847, + "learning_rate": 0.00031985204829534236, + "loss": 0.1215, + "mean_token_accuracy": 0.9645300209522247, + "num_tokens": 6797924.0, + "step": 770 + }, + { + "entropy": 1.7306124567985535, + "epoch": 2.78552036199095, + "grad_norm": 0.5778948068618774, + "learning_rate": 0.00031941025642577515, + "loss": 0.127, + "mean_token_accuracy": 0.9713134616613388, + "num_tokens": 6806828.0, + "step": 771 + }, + { + "entropy": 1.6599189043045044, + "epoch": 2.7891402714932125, + "grad_norm": 0.5121646523475647, + "learning_rate": 0.0003189683188590653, + "loss": 0.1066, + "mean_token_accuracy": 0.9707446396350861, + "num_tokens": 6816144.0, + "step": 772 + }, + { + "entropy": 1.71377295255661, + "epoch": 2.792760180995475, + "grad_norm": 0.9535031318664551, + "learning_rate": 0.00031852623727374787, + "loss": 0.2316, + "mean_token_accuracy": 0.9587533473968506, + "num_tokens": 6824849.0, + "step": 773 + }, + { + "entropy": 1.7716725766658783, + "epoch": 2.7963800904977374, + "grad_norm": 0.5735589265823364, + "learning_rate": 0.00031808401334890537, + "loss": 0.1028, + "mean_token_accuracy": 0.9716143608093262, + "num_tokens": 6833331.0, + "step": 774 + }, + { + "entropy": 1.7134707272052765, + "epoch": 2.8, + "grad_norm": 0.7087857127189636, + "learning_rate": 0.00031764164876416036, + "loss": 0.1201, + "mean_token_accuracy": 0.9686445444822311, + "num_tokens": 6842254.0, + "step": 775 + }, + { + "entropy": 1.6055873930454254, + "epoch": 2.8036199095022623, + "grad_norm": 0.4578965902328491, + "learning_rate": 0.00031719914519967, + "loss": 0.0827, + "mean_token_accuracy": 0.972065269947052, + "num_tokens": 6851644.0, + "step": 776 + }, + { + "entropy": 1.6444376707077026, + "epoch": 2.8072398190045247, + "grad_norm": 0.5656917095184326, + "learning_rate": 0.0003167565043361194, + "loss": 0.1036, + "mean_token_accuracy": 0.9723617881536484, + "num_tokens": 6860787.0, + "step": 777 + }, + { + "entropy": 1.6980305314064026, + "epoch": 2.810859728506787, + "grad_norm": 0.7013098001480103, + "learning_rate": 0.0003163137278547146, + "loss": 0.0838, + "mean_token_accuracy": 0.9793482422828674, + "num_tokens": 6869378.0, + "step": 778 + }, + { + "entropy": 1.6744478940963745, + "epoch": 2.8144796380090495, + "grad_norm": 0.6889812350273132, + "learning_rate": 0.00031587081743717735, + "loss": 0.0964, + "mean_token_accuracy": 0.9762091189622879, + "num_tokens": 6878050.0, + "step": 779 + }, + { + "entropy": 1.6397214829921722, + "epoch": 2.818099547511312, + "grad_norm": 0.7166011333465576, + "learning_rate": 0.00031542777476573785, + "loss": 0.1792, + "mean_token_accuracy": 0.9539972990751266, + "num_tokens": 6887153.0, + "step": 780 + }, + { + "entropy": 1.6447750926017761, + "epoch": 2.8217194570135744, + "grad_norm": 0.7113035321235657, + "learning_rate": 0.0003149846015231286, + "loss": 0.1464, + "mean_token_accuracy": 0.96909099817276, + "num_tokens": 6895877.0, + "step": 781 + }, + { + "entropy": 1.6827795505523682, + "epoch": 2.825339366515837, + "grad_norm": 0.6915350556373596, + "learning_rate": 0.0003145412993925781, + "loss": 0.1335, + "mean_token_accuracy": 0.9615183472633362, + "num_tokens": 6904553.0, + "step": 782 + }, + { + "entropy": 1.6189779937267303, + "epoch": 2.8289592760180997, + "grad_norm": 0.467428982257843, + "learning_rate": 0.00031409787005780423, + "loss": 0.0829, + "mean_token_accuracy": 0.9781016558408737, + "num_tokens": 6913634.0, + "step": 783 + }, + { + "entropy": 1.6323690116405487, + "epoch": 2.832579185520362, + "grad_norm": 0.49170154333114624, + "learning_rate": 0.00031365431520300813, + "loss": 0.0828, + "mean_token_accuracy": 0.9719655811786652, + "num_tokens": 6922638.0, + "step": 784 + }, + { + "entropy": 1.6121336817741394, + "epoch": 2.8361990950226246, + "grad_norm": 0.5629302263259888, + "learning_rate": 0.00031321063651286777, + "loss": 0.0757, + "mean_token_accuracy": 0.9791934490203857, + "num_tokens": 6931590.0, + "step": 785 + }, + { + "entropy": 1.7345627546310425, + "epoch": 2.839819004524887, + "grad_norm": 0.5514137148857117, + "learning_rate": 0.0003127668356725313, + "loss": 0.0819, + "mean_token_accuracy": 0.9800210148096085, + "num_tokens": 6940137.0, + "step": 786 + }, + { + "entropy": 1.6671563386917114, + "epoch": 2.8434389140271494, + "grad_norm": 0.5090643167495728, + "learning_rate": 0.0003123229143676109, + "loss": 0.0794, + "mean_token_accuracy": 0.9826332330703735, + "num_tokens": 6948616.0, + "step": 787 + }, + { + "entropy": 1.551501840353012, + "epoch": 2.847058823529412, + "grad_norm": 0.3994922935962677, + "learning_rate": 0.0003118788742841761, + "loss": 0.0491, + "mean_token_accuracy": 0.9865831136703491, + "num_tokens": 6957369.0, + "step": 788 + }, + { + "entropy": 1.500845193862915, + "epoch": 2.8506787330316743, + "grad_norm": 0.6023295521736145, + "learning_rate": 0.00031143471710874795, + "loss": 0.114, + "mean_token_accuracy": 0.9669302552938461, + "num_tokens": 6966667.0, + "step": 789 + }, + { + "entropy": 1.5258118510246277, + "epoch": 2.8542986425339367, + "grad_norm": 0.5326524972915649, + "learning_rate": 0.00031099044452829186, + "loss": 0.0657, + "mean_token_accuracy": 0.9833361059427261, + "num_tokens": 6975880.0, + "step": 790 + }, + { + "entropy": 1.5674570798873901, + "epoch": 2.857918552036199, + "grad_norm": 0.4518730044364929, + "learning_rate": 0.00031054605823021186, + "loss": 0.0569, + "mean_token_accuracy": 0.9832890778779984, + "num_tokens": 6984824.0, + "step": 791 + }, + { + "entropy": 1.5301121771335602, + "epoch": 2.8615384615384616, + "grad_norm": 0.5933698415756226, + "learning_rate": 0.00031010155990234364, + "loss": 0.1129, + "mean_token_accuracy": 0.9684284627437592, + "num_tokens": 6994076.0, + "step": 792 + }, + { + "entropy": 1.5711756348609924, + "epoch": 2.865158371040724, + "grad_norm": 0.6634730696678162, + "learning_rate": 0.00030965695123294837, + "loss": 0.1204, + "mean_token_accuracy": 0.972825437784195, + "num_tokens": 7003048.0, + "step": 793 + }, + { + "entropy": 1.6537431180477142, + "epoch": 2.8687782805429864, + "grad_norm": 0.5688450336456299, + "learning_rate": 0.0003092122339107067, + "loss": 0.0659, + "mean_token_accuracy": 0.9861912727355957, + "num_tokens": 7011743.0, + "step": 794 + }, + { + "entropy": 1.731940358877182, + "epoch": 2.872398190045249, + "grad_norm": 0.9030163288116455, + "learning_rate": 0.0003087674096247115, + "loss": 0.0829, + "mean_token_accuracy": 0.9802074134349823, + "num_tokens": 7020003.0, + "step": 795 + }, + { + "entropy": 1.6672345995903015, + "epoch": 2.8760180995475113, + "grad_norm": 0.5129911303520203, + "learning_rate": 0.00030832248006446223, + "loss": 0.0823, + "mean_token_accuracy": 0.9805259853601456, + "num_tokens": 7029275.0, + "step": 796 + }, + { + "entropy": 1.7102139592170715, + "epoch": 2.8796380090497737, + "grad_norm": 0.6210790872573853, + "learning_rate": 0.00030787744691985797, + "loss": 0.1248, + "mean_token_accuracy": 0.9665560126304626, + "num_tokens": 7038068.0, + "step": 797 + }, + { + "entropy": 1.659182459115982, + "epoch": 2.883257918552036, + "grad_norm": 0.6379976868629456, + "learning_rate": 0.0003074323118811913, + "loss": 0.1065, + "mean_token_accuracy": 0.9647062122821808, + "num_tokens": 7047039.0, + "step": 798 + }, + { + "entropy": 1.6344517767429352, + "epoch": 2.8868778280542986, + "grad_norm": 0.5851842761039734, + "learning_rate": 0.00030698707663914186, + "loss": 0.1046, + "mean_token_accuracy": 0.9666399955749512, + "num_tokens": 7056105.0, + "step": 799 + }, + { + "entropy": 1.6803805828094482, + "epoch": 2.890497737556561, + "grad_norm": 0.5926725268363953, + "learning_rate": 0.00030654174288477, + "loss": 0.1019, + "mean_token_accuracy": 0.9712099581956863, + "num_tokens": 7064710.0, + "step": 800 + }, + { + "entropy": 1.7004003822803497, + "epoch": 2.8941176470588235, + "grad_norm": 0.6103729605674744, + "learning_rate": 0.0003060963123095098, + "loss": 0.091, + "mean_token_accuracy": 0.9780148714780807, + "num_tokens": 7073218.0, + "step": 801 + }, + { + "entropy": 1.8133964240550995, + "epoch": 2.897737556561086, + "grad_norm": 0.872008740901947, + "learning_rate": 0.0003056507866051636, + "loss": 0.3003, + "mean_token_accuracy": 0.9385994374752045, + "num_tokens": 7081791.0, + "step": 802 + }, + { + "entropy": 1.7527997195720673, + "epoch": 2.9013574660633483, + "grad_norm": 0.553669810295105, + "learning_rate": 0.0003052051674638945, + "loss": 0.0999, + "mean_token_accuracy": 0.9695112109184265, + "num_tokens": 7090196.0, + "step": 803 + }, + { + "entropy": 1.6374657154083252, + "epoch": 2.9049773755656108, + "grad_norm": 0.4158615469932556, + "learning_rate": 0.00030475945657822107, + "loss": 0.0682, + "mean_token_accuracy": 0.9802833646535873, + "num_tokens": 7099216.0, + "step": 804 + }, + { + "entropy": 1.6056133210659027, + "epoch": 2.908597285067873, + "grad_norm": 0.47468429803848267, + "learning_rate": 0.00030431365564101003, + "loss": 0.1188, + "mean_token_accuracy": 0.9720293581485748, + "num_tokens": 7108787.0, + "step": 805 + }, + { + "entropy": 1.7184821665287018, + "epoch": 2.9122171945701356, + "grad_norm": 0.6617569923400879, + "learning_rate": 0.00030386776634547003, + "loss": 0.1121, + "mean_token_accuracy": 0.9623472690582275, + "num_tokens": 7117158.0, + "step": 806 + }, + { + "entropy": 1.7546651065349579, + "epoch": 2.915837104072398, + "grad_norm": 0.5058173537254333, + "learning_rate": 0.0003034217903851454, + "loss": 0.0861, + "mean_token_accuracy": 0.9664297550916672, + "num_tokens": 7125800.0, + "step": 807 + }, + { + "entropy": 1.6985557675361633, + "epoch": 2.9194570135746605, + "grad_norm": 0.5197705626487732, + "learning_rate": 0.00030297572945390996, + "loss": 0.1009, + "mean_token_accuracy": 0.9677706956863403, + "num_tokens": 7134221.0, + "step": 808 + }, + { + "entropy": 1.6737182438373566, + "epoch": 2.9230769230769234, + "grad_norm": 0.4528989791870117, + "learning_rate": 0.00030252958524595966, + "loss": 0.0656, + "mean_token_accuracy": 0.9853187948465347, + "num_tokens": 7142716.0, + "step": 809 + }, + { + "entropy": 1.687746375799179, + "epoch": 2.926696832579186, + "grad_norm": 0.8552060723304749, + "learning_rate": 0.00030208335945580716, + "loss": 0.1584, + "mean_token_accuracy": 0.958037719130516, + "num_tokens": 7151288.0, + "step": 810 + }, + { + "entropy": 1.6994356215000153, + "epoch": 2.930316742081448, + "grad_norm": 0.470833957195282, + "learning_rate": 0.00030163705377827496, + "loss": 0.0537, + "mean_token_accuracy": 0.9804185479879379, + "num_tokens": 7159738.0, + "step": 811 + }, + { + "entropy": 1.7072536945343018, + "epoch": 2.9339366515837106, + "grad_norm": 0.5749104022979736, + "learning_rate": 0.0003011906699084888, + "loss": 0.0502, + "mean_token_accuracy": 0.9830235093832016, + "num_tokens": 7168101.0, + "step": 812 + }, + { + "entropy": 1.70310440659523, + "epoch": 2.937556561085973, + "grad_norm": 0.7587386965751648, + "learning_rate": 0.0003007442095418715, + "loss": 0.1362, + "mean_token_accuracy": 0.9594880938529968, + "num_tokens": 7176663.0, + "step": 813 + }, + { + "entropy": 1.6307457983493805, + "epoch": 2.9411764705882355, + "grad_norm": 0.5054190754890442, + "learning_rate": 0.00030029767437413665, + "loss": 0.0744, + "mean_token_accuracy": 0.9738886505365372, + "num_tokens": 7185376.0, + "step": 814 + }, + { + "entropy": 1.5872860848903656, + "epoch": 2.944796380090498, + "grad_norm": 0.5463546514511108, + "learning_rate": 0.00029985106610128147, + "loss": 0.0916, + "mean_token_accuracy": 0.9782509952783585, + "num_tokens": 7194304.0, + "step": 815 + }, + { + "entropy": 1.6643644273281097, + "epoch": 2.9484162895927604, + "grad_norm": 0.5434613823890686, + "learning_rate": 0.0002994043864195811, + "loss": 0.1007, + "mean_token_accuracy": 0.9665197134017944, + "num_tokens": 7202895.0, + "step": 816 + }, + { + "entropy": 1.701482743024826, + "epoch": 2.952036199095023, + "grad_norm": 1.2643967866897583, + "learning_rate": 0.00029895763702558206, + "loss": 0.1377, + "mean_token_accuracy": 0.9696027487516403, + "num_tokens": 7211000.0, + "step": 817 + }, + { + "entropy": 1.688760131597519, + "epoch": 2.9556561085972852, + "grad_norm": 0.5438109636306763, + "learning_rate": 0.00029851081961609536, + "loss": 0.0637, + "mean_token_accuracy": 0.9724639654159546, + "num_tokens": 7219274.0, + "step": 818 + }, + { + "entropy": 1.6547857522964478, + "epoch": 2.9592760180995477, + "grad_norm": 0.4520387649536133, + "learning_rate": 0.0002980639358881906, + "loss": 0.0376, + "mean_token_accuracy": 0.9887004494667053, + "num_tokens": 7228000.0, + "step": 819 + }, + { + "entropy": 1.5814381837844849, + "epoch": 2.96289592760181, + "grad_norm": 0.49122339487075806, + "learning_rate": 0.00029761698753918894, + "loss": 0.0533, + "mean_token_accuracy": 0.983299508690834, + "num_tokens": 7236798.0, + "step": 820 + }, + { + "entropy": 1.5796774625778198, + "epoch": 2.9665158371040725, + "grad_norm": 0.43303897976875305, + "learning_rate": 0.00029716997626665726, + "loss": 0.0517, + "mean_token_accuracy": 0.984140008687973, + "num_tokens": 7245570.0, + "step": 821 + }, + { + "entropy": 1.5434466302394867, + "epoch": 2.970135746606335, + "grad_norm": 0.5712567567825317, + "learning_rate": 0.0002967229037684014, + "loss": 0.0634, + "mean_token_accuracy": 0.9851510971784592, + "num_tokens": 7254482.0, + "step": 822 + }, + { + "entropy": 1.5368549823760986, + "epoch": 2.9737556561085974, + "grad_norm": 0.5042312741279602, + "learning_rate": 0.0002962757717424595, + "loss": 0.1041, + "mean_token_accuracy": 0.9698852747678757, + "num_tokens": 7263428.0, + "step": 823 + }, + { + "entropy": 1.5740615129470825, + "epoch": 2.97737556561086, + "grad_norm": 0.8506835699081421, + "learning_rate": 0.0002958285818870963, + "loss": 0.0653, + "mean_token_accuracy": 0.9827365875244141, + "num_tokens": 7272425.0, + "step": 824 + }, + { + "entropy": 1.625010073184967, + "epoch": 2.9809954751131222, + "grad_norm": 0.6260822415351868, + "learning_rate": 0.00029538133590079556, + "loss": 0.1112, + "mean_token_accuracy": 0.9715189933776855, + "num_tokens": 7281312.0, + "step": 825 + }, + { + "entropy": 1.6078990697860718, + "epoch": 2.9846153846153847, + "grad_norm": 0.4316014349460602, + "learning_rate": 0.00029493403548225467, + "loss": 0.059, + "mean_token_accuracy": 0.9821690768003464, + "num_tokens": 7289748.0, + "step": 826 + }, + { + "entropy": 1.6132618486881256, + "epoch": 2.988235294117647, + "grad_norm": 0.6471059322357178, + "learning_rate": 0.0002944866823303776, + "loss": 0.0839, + "mean_token_accuracy": 0.9747331887483597, + "num_tokens": 7298453.0, + "step": 827 + }, + { + "entropy": 1.6038751900196075, + "epoch": 2.9918552036199095, + "grad_norm": 0.5383681654930115, + "learning_rate": 0.0002940392781442686, + "loss": 0.0728, + "mean_token_accuracy": 0.9774085730314255, + "num_tokens": 7307116.0, + "step": 828 + }, + { + "entropy": 1.6446776688098907, + "epoch": 2.995475113122172, + "grad_norm": 0.5420554280281067, + "learning_rate": 0.0002935918246232259, + "loss": 0.0799, + "mean_token_accuracy": 0.977481946349144, + "num_tokens": 7315668.0, + "step": 829 + }, + { + "entropy": 1.5571844279766083, + "epoch": 2.9990950226244344, + "grad_norm": 0.6471306681632996, + "learning_rate": 0.00029314432346673485, + "loss": 0.1657, + "mean_token_accuracy": 0.9566951394081116, + "num_tokens": 7324721.0, + "step": 830 + }, + { + "entropy": 2.0783205032348633, + "epoch": 3.0, + "grad_norm": 3.195817232131958, + "learning_rate": 0.000292696776374462, + "loss": 0.0742, + "mean_token_accuracy": 0.96875, + "num_tokens": 7325175.0, + "step": 831 + }, + { + "epoch": 3.0, + "eval_entropy": 1.6213929740394033, + "eval_loss": 0.14780744910240173, + "eval_mean_token_accuracy": 0.9634173047251817, + "eval_num_tokens": 7325175.0, + "eval_runtime": 116.0041, + "eval_samples_per_second": 3.181, + "eval_steps_per_second": 1.06, + "step": 831 + } + ], + "logging_steps": 1, + "max_steps": 1662, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.948999773397185e+17, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-831/training_args.bin b/checkpoint-831/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..070a2de135e794840c49e066215a1c9f2e550d1f --- /dev/null +++ b/checkpoint-831/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc271f94ce32216bd6f2ee9866fb7d62a0583bc7ee0c7fa953fa57c302729c6c +size 6289 diff --git a/runs/Dec24_15-42-28_nid008456/events.out.tfevents.1766619767.nid008456.1051144.0 b/runs/Dec24_15-42-28_nid008456/events.out.tfevents.1766619767.nid008456.1051144.0 new file mode 100644 index 0000000000000000000000000000000000000000..5c112dcd5c33acc2bf21d5e1d929d30a893a6f1c --- /dev/null +++ b/runs/Dec24_15-42-28_nid008456/events.out.tfevents.1766619767.nid008456.1051144.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5b2ec98dacb9e48853bcb14b0cb3bea62ac2f3de8f3516b8a1fd717aed544e2 +size 6545 diff --git a/runs/Dec24_15-44-03_nid008456/events.out.tfevents.1766619862.nid008456.1052968.0 b/runs/Dec24_15-44-03_nid008456/events.out.tfevents.1766619862.nid008456.1052968.0 new file mode 100644 index 0000000000000000000000000000000000000000..662f0c47be223a76b86b71c82756d0ff2c79efc5 --- /dev/null +++ b/runs/Dec24_15-44-03_nid008456/events.out.tfevents.1766619862.nid008456.1052968.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa393668471eec4e56826183e92e2984538831b5272921b658e2ffd4114683b3 +size 6545 diff --git a/runs/Dec24_15-45-08_nid008456/events.out.tfevents.1766619927.nid008456.1054481.0 b/runs/Dec24_15-45-08_nid008456/events.out.tfevents.1766619927.nid008456.1054481.0 new file mode 100644 index 0000000000000000000000000000000000000000..50fdbd19f704588f6647385a3d9c9255f7a011b1 --- /dev/null +++ b/runs/Dec24_15-45-08_nid008456/events.out.tfevents.1766619927.nid008456.1054481.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a967b2a79a6b7b9eda8a42d1f3a32bd4de9aa48ffc4ce6af5c313d5b717d622e +size 6545 diff --git a/runs/Dec24_15-46-20_nid008456/events.out.tfevents.1766619999.nid008456.1055618.0 b/runs/Dec24_15-46-20_nid008456/events.out.tfevents.1766619999.nid008456.1055618.0 new file mode 100644 index 0000000000000000000000000000000000000000..2c85986984b506c65df0a28945158270b4e46fc2 --- /dev/null +++ b/runs/Dec24_15-46-20_nid008456/events.out.tfevents.1766619999.nid008456.1055618.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d02beb1e5bc0cd247773f1dfe380a568d819f7413393fb84ab22a5a41bc50db2 +size 6545 diff --git a/runs/Dec24_15-47-55_nid008456/events.out.tfevents.1766620094.nid008456.1057607.0 b/runs/Dec24_15-47-55_nid008456/events.out.tfevents.1766620094.nid008456.1057607.0 new file mode 100644 index 0000000000000000000000000000000000000000..10b6dea2433dd98fd1314938d63b786f7a442f87 --- /dev/null +++ b/runs/Dec24_15-47-55_nid008456/events.out.tfevents.1766620094.nid008456.1057607.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae2e070503fa32bf313616eb622a19b1787bf1590e90374452daece1f32dedbe +size 6545 diff --git a/runs/Dec24_15-49-20_nid008456/events.out.tfevents.1766620178.nid008456.1058442.0 b/runs/Dec24_15-49-20_nid008456/events.out.tfevents.1766620178.nid008456.1058442.0 new file mode 100644 index 0000000000000000000000000000000000000000..9e219cf45c983bf9047d5d5619dccff982438a55 --- /dev/null +++ b/runs/Dec24_15-49-20_nid008456/events.out.tfevents.1766620178.nid008456.1058442.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd3a77c71e2a68512a41dd9f9340ade45b76fb61ee482e07228fbf636118a582 +size 6545 diff --git a/runs/Dec24_15-51-11_nid008456/events.out.tfevents.1766620280.nid008456.1059239.0 b/runs/Dec24_15-51-11_nid008456/events.out.tfevents.1766620280.nid008456.1059239.0 new file mode 100644 index 0000000000000000000000000000000000000000..ae514c1fc8d3d5b7b38799cf08aa91086547ecec --- /dev/null +++ b/runs/Dec24_15-51-11_nid008456/events.out.tfevents.1766620280.nid008456.1059239.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f83ad324bdd007e8126b686b3c9c743c3653ab0ff0305d222060916ed66de1f +size 422971 diff --git a/runs/Dec25_05-10-12_nid008513/events.out.tfevents.1766668222.nid008513.1579240.0 b/runs/Dec25_05-10-12_nid008513/events.out.tfevents.1766668222.nid008513.1579240.0 new file mode 100644 index 0000000000000000000000000000000000000000..fe5edc0bf63e07593abd812bb0d7c28a6046188b --- /dev/null +++ b/runs/Dec25_05-10-12_nid008513/events.out.tfevents.1766668222.nid008513.1579240.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02ee262c5267c8eab4115df0ba2eff2958a62b155dc764e365768b4650f8ea5f +size 643528