diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..b7d7fc7d946ecdea9688cdc0285c2cb3b893cb83 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +outputs/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +outputs/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +outputs/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +outputs/checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +outputs/checkpoint-313/tokenizer.json filter=lfs diff=lfs merge=lfs -text +outputs/checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/outputs/.ipynb_checkpoints/README-checkpoint.md b/outputs/.ipynb_checkpoints/README-checkpoint.md new file mode 100644 index 0000000000000000000000000000000000000000..3c31d8bb65aa32ebf0ae5090690923003aa1ea22 --- /dev/null +++ b/outputs/.ipynb_checkpoints/README-checkpoint.md @@ -0,0 +1,59 @@ +--- +base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit +library_name: transformers +model_name: outputs +tags: +- generated_from_trainer +- trl +- sft +- unsloth +licence: license +--- + +# Model Card for outputs + +This model is a fine-tuned version of [unsloth/gpt-oss-20b-unsloth-bnb-4bit](https://huggingface.co/unsloth/gpt-oss-20b-unsloth-bnb-4bit). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 
+generator = pipeline("text-generation", model="outputs", device="cuda")  # "outputs" = local training output directory; use a checkpoint sub-directory or the Hub repo id if the model lives elsewhere +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.22.2 +- Transformers: 4.55.4 +- Pytorch: 2.8.0+cu128 +- Datasets: 3.6.0 +- Tokenizers: 0.21.4 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/outputs/README.md b/outputs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0bf58439fd19bd30ce4c842fa359ce0d570c835d --- /dev/null +++ b/outputs/README.md @@ -0,0 +1,59 @@ +--- +base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit +library_name: transformers +model_name: outputs +tags: +- generated_from_trainer +- unsloth +- sft +- trl +licence: license +--- + +# Model Card for outputs + +This model is a fine-tuned version of [unsloth/gpt-oss-20b-unsloth-bnb-4bit](https://huggingface.co/unsloth/gpt-oss-20b-unsloth-bnb-4bit). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 
+generator = pipeline("text-generation", model="outputs", device="cuda")  # "outputs" = local training output directory; use a checkpoint sub-directory or the Hub repo id if the model lives elsewhere +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.22.2 +- Transformers: 4.55.4 +- Pytorch: 2.8.0 +- Datasets: 3.6.0 +- Tokenizers: 0.21.4 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/outputs/checkpoint-1000/README.md b/outputs/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7 --- /dev/null +++ b/outputs/checkpoint-1000/README.md @@ -0,0 +1,209 @@ +--- +base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit +library_name: peft +tags: +- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## 
Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/outputs/checkpoint-1000/adapter_config.json b/outputs/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7 --- /dev/null +++ b/outputs/checkpoint-1000/adapter_config.json @@ -0,0 +1,45 @@ +{ + "alpha_pattern": {}, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + 
"target_modules": [ + "o_proj", + "v_proj", + "up_proj", + "down_proj", + "gate_proj", + "k_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/outputs/checkpoint-1000/chat_template.jinja b/outputs/checkpoint-1000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316 --- /dev/null +++ b/outputs/checkpoint-1000/chat_template.jinja @@ -0,0 +1,315 @@ +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". 
+ #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if 
variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties -%} + {{- "(_: " }} + {{- "{\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {{- "// " + param_spec.description + "\n" }} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" 
}} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- "\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the 
most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. 
Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}} + {%- else %} + {{- model_identity }} + {%- endif %} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools is defined %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools is defined -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." 
}} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {%- endif %} + {%- if tools -%} + {{- "\n\n" }} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {%- if "tool_calls" in message %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." 
+ tool_call.name + "<|channel|>commentary json<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif "thinking" in message and loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- set last_tool_call.name = none %} + {%- elif "thinking" in message %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- elif loop.last and not add_generation_prompt %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }} + {%- else %} + {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." 
+ last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- else -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} \ No newline at end of file diff --git a/outputs/checkpoint-1000/special_tokens_map.json b/outputs/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639 --- /dev/null +++ b/outputs/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/outputs/checkpoint-1000/tokenizer.json b/outputs/checkpoint-1000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/outputs/checkpoint-1000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/outputs/checkpoint-1000/tokenizer_config.json b/outputs/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45 --- /dev/null +++ b/outputs/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,185 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|reserved_200017|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/outputs/checkpoint-1000/trainer_state.json b/outputs/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..172192f0d56d38a9f55c5ff8477f045dc5515bdc --- /dev/null +++ b/outputs/checkpoint-1000/trainer_state.json @@ -0,0 +1,7034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3886136209074128, + "eval_steps": 500, + 
"global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 13.684800148010254, + "learning_rate": 0.0, + "loss": 2.3276, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 13.660787582397461, + "learning_rate": 4e-05, + "loss": 2.2792, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 13.35280704498291, + "learning_rate": 8e-05, + "loss": 2.4151, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 6.15027379989624, + "learning_rate": 0.00012, + "loss": 1.7812, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 1.3168226480484009, + "learning_rate": 0.00016, + "loss": 1.4536, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.9872580170631409, + "learning_rate": 0.0002, + "loss": 1.4171, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7496100664138794, + "learning_rate": 0.00019935064935064936, + "loss": 1.4168, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7376005053520203, + "learning_rate": 0.00019870129870129872, + "loss": 1.3659, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5281137824058533, + "learning_rate": 0.00019805194805194807, + "loss": 1.2566, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5485746264457703, + "learning_rate": 0.00019740259740259742, + "loss": 1.3761, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5506592392921448, + "learning_rate": 0.00019675324675324675, + "loss": 1.3327, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.49382686614990234, + "learning_rate": 0.00019610389610389613, + "loss": 1.3727, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.36203011870384216, + "learning_rate": 0.00019545454545454548, + "loss": 1.1515, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3528599739074707, + "learning_rate": 0.0001948051948051948, + "loss": 1.2636, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 
0.31244418025016785, + "learning_rate": 0.00019415584415584416, + "loss": 1.1873, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.3379523754119873, + "learning_rate": 0.00019350649350649354, + "loss": 1.2657, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3025083839893341, + "learning_rate": 0.00019285714285714286, + "loss": 1.2846, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.2560190260410309, + "learning_rate": 0.00019220779220779222, + "loss": 1.1587, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.2554129958152771, + "learning_rate": 0.00019155844155844157, + "loss": 1.2812, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.22662702202796936, + "learning_rate": 0.00019090909090909092, + "loss": 1.1664, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.2515714168548584, + "learning_rate": 0.00019025974025974027, + "loss": 1.2177, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.24396637082099915, + "learning_rate": 0.00018961038961038963, + "loss": 1.2053, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.24488303065299988, + "learning_rate": 0.00018896103896103895, + "loss": 1.2074, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.2168620079755783, + "learning_rate": 0.00018831168831168833, + "loss": 1.1284, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.24021224677562714, + "learning_rate": 0.00018766233766233769, + "loss": 1.2169, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.20057056844234467, + "learning_rate": 0.000187012987012987, + "loss": 1.1031, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.19900795817375183, + "learning_rate": 0.00018636363636363636, + "loss": 1.1004, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.2019268423318863, + "learning_rate": 0.00018571428571428572, + "loss": 1.1476, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.1996479034423828, + "learning_rate": 0.00018506493506493507, + "loss": 
1.1455, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.25262022018432617, + "learning_rate": 0.00018441558441558442, + "loss": 1.1025, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.225438192486763, + "learning_rate": 0.00018376623376623378, + "loss": 1.1954, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.17834505438804626, + "learning_rate": 0.00018311688311688313, + "loss": 1.0934, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.20071206986904144, + "learning_rate": 0.00018246753246753248, + "loss": 1.0488, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.1920139640569687, + "learning_rate": 0.00018181818181818183, + "loss": 1.123, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.18714852631092072, + "learning_rate": 0.0001811688311688312, + "loss": 1.0798, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.18315713107585907, + "learning_rate": 0.00018051948051948054, + "loss": 1.1107, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.19156870245933533, + "learning_rate": 0.00017987012987012987, + "loss": 1.1125, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.21527768671512604, + "learning_rate": 0.00017922077922077922, + "loss": 1.1346, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.1871163249015808, + "learning_rate": 0.0001785714285714286, + "loss": 1.0742, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.17750784754753113, + "learning_rate": 0.00017792207792207792, + "loss": 1.1323, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.177419051527977, + "learning_rate": 0.00017727272727272728, + "loss": 1.1405, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.16714292764663696, + "learning_rate": 0.00017662337662337663, + "loss": 1.1084, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.1610356718301773, + "learning_rate": 0.00017597402597402598, + "loss": 1.1125, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 
0.2548656761646271, + "learning_rate": 0.00017532467532467534, + "loss": 1.1114, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.1731044203042984, + "learning_rate": 0.0001746753246753247, + "loss": 1.1197, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.1739533394575119, + "learning_rate": 0.00017402597402597401, + "loss": 1.1777, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.2178352177143097, + "learning_rate": 0.0001733766233766234, + "loss": 1.1111, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.17247150838375092, + "learning_rate": 0.00017272727272727275, + "loss": 1.1253, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.18075324594974518, + "learning_rate": 0.00017207792207792207, + "loss": 1.1358, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.15898071229457855, + "learning_rate": 0.00017142857142857143, + "loss": 1.0606, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.16518613696098328, + "learning_rate": 0.0001707792207792208, + "loss": 1.0944, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.16035063564777374, + "learning_rate": 0.00017012987012987013, + "loss": 1.0554, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.1686483472585678, + "learning_rate": 0.00016948051948051948, + "loss": 1.0384, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.16575631499290466, + "learning_rate": 0.00016883116883116884, + "loss": 1.0243, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.16840039193630219, + "learning_rate": 0.0001681818181818182, + "loss": 1.117, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.17616064846515656, + "learning_rate": 0.00016753246753246754, + "loss": 1.0743, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.168218195438385, + "learning_rate": 0.0001668831168831169, + "loss": 1.0627, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.17026656866073608, + "learning_rate": 0.00016623376623376625, + "loss": 1.0059, + 
"step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.16454458236694336, + "learning_rate": 0.0001655844155844156, + "loss": 0.9943, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.17185136675834656, + "learning_rate": 0.00016493506493506495, + "loss": 1.1545, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.17822986841201782, + "learning_rate": 0.00016428571428571428, + "loss": 1.073, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.1676608771085739, + "learning_rate": 0.00016363636363636366, + "loss": 1.0886, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.1727771908044815, + "learning_rate": 0.000162987012987013, + "loss": 1.0432, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.17827573418617249, + "learning_rate": 0.00016233766233766234, + "loss": 1.083, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.19807517528533936, + "learning_rate": 0.0001616883116883117, + "loss": 1.1208, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.17693684995174408, + "learning_rate": 0.00016103896103896104, + "loss": 1.089, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.15489234030246735, + "learning_rate": 0.0001603896103896104, + "loss": 0.9707, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.16443990170955658, + "learning_rate": 0.00015974025974025975, + "loss": 1.0643, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.2051103413105011, + "learning_rate": 0.0001590909090909091, + "loss": 1.1246, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.18824075162410736, + "learning_rate": 0.00015844155844155845, + "loss": 1.0855, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.18659448623657227, + "learning_rate": 0.0001577922077922078, + "loss": 1.1412, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.1854114979505539, + "learning_rate": 0.00015714285714285716, + "loss": 1.0249, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.1876193732023239, + 
"learning_rate": 0.00015649350649350649, + "loss": 1.1029, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.1888684630393982, + "learning_rate": 0.00015584415584415587, + "loss": 1.0789, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.20240606367588043, + "learning_rate": 0.0001551948051948052, + "loss": 1.0495, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.232120081782341, + "learning_rate": 0.00015454545454545454, + "loss": 1.0735, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.16897843778133392, + "learning_rate": 0.0001538961038961039, + "loss": 1.0164, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.18796634674072266, + "learning_rate": 0.00015324675324675325, + "loss": 1.0676, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.19574032723903656, + "learning_rate": 0.0001525974025974026, + "loss": 1.0456, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.18007811903953552, + "learning_rate": 0.00015194805194805196, + "loss": 1.0894, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.18932929635047913, + "learning_rate": 0.0001512987012987013, + "loss": 1.0729, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.20614288747310638, + "learning_rate": 0.00015064935064935066, + "loss": 1.0854, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.19291089475154877, + "learning_rate": 0.00015000000000000001, + "loss": 1.1217, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.18916529417037964, + "learning_rate": 0.00014935064935064934, + "loss": 1.0963, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.20306220650672913, + "learning_rate": 0.00014870129870129872, + "loss": 1.0898, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.17870067059993744, + "learning_rate": 0.00014805194805194807, + "loss": 1.0213, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.18411923944950104, + "learning_rate": 0.0001474025974025974, + "loss": 1.0844, + "step": 87 + }, + 
{ + "epoch": 0.2816, + "grad_norm": 0.18788227438926697, + "learning_rate": 0.00014675324675324675, + "loss": 1.0338, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.23874884843826294, + "learning_rate": 0.00014610389610389613, + "loss": 1.1118, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.19380499422550201, + "learning_rate": 0.00014545454545454546, + "loss": 1.0464, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.18968750536441803, + "learning_rate": 0.0001448051948051948, + "loss": 1.0569, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.19545753300189972, + "learning_rate": 0.00014415584415584416, + "loss": 1.1225, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.19170494377613068, + "learning_rate": 0.00014350649350649352, + "loss": 1.0602, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.17953918874263763, + "learning_rate": 0.00014285714285714287, + "loss": 1.032, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.1822536289691925, + "learning_rate": 0.00014220779220779222, + "loss": 1.0559, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.18591298162937164, + "learning_rate": 0.00014155844155844155, + "loss": 1.031, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.2129002958536148, + "learning_rate": 0.00014090909090909093, + "loss": 1.1391, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.18386681377887726, + "learning_rate": 0.00014025974025974028, + "loss": 0.9919, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.18314239382743835, + "learning_rate": 0.0001396103896103896, + "loss": 1.0445, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.1999066174030304, + "learning_rate": 0.00013896103896103896, + "loss": 1.0538, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.18741188943386078, + "learning_rate": 0.00013831168831168834, + "loss": 1.0722, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.19351010024547577, + "learning_rate": 
0.00013766233766233766, + "loss": 1.0491, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.18859203159809113, + "learning_rate": 0.00013701298701298702, + "loss": 1.0593, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.1962767392396927, + "learning_rate": 0.00013636363636363637, + "loss": 1.1344, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.20819440484046936, + "learning_rate": 0.00013571428571428572, + "loss": 1.1137, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.19590184092521667, + "learning_rate": 0.00013506493506493507, + "loss": 1.0624, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.18631424009799957, + "learning_rate": 0.00013441558441558443, + "loss": 1.0587, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.19572143256664276, + "learning_rate": 0.00013376623376623375, + "loss": 1.0494, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.1910988837480545, + "learning_rate": 0.00013311688311688313, + "loss": 1.0481, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.19455869495868683, + "learning_rate": 0.00013246753246753249, + "loss": 1.029, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.18669827282428741, + "learning_rate": 0.0001318181818181818, + "loss": 1.0513, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.17523664236068726, + "learning_rate": 0.0001311688311688312, + "loss": 1.0126, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.17929129302501678, + "learning_rate": 0.00013051948051948052, + "loss": 1.0717, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.19380168616771698, + "learning_rate": 0.00012987012987012987, + "loss": 1.0324, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.18090228736400604, + "learning_rate": 0.00012922077922077922, + "loss": 1.0515, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.2067340910434723, + "learning_rate": 0.00012857142857142858, + "loss": 1.0939, + "step": 116 + }, + 
{ + "epoch": 0.3744, + "grad_norm": 0.1880485862493515, + "learning_rate": 0.00012792207792207793, + "loss": 1.0986, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.182168647646904, + "learning_rate": 0.00012727272727272728, + "loss": 1.0109, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.20187129080295563, + "learning_rate": 0.00012662337662337663, + "loss": 1.0668, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.2082669734954834, + "learning_rate": 0.000125974025974026, + "loss": 1.054, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.18294434249401093, + "learning_rate": 0.00012532467532467534, + "loss": 1.0397, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.20515067875385284, + "learning_rate": 0.00012467532467532467, + "loss": 1.1092, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.1758790761232376, + "learning_rate": 0.00012402597402597402, + "loss": 0.9755, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.2170792669057846, + "learning_rate": 0.0001233766233766234, + "loss": 1.0434, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.202157124876976, + "learning_rate": 0.00012272727272727272, + "loss": 1.1129, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.18556398153305054, + "learning_rate": 0.00012207792207792208, + "loss": 1.0665, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.20196087658405304, + "learning_rate": 0.00012142857142857143, + "loss": 1.1, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.1921566128730774, + "learning_rate": 0.0001207792207792208, + "loss": 1.0918, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.18866224586963654, + "learning_rate": 0.00012012987012987014, + "loss": 1.0014, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.207601398229599, + "learning_rate": 0.00011948051948051949, + "loss": 1.0726, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.21592366695404053, + "learning_rate": 
0.00011883116883116883, + "loss": 1.1379, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2016124576330185, + "learning_rate": 0.0001181818181818182, + "loss": 1.1428, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.20478437840938568, + "learning_rate": 0.00011753246753246753, + "loss": 1.121, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.22730594873428345, + "learning_rate": 0.00011688311688311689, + "loss": 1.0319, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.22592711448669434, + "learning_rate": 0.00011623376623376625, + "loss": 1.1264, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.20035041868686676, + "learning_rate": 0.00011558441558441559, + "loss": 1.0686, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.20648567378520966, + "learning_rate": 0.00011493506493506494, + "loss": 1.0817, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.21222743391990662, + "learning_rate": 0.00011428571428571428, + "loss": 1.0678, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.2075391560792923, + "learning_rate": 0.00011363636363636365, + "loss": 1.0897, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.1964101791381836, + "learning_rate": 0.000112987012987013, + "loss": 1.0906, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.22406511008739471, + "learning_rate": 0.00011233766233766234, + "loss": 1.0594, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.23787978291511536, + "learning_rate": 0.00011168831168831168, + "loss": 1.1053, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.21196185052394867, + "learning_rate": 0.00011103896103896105, + "loss": 1.0923, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.21042804419994354, + "learning_rate": 0.0001103896103896104, + "loss": 1.0381, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.2267436534166336, + "learning_rate": 0.00010974025974025974, + "loss": 1.0818, + "step": 145 + }, + { 
+ "epoch": 0.4672, + "grad_norm": 0.23742735385894775, + "learning_rate": 0.00010909090909090909, + "loss": 1.0872, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.17787213623523712, + "learning_rate": 0.00010844155844155846, + "loss": 1.03, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.22422832250595093, + "learning_rate": 0.0001077922077922078, + "loss": 1.0738, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.22946301102638245, + "learning_rate": 0.00010714285714285715, + "loss": 1.0274, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.2137996405363083, + "learning_rate": 0.00010649350649350649, + "loss": 1.0539, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.1748756766319275, + "learning_rate": 0.00010584415584415586, + "loss": 1.0355, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.22275175154209137, + "learning_rate": 0.0001051948051948052, + "loss": 1.1696, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.20996077358722687, + "learning_rate": 0.00010454545454545455, + "loss": 1.0303, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.1945938766002655, + "learning_rate": 0.00010389610389610389, + "loss": 0.9747, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.1970377266407013, + "learning_rate": 0.00010324675324675325, + "loss": 1.0358, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.18814732134342194, + "learning_rate": 0.00010259740259740261, + "loss": 0.9612, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.2153233289718628, + "learning_rate": 0.00010194805194805195, + "loss": 1.0749, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.21788008511066437, + "learning_rate": 0.0001012987012987013, + "loss": 1.0883, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.214650496840477, + "learning_rate": 0.00010064935064935067, + "loss": 1.0539, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.19312834739685059, + 
"learning_rate": 0.0001, + "loss": 1.0657, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.19916598498821259, + "learning_rate": 9.935064935064936e-05, + "loss": 1.0478, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2057606726884842, + "learning_rate": 9.870129870129871e-05, + "loss": 1.0094, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.22159607708454132, + "learning_rate": 9.805194805194806e-05, + "loss": 1.0952, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.18274275958538055, + "learning_rate": 9.74025974025974e-05, + "loss": 1.0065, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.19835162162780762, + "learning_rate": 9.675324675324677e-05, + "loss": 1.0742, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2114904820919037, + "learning_rate": 9.610389610389611e-05, + "loss": 1.1109, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.21488523483276367, + "learning_rate": 9.545454545454546e-05, + "loss": 1.0465, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.19870303571224213, + "learning_rate": 9.480519480519481e-05, + "loss": 1.0318, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.20413029193878174, + "learning_rate": 9.415584415584417e-05, + "loss": 1.0817, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.1847231239080429, + "learning_rate": 9.35064935064935e-05, + "loss": 1.0144, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.2715964913368225, + "learning_rate": 9.285714285714286e-05, + "loss": 0.9832, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.2225002497434616, + "learning_rate": 9.220779220779221e-05, + "loss": 1.1051, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.22931510210037231, + "learning_rate": 9.155844155844156e-05, + "loss": 1.1042, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.21848627924919128, + "learning_rate": 9.090909090909092e-05, + "loss": 1.1151, + "step": 174 + }, + { + "epoch": 
0.56, + "grad_norm": 0.19852259755134583, + "learning_rate": 9.025974025974027e-05, + "loss": 1.0889, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2080363780260086, + "learning_rate": 8.961038961038961e-05, + "loss": 1.0777, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.22391024231910706, + "learning_rate": 8.896103896103896e-05, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.21793846786022186, + "learning_rate": 8.831168831168831e-05, + "loss": 1.044, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.2009749859571457, + "learning_rate": 8.766233766233767e-05, + "loss": 1.0198, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.19432318210601807, + "learning_rate": 8.701298701298701e-05, + "loss": 1.075, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.18634547293186188, + "learning_rate": 8.636363636363637e-05, + "loss": 0.9964, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.1947103589773178, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0025, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.23098671436309814, + "learning_rate": 8.506493506493507e-05, + "loss": 1.0562, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.19686414301395416, + "learning_rate": 8.441558441558442e-05, + "loss": 1.0285, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.19852428138256073, + "learning_rate": 8.376623376623377e-05, + "loss": 1.0054, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.21483510732650757, + "learning_rate": 8.311688311688312e-05, + "loss": 1.108, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.23313644528388977, + "learning_rate": 8.246753246753248e-05, + "loss": 1.1383, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.21453145146369934, + "learning_rate": 8.181818181818183e-05, + "loss": 1.0911, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.20268195867538452, + "learning_rate": 
8.116883116883117e-05, + "loss": 1.0145, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.20576398074626923, + "learning_rate": 8.051948051948052e-05, + "loss": 1.0829, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.21732626855373383, + "learning_rate": 7.987012987012987e-05, + "loss": 1.0152, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.22046895325183868, + "learning_rate": 7.922077922077923e-05, + "loss": 1.1311, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.19727715849876404, + "learning_rate": 7.857142857142858e-05, + "loss": 1.0364, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.20861488580703735, + "learning_rate": 7.792207792207793e-05, + "loss": 1.0435, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.18545083701610565, + "learning_rate": 7.727272727272727e-05, + "loss": 1.0299, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.19965052604675293, + "learning_rate": 7.662337662337662e-05, + "loss": 1.0511, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.23673909902572632, + "learning_rate": 7.597402597402598e-05, + "loss": 1.081, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.17583179473876953, + "learning_rate": 7.532467532467533e-05, + "loss": 0.9808, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2129366099834442, + "learning_rate": 7.467532467532467e-05, + "loss": 1.0522, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.21679140627384186, + "learning_rate": 7.402597402597404e-05, + "loss": 1.0567, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.2032000720500946, + "learning_rate": 7.337662337662338e-05, + "loss": 1.0466, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.1887970268726349, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0329, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.21060192584991455, + "learning_rate": 7.207792207792208e-05, + "loss": 1.1021, + "step": 203 + }, + { + "epoch": 
0.6528, + "grad_norm": 0.21191425621509552, + "learning_rate": 7.142857142857143e-05, + "loss": 0.99, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.1995989829301834, + "learning_rate": 7.077922077922077e-05, + "loss": 1.0526, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.1849513053894043, + "learning_rate": 7.012987012987014e-05, + "loss": 0.9998, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.1948779672384262, + "learning_rate": 6.948051948051948e-05, + "loss": 1.075, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.20374052226543427, + "learning_rate": 6.883116883116883e-05, + "loss": 1.0933, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.2102465033531189, + "learning_rate": 6.818181818181818e-05, + "loss": 1.1123, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.21376173198223114, + "learning_rate": 6.753246753246754e-05, + "loss": 1.1233, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.20934203267097473, + "learning_rate": 6.688311688311688e-05, + "loss": 1.1374, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.18604128062725067, + "learning_rate": 6.623376623376624e-05, + "loss": 1.0213, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.19644233584403992, + "learning_rate": 6.55844155844156e-05, + "loss": 1.0046, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.18479463458061218, + "learning_rate": 6.493506493506494e-05, + "loss": 0.9792, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.1945149153470993, + "learning_rate": 6.428571428571429e-05, + "loss": 1.0584, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2070147544145584, + "learning_rate": 6.363636363636364e-05, + "loss": 1.071, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.19645985960960388, + "learning_rate": 6.2987012987013e-05, + "loss": 1.0721, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.1960117667913437, + "learning_rate": 
6.233766233766233e-05, + "loss": 1.071, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.20168261229991913, + "learning_rate": 6.16883116883117e-05, + "loss": 1.0808, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.21254412829875946, + "learning_rate": 6.103896103896104e-05, + "loss": 1.0287, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.21271063387393951, + "learning_rate": 6.03896103896104e-05, + "loss": 1.0605, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.2081408053636551, + "learning_rate": 5.9740259740259744e-05, + "loss": 1.091, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.21113798022270203, + "learning_rate": 5.90909090909091e-05, + "loss": 1.1323, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.20670844614505768, + "learning_rate": 5.844155844155844e-05, + "loss": 1.0955, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.2010120451450348, + "learning_rate": 5.7792207792207796e-05, + "loss": 1.1068, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.20379121601581573, + "learning_rate": 5.714285714285714e-05, + "loss": 1.0419, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.22799807786941528, + "learning_rate": 5.64935064935065e-05, + "loss": 1.0904, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.2005995213985443, + "learning_rate": 5.584415584415584e-05, + "loss": 1.078, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.20329605042934418, + "learning_rate": 5.51948051948052e-05, + "loss": 1.0245, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.19283504784107208, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.0367, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.20624355971813202, + "learning_rate": 5.38961038961039e-05, + "loss": 1.1046, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.21362991631031036, + "learning_rate": 5.3246753246753245e-05, + "loss": 1.1104, + "step": 232 + }, + { + "epoch": 
0.7456, + "grad_norm": 0.20447863638401031, + "learning_rate": 5.25974025974026e-05, + "loss": 1.0514, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.1974381059408188, + "learning_rate": 5.1948051948051944e-05, + "loss": 1.0048, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.21237170696258545, + "learning_rate": 5.1298701298701304e-05, + "loss": 1.1299, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.21224971115589142, + "learning_rate": 5.064935064935065e-05, + "loss": 1.05, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.19865018129348755, + "learning_rate": 5e-05, + "loss": 1.0665, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.19199275970458984, + "learning_rate": 4.9350649350649355e-05, + "loss": 0.9531, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.19573214650154114, + "learning_rate": 4.87012987012987e-05, + "loss": 1.0318, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.21338805556297302, + "learning_rate": 4.8051948051948054e-05, + "loss": 1.0343, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.2254691869020462, + "learning_rate": 4.740259740259741e-05, + "loss": 1.0472, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.18101665377616882, + "learning_rate": 4.675324675324675e-05, + "loss": 1.017, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.22090592980384827, + "learning_rate": 4.6103896103896106e-05, + "loss": 1.0389, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.20865507423877716, + "learning_rate": 4.545454545454546e-05, + "loss": 1.0369, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.21619610488414764, + "learning_rate": 4.4805194805194805e-05, + "loss": 1.109, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.21694771945476532, + "learning_rate": 4.415584415584416e-05, + "loss": 1.0525, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.2182662934064865, + "learning_rate": 4.3506493506493503e-05, + 
"loss": 1.0331, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.2026486098766327, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.027, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.19606547057628632, + "learning_rate": 4.220779220779221e-05, + "loss": 1.0242, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.22107470035552979, + "learning_rate": 4.155844155844156e-05, + "loss": 1.0924, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.19960008561611176, + "learning_rate": 4.0909090909090915e-05, + "loss": 1.0384, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.1945488154888153, + "learning_rate": 4.025974025974026e-05, + "loss": 1.0673, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.22067414224147797, + "learning_rate": 3.9610389610389614e-05, + "loss": 1.0426, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.19010980427265167, + "learning_rate": 3.8961038961038966e-05, + "loss": 1.0617, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.18781176209449768, + "learning_rate": 3.831168831168831e-05, + "loss": 1.0243, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.20388829708099365, + "learning_rate": 3.7662337662337665e-05, + "loss": 1.0476, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.19911155104637146, + "learning_rate": 3.701298701298702e-05, + "loss": 1.0324, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.19884039461612701, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.0242, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.19036105275154114, + "learning_rate": 3.571428571428572e-05, + "loss": 1.0323, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.20039844512939453, + "learning_rate": 3.506493506493507e-05, + "loss": 1.0749, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.1899934560060501, + "learning_rate": 3.4415584415584416e-05, + "loss": 1.0115, + "step": 261 + }, + { + "epoch": 0.8384, + 
"grad_norm": 0.20019090175628662, + "learning_rate": 3.376623376623377e-05, + "loss": 1.0782, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.2020583152770996, + "learning_rate": 3.311688311688312e-05, + "loss": 1.0687, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.21407337486743927, + "learning_rate": 3.246753246753247e-05, + "loss": 1.1015, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.1871640682220459, + "learning_rate": 3.181818181818182e-05, + "loss": 0.9637, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.21622811257839203, + "learning_rate": 3.1168831168831166e-05, + "loss": 1.1222, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.22504661977291107, + "learning_rate": 3.051948051948052e-05, + "loss": 1.132, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.19177629053592682, + "learning_rate": 2.9870129870129872e-05, + "loss": 1.0281, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.1970544159412384, + "learning_rate": 2.922077922077922e-05, + "loss": 1.0393, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.21554522216320038, + "learning_rate": 2.857142857142857e-05, + "loss": 1.074, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.21131229400634766, + "learning_rate": 2.792207792207792e-05, + "loss": 1.054, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.19816523790359497, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.0456, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.21075209975242615, + "learning_rate": 2.6623376623376623e-05, + "loss": 1.0758, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2296527624130249, + "learning_rate": 2.5974025974025972e-05, + "loss": 1.0917, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.19722610712051392, + "learning_rate": 2.5324675324675325e-05, + "loss": 1.0704, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.18721099197864532, + "learning_rate": 
2.4675324675324678e-05, + "loss": 0.9919, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.20244193077087402, + "learning_rate": 2.4025974025974027e-05, + "loss": 1.0368, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.19518914818763733, + "learning_rate": 2.3376623376623376e-05, + "loss": 1.0436, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.19650357961654663, + "learning_rate": 2.272727272727273e-05, + "loss": 1.0306, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.20320096611976624, + "learning_rate": 2.207792207792208e-05, + "loss": 1.0941, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.18296951055526733, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.9802, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.21357610821723938, + "learning_rate": 2.077922077922078e-05, + "loss": 1.0449, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.193921759724617, + "learning_rate": 2.012987012987013e-05, + "loss": 1.0116, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.1953902244567871, + "learning_rate": 1.9480519480519483e-05, + "loss": 1.0105, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.19440975785255432, + "learning_rate": 1.8831168831168833e-05, + "loss": 0.9952, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.21054105460643768, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0701, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.18844804167747498, + "learning_rate": 1.7532467532467535e-05, + "loss": 1.0146, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.2067311704158783, + "learning_rate": 1.6883116883116884e-05, + "loss": 1.0781, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.1941213756799698, + "learning_rate": 1.6233766233766234e-05, + "loss": 0.9814, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.22726193070411682, + "learning_rate": 1.5584415584415583e-05, + "loss": 1.1431, + "step": 290 + }, + { 
+ "epoch": 0.9312, + "grad_norm": 0.18025581538677216, + "learning_rate": 1.4935064935064936e-05, + "loss": 0.9649, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.21535000205039978, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.0441, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.20014546811580658, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.0166, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.22738787531852722, + "learning_rate": 1.2987012987012986e-05, + "loss": 1.0564, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.2020861804485321, + "learning_rate": 1.2337662337662339e-05, + "loss": 1.1241, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.19888809323310852, + "learning_rate": 1.1688311688311688e-05, + "loss": 1.1114, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.20912377536296844, + "learning_rate": 1.103896103896104e-05, + "loss": 1.0971, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.21206621825695038, + "learning_rate": 1.038961038961039e-05, + "loss": 1.0601, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.18667680025100708, + "learning_rate": 9.740259740259742e-06, + "loss": 1.0291, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.21125559508800507, + "learning_rate": 9.090909090909091e-06, + "loss": 1.0483, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.21776145696640015, + "learning_rate": 8.441558441558442e-06, + "loss": 0.9912, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.20144303143024445, + "learning_rate": 7.792207792207792e-06, + "loss": 1.0357, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.1984029859304428, + "learning_rate": 7.142857142857143e-06, + "loss": 1.0648, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.17972829937934875, + "learning_rate": 6.493506493506493e-06, + "loss": 1.0033, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.1818286031484604, + 
"learning_rate": 5.844155844155844e-06, + "loss": 0.997, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.19670912623405457, + "learning_rate": 5.194805194805195e-06, + "loss": 1.0256, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.20527283847332, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0348, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.19025909900665283, + "learning_rate": 3.896103896103896e-06, + "loss": 1.0682, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.19544818997383118, + "learning_rate": 3.2467532467532465e-06, + "loss": 0.9872, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.22112183272838593, + "learning_rate": 2.5974025974025976e-06, + "loss": 1.0661, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.23328153789043427, + "learning_rate": 1.948051948051948e-06, + "loss": 1.0691, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.20181375741958618, + "learning_rate": 1.2987012987012988e-06, + "loss": 0.9416, + "step": 312 + }, + { + "epoch": 1.0, + "grad_norm": 0.29312625527381897, + "learning_rate": 6.493506493506494e-07, + "loss": 1.1216, + "step": 313 + }, + { + "epoch": 0.12202467696492762, + "grad_norm": 0.2231415957212448, + "learning_rate": 0.0, + "loss": 1.0468, + "step": 314 + }, + { + "epoch": 0.12241329058583503, + "grad_norm": 0.22263288497924805, + "learning_rate": 0.00017594394706111328, + "loss": 1.0399, + "step": 315 + }, + { + "epoch": 0.12280190420674245, + "grad_norm": 0.22909891605377197, + "learning_rate": 0.00017586609575710393, + "loss": 1.1069, + "step": 316 + }, + { + "epoch": 0.12319051782764986, + "grad_norm": 0.23951445519924164, + "learning_rate": 0.0001757882444530946, + "loss": 1.1036, + "step": 317 + }, + { + "epoch": 0.12357913144855727, + "grad_norm": 0.2409268021583557, + "learning_rate": 0.00017571039314908526, + "loss": 1.1114, + "step": 318 + }, + { + "epoch": 0.12396774506946469, + "grad_norm": 0.23753899335861206, + 
"learning_rate": 0.00017563254184507592, + "loss": 1.1297, + "step": 319 + }, + { + "epoch": 0.12435635869037209, + "grad_norm": 0.2823902666568756, + "learning_rate": 0.00017555469054106657, + "loss": 1.1293, + "step": 320 + }, + { + "epoch": 0.12474497231127951, + "grad_norm": 0.24093545973300934, + "learning_rate": 0.00017547683923705722, + "loss": 1.0678, + "step": 321 + }, + { + "epoch": 0.12513358593218693, + "grad_norm": 0.22565563023090363, + "learning_rate": 0.0001753989879330479, + "loss": 1.1408, + "step": 322 + }, + { + "epoch": 0.12552219955309435, + "grad_norm": 0.22569572925567627, + "learning_rate": 0.00017532113662903855, + "loss": 1.0543, + "step": 323 + }, + { + "epoch": 0.12591081317400174, + "grad_norm": 0.24962866306304932, + "learning_rate": 0.0001752432853250292, + "loss": 1.0818, + "step": 324 + }, + { + "epoch": 0.12629942679490916, + "grad_norm": 0.22184576094150543, + "learning_rate": 0.00017516543402101986, + "loss": 1.0835, + "step": 325 + }, + { + "epoch": 0.12668804041581658, + "grad_norm": 0.2572194039821625, + "learning_rate": 0.0001750875827170105, + "loss": 1.0767, + "step": 326 + }, + { + "epoch": 0.127076654036724, + "grad_norm": 0.24131342768669128, + "learning_rate": 0.00017500973141300116, + "loss": 1.0981, + "step": 327 + }, + { + "epoch": 0.1274652676576314, + "grad_norm": 0.2386389970779419, + "learning_rate": 0.00017493188010899184, + "loss": 1.0828, + "step": 328 + }, + { + "epoch": 0.1278538812785388, + "grad_norm": 0.2654125690460205, + "learning_rate": 0.0001748540288049825, + "loss": 1.1266, + "step": 329 + }, + { + "epoch": 0.12824249489944622, + "grad_norm": 0.2925739884376526, + "learning_rate": 0.00017477617750097314, + "loss": 1.0983, + "step": 330 + }, + { + "epoch": 0.12863110852035364, + "grad_norm": 0.26589342951774597, + "learning_rate": 0.0001746983261969638, + "loss": 1.1029, + "step": 331 + }, + { + "epoch": 0.12901972214126106, + "grad_norm": 0.24565957486629486, + "learning_rate": 
0.00017462047489295445, + "loss": 1.0975, + "step": 332 + }, + { + "epoch": 0.12940833576216845, + "grad_norm": 0.2459682673215866, + "learning_rate": 0.00017454262358894513, + "loss": 1.0566, + "step": 333 + }, + { + "epoch": 0.12979694938307587, + "grad_norm": 0.23349183797836304, + "learning_rate": 0.00017446477228493578, + "loss": 1.0833, + "step": 334 + }, + { + "epoch": 0.1301855630039833, + "grad_norm": 0.26166337728500366, + "learning_rate": 0.00017438692098092643, + "loss": 1.1598, + "step": 335 + }, + { + "epoch": 0.1305741766248907, + "grad_norm": 0.24188168346881866, + "learning_rate": 0.00017430906967691708, + "loss": 1.0728, + "step": 336 + }, + { + "epoch": 0.13096279024579813, + "grad_norm": 0.22922398149967194, + "learning_rate": 0.00017423121837290773, + "loss": 1.0311, + "step": 337 + }, + { + "epoch": 0.13135140386670552, + "grad_norm": 0.2652754485607147, + "learning_rate": 0.00017415336706889841, + "loss": 1.1096, + "step": 338 + }, + { + "epoch": 0.13174001748761294, + "grad_norm": 0.2355881780385971, + "learning_rate": 0.00017407551576488907, + "loss": 1.0964, + "step": 339 + }, + { + "epoch": 0.13212863110852036, + "grad_norm": 0.244523823261261, + "learning_rate": 0.00017399766446087972, + "loss": 1.142, + "step": 340 + }, + { + "epoch": 0.13251724472942777, + "grad_norm": 0.24705976247787476, + "learning_rate": 0.00017391981315687037, + "loss": 1.0943, + "step": 341 + }, + { + "epoch": 0.13290585835033517, + "grad_norm": 0.22817552089691162, + "learning_rate": 0.00017384196185286102, + "loss": 1.0621, + "step": 342 + }, + { + "epoch": 0.13329447197124258, + "grad_norm": 0.22605225443840027, + "learning_rate": 0.0001737641105488517, + "loss": 1.0714, + "step": 343 + }, + { + "epoch": 0.13368308559215, + "grad_norm": 0.2584545314311981, + "learning_rate": 0.00017368625924484235, + "loss": 1.1367, + "step": 344 + }, + { + "epoch": 0.13407169921305742, + "grad_norm": 0.2248220443725586, + "learning_rate": 0.000173608407940833, + "loss": 
1.0872, + "step": 345 + }, + { + "epoch": 0.13446031283396484, + "grad_norm": 0.2141868770122528, + "learning_rate": 0.00017353055663682368, + "loss": 1.0572, + "step": 346 + }, + { + "epoch": 0.13484892645487223, + "grad_norm": 0.2615523934364319, + "learning_rate": 0.00017345270533281434, + "loss": 1.1048, + "step": 347 + }, + { + "epoch": 0.13523754007577965, + "grad_norm": 0.22990448772907257, + "learning_rate": 0.000173374854028805, + "loss": 1.0528, + "step": 348 + }, + { + "epoch": 0.13562615369668707, + "grad_norm": 0.2132262885570526, + "learning_rate": 0.00017329700272479564, + "loss": 1.0476, + "step": 349 + }, + { + "epoch": 0.1360147673175945, + "grad_norm": 0.2578272819519043, + "learning_rate": 0.00017321915142078632, + "loss": 1.0852, + "step": 350 + }, + { + "epoch": 0.1364033809385019, + "grad_norm": 0.22881457209587097, + "learning_rate": 0.00017314130011677697, + "loss": 1.1017, + "step": 351 + }, + { + "epoch": 0.1367919945594093, + "grad_norm": 0.21067696809768677, + "learning_rate": 0.00017306344881276762, + "loss": 1.0444, + "step": 352 + }, + { + "epoch": 0.13718060818031672, + "grad_norm": 0.2304215282201767, + "learning_rate": 0.0001729855975087583, + "loss": 1.0737, + "step": 353 + }, + { + "epoch": 0.13756922180122413, + "grad_norm": 0.2031925916671753, + "learning_rate": 0.00017290774620474895, + "loss": 1.0036, + "step": 354 + }, + { + "epoch": 0.13795783542213155, + "grad_norm": 0.27281051874160767, + "learning_rate": 0.0001728298949007396, + "loss": 1.148, + "step": 355 + }, + { + "epoch": 0.13834644904303897, + "grad_norm": 0.204191654920578, + "learning_rate": 0.00017275204359673026, + "loss": 0.9607, + "step": 356 + }, + { + "epoch": 0.13873506266394636, + "grad_norm": 0.221976637840271, + "learning_rate": 0.0001726741922927209, + "loss": 1.1068, + "step": 357 + }, + { + "epoch": 0.13912367628485378, + "grad_norm": 0.20831729471683502, + "learning_rate": 0.0001725963409887116, + "loss": 1.034, + "step": 358 + }, + { + "epoch": 
0.1395122899057612, + "grad_norm": 0.21639779210090637, + "learning_rate": 0.00017251848968470224, + "loss": 1.0613, + "step": 359 + }, + { + "epoch": 0.13990090352666862, + "grad_norm": 0.1959424465894699, + "learning_rate": 0.0001724406383806929, + "loss": 1.0506, + "step": 360 + }, + { + "epoch": 0.140289517147576, + "grad_norm": 0.2044398933649063, + "learning_rate": 0.00017236278707668355, + "loss": 1.0316, + "step": 361 + }, + { + "epoch": 0.14067813076848343, + "grad_norm": 0.21483004093170166, + "learning_rate": 0.0001722849357726742, + "loss": 1.0361, + "step": 362 + }, + { + "epoch": 0.14106674438939085, + "grad_norm": 0.237701416015625, + "learning_rate": 0.00017220708446866485, + "loss": 1.1264, + "step": 363 + }, + { + "epoch": 0.14145535801029827, + "grad_norm": 0.20750795304775238, + "learning_rate": 0.00017212923316465553, + "loss": 1.0523, + "step": 364 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 0.2252965271472931, + "learning_rate": 0.00017205138186064618, + "loss": 1.0764, + "step": 365 + }, + { + "epoch": 0.14223258525211308, + "grad_norm": 0.2033565789461136, + "learning_rate": 0.00017197353055663683, + "loss": 1.064, + "step": 366 + }, + { + "epoch": 0.1426211988730205, + "grad_norm": 0.21123190224170685, + "learning_rate": 0.00017189567925262749, + "loss": 1.0515, + "step": 367 + }, + { + "epoch": 0.1430098124939279, + "grad_norm": 0.20646221935749054, + "learning_rate": 0.00017181782794861814, + "loss": 1.0617, + "step": 368 + }, + { + "epoch": 0.14339842611483533, + "grad_norm": 0.2079589068889618, + "learning_rate": 0.00017173997664460882, + "loss": 1.0569, + "step": 369 + }, + { + "epoch": 0.14378703973574275, + "grad_norm": 0.216246098279953, + "learning_rate": 0.00017166212534059947, + "loss": 1.0986, + "step": 370 + }, + { + "epoch": 0.14417565335665014, + "grad_norm": 0.20711806416511536, + "learning_rate": 0.00017158427403659012, + "loss": 1.1342, + "step": 371 + }, + { + "epoch": 0.14456426697755756, + "grad_norm": 
0.235435351729393, + "learning_rate": 0.00017150642273258077, + "loss": 1.1082, + "step": 372 + }, + { + "epoch": 0.14495288059846498, + "grad_norm": 0.2273191511631012, + "learning_rate": 0.00017142857142857143, + "loss": 1.1064, + "step": 373 + }, + { + "epoch": 0.1453414942193724, + "grad_norm": 0.2075672745704651, + "learning_rate": 0.0001713507201245621, + "loss": 1.0536, + "step": 374 + }, + { + "epoch": 0.14573010784027982, + "grad_norm": 0.20764274895191193, + "learning_rate": 0.00017127286882055276, + "loss": 1.0673, + "step": 375 + }, + { + "epoch": 0.1461187214611872, + "grad_norm": 0.2441243678331375, + "learning_rate": 0.0001711950175165434, + "loss": 1.1271, + "step": 376 + }, + { + "epoch": 0.14650733508209463, + "grad_norm": 0.2383374124765396, + "learning_rate": 0.00017111716621253406, + "loss": 1.083, + "step": 377 + }, + { + "epoch": 0.14689594870300204, + "grad_norm": 0.2172410786151886, + "learning_rate": 0.0001710393149085247, + "loss": 1.0605, + "step": 378 + }, + { + "epoch": 0.14728456232390946, + "grad_norm": 0.22591541707515717, + "learning_rate": 0.0001709614636045154, + "loss": 1.0931, + "step": 379 + }, + { + "epoch": 0.14767317594481685, + "grad_norm": 0.23099495470523834, + "learning_rate": 0.00017088361230050604, + "loss": 1.1021, + "step": 380 + }, + { + "epoch": 0.14806178956572427, + "grad_norm": 0.21461094915866852, + "learning_rate": 0.0001708057609964967, + "loss": 1.0959, + "step": 381 + }, + { + "epoch": 0.1484504031866317, + "grad_norm": 0.21557241678237915, + "learning_rate": 0.00017072790969248735, + "loss": 1.0155, + "step": 382 + }, + { + "epoch": 0.1488390168075391, + "grad_norm": 0.234396293759346, + "learning_rate": 0.000170650058388478, + "loss": 1.1289, + "step": 383 + }, + { + "epoch": 0.14922763042844653, + "grad_norm": 0.22895503044128418, + "learning_rate": 0.00017057220708446868, + "loss": 0.9919, + "step": 384 + }, + { + "epoch": 0.14961624404935392, + "grad_norm": 0.2054683268070221, + "learning_rate": 
0.00017049435578045933, + "loss": 1.0607, + "step": 385 + }, + { + "epoch": 0.15000485767026134, + "grad_norm": 0.25569215416908264, + "learning_rate": 0.00017041650447644998, + "loss": 1.0517, + "step": 386 + }, + { + "epoch": 0.15039347129116876, + "grad_norm": 0.2222641259431839, + "learning_rate": 0.00017033865317244064, + "loss": 1.0404, + "step": 387 + }, + { + "epoch": 0.15078208491207618, + "grad_norm": 0.20501169562339783, + "learning_rate": 0.0001702608018684313, + "loss": 0.9897, + "step": 388 + }, + { + "epoch": 0.1511706985329836, + "grad_norm": 0.22080403566360474, + "learning_rate": 0.00017018295056442197, + "loss": 1.1013, + "step": 389 + }, + { + "epoch": 0.15155931215389098, + "grad_norm": 0.21218529343605042, + "learning_rate": 0.00017010509926041262, + "loss": 1.0541, + "step": 390 + }, + { + "epoch": 0.1519479257747984, + "grad_norm": 0.23064807057380676, + "learning_rate": 0.00017002724795640327, + "loss": 1.037, + "step": 391 + }, + { + "epoch": 0.15233653939570582, + "grad_norm": 0.21164493262767792, + "learning_rate": 0.00016994939665239392, + "loss": 1.0769, + "step": 392 + }, + { + "epoch": 0.15272515301661324, + "grad_norm": 0.22565549612045288, + "learning_rate": 0.00016987154534838457, + "loss": 1.0638, + "step": 393 + }, + { + "epoch": 0.15311376663752063, + "grad_norm": 0.22492647171020508, + "learning_rate": 0.00016979369404437525, + "loss": 1.063, + "step": 394 + }, + { + "epoch": 0.15350238025842805, + "grad_norm": 0.22335395216941833, + "learning_rate": 0.0001697158427403659, + "loss": 1.1032, + "step": 395 + }, + { + "epoch": 0.15389099387933547, + "grad_norm": 0.2164154201745987, + "learning_rate": 0.00016963799143635656, + "loss": 1.1275, + "step": 396 + }, + { + "epoch": 0.1542796075002429, + "grad_norm": 0.22547736763954163, + "learning_rate": 0.0001695601401323472, + "loss": 1.1324, + "step": 397 + }, + { + "epoch": 0.1546682211211503, + "grad_norm": 0.2028045952320099, + "learning_rate": 0.0001694822888283379, + "loss": 
1.0057, + "step": 398 + }, + { + "epoch": 0.1550568347420577, + "grad_norm": 0.20770573616027832, + "learning_rate": 0.00016940443752432854, + "loss": 1.0311, + "step": 399 + }, + { + "epoch": 0.15544544836296512, + "grad_norm": 0.2231476902961731, + "learning_rate": 0.0001693265862203192, + "loss": 1.0535, + "step": 400 + }, + { + "epoch": 0.15583406198387253, + "grad_norm": 0.21618099510669708, + "learning_rate": 0.00016924873491630987, + "loss": 1.0616, + "step": 401 + }, + { + "epoch": 0.15622267560477995, + "grad_norm": 0.24024419486522675, + "learning_rate": 0.00016917088361230052, + "loss": 1.1324, + "step": 402 + }, + { + "epoch": 0.15661128922568737, + "grad_norm": 0.2002171128988266, + "learning_rate": 0.00016909303230829118, + "loss": 1.015, + "step": 403 + }, + { + "epoch": 0.15699990284659476, + "grad_norm": 0.21771477162837982, + "learning_rate": 0.00016901518100428183, + "loss": 1.0817, + "step": 404 + }, + { + "epoch": 0.15738851646750218, + "grad_norm": 0.22052259743213654, + "learning_rate": 0.0001689373297002725, + "loss": 1.0836, + "step": 405 + }, + { + "epoch": 0.1577771300884096, + "grad_norm": 0.1964062750339508, + "learning_rate": 0.00016885947839626316, + "loss": 1.0505, + "step": 406 + }, + { + "epoch": 0.15816574370931702, + "grad_norm": 0.22714298963546753, + "learning_rate": 0.0001687816270922538, + "loss": 1.0702, + "step": 407 + }, + { + "epoch": 0.15855435733022444, + "grad_norm": 0.20647728443145752, + "learning_rate": 0.00016870377578824446, + "loss": 1.0349, + "step": 408 + }, + { + "epoch": 0.15894297095113183, + "grad_norm": 0.2355160117149353, + "learning_rate": 0.00016862592448423512, + "loss": 1.0305, + "step": 409 + }, + { + "epoch": 0.15933158457203925, + "grad_norm": 0.22890770435333252, + "learning_rate": 0.0001685480731802258, + "loss": 1.0854, + "step": 410 + }, + { + "epoch": 0.15972019819294667, + "grad_norm": 0.21947838366031647, + "learning_rate": 0.00016847022187621645, + "loss": 1.0948, + "step": 411 + }, + { + 
"epoch": 0.16010881181385409, + "grad_norm": 0.22334899008274078, + "learning_rate": 0.0001683923705722071, + "loss": 1.006, + "step": 412 + }, + { + "epoch": 0.16049742543476148, + "grad_norm": 0.22324936091899872, + "learning_rate": 0.00016831451926819775, + "loss": 1.0402, + "step": 413 + }, + { + "epoch": 0.1608860390556689, + "grad_norm": 0.21462097764015198, + "learning_rate": 0.0001682366679641884, + "loss": 1.077, + "step": 414 + }, + { + "epoch": 0.1612746526765763, + "grad_norm": 0.24567006528377533, + "learning_rate": 0.00016815881666017908, + "loss": 1.15, + "step": 415 + }, + { + "epoch": 0.16166326629748373, + "grad_norm": 0.26437243819236755, + "learning_rate": 0.00016808096535616973, + "loss": 1.1251, + "step": 416 + }, + { + "epoch": 0.16205187991839115, + "grad_norm": 0.2217959761619568, + "learning_rate": 0.00016800311405216039, + "loss": 1.1103, + "step": 417 + }, + { + "epoch": 0.16244049353929854, + "grad_norm": 0.24402475357055664, + "learning_rate": 0.00016792526274815104, + "loss": 1.0672, + "step": 418 + }, + { + "epoch": 0.16282910716020596, + "grad_norm": 0.21609526872634888, + "learning_rate": 0.0001678474114441417, + "loss": 1.0291, + "step": 419 + }, + { + "epoch": 0.16321772078111338, + "grad_norm": 0.20054642856121063, + "learning_rate": 0.00016776956014013237, + "loss": 1.0704, + "step": 420 + }, + { + "epoch": 0.1636063344020208, + "grad_norm": 0.22864869236946106, + "learning_rate": 0.00016769170883612302, + "loss": 1.0612, + "step": 421 + }, + { + "epoch": 0.16399494802292822, + "grad_norm": 0.22651974856853485, + "learning_rate": 0.00016761385753211367, + "loss": 1.0749, + "step": 422 + }, + { + "epoch": 0.1643835616438356, + "grad_norm": 0.21587328612804413, + "learning_rate": 0.00016753600622810433, + "loss": 1.0398, + "step": 423 + }, + { + "epoch": 0.16477217526474303, + "grad_norm": 0.1953774094581604, + "learning_rate": 0.00016745815492409498, + "loss": 1.0275, + "step": 424 + }, + { + "epoch": 0.16516078888565044, + 
"grad_norm": 0.21803410351276398, + "learning_rate": 0.00016738030362008566, + "loss": 1.1219, + "step": 425 + }, + { + "epoch": 0.16554940250655786, + "grad_norm": 0.2034682035446167, + "learning_rate": 0.0001673024523160763, + "loss": 1.0342, + "step": 426 + }, + { + "epoch": 0.16593801612746525, + "grad_norm": 0.20135951042175293, + "learning_rate": 0.00016722460101206696, + "loss": 0.9802, + "step": 427 + }, + { + "epoch": 0.16632662974837267, + "grad_norm": 0.23310376703739166, + "learning_rate": 0.0001671467497080576, + "loss": 1.0789, + "step": 428 + }, + { + "epoch": 0.1667152433692801, + "grad_norm": 0.21475404500961304, + "learning_rate": 0.00016706889840404827, + "loss": 1.0416, + "step": 429 + }, + { + "epoch": 0.1671038569901875, + "grad_norm": 0.21661072969436646, + "learning_rate": 0.00016699104710003894, + "loss": 1.0568, + "step": 430 + }, + { + "epoch": 0.16749247061109493, + "grad_norm": 0.20310629904270172, + "learning_rate": 0.0001669131957960296, + "loss": 0.9968, + "step": 431 + }, + { + "epoch": 0.16788108423200232, + "grad_norm": 0.2596947252750397, + "learning_rate": 0.00016683534449202025, + "loss": 1.0478, + "step": 432 + }, + { + "epoch": 0.16826969785290974, + "grad_norm": 0.22226987779140472, + "learning_rate": 0.0001667574931880109, + "loss": 1.0898, + "step": 433 + }, + { + "epoch": 0.16865831147381716, + "grad_norm": 0.22499911487102509, + "learning_rate": 0.00016667964188400155, + "loss": 1.07, + "step": 434 + }, + { + "epoch": 0.16904692509472458, + "grad_norm": 0.2717292308807373, + "learning_rate": 0.0001666017905799922, + "loss": 1.0562, + "step": 435 + }, + { + "epoch": 0.169435538715632, + "grad_norm": 0.22052323818206787, + "learning_rate": 0.00016652393927598288, + "loss": 1.0732, + "step": 436 + }, + { + "epoch": 0.16982415233653939, + "grad_norm": 0.21741728484630585, + "learning_rate": 0.00016644608797197354, + "loss": 1.0409, + "step": 437 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.20701193809509277, + 
"learning_rate": 0.0001663682366679642, + "loss": 1.0731, + "step": 438 + }, + { + "epoch": 0.17060137957835422, + "grad_norm": 0.22071130573749542, + "learning_rate": 0.00016629038536395484, + "loss": 1.0992, + "step": 439 + }, + { + "epoch": 0.17098999319926164, + "grad_norm": 0.20261412858963013, + "learning_rate": 0.0001662125340599455, + "loss": 1.0051, + "step": 440 + }, + { + "epoch": 0.17137860682016906, + "grad_norm": 0.2082947939634323, + "learning_rate": 0.00016613468275593617, + "loss": 1.0477, + "step": 441 + }, + { + "epoch": 0.17176722044107645, + "grad_norm": 0.22534717619419098, + "learning_rate": 0.00016605683145192682, + "loss": 1.041, + "step": 442 + }, + { + "epoch": 0.17215583406198387, + "grad_norm": 0.21547731757164001, + "learning_rate": 0.00016597898014791748, + "loss": 1.0528, + "step": 443 + }, + { + "epoch": 0.1725444476828913, + "grad_norm": 0.24141089618206024, + "learning_rate": 0.00016590112884390813, + "loss": 1.0928, + "step": 444 + }, + { + "epoch": 0.1729330613037987, + "grad_norm": 0.21910884976387024, + "learning_rate": 0.00016582327753989878, + "loss": 1.063, + "step": 445 + }, + { + "epoch": 0.1733216749247061, + "grad_norm": 0.21782316267490387, + "learning_rate": 0.00016574542623588946, + "loss": 1.0976, + "step": 446 + }, + { + "epoch": 0.17371028854561352, + "grad_norm": 0.21771778166294098, + "learning_rate": 0.0001656675749318801, + "loss": 1.0677, + "step": 447 + }, + { + "epoch": 0.17409890216652094, + "grad_norm": 0.22117659449577332, + "learning_rate": 0.00016558972362787076, + "loss": 1.0669, + "step": 448 + }, + { + "epoch": 0.17448751578742835, + "grad_norm": 0.21918092668056488, + "learning_rate": 0.00016551187232386141, + "loss": 1.0955, + "step": 449 + }, + { + "epoch": 0.17487612940833577, + "grad_norm": 0.22027818858623505, + "learning_rate": 0.0001654340210198521, + "loss": 1.0201, + "step": 450 + }, + { + "epoch": 0.17526474302924316, + "grad_norm": 0.2042885720729828, + "learning_rate": 
0.00016535616971584275, + "loss": 1.0881, + "step": 451 + }, + { + "epoch": 0.17565335665015058, + "grad_norm": 0.21788261830806732, + "learning_rate": 0.0001652783184118334, + "loss": 1.0918, + "step": 452 + }, + { + "epoch": 0.176041970271058, + "grad_norm": 0.23332571983337402, + "learning_rate": 0.00016520046710782408, + "loss": 1.091, + "step": 453 + }, + { + "epoch": 0.17643058389196542, + "grad_norm": 0.20204192399978638, + "learning_rate": 0.00016512261580381473, + "loss": 1.0366, + "step": 454 + }, + { + "epoch": 0.17681919751287284, + "grad_norm": 0.21761906147003174, + "learning_rate": 0.00016504476449980538, + "loss": 1.0131, + "step": 455 + }, + { + "epoch": 0.17720781113378023, + "grad_norm": 0.2152051478624344, + "learning_rate": 0.00016496691319579606, + "loss": 1.0868, + "step": 456 + }, + { + "epoch": 0.17759642475468765, + "grad_norm": 0.22776494920253754, + "learning_rate": 0.0001648890618917867, + "loss": 1.0807, + "step": 457 + }, + { + "epoch": 0.17798503837559507, + "grad_norm": 0.2171342968940735, + "learning_rate": 0.00016481121058777736, + "loss": 1.0537, + "step": 458 + }, + { + "epoch": 0.17837365199650249, + "grad_norm": 0.2046273946762085, + "learning_rate": 0.00016473335928376802, + "loss": 1.0097, + "step": 459 + }, + { + "epoch": 0.17876226561740988, + "grad_norm": 0.2047681361436844, + "learning_rate": 0.00016465550797975867, + "loss": 1.0204, + "step": 460 + }, + { + "epoch": 0.1791508792383173, + "grad_norm": 0.1876862645149231, + "learning_rate": 0.00016457765667574935, + "loss": 0.9383, + "step": 461 + }, + { + "epoch": 0.17953949285922471, + "grad_norm": 0.218430757522583, + "learning_rate": 0.00016449980537174, + "loss": 1.0721, + "step": 462 + }, + { + "epoch": 0.17992810648013213, + "grad_norm": 0.2245480865240097, + "learning_rate": 0.00016442195406773065, + "loss": 1.0859, + "step": 463 + }, + { + "epoch": 0.18031672010103955, + "grad_norm": 0.22577151656150818, + "learning_rate": 0.0001643441027637213, + "loss": 1.0825, 
+ "step": 464 + }, + { + "epoch": 0.18070533372194694, + "grad_norm": 0.20132745802402496, + "learning_rate": 0.00016426625145971196, + "loss": 1.0615, + "step": 465 + }, + { + "epoch": 0.18109394734285436, + "grad_norm": 0.2277505248785019, + "learning_rate": 0.00016418840015570263, + "loss": 1.0426, + "step": 466 + }, + { + "epoch": 0.18148256096376178, + "grad_norm": 0.22540105879306793, + "learning_rate": 0.0001641105488516933, + "loss": 1.0481, + "step": 467 + }, + { + "epoch": 0.1818711745846692, + "grad_norm": 0.20358088612556458, + "learning_rate": 0.00016403269754768394, + "loss": 1.0286, + "step": 468 + }, + { + "epoch": 0.18225978820557662, + "grad_norm": 0.22534145414829254, + "learning_rate": 0.0001639548462436746, + "loss": 1.1183, + "step": 469 + }, + { + "epoch": 0.182648401826484, + "grad_norm": 0.2188873142004013, + "learning_rate": 0.00016387699493966524, + "loss": 1.0439, + "step": 470 + }, + { + "epoch": 0.18303701544739143, + "grad_norm": 0.2128048539161682, + "learning_rate": 0.00016379914363565592, + "loss": 1.027, + "step": 471 + }, + { + "epoch": 0.18342562906829885, + "grad_norm": 0.2518141567707062, + "learning_rate": 0.00016372129233164657, + "loss": 1.0468, + "step": 472 + }, + { + "epoch": 0.18381424268920626, + "grad_norm": 0.2189142256975174, + "learning_rate": 0.00016364344102763723, + "loss": 1.0581, + "step": 473 + }, + { + "epoch": 0.18420285631011368, + "grad_norm": 0.31266725063323975, + "learning_rate": 0.00016356558972362788, + "loss": 1.0554, + "step": 474 + }, + { + "epoch": 0.18459146993102107, + "grad_norm": 0.21343916654586792, + "learning_rate": 0.00016348773841961853, + "loss": 1.0795, + "step": 475 + }, + { + "epoch": 0.1849800835519285, + "grad_norm": 0.22907280921936035, + "learning_rate": 0.00016340988711560918, + "loss": 1.0304, + "step": 476 + }, + { + "epoch": 0.1853686971728359, + "grad_norm": 0.2105257511138916, + "learning_rate": 0.00016333203581159986, + "loss": 1.0231, + "step": 477 + }, + { + "epoch": 
0.18575731079374333, + "grad_norm": 0.19537831842899323, + "learning_rate": 0.00016325418450759051, + "loss": 1.0103, + "step": 478 + }, + { + "epoch": 0.18614592441465072, + "grad_norm": 0.20522372424602509, + "learning_rate": 0.00016317633320358117, + "loss": 1.0196, + "step": 479 + }, + { + "epoch": 0.18653453803555814, + "grad_norm": 0.21646477282047272, + "learning_rate": 0.00016309848189957182, + "loss": 1.0579, + "step": 480 + }, + { + "epoch": 0.18692315165646556, + "grad_norm": 0.21077193319797516, + "learning_rate": 0.00016302063059556247, + "loss": 1.0638, + "step": 481 + }, + { + "epoch": 0.18731176527737298, + "grad_norm": 0.20357473194599152, + "learning_rate": 0.00016294277929155315, + "loss": 1.0635, + "step": 482 + }, + { + "epoch": 0.1877003788982804, + "grad_norm": 0.2188001275062561, + "learning_rate": 0.0001628649279875438, + "loss": 1.0267, + "step": 483 + }, + { + "epoch": 0.1880889925191878, + "grad_norm": 0.2128928154706955, + "learning_rate": 0.00016278707668353445, + "loss": 0.9706, + "step": 484 + }, + { + "epoch": 0.1884776061400952, + "grad_norm": 0.22081372141838074, + "learning_rate": 0.0001627092253795251, + "loss": 1.08, + "step": 485 + }, + { + "epoch": 0.18886621976100262, + "grad_norm": 0.2250615805387497, + "learning_rate": 0.00016263137407551576, + "loss": 1.1451, + "step": 486 + }, + { + "epoch": 0.18925483338191004, + "grad_norm": 0.1984967589378357, + "learning_rate": 0.00016255352277150644, + "loss": 1.0744, + "step": 487 + }, + { + "epoch": 0.18964344700281746, + "grad_norm": 0.20778900384902954, + "learning_rate": 0.0001624756714674971, + "loss": 1.0623, + "step": 488 + }, + { + "epoch": 0.19003206062372485, + "grad_norm": 0.2026563137769699, + "learning_rate": 0.00016239782016348774, + "loss": 1.0714, + "step": 489 + }, + { + "epoch": 0.19042067424463227, + "grad_norm": 0.21598374843597412, + "learning_rate": 0.0001623199688594784, + "loss": 1.0869, + "step": 490 + }, + { + "epoch": 0.1908092878655397, + "grad_norm": 
0.18944978713989258, + "learning_rate": 0.00016224211755546904, + "loss": 1.055, + "step": 491 + }, + { + "epoch": 0.1911979014864471, + "grad_norm": 0.20698946714401245, + "learning_rate": 0.00016216426625145972, + "loss": 1.0392, + "step": 492 + }, + { + "epoch": 0.1915865151073545, + "grad_norm": 0.22395353019237518, + "learning_rate": 0.00016208641494745038, + "loss": 1.0681, + "step": 493 + }, + { + "epoch": 0.19197512872826192, + "grad_norm": 0.22372962534427643, + "learning_rate": 0.00016200856364344103, + "loss": 1.0767, + "step": 494 + }, + { + "epoch": 0.19236374234916934, + "grad_norm": 0.2066701054573059, + "learning_rate": 0.00016193071233943168, + "loss": 1.0061, + "step": 495 + }, + { + "epoch": 0.19275235597007676, + "grad_norm": 0.19716408848762512, + "learning_rate": 0.00016185286103542233, + "loss": 1.039, + "step": 496 + }, + { + "epoch": 0.19314096959098417, + "grad_norm": 0.22159601747989655, + "learning_rate": 0.000161775009731413, + "loss": 1.0832, + "step": 497 + }, + { + "epoch": 0.19352958321189156, + "grad_norm": 0.21509626507759094, + "learning_rate": 0.00016169715842740366, + "loss": 1.0264, + "step": 498 + }, + { + "epoch": 0.19391819683279898, + "grad_norm": 0.21598199009895325, + "learning_rate": 0.00016161930712339431, + "loss": 1.049, + "step": 499 + }, + { + "epoch": 0.1943068104537064, + "grad_norm": 0.20279590785503387, + "learning_rate": 0.00016154145581938497, + "loss": 1.0505, + "step": 500 + }, + { + "epoch": 0.19469542407461382, + "grad_norm": 0.21796855330467224, + "learning_rate": 0.00016146360451537565, + "loss": 1.0885, + "step": 501 + }, + { + "epoch": 0.19508403769552124, + "grad_norm": 0.22128933668136597, + "learning_rate": 0.0001613857532113663, + "loss": 1.0903, + "step": 502 + }, + { + "epoch": 0.19547265131642863, + "grad_norm": 0.2032536417245865, + "learning_rate": 0.00016130790190735695, + "loss": 1.0285, + "step": 503 + }, + { + "epoch": 0.19586126493733605, + "grad_norm": 0.23738974332809448, + 
"learning_rate": 0.0001612300506033476, + "loss": 1.1188, + "step": 504 + }, + { + "epoch": 0.19624987855824347, + "grad_norm": 0.19614790380001068, + "learning_rate": 0.00016115219929933828, + "loss": 1.04, + "step": 505 + }, + { + "epoch": 0.1966384921791509, + "grad_norm": 0.2198178917169571, + "learning_rate": 0.00016107434799532893, + "loss": 1.0696, + "step": 506 + }, + { + "epoch": 0.1970271058000583, + "grad_norm": 0.18814648687839508, + "learning_rate": 0.00016099649669131959, + "loss": 1.0203, + "step": 507 + }, + { + "epoch": 0.1974157194209657, + "grad_norm": 0.20699037611484528, + "learning_rate": 0.00016091864538731026, + "loss": 1.1074, + "step": 508 + }, + { + "epoch": 0.19780433304187311, + "grad_norm": 0.21490445733070374, + "learning_rate": 0.00016084079408330092, + "loss": 1.0682, + "step": 509 + }, + { + "epoch": 0.19819294666278053, + "grad_norm": 0.2363848090171814, + "learning_rate": 0.00016076294277929157, + "loss": 1.0408, + "step": 510 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 0.20186659693717957, + "learning_rate": 0.00016068509147528222, + "loss": 1.026, + "step": 511 + }, + { + "epoch": 0.19897017390459534, + "grad_norm": 0.21564024686813354, + "learning_rate": 0.00016060724017127287, + "loss": 1.0418, + "step": 512 + }, + { + "epoch": 0.19935878752550276, + "grad_norm": 0.19151560962200165, + "learning_rate": 0.00016052938886726355, + "loss": 1.0037, + "step": 513 + }, + { + "epoch": 0.19974740114641018, + "grad_norm": 0.21038194000720978, + "learning_rate": 0.0001604515375632542, + "loss": 1.0545, + "step": 514 + }, + { + "epoch": 0.2001360147673176, + "grad_norm": 0.20496582984924316, + "learning_rate": 0.00016037368625924486, + "loss": 1.0543, + "step": 515 + }, + { + "epoch": 0.20052462838822502, + "grad_norm": 0.20689113438129425, + "learning_rate": 0.0001602958349552355, + "loss": 1.0905, + "step": 516 + }, + { + "epoch": 0.2009132420091324, + "grad_norm": 0.2284041792154312, + "learning_rate": 
0.00016021798365122616, + "loss": 1.0717, + "step": 517 + }, + { + "epoch": 0.20130185563003983, + "grad_norm": 0.23457761108875275, + "learning_rate": 0.00016014013234721684, + "loss": 1.106, + "step": 518 + }, + { + "epoch": 0.20169046925094725, + "grad_norm": 0.2088528722524643, + "learning_rate": 0.0001600622810432075, + "loss": 1.0428, + "step": 519 + }, + { + "epoch": 0.20207908287185467, + "grad_norm": 0.2170068770647049, + "learning_rate": 0.00015998442973919814, + "loss": 0.9875, + "step": 520 + }, + { + "epoch": 0.20246769649276208, + "grad_norm": 0.2270561158657074, + "learning_rate": 0.0001599065784351888, + "loss": 1.0676, + "step": 521 + }, + { + "epoch": 0.20285631011366947, + "grad_norm": 0.2151324599981308, + "learning_rate": 0.00015982872713117945, + "loss": 1.0675, + "step": 522 + }, + { + "epoch": 0.2032449237345769, + "grad_norm": 0.23113249242305756, + "learning_rate": 0.00015975087582717013, + "loss": 1.0608, + "step": 523 + }, + { + "epoch": 0.2036335373554843, + "grad_norm": 0.2587106227874756, + "learning_rate": 0.00015967302452316078, + "loss": 1.0867, + "step": 524 + }, + { + "epoch": 0.20402215097639173, + "grad_norm": 0.21842992305755615, + "learning_rate": 0.00015959517321915143, + "loss": 1.0726, + "step": 525 + }, + { + "epoch": 0.20441076459729912, + "grad_norm": 0.20867805182933807, + "learning_rate": 0.00015951732191514208, + "loss": 1.0578, + "step": 526 + }, + { + "epoch": 0.20479937821820654, + "grad_norm": 0.2396962195634842, + "learning_rate": 0.00015943947061113273, + "loss": 1.0292, + "step": 527 + }, + { + "epoch": 0.20518799183911396, + "grad_norm": 0.221155047416687, + "learning_rate": 0.00015936161930712341, + "loss": 1.0019, + "step": 528 + }, + { + "epoch": 0.20557660546002138, + "grad_norm": 0.20032119750976562, + "learning_rate": 0.00015928376800311407, + "loss": 1.0435, + "step": 529 + }, + { + "epoch": 0.2059652190809288, + "grad_norm": 0.24095888435840607, + "learning_rate": 0.00015920591669910472, + "loss": 
1.0355, + "step": 530 + }, + { + "epoch": 0.2063538327018362, + "grad_norm": 0.2286604344844818, + "learning_rate": 0.00015912806539509537, + "loss": 0.9989, + "step": 531 + }, + { + "epoch": 0.2067424463227436, + "grad_norm": 0.21537137031555176, + "learning_rate": 0.00015905021409108602, + "loss": 1.0642, + "step": 532 + }, + { + "epoch": 0.20713105994365102, + "grad_norm": 0.22447925806045532, + "learning_rate": 0.0001589723627870767, + "loss": 1.1244, + "step": 533 + }, + { + "epoch": 0.20751967356455844, + "grad_norm": 0.21077273786067963, + "learning_rate": 0.00015889451148306735, + "loss": 1.0167, + "step": 534 + }, + { + "epoch": 0.20790828718546586, + "grad_norm": 0.22340558469295502, + "learning_rate": 0.000158816660179058, + "loss": 1.0991, + "step": 535 + }, + { + "epoch": 0.20829690080637325, + "grad_norm": 0.223599374294281, + "learning_rate": 0.00015873880887504866, + "loss": 1.086, + "step": 536 + }, + { + "epoch": 0.20868551442728067, + "grad_norm": 0.2615208923816681, + "learning_rate": 0.0001586609575710393, + "loss": 1.0584, + "step": 537 + }, + { + "epoch": 0.2090741280481881, + "grad_norm": 0.2085907757282257, + "learning_rate": 0.00015858310626703, + "loss": 1.0994, + "step": 538 + }, + { + "epoch": 0.2094627416690955, + "grad_norm": 0.2170211672782898, + "learning_rate": 0.00015850525496302064, + "loss": 1.1105, + "step": 539 + }, + { + "epoch": 0.20985135529000293, + "grad_norm": 0.21978625655174255, + "learning_rate": 0.0001584274036590113, + "loss": 1.002, + "step": 540 + }, + { + "epoch": 0.21023996891091032, + "grad_norm": 0.23684021830558777, + "learning_rate": 0.00015834955235500194, + "loss": 1.1216, + "step": 541 + }, + { + "epoch": 0.21062858253181774, + "grad_norm": 0.220269113779068, + "learning_rate": 0.0001582717010509926, + "loss": 1.0773, + "step": 542 + }, + { + "epoch": 0.21101719615272516, + "grad_norm": 0.22447973489761353, + "learning_rate": 0.00015819384974698328, + "loss": 1.0941, + "step": 543 + }, + { + "epoch": 
0.21140580977363257, + "grad_norm": 0.22435730695724487, + "learning_rate": 0.00015811599844297393, + "loss": 1.0138, + "step": 544 + }, + { + "epoch": 0.21179442339453997, + "grad_norm": 0.2230793684720993, + "learning_rate": 0.00015803814713896458, + "loss": 1.0343, + "step": 545 + }, + { + "epoch": 0.21218303701544738, + "grad_norm": 0.23491905629634857, + "learning_rate": 0.00015796029583495523, + "loss": 1.11, + "step": 546 + }, + { + "epoch": 0.2125716506363548, + "grad_norm": 0.213560551404953, + "learning_rate": 0.00015788244453094588, + "loss": 1.0615, + "step": 547 + }, + { + "epoch": 0.21296026425726222, + "grad_norm": 0.21392837166786194, + "learning_rate": 0.00015780459322693654, + "loss": 1.0872, + "step": 548 + }, + { + "epoch": 0.21334887787816964, + "grad_norm": 0.20007692277431488, + "learning_rate": 0.00015772674192292722, + "loss": 1.0394, + "step": 549 + }, + { + "epoch": 0.21373749149907703, + "grad_norm": 0.1969841718673706, + "learning_rate": 0.00015764889061891787, + "loss": 1.0381, + "step": 550 + }, + { + "epoch": 0.21412610511998445, + "grad_norm": 0.21874025464057922, + "learning_rate": 0.00015757103931490852, + "loss": 1.0822, + "step": 551 + }, + { + "epoch": 0.21451471874089187, + "grad_norm": 0.21824273467063904, + "learning_rate": 0.00015749318801089917, + "loss": 1.0802, + "step": 552 + }, + { + "epoch": 0.2149033323617993, + "grad_norm": 0.20942047238349915, + "learning_rate": 0.00015741533670688985, + "loss": 1.0634, + "step": 553 + }, + { + "epoch": 0.2152919459827067, + "grad_norm": 0.1940152943134308, + "learning_rate": 0.0001573374854028805, + "loss": 1.0264, + "step": 554 + }, + { + "epoch": 0.2156805596036141, + "grad_norm": 0.19859059154987335, + "learning_rate": 0.00015725963409887115, + "loss": 0.9701, + "step": 555 + }, + { + "epoch": 0.21606917322452152, + "grad_norm": 0.22239404916763306, + "learning_rate": 0.0001571817827948618, + "loss": 1.1282, + "step": 556 + }, + { + "epoch": 0.21645778684542893, + "grad_norm": 
0.23820599913597107, + "learning_rate": 0.00015710393149085249, + "loss": 1.1123, + "step": 557 + }, + { + "epoch": 0.21684640046633635, + "grad_norm": 0.21279917657375336, + "learning_rate": 0.00015702608018684314, + "loss": 1.0542, + "step": 558 + }, + { + "epoch": 0.21723501408724374, + "grad_norm": 0.2065514773130417, + "learning_rate": 0.0001569482288828338, + "loss": 1.0685, + "step": 559 + }, + { + "epoch": 0.21762362770815116, + "grad_norm": 0.20130831003189087, + "learning_rate": 0.00015687037757882447, + "loss": 0.9869, + "step": 560 + }, + { + "epoch": 0.21801224132905858, + "grad_norm": 0.2187541127204895, + "learning_rate": 0.00015679252627481512, + "loss": 1.1095, + "step": 561 + }, + { + "epoch": 0.218400854949966, + "grad_norm": 0.21028277277946472, + "learning_rate": 0.00015671467497080577, + "loss": 1.0804, + "step": 562 + }, + { + "epoch": 0.21878946857087342, + "grad_norm": 0.8187636733055115, + "learning_rate": 0.00015663682366679643, + "loss": 1.0782, + "step": 563 + }, + { + "epoch": 0.2191780821917808, + "grad_norm": 0.20059974491596222, + "learning_rate": 0.0001565589723627871, + "loss": 1.0279, + "step": 564 + }, + { + "epoch": 0.21956669581268823, + "grad_norm": 0.20440839231014252, + "learning_rate": 0.00015648112105877776, + "loss": 0.9863, + "step": 565 + }, + { + "epoch": 0.21995530943359565, + "grad_norm": 0.21423624455928802, + "learning_rate": 0.0001564032697547684, + "loss": 1.0685, + "step": 566 + }, + { + "epoch": 0.22034392305450307, + "grad_norm": 0.22430062294006348, + "learning_rate": 0.00015632541845075906, + "loss": 1.0761, + "step": 567 + }, + { + "epoch": 0.22073253667541048, + "grad_norm": 0.22782258689403534, + "learning_rate": 0.0001562475671467497, + "loss": 1.1024, + "step": 568 + }, + { + "epoch": 0.22112115029631788, + "grad_norm": 0.21150320768356323, + "learning_rate": 0.0001561697158427404, + "loss": 1.0621, + "step": 569 + }, + { + "epoch": 0.2215097639172253, + "grad_norm": 0.20342351496219635, + 
"learning_rate": 0.00015609186453873104, + "loss": 1.0667, + "step": 570 + }, + { + "epoch": 0.2218983775381327, + "grad_norm": 0.22866711020469666, + "learning_rate": 0.0001560140132347217, + "loss": 1.0631, + "step": 571 + }, + { + "epoch": 0.22228699115904013, + "grad_norm": 0.2200063169002533, + "learning_rate": 0.00015593616193071235, + "loss": 1.0448, + "step": 572 + }, + { + "epoch": 0.22267560477994755, + "grad_norm": 0.19440248608589172, + "learning_rate": 0.000155858310626703, + "loss": 1.037, + "step": 573 + }, + { + "epoch": 0.22306421840085494, + "grad_norm": 0.205752432346344, + "learning_rate": 0.00015578045932269368, + "loss": 1.0465, + "step": 574 + }, + { + "epoch": 0.22345283202176236, + "grad_norm": 0.22247998416423798, + "learning_rate": 0.00015570260801868433, + "loss": 0.997, + "step": 575 + }, + { + "epoch": 0.22384144564266978, + "grad_norm": 0.22199274599552155, + "learning_rate": 0.00015562475671467498, + "loss": 1.0178, + "step": 576 + }, + { + "epoch": 0.2242300592635772, + "grad_norm": 0.2114989310503006, + "learning_rate": 0.00015554690541066564, + "loss": 1.0457, + "step": 577 + }, + { + "epoch": 0.2246186728844846, + "grad_norm": 0.24248506128787994, + "learning_rate": 0.0001554690541066563, + "loss": 1.002, + "step": 578 + }, + { + "epoch": 0.225007286505392, + "grad_norm": 0.2565505802631378, + "learning_rate": 0.00015539120280264697, + "loss": 1.0541, + "step": 579 + }, + { + "epoch": 0.22539590012629943, + "grad_norm": 0.22799409925937653, + "learning_rate": 0.00015531335149863762, + "loss": 1.0788, + "step": 580 + }, + { + "epoch": 0.22578451374720684, + "grad_norm": 0.2196080982685089, + "learning_rate": 0.00015523550019462827, + "loss": 1.0877, + "step": 581 + }, + { + "epoch": 0.22617312736811426, + "grad_norm": 0.21992824971675873, + "learning_rate": 0.00015515764889061892, + "loss": 1.0213, + "step": 582 + }, + { + "epoch": 0.22656174098902165, + "grad_norm": 0.22793298959732056, + "learning_rate": 0.00015507979758660957, 
+ "loss": 1.0633, + "step": 583 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 0.21707972884178162, + "learning_rate": 0.00015500194628260023, + "loss": 1.081, + "step": 584 + }, + { + "epoch": 0.2273389682308365, + "grad_norm": 0.220685675740242, + "learning_rate": 0.0001549240949785909, + "loss": 1.0658, + "step": 585 + }, + { + "epoch": 0.2277275818517439, + "grad_norm": 0.22576668858528137, + "learning_rate": 0.00015484624367458156, + "loss": 1.0795, + "step": 586 + }, + { + "epoch": 0.22811619547265133, + "grad_norm": 0.21778982877731323, + "learning_rate": 0.0001547683923705722, + "loss": 1.033, + "step": 587 + }, + { + "epoch": 0.22850480909355872, + "grad_norm": 0.22748610377311707, + "learning_rate": 0.00015469054106656286, + "loss": 1.0948, + "step": 588 + }, + { + "epoch": 0.22889342271446614, + "grad_norm": 0.21561284363269806, + "learning_rate": 0.00015461268976255351, + "loss": 1.0022, + "step": 589 + }, + { + "epoch": 0.22928203633537356, + "grad_norm": 0.2419756054878235, + "learning_rate": 0.0001545348384585442, + "loss": 1.0786, + "step": 590 + }, + { + "epoch": 0.22967064995628098, + "grad_norm": 0.20479315519332886, + "learning_rate": 0.00015445698715453485, + "loss": 1.027, + "step": 591 + }, + { + "epoch": 0.2300592635771884, + "grad_norm": 0.21365883946418762, + "learning_rate": 0.0001543791358505255, + "loss": 1.0773, + "step": 592 + }, + { + "epoch": 0.23044787719809579, + "grad_norm": 0.23133166134357452, + "learning_rate": 0.00015430128454651615, + "loss": 1.0877, + "step": 593 + }, + { + "epoch": 0.2308364908190032, + "grad_norm": 0.2110515981912613, + "learning_rate": 0.0001542234332425068, + "loss": 1.0509, + "step": 594 + }, + { + "epoch": 0.23122510443991062, + "grad_norm": 0.20658442378044128, + "learning_rate": 0.00015414558193849748, + "loss": 1.0623, + "step": 595 + }, + { + "epoch": 0.23161371806081804, + "grad_norm": 0.21831996738910675, + "learning_rate": 0.00015406773063448813, + "loss": 1.021, + "step": 596 + }, + { 
+ "epoch": 0.23200233168172543, + "grad_norm": 0.23015642166137695, + "learning_rate": 0.00015398987933047878, + "loss": 1.0358, + "step": 597 + }, + { + "epoch": 0.23239094530263285, + "grad_norm": 0.23071645200252533, + "learning_rate": 0.00015391202802646944, + "loss": 1.1255, + "step": 598 + }, + { + "epoch": 0.23277955892354027, + "grad_norm": 0.19513486325740814, + "learning_rate": 0.0001538341767224601, + "loss": 1.0189, + "step": 599 + }, + { + "epoch": 0.2331681725444477, + "grad_norm": 0.20821452140808105, + "learning_rate": 0.00015375632541845077, + "loss": 1.0843, + "step": 600 + }, + { + "epoch": 0.2335567861653551, + "grad_norm": 0.20563223958015442, + "learning_rate": 0.00015367847411444142, + "loss": 1.0012, + "step": 601 + }, + { + "epoch": 0.2339453997862625, + "grad_norm": 0.22674202919006348, + "learning_rate": 0.00015360062281043207, + "loss": 1.0371, + "step": 602 + }, + { + "epoch": 0.23433401340716992, + "grad_norm": 0.20744135975837708, + "learning_rate": 0.00015352277150642272, + "loss": 1.0466, + "step": 603 + }, + { + "epoch": 0.23472262702807734, + "grad_norm": 0.22103577852249146, + "learning_rate": 0.00015344492020241338, + "loss": 1.0942, + "step": 604 + }, + { + "epoch": 0.23511124064898475, + "grad_norm": 0.20643098652362823, + "learning_rate": 0.00015336706889840406, + "loss": 1.0682, + "step": 605 + }, + { + "epoch": 0.23549985426989217, + "grad_norm": 0.23436777293682098, + "learning_rate": 0.0001532892175943947, + "loss": 1.0613, + "step": 606 + }, + { + "epoch": 0.23588846789079956, + "grad_norm": 0.21898899972438812, + "learning_rate": 0.00015321136629038536, + "loss": 1.0571, + "step": 607 + }, + { + "epoch": 0.23627708151170698, + "grad_norm": 0.20569247007369995, + "learning_rate": 0.00015313351498637604, + "loss": 1.061, + "step": 608 + }, + { + "epoch": 0.2366656951326144, + "grad_norm": 0.2099207490682602, + "learning_rate": 0.0001530556636823667, + "loss": 1.0776, + "step": 609 + }, + { + "epoch": 0.23705430875352182, 
+ "grad_norm": 0.20078738033771515, + "learning_rate": 0.00015297781237835734, + "loss": 1.0341, + "step": 610 + }, + { + "epoch": 0.2374429223744292, + "grad_norm": 0.20327065885066986, + "learning_rate": 0.000152899961074348, + "loss": 1.0168, + "step": 611 + }, + { + "epoch": 0.23783153599533663, + "grad_norm": 0.21741214394569397, + "learning_rate": 0.00015282210977033867, + "loss": 1.0726, + "step": 612 + }, + { + "epoch": 0.23822014961624405, + "grad_norm": 0.2065727263689041, + "learning_rate": 0.00015274425846632933, + "loss": 1.0474, + "step": 613 + }, + { + "epoch": 0.23860876323715147, + "grad_norm": 0.21241194009780884, + "learning_rate": 0.00015266640716231998, + "loss": 1.0666, + "step": 614 + }, + { + "epoch": 0.23899737685805889, + "grad_norm": 0.2194201797246933, + "learning_rate": 0.00015258855585831066, + "loss": 1.1411, + "step": 615 + }, + { + "epoch": 0.23938599047896628, + "grad_norm": 0.21537193655967712, + "learning_rate": 0.0001525107045543013, + "loss": 1.081, + "step": 616 + }, + { + "epoch": 0.2397746040998737, + "grad_norm": 0.21125951409339905, + "learning_rate": 0.00015243285325029196, + "loss": 1.0679, + "step": 617 + }, + { + "epoch": 0.2401632177207811, + "grad_norm": 0.21342721581459045, + "learning_rate": 0.0001523550019462826, + "loss": 1.0564, + "step": 618 + }, + { + "epoch": 0.24055183134168853, + "grad_norm": 0.2223503291606903, + "learning_rate": 0.00015227715064227327, + "loss": 1.1163, + "step": 619 + }, + { + "epoch": 0.24094044496259595, + "grad_norm": 0.21626527607440948, + "learning_rate": 0.00015219929933826394, + "loss": 1.0793, + "step": 620 + }, + { + "epoch": 0.24132905858350334, + "grad_norm": 0.21899500489234924, + "learning_rate": 0.0001521214480342546, + "loss": 1.0864, + "step": 621 + }, + { + "epoch": 0.24171767220441076, + "grad_norm": 0.2499915212392807, + "learning_rate": 0.00015204359673024525, + "loss": 1.1381, + "step": 622 + }, + { + "epoch": 0.24210628582531818, + "grad_norm": 0.2108345925807953, + 
"learning_rate": 0.0001519657454262359, + "loss": 1.0534, + "step": 623 + }, + { + "epoch": 0.2424948994462256, + "grad_norm": 0.2224910855293274, + "learning_rate": 0.00015188789412222655, + "loss": 1.0235, + "step": 624 + }, + { + "epoch": 0.24288351306713302, + "grad_norm": 0.22163094580173492, + "learning_rate": 0.0001518100428182172, + "loss": 1.0143, + "step": 625 + }, + { + "epoch": 0.2432721266880404, + "grad_norm": 0.20709283649921417, + "learning_rate": 0.00015173219151420788, + "loss": 1.0506, + "step": 626 + }, + { + "epoch": 0.24366074030894783, + "grad_norm": 0.2112802267074585, + "learning_rate": 0.00015165434021019854, + "loss": 1.0692, + "step": 627 + }, + { + "epoch": 0.24404935392985525, + "grad_norm": 0.23622830212116241, + "learning_rate": 0.0001515764889061892, + "loss": 1.0769, + "step": 628 + }, + { + "epoch": 0.24443796755076266, + "grad_norm": 0.23328271508216858, + "learning_rate": 0.00015149863760217984, + "loss": 1.1158, + "step": 629 + }, + { + "epoch": 0.24482658117167005, + "grad_norm": 0.2071760892868042, + "learning_rate": 0.0001514207862981705, + "loss": 1.0133, + "step": 630 + }, + { + "epoch": 0.24521519479257747, + "grad_norm": 0.21428920328617096, + "learning_rate": 0.00015134293499416117, + "loss": 1.0342, + "step": 631 + }, + { + "epoch": 0.2456038084134849, + "grad_norm": 0.22225375473499298, + "learning_rate": 0.00015126508369015182, + "loss": 1.1054, + "step": 632 + }, + { + "epoch": 0.2459924220343923, + "grad_norm": 0.2096671611070633, + "learning_rate": 0.00015118723238614248, + "loss": 1.0229, + "step": 633 + }, + { + "epoch": 0.24638103565529973, + "grad_norm": 0.21473252773284912, + "learning_rate": 0.00015110938108213313, + "loss": 1.0915, + "step": 634 + }, + { + "epoch": 0.24676964927620712, + "grad_norm": 0.2071562111377716, + "learning_rate": 0.00015103152977812378, + "loss": 1.047, + "step": 635 + }, + { + "epoch": 0.24715826289711454, + "grad_norm": 0.19868609309196472, + "learning_rate": 
0.00015095367847411446, + "loss": 1.0073, + "step": 636 + }, + { + "epoch": 0.24754687651802196, + "grad_norm": 0.20937366783618927, + "learning_rate": 0.0001508758271701051, + "loss": 1.0155, + "step": 637 + }, + { + "epoch": 0.24793549013892938, + "grad_norm": 0.19225911796092987, + "learning_rate": 0.00015079797586609576, + "loss": 1.0163, + "step": 638 + }, + { + "epoch": 0.2483241037598368, + "grad_norm": 0.20427283644676208, + "learning_rate": 0.00015072012456208641, + "loss": 1.062, + "step": 639 + }, + { + "epoch": 0.24871271738074419, + "grad_norm": 0.21640253067016602, + "learning_rate": 0.00015064227325807707, + "loss": 1.025, + "step": 640 + }, + { + "epoch": 0.2491013310016516, + "grad_norm": 0.20416739583015442, + "learning_rate": 0.00015056442195406775, + "loss": 1.0635, + "step": 641 + }, + { + "epoch": 0.24948994462255902, + "grad_norm": 0.1990521252155304, + "learning_rate": 0.0001504865706500584, + "loss": 1.0757, + "step": 642 + }, + { + "epoch": 0.24987855824346644, + "grad_norm": 0.21636444330215454, + "learning_rate": 0.00015040871934604905, + "loss": 1.0441, + "step": 643 + }, + { + "epoch": 0.25026717186437386, + "grad_norm": 0.21253719925880432, + "learning_rate": 0.0001503308680420397, + "loss": 1.0574, + "step": 644 + }, + { + "epoch": 0.2506557854852813, + "grad_norm": 0.2134159356355667, + "learning_rate": 0.00015025301673803035, + "loss": 1.0396, + "step": 645 + }, + { + "epoch": 0.2510443991061887, + "grad_norm": 0.2018527239561081, + "learning_rate": 0.00015017516543402103, + "loss": 1.0606, + "step": 646 + }, + { + "epoch": 0.25143301272709606, + "grad_norm": 0.20320741832256317, + "learning_rate": 0.00015009731413001169, + "loss": 1.0093, + "step": 647 + }, + { + "epoch": 0.2518216263480035, + "grad_norm": 0.21007056534290314, + "learning_rate": 0.00015001946282600234, + "loss": 1.0284, + "step": 648 + }, + { + "epoch": 0.2522102399689109, + "grad_norm": 0.22453372180461884, + "learning_rate": 0.000149941611521993, + "loss": 
1.0271, + "step": 649 + }, + { + "epoch": 0.2525988535898183, + "grad_norm": 0.19889335334300995, + "learning_rate": 0.00014986376021798364, + "loss": 1.0238, + "step": 650 + }, + { + "epoch": 0.25298746721072574, + "grad_norm": 0.19339965283870697, + "learning_rate": 0.00014978590891397432, + "loss": 1.024, + "step": 651 + }, + { + "epoch": 0.25337608083163315, + "grad_norm": 0.22362011671066284, + "learning_rate": 0.00014970805760996497, + "loss": 1.0722, + "step": 652 + }, + { + "epoch": 0.2537646944525406, + "grad_norm": 0.2110588103532791, + "learning_rate": 0.00014963020630595562, + "loss": 1.0541, + "step": 653 + }, + { + "epoch": 0.254153308073448, + "grad_norm": 0.203025683760643, + "learning_rate": 0.00014955235500194628, + "loss": 1.0335, + "step": 654 + }, + { + "epoch": 0.2545419216943554, + "grad_norm": 0.20884902775287628, + "learning_rate": 0.00014947450369793693, + "loss": 1.0507, + "step": 655 + }, + { + "epoch": 0.2549305353152628, + "grad_norm": 0.21234256029129028, + "learning_rate": 0.0001493966523939276, + "loss": 1.0372, + "step": 656 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.1984352171421051, + "learning_rate": 0.00014931880108991826, + "loss": 0.9979, + "step": 657 + }, + { + "epoch": 0.2557077625570776, + "grad_norm": 0.18848282098770142, + "learning_rate": 0.0001492409497859089, + "loss": 0.9973, + "step": 658 + }, + { + "epoch": 0.25609637617798503, + "grad_norm": 0.2201709896326065, + "learning_rate": 0.00014916309848189956, + "loss": 1.0386, + "step": 659 + }, + { + "epoch": 0.25648498979889245, + "grad_norm": 0.23094095289707184, + "learning_rate": 0.00014908524717789024, + "loss": 1.1205, + "step": 660 + }, + { + "epoch": 0.25687360341979987, + "grad_norm": 0.21087734401226044, + "learning_rate": 0.0001490073958738809, + "loss": 1.0231, + "step": 661 + }, + { + "epoch": 0.2572622170407073, + "grad_norm": 0.24970979988574982, + "learning_rate": 0.00014892954456987155, + "loss": 1.0421, + "step": 662 + }, + { + "epoch": 
0.2576508306616147, + "grad_norm": 0.22024711966514587, + "learning_rate": 0.00014885169326586223, + "loss": 1.1033, + "step": 663 + }, + { + "epoch": 0.2580394442825221, + "grad_norm": 0.2195248156785965, + "learning_rate": 0.00014877384196185288, + "loss": 1.089, + "step": 664 + }, + { + "epoch": 0.25842805790342954, + "grad_norm": 0.20236417651176453, + "learning_rate": 0.00014869599065784353, + "loss": 1.0196, + "step": 665 + }, + { + "epoch": 0.2588166715243369, + "grad_norm": 0.21973329782485962, + "learning_rate": 0.00014861813935383418, + "loss": 1.0844, + "step": 666 + }, + { + "epoch": 0.2592052851452443, + "grad_norm": 0.2069879174232483, + "learning_rate": 0.00014854028804982486, + "loss": 1.0312, + "step": 667 + }, + { + "epoch": 0.25959389876615174, + "grad_norm": 0.2037455290555954, + "learning_rate": 0.00014846243674581551, + "loss": 1.0018, + "step": 668 + }, + { + "epoch": 0.25998251238705916, + "grad_norm": 0.24176378548145294, + "learning_rate": 0.00014838458544180617, + "loss": 1.0749, + "step": 669 + }, + { + "epoch": 0.2603711260079666, + "grad_norm": 0.2007879763841629, + "learning_rate": 0.00014830673413779682, + "loss": 1.0443, + "step": 670 + }, + { + "epoch": 0.260759739628874, + "grad_norm": 0.23503245413303375, + "learning_rate": 0.00014822888283378747, + "loss": 1.0674, + "step": 671 + }, + { + "epoch": 0.2611483532497814, + "grad_norm": 0.2166167050600052, + "learning_rate": 0.00014815103152977815, + "loss": 1.079, + "step": 672 + }, + { + "epoch": 0.26153696687068884, + "grad_norm": 0.2293982058763504, + "learning_rate": 0.0001480731802257688, + "loss": 1.0517, + "step": 673 + }, + { + "epoch": 0.26192558049159625, + "grad_norm": 0.21040330827236176, + "learning_rate": 0.00014799532892175945, + "loss": 1.0475, + "step": 674 + }, + { + "epoch": 0.2623141941125036, + "grad_norm": 0.20750463008880615, + "learning_rate": 0.0001479174776177501, + "loss": 1.025, + "step": 675 + }, + { + "epoch": 0.26270280773341104, + "grad_norm": 
0.2748873233795166, + "learning_rate": 0.00014783962631374076, + "loss": 1.0212, + "step": 676 + }, + { + "epoch": 0.26309142135431846, + "grad_norm": 0.19212333858013153, + "learning_rate": 0.00014776177500973144, + "loss": 1.0049, + "step": 677 + }, + { + "epoch": 0.2634800349752259, + "grad_norm": 0.207731693983078, + "learning_rate": 0.0001476839237057221, + "loss": 1.0062, + "step": 678 + }, + { + "epoch": 0.2638686485961333, + "grad_norm": 0.2177981585264206, + "learning_rate": 0.00014760607240171274, + "loss": 1.0489, + "step": 679 + }, + { + "epoch": 0.2642572622170407, + "grad_norm": 0.23239290714263916, + "learning_rate": 0.0001475282210977034, + "loss": 1.0856, + "step": 680 + }, + { + "epoch": 0.26464587583794813, + "grad_norm": 0.2033151388168335, + "learning_rate": 0.00014745036979369404, + "loss": 1.0389, + "step": 681 + }, + { + "epoch": 0.26503448945885555, + "grad_norm": 0.20917408168315887, + "learning_rate": 0.00014737251848968472, + "loss": 1.1208, + "step": 682 + }, + { + "epoch": 0.26542310307976297, + "grad_norm": 0.22075454890727997, + "learning_rate": 0.00014729466718567538, + "loss": 1.0435, + "step": 683 + }, + { + "epoch": 0.26581171670067033, + "grad_norm": 0.23094993829727173, + "learning_rate": 0.00014721681588166603, + "loss": 1.0649, + "step": 684 + }, + { + "epoch": 0.26620033032157775, + "grad_norm": 0.21209536492824554, + "learning_rate": 0.00014713896457765668, + "loss": 1.0578, + "step": 685 + }, + { + "epoch": 0.26658894394248517, + "grad_norm": 0.21412219107151031, + "learning_rate": 0.00014706111327364733, + "loss": 1.1137, + "step": 686 + }, + { + "epoch": 0.2669775575633926, + "grad_norm": 0.21175475418567657, + "learning_rate": 0.000146983261969638, + "loss": 1.023, + "step": 687 + }, + { + "epoch": 0.2673661711843, + "grad_norm": 0.21968993544578552, + "learning_rate": 0.00014690541066562866, + "loss": 1.1183, + "step": 688 + }, + { + "epoch": 0.2677547848052074, + "grad_norm": 0.20414218306541443, + "learning_rate": 
0.00014682755936161932, + "loss": 1.078, + "step": 689 + }, + { + "epoch": 0.26814339842611484, + "grad_norm": 0.18986597657203674, + "learning_rate": 0.00014674970805760997, + "loss": 1.0029, + "step": 690 + }, + { + "epoch": 0.26853201204702226, + "grad_norm": 0.21215832233428955, + "learning_rate": 0.00014667185675360062, + "loss": 1.0759, + "step": 691 + }, + { + "epoch": 0.2689206256679297, + "grad_norm": 0.2113744169473648, + "learning_rate": 0.0001465940054495913, + "loss": 1.1027, + "step": 692 + }, + { + "epoch": 0.2693092392888371, + "grad_norm": 0.22010880708694458, + "learning_rate": 0.00014651615414558195, + "loss": 1.0984, + "step": 693 + }, + { + "epoch": 0.26969785290974446, + "grad_norm": 0.203857421875, + "learning_rate": 0.0001464383028415726, + "loss": 1.0407, + "step": 694 + }, + { + "epoch": 0.2700864665306519, + "grad_norm": 0.21120867133140564, + "learning_rate": 0.00014636045153756325, + "loss": 1.0521, + "step": 695 + }, + { + "epoch": 0.2704750801515593, + "grad_norm": 0.20039112865924835, + "learning_rate": 0.0001462826002335539, + "loss": 1.0897, + "step": 696 + }, + { + "epoch": 0.2708636937724667, + "grad_norm": 0.22893202304840088, + "learning_rate": 0.00014620474892954456, + "loss": 1.0903, + "step": 697 + }, + { + "epoch": 0.27125230739337414, + "grad_norm": 0.19886267185211182, + "learning_rate": 0.00014612689762553524, + "loss": 1.0889, + "step": 698 + }, + { + "epoch": 0.27164092101428156, + "grad_norm": 0.18892349302768707, + "learning_rate": 0.0001460490463215259, + "loss": 0.981, + "step": 699 + }, + { + "epoch": 0.272029534635189, + "grad_norm": 0.20602507889270782, + "learning_rate": 0.00014597119501751654, + "loss": 1.0223, + "step": 700 + }, + { + "epoch": 0.2724181482560964, + "grad_norm": 0.21480505168437958, + "learning_rate": 0.0001458933437135072, + "loss": 1.0355, + "step": 701 + }, + { + "epoch": 0.2728067618770038, + "grad_norm": 0.21011753380298615, + "learning_rate": 0.00014581549240949785, + "loss": 1.0613, + 
"step": 702 + }, + { + "epoch": 0.2731953754979112, + "grad_norm": 0.19350819289684296, + "learning_rate": 0.00014573764110548853, + "loss": 1.0144, + "step": 703 + }, + { + "epoch": 0.2735839891188186, + "grad_norm": 0.207548126578331, + "learning_rate": 0.00014565978980147918, + "loss": 1.0465, + "step": 704 + }, + { + "epoch": 0.273972602739726, + "grad_norm": 0.22220565378665924, + "learning_rate": 0.00014558193849746983, + "loss": 1.1073, + "step": 705 + }, + { + "epoch": 0.27436121636063343, + "grad_norm": 0.193622425198555, + "learning_rate": 0.00014550408719346048, + "loss": 1.0357, + "step": 706 + }, + { + "epoch": 0.27474982998154085, + "grad_norm": 0.2067158818244934, + "learning_rate": 0.00014542623588945113, + "loss": 1.0502, + "step": 707 + }, + { + "epoch": 0.27513844360244827, + "grad_norm": 0.2218742072582245, + "learning_rate": 0.0001453483845854418, + "loss": 0.9934, + "step": 708 + }, + { + "epoch": 0.2755270572233557, + "grad_norm": 0.22316142916679382, + "learning_rate": 0.00014527053328143246, + "loss": 1.0707, + "step": 709 + }, + { + "epoch": 0.2759156708442631, + "grad_norm": 0.21004025638103485, + "learning_rate": 0.00014519268197742312, + "loss": 1.0543, + "step": 710 + }, + { + "epoch": 0.2763042844651705, + "grad_norm": 0.22070440649986267, + "learning_rate": 0.00014511483067341377, + "loss": 1.0467, + "step": 711 + }, + { + "epoch": 0.27669289808607794, + "grad_norm": 0.21463747322559357, + "learning_rate": 0.00014503697936940445, + "loss": 1.0793, + "step": 712 + }, + { + "epoch": 0.2770815117069853, + "grad_norm": 0.23452533781528473, + "learning_rate": 0.0001449591280653951, + "loss": 1.043, + "step": 713 + }, + { + "epoch": 0.2774701253278927, + "grad_norm": 0.2405795156955719, + "learning_rate": 0.00014488127676138575, + "loss": 1.0752, + "step": 714 + }, + { + "epoch": 0.27785873894880014, + "grad_norm": 0.21546585857868195, + "learning_rate": 0.00014480342545737643, + "loss": 1.0834, + "step": 715 + }, + { + "epoch": 
0.27824735256970756, + "grad_norm": 0.22675828635692596, + "learning_rate": 0.00014472557415336708, + "loss": 1.055, + "step": 716 + }, + { + "epoch": 0.278635966190615, + "grad_norm": 0.2117871195077896, + "learning_rate": 0.00014464772284935774, + "loss": 1.03, + "step": 717 + }, + { + "epoch": 0.2790245798115224, + "grad_norm": 0.2193155735731125, + "learning_rate": 0.00014456987154534841, + "loss": 1.0073, + "step": 718 + }, + { + "epoch": 0.2794131934324298, + "grad_norm": 0.21447965502738953, + "learning_rate": 0.00014449202024133907, + "loss": 1.0174, + "step": 719 + }, + { + "epoch": 0.27980180705333724, + "grad_norm": 0.22867532074451447, + "learning_rate": 0.00014441416893732972, + "loss": 1.0948, + "step": 720 + }, + { + "epoch": 0.28019042067424466, + "grad_norm": 0.21570557355880737, + "learning_rate": 0.00014433631763332037, + "loss": 1.0105, + "step": 721 + }, + { + "epoch": 0.280579034295152, + "grad_norm": 0.20787014067173004, + "learning_rate": 0.00014425846632931102, + "loss": 1.0384, + "step": 722 + }, + { + "epoch": 0.28096764791605944, + "grad_norm": 0.19924762845039368, + "learning_rate": 0.0001441806150253017, + "loss": 1.0653, + "step": 723 + }, + { + "epoch": 0.28135626153696686, + "grad_norm": 0.1996215283870697, + "learning_rate": 0.00014410276372129235, + "loss": 1.0439, + "step": 724 + }, + { + "epoch": 0.2817448751578743, + "grad_norm": 0.2054813802242279, + "learning_rate": 0.000144024912417283, + "loss": 0.9895, + "step": 725 + }, + { + "epoch": 0.2821334887787817, + "grad_norm": 0.2268310785293579, + "learning_rate": 0.00014394706111327366, + "loss": 1.0993, + "step": 726 + }, + { + "epoch": 0.2825221023996891, + "grad_norm": 0.19867680966854095, + "learning_rate": 0.0001438692098092643, + "loss": 0.985, + "step": 727 + }, + { + "epoch": 0.28291071602059653, + "grad_norm": 0.21099598705768585, + "learning_rate": 0.000143791358505255, + "loss": 1.0333, + "step": 728 + }, + { + "epoch": 0.28329932964150395, + "grad_norm": 
0.22479215264320374, + "learning_rate": 0.00014371350720124564, + "loss": 1.0449, + "step": 729 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 0.22717688977718353, + "learning_rate": 0.0001436356558972363, + "loss": 1.0482, + "step": 730 + }, + { + "epoch": 0.2840765568833188, + "grad_norm": 0.20389345288276672, + "learning_rate": 0.00014355780459322695, + "loss": 0.956, + "step": 731 + }, + { + "epoch": 0.28446517050422615, + "grad_norm": 0.21583619713783264, + "learning_rate": 0.0001434799532892176, + "loss": 1.0154, + "step": 732 + }, + { + "epoch": 0.28485378412513357, + "grad_norm": 0.2219148874282837, + "learning_rate": 0.00014340210198520825, + "loss": 1.0553, + "step": 733 + }, + { + "epoch": 0.285242397746041, + "grad_norm": 0.19920189678668976, + "learning_rate": 0.00014332425068119893, + "loss": 0.9881, + "step": 734 + }, + { + "epoch": 0.2856310113669484, + "grad_norm": 0.2295670360326767, + "learning_rate": 0.00014324639937718958, + "loss": 1.0529, + "step": 735 + }, + { + "epoch": 0.2860196249878558, + "grad_norm": 0.21271567046642303, + "learning_rate": 0.00014316854807318023, + "loss": 1.037, + "step": 736 + }, + { + "epoch": 0.28640823860876324, + "grad_norm": 0.21304361522197723, + "learning_rate": 0.00014309069676917088, + "loss": 1.048, + "step": 737 + }, + { + "epoch": 0.28679685222967066, + "grad_norm": 0.19902732968330383, + "learning_rate": 0.00014301284546516154, + "loss": 1.0306, + "step": 738 + }, + { + "epoch": 0.2871854658505781, + "grad_norm": 0.1995929330587387, + "learning_rate": 0.00014293499416115222, + "loss": 1.0394, + "step": 739 + }, + { + "epoch": 0.2875740794714855, + "grad_norm": 0.20426060259342194, + "learning_rate": 0.00014285714285714287, + "loss": 1.0052, + "step": 740 + }, + { + "epoch": 0.28796269309239286, + "grad_norm": 0.20284566283226013, + "learning_rate": 0.00014277929155313352, + "loss": 1.0115, + "step": 741 + }, + { + "epoch": 0.2883513067133003, + "grad_norm": 0.2041557878255844, + "learning_rate": 
0.00014270144024912417, + "loss": 1.0473, + "step": 742 + }, + { + "epoch": 0.2887399203342077, + "grad_norm": 0.2152249962091446, + "learning_rate": 0.00014262358894511482, + "loss": 1.0802, + "step": 743 + }, + { + "epoch": 0.2891285339551151, + "grad_norm": 0.20569871366024017, + "learning_rate": 0.0001425457376411055, + "loss": 1.0203, + "step": 744 + }, + { + "epoch": 0.28951714757602254, + "grad_norm": 0.21128378808498383, + "learning_rate": 0.00014246788633709616, + "loss": 1.108, + "step": 745 + }, + { + "epoch": 0.28990576119692996, + "grad_norm": 0.19587135314941406, + "learning_rate": 0.0001423900350330868, + "loss": 1.0427, + "step": 746 + }, + { + "epoch": 0.2902943748178374, + "grad_norm": 0.22052550315856934, + "learning_rate": 0.00014231218372907746, + "loss": 1.055, + "step": 747 + }, + { + "epoch": 0.2906829884387448, + "grad_norm": 0.21291717886924744, + "learning_rate": 0.0001422343324250681, + "loss": 1.0591, + "step": 748 + }, + { + "epoch": 0.2910716020596522, + "grad_norm": 0.20634084939956665, + "learning_rate": 0.0001421564811210588, + "loss": 1.0527, + "step": 749 + }, + { + "epoch": 0.29146021568055963, + "grad_norm": 0.2075488269329071, + "learning_rate": 0.00014207862981704944, + "loss": 1.0786, + "step": 750 + }, + { + "epoch": 0.291848829301467, + "grad_norm": 0.19780080020427704, + "learning_rate": 0.0001420007785130401, + "loss": 1.059, + "step": 751 + }, + { + "epoch": 0.2922374429223744, + "grad_norm": 0.21212074160575867, + "learning_rate": 0.00014192292720903075, + "loss": 1.0346, + "step": 752 + }, + { + "epoch": 0.29262605654328183, + "grad_norm": 0.2218451350927353, + "learning_rate": 0.0001418450759050214, + "loss": 1.0908, + "step": 753 + }, + { + "epoch": 0.29301467016418925, + "grad_norm": 0.20107759535312653, + "learning_rate": 0.00014176722460101208, + "loss": 1.0202, + "step": 754 + }, + { + "epoch": 0.29340328378509667, + "grad_norm": 0.20933273434638977, + "learning_rate": 0.00014168937329700273, + "loss": 1.0719, + 
"step": 755 + }, + { + "epoch": 0.2937918974060041, + "grad_norm": 0.22369107604026794, + "learning_rate": 0.00014161152199299338, + "loss": 1.0433, + "step": 756 + }, + { + "epoch": 0.2941805110269115, + "grad_norm": 0.2113707810640335, + "learning_rate": 0.00014153367068898403, + "loss": 1.0637, + "step": 757 + }, + { + "epoch": 0.2945691246478189, + "grad_norm": 0.21105700731277466, + "learning_rate": 0.00014145581938497469, + "loss": 1.0468, + "step": 758 + }, + { + "epoch": 0.29495773826872634, + "grad_norm": 0.20189693570137024, + "learning_rate": 0.00014137796808096537, + "loss": 1.0281, + "step": 759 + }, + { + "epoch": 0.2953463518896337, + "grad_norm": 0.1954152137041092, + "learning_rate": 0.00014130011677695602, + "loss": 1.0519, + "step": 760 + }, + { + "epoch": 0.2957349655105411, + "grad_norm": 0.24295592308044434, + "learning_rate": 0.00014122226547294667, + "loss": 1.1303, + "step": 761 + }, + { + "epoch": 0.29612357913144854, + "grad_norm": 0.20158620178699493, + "learning_rate": 0.00014114441416893732, + "loss": 1.0367, + "step": 762 + }, + { + "epoch": 0.29651219275235596, + "grad_norm": 0.20734666287899017, + "learning_rate": 0.00014106656286492797, + "loss": 1.0392, + "step": 763 + }, + { + "epoch": 0.2969008063732634, + "grad_norm": 0.2177533656358719, + "learning_rate": 0.00014098871156091865, + "loss": 1.0619, + "step": 764 + }, + { + "epoch": 0.2972894199941708, + "grad_norm": 0.1961720883846283, + "learning_rate": 0.0001409108602569093, + "loss": 0.9872, + "step": 765 + }, + { + "epoch": 0.2976780336150782, + "grad_norm": 0.21530941128730774, + "learning_rate": 0.00014083300895289996, + "loss": 1.1246, + "step": 766 + }, + { + "epoch": 0.29806664723598564, + "grad_norm": 0.2039783000946045, + "learning_rate": 0.00014075515764889064, + "loss": 1.0789, + "step": 767 + }, + { + "epoch": 0.29845526085689306, + "grad_norm": 0.20641569793224335, + "learning_rate": 0.0001406773063448813, + "loss": 1.05, + "step": 768 + }, + { + "epoch": 
0.2988438744778004, + "grad_norm": 0.2071225494146347, + "learning_rate": 0.00014059945504087194, + "loss": 1.047, + "step": 769 + }, + { + "epoch": 0.29923248809870784, + "grad_norm": 0.20367531478405, + "learning_rate": 0.00014052160373686262, + "loss": 1.0734, + "step": 770 + }, + { + "epoch": 0.29962110171961526, + "grad_norm": 0.21718619763851166, + "learning_rate": 0.00014044375243285327, + "loss": 1.0613, + "step": 771 + }, + { + "epoch": 0.3000097153405227, + "grad_norm": 0.21649087965488434, + "learning_rate": 0.00014036590112884392, + "loss": 1.0671, + "step": 772 + }, + { + "epoch": 0.3003983289614301, + "grad_norm": 0.22223225235939026, + "learning_rate": 0.00014028804982483458, + "loss": 1.0977, + "step": 773 + }, + { + "epoch": 0.3007869425823375, + "grad_norm": 0.23101870715618134, + "learning_rate": 0.00014021019852082523, + "loss": 1.1236, + "step": 774 + }, + { + "epoch": 0.30117555620324493, + "grad_norm": 0.22855506837368011, + "learning_rate": 0.0001401323472168159, + "loss": 1.0517, + "step": 775 + }, + { + "epoch": 0.30156416982415235, + "grad_norm": 0.20862117409706116, + "learning_rate": 0.00014005449591280656, + "loss": 1.0493, + "step": 776 + }, + { + "epoch": 0.30195278344505977, + "grad_norm": 0.21692048013210297, + "learning_rate": 0.0001399766446087972, + "loss": 1.0681, + "step": 777 + }, + { + "epoch": 0.3023413970659672, + "grad_norm": 0.21541331708431244, + "learning_rate": 0.00013989879330478786, + "loss": 1.0775, + "step": 778 + }, + { + "epoch": 0.30273001068687455, + "grad_norm": 0.21221749484539032, + "learning_rate": 0.00013982094200077851, + "loss": 1.0421, + "step": 779 + }, + { + "epoch": 0.30311862430778197, + "grad_norm": 0.22497743368148804, + "learning_rate": 0.0001397430906967692, + "loss": 1.1115, + "step": 780 + }, + { + "epoch": 0.3035072379286894, + "grad_norm": 0.1974119246006012, + "learning_rate": 0.00013966523939275985, + "loss": 1.0264, + "step": 781 + }, + { + "epoch": 0.3038958515495968, + "grad_norm": 
0.20349323749542236, + "learning_rate": 0.0001395873880887505, + "loss": 1.0512, + "step": 782 + }, + { + "epoch": 0.3042844651705042, + "grad_norm": 0.21116937696933746, + "learning_rate": 0.00013950953678474115, + "loss": 1.0135, + "step": 783 + }, + { + "epoch": 0.30467307879141164, + "grad_norm": 0.2133677899837494, + "learning_rate": 0.0001394316854807318, + "loss": 1.0694, + "step": 784 + }, + { + "epoch": 0.30506169241231906, + "grad_norm": 0.20406191051006317, + "learning_rate": 0.00013935383417672248, + "loss": 1.0179, + "step": 785 + }, + { + "epoch": 0.3054503060332265, + "grad_norm": 0.21428678929805756, + "learning_rate": 0.00013927598287271313, + "loss": 1.0577, + "step": 786 + }, + { + "epoch": 0.3058389196541339, + "grad_norm": 0.20878921449184418, + "learning_rate": 0.00013919813156870379, + "loss": 1.0311, + "step": 787 + }, + { + "epoch": 0.30622753327504126, + "grad_norm": 0.19033175706863403, + "learning_rate": 0.00013912028026469444, + "loss": 0.976, + "step": 788 + }, + { + "epoch": 0.3066161468959487, + "grad_norm": 0.22138020396232605, + "learning_rate": 0.0001390424289606851, + "loss": 1.0438, + "step": 789 + }, + { + "epoch": 0.3070047605168561, + "grad_norm": 0.20765596628189087, + "learning_rate": 0.00013896457765667577, + "loss": 1.0865, + "step": 790 + }, + { + "epoch": 0.3073933741377635, + "grad_norm": 0.209733247756958, + "learning_rate": 0.00013888672635266642, + "loss": 1.0648, + "step": 791 + }, + { + "epoch": 0.30778198775867094, + "grad_norm": 0.1896686851978302, + "learning_rate": 0.00013880887504865707, + "loss": 1.0133, + "step": 792 + }, + { + "epoch": 0.30817060137957836, + "grad_norm": 0.21651998162269592, + "learning_rate": 0.00013873102374464772, + "loss": 1.0729, + "step": 793 + }, + { + "epoch": 0.3085592150004858, + "grad_norm": 0.21751996874809265, + "learning_rate": 0.00013865317244063838, + "loss": 1.0444, + "step": 794 + }, + { + "epoch": 0.3089478286213932, + "grad_norm": 0.20593520998954773, + "learning_rate": 
0.00013857532113662906, + "loss": 1.0304, + "step": 795 + }, + { + "epoch": 0.3093364422423006, + "grad_norm": 0.19937261939048767, + "learning_rate": 0.0001384974698326197, + "loss": 1.0017, + "step": 796 + }, + { + "epoch": 0.30972505586320803, + "grad_norm": 0.18901696801185608, + "learning_rate": 0.00013841961852861036, + "loss": 1.0362, + "step": 797 + }, + { + "epoch": 0.3101136694841154, + "grad_norm": 0.2079760730266571, + "learning_rate": 0.000138341767224601, + "loss": 1.0784, + "step": 798 + }, + { + "epoch": 0.3105022831050228, + "grad_norm": 0.24873265624046326, + "learning_rate": 0.00013826391592059166, + "loss": 1.1026, + "step": 799 + }, + { + "epoch": 0.31089089672593023, + "grad_norm": 0.20185396075248718, + "learning_rate": 0.00013818606461658234, + "loss": 1.0235, + "step": 800 + }, + { + "epoch": 0.31127951034683765, + "grad_norm": 0.211393803358078, + "learning_rate": 0.000138108213312573, + "loss": 1.0999, + "step": 801 + }, + { + "epoch": 0.31166812396774507, + "grad_norm": 0.19948823750019073, + "learning_rate": 0.00013803036200856365, + "loss": 1.0242, + "step": 802 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 0.21470944583415985, + "learning_rate": 0.0001379525107045543, + "loss": 1.0736, + "step": 803 + }, + { + "epoch": 0.3124453512095599, + "grad_norm": 0.2195902317762375, + "learning_rate": 0.00013787465940054495, + "loss": 1.0368, + "step": 804 + }, + { + "epoch": 0.3128339648304673, + "grad_norm": 0.22142355144023895, + "learning_rate": 0.00013779680809653563, + "loss": 1.1022, + "step": 805 + }, + { + "epoch": 0.31322257845137474, + "grad_norm": 0.20487886667251587, + "learning_rate": 0.00013771895679252628, + "loss": 1.0478, + "step": 806 + }, + { + "epoch": 0.3136111920722821, + "grad_norm": 0.217549130320549, + "learning_rate": 0.00013764110548851693, + "loss": 1.0526, + "step": 807 + }, + { + "epoch": 0.3139998056931895, + "grad_norm": 0.20199982821941376, + "learning_rate": 0.0001375632541845076, + "loss": 0.9992, + 
"step": 808 + }, + { + "epoch": 0.31438841931409695, + "grad_norm": 0.19496634602546692, + "learning_rate": 0.00013748540288049824, + "loss": 1.0179, + "step": 809 + }, + { + "epoch": 0.31477703293500436, + "grad_norm": 0.21999460458755493, + "learning_rate": 0.0001374075515764889, + "loss": 1.0547, + "step": 810 + }, + { + "epoch": 0.3151656465559118, + "grad_norm": 0.21421074867248535, + "learning_rate": 0.00013732970027247957, + "loss": 1.0283, + "step": 811 + }, + { + "epoch": 0.3155542601768192, + "grad_norm": 0.1913364827632904, + "learning_rate": 0.00013725184896847022, + "loss": 0.9826, + "step": 812 + }, + { + "epoch": 0.3159428737977266, + "grad_norm": 0.20509806275367737, + "learning_rate": 0.00013717399766446087, + "loss": 1.0303, + "step": 813 + }, + { + "epoch": 0.31633148741863404, + "grad_norm": 0.20309868454933167, + "learning_rate": 0.00013709614636045153, + "loss": 1.0479, + "step": 814 + }, + { + "epoch": 0.31672010103954146, + "grad_norm": 0.2274443656206131, + "learning_rate": 0.0001370182950564422, + "loss": 1.1311, + "step": 815 + }, + { + "epoch": 0.3171087146604489, + "grad_norm": 0.22785170376300812, + "learning_rate": 0.00013694044375243286, + "loss": 1.1009, + "step": 816 + }, + { + "epoch": 0.31749732828135624, + "grad_norm": 0.2105439007282257, + "learning_rate": 0.0001368625924484235, + "loss": 1.0251, + "step": 817 + }, + { + "epoch": 0.31788594190226366, + "grad_norm": 0.20583970844745636, + "learning_rate": 0.00013678474114441416, + "loss": 1.0833, + "step": 818 + }, + { + "epoch": 0.3182745555231711, + "grad_norm": 0.21091191470623016, + "learning_rate": 0.00013670688984040484, + "loss": 1.071, + "step": 819 + }, + { + "epoch": 0.3186631691440785, + "grad_norm": 0.20645928382873535, + "learning_rate": 0.0001366290385363955, + "loss": 1.0605, + "step": 820 + }, + { + "epoch": 0.3190517827649859, + "grad_norm": 0.1990513950586319, + "learning_rate": 0.00013655118723238614, + "loss": 1.0461, + "step": 821 + }, + { + "epoch": 
0.31944039638589333, + "grad_norm": 0.2192249745130539, + "learning_rate": 0.00013647333592837682, + "loss": 1.0975, + "step": 822 + }, + { + "epoch": 0.31982901000680075, + "grad_norm": 0.2157617211341858, + "learning_rate": 0.00013639548462436748, + "loss": 1.091, + "step": 823 + }, + { + "epoch": 0.32021762362770817, + "grad_norm": 0.21964526176452637, + "learning_rate": 0.00013631763332035813, + "loss": 1.0286, + "step": 824 + }, + { + "epoch": 0.3206062372486156, + "grad_norm": 0.2079797089099884, + "learning_rate": 0.00013623978201634878, + "loss": 1.0257, + "step": 825 + }, + { + "epoch": 0.32099485086952295, + "grad_norm": 0.21220168471336365, + "learning_rate": 0.00013616193071233946, + "loss": 1.0046, + "step": 826 + }, + { + "epoch": 0.32138346449043037, + "grad_norm": 0.2885231673717499, + "learning_rate": 0.0001360840794083301, + "loss": 1.1442, + "step": 827 + }, + { + "epoch": 0.3217720781113378, + "grad_norm": 0.2096511274576187, + "learning_rate": 0.00013600622810432076, + "loss": 1.0209, + "step": 828 + }, + { + "epoch": 0.3221606917322452, + "grad_norm": 0.2179451286792755, + "learning_rate": 0.00013592837680031142, + "loss": 1.0548, + "step": 829 + }, + { + "epoch": 0.3225493053531526, + "grad_norm": 0.2096329927444458, + "learning_rate": 0.00013585052549630207, + "loss": 1.0279, + "step": 830 + }, + { + "epoch": 0.32293791897406005, + "grad_norm": 0.22531811892986298, + "learning_rate": 0.00013577267419229275, + "loss": 1.0463, + "step": 831 + }, + { + "epoch": 0.32332653259496746, + "grad_norm": 0.22516901791095734, + "learning_rate": 0.0001356948228882834, + "loss": 1.1127, + "step": 832 + }, + { + "epoch": 0.3237151462158749, + "grad_norm": 0.22487780451774597, + "learning_rate": 0.00013561697158427405, + "loss": 1.0707, + "step": 833 + }, + { + "epoch": 0.3241037598367823, + "grad_norm": 0.20976543426513672, + "learning_rate": 0.0001355391202802647, + "loss": 1.0217, + "step": 834 + }, + { + "epoch": 0.32449237345768966, + "grad_norm": 
0.19849295914173126, + "learning_rate": 0.00013546126897625535, + "loss": 1.021, + "step": 835 + }, + { + "epoch": 0.3248809870785971, + "grad_norm": 0.21772268414497375, + "learning_rate": 0.00013538341767224603, + "loss": 1.0605, + "step": 836 + }, + { + "epoch": 0.3252696006995045, + "grad_norm": 0.19670265913009644, + "learning_rate": 0.00013530556636823669, + "loss": 1.0165, + "step": 837 + }, + { + "epoch": 0.3256582143204119, + "grad_norm": 0.19339734315872192, + "learning_rate": 0.00013522771506422734, + "loss": 1.0203, + "step": 838 + }, + { + "epoch": 0.32604682794131934, + "grad_norm": 0.21289557218551636, + "learning_rate": 0.000135149863760218, + "loss": 1.0252, + "step": 839 + }, + { + "epoch": 0.32643544156222676, + "grad_norm": 0.1964789777994156, + "learning_rate": 0.00013507201245620864, + "loss": 1.0392, + "step": 840 + }, + { + "epoch": 0.3268240551831342, + "grad_norm": 0.20783716440200806, + "learning_rate": 0.00013499416115219932, + "loss": 1.0569, + "step": 841 + }, + { + "epoch": 0.3272126688040416, + "grad_norm": 0.22782161831855774, + "learning_rate": 0.00013491630984818997, + "loss": 1.0555, + "step": 842 + }, + { + "epoch": 0.327601282424949, + "grad_norm": 0.22771142423152924, + "learning_rate": 0.00013483845854418063, + "loss": 1.085, + "step": 843 + }, + { + "epoch": 0.32798989604585643, + "grad_norm": 0.19773711264133453, + "learning_rate": 0.00013476060724017128, + "loss": 1.008, + "step": 844 + }, + { + "epoch": 0.3283785096667638, + "grad_norm": 0.22399166226387024, + "learning_rate": 0.00013468275593616193, + "loss": 1.0511, + "step": 845 + }, + { + "epoch": 0.3287671232876712, + "grad_norm": 0.20488236844539642, + "learning_rate": 0.00013460490463215258, + "loss": 1.0883, + "step": 846 + }, + { + "epoch": 0.32915573690857863, + "grad_norm": 0.21387654542922974, + "learning_rate": 0.00013452705332814326, + "loss": 1.0808, + "step": 847 + }, + { + "epoch": 0.32954435052948605, + "grad_norm": 0.1972568780183792, + "learning_rate": 
0.0001344492020241339, + "loss": 1.0555, + "step": 848 + }, + { + "epoch": 0.32993296415039347, + "grad_norm": 0.20835663378238678, + "learning_rate": 0.00013437135072012456, + "loss": 1.0473, + "step": 849 + }, + { + "epoch": 0.3303215777713009, + "grad_norm": 0.19707520306110382, + "learning_rate": 0.00013429349941611522, + "loss": 0.9585, + "step": 850 + }, + { + "epoch": 0.3307101913922083, + "grad_norm": 0.19163411855697632, + "learning_rate": 0.00013421564811210587, + "loss": 1.0025, + "step": 851 + }, + { + "epoch": 0.3310988050131157, + "grad_norm": 0.19730083644390106, + "learning_rate": 0.00013413779680809655, + "loss": 1.0696, + "step": 852 + }, + { + "epoch": 0.33148741863402315, + "grad_norm": 0.19537493586540222, + "learning_rate": 0.0001340599455040872, + "loss": 1.0466, + "step": 853 + }, + { + "epoch": 0.3318760322549305, + "grad_norm": 0.2255164235830307, + "learning_rate": 0.00013398209420007785, + "loss": 1.0659, + "step": 854 + }, + { + "epoch": 0.3322646458758379, + "grad_norm": 0.19774770736694336, + "learning_rate": 0.0001339042428960685, + "loss": 1.0326, + "step": 855 + }, + { + "epoch": 0.33265325949674535, + "grad_norm": 0.2004510909318924, + "learning_rate": 0.00013382639159205916, + "loss": 1.0327, + "step": 856 + }, + { + "epoch": 0.33304187311765276, + "grad_norm": 0.19187591969966888, + "learning_rate": 0.00013374854028804984, + "loss": 1.0069, + "step": 857 + }, + { + "epoch": 0.3334304867385602, + "grad_norm": 0.18775832653045654, + "learning_rate": 0.0001336706889840405, + "loss": 1.0083, + "step": 858 + }, + { + "epoch": 0.3338191003594676, + "grad_norm": 0.2005717158317566, + "learning_rate": 0.00013359283768003114, + "loss": 1.0398, + "step": 859 + }, + { + "epoch": 0.334207713980375, + "grad_norm": 0.19705893099308014, + "learning_rate": 0.0001335149863760218, + "loss": 1.0031, + "step": 860 + }, + { + "epoch": 0.33459632760128244, + "grad_norm": 0.19589562714099884, + "learning_rate": 0.00013343713507201244, + "loss": 
0.9831, + "step": 861 + }, + { + "epoch": 0.33498494122218986, + "grad_norm": 0.19302591681480408, + "learning_rate": 0.00013335928376800312, + "loss": 1.0009, + "step": 862 + }, + { + "epoch": 0.3353735548430973, + "grad_norm": 0.20499618351459503, + "learning_rate": 0.00013328143246399377, + "loss": 1.0205, + "step": 863 + }, + { + "epoch": 0.33576216846400464, + "grad_norm": 0.20514456927776337, + "learning_rate": 0.00013320358115998443, + "loss": 1.0837, + "step": 864 + }, + { + "epoch": 0.33615078208491206, + "grad_norm": 0.19285848736763, + "learning_rate": 0.00013312572985597508, + "loss": 1.0167, + "step": 865 + }, + { + "epoch": 0.3365393957058195, + "grad_norm": 0.20891553163528442, + "learning_rate": 0.00013304787855196573, + "loss": 1.0127, + "step": 866 + }, + { + "epoch": 0.3369280093267269, + "grad_norm": 0.20511706173419952, + "learning_rate": 0.0001329700272479564, + "loss": 0.964, + "step": 867 + }, + { + "epoch": 0.3373166229476343, + "grad_norm": 0.1855512261390686, + "learning_rate": 0.00013289217594394706, + "loss": 0.9721, + "step": 868 + }, + { + "epoch": 0.33770523656854173, + "grad_norm": 0.20010098814964294, + "learning_rate": 0.00013281432463993771, + "loss": 1.0411, + "step": 869 + }, + { + "epoch": 0.33809385018944915, + "grad_norm": 0.1991325318813324, + "learning_rate": 0.0001327364733359284, + "loss": 0.9658, + "step": 870 + }, + { + "epoch": 0.33848246381035657, + "grad_norm": 0.19895736873149872, + "learning_rate": 0.00013265862203191905, + "loss": 1.0744, + "step": 871 + }, + { + "epoch": 0.338871077431264, + "grad_norm": 0.2091255635023117, + "learning_rate": 0.0001325807707279097, + "loss": 1.0375, + "step": 872 + }, + { + "epoch": 0.33925969105217135, + "grad_norm": 0.21355532109737396, + "learning_rate": 0.00013250291942390035, + "loss": 1.09, + "step": 873 + }, + { + "epoch": 0.33964830467307877, + "grad_norm": 0.21844851970672607, + "learning_rate": 0.00013242506811989103, + "loss": 1.0769, + "step": 874 + }, + { + "epoch": 
0.3400369182939862, + "grad_norm": 0.1877543330192566, + "learning_rate": 0.00013234721681588168, + "loss": 1.0199, + "step": 875 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.2020038366317749, + "learning_rate": 0.00013226936551187233, + "loss": 1.0218, + "step": 876 + }, + { + "epoch": 0.340814145535801, + "grad_norm": 0.20682141184806824, + "learning_rate": 0.000132191514207863, + "loss": 1.0891, + "step": 877 + }, + { + "epoch": 0.34120275915670845, + "grad_norm": 0.21942824125289917, + "learning_rate": 0.00013211366290385366, + "loss": 0.9877, + "step": 878 + }, + { + "epoch": 0.34159137277761586, + "grad_norm": 0.21150313317775726, + "learning_rate": 0.00013203581159984432, + "loss": 1.0815, + "step": 879 + }, + { + "epoch": 0.3419799863985233, + "grad_norm": 0.2073293924331665, + "learning_rate": 0.00013195796029583497, + "loss": 1.0579, + "step": 880 + }, + { + "epoch": 0.3423686000194307, + "grad_norm": 0.221574068069458, + "learning_rate": 0.00013188010899182562, + "loss": 1.0279, + "step": 881 + }, + { + "epoch": 0.3427572136403381, + "grad_norm": 0.22334492206573486, + "learning_rate": 0.00013180225768781627, + "loss": 1.0837, + "step": 882 + }, + { + "epoch": 0.3431458272612455, + "grad_norm": 0.18817654252052307, + "learning_rate": 0.00013172440638380695, + "loss": 1.0262, + "step": 883 + }, + { + "epoch": 0.3435344408821529, + "grad_norm": 0.20126822590827942, + "learning_rate": 0.0001316465550797976, + "loss": 1.0679, + "step": 884 + }, + { + "epoch": 0.3439230545030603, + "grad_norm": 0.2128864973783493, + "learning_rate": 0.00013156870377578825, + "loss": 1.0316, + "step": 885 + }, + { + "epoch": 0.34431166812396774, + "grad_norm": 0.20054499804973602, + "learning_rate": 0.0001314908524717789, + "loss": 1.0024, + "step": 886 + }, + { + "epoch": 0.34470028174487516, + "grad_norm": 0.21358034014701843, + "learning_rate": 0.00013141300116776956, + "loss": 1.0475, + "step": 887 + }, + { + "epoch": 0.3450888953657826, + "grad_norm": 
0.21377703547477722, + "learning_rate": 0.00013133514986376024, + "loss": 1.0957, + "step": 888 + }, + { + "epoch": 0.34547750898669, + "grad_norm": 0.20166514813899994, + "learning_rate": 0.0001312572985597509, + "loss": 1.0189, + "step": 889 + }, + { + "epoch": 0.3458661226075974, + "grad_norm": 0.20424878597259521, + "learning_rate": 0.00013117944725574154, + "loss": 1.0896, + "step": 890 + }, + { + "epoch": 0.34625473622850483, + "grad_norm": 0.19028648734092712, + "learning_rate": 0.0001311015959517322, + "loss": 0.9881, + "step": 891 + }, + { + "epoch": 0.3466433498494122, + "grad_norm": 0.20828665792942047, + "learning_rate": 0.00013102374464772285, + "loss": 0.9932, + "step": 892 + }, + { + "epoch": 0.3470319634703196, + "grad_norm": 0.20756572484970093, + "learning_rate": 0.00013094589334371353, + "loss": 1.0406, + "step": 893 + }, + { + "epoch": 0.34742057709122703, + "grad_norm": 0.20768921077251434, + "learning_rate": 0.00013086804203970418, + "loss": 0.9652, + "step": 894 + }, + { + "epoch": 0.34780919071213445, + "grad_norm": 0.20660027861595154, + "learning_rate": 0.00013079019073569483, + "loss": 1.0728, + "step": 895 + }, + { + "epoch": 0.34819780433304187, + "grad_norm": 0.20186837017536163, + "learning_rate": 0.00013071233943168548, + "loss": 1.0407, + "step": 896 + }, + { + "epoch": 0.3485864179539493, + "grad_norm": 0.20880667865276337, + "learning_rate": 0.00013063448812767613, + "loss": 1.0275, + "step": 897 + }, + { + "epoch": 0.3489750315748567, + "grad_norm": 0.22212949395179749, + "learning_rate": 0.0001305566368236668, + "loss": 1.0293, + "step": 898 + }, + { + "epoch": 0.3493636451957641, + "grad_norm": 0.20552745461463928, + "learning_rate": 0.00013047878551965746, + "loss": 1.0434, + "step": 899 + }, + { + "epoch": 0.34975225881667155, + "grad_norm": 0.21239839494228363, + "learning_rate": 0.00013040093421564812, + "loss": 1.052, + "step": 900 + }, + { + "epoch": 0.3501408724375789, + "grad_norm": 0.22420544922351837, + 
"learning_rate": 0.00013032308291163877, + "loss": 1.0236, + "step": 901 + }, + { + "epoch": 0.35052948605848633, + "grad_norm": 0.23435090482234955, + "learning_rate": 0.00013024523160762942, + "loss": 1.0876, + "step": 902 + }, + { + "epoch": 0.35091809967939375, + "grad_norm": 0.22763386368751526, + "learning_rate": 0.0001301673803036201, + "loss": 1.0636, + "step": 903 + }, + { + "epoch": 0.35130671330030117, + "grad_norm": 0.20948883891105652, + "learning_rate": 0.00013008952899961075, + "loss": 1.0083, + "step": 904 + }, + { + "epoch": 0.3516953269212086, + "grad_norm": 0.20408779382705688, + "learning_rate": 0.0001300116776956014, + "loss": 1.039, + "step": 905 + }, + { + "epoch": 0.352083940542116, + "grad_norm": 0.2126050591468811, + "learning_rate": 0.00012993382639159206, + "loss": 1.0365, + "step": 906 + }, + { + "epoch": 0.3524725541630234, + "grad_norm": 0.20314334332942963, + "learning_rate": 0.0001298559750875827, + "loss": 1.0474, + "step": 907 + }, + { + "epoch": 0.35286116778393084, + "grad_norm": 0.23720984160900116, + "learning_rate": 0.0001297781237835734, + "loss": 1.0529, + "step": 908 + }, + { + "epoch": 0.35324978140483826, + "grad_norm": 0.22642800211906433, + "learning_rate": 0.00012970027247956404, + "loss": 1.0586, + "step": 909 + }, + { + "epoch": 0.3536383950257457, + "grad_norm": 0.20469972491264343, + "learning_rate": 0.0001296224211755547, + "loss": 1.0267, + "step": 910 + }, + { + "epoch": 0.35402700864665304, + "grad_norm": 0.197368785738945, + "learning_rate": 0.00012954456987154534, + "loss": 1.0348, + "step": 911 + }, + { + "epoch": 0.35441562226756046, + "grad_norm": 0.21924498677253723, + "learning_rate": 0.000129466718567536, + "loss": 1.0861, + "step": 912 + }, + { + "epoch": 0.3548042358884679, + "grad_norm": 0.22006285190582275, + "learning_rate": 0.00012938886726352667, + "loss": 1.0545, + "step": 913 + }, + { + "epoch": 0.3551928495093753, + "grad_norm": 0.22419220209121704, + "learning_rate": 0.00012931101595951733, 
+ "loss": 1.0716, + "step": 914 + }, + { + "epoch": 0.3555814631302827, + "grad_norm": 0.215990349650383, + "learning_rate": 0.00012923316465550798, + "loss": 1.0619, + "step": 915 + }, + { + "epoch": 0.35597007675119013, + "grad_norm": 0.20783264935016632, + "learning_rate": 0.00012915531335149863, + "loss": 1.0412, + "step": 916 + }, + { + "epoch": 0.35635869037209755, + "grad_norm": 0.24584618210792542, + "learning_rate": 0.00012907746204748928, + "loss": 1.1165, + "step": 917 + }, + { + "epoch": 0.35674730399300497, + "grad_norm": 0.23146122694015503, + "learning_rate": 0.00012899961074347996, + "loss": 1.1111, + "step": 918 + }, + { + "epoch": 0.3571359176139124, + "grad_norm": 0.19983729720115662, + "learning_rate": 0.00012892175943947061, + "loss": 1.0674, + "step": 919 + }, + { + "epoch": 0.35752453123481975, + "grad_norm": 0.2161000818014145, + "learning_rate": 0.00012884390813546127, + "loss": 1.076, + "step": 920 + }, + { + "epoch": 0.35791314485572717, + "grad_norm": 0.21042793989181519, + "learning_rate": 0.00012876605683145192, + "loss": 1.0535, + "step": 921 + }, + { + "epoch": 0.3583017584766346, + "grad_norm": 0.20135439932346344, + "learning_rate": 0.0001286882055274426, + "loss": 1.0059, + "step": 922 + }, + { + "epoch": 0.358690372097542, + "grad_norm": 0.19394971430301666, + "learning_rate": 0.00012861035422343325, + "loss": 1.0381, + "step": 923 + }, + { + "epoch": 0.35907898571844943, + "grad_norm": 0.21171030402183533, + "learning_rate": 0.0001285325029194239, + "loss": 1.0513, + "step": 924 + }, + { + "epoch": 0.35946759933935685, + "grad_norm": 0.19476690888404846, + "learning_rate": 0.00012845465161541458, + "loss": 1.0003, + "step": 925 + }, + { + "epoch": 0.35985621296026427, + "grad_norm": 0.20468670129776, + "learning_rate": 0.00012837680031140523, + "loss": 1.0608, + "step": 926 + }, + { + "epoch": 0.3602448265811717, + "grad_norm": 0.21159446239471436, + "learning_rate": 0.00012829894900739588, + "loss": 1.0734, + "step": 927 + }, + 
{ + "epoch": 0.3606334402020791, + "grad_norm": 0.21179519593715668, + "learning_rate": 0.00012822109770338654, + "loss": 1.0957, + "step": 928 + }, + { + "epoch": 0.3610220538229865, + "grad_norm": 0.20997527241706848, + "learning_rate": 0.00012814324639937722, + "loss": 1.0644, + "step": 929 + }, + { + "epoch": 0.3614106674438939, + "grad_norm": 0.21178296208381653, + "learning_rate": 0.00012806539509536787, + "loss": 1.0208, + "step": 930 + }, + { + "epoch": 0.3617992810648013, + "grad_norm": 0.20890356600284576, + "learning_rate": 0.00012798754379135852, + "loss": 1.0888, + "step": 931 + }, + { + "epoch": 0.3621878946857087, + "grad_norm": 0.20177409052848816, + "learning_rate": 0.00012790969248734917, + "loss": 0.9741, + "step": 932 + }, + { + "epoch": 0.36257650830661614, + "grad_norm": 0.23504556715488434, + "learning_rate": 0.00012783184118333982, + "loss": 1.1048, + "step": 933 + }, + { + "epoch": 0.36296512192752356, + "grad_norm": 0.22829356789588928, + "learning_rate": 0.0001277539898793305, + "loss": 1.0798, + "step": 934 + }, + { + "epoch": 0.363353735548431, + "grad_norm": 0.2068483531475067, + "learning_rate": 0.00012767613857532116, + "loss": 1.0452, + "step": 935 + }, + { + "epoch": 0.3637423491693384, + "grad_norm": 0.2093171775341034, + "learning_rate": 0.0001275982872713118, + "loss": 1.0742, + "step": 936 + }, + { + "epoch": 0.3641309627902458, + "grad_norm": 0.21478736400604248, + "learning_rate": 0.00012752043596730246, + "loss": 1.0572, + "step": 937 + }, + { + "epoch": 0.36451957641115323, + "grad_norm": 0.1906953752040863, + "learning_rate": 0.0001274425846632931, + "loss": 1.0107, + "step": 938 + }, + { + "epoch": 0.3649081900320606, + "grad_norm": 0.20580604672431946, + "learning_rate": 0.0001273647333592838, + "loss": 1.0677, + "step": 939 + }, + { + "epoch": 0.365296803652968, + "grad_norm": 0.22586850821971893, + "learning_rate": 0.00012728688205527444, + "loss": 1.0389, + "step": 940 + }, + { + "epoch": 0.36568541727387543, + 
"grad_norm": 0.199899360537529, + "learning_rate": 0.0001272090307512651, + "loss": 1.0462, + "step": 941 + }, + { + "epoch": 0.36607403089478285, + "grad_norm": 0.19881689548492432, + "learning_rate": 0.00012713117944725575, + "loss": 1.0565, + "step": 942 + }, + { + "epoch": 0.3664626445156903, + "grad_norm": 0.21748925745487213, + "learning_rate": 0.0001270533281432464, + "loss": 1.0659, + "step": 943 + }, + { + "epoch": 0.3668512581365977, + "grad_norm": 0.19363689422607422, + "learning_rate": 0.00012697547683923708, + "loss": 1.0307, + "step": 944 + }, + { + "epoch": 0.3672398717575051, + "grad_norm": 0.21701784431934357, + "learning_rate": 0.00012689762553522773, + "loss": 1.0684, + "step": 945 + }, + { + "epoch": 0.36762848537841253, + "grad_norm": 0.21406958997249603, + "learning_rate": 0.00012681977423121838, + "loss": 1.0703, + "step": 946 + }, + { + "epoch": 0.36801709899931995, + "grad_norm": 0.23539729416370392, + "learning_rate": 0.00012674192292720903, + "loss": 1.1537, + "step": 947 + }, + { + "epoch": 0.36840571262022737, + "grad_norm": 0.2177354395389557, + "learning_rate": 0.00012666407162319969, + "loss": 1.0131, + "step": 948 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 0.255346417427063, + "learning_rate": 0.00012658622031919037, + "loss": 0.9807, + "step": 949 + }, + { + "epoch": 0.36918293986204215, + "grad_norm": 0.2139921486377716, + "learning_rate": 0.00012650836901518102, + "loss": 1.0392, + "step": 950 + }, + { + "epoch": 0.36957155348294957, + "grad_norm": 0.22490833699703217, + "learning_rate": 0.00012643051771117167, + "loss": 1.0512, + "step": 951 + }, + { + "epoch": 0.369960167103857, + "grad_norm": 0.20698820054531097, + "learning_rate": 0.00012635266640716232, + "loss": 1.0391, + "step": 952 + }, + { + "epoch": 0.3703487807247644, + "grad_norm": 0.2276201844215393, + "learning_rate": 0.00012627481510315297, + "loss": 1.0513, + "step": 953 + }, + { + "epoch": 0.3707373943456718, + "grad_norm": 0.2493600994348526, + 
"learning_rate": 0.00012619696379914365, + "loss": 1.0136, + "step": 954 + }, + { + "epoch": 0.37112600796657924, + "grad_norm": 0.2155001014471054, + "learning_rate": 0.0001261191124951343, + "loss": 1.0523, + "step": 955 + }, + { + "epoch": 0.37151462158748666, + "grad_norm": 0.21571211516857147, + "learning_rate": 0.00012604126119112496, + "loss": 1.0288, + "step": 956 + }, + { + "epoch": 0.3719032352083941, + "grad_norm": 0.23238877952098846, + "learning_rate": 0.0001259634098871156, + "loss": 1.0638, + "step": 957 + }, + { + "epoch": 0.37229184882930144, + "grad_norm": 0.2002813220024109, + "learning_rate": 0.00012588555858310626, + "loss": 0.9665, + "step": 958 + }, + { + "epoch": 0.37268046245020886, + "grad_norm": 0.21712858974933624, + "learning_rate": 0.0001258077072790969, + "loss": 1.0469, + "step": 959 + }, + { + "epoch": 0.3730690760711163, + "grad_norm": 0.2178192287683487, + "learning_rate": 0.0001257298559750876, + "loss": 1.0267, + "step": 960 + }, + { + "epoch": 0.3734576896920237, + "grad_norm": 0.25488024950027466, + "learning_rate": 0.00012565200467107824, + "loss": 1.0153, + "step": 961 + }, + { + "epoch": 0.3738463033129311, + "grad_norm": 0.20070038735866547, + "learning_rate": 0.0001255741533670689, + "loss": 1.0279, + "step": 962 + }, + { + "epoch": 0.37423491693383854, + "grad_norm": 0.21885356307029724, + "learning_rate": 0.00012549630206305955, + "loss": 1.0395, + "step": 963 + }, + { + "epoch": 0.37462353055474595, + "grad_norm": 0.2407921701669693, + "learning_rate": 0.0001254184507590502, + "loss": 1.0767, + "step": 964 + }, + { + "epoch": 0.3750121441756534, + "grad_norm": 0.20645053684711456, + "learning_rate": 0.00012534059945504088, + "loss": 1.0318, + "step": 965 + }, + { + "epoch": 0.3754007577965608, + "grad_norm": 0.21275092661380768, + "learning_rate": 0.00012526274815103153, + "loss": 1.0546, + "step": 966 + }, + { + "epoch": 0.3757893714174682, + "grad_norm": 0.21574917435646057, + "learning_rate": 0.00012518489684702218, 
+ "loss": 1.032, + "step": 967 + }, + { + "epoch": 0.3761779850383756, + "grad_norm": 0.21589480340480804, + "learning_rate": 0.00012510704554301284, + "loss": 1.0834, + "step": 968 + }, + { + "epoch": 0.376566598659283, + "grad_norm": 0.19576796889305115, + "learning_rate": 0.0001250291942390035, + "loss": 1.0178, + "step": 969 + }, + { + "epoch": 0.3769552122801904, + "grad_norm": 0.20941287279129028, + "learning_rate": 0.00012495134293499417, + "loss": 1.0712, + "step": 970 + }, + { + "epoch": 0.37734382590109783, + "grad_norm": 0.22585494816303253, + "learning_rate": 0.00012487349163098482, + "loss": 1.0401, + "step": 971 + }, + { + "epoch": 0.37773243952200525, + "grad_norm": 0.21093420684337616, + "learning_rate": 0.00012479564032697547, + "loss": 1.0569, + "step": 972 + }, + { + "epoch": 0.37812105314291267, + "grad_norm": 0.22375014424324036, + "learning_rate": 0.00012471778902296612, + "loss": 1.0687, + "step": 973 + }, + { + "epoch": 0.3785096667638201, + "grad_norm": 0.19787487387657166, + "learning_rate": 0.0001246399377189568, + "loss": 1.0266, + "step": 974 + }, + { + "epoch": 0.3788982803847275, + "grad_norm": 0.20633013546466827, + "learning_rate": 0.00012456208641494745, + "loss": 0.9996, + "step": 975 + }, + { + "epoch": 0.3792868940056349, + "grad_norm": 0.21559873223304749, + "learning_rate": 0.0001244842351109381, + "loss": 1.0851, + "step": 976 + }, + { + "epoch": 0.3796755076265423, + "grad_norm": 0.2166333943605423, + "learning_rate": 0.00012440638380692879, + "loss": 1.0859, + "step": 977 + }, + { + "epoch": 0.3800641212474497, + "grad_norm": 0.18558773398399353, + "learning_rate": 0.00012432853250291944, + "loss": 0.9534, + "step": 978 + }, + { + "epoch": 0.3804527348683571, + "grad_norm": 0.2086942344903946, + "learning_rate": 0.0001242506811989101, + "loss": 1.0786, + "step": 979 + }, + { + "epoch": 0.38084134848926454, + "grad_norm": 0.2207823544740677, + "learning_rate": 0.00012417282989490074, + "loss": 1.0626, + "step": 980 + }, + { 
+ "epoch": 0.38122996211017196, + "grad_norm": 0.21255749464035034, + "learning_rate": 0.00012409497859089142, + "loss": 1.063, + "step": 981 + }, + { + "epoch": 0.3816185757310794, + "grad_norm": 0.20682042837142944, + "learning_rate": 0.00012401712728688207, + "loss": 1.034, + "step": 982 + }, + { + "epoch": 0.3820071893519868, + "grad_norm": 0.2084134966135025, + "learning_rate": 0.00012393927598287272, + "loss": 1.0481, + "step": 983 + }, + { + "epoch": 0.3823958029728942, + "grad_norm": 0.1922312080860138, + "learning_rate": 0.00012386142467886338, + "loss": 1.0461, + "step": 984 + }, + { + "epoch": 0.38278441659380164, + "grad_norm": 0.20893707871437073, + "learning_rate": 0.00012378357337485406, + "loss": 1.0797, + "step": 985 + }, + { + "epoch": 0.383173030214709, + "grad_norm": 0.19717541337013245, + "learning_rate": 0.0001237057220708447, + "loss": 1.0028, + "step": 986 + }, + { + "epoch": 0.3835616438356164, + "grad_norm": 0.20688053965568542, + "learning_rate": 0.00012362787076683536, + "loss": 0.989, + "step": 987 + }, + { + "epoch": 0.38395025745652384, + "grad_norm": 0.20580583810806274, + "learning_rate": 0.000123550019462826, + "loss": 1.06, + "step": 988 + }, + { + "epoch": 0.38433887107743125, + "grad_norm": 0.2151709794998169, + "learning_rate": 0.00012347216815881666, + "loss": 1.0685, + "step": 989 + }, + { + "epoch": 0.3847274846983387, + "grad_norm": 0.19573980569839478, + "learning_rate": 0.00012339431685480734, + "loss": 1.0072, + "step": 990 + }, + { + "epoch": 0.3851160983192461, + "grad_norm": 0.1949119120836258, + "learning_rate": 0.000123316465550798, + "loss": 0.9995, + "step": 991 + }, + { + "epoch": 0.3855047119401535, + "grad_norm": 0.2062375247478485, + "learning_rate": 0.00012323861424678865, + "loss": 1.0694, + "step": 992 + }, + { + "epoch": 0.38589332556106093, + "grad_norm": 0.2007209211587906, + "learning_rate": 0.0001231607629427793, + "loss": 1.0397, + "step": 993 + }, + { + "epoch": 0.38628193918196835, + "grad_norm": 
0.2231544405221939, + "learning_rate": 0.00012308291163876995, + "loss": 1.0755, + "step": 994 + }, + { + "epoch": 0.38667055280287577, + "grad_norm": 0.2103337049484253, + "learning_rate": 0.0001230050603347606, + "loss": 1.0505, + "step": 995 + }, + { + "epoch": 0.38705916642378313, + "grad_norm": 0.20178386569023132, + "learning_rate": 0.00012292720903075128, + "loss": 1.0696, + "step": 996 + }, + { + "epoch": 0.38744778004469055, + "grad_norm": 0.21268007159233093, + "learning_rate": 0.00012284935772674193, + "loss": 1.0262, + "step": 997 + }, + { + "epoch": 0.38783639366559797, + "grad_norm": 0.21439722180366516, + "learning_rate": 0.0001227715064227326, + "loss": 1.0718, + "step": 998 + }, + { + "epoch": 0.3882250072865054, + "grad_norm": 0.19691336154937744, + "learning_rate": 0.00012269365511872324, + "loss": 0.9663, + "step": 999 + }, + { + "epoch": 0.3886136209074128, + "grad_norm": 0.2165926694869995, + "learning_rate": 0.0001226158038147139, + "loss": 1.0432, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 2574, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.588818692527948e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/outputs/checkpoint-1500/README.md b/outputs/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7 --- /dev/null +++ b/outputs/checkpoint-1500/README.md @@ -0,0 +1,209 @@ +--- +base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit +library_name: peft +tags: +- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model 
Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. 
(2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/outputs/checkpoint-1500/adapter_config.json b/outputs/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7 --- /dev/null +++ b/outputs/checkpoint-1500/adapter_config.json @@ -0,0 +1,45 @@ +{ + "alpha_pattern": {}, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + 
"rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "up_proj", + "down_proj", + "gate_proj", + "k_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/outputs/checkpoint-1500/chat_template.jinja b/outputs/checkpoint-1500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316 --- /dev/null +++ b/outputs/checkpoint-1500/chat_template.jinja @@ -0,0 +1,315 @@ +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". 
+ #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if 
variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties -%} + {{- "(_: " }} + {{- "{\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {{- "// " + param_spec.description + "\n" }} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" 
}} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- "\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the 
most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. 
Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}} + {%- else %} + {{- model_identity }} + {%- endif %} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools is defined %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools is defined -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." 
}} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {%- endif %} + {%- if tools -%} + {{- "\n\n" }} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {%- if "tool_calls" in message %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." 
+ tool_call.name + "<|channel|>commentary json<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif "thinking" in message and loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- set last_tool_call.name = none %} + {%- elif "thinking" in message %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- elif loop.last and not add_generation_prompt %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }} + {%- else %} + {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." 
+ last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- else -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} \ No newline at end of file diff --git a/outputs/checkpoint-1500/optimizer.pt b/outputs/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..edb671a87ce6447336468309d51b19215040e05a --- /dev/null +++ b/outputs/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7f1f22a5f354441b5f815d259903c11b98274ed999c6581547affb39792f494 +size 16894883 diff --git a/outputs/checkpoint-1500/special_tokens_map.json b/outputs/checkpoint-1500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639 --- /dev/null +++ b/outputs/checkpoint-1500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/outputs/checkpoint-1500/tokenizer.json b/outputs/checkpoint-1500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/outputs/checkpoint-1500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/outputs/checkpoint-1500/tokenizer_config.json b/outputs/checkpoint-1500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45 --- /dev/null +++ b/outputs/checkpoint-1500/tokenizer_config.json @@ -0,0 +1,185 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|reserved_200017|>", + "padding_side": "right", + "tokenizer_class": 
"PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/outputs/checkpoint-1500/trainer_state.json b/outputs/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4b88f7892cfdf75d50e4740f4e2866b38fd846b3 --- /dev/null +++ b/outputs/checkpoint-1500/trainer_state.json @@ -0,0 +1,10534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5829204313611193, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 13.684800148010254, + "learning_rate": 0.0, + "loss": 2.3276, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 13.660787582397461, + "learning_rate": 4e-05, + "loss": 2.2792, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 13.35280704498291, + "learning_rate": 8e-05, + "loss": 2.4151, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 6.15027379989624, + "learning_rate": 0.00012, + "loss": 1.7812, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 1.3168226480484009, + "learning_rate": 0.00016, + "loss": 1.4536, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.9872580170631409, + "learning_rate": 0.0002, + "loss": 1.4171, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7496100664138794, + "learning_rate": 0.00019935064935064936, + "loss": 1.4168, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7376005053520203, + "learning_rate": 0.00019870129870129872, + "loss": 1.3659, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5281137824058533, + "learning_rate": 0.00019805194805194807, + "loss": 1.2566, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5485746264457703, + "learning_rate": 0.00019740259740259742, + "loss": 1.3761, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5506592392921448, + "learning_rate": 0.00019675324675324675, + "loss": 
1.3327, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.49382686614990234, + "learning_rate": 0.00019610389610389613, + "loss": 1.3727, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.36203011870384216, + "learning_rate": 0.00019545454545454548, + "loss": 1.1515, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3528599739074707, + "learning_rate": 0.0001948051948051948, + "loss": 1.2636, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.31244418025016785, + "learning_rate": 0.00019415584415584416, + "loss": 1.1873, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.3379523754119873, + "learning_rate": 0.00019350649350649354, + "loss": 1.2657, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3025083839893341, + "learning_rate": 0.00019285714285714286, + "loss": 1.2846, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.2560190260410309, + "learning_rate": 0.00019220779220779222, + "loss": 1.1587, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.2554129958152771, + "learning_rate": 0.00019155844155844157, + "loss": 1.2812, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.22662702202796936, + "learning_rate": 0.00019090909090909092, + "loss": 1.1664, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.2515714168548584, + "learning_rate": 0.00019025974025974027, + "loss": 1.2177, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.24396637082099915, + "learning_rate": 0.00018961038961038963, + "loss": 1.2053, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.24488303065299988, + "learning_rate": 0.00018896103896103895, + "loss": 1.2074, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.2168620079755783, + "learning_rate": 0.00018831168831168833, + "loss": 1.1284, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.24021224677562714, + "learning_rate": 0.00018766233766233769, + "loss": 1.2169, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 
0.20057056844234467, + "learning_rate": 0.000187012987012987, + "loss": 1.1031, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.19900795817375183, + "learning_rate": 0.00018636363636363636, + "loss": 1.1004, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.2019268423318863, + "learning_rate": 0.00018571428571428572, + "loss": 1.1476, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.1996479034423828, + "learning_rate": 0.00018506493506493507, + "loss": 1.1455, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.25262022018432617, + "learning_rate": 0.00018441558441558442, + "loss": 1.1025, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.225438192486763, + "learning_rate": 0.00018376623376623378, + "loss": 1.1954, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.17834505438804626, + "learning_rate": 0.00018311688311688313, + "loss": 1.0934, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.20071206986904144, + "learning_rate": 0.00018246753246753248, + "loss": 1.0488, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.1920139640569687, + "learning_rate": 0.00018181818181818183, + "loss": 1.123, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.18714852631092072, + "learning_rate": 0.0001811688311688312, + "loss": 1.0798, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.18315713107585907, + "learning_rate": 0.00018051948051948054, + "loss": 1.1107, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.19156870245933533, + "learning_rate": 0.00017987012987012987, + "loss": 1.1125, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.21527768671512604, + "learning_rate": 0.00017922077922077922, + "loss": 1.1346, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.1871163249015808, + "learning_rate": 0.0001785714285714286, + "loss": 1.0742, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.17750784754753113, + "learning_rate": 0.00017792207792207792, + "loss": 1.1323, 
+ "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.177419051527977, + "learning_rate": 0.00017727272727272728, + "loss": 1.1405, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.16714292764663696, + "learning_rate": 0.00017662337662337663, + "loss": 1.1084, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.1610356718301773, + "learning_rate": 0.00017597402597402598, + "loss": 1.1125, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.2548656761646271, + "learning_rate": 0.00017532467532467534, + "loss": 1.1114, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.1731044203042984, + "learning_rate": 0.0001746753246753247, + "loss": 1.1197, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.1739533394575119, + "learning_rate": 0.00017402597402597401, + "loss": 1.1777, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.2178352177143097, + "learning_rate": 0.0001733766233766234, + "loss": 1.1111, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.17247150838375092, + "learning_rate": 0.00017272727272727275, + "loss": 1.1253, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.18075324594974518, + "learning_rate": 0.00017207792207792207, + "loss": 1.1358, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.15898071229457855, + "learning_rate": 0.00017142857142857143, + "loss": 1.0606, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.16518613696098328, + "learning_rate": 0.0001707792207792208, + "loss": 1.0944, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.16035063564777374, + "learning_rate": 0.00017012987012987013, + "loss": 1.0554, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.1686483472585678, + "learning_rate": 0.00016948051948051948, + "loss": 1.0384, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.16575631499290466, + "learning_rate": 0.00016883116883116884, + "loss": 1.0243, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.16840039193630219, + 
"learning_rate": 0.0001681818181818182, + "loss": 1.117, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.17616064846515656, + "learning_rate": 0.00016753246753246754, + "loss": 1.0743, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.168218195438385, + "learning_rate": 0.0001668831168831169, + "loss": 1.0627, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.17026656866073608, + "learning_rate": 0.00016623376623376625, + "loss": 1.0059, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.16454458236694336, + "learning_rate": 0.0001655844155844156, + "loss": 0.9943, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.17185136675834656, + "learning_rate": 0.00016493506493506495, + "loss": 1.1545, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.17822986841201782, + "learning_rate": 0.00016428571428571428, + "loss": 1.073, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.1676608771085739, + "learning_rate": 0.00016363636363636366, + "loss": 1.0886, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.1727771908044815, + "learning_rate": 0.000162987012987013, + "loss": 1.0432, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.17827573418617249, + "learning_rate": 0.00016233766233766234, + "loss": 1.083, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.19807517528533936, + "learning_rate": 0.0001616883116883117, + "loss": 1.1208, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.17693684995174408, + "learning_rate": 0.00016103896103896104, + "loss": 1.089, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.15489234030246735, + "learning_rate": 0.0001603896103896104, + "loss": 0.9707, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.16443990170955658, + "learning_rate": 0.00015974025974025975, + "loss": 1.0643, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.2051103413105011, + "learning_rate": 0.0001590909090909091, + "loss": 1.1246, + "step": 69 + }, + { + 
"epoch": 0.224, + "grad_norm": 0.18824075162410736, + "learning_rate": 0.00015844155844155845, + "loss": 1.0855, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.18659448623657227, + "learning_rate": 0.0001577922077922078, + "loss": 1.1412, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.1854114979505539, + "learning_rate": 0.00015714285714285716, + "loss": 1.0249, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.1876193732023239, + "learning_rate": 0.00015649350649350649, + "loss": 1.1029, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.1888684630393982, + "learning_rate": 0.00015584415584415587, + "loss": 1.0789, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.20240606367588043, + "learning_rate": 0.0001551948051948052, + "loss": 1.0495, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.232120081782341, + "learning_rate": 0.00015454545454545454, + "loss": 1.0735, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.16897843778133392, + "learning_rate": 0.0001538961038961039, + "loss": 1.0164, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.18796634674072266, + "learning_rate": 0.00015324675324675325, + "loss": 1.0676, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.19574032723903656, + "learning_rate": 0.0001525974025974026, + "loss": 1.0456, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.18007811903953552, + "learning_rate": 0.00015194805194805196, + "loss": 1.0894, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.18932929635047913, + "learning_rate": 0.0001512987012987013, + "loss": 1.0729, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.20614288747310638, + "learning_rate": 0.00015064935064935066, + "loss": 1.0854, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.19291089475154877, + "learning_rate": 0.00015000000000000001, + "loss": 1.1217, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.18916529417037964, + "learning_rate": 
0.00014935064935064934, + "loss": 1.0963, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.20306220650672913, + "learning_rate": 0.00014870129870129872, + "loss": 1.0898, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.17870067059993744, + "learning_rate": 0.00014805194805194807, + "loss": 1.0213, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.18411923944950104, + "learning_rate": 0.0001474025974025974, + "loss": 1.0844, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.18788227438926697, + "learning_rate": 0.00014675324675324675, + "loss": 1.0338, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.23874884843826294, + "learning_rate": 0.00014610389610389613, + "loss": 1.1118, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.19380499422550201, + "learning_rate": 0.00014545454545454546, + "loss": 1.0464, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.18968750536441803, + "learning_rate": 0.0001448051948051948, + "loss": 1.0569, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.19545753300189972, + "learning_rate": 0.00014415584415584416, + "loss": 1.1225, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.19170494377613068, + "learning_rate": 0.00014350649350649352, + "loss": 1.0602, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.17953918874263763, + "learning_rate": 0.00014285714285714287, + "loss": 1.032, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.1822536289691925, + "learning_rate": 0.00014220779220779222, + "loss": 1.0559, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.18591298162937164, + "learning_rate": 0.00014155844155844155, + "loss": 1.031, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.2129002958536148, + "learning_rate": 0.00014090909090909093, + "loss": 1.1391, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.18386681377887726, + "learning_rate": 0.00014025974025974028, + "loss": 0.9919, + "step": 98 + }, + { + "epoch": 
0.3168, + "grad_norm": 0.18314239382743835, + "learning_rate": 0.0001396103896103896, + "loss": 1.0445, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.1999066174030304, + "learning_rate": 0.00013896103896103896, + "loss": 1.0538, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.18741188943386078, + "learning_rate": 0.00013831168831168834, + "loss": 1.0722, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.19351010024547577, + "learning_rate": 0.00013766233766233766, + "loss": 1.0491, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.18859203159809113, + "learning_rate": 0.00013701298701298702, + "loss": 1.0593, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.1962767392396927, + "learning_rate": 0.00013636363636363637, + "loss": 1.1344, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.20819440484046936, + "learning_rate": 0.00013571428571428572, + "loss": 1.1137, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.19590184092521667, + "learning_rate": 0.00013506493506493507, + "loss": 1.0624, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.18631424009799957, + "learning_rate": 0.00013441558441558443, + "loss": 1.0587, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.19572143256664276, + "learning_rate": 0.00013376623376623375, + "loss": 1.0494, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.1910988837480545, + "learning_rate": 0.00013311688311688313, + "loss": 1.0481, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.19455869495868683, + "learning_rate": 0.00013246753246753249, + "loss": 1.029, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.18669827282428741, + "learning_rate": 0.0001318181818181818, + "loss": 1.0513, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.17523664236068726, + "learning_rate": 0.0001311688311688312, + "loss": 1.0126, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.17929129302501678, + "learning_rate": 
0.00013051948051948052, + "loss": 1.0717, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.19380168616771698, + "learning_rate": 0.00012987012987012987, + "loss": 1.0324, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.18090228736400604, + "learning_rate": 0.00012922077922077922, + "loss": 1.0515, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.2067340910434723, + "learning_rate": 0.00012857142857142858, + "loss": 1.0939, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.1880485862493515, + "learning_rate": 0.00012792207792207793, + "loss": 1.0986, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.182168647646904, + "learning_rate": 0.00012727272727272728, + "loss": 1.0109, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.20187129080295563, + "learning_rate": 0.00012662337662337663, + "loss": 1.0668, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.2082669734954834, + "learning_rate": 0.000125974025974026, + "loss": 1.054, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.18294434249401093, + "learning_rate": 0.00012532467532467534, + "loss": 1.0397, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.20515067875385284, + "learning_rate": 0.00012467532467532467, + "loss": 1.1092, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.1758790761232376, + "learning_rate": 0.00012402597402597402, + "loss": 0.9755, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.2170792669057846, + "learning_rate": 0.0001233766233766234, + "loss": 1.0434, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.202157124876976, + "learning_rate": 0.00012272727272727272, + "loss": 1.1129, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.18556398153305054, + "learning_rate": 0.00012207792207792208, + "loss": 1.0665, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.20196087658405304, + "learning_rate": 0.00012142857142857143, + "loss": 1.1, + "step": 127 + }, + { + 
"epoch": 0.4096, + "grad_norm": 0.1921566128730774, + "learning_rate": 0.0001207792207792208, + "loss": 1.0918, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.18866224586963654, + "learning_rate": 0.00012012987012987014, + "loss": 1.0014, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.207601398229599, + "learning_rate": 0.00011948051948051949, + "loss": 1.0726, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.21592366695404053, + "learning_rate": 0.00011883116883116883, + "loss": 1.1379, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2016124576330185, + "learning_rate": 0.0001181818181818182, + "loss": 1.1428, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.20478437840938568, + "learning_rate": 0.00011753246753246753, + "loss": 1.121, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.22730594873428345, + "learning_rate": 0.00011688311688311689, + "loss": 1.0319, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.22592711448669434, + "learning_rate": 0.00011623376623376625, + "loss": 1.1264, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.20035041868686676, + "learning_rate": 0.00011558441558441559, + "loss": 1.0686, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.20648567378520966, + "learning_rate": 0.00011493506493506494, + "loss": 1.0817, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.21222743391990662, + "learning_rate": 0.00011428571428571428, + "loss": 1.0678, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.2075391560792923, + "learning_rate": 0.00011363636363636365, + "loss": 1.0897, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.1964101791381836, + "learning_rate": 0.000112987012987013, + "loss": 1.0906, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.22406511008739471, + "learning_rate": 0.00011233766233766234, + "loss": 1.0594, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.23787978291511536, + 
"learning_rate": 0.00011168831168831168, + "loss": 1.1053, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.21196185052394867, + "learning_rate": 0.00011103896103896105, + "loss": 1.0923, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.21042804419994354, + "learning_rate": 0.0001103896103896104, + "loss": 1.0381, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.2267436534166336, + "learning_rate": 0.00010974025974025974, + "loss": 1.0818, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.23742735385894775, + "learning_rate": 0.00010909090909090909, + "loss": 1.0872, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.17787213623523712, + "learning_rate": 0.00010844155844155846, + "loss": 1.03, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.22422832250595093, + "learning_rate": 0.0001077922077922078, + "loss": 1.0738, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.22946301102638245, + "learning_rate": 0.00010714285714285715, + "loss": 1.0274, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.2137996405363083, + "learning_rate": 0.00010649350649350649, + "loss": 1.0539, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.1748756766319275, + "learning_rate": 0.00010584415584415586, + "loss": 1.0355, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.22275175154209137, + "learning_rate": 0.0001051948051948052, + "loss": 1.1696, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.20996077358722687, + "learning_rate": 0.00010454545454545455, + "loss": 1.0303, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.1945938766002655, + "learning_rate": 0.00010389610389610389, + "loss": 0.9747, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.1970377266407013, + "learning_rate": 0.00010324675324675325, + "loss": 1.0358, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.18814732134342194, + "learning_rate": 0.00010259740259740261, + "loss": 0.9612, + 
"step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.2153233289718628, + "learning_rate": 0.00010194805194805195, + "loss": 1.0749, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.21788008511066437, + "learning_rate": 0.0001012987012987013, + "loss": 1.0883, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.214650496840477, + "learning_rate": 0.00010064935064935067, + "loss": 1.0539, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.19312834739685059, + "learning_rate": 0.0001, + "loss": 1.0657, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.19916598498821259, + "learning_rate": 9.935064935064936e-05, + "loss": 1.0478, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2057606726884842, + "learning_rate": 9.870129870129871e-05, + "loss": 1.0094, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.22159607708454132, + "learning_rate": 9.805194805194806e-05, + "loss": 1.0952, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.18274275958538055, + "learning_rate": 9.74025974025974e-05, + "loss": 1.0065, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.19835162162780762, + "learning_rate": 9.675324675324677e-05, + "loss": 1.0742, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2114904820919037, + "learning_rate": 9.610389610389611e-05, + "loss": 1.1109, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.21488523483276367, + "learning_rate": 9.545454545454546e-05, + "loss": 1.0465, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.19870303571224213, + "learning_rate": 9.480519480519481e-05, + "loss": 1.0318, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.20413029193878174, + "learning_rate": 9.415584415584417e-05, + "loss": 1.0817, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.1847231239080429, + "learning_rate": 9.35064935064935e-05, + "loss": 1.0144, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.2715964913368225, + 
"learning_rate": 9.285714285714286e-05, + "loss": 0.9832, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.2225002497434616, + "learning_rate": 9.220779220779221e-05, + "loss": 1.1051, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.22931510210037231, + "learning_rate": 9.155844155844156e-05, + "loss": 1.1042, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.21848627924919128, + "learning_rate": 9.090909090909092e-05, + "loss": 1.1151, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.19852259755134583, + "learning_rate": 9.025974025974027e-05, + "loss": 1.0889, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2080363780260086, + "learning_rate": 8.961038961038961e-05, + "loss": 1.0777, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.22391024231910706, + "learning_rate": 8.896103896103896e-05, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.21793846786022186, + "learning_rate": 8.831168831168831e-05, + "loss": 1.044, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.2009749859571457, + "learning_rate": 8.766233766233767e-05, + "loss": 1.0198, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.19432318210601807, + "learning_rate": 8.701298701298701e-05, + "loss": 1.075, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.18634547293186188, + "learning_rate": 8.636363636363637e-05, + "loss": 0.9964, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.1947103589773178, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0025, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.23098671436309814, + "learning_rate": 8.506493506493507e-05, + "loss": 1.0562, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.19686414301395416, + "learning_rate": 8.441558441558442e-05, + "loss": 1.0285, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.19852428138256073, + "learning_rate": 8.376623376623377e-05, + "loss": 1.0054, + "step": 185 + }, 
+ { + "epoch": 0.5952, + "grad_norm": 0.21483510732650757, + "learning_rate": 8.311688311688312e-05, + "loss": 1.108, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.23313644528388977, + "learning_rate": 8.246753246753248e-05, + "loss": 1.1383, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.21453145146369934, + "learning_rate": 8.181818181818183e-05, + "loss": 1.0911, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.20268195867538452, + "learning_rate": 8.116883116883117e-05, + "loss": 1.0145, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.20576398074626923, + "learning_rate": 8.051948051948052e-05, + "loss": 1.0829, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.21732626855373383, + "learning_rate": 7.987012987012987e-05, + "loss": 1.0152, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.22046895325183868, + "learning_rate": 7.922077922077923e-05, + "loss": 1.1311, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.19727715849876404, + "learning_rate": 7.857142857142858e-05, + "loss": 1.0364, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.20861488580703735, + "learning_rate": 7.792207792207793e-05, + "loss": 1.0435, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.18545083701610565, + "learning_rate": 7.727272727272727e-05, + "loss": 1.0299, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.19965052604675293, + "learning_rate": 7.662337662337662e-05, + "loss": 1.0511, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.23673909902572632, + "learning_rate": 7.597402597402598e-05, + "loss": 1.081, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.17583179473876953, + "learning_rate": 7.532467532467533e-05, + "loss": 0.9808, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2129366099834442, + "learning_rate": 7.467532467532467e-05, + "loss": 1.0522, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.21679140627384186, + 
"learning_rate": 7.402597402597404e-05, + "loss": 1.0567, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.2032000720500946, + "learning_rate": 7.337662337662338e-05, + "loss": 1.0466, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.1887970268726349, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0329, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.21060192584991455, + "learning_rate": 7.207792207792208e-05, + "loss": 1.1021, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.21191425621509552, + "learning_rate": 7.142857142857143e-05, + "loss": 0.99, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.1995989829301834, + "learning_rate": 7.077922077922077e-05, + "loss": 1.0526, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.1849513053894043, + "learning_rate": 7.012987012987014e-05, + "loss": 0.9998, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.1948779672384262, + "learning_rate": 6.948051948051948e-05, + "loss": 1.075, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.20374052226543427, + "learning_rate": 6.883116883116883e-05, + "loss": 1.0933, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.2102465033531189, + "learning_rate": 6.818181818181818e-05, + "loss": 1.1123, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.21376173198223114, + "learning_rate": 6.753246753246754e-05, + "loss": 1.1233, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.20934203267097473, + "learning_rate": 6.688311688311688e-05, + "loss": 1.1374, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.18604128062725067, + "learning_rate": 6.623376623376624e-05, + "loss": 1.0213, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.19644233584403992, + "learning_rate": 6.55844155844156e-05, + "loss": 1.0046, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.18479463458061218, + "learning_rate": 6.493506493506494e-05, + "loss": 0.9792, + "step": 214 + }, + 
{ + "epoch": 0.688, + "grad_norm": 0.1945149153470993, + "learning_rate": 6.428571428571429e-05, + "loss": 1.0584, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2070147544145584, + "learning_rate": 6.363636363636364e-05, + "loss": 1.071, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.19645985960960388, + "learning_rate": 6.2987012987013e-05, + "loss": 1.0721, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.1960117667913437, + "learning_rate": 6.233766233766233e-05, + "loss": 1.071, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.20168261229991913, + "learning_rate": 6.16883116883117e-05, + "loss": 1.0808, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.21254412829875946, + "learning_rate": 6.103896103896104e-05, + "loss": 1.0287, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.21271063387393951, + "learning_rate": 6.03896103896104e-05, + "loss": 1.0605, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.2081408053636551, + "learning_rate": 5.9740259740259744e-05, + "loss": 1.091, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.21113798022270203, + "learning_rate": 5.90909090909091e-05, + "loss": 1.1323, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.20670844614505768, + "learning_rate": 5.844155844155844e-05, + "loss": 1.0955, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.2010120451450348, + "learning_rate": 5.7792207792207796e-05, + "loss": 1.1068, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.20379121601581573, + "learning_rate": 5.714285714285714e-05, + "loss": 1.0419, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.22799807786941528, + "learning_rate": 5.64935064935065e-05, + "loss": 1.0904, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.2005995213985443, + "learning_rate": 5.584415584415584e-05, + "loss": 1.078, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.20329605042934418, + "learning_rate": 
5.51948051948052e-05, + "loss": 1.0245, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.19283504784107208, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.0367, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.20624355971813202, + "learning_rate": 5.38961038961039e-05, + "loss": 1.1046, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.21362991631031036, + "learning_rate": 5.3246753246753245e-05, + "loss": 1.1104, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.20447863638401031, + "learning_rate": 5.25974025974026e-05, + "loss": 1.0514, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.1974381059408188, + "learning_rate": 5.1948051948051944e-05, + "loss": 1.0048, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.21237170696258545, + "learning_rate": 5.1298701298701304e-05, + "loss": 1.1299, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.21224971115589142, + "learning_rate": 5.064935064935065e-05, + "loss": 1.05, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.19865018129348755, + "learning_rate": 5e-05, + "loss": 1.0665, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.19199275970458984, + "learning_rate": 4.9350649350649355e-05, + "loss": 0.9531, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.19573214650154114, + "learning_rate": 4.87012987012987e-05, + "loss": 1.0318, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.21338805556297302, + "learning_rate": 4.8051948051948054e-05, + "loss": 1.0343, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.2254691869020462, + "learning_rate": 4.740259740259741e-05, + "loss": 1.0472, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.18101665377616882, + "learning_rate": 4.675324675324675e-05, + "loss": 1.017, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.22090592980384827, + "learning_rate": 4.6103896103896106e-05, + "loss": 1.0389, + "step": 243 + }, + { + "epoch": 0.7808, + 
"grad_norm": 0.20865507423877716, + "learning_rate": 4.545454545454546e-05, + "loss": 1.0369, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.21619610488414764, + "learning_rate": 4.4805194805194805e-05, + "loss": 1.109, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.21694771945476532, + "learning_rate": 4.415584415584416e-05, + "loss": 1.0525, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.2182662934064865, + "learning_rate": 4.3506493506493503e-05, + "loss": 1.0331, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.2026486098766327, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.027, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.19606547057628632, + "learning_rate": 4.220779220779221e-05, + "loss": 1.0242, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.22107470035552979, + "learning_rate": 4.155844155844156e-05, + "loss": 1.0924, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.19960008561611176, + "learning_rate": 4.0909090909090915e-05, + "loss": 1.0384, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.1945488154888153, + "learning_rate": 4.025974025974026e-05, + "loss": 1.0673, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.22067414224147797, + "learning_rate": 3.9610389610389614e-05, + "loss": 1.0426, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.19010980427265167, + "learning_rate": 3.8961038961038966e-05, + "loss": 1.0617, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.18781176209449768, + "learning_rate": 3.831168831168831e-05, + "loss": 1.0243, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.20388829708099365, + "learning_rate": 3.7662337662337665e-05, + "loss": 1.0476, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.19911155104637146, + "learning_rate": 3.701298701298702e-05, + "loss": 1.0324, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.19884039461612701, + "learning_rate": 
3.6363636363636364e-05, + "loss": 1.0242, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.19036105275154114, + "learning_rate": 3.571428571428572e-05, + "loss": 1.0323, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.20039844512939453, + "learning_rate": 3.506493506493507e-05, + "loss": 1.0749, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.1899934560060501, + "learning_rate": 3.4415584415584416e-05, + "loss": 1.0115, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.20019090175628662, + "learning_rate": 3.376623376623377e-05, + "loss": 1.0782, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.2020583152770996, + "learning_rate": 3.311688311688312e-05, + "loss": 1.0687, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.21407337486743927, + "learning_rate": 3.246753246753247e-05, + "loss": 1.1015, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.1871640682220459, + "learning_rate": 3.181818181818182e-05, + "loss": 0.9637, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.21622811257839203, + "learning_rate": 3.1168831168831166e-05, + "loss": 1.1222, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.22504661977291107, + "learning_rate": 3.051948051948052e-05, + "loss": 1.132, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.19177629053592682, + "learning_rate": 2.9870129870129872e-05, + "loss": 1.0281, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.1970544159412384, + "learning_rate": 2.922077922077922e-05, + "loss": 1.0393, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.21554522216320038, + "learning_rate": 2.857142857142857e-05, + "loss": 1.074, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.21131229400634766, + "learning_rate": 2.792207792207792e-05, + "loss": 1.054, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.19816523790359497, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.0456, + "step": 272 + }, + { + 
"epoch": 0.8736, + "grad_norm": 0.21075209975242615, + "learning_rate": 2.6623376623376623e-05, + "loss": 1.0758, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2296527624130249, + "learning_rate": 2.5974025974025972e-05, + "loss": 1.0917, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.19722610712051392, + "learning_rate": 2.5324675324675325e-05, + "loss": 1.0704, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.18721099197864532, + "learning_rate": 2.4675324675324678e-05, + "loss": 0.9919, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.20244193077087402, + "learning_rate": 2.4025974025974027e-05, + "loss": 1.0368, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.19518914818763733, + "learning_rate": 2.3376623376623376e-05, + "loss": 1.0436, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.19650357961654663, + "learning_rate": 2.272727272727273e-05, + "loss": 1.0306, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.20320096611976624, + "learning_rate": 2.207792207792208e-05, + "loss": 1.0941, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.18296951055526733, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.9802, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.21357610821723938, + "learning_rate": 2.077922077922078e-05, + "loss": 1.0449, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.193921759724617, + "learning_rate": 2.012987012987013e-05, + "loss": 1.0116, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.1953902244567871, + "learning_rate": 1.9480519480519483e-05, + "loss": 1.0105, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.19440975785255432, + "learning_rate": 1.8831168831168833e-05, + "loss": 0.9952, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.21054105460643768, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0701, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.18844804167747498, + 
"learning_rate": 1.7532467532467535e-05, + "loss": 1.0146, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.2067311704158783, + "learning_rate": 1.6883116883116884e-05, + "loss": 1.0781, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.1941213756799698, + "learning_rate": 1.6233766233766234e-05, + "loss": 0.9814, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.22726193070411682, + "learning_rate": 1.5584415584415583e-05, + "loss": 1.1431, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.18025581538677216, + "learning_rate": 1.4935064935064936e-05, + "loss": 0.9649, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.21535000205039978, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.0441, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.20014546811580658, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.0166, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.22738787531852722, + "learning_rate": 1.2987012987012986e-05, + "loss": 1.0564, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.2020861804485321, + "learning_rate": 1.2337662337662339e-05, + "loss": 1.1241, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.19888809323310852, + "learning_rate": 1.1688311688311688e-05, + "loss": 1.1114, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.20912377536296844, + "learning_rate": 1.103896103896104e-05, + "loss": 1.0971, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.21206621825695038, + "learning_rate": 1.038961038961039e-05, + "loss": 1.0601, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.18667680025100708, + "learning_rate": 9.740259740259742e-06, + "loss": 1.0291, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.21125559508800507, + "learning_rate": 9.090909090909091e-06, + "loss": 1.0483, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.21776145696640015, + "learning_rate": 8.441558441558442e-06, + "loss": 0.9912, + 
"step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.20144303143024445, + "learning_rate": 7.792207792207792e-06, + "loss": 1.0357, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.1984029859304428, + "learning_rate": 7.142857142857143e-06, + "loss": 1.0648, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.17972829937934875, + "learning_rate": 6.493506493506493e-06, + "loss": 1.0033, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.1818286031484604, + "learning_rate": 5.844155844155844e-06, + "loss": 0.997, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.19670912623405457, + "learning_rate": 5.194805194805195e-06, + "loss": 1.0256, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.20527283847332, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0348, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.19025909900665283, + "learning_rate": 3.896103896103896e-06, + "loss": 1.0682, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.19544818997383118, + "learning_rate": 3.2467532467532465e-06, + "loss": 0.9872, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.22112183272838593, + "learning_rate": 2.5974025974025976e-06, + "loss": 1.0661, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.23328153789043427, + "learning_rate": 1.948051948051948e-06, + "loss": 1.0691, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.20181375741958618, + "learning_rate": 1.2987012987012988e-06, + "loss": 0.9416, + "step": 312 + }, + { + "epoch": 1.0, + "grad_norm": 0.29312625527381897, + "learning_rate": 6.493506493506494e-07, + "loss": 1.1216, + "step": 313 + }, + { + "epoch": 0.12202467696492762, + "grad_norm": 0.2231415957212448, + "learning_rate": 0.0, + "loss": 1.0468, + "step": 314 + }, + { + "epoch": 0.12241329058583503, + "grad_norm": 0.22263288497924805, + "learning_rate": 0.00017594394706111328, + "loss": 1.0399, + "step": 315 + }, + { + "epoch": 0.12280190420674245, + 
"grad_norm": 0.22909891605377197, + "learning_rate": 0.00017586609575710393, + "loss": 1.1069, + "step": 316 + }, + { + "epoch": 0.12319051782764986, + "grad_norm": 0.23951445519924164, + "learning_rate": 0.0001757882444530946, + "loss": 1.1036, + "step": 317 + }, + { + "epoch": 0.12357913144855727, + "grad_norm": 0.2409268021583557, + "learning_rate": 0.00017571039314908526, + "loss": 1.1114, + "step": 318 + }, + { + "epoch": 0.12396774506946469, + "grad_norm": 0.23753899335861206, + "learning_rate": 0.00017563254184507592, + "loss": 1.1297, + "step": 319 + }, + { + "epoch": 0.12435635869037209, + "grad_norm": 0.2823902666568756, + "learning_rate": 0.00017555469054106657, + "loss": 1.1293, + "step": 320 + }, + { + "epoch": 0.12474497231127951, + "grad_norm": 0.24093545973300934, + "learning_rate": 0.00017547683923705722, + "loss": 1.0678, + "step": 321 + }, + { + "epoch": 0.12513358593218693, + "grad_norm": 0.22565563023090363, + "learning_rate": 0.0001753989879330479, + "loss": 1.1408, + "step": 322 + }, + { + "epoch": 0.12552219955309435, + "grad_norm": 0.22569572925567627, + "learning_rate": 0.00017532113662903855, + "loss": 1.0543, + "step": 323 + }, + { + "epoch": 0.12591081317400174, + "grad_norm": 0.24962866306304932, + "learning_rate": 0.0001752432853250292, + "loss": 1.0818, + "step": 324 + }, + { + "epoch": 0.12629942679490916, + "grad_norm": 0.22184576094150543, + "learning_rate": 0.00017516543402101986, + "loss": 1.0835, + "step": 325 + }, + { + "epoch": 0.12668804041581658, + "grad_norm": 0.2572194039821625, + "learning_rate": 0.0001750875827170105, + "loss": 1.0767, + "step": 326 + }, + { + "epoch": 0.127076654036724, + "grad_norm": 0.24131342768669128, + "learning_rate": 0.00017500973141300116, + "loss": 1.0981, + "step": 327 + }, + { + "epoch": 0.1274652676576314, + "grad_norm": 0.2386389970779419, + "learning_rate": 0.00017493188010899184, + "loss": 1.0828, + "step": 328 + }, + { + "epoch": 0.1278538812785388, + "grad_norm": 0.2654125690460205, + 
"learning_rate": 0.0001748540288049825, + "loss": 1.1266, + "step": 329 + }, + { + "epoch": 0.12824249489944622, + "grad_norm": 0.2925739884376526, + "learning_rate": 0.00017477617750097314, + "loss": 1.0983, + "step": 330 + }, + { + "epoch": 0.12863110852035364, + "grad_norm": 0.26589342951774597, + "learning_rate": 0.0001746983261969638, + "loss": 1.1029, + "step": 331 + }, + { + "epoch": 0.12901972214126106, + "grad_norm": 0.24565957486629486, + "learning_rate": 0.00017462047489295445, + "loss": 1.0975, + "step": 332 + }, + { + "epoch": 0.12940833576216845, + "grad_norm": 0.2459682673215866, + "learning_rate": 0.00017454262358894513, + "loss": 1.0566, + "step": 333 + }, + { + "epoch": 0.12979694938307587, + "grad_norm": 0.23349183797836304, + "learning_rate": 0.00017446477228493578, + "loss": 1.0833, + "step": 334 + }, + { + "epoch": 0.1301855630039833, + "grad_norm": 0.26166337728500366, + "learning_rate": 0.00017438692098092643, + "loss": 1.1598, + "step": 335 + }, + { + "epoch": 0.1305741766248907, + "grad_norm": 0.24188168346881866, + "learning_rate": 0.00017430906967691708, + "loss": 1.0728, + "step": 336 + }, + { + "epoch": 0.13096279024579813, + "grad_norm": 0.22922398149967194, + "learning_rate": 0.00017423121837290773, + "loss": 1.0311, + "step": 337 + }, + { + "epoch": 0.13135140386670552, + "grad_norm": 0.2652754485607147, + "learning_rate": 0.00017415336706889841, + "loss": 1.1096, + "step": 338 + }, + { + "epoch": 0.13174001748761294, + "grad_norm": 0.2355881780385971, + "learning_rate": 0.00017407551576488907, + "loss": 1.0964, + "step": 339 + }, + { + "epoch": 0.13212863110852036, + "grad_norm": 0.244523823261261, + "learning_rate": 0.00017399766446087972, + "loss": 1.142, + "step": 340 + }, + { + "epoch": 0.13251724472942777, + "grad_norm": 0.24705976247787476, + "learning_rate": 0.00017391981315687037, + "loss": 1.0943, + "step": 341 + }, + { + "epoch": 0.13290585835033517, + "grad_norm": 0.22817552089691162, + "learning_rate": 
0.00017384196185286102, + "loss": 1.0621, + "step": 342 + }, + { + "epoch": 0.13329447197124258, + "grad_norm": 0.22605225443840027, + "learning_rate": 0.0001737641105488517, + "loss": 1.0714, + "step": 343 + }, + { + "epoch": 0.13368308559215, + "grad_norm": 0.2584545314311981, + "learning_rate": 0.00017368625924484235, + "loss": 1.1367, + "step": 344 + }, + { + "epoch": 0.13407169921305742, + "grad_norm": 0.2248220443725586, + "learning_rate": 0.000173608407940833, + "loss": 1.0872, + "step": 345 + }, + { + "epoch": 0.13446031283396484, + "grad_norm": 0.2141868770122528, + "learning_rate": 0.00017353055663682368, + "loss": 1.0572, + "step": 346 + }, + { + "epoch": 0.13484892645487223, + "grad_norm": 0.2615523934364319, + "learning_rate": 0.00017345270533281434, + "loss": 1.1048, + "step": 347 + }, + { + "epoch": 0.13523754007577965, + "grad_norm": 0.22990448772907257, + "learning_rate": 0.000173374854028805, + "loss": 1.0528, + "step": 348 + }, + { + "epoch": 0.13562615369668707, + "grad_norm": 0.2132262885570526, + "learning_rate": 0.00017329700272479564, + "loss": 1.0476, + "step": 349 + }, + { + "epoch": 0.1360147673175945, + "grad_norm": 0.2578272819519043, + "learning_rate": 0.00017321915142078632, + "loss": 1.0852, + "step": 350 + }, + { + "epoch": 0.1364033809385019, + "grad_norm": 0.22881457209587097, + "learning_rate": 0.00017314130011677697, + "loss": 1.1017, + "step": 351 + }, + { + "epoch": 0.1367919945594093, + "grad_norm": 0.21067696809768677, + "learning_rate": 0.00017306344881276762, + "loss": 1.0444, + "step": 352 + }, + { + "epoch": 0.13718060818031672, + "grad_norm": 0.2304215282201767, + "learning_rate": 0.0001729855975087583, + "loss": 1.0737, + "step": 353 + }, + { + "epoch": 0.13756922180122413, + "grad_norm": 0.2031925916671753, + "learning_rate": 0.00017290774620474895, + "loss": 1.0036, + "step": 354 + }, + { + "epoch": 0.13795783542213155, + "grad_norm": 0.27281051874160767, + "learning_rate": 0.0001728298949007396, + "loss": 1.148, + 
"step": 355 + }, + { + "epoch": 0.13834644904303897, + "grad_norm": 0.204191654920578, + "learning_rate": 0.00017275204359673026, + "loss": 0.9607, + "step": 356 + }, + { + "epoch": 0.13873506266394636, + "grad_norm": 0.221976637840271, + "learning_rate": 0.0001726741922927209, + "loss": 1.1068, + "step": 357 + }, + { + "epoch": 0.13912367628485378, + "grad_norm": 0.20831729471683502, + "learning_rate": 0.0001725963409887116, + "loss": 1.034, + "step": 358 + }, + { + "epoch": 0.1395122899057612, + "grad_norm": 0.21639779210090637, + "learning_rate": 0.00017251848968470224, + "loss": 1.0613, + "step": 359 + }, + { + "epoch": 0.13990090352666862, + "grad_norm": 0.1959424465894699, + "learning_rate": 0.0001724406383806929, + "loss": 1.0506, + "step": 360 + }, + { + "epoch": 0.140289517147576, + "grad_norm": 0.2044398933649063, + "learning_rate": 0.00017236278707668355, + "loss": 1.0316, + "step": 361 + }, + { + "epoch": 0.14067813076848343, + "grad_norm": 0.21483004093170166, + "learning_rate": 0.0001722849357726742, + "loss": 1.0361, + "step": 362 + }, + { + "epoch": 0.14106674438939085, + "grad_norm": 0.237701416015625, + "learning_rate": 0.00017220708446866485, + "loss": 1.1264, + "step": 363 + }, + { + "epoch": 0.14145535801029827, + "grad_norm": 0.20750795304775238, + "learning_rate": 0.00017212923316465553, + "loss": 1.0523, + "step": 364 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 0.2252965271472931, + "learning_rate": 0.00017205138186064618, + "loss": 1.0764, + "step": 365 + }, + { + "epoch": 0.14223258525211308, + "grad_norm": 0.2033565789461136, + "learning_rate": 0.00017197353055663683, + "loss": 1.064, + "step": 366 + }, + { + "epoch": 0.1426211988730205, + "grad_norm": 0.21123190224170685, + "learning_rate": 0.00017189567925262749, + "loss": 1.0515, + "step": 367 + }, + { + "epoch": 0.1430098124939279, + "grad_norm": 0.20646221935749054, + "learning_rate": 0.00017181782794861814, + "loss": 1.0617, + "step": 368 + }, + { + "epoch": 
0.14339842611483533, + "grad_norm": 0.2079589068889618, + "learning_rate": 0.00017173997664460882, + "loss": 1.0569, + "step": 369 + }, + { + "epoch": 0.14378703973574275, + "grad_norm": 0.216246098279953, + "learning_rate": 0.00017166212534059947, + "loss": 1.0986, + "step": 370 + }, + { + "epoch": 0.14417565335665014, + "grad_norm": 0.20711806416511536, + "learning_rate": 0.00017158427403659012, + "loss": 1.1342, + "step": 371 + }, + { + "epoch": 0.14456426697755756, + "grad_norm": 0.235435351729393, + "learning_rate": 0.00017150642273258077, + "loss": 1.1082, + "step": 372 + }, + { + "epoch": 0.14495288059846498, + "grad_norm": 0.2273191511631012, + "learning_rate": 0.00017142857142857143, + "loss": 1.1064, + "step": 373 + }, + { + "epoch": 0.1453414942193724, + "grad_norm": 0.2075672745704651, + "learning_rate": 0.0001713507201245621, + "loss": 1.0536, + "step": 374 + }, + { + "epoch": 0.14573010784027982, + "grad_norm": 0.20764274895191193, + "learning_rate": 0.00017127286882055276, + "loss": 1.0673, + "step": 375 + }, + { + "epoch": 0.1461187214611872, + "grad_norm": 0.2441243678331375, + "learning_rate": 0.0001711950175165434, + "loss": 1.1271, + "step": 376 + }, + { + "epoch": 0.14650733508209463, + "grad_norm": 0.2383374124765396, + "learning_rate": 0.00017111716621253406, + "loss": 1.083, + "step": 377 + }, + { + "epoch": 0.14689594870300204, + "grad_norm": 0.2172410786151886, + "learning_rate": 0.0001710393149085247, + "loss": 1.0605, + "step": 378 + }, + { + "epoch": 0.14728456232390946, + "grad_norm": 0.22591541707515717, + "learning_rate": 0.0001709614636045154, + "loss": 1.0931, + "step": 379 + }, + { + "epoch": 0.14767317594481685, + "grad_norm": 0.23099495470523834, + "learning_rate": 0.00017088361230050604, + "loss": 1.1021, + "step": 380 + }, + { + "epoch": 0.14806178956572427, + "grad_norm": 0.21461094915866852, + "learning_rate": 0.0001708057609964967, + "loss": 1.0959, + "step": 381 + }, + { + "epoch": 0.1484504031866317, + "grad_norm": 
0.21557241678237915, + "learning_rate": 0.00017072790969248735, + "loss": 1.0155, + "step": 382 + }, + { + "epoch": 0.1488390168075391, + "grad_norm": 0.234396293759346, + "learning_rate": 0.000170650058388478, + "loss": 1.1289, + "step": 383 + }, + { + "epoch": 0.14922763042844653, + "grad_norm": 0.22895503044128418, + "learning_rate": 0.00017057220708446868, + "loss": 0.9919, + "step": 384 + }, + { + "epoch": 0.14961624404935392, + "grad_norm": 0.2054683268070221, + "learning_rate": 0.00017049435578045933, + "loss": 1.0607, + "step": 385 + }, + { + "epoch": 0.15000485767026134, + "grad_norm": 0.25569215416908264, + "learning_rate": 0.00017041650447644998, + "loss": 1.0517, + "step": 386 + }, + { + "epoch": 0.15039347129116876, + "grad_norm": 0.2222641259431839, + "learning_rate": 0.00017033865317244064, + "loss": 1.0404, + "step": 387 + }, + { + "epoch": 0.15078208491207618, + "grad_norm": 0.20501169562339783, + "learning_rate": 0.0001702608018684313, + "loss": 0.9897, + "step": 388 + }, + { + "epoch": 0.1511706985329836, + "grad_norm": 0.22080403566360474, + "learning_rate": 0.00017018295056442197, + "loss": 1.1013, + "step": 389 + }, + { + "epoch": 0.15155931215389098, + "grad_norm": 0.21218529343605042, + "learning_rate": 0.00017010509926041262, + "loss": 1.0541, + "step": 390 + }, + { + "epoch": 0.1519479257747984, + "grad_norm": 0.23064807057380676, + "learning_rate": 0.00017002724795640327, + "loss": 1.037, + "step": 391 + }, + { + "epoch": 0.15233653939570582, + "grad_norm": 0.21164493262767792, + "learning_rate": 0.00016994939665239392, + "loss": 1.0769, + "step": 392 + }, + { + "epoch": 0.15272515301661324, + "grad_norm": 0.22565549612045288, + "learning_rate": 0.00016987154534838457, + "loss": 1.0638, + "step": 393 + }, + { + "epoch": 0.15311376663752063, + "grad_norm": 0.22492647171020508, + "learning_rate": 0.00016979369404437525, + "loss": 1.063, + "step": 394 + }, + { + "epoch": 0.15350238025842805, + "grad_norm": 0.22335395216941833, + 
"learning_rate": 0.0001697158427403659, + "loss": 1.1032, + "step": 395 + }, + { + "epoch": 0.15389099387933547, + "grad_norm": 0.2164154201745987, + "learning_rate": 0.00016963799143635656, + "loss": 1.1275, + "step": 396 + }, + { + "epoch": 0.1542796075002429, + "grad_norm": 0.22547736763954163, + "learning_rate": 0.0001695601401323472, + "loss": 1.1324, + "step": 397 + }, + { + "epoch": 0.1546682211211503, + "grad_norm": 0.2028045952320099, + "learning_rate": 0.0001694822888283379, + "loss": 1.0057, + "step": 398 + }, + { + "epoch": 0.1550568347420577, + "grad_norm": 0.20770573616027832, + "learning_rate": 0.00016940443752432854, + "loss": 1.0311, + "step": 399 + }, + { + "epoch": 0.15544544836296512, + "grad_norm": 0.2231476902961731, + "learning_rate": 0.0001693265862203192, + "loss": 1.0535, + "step": 400 + }, + { + "epoch": 0.15583406198387253, + "grad_norm": 0.21618099510669708, + "learning_rate": 0.00016924873491630987, + "loss": 1.0616, + "step": 401 + }, + { + "epoch": 0.15622267560477995, + "grad_norm": 0.24024419486522675, + "learning_rate": 0.00016917088361230052, + "loss": 1.1324, + "step": 402 + }, + { + "epoch": 0.15661128922568737, + "grad_norm": 0.2002171128988266, + "learning_rate": 0.00016909303230829118, + "loss": 1.015, + "step": 403 + }, + { + "epoch": 0.15699990284659476, + "grad_norm": 0.21771477162837982, + "learning_rate": 0.00016901518100428183, + "loss": 1.0817, + "step": 404 + }, + { + "epoch": 0.15738851646750218, + "grad_norm": 0.22052259743213654, + "learning_rate": 0.0001689373297002725, + "loss": 1.0836, + "step": 405 + }, + { + "epoch": 0.1577771300884096, + "grad_norm": 0.1964062750339508, + "learning_rate": 0.00016885947839626316, + "loss": 1.0505, + "step": 406 + }, + { + "epoch": 0.15816574370931702, + "grad_norm": 0.22714298963546753, + "learning_rate": 0.0001687816270922538, + "loss": 1.0702, + "step": 407 + }, + { + "epoch": 0.15855435733022444, + "grad_norm": 0.20647728443145752, + "learning_rate": 
0.00016870377578824446, + "loss": 1.0349, + "step": 408 + }, + { + "epoch": 0.15894297095113183, + "grad_norm": 0.2355160117149353, + "learning_rate": 0.00016862592448423512, + "loss": 1.0305, + "step": 409 + }, + { + "epoch": 0.15933158457203925, + "grad_norm": 0.22890770435333252, + "learning_rate": 0.0001685480731802258, + "loss": 1.0854, + "step": 410 + }, + { + "epoch": 0.15972019819294667, + "grad_norm": 0.21947838366031647, + "learning_rate": 0.00016847022187621645, + "loss": 1.0948, + "step": 411 + }, + { + "epoch": 0.16010881181385409, + "grad_norm": 0.22334899008274078, + "learning_rate": 0.0001683923705722071, + "loss": 1.006, + "step": 412 + }, + { + "epoch": 0.16049742543476148, + "grad_norm": 0.22324936091899872, + "learning_rate": 0.00016831451926819775, + "loss": 1.0402, + "step": 413 + }, + { + "epoch": 0.1608860390556689, + "grad_norm": 0.21462097764015198, + "learning_rate": 0.0001682366679641884, + "loss": 1.077, + "step": 414 + }, + { + "epoch": 0.1612746526765763, + "grad_norm": 0.24567006528377533, + "learning_rate": 0.00016815881666017908, + "loss": 1.15, + "step": 415 + }, + { + "epoch": 0.16166326629748373, + "grad_norm": 0.26437243819236755, + "learning_rate": 0.00016808096535616973, + "loss": 1.1251, + "step": 416 + }, + { + "epoch": 0.16205187991839115, + "grad_norm": 0.2217959761619568, + "learning_rate": 0.00016800311405216039, + "loss": 1.1103, + "step": 417 + }, + { + "epoch": 0.16244049353929854, + "grad_norm": 0.24402475357055664, + "learning_rate": 0.00016792526274815104, + "loss": 1.0672, + "step": 418 + }, + { + "epoch": 0.16282910716020596, + "grad_norm": 0.21609526872634888, + "learning_rate": 0.0001678474114441417, + "loss": 1.0291, + "step": 419 + }, + { + "epoch": 0.16321772078111338, + "grad_norm": 0.20054642856121063, + "learning_rate": 0.00016776956014013237, + "loss": 1.0704, + "step": 420 + }, + { + "epoch": 0.1636063344020208, + "grad_norm": 0.22864869236946106, + "learning_rate": 0.00016769170883612302, + "loss": 
1.0612, + "step": 421 + }, + { + "epoch": 0.16399494802292822, + "grad_norm": 0.22651974856853485, + "learning_rate": 0.00016761385753211367, + "loss": 1.0749, + "step": 422 + }, + { + "epoch": 0.1643835616438356, + "grad_norm": 0.21587328612804413, + "learning_rate": 0.00016753600622810433, + "loss": 1.0398, + "step": 423 + }, + { + "epoch": 0.16477217526474303, + "grad_norm": 0.1953774094581604, + "learning_rate": 0.00016745815492409498, + "loss": 1.0275, + "step": 424 + }, + { + "epoch": 0.16516078888565044, + "grad_norm": 0.21803410351276398, + "learning_rate": 0.00016738030362008566, + "loss": 1.1219, + "step": 425 + }, + { + "epoch": 0.16554940250655786, + "grad_norm": 0.2034682035446167, + "learning_rate": 0.0001673024523160763, + "loss": 1.0342, + "step": 426 + }, + { + "epoch": 0.16593801612746525, + "grad_norm": 0.20135951042175293, + "learning_rate": 0.00016722460101206696, + "loss": 0.9802, + "step": 427 + }, + { + "epoch": 0.16632662974837267, + "grad_norm": 0.23310376703739166, + "learning_rate": 0.0001671467497080576, + "loss": 1.0789, + "step": 428 + }, + { + "epoch": 0.1667152433692801, + "grad_norm": 0.21475404500961304, + "learning_rate": 0.00016706889840404827, + "loss": 1.0416, + "step": 429 + }, + { + "epoch": 0.1671038569901875, + "grad_norm": 0.21661072969436646, + "learning_rate": 0.00016699104710003894, + "loss": 1.0568, + "step": 430 + }, + { + "epoch": 0.16749247061109493, + "grad_norm": 0.20310629904270172, + "learning_rate": 0.0001669131957960296, + "loss": 0.9968, + "step": 431 + }, + { + "epoch": 0.16788108423200232, + "grad_norm": 0.2596947252750397, + "learning_rate": 0.00016683534449202025, + "loss": 1.0478, + "step": 432 + }, + { + "epoch": 0.16826969785290974, + "grad_norm": 0.22226987779140472, + "learning_rate": 0.0001667574931880109, + "loss": 1.0898, + "step": 433 + }, + { + "epoch": 0.16865831147381716, + "grad_norm": 0.22499911487102509, + "learning_rate": 0.00016667964188400155, + "loss": 1.07, + "step": 434 + }, + { + 
"epoch": 0.16904692509472458, + "grad_norm": 0.2717292308807373, + "learning_rate": 0.0001666017905799922, + "loss": 1.0562, + "step": 435 + }, + { + "epoch": 0.169435538715632, + "grad_norm": 0.22052323818206787, + "learning_rate": 0.00016652393927598288, + "loss": 1.0732, + "step": 436 + }, + { + "epoch": 0.16982415233653939, + "grad_norm": 0.21741728484630585, + "learning_rate": 0.00016644608797197354, + "loss": 1.0409, + "step": 437 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.20701193809509277, + "learning_rate": 0.0001663682366679642, + "loss": 1.0731, + "step": 438 + }, + { + "epoch": 0.17060137957835422, + "grad_norm": 0.22071130573749542, + "learning_rate": 0.00016629038536395484, + "loss": 1.0992, + "step": 439 + }, + { + "epoch": 0.17098999319926164, + "grad_norm": 0.20261412858963013, + "learning_rate": 0.0001662125340599455, + "loss": 1.0051, + "step": 440 + }, + { + "epoch": 0.17137860682016906, + "grad_norm": 0.2082947939634323, + "learning_rate": 0.00016613468275593617, + "loss": 1.0477, + "step": 441 + }, + { + "epoch": 0.17176722044107645, + "grad_norm": 0.22534717619419098, + "learning_rate": 0.00016605683145192682, + "loss": 1.041, + "step": 442 + }, + { + "epoch": 0.17215583406198387, + "grad_norm": 0.21547731757164001, + "learning_rate": 0.00016597898014791748, + "loss": 1.0528, + "step": 443 + }, + { + "epoch": 0.1725444476828913, + "grad_norm": 0.24141089618206024, + "learning_rate": 0.00016590112884390813, + "loss": 1.0928, + "step": 444 + }, + { + "epoch": 0.1729330613037987, + "grad_norm": 0.21910884976387024, + "learning_rate": 0.00016582327753989878, + "loss": 1.063, + "step": 445 + }, + { + "epoch": 0.1733216749247061, + "grad_norm": 0.21782316267490387, + "learning_rate": 0.00016574542623588946, + "loss": 1.0976, + "step": 446 + }, + { + "epoch": 0.17371028854561352, + "grad_norm": 0.21771778166294098, + "learning_rate": 0.0001656675749318801, + "loss": 1.0677, + "step": 447 + }, + { + "epoch": 0.17409890216652094, + 
"grad_norm": 0.22117659449577332, + "learning_rate": 0.00016558972362787076, + "loss": 1.0669, + "step": 448 + }, + { + "epoch": 0.17448751578742835, + "grad_norm": 0.21918092668056488, + "learning_rate": 0.00016551187232386141, + "loss": 1.0955, + "step": 449 + }, + { + "epoch": 0.17487612940833577, + "grad_norm": 0.22027818858623505, + "learning_rate": 0.0001654340210198521, + "loss": 1.0201, + "step": 450 + }, + { + "epoch": 0.17526474302924316, + "grad_norm": 0.2042885720729828, + "learning_rate": 0.00016535616971584275, + "loss": 1.0881, + "step": 451 + }, + { + "epoch": 0.17565335665015058, + "grad_norm": 0.21788261830806732, + "learning_rate": 0.0001652783184118334, + "loss": 1.0918, + "step": 452 + }, + { + "epoch": 0.176041970271058, + "grad_norm": 0.23332571983337402, + "learning_rate": 0.00016520046710782408, + "loss": 1.091, + "step": 453 + }, + { + "epoch": 0.17643058389196542, + "grad_norm": 0.20204192399978638, + "learning_rate": 0.00016512261580381473, + "loss": 1.0366, + "step": 454 + }, + { + "epoch": 0.17681919751287284, + "grad_norm": 0.21761906147003174, + "learning_rate": 0.00016504476449980538, + "loss": 1.0131, + "step": 455 + }, + { + "epoch": 0.17720781113378023, + "grad_norm": 0.2152051478624344, + "learning_rate": 0.00016496691319579606, + "loss": 1.0868, + "step": 456 + }, + { + "epoch": 0.17759642475468765, + "grad_norm": 0.22776494920253754, + "learning_rate": 0.0001648890618917867, + "loss": 1.0807, + "step": 457 + }, + { + "epoch": 0.17798503837559507, + "grad_norm": 0.2171342968940735, + "learning_rate": 0.00016481121058777736, + "loss": 1.0537, + "step": 458 + }, + { + "epoch": 0.17837365199650249, + "grad_norm": 0.2046273946762085, + "learning_rate": 0.00016473335928376802, + "loss": 1.0097, + "step": 459 + }, + { + "epoch": 0.17876226561740988, + "grad_norm": 0.2047681361436844, + "learning_rate": 0.00016465550797975867, + "loss": 1.0204, + "step": 460 + }, + { + "epoch": 0.1791508792383173, + "grad_norm": 0.1876862645149231, + 
"learning_rate": 0.00016457765667574935, + "loss": 0.9383, + "step": 461 + }, + { + "epoch": 0.17953949285922471, + "grad_norm": 0.218430757522583, + "learning_rate": 0.00016449980537174, + "loss": 1.0721, + "step": 462 + }, + { + "epoch": 0.17992810648013213, + "grad_norm": 0.2245480865240097, + "learning_rate": 0.00016442195406773065, + "loss": 1.0859, + "step": 463 + }, + { + "epoch": 0.18031672010103955, + "grad_norm": 0.22577151656150818, + "learning_rate": 0.0001643441027637213, + "loss": 1.0825, + "step": 464 + }, + { + "epoch": 0.18070533372194694, + "grad_norm": 0.20132745802402496, + "learning_rate": 0.00016426625145971196, + "loss": 1.0615, + "step": 465 + }, + { + "epoch": 0.18109394734285436, + "grad_norm": 0.2277505248785019, + "learning_rate": 0.00016418840015570263, + "loss": 1.0426, + "step": 466 + }, + { + "epoch": 0.18148256096376178, + "grad_norm": 0.22540105879306793, + "learning_rate": 0.0001641105488516933, + "loss": 1.0481, + "step": 467 + }, + { + "epoch": 0.1818711745846692, + "grad_norm": 0.20358088612556458, + "learning_rate": 0.00016403269754768394, + "loss": 1.0286, + "step": 468 + }, + { + "epoch": 0.18225978820557662, + "grad_norm": 0.22534145414829254, + "learning_rate": 0.0001639548462436746, + "loss": 1.1183, + "step": 469 + }, + { + "epoch": 0.182648401826484, + "grad_norm": 0.2188873142004013, + "learning_rate": 0.00016387699493966524, + "loss": 1.0439, + "step": 470 + }, + { + "epoch": 0.18303701544739143, + "grad_norm": 0.2128048539161682, + "learning_rate": 0.00016379914363565592, + "loss": 1.027, + "step": 471 + }, + { + "epoch": 0.18342562906829885, + "grad_norm": 0.2518141567707062, + "learning_rate": 0.00016372129233164657, + "loss": 1.0468, + "step": 472 + }, + { + "epoch": 0.18381424268920626, + "grad_norm": 0.2189142256975174, + "learning_rate": 0.00016364344102763723, + "loss": 1.0581, + "step": 473 + }, + { + "epoch": 0.18420285631011368, + "grad_norm": 0.31266725063323975, + "learning_rate": 0.00016356558972362788, 
+ "loss": 1.0554, + "step": 474 + }, + { + "epoch": 0.18459146993102107, + "grad_norm": 0.21343916654586792, + "learning_rate": 0.00016348773841961853, + "loss": 1.0795, + "step": 475 + }, + { + "epoch": 0.1849800835519285, + "grad_norm": 0.22907280921936035, + "learning_rate": 0.00016340988711560918, + "loss": 1.0304, + "step": 476 + }, + { + "epoch": 0.1853686971728359, + "grad_norm": 0.2105257511138916, + "learning_rate": 0.00016333203581159986, + "loss": 1.0231, + "step": 477 + }, + { + "epoch": 0.18575731079374333, + "grad_norm": 0.19537831842899323, + "learning_rate": 0.00016325418450759051, + "loss": 1.0103, + "step": 478 + }, + { + "epoch": 0.18614592441465072, + "grad_norm": 0.20522372424602509, + "learning_rate": 0.00016317633320358117, + "loss": 1.0196, + "step": 479 + }, + { + "epoch": 0.18653453803555814, + "grad_norm": 0.21646477282047272, + "learning_rate": 0.00016309848189957182, + "loss": 1.0579, + "step": 480 + }, + { + "epoch": 0.18692315165646556, + "grad_norm": 0.21077193319797516, + "learning_rate": 0.00016302063059556247, + "loss": 1.0638, + "step": 481 + }, + { + "epoch": 0.18731176527737298, + "grad_norm": 0.20357473194599152, + "learning_rate": 0.00016294277929155315, + "loss": 1.0635, + "step": 482 + }, + { + "epoch": 0.1877003788982804, + "grad_norm": 0.2188001275062561, + "learning_rate": 0.0001628649279875438, + "loss": 1.0267, + "step": 483 + }, + { + "epoch": 0.1880889925191878, + "grad_norm": 0.2128928154706955, + "learning_rate": 0.00016278707668353445, + "loss": 0.9706, + "step": 484 + }, + { + "epoch": 0.1884776061400952, + "grad_norm": 0.22081372141838074, + "learning_rate": 0.0001627092253795251, + "loss": 1.08, + "step": 485 + }, + { + "epoch": 0.18886621976100262, + "grad_norm": 0.2250615805387497, + "learning_rate": 0.00016263137407551576, + "loss": 1.1451, + "step": 486 + }, + { + "epoch": 0.18925483338191004, + "grad_norm": 0.1984967589378357, + "learning_rate": 0.00016255352277150644, + "loss": 1.0744, + "step": 487 + }, 
+ { + "epoch": 0.18964344700281746, + "grad_norm": 0.20778900384902954, + "learning_rate": 0.0001624756714674971, + "loss": 1.0623, + "step": 488 + }, + { + "epoch": 0.19003206062372485, + "grad_norm": 0.2026563137769699, + "learning_rate": 0.00016239782016348774, + "loss": 1.0714, + "step": 489 + }, + { + "epoch": 0.19042067424463227, + "grad_norm": 0.21598374843597412, + "learning_rate": 0.0001623199688594784, + "loss": 1.0869, + "step": 490 + }, + { + "epoch": 0.1908092878655397, + "grad_norm": 0.18944978713989258, + "learning_rate": 0.00016224211755546904, + "loss": 1.055, + "step": 491 + }, + { + "epoch": 0.1911979014864471, + "grad_norm": 0.20698946714401245, + "learning_rate": 0.00016216426625145972, + "loss": 1.0392, + "step": 492 + }, + { + "epoch": 0.1915865151073545, + "grad_norm": 0.22395353019237518, + "learning_rate": 0.00016208641494745038, + "loss": 1.0681, + "step": 493 + }, + { + "epoch": 0.19197512872826192, + "grad_norm": 0.22372962534427643, + "learning_rate": 0.00016200856364344103, + "loss": 1.0767, + "step": 494 + }, + { + "epoch": 0.19236374234916934, + "grad_norm": 0.2066701054573059, + "learning_rate": 0.00016193071233943168, + "loss": 1.0061, + "step": 495 + }, + { + "epoch": 0.19275235597007676, + "grad_norm": 0.19716408848762512, + "learning_rate": 0.00016185286103542233, + "loss": 1.039, + "step": 496 + }, + { + "epoch": 0.19314096959098417, + "grad_norm": 0.22159601747989655, + "learning_rate": 0.000161775009731413, + "loss": 1.0832, + "step": 497 + }, + { + "epoch": 0.19352958321189156, + "grad_norm": 0.21509626507759094, + "learning_rate": 0.00016169715842740366, + "loss": 1.0264, + "step": 498 + }, + { + "epoch": 0.19391819683279898, + "grad_norm": 0.21598199009895325, + "learning_rate": 0.00016161930712339431, + "loss": 1.049, + "step": 499 + }, + { + "epoch": 0.1943068104537064, + "grad_norm": 0.20279590785503387, + "learning_rate": 0.00016154145581938497, + "loss": 1.0505, + "step": 500 + }, + { + "epoch": 0.19469542407461382, 
+ "grad_norm": 0.21796855330467224, + "learning_rate": 0.00016146360451537565, + "loss": 1.0885, + "step": 501 + }, + { + "epoch": 0.19508403769552124, + "grad_norm": 0.22128933668136597, + "learning_rate": 0.0001613857532113663, + "loss": 1.0903, + "step": 502 + }, + { + "epoch": 0.19547265131642863, + "grad_norm": 0.2032536417245865, + "learning_rate": 0.00016130790190735695, + "loss": 1.0285, + "step": 503 + }, + { + "epoch": 0.19586126493733605, + "grad_norm": 0.23738974332809448, + "learning_rate": 0.0001612300506033476, + "loss": 1.1188, + "step": 504 + }, + { + "epoch": 0.19624987855824347, + "grad_norm": 0.19614790380001068, + "learning_rate": 0.00016115219929933828, + "loss": 1.04, + "step": 505 + }, + { + "epoch": 0.1966384921791509, + "grad_norm": 0.2198178917169571, + "learning_rate": 0.00016107434799532893, + "loss": 1.0696, + "step": 506 + }, + { + "epoch": 0.1970271058000583, + "grad_norm": 0.18814648687839508, + "learning_rate": 0.00016099649669131959, + "loss": 1.0203, + "step": 507 + }, + { + "epoch": 0.1974157194209657, + "grad_norm": 0.20699037611484528, + "learning_rate": 0.00016091864538731026, + "loss": 1.1074, + "step": 508 + }, + { + "epoch": 0.19780433304187311, + "grad_norm": 0.21490445733070374, + "learning_rate": 0.00016084079408330092, + "loss": 1.0682, + "step": 509 + }, + { + "epoch": 0.19819294666278053, + "grad_norm": 0.2363848090171814, + "learning_rate": 0.00016076294277929157, + "loss": 1.0408, + "step": 510 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 0.20186659693717957, + "learning_rate": 0.00016068509147528222, + "loss": 1.026, + "step": 511 + }, + { + "epoch": 0.19897017390459534, + "grad_norm": 0.21564024686813354, + "learning_rate": 0.00016060724017127287, + "loss": 1.0418, + "step": 512 + }, + { + "epoch": 0.19935878752550276, + "grad_norm": 0.19151560962200165, + "learning_rate": 0.00016052938886726355, + "loss": 1.0037, + "step": 513 + }, + { + "epoch": 0.19974740114641018, + "grad_norm": 
0.21038194000720978, + "learning_rate": 0.0001604515375632542, + "loss": 1.0545, + "step": 514 + }, + { + "epoch": 0.2001360147673176, + "grad_norm": 0.20496582984924316, + "learning_rate": 0.00016037368625924486, + "loss": 1.0543, + "step": 515 + }, + { + "epoch": 0.20052462838822502, + "grad_norm": 0.20689113438129425, + "learning_rate": 0.0001602958349552355, + "loss": 1.0905, + "step": 516 + }, + { + "epoch": 0.2009132420091324, + "grad_norm": 0.2284041792154312, + "learning_rate": 0.00016021798365122616, + "loss": 1.0717, + "step": 517 + }, + { + "epoch": 0.20130185563003983, + "grad_norm": 0.23457761108875275, + "learning_rate": 0.00016014013234721684, + "loss": 1.106, + "step": 518 + }, + { + "epoch": 0.20169046925094725, + "grad_norm": 0.2088528722524643, + "learning_rate": 0.0001600622810432075, + "loss": 1.0428, + "step": 519 + }, + { + "epoch": 0.20207908287185467, + "grad_norm": 0.2170068770647049, + "learning_rate": 0.00015998442973919814, + "loss": 0.9875, + "step": 520 + }, + { + "epoch": 0.20246769649276208, + "grad_norm": 0.2270561158657074, + "learning_rate": 0.0001599065784351888, + "loss": 1.0676, + "step": 521 + }, + { + "epoch": 0.20285631011366947, + "grad_norm": 0.2151324599981308, + "learning_rate": 0.00015982872713117945, + "loss": 1.0675, + "step": 522 + }, + { + "epoch": 0.2032449237345769, + "grad_norm": 0.23113249242305756, + "learning_rate": 0.00015975087582717013, + "loss": 1.0608, + "step": 523 + }, + { + "epoch": 0.2036335373554843, + "grad_norm": 0.2587106227874756, + "learning_rate": 0.00015967302452316078, + "loss": 1.0867, + "step": 524 + }, + { + "epoch": 0.20402215097639173, + "grad_norm": 0.21842992305755615, + "learning_rate": 0.00015959517321915143, + "loss": 1.0726, + "step": 525 + }, + { + "epoch": 0.20441076459729912, + "grad_norm": 0.20867805182933807, + "learning_rate": 0.00015951732191514208, + "loss": 1.0578, + "step": 526 + }, + { + "epoch": 0.20479937821820654, + "grad_norm": 0.2396962195634842, + "learning_rate": 
0.00015943947061113273, + "loss": 1.0292, + "step": 527 + }, + { + "epoch": 0.20518799183911396, + "grad_norm": 0.221155047416687, + "learning_rate": 0.00015936161930712341, + "loss": 1.0019, + "step": 528 + }, + { + "epoch": 0.20557660546002138, + "grad_norm": 0.20032119750976562, + "learning_rate": 0.00015928376800311407, + "loss": 1.0435, + "step": 529 + }, + { + "epoch": 0.2059652190809288, + "grad_norm": 0.24095888435840607, + "learning_rate": 0.00015920591669910472, + "loss": 1.0355, + "step": 530 + }, + { + "epoch": 0.2063538327018362, + "grad_norm": 0.2286604344844818, + "learning_rate": 0.00015912806539509537, + "loss": 0.9989, + "step": 531 + }, + { + "epoch": 0.2067424463227436, + "grad_norm": 0.21537137031555176, + "learning_rate": 0.00015905021409108602, + "loss": 1.0642, + "step": 532 + }, + { + "epoch": 0.20713105994365102, + "grad_norm": 0.22447925806045532, + "learning_rate": 0.0001589723627870767, + "loss": 1.1244, + "step": 533 + }, + { + "epoch": 0.20751967356455844, + "grad_norm": 0.21077273786067963, + "learning_rate": 0.00015889451148306735, + "loss": 1.0167, + "step": 534 + }, + { + "epoch": 0.20790828718546586, + "grad_norm": 0.22340558469295502, + "learning_rate": 0.000158816660179058, + "loss": 1.0991, + "step": 535 + }, + { + "epoch": 0.20829690080637325, + "grad_norm": 0.223599374294281, + "learning_rate": 0.00015873880887504866, + "loss": 1.086, + "step": 536 + }, + { + "epoch": 0.20868551442728067, + "grad_norm": 0.2615208923816681, + "learning_rate": 0.0001586609575710393, + "loss": 1.0584, + "step": 537 + }, + { + "epoch": 0.2090741280481881, + "grad_norm": 0.2085907757282257, + "learning_rate": 0.00015858310626703, + "loss": 1.0994, + "step": 538 + }, + { + "epoch": 0.2094627416690955, + "grad_norm": 0.2170211672782898, + "learning_rate": 0.00015850525496302064, + "loss": 1.1105, + "step": 539 + }, + { + "epoch": 0.20985135529000293, + "grad_norm": 0.21978625655174255, + "learning_rate": 0.0001584274036590113, + "loss": 1.002, + 
"step": 540 + }, + { + "epoch": 0.21023996891091032, + "grad_norm": 0.23684021830558777, + "learning_rate": 0.00015834955235500194, + "loss": 1.1216, + "step": 541 + }, + { + "epoch": 0.21062858253181774, + "grad_norm": 0.220269113779068, + "learning_rate": 0.0001582717010509926, + "loss": 1.0773, + "step": 542 + }, + { + "epoch": 0.21101719615272516, + "grad_norm": 0.22447973489761353, + "learning_rate": 0.00015819384974698328, + "loss": 1.0941, + "step": 543 + }, + { + "epoch": 0.21140580977363257, + "grad_norm": 0.22435730695724487, + "learning_rate": 0.00015811599844297393, + "loss": 1.0138, + "step": 544 + }, + { + "epoch": 0.21179442339453997, + "grad_norm": 0.2230793684720993, + "learning_rate": 0.00015803814713896458, + "loss": 1.0343, + "step": 545 + }, + { + "epoch": 0.21218303701544738, + "grad_norm": 0.23491905629634857, + "learning_rate": 0.00015796029583495523, + "loss": 1.11, + "step": 546 + }, + { + "epoch": 0.2125716506363548, + "grad_norm": 0.213560551404953, + "learning_rate": 0.00015788244453094588, + "loss": 1.0615, + "step": 547 + }, + { + "epoch": 0.21296026425726222, + "grad_norm": 0.21392837166786194, + "learning_rate": 0.00015780459322693654, + "loss": 1.0872, + "step": 548 + }, + { + "epoch": 0.21334887787816964, + "grad_norm": 0.20007692277431488, + "learning_rate": 0.00015772674192292722, + "loss": 1.0394, + "step": 549 + }, + { + "epoch": 0.21373749149907703, + "grad_norm": 0.1969841718673706, + "learning_rate": 0.00015764889061891787, + "loss": 1.0381, + "step": 550 + }, + { + "epoch": 0.21412610511998445, + "grad_norm": 0.21874025464057922, + "learning_rate": 0.00015757103931490852, + "loss": 1.0822, + "step": 551 + }, + { + "epoch": 0.21451471874089187, + "grad_norm": 0.21824273467063904, + "learning_rate": 0.00015749318801089917, + "loss": 1.0802, + "step": 552 + }, + { + "epoch": 0.2149033323617993, + "grad_norm": 0.20942047238349915, + "learning_rate": 0.00015741533670688985, + "loss": 1.0634, + "step": 553 + }, + { + "epoch": 
0.2152919459827067, + "grad_norm": 0.1940152943134308, + "learning_rate": 0.0001573374854028805, + "loss": 1.0264, + "step": 554 + }, + { + "epoch": 0.2156805596036141, + "grad_norm": 0.19859059154987335, + "learning_rate": 0.00015725963409887115, + "loss": 0.9701, + "step": 555 + }, + { + "epoch": 0.21606917322452152, + "grad_norm": 0.22239404916763306, + "learning_rate": 0.0001571817827948618, + "loss": 1.1282, + "step": 556 + }, + { + "epoch": 0.21645778684542893, + "grad_norm": 0.23820599913597107, + "learning_rate": 0.00015710393149085249, + "loss": 1.1123, + "step": 557 + }, + { + "epoch": 0.21684640046633635, + "grad_norm": 0.21279917657375336, + "learning_rate": 0.00015702608018684314, + "loss": 1.0542, + "step": 558 + }, + { + "epoch": 0.21723501408724374, + "grad_norm": 0.2065514773130417, + "learning_rate": 0.0001569482288828338, + "loss": 1.0685, + "step": 559 + }, + { + "epoch": 0.21762362770815116, + "grad_norm": 0.20130831003189087, + "learning_rate": 0.00015687037757882447, + "loss": 0.9869, + "step": 560 + }, + { + "epoch": 0.21801224132905858, + "grad_norm": 0.2187541127204895, + "learning_rate": 0.00015679252627481512, + "loss": 1.1095, + "step": 561 + }, + { + "epoch": 0.218400854949966, + "grad_norm": 0.21028277277946472, + "learning_rate": 0.00015671467497080577, + "loss": 1.0804, + "step": 562 + }, + { + "epoch": 0.21878946857087342, + "grad_norm": 0.8187636733055115, + "learning_rate": 0.00015663682366679643, + "loss": 1.0782, + "step": 563 + }, + { + "epoch": 0.2191780821917808, + "grad_norm": 0.20059974491596222, + "learning_rate": 0.0001565589723627871, + "loss": 1.0279, + "step": 564 + }, + { + "epoch": 0.21956669581268823, + "grad_norm": 0.20440839231014252, + "learning_rate": 0.00015648112105877776, + "loss": 0.9863, + "step": 565 + }, + { + "epoch": 0.21995530943359565, + "grad_norm": 0.21423624455928802, + "learning_rate": 0.0001564032697547684, + "loss": 1.0685, + "step": 566 + }, + { + "epoch": 0.22034392305450307, + "grad_norm": 
0.22430062294006348, + "learning_rate": 0.00015632541845075906, + "loss": 1.0761, + "step": 567 + }, + { + "epoch": 0.22073253667541048, + "grad_norm": 0.22782258689403534, + "learning_rate": 0.0001562475671467497, + "loss": 1.1024, + "step": 568 + }, + { + "epoch": 0.22112115029631788, + "grad_norm": 0.21150320768356323, + "learning_rate": 0.0001561697158427404, + "loss": 1.0621, + "step": 569 + }, + { + "epoch": 0.2215097639172253, + "grad_norm": 0.20342351496219635, + "learning_rate": 0.00015609186453873104, + "loss": 1.0667, + "step": 570 + }, + { + "epoch": 0.2218983775381327, + "grad_norm": 0.22866711020469666, + "learning_rate": 0.0001560140132347217, + "loss": 1.0631, + "step": 571 + }, + { + "epoch": 0.22228699115904013, + "grad_norm": 0.2200063169002533, + "learning_rate": 0.00015593616193071235, + "loss": 1.0448, + "step": 572 + }, + { + "epoch": 0.22267560477994755, + "grad_norm": 0.19440248608589172, + "learning_rate": 0.000155858310626703, + "loss": 1.037, + "step": 573 + }, + { + "epoch": 0.22306421840085494, + "grad_norm": 0.205752432346344, + "learning_rate": 0.00015578045932269368, + "loss": 1.0465, + "step": 574 + }, + { + "epoch": 0.22345283202176236, + "grad_norm": 0.22247998416423798, + "learning_rate": 0.00015570260801868433, + "loss": 0.997, + "step": 575 + }, + { + "epoch": 0.22384144564266978, + "grad_norm": 0.22199274599552155, + "learning_rate": 0.00015562475671467498, + "loss": 1.0178, + "step": 576 + }, + { + "epoch": 0.2242300592635772, + "grad_norm": 0.2114989310503006, + "learning_rate": 0.00015554690541066564, + "loss": 1.0457, + "step": 577 + }, + { + "epoch": 0.2246186728844846, + "grad_norm": 0.24248506128787994, + "learning_rate": 0.0001554690541066563, + "loss": 1.002, + "step": 578 + }, + { + "epoch": 0.225007286505392, + "grad_norm": 0.2565505802631378, + "learning_rate": 0.00015539120280264697, + "loss": 1.0541, + "step": 579 + }, + { + "epoch": 0.22539590012629943, + "grad_norm": 0.22799409925937653, + "learning_rate": 
0.00015531335149863762, + "loss": 1.0788, + "step": 580 + }, + { + "epoch": 0.22578451374720684, + "grad_norm": 0.2196080982685089, + "learning_rate": 0.00015523550019462827, + "loss": 1.0877, + "step": 581 + }, + { + "epoch": 0.22617312736811426, + "grad_norm": 0.21992824971675873, + "learning_rate": 0.00015515764889061892, + "loss": 1.0213, + "step": 582 + }, + { + "epoch": 0.22656174098902165, + "grad_norm": 0.22793298959732056, + "learning_rate": 0.00015507979758660957, + "loss": 1.0633, + "step": 583 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 0.21707972884178162, + "learning_rate": 0.00015500194628260023, + "loss": 1.081, + "step": 584 + }, + { + "epoch": 0.2273389682308365, + "grad_norm": 0.220685675740242, + "learning_rate": 0.0001549240949785909, + "loss": 1.0658, + "step": 585 + }, + { + "epoch": 0.2277275818517439, + "grad_norm": 0.22576668858528137, + "learning_rate": 0.00015484624367458156, + "loss": 1.0795, + "step": 586 + }, + { + "epoch": 0.22811619547265133, + "grad_norm": 0.21778982877731323, + "learning_rate": 0.0001547683923705722, + "loss": 1.033, + "step": 587 + }, + { + "epoch": 0.22850480909355872, + "grad_norm": 0.22748610377311707, + "learning_rate": 0.00015469054106656286, + "loss": 1.0948, + "step": 588 + }, + { + "epoch": 0.22889342271446614, + "grad_norm": 0.21561284363269806, + "learning_rate": 0.00015461268976255351, + "loss": 1.0022, + "step": 589 + }, + { + "epoch": 0.22928203633537356, + "grad_norm": 0.2419756054878235, + "learning_rate": 0.0001545348384585442, + "loss": 1.0786, + "step": 590 + }, + { + "epoch": 0.22967064995628098, + "grad_norm": 0.20479315519332886, + "learning_rate": 0.00015445698715453485, + "loss": 1.027, + "step": 591 + }, + { + "epoch": 0.2300592635771884, + "grad_norm": 0.21365883946418762, + "learning_rate": 0.0001543791358505255, + "loss": 1.0773, + "step": 592 + }, + { + "epoch": 0.23044787719809579, + "grad_norm": 0.23133166134357452, + "learning_rate": 0.00015430128454651615, + "loss": 
1.0877, + "step": 593 + }, + { + "epoch": 0.2308364908190032, + "grad_norm": 0.2110515981912613, + "learning_rate": 0.0001542234332425068, + "loss": 1.0509, + "step": 594 + }, + { + "epoch": 0.23122510443991062, + "grad_norm": 0.20658442378044128, + "learning_rate": 0.00015414558193849748, + "loss": 1.0623, + "step": 595 + }, + { + "epoch": 0.23161371806081804, + "grad_norm": 0.21831996738910675, + "learning_rate": 0.00015406773063448813, + "loss": 1.021, + "step": 596 + }, + { + "epoch": 0.23200233168172543, + "grad_norm": 0.23015642166137695, + "learning_rate": 0.00015398987933047878, + "loss": 1.0358, + "step": 597 + }, + { + "epoch": 0.23239094530263285, + "grad_norm": 0.23071645200252533, + "learning_rate": 0.00015391202802646944, + "loss": 1.1255, + "step": 598 + }, + { + "epoch": 0.23277955892354027, + "grad_norm": 0.19513486325740814, + "learning_rate": 0.0001538341767224601, + "loss": 1.0189, + "step": 599 + }, + { + "epoch": 0.2331681725444477, + "grad_norm": 0.20821452140808105, + "learning_rate": 0.00015375632541845077, + "loss": 1.0843, + "step": 600 + }, + { + "epoch": 0.2335567861653551, + "grad_norm": 0.20563223958015442, + "learning_rate": 0.00015367847411444142, + "loss": 1.0012, + "step": 601 + }, + { + "epoch": 0.2339453997862625, + "grad_norm": 0.22674202919006348, + "learning_rate": 0.00015360062281043207, + "loss": 1.0371, + "step": 602 + }, + { + "epoch": 0.23433401340716992, + "grad_norm": 0.20744135975837708, + "learning_rate": 0.00015352277150642272, + "loss": 1.0466, + "step": 603 + }, + { + "epoch": 0.23472262702807734, + "grad_norm": 0.22103577852249146, + "learning_rate": 0.00015344492020241338, + "loss": 1.0942, + "step": 604 + }, + { + "epoch": 0.23511124064898475, + "grad_norm": 0.20643098652362823, + "learning_rate": 0.00015336706889840406, + "loss": 1.0682, + "step": 605 + }, + { + "epoch": 0.23549985426989217, + "grad_norm": 0.23436777293682098, + "learning_rate": 0.0001532892175943947, + "loss": 1.0613, + "step": 606 + }, + { + 
"epoch": 0.23588846789079956, + "grad_norm": 0.21898899972438812, + "learning_rate": 0.00015321136629038536, + "loss": 1.0571, + "step": 607 + }, + { + "epoch": 0.23627708151170698, + "grad_norm": 0.20569247007369995, + "learning_rate": 0.00015313351498637604, + "loss": 1.061, + "step": 608 + }, + { + "epoch": 0.2366656951326144, + "grad_norm": 0.2099207490682602, + "learning_rate": 0.0001530556636823667, + "loss": 1.0776, + "step": 609 + }, + { + "epoch": 0.23705430875352182, + "grad_norm": 0.20078738033771515, + "learning_rate": 0.00015297781237835734, + "loss": 1.0341, + "step": 610 + }, + { + "epoch": 0.2374429223744292, + "grad_norm": 0.20327065885066986, + "learning_rate": 0.000152899961074348, + "loss": 1.0168, + "step": 611 + }, + { + "epoch": 0.23783153599533663, + "grad_norm": 0.21741214394569397, + "learning_rate": 0.00015282210977033867, + "loss": 1.0726, + "step": 612 + }, + { + "epoch": 0.23822014961624405, + "grad_norm": 0.2065727263689041, + "learning_rate": 0.00015274425846632933, + "loss": 1.0474, + "step": 613 + }, + { + "epoch": 0.23860876323715147, + "grad_norm": 0.21241194009780884, + "learning_rate": 0.00015266640716231998, + "loss": 1.0666, + "step": 614 + }, + { + "epoch": 0.23899737685805889, + "grad_norm": 0.2194201797246933, + "learning_rate": 0.00015258855585831066, + "loss": 1.1411, + "step": 615 + }, + { + "epoch": 0.23938599047896628, + "grad_norm": 0.21537193655967712, + "learning_rate": 0.0001525107045543013, + "loss": 1.081, + "step": 616 + }, + { + "epoch": 0.2397746040998737, + "grad_norm": 0.21125951409339905, + "learning_rate": 0.00015243285325029196, + "loss": 1.0679, + "step": 617 + }, + { + "epoch": 0.2401632177207811, + "grad_norm": 0.21342721581459045, + "learning_rate": 0.0001523550019462826, + "loss": 1.0564, + "step": 618 + }, + { + "epoch": 0.24055183134168853, + "grad_norm": 0.2223503291606903, + "learning_rate": 0.00015227715064227327, + "loss": 1.1163, + "step": 619 + }, + { + "epoch": 0.24094044496259595, + 
"grad_norm": 0.21626527607440948, + "learning_rate": 0.00015219929933826394, + "loss": 1.0793, + "step": 620 + }, + { + "epoch": 0.24132905858350334, + "grad_norm": 0.21899500489234924, + "learning_rate": 0.0001521214480342546, + "loss": 1.0864, + "step": 621 + }, + { + "epoch": 0.24171767220441076, + "grad_norm": 0.2499915212392807, + "learning_rate": 0.00015204359673024525, + "loss": 1.1381, + "step": 622 + }, + { + "epoch": 0.24210628582531818, + "grad_norm": 0.2108345925807953, + "learning_rate": 0.0001519657454262359, + "loss": 1.0534, + "step": 623 + }, + { + "epoch": 0.2424948994462256, + "grad_norm": 0.2224910855293274, + "learning_rate": 0.00015188789412222655, + "loss": 1.0235, + "step": 624 + }, + { + "epoch": 0.24288351306713302, + "grad_norm": 0.22163094580173492, + "learning_rate": 0.0001518100428182172, + "loss": 1.0143, + "step": 625 + }, + { + "epoch": 0.2432721266880404, + "grad_norm": 0.20709283649921417, + "learning_rate": 0.00015173219151420788, + "loss": 1.0506, + "step": 626 + }, + { + "epoch": 0.24366074030894783, + "grad_norm": 0.2112802267074585, + "learning_rate": 0.00015165434021019854, + "loss": 1.0692, + "step": 627 + }, + { + "epoch": 0.24404935392985525, + "grad_norm": 0.23622830212116241, + "learning_rate": 0.0001515764889061892, + "loss": 1.0769, + "step": 628 + }, + { + "epoch": 0.24443796755076266, + "grad_norm": 0.23328271508216858, + "learning_rate": 0.00015149863760217984, + "loss": 1.1158, + "step": 629 + }, + { + "epoch": 0.24482658117167005, + "grad_norm": 0.2071760892868042, + "learning_rate": 0.0001514207862981705, + "loss": 1.0133, + "step": 630 + }, + { + "epoch": 0.24521519479257747, + "grad_norm": 0.21428920328617096, + "learning_rate": 0.00015134293499416117, + "loss": 1.0342, + "step": 631 + }, + { + "epoch": 0.2456038084134849, + "grad_norm": 0.22225375473499298, + "learning_rate": 0.00015126508369015182, + "loss": 1.1054, + "step": 632 + }, + { + "epoch": 0.2459924220343923, + "grad_norm": 0.2096671611070633, + 
"learning_rate": 0.00015118723238614248, + "loss": 1.0229, + "step": 633 + }, + { + "epoch": 0.24638103565529973, + "grad_norm": 0.21473252773284912, + "learning_rate": 0.00015110938108213313, + "loss": 1.0915, + "step": 634 + }, + { + "epoch": 0.24676964927620712, + "grad_norm": 0.2071562111377716, + "learning_rate": 0.00015103152977812378, + "loss": 1.047, + "step": 635 + }, + { + "epoch": 0.24715826289711454, + "grad_norm": 0.19868609309196472, + "learning_rate": 0.00015095367847411446, + "loss": 1.0073, + "step": 636 + }, + { + "epoch": 0.24754687651802196, + "grad_norm": 0.20937366783618927, + "learning_rate": 0.0001508758271701051, + "loss": 1.0155, + "step": 637 + }, + { + "epoch": 0.24793549013892938, + "grad_norm": 0.19225911796092987, + "learning_rate": 0.00015079797586609576, + "loss": 1.0163, + "step": 638 + }, + { + "epoch": 0.2483241037598368, + "grad_norm": 0.20427283644676208, + "learning_rate": 0.00015072012456208641, + "loss": 1.062, + "step": 639 + }, + { + "epoch": 0.24871271738074419, + "grad_norm": 0.21640253067016602, + "learning_rate": 0.00015064227325807707, + "loss": 1.025, + "step": 640 + }, + { + "epoch": 0.2491013310016516, + "grad_norm": 0.20416739583015442, + "learning_rate": 0.00015056442195406775, + "loss": 1.0635, + "step": 641 + }, + { + "epoch": 0.24948994462255902, + "grad_norm": 0.1990521252155304, + "learning_rate": 0.0001504865706500584, + "loss": 1.0757, + "step": 642 + }, + { + "epoch": 0.24987855824346644, + "grad_norm": 0.21636444330215454, + "learning_rate": 0.00015040871934604905, + "loss": 1.0441, + "step": 643 + }, + { + "epoch": 0.25026717186437386, + "grad_norm": 0.21253719925880432, + "learning_rate": 0.0001503308680420397, + "loss": 1.0574, + "step": 644 + }, + { + "epoch": 0.2506557854852813, + "grad_norm": 0.2134159356355667, + "learning_rate": 0.00015025301673803035, + "loss": 1.0396, + "step": 645 + }, + { + "epoch": 0.2510443991061887, + "grad_norm": 0.2018527239561081, + "learning_rate": 
0.00015017516543402103, + "loss": 1.0606, + "step": 646 + }, + { + "epoch": 0.25143301272709606, + "grad_norm": 0.20320741832256317, + "learning_rate": 0.00015009731413001169, + "loss": 1.0093, + "step": 647 + }, + { + "epoch": 0.2518216263480035, + "grad_norm": 0.21007056534290314, + "learning_rate": 0.00015001946282600234, + "loss": 1.0284, + "step": 648 + }, + { + "epoch": 0.2522102399689109, + "grad_norm": 0.22453372180461884, + "learning_rate": 0.000149941611521993, + "loss": 1.0271, + "step": 649 + }, + { + "epoch": 0.2525988535898183, + "grad_norm": 0.19889335334300995, + "learning_rate": 0.00014986376021798364, + "loss": 1.0238, + "step": 650 + }, + { + "epoch": 0.25298746721072574, + "grad_norm": 0.19339965283870697, + "learning_rate": 0.00014978590891397432, + "loss": 1.024, + "step": 651 + }, + { + "epoch": 0.25337608083163315, + "grad_norm": 0.22362011671066284, + "learning_rate": 0.00014970805760996497, + "loss": 1.0722, + "step": 652 + }, + { + "epoch": 0.2537646944525406, + "grad_norm": 0.2110588103532791, + "learning_rate": 0.00014963020630595562, + "loss": 1.0541, + "step": 653 + }, + { + "epoch": 0.254153308073448, + "grad_norm": 0.203025683760643, + "learning_rate": 0.00014955235500194628, + "loss": 1.0335, + "step": 654 + }, + { + "epoch": 0.2545419216943554, + "grad_norm": 0.20884902775287628, + "learning_rate": 0.00014947450369793693, + "loss": 1.0507, + "step": 655 + }, + { + "epoch": 0.2549305353152628, + "grad_norm": 0.21234256029129028, + "learning_rate": 0.0001493966523939276, + "loss": 1.0372, + "step": 656 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.1984352171421051, + "learning_rate": 0.00014931880108991826, + "loss": 0.9979, + "step": 657 + }, + { + "epoch": 0.2557077625570776, + "grad_norm": 0.18848282098770142, + "learning_rate": 0.0001492409497859089, + "loss": 0.9973, + "step": 658 + }, + { + "epoch": 0.25609637617798503, + "grad_norm": 0.2201709896326065, + "learning_rate": 0.00014916309848189956, + "loss": 1.0386, + 
"step": 659 + }, + { + "epoch": 0.25648498979889245, + "grad_norm": 0.23094095289707184, + "learning_rate": 0.00014908524717789024, + "loss": 1.1205, + "step": 660 + }, + { + "epoch": 0.25687360341979987, + "grad_norm": 0.21087734401226044, + "learning_rate": 0.0001490073958738809, + "loss": 1.0231, + "step": 661 + }, + { + "epoch": 0.2572622170407073, + "grad_norm": 0.24970979988574982, + "learning_rate": 0.00014892954456987155, + "loss": 1.0421, + "step": 662 + }, + { + "epoch": 0.2576508306616147, + "grad_norm": 0.22024711966514587, + "learning_rate": 0.00014885169326586223, + "loss": 1.1033, + "step": 663 + }, + { + "epoch": 0.2580394442825221, + "grad_norm": 0.2195248156785965, + "learning_rate": 0.00014877384196185288, + "loss": 1.089, + "step": 664 + }, + { + "epoch": 0.25842805790342954, + "grad_norm": 0.20236417651176453, + "learning_rate": 0.00014869599065784353, + "loss": 1.0196, + "step": 665 + }, + { + "epoch": 0.2588166715243369, + "grad_norm": 0.21973329782485962, + "learning_rate": 0.00014861813935383418, + "loss": 1.0844, + "step": 666 + }, + { + "epoch": 0.2592052851452443, + "grad_norm": 0.2069879174232483, + "learning_rate": 0.00014854028804982486, + "loss": 1.0312, + "step": 667 + }, + { + "epoch": 0.25959389876615174, + "grad_norm": 0.2037455290555954, + "learning_rate": 0.00014846243674581551, + "loss": 1.0018, + "step": 668 + }, + { + "epoch": 0.25998251238705916, + "grad_norm": 0.24176378548145294, + "learning_rate": 0.00014838458544180617, + "loss": 1.0749, + "step": 669 + }, + { + "epoch": 0.2603711260079666, + "grad_norm": 0.2007879763841629, + "learning_rate": 0.00014830673413779682, + "loss": 1.0443, + "step": 670 + }, + { + "epoch": 0.260759739628874, + "grad_norm": 0.23503245413303375, + "learning_rate": 0.00014822888283378747, + "loss": 1.0674, + "step": 671 + }, + { + "epoch": 0.2611483532497814, + "grad_norm": 0.2166167050600052, + "learning_rate": 0.00014815103152977815, + "loss": 1.079, + "step": 672 + }, + { + "epoch": 
0.26153696687068884, + "grad_norm": 0.2293982058763504, + "learning_rate": 0.0001480731802257688, + "loss": 1.0517, + "step": 673 + }, + { + "epoch": 0.26192558049159625, + "grad_norm": 0.21040330827236176, + "learning_rate": 0.00014799532892175945, + "loss": 1.0475, + "step": 674 + }, + { + "epoch": 0.2623141941125036, + "grad_norm": 0.20750463008880615, + "learning_rate": 0.0001479174776177501, + "loss": 1.025, + "step": 675 + }, + { + "epoch": 0.26270280773341104, + "grad_norm": 0.2748873233795166, + "learning_rate": 0.00014783962631374076, + "loss": 1.0212, + "step": 676 + }, + { + "epoch": 0.26309142135431846, + "grad_norm": 0.19212333858013153, + "learning_rate": 0.00014776177500973144, + "loss": 1.0049, + "step": 677 + }, + { + "epoch": 0.2634800349752259, + "grad_norm": 0.207731693983078, + "learning_rate": 0.0001476839237057221, + "loss": 1.0062, + "step": 678 + }, + { + "epoch": 0.2638686485961333, + "grad_norm": 0.2177981585264206, + "learning_rate": 0.00014760607240171274, + "loss": 1.0489, + "step": 679 + }, + { + "epoch": 0.2642572622170407, + "grad_norm": 0.23239290714263916, + "learning_rate": 0.0001475282210977034, + "loss": 1.0856, + "step": 680 + }, + { + "epoch": 0.26464587583794813, + "grad_norm": 0.2033151388168335, + "learning_rate": 0.00014745036979369404, + "loss": 1.0389, + "step": 681 + }, + { + "epoch": 0.26503448945885555, + "grad_norm": 0.20917408168315887, + "learning_rate": 0.00014737251848968472, + "loss": 1.1208, + "step": 682 + }, + { + "epoch": 0.26542310307976297, + "grad_norm": 0.22075454890727997, + "learning_rate": 0.00014729466718567538, + "loss": 1.0435, + "step": 683 + }, + { + "epoch": 0.26581171670067033, + "grad_norm": 0.23094993829727173, + "learning_rate": 0.00014721681588166603, + "loss": 1.0649, + "step": 684 + }, + { + "epoch": 0.26620033032157775, + "grad_norm": 0.21209536492824554, + "learning_rate": 0.00014713896457765668, + "loss": 1.0578, + "step": 685 + }, + { + "epoch": 0.26658894394248517, + "grad_norm": 
0.21412219107151031, + "learning_rate": 0.00014706111327364733, + "loss": 1.1137, + "step": 686 + }, + { + "epoch": 0.2669775575633926, + "grad_norm": 0.21175475418567657, + "learning_rate": 0.000146983261969638, + "loss": 1.023, + "step": 687 + }, + { + "epoch": 0.2673661711843, + "grad_norm": 0.21968993544578552, + "learning_rate": 0.00014690541066562866, + "loss": 1.1183, + "step": 688 + }, + { + "epoch": 0.2677547848052074, + "grad_norm": 0.20414218306541443, + "learning_rate": 0.00014682755936161932, + "loss": 1.078, + "step": 689 + }, + { + "epoch": 0.26814339842611484, + "grad_norm": 0.18986597657203674, + "learning_rate": 0.00014674970805760997, + "loss": 1.0029, + "step": 690 + }, + { + "epoch": 0.26853201204702226, + "grad_norm": 0.21215832233428955, + "learning_rate": 0.00014667185675360062, + "loss": 1.0759, + "step": 691 + }, + { + "epoch": 0.2689206256679297, + "grad_norm": 0.2113744169473648, + "learning_rate": 0.0001465940054495913, + "loss": 1.1027, + "step": 692 + }, + { + "epoch": 0.2693092392888371, + "grad_norm": 0.22010880708694458, + "learning_rate": 0.00014651615414558195, + "loss": 1.0984, + "step": 693 + }, + { + "epoch": 0.26969785290974446, + "grad_norm": 0.203857421875, + "learning_rate": 0.0001464383028415726, + "loss": 1.0407, + "step": 694 + }, + { + "epoch": 0.2700864665306519, + "grad_norm": 0.21120867133140564, + "learning_rate": 0.00014636045153756325, + "loss": 1.0521, + "step": 695 + }, + { + "epoch": 0.2704750801515593, + "grad_norm": 0.20039112865924835, + "learning_rate": 0.0001462826002335539, + "loss": 1.0897, + "step": 696 + }, + { + "epoch": 0.2708636937724667, + "grad_norm": 0.22893202304840088, + "learning_rate": 0.00014620474892954456, + "loss": 1.0903, + "step": 697 + }, + { + "epoch": 0.27125230739337414, + "grad_norm": 0.19886267185211182, + "learning_rate": 0.00014612689762553524, + "loss": 1.0889, + "step": 698 + }, + { + "epoch": 0.27164092101428156, + "grad_norm": 0.18892349302768707, + "learning_rate": 
0.0001460490463215259, + "loss": 0.981, + "step": 699 + }, + { + "epoch": 0.272029534635189, + "grad_norm": 0.20602507889270782, + "learning_rate": 0.00014597119501751654, + "loss": 1.0223, + "step": 700 + }, + { + "epoch": 0.2724181482560964, + "grad_norm": 0.21480505168437958, + "learning_rate": 0.0001458933437135072, + "loss": 1.0355, + "step": 701 + }, + { + "epoch": 0.2728067618770038, + "grad_norm": 0.21011753380298615, + "learning_rate": 0.00014581549240949785, + "loss": 1.0613, + "step": 702 + }, + { + "epoch": 0.2731953754979112, + "grad_norm": 0.19350819289684296, + "learning_rate": 0.00014573764110548853, + "loss": 1.0144, + "step": 703 + }, + { + "epoch": 0.2735839891188186, + "grad_norm": 0.207548126578331, + "learning_rate": 0.00014565978980147918, + "loss": 1.0465, + "step": 704 + }, + { + "epoch": 0.273972602739726, + "grad_norm": 0.22220565378665924, + "learning_rate": 0.00014558193849746983, + "loss": 1.1073, + "step": 705 + }, + { + "epoch": 0.27436121636063343, + "grad_norm": 0.193622425198555, + "learning_rate": 0.00014550408719346048, + "loss": 1.0357, + "step": 706 + }, + { + "epoch": 0.27474982998154085, + "grad_norm": 0.2067158818244934, + "learning_rate": 0.00014542623588945113, + "loss": 1.0502, + "step": 707 + }, + { + "epoch": 0.27513844360244827, + "grad_norm": 0.2218742072582245, + "learning_rate": 0.0001453483845854418, + "loss": 0.9934, + "step": 708 + }, + { + "epoch": 0.2755270572233557, + "grad_norm": 0.22316142916679382, + "learning_rate": 0.00014527053328143246, + "loss": 1.0707, + "step": 709 + }, + { + "epoch": 0.2759156708442631, + "grad_norm": 0.21004025638103485, + "learning_rate": 0.00014519268197742312, + "loss": 1.0543, + "step": 710 + }, + { + "epoch": 0.2763042844651705, + "grad_norm": 0.22070440649986267, + "learning_rate": 0.00014511483067341377, + "loss": 1.0467, + "step": 711 + }, + { + "epoch": 0.27669289808607794, + "grad_norm": 0.21463747322559357, + "learning_rate": 0.00014503697936940445, + "loss": 1.0793, + 
"step": 712 + }, + { + "epoch": 0.2770815117069853, + "grad_norm": 0.23452533781528473, + "learning_rate": 0.0001449591280653951, + "loss": 1.043, + "step": 713 + }, + { + "epoch": 0.2774701253278927, + "grad_norm": 0.2405795156955719, + "learning_rate": 0.00014488127676138575, + "loss": 1.0752, + "step": 714 + }, + { + "epoch": 0.27785873894880014, + "grad_norm": 0.21546585857868195, + "learning_rate": 0.00014480342545737643, + "loss": 1.0834, + "step": 715 + }, + { + "epoch": 0.27824735256970756, + "grad_norm": 0.22675828635692596, + "learning_rate": 0.00014472557415336708, + "loss": 1.055, + "step": 716 + }, + { + "epoch": 0.278635966190615, + "grad_norm": 0.2117871195077896, + "learning_rate": 0.00014464772284935774, + "loss": 1.03, + "step": 717 + }, + { + "epoch": 0.2790245798115224, + "grad_norm": 0.2193155735731125, + "learning_rate": 0.00014456987154534841, + "loss": 1.0073, + "step": 718 + }, + { + "epoch": 0.2794131934324298, + "grad_norm": 0.21447965502738953, + "learning_rate": 0.00014449202024133907, + "loss": 1.0174, + "step": 719 + }, + { + "epoch": 0.27980180705333724, + "grad_norm": 0.22867532074451447, + "learning_rate": 0.00014441416893732972, + "loss": 1.0948, + "step": 720 + }, + { + "epoch": 0.28019042067424466, + "grad_norm": 0.21570557355880737, + "learning_rate": 0.00014433631763332037, + "loss": 1.0105, + "step": 721 + }, + { + "epoch": 0.280579034295152, + "grad_norm": 0.20787014067173004, + "learning_rate": 0.00014425846632931102, + "loss": 1.0384, + "step": 722 + }, + { + "epoch": 0.28096764791605944, + "grad_norm": 0.19924762845039368, + "learning_rate": 0.0001441806150253017, + "loss": 1.0653, + "step": 723 + }, + { + "epoch": 0.28135626153696686, + "grad_norm": 0.1996215283870697, + "learning_rate": 0.00014410276372129235, + "loss": 1.0439, + "step": 724 + }, + { + "epoch": 0.2817448751578743, + "grad_norm": 0.2054813802242279, + "learning_rate": 0.000144024912417283, + "loss": 0.9895, + "step": 725 + }, + { + "epoch": 
0.2821334887787817, + "grad_norm": 0.2268310785293579, + "learning_rate": 0.00014394706111327366, + "loss": 1.0993, + "step": 726 + }, + { + "epoch": 0.2825221023996891, + "grad_norm": 0.19867680966854095, + "learning_rate": 0.0001438692098092643, + "loss": 0.985, + "step": 727 + }, + { + "epoch": 0.28291071602059653, + "grad_norm": 0.21099598705768585, + "learning_rate": 0.000143791358505255, + "loss": 1.0333, + "step": 728 + }, + { + "epoch": 0.28329932964150395, + "grad_norm": 0.22479215264320374, + "learning_rate": 0.00014371350720124564, + "loss": 1.0449, + "step": 729 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 0.22717688977718353, + "learning_rate": 0.0001436356558972363, + "loss": 1.0482, + "step": 730 + }, + { + "epoch": 0.2840765568833188, + "grad_norm": 0.20389345288276672, + "learning_rate": 0.00014355780459322695, + "loss": 0.956, + "step": 731 + }, + { + "epoch": 0.28446517050422615, + "grad_norm": 0.21583619713783264, + "learning_rate": 0.0001434799532892176, + "loss": 1.0154, + "step": 732 + }, + { + "epoch": 0.28485378412513357, + "grad_norm": 0.2219148874282837, + "learning_rate": 0.00014340210198520825, + "loss": 1.0553, + "step": 733 + }, + { + "epoch": 0.285242397746041, + "grad_norm": 0.19920189678668976, + "learning_rate": 0.00014332425068119893, + "loss": 0.9881, + "step": 734 + }, + { + "epoch": 0.2856310113669484, + "grad_norm": 0.2295670360326767, + "learning_rate": 0.00014324639937718958, + "loss": 1.0529, + "step": 735 + }, + { + "epoch": 0.2860196249878558, + "grad_norm": 0.21271567046642303, + "learning_rate": 0.00014316854807318023, + "loss": 1.037, + "step": 736 + }, + { + "epoch": 0.28640823860876324, + "grad_norm": 0.21304361522197723, + "learning_rate": 0.00014309069676917088, + "loss": 1.048, + "step": 737 + }, + { + "epoch": 0.28679685222967066, + "grad_norm": 0.19902732968330383, + "learning_rate": 0.00014301284546516154, + "loss": 1.0306, + "step": 738 + }, + { + "epoch": 0.2871854658505781, + "grad_norm": 
0.1995929330587387, + "learning_rate": 0.00014293499416115222, + "loss": 1.0394, + "step": 739 + }, + { + "epoch": 0.2875740794714855, + "grad_norm": 0.20426060259342194, + "learning_rate": 0.00014285714285714287, + "loss": 1.0052, + "step": 740 + }, + { + "epoch": 0.28796269309239286, + "grad_norm": 0.20284566283226013, + "learning_rate": 0.00014277929155313352, + "loss": 1.0115, + "step": 741 + }, + { + "epoch": 0.2883513067133003, + "grad_norm": 0.2041557878255844, + "learning_rate": 0.00014270144024912417, + "loss": 1.0473, + "step": 742 + }, + { + "epoch": 0.2887399203342077, + "grad_norm": 0.2152249962091446, + "learning_rate": 0.00014262358894511482, + "loss": 1.0802, + "step": 743 + }, + { + "epoch": 0.2891285339551151, + "grad_norm": 0.20569871366024017, + "learning_rate": 0.0001425457376411055, + "loss": 1.0203, + "step": 744 + }, + { + "epoch": 0.28951714757602254, + "grad_norm": 0.21128378808498383, + "learning_rate": 0.00014246788633709616, + "loss": 1.108, + "step": 745 + }, + { + "epoch": 0.28990576119692996, + "grad_norm": 0.19587135314941406, + "learning_rate": 0.0001423900350330868, + "loss": 1.0427, + "step": 746 + }, + { + "epoch": 0.2902943748178374, + "grad_norm": 0.22052550315856934, + "learning_rate": 0.00014231218372907746, + "loss": 1.055, + "step": 747 + }, + { + "epoch": 0.2906829884387448, + "grad_norm": 0.21291717886924744, + "learning_rate": 0.0001422343324250681, + "loss": 1.0591, + "step": 748 + }, + { + "epoch": 0.2910716020596522, + "grad_norm": 0.20634084939956665, + "learning_rate": 0.0001421564811210588, + "loss": 1.0527, + "step": 749 + }, + { + "epoch": 0.29146021568055963, + "grad_norm": 0.2075488269329071, + "learning_rate": 0.00014207862981704944, + "loss": 1.0786, + "step": 750 + }, + { + "epoch": 0.291848829301467, + "grad_norm": 0.19780080020427704, + "learning_rate": 0.0001420007785130401, + "loss": 1.059, + "step": 751 + }, + { + "epoch": 0.2922374429223744, + "grad_norm": 0.21212074160575867, + "learning_rate": 
0.00014192292720903075, + "loss": 1.0346, + "step": 752 + }, + { + "epoch": 0.29262605654328183, + "grad_norm": 0.2218451350927353, + "learning_rate": 0.0001418450759050214, + "loss": 1.0908, + "step": 753 + }, + { + "epoch": 0.29301467016418925, + "grad_norm": 0.20107759535312653, + "learning_rate": 0.00014176722460101208, + "loss": 1.0202, + "step": 754 + }, + { + "epoch": 0.29340328378509667, + "grad_norm": 0.20933273434638977, + "learning_rate": 0.00014168937329700273, + "loss": 1.0719, + "step": 755 + }, + { + "epoch": 0.2937918974060041, + "grad_norm": 0.22369107604026794, + "learning_rate": 0.00014161152199299338, + "loss": 1.0433, + "step": 756 + }, + { + "epoch": 0.2941805110269115, + "grad_norm": 0.2113707810640335, + "learning_rate": 0.00014153367068898403, + "loss": 1.0637, + "step": 757 + }, + { + "epoch": 0.2945691246478189, + "grad_norm": 0.21105700731277466, + "learning_rate": 0.00014145581938497469, + "loss": 1.0468, + "step": 758 + }, + { + "epoch": 0.29495773826872634, + "grad_norm": 0.20189693570137024, + "learning_rate": 0.00014137796808096537, + "loss": 1.0281, + "step": 759 + }, + { + "epoch": 0.2953463518896337, + "grad_norm": 0.1954152137041092, + "learning_rate": 0.00014130011677695602, + "loss": 1.0519, + "step": 760 + }, + { + "epoch": 0.2957349655105411, + "grad_norm": 0.24295592308044434, + "learning_rate": 0.00014122226547294667, + "loss": 1.1303, + "step": 761 + }, + { + "epoch": 0.29612357913144854, + "grad_norm": 0.20158620178699493, + "learning_rate": 0.00014114441416893732, + "loss": 1.0367, + "step": 762 + }, + { + "epoch": 0.29651219275235596, + "grad_norm": 0.20734666287899017, + "learning_rate": 0.00014106656286492797, + "loss": 1.0392, + "step": 763 + }, + { + "epoch": 0.2969008063732634, + "grad_norm": 0.2177533656358719, + "learning_rate": 0.00014098871156091865, + "loss": 1.0619, + "step": 764 + }, + { + "epoch": 0.2972894199941708, + "grad_norm": 0.1961720883846283, + "learning_rate": 0.0001409108602569093, + "loss": 
0.9872, + "step": 765 + }, + { + "epoch": 0.2976780336150782, + "grad_norm": 0.21530941128730774, + "learning_rate": 0.00014083300895289996, + "loss": 1.1246, + "step": 766 + }, + { + "epoch": 0.29806664723598564, + "grad_norm": 0.2039783000946045, + "learning_rate": 0.00014075515764889064, + "loss": 1.0789, + "step": 767 + }, + { + "epoch": 0.29845526085689306, + "grad_norm": 0.20641569793224335, + "learning_rate": 0.0001406773063448813, + "loss": 1.05, + "step": 768 + }, + { + "epoch": 0.2988438744778004, + "grad_norm": 0.2071225494146347, + "learning_rate": 0.00014059945504087194, + "loss": 1.047, + "step": 769 + }, + { + "epoch": 0.29923248809870784, + "grad_norm": 0.20367531478405, + "learning_rate": 0.00014052160373686262, + "loss": 1.0734, + "step": 770 + }, + { + "epoch": 0.29962110171961526, + "grad_norm": 0.21718619763851166, + "learning_rate": 0.00014044375243285327, + "loss": 1.0613, + "step": 771 + }, + { + "epoch": 0.3000097153405227, + "grad_norm": 0.21649087965488434, + "learning_rate": 0.00014036590112884392, + "loss": 1.0671, + "step": 772 + }, + { + "epoch": 0.3003983289614301, + "grad_norm": 0.22223225235939026, + "learning_rate": 0.00014028804982483458, + "loss": 1.0977, + "step": 773 + }, + { + "epoch": 0.3007869425823375, + "grad_norm": 0.23101870715618134, + "learning_rate": 0.00014021019852082523, + "loss": 1.1236, + "step": 774 + }, + { + "epoch": 0.30117555620324493, + "grad_norm": 0.22855506837368011, + "learning_rate": 0.0001401323472168159, + "loss": 1.0517, + "step": 775 + }, + { + "epoch": 0.30156416982415235, + "grad_norm": 0.20862117409706116, + "learning_rate": 0.00014005449591280656, + "loss": 1.0493, + "step": 776 + }, + { + "epoch": 0.30195278344505977, + "grad_norm": 0.21692048013210297, + "learning_rate": 0.0001399766446087972, + "loss": 1.0681, + "step": 777 + }, + { + "epoch": 0.3023413970659672, + "grad_norm": 0.21541331708431244, + "learning_rate": 0.00013989879330478786, + "loss": 1.0775, + "step": 778 + }, + { + 
"epoch": 0.30273001068687455, + "grad_norm": 0.21221749484539032, + "learning_rate": 0.00013982094200077851, + "loss": 1.0421, + "step": 779 + }, + { + "epoch": 0.30311862430778197, + "grad_norm": 0.22497743368148804, + "learning_rate": 0.0001397430906967692, + "loss": 1.1115, + "step": 780 + }, + { + "epoch": 0.3035072379286894, + "grad_norm": 0.1974119246006012, + "learning_rate": 0.00013966523939275985, + "loss": 1.0264, + "step": 781 + }, + { + "epoch": 0.3038958515495968, + "grad_norm": 0.20349323749542236, + "learning_rate": 0.0001395873880887505, + "loss": 1.0512, + "step": 782 + }, + { + "epoch": 0.3042844651705042, + "grad_norm": 0.21116937696933746, + "learning_rate": 0.00013950953678474115, + "loss": 1.0135, + "step": 783 + }, + { + "epoch": 0.30467307879141164, + "grad_norm": 0.2133677899837494, + "learning_rate": 0.0001394316854807318, + "loss": 1.0694, + "step": 784 + }, + { + "epoch": 0.30506169241231906, + "grad_norm": 0.20406191051006317, + "learning_rate": 0.00013935383417672248, + "loss": 1.0179, + "step": 785 + }, + { + "epoch": 0.3054503060332265, + "grad_norm": 0.21428678929805756, + "learning_rate": 0.00013927598287271313, + "loss": 1.0577, + "step": 786 + }, + { + "epoch": 0.3058389196541339, + "grad_norm": 0.20878921449184418, + "learning_rate": 0.00013919813156870379, + "loss": 1.0311, + "step": 787 + }, + { + "epoch": 0.30622753327504126, + "grad_norm": 0.19033175706863403, + "learning_rate": 0.00013912028026469444, + "loss": 0.976, + "step": 788 + }, + { + "epoch": 0.3066161468959487, + "grad_norm": 0.22138020396232605, + "learning_rate": 0.0001390424289606851, + "loss": 1.0438, + "step": 789 + }, + { + "epoch": 0.3070047605168561, + "grad_norm": 0.20765596628189087, + "learning_rate": 0.00013896457765667577, + "loss": 1.0865, + "step": 790 + }, + { + "epoch": 0.3073933741377635, + "grad_norm": 0.209733247756958, + "learning_rate": 0.00013888672635266642, + "loss": 1.0648, + "step": 791 + }, + { + "epoch": 0.30778198775867094, + 
"grad_norm": 0.1896686851978302, + "learning_rate": 0.00013880887504865707, + "loss": 1.0133, + "step": 792 + }, + { + "epoch": 0.30817060137957836, + "grad_norm": 0.21651998162269592, + "learning_rate": 0.00013873102374464772, + "loss": 1.0729, + "step": 793 + }, + { + "epoch": 0.3085592150004858, + "grad_norm": 0.21751996874809265, + "learning_rate": 0.00013865317244063838, + "loss": 1.0444, + "step": 794 + }, + { + "epoch": 0.3089478286213932, + "grad_norm": 0.20593520998954773, + "learning_rate": 0.00013857532113662906, + "loss": 1.0304, + "step": 795 + }, + { + "epoch": 0.3093364422423006, + "grad_norm": 0.19937261939048767, + "learning_rate": 0.0001384974698326197, + "loss": 1.0017, + "step": 796 + }, + { + "epoch": 0.30972505586320803, + "grad_norm": 0.18901696801185608, + "learning_rate": 0.00013841961852861036, + "loss": 1.0362, + "step": 797 + }, + { + "epoch": 0.3101136694841154, + "grad_norm": 0.2079760730266571, + "learning_rate": 0.000138341767224601, + "loss": 1.0784, + "step": 798 + }, + { + "epoch": 0.3105022831050228, + "grad_norm": 0.24873265624046326, + "learning_rate": 0.00013826391592059166, + "loss": 1.1026, + "step": 799 + }, + { + "epoch": 0.31089089672593023, + "grad_norm": 0.20185396075248718, + "learning_rate": 0.00013818606461658234, + "loss": 1.0235, + "step": 800 + }, + { + "epoch": 0.31127951034683765, + "grad_norm": 0.211393803358078, + "learning_rate": 0.000138108213312573, + "loss": 1.0999, + "step": 801 + }, + { + "epoch": 0.31166812396774507, + "grad_norm": 0.19948823750019073, + "learning_rate": 0.00013803036200856365, + "loss": 1.0242, + "step": 802 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 0.21470944583415985, + "learning_rate": 0.0001379525107045543, + "loss": 1.0736, + "step": 803 + }, + { + "epoch": 0.3124453512095599, + "grad_norm": 0.2195902317762375, + "learning_rate": 0.00013787465940054495, + "loss": 1.0368, + "step": 804 + }, + { + "epoch": 0.3128339648304673, + "grad_norm": 0.22142355144023895, + 
"learning_rate": 0.00013779680809653563, + "loss": 1.1022, + "step": 805 + }, + { + "epoch": 0.31322257845137474, + "grad_norm": 0.20487886667251587, + "learning_rate": 0.00013771895679252628, + "loss": 1.0478, + "step": 806 + }, + { + "epoch": 0.3136111920722821, + "grad_norm": 0.217549130320549, + "learning_rate": 0.00013764110548851693, + "loss": 1.0526, + "step": 807 + }, + { + "epoch": 0.3139998056931895, + "grad_norm": 0.20199982821941376, + "learning_rate": 0.0001375632541845076, + "loss": 0.9992, + "step": 808 + }, + { + "epoch": 0.31438841931409695, + "grad_norm": 0.19496634602546692, + "learning_rate": 0.00013748540288049824, + "loss": 1.0179, + "step": 809 + }, + { + "epoch": 0.31477703293500436, + "grad_norm": 0.21999460458755493, + "learning_rate": 0.0001374075515764889, + "loss": 1.0547, + "step": 810 + }, + { + "epoch": 0.3151656465559118, + "grad_norm": 0.21421074867248535, + "learning_rate": 0.00013732970027247957, + "loss": 1.0283, + "step": 811 + }, + { + "epoch": 0.3155542601768192, + "grad_norm": 0.1913364827632904, + "learning_rate": 0.00013725184896847022, + "loss": 0.9826, + "step": 812 + }, + { + "epoch": 0.3159428737977266, + "grad_norm": 0.20509806275367737, + "learning_rate": 0.00013717399766446087, + "loss": 1.0303, + "step": 813 + }, + { + "epoch": 0.31633148741863404, + "grad_norm": 0.20309868454933167, + "learning_rate": 0.00013709614636045153, + "loss": 1.0479, + "step": 814 + }, + { + "epoch": 0.31672010103954146, + "grad_norm": 0.2274443656206131, + "learning_rate": 0.0001370182950564422, + "loss": 1.1311, + "step": 815 + }, + { + "epoch": 0.3171087146604489, + "grad_norm": 0.22785170376300812, + "learning_rate": 0.00013694044375243286, + "loss": 1.1009, + "step": 816 + }, + { + "epoch": 0.31749732828135624, + "grad_norm": 0.2105439007282257, + "learning_rate": 0.0001368625924484235, + "loss": 1.0251, + "step": 817 + }, + { + "epoch": 0.31788594190226366, + "grad_norm": 0.20583970844745636, + "learning_rate": 
0.00013678474114441416, + "loss": 1.0833, + "step": 818 + }, + { + "epoch": 0.3182745555231711, + "grad_norm": 0.21091191470623016, + "learning_rate": 0.00013670688984040484, + "loss": 1.071, + "step": 819 + }, + { + "epoch": 0.3186631691440785, + "grad_norm": 0.20645928382873535, + "learning_rate": 0.0001366290385363955, + "loss": 1.0605, + "step": 820 + }, + { + "epoch": 0.3190517827649859, + "grad_norm": 0.1990513950586319, + "learning_rate": 0.00013655118723238614, + "loss": 1.0461, + "step": 821 + }, + { + "epoch": 0.31944039638589333, + "grad_norm": 0.2192249745130539, + "learning_rate": 0.00013647333592837682, + "loss": 1.0975, + "step": 822 + }, + { + "epoch": 0.31982901000680075, + "grad_norm": 0.2157617211341858, + "learning_rate": 0.00013639548462436748, + "loss": 1.091, + "step": 823 + }, + { + "epoch": 0.32021762362770817, + "grad_norm": 0.21964526176452637, + "learning_rate": 0.00013631763332035813, + "loss": 1.0286, + "step": 824 + }, + { + "epoch": 0.3206062372486156, + "grad_norm": 0.2079797089099884, + "learning_rate": 0.00013623978201634878, + "loss": 1.0257, + "step": 825 + }, + { + "epoch": 0.32099485086952295, + "grad_norm": 0.21220168471336365, + "learning_rate": 0.00013616193071233946, + "loss": 1.0046, + "step": 826 + }, + { + "epoch": 0.32138346449043037, + "grad_norm": 0.2885231673717499, + "learning_rate": 0.0001360840794083301, + "loss": 1.1442, + "step": 827 + }, + { + "epoch": 0.3217720781113378, + "grad_norm": 0.2096511274576187, + "learning_rate": 0.00013600622810432076, + "loss": 1.0209, + "step": 828 + }, + { + "epoch": 0.3221606917322452, + "grad_norm": 0.2179451286792755, + "learning_rate": 0.00013592837680031142, + "loss": 1.0548, + "step": 829 + }, + { + "epoch": 0.3225493053531526, + "grad_norm": 0.2096329927444458, + "learning_rate": 0.00013585052549630207, + "loss": 1.0279, + "step": 830 + }, + { + "epoch": 0.32293791897406005, + "grad_norm": 0.22531811892986298, + "learning_rate": 0.00013577267419229275, + "loss": 1.0463, 
+ "step": 831 + }, + { + "epoch": 0.32332653259496746, + "grad_norm": 0.22516901791095734, + "learning_rate": 0.0001356948228882834, + "loss": 1.1127, + "step": 832 + }, + { + "epoch": 0.3237151462158749, + "grad_norm": 0.22487780451774597, + "learning_rate": 0.00013561697158427405, + "loss": 1.0707, + "step": 833 + }, + { + "epoch": 0.3241037598367823, + "grad_norm": 0.20976543426513672, + "learning_rate": 0.0001355391202802647, + "loss": 1.0217, + "step": 834 + }, + { + "epoch": 0.32449237345768966, + "grad_norm": 0.19849295914173126, + "learning_rate": 0.00013546126897625535, + "loss": 1.021, + "step": 835 + }, + { + "epoch": 0.3248809870785971, + "grad_norm": 0.21772268414497375, + "learning_rate": 0.00013538341767224603, + "loss": 1.0605, + "step": 836 + }, + { + "epoch": 0.3252696006995045, + "grad_norm": 0.19670265913009644, + "learning_rate": 0.00013530556636823669, + "loss": 1.0165, + "step": 837 + }, + { + "epoch": 0.3256582143204119, + "grad_norm": 0.19339734315872192, + "learning_rate": 0.00013522771506422734, + "loss": 1.0203, + "step": 838 + }, + { + "epoch": 0.32604682794131934, + "grad_norm": 0.21289557218551636, + "learning_rate": 0.000135149863760218, + "loss": 1.0252, + "step": 839 + }, + { + "epoch": 0.32643544156222676, + "grad_norm": 0.1964789777994156, + "learning_rate": 0.00013507201245620864, + "loss": 1.0392, + "step": 840 + }, + { + "epoch": 0.3268240551831342, + "grad_norm": 0.20783716440200806, + "learning_rate": 0.00013499416115219932, + "loss": 1.0569, + "step": 841 + }, + { + "epoch": 0.3272126688040416, + "grad_norm": 0.22782161831855774, + "learning_rate": 0.00013491630984818997, + "loss": 1.0555, + "step": 842 + }, + { + "epoch": 0.327601282424949, + "grad_norm": 0.22771142423152924, + "learning_rate": 0.00013483845854418063, + "loss": 1.085, + "step": 843 + }, + { + "epoch": 0.32798989604585643, + "grad_norm": 0.19773711264133453, + "learning_rate": 0.00013476060724017128, + "loss": 1.008, + "step": 844 + }, + { + "epoch": 
0.3283785096667638, + "grad_norm": 0.22399166226387024, + "learning_rate": 0.00013468275593616193, + "loss": 1.0511, + "step": 845 + }, + { + "epoch": 0.3287671232876712, + "grad_norm": 0.20488236844539642, + "learning_rate": 0.00013460490463215258, + "loss": 1.0883, + "step": 846 + }, + { + "epoch": 0.32915573690857863, + "grad_norm": 0.21387654542922974, + "learning_rate": 0.00013452705332814326, + "loss": 1.0808, + "step": 847 + }, + { + "epoch": 0.32954435052948605, + "grad_norm": 0.1972568780183792, + "learning_rate": 0.0001344492020241339, + "loss": 1.0555, + "step": 848 + }, + { + "epoch": 0.32993296415039347, + "grad_norm": 0.20835663378238678, + "learning_rate": 0.00013437135072012456, + "loss": 1.0473, + "step": 849 + }, + { + "epoch": 0.3303215777713009, + "grad_norm": 0.19707520306110382, + "learning_rate": 0.00013429349941611522, + "loss": 0.9585, + "step": 850 + }, + { + "epoch": 0.3307101913922083, + "grad_norm": 0.19163411855697632, + "learning_rate": 0.00013421564811210587, + "loss": 1.0025, + "step": 851 + }, + { + "epoch": 0.3310988050131157, + "grad_norm": 0.19730083644390106, + "learning_rate": 0.00013413779680809655, + "loss": 1.0696, + "step": 852 + }, + { + "epoch": 0.33148741863402315, + "grad_norm": 0.19537493586540222, + "learning_rate": 0.0001340599455040872, + "loss": 1.0466, + "step": 853 + }, + { + "epoch": 0.3318760322549305, + "grad_norm": 0.2255164235830307, + "learning_rate": 0.00013398209420007785, + "loss": 1.0659, + "step": 854 + }, + { + "epoch": 0.3322646458758379, + "grad_norm": 0.19774770736694336, + "learning_rate": 0.0001339042428960685, + "loss": 1.0326, + "step": 855 + }, + { + "epoch": 0.33265325949674535, + "grad_norm": 0.2004510909318924, + "learning_rate": 0.00013382639159205916, + "loss": 1.0327, + "step": 856 + }, + { + "epoch": 0.33304187311765276, + "grad_norm": 0.19187591969966888, + "learning_rate": 0.00013374854028804984, + "loss": 1.0069, + "step": 857 + }, + { + "epoch": 0.3334304867385602, + "grad_norm": 
0.18775832653045654, + "learning_rate": 0.0001336706889840405, + "loss": 1.0083, + "step": 858 + }, + { + "epoch": 0.3338191003594676, + "grad_norm": 0.2005717158317566, + "learning_rate": 0.00013359283768003114, + "loss": 1.0398, + "step": 859 + }, + { + "epoch": 0.334207713980375, + "grad_norm": 0.19705893099308014, + "learning_rate": 0.0001335149863760218, + "loss": 1.0031, + "step": 860 + }, + { + "epoch": 0.33459632760128244, + "grad_norm": 0.19589562714099884, + "learning_rate": 0.00013343713507201244, + "loss": 0.9831, + "step": 861 + }, + { + "epoch": 0.33498494122218986, + "grad_norm": 0.19302591681480408, + "learning_rate": 0.00013335928376800312, + "loss": 1.0009, + "step": 862 + }, + { + "epoch": 0.3353735548430973, + "grad_norm": 0.20499618351459503, + "learning_rate": 0.00013328143246399377, + "loss": 1.0205, + "step": 863 + }, + { + "epoch": 0.33576216846400464, + "grad_norm": 0.20514456927776337, + "learning_rate": 0.00013320358115998443, + "loss": 1.0837, + "step": 864 + }, + { + "epoch": 0.33615078208491206, + "grad_norm": 0.19285848736763, + "learning_rate": 0.00013312572985597508, + "loss": 1.0167, + "step": 865 + }, + { + "epoch": 0.3365393957058195, + "grad_norm": 0.20891553163528442, + "learning_rate": 0.00013304787855196573, + "loss": 1.0127, + "step": 866 + }, + { + "epoch": 0.3369280093267269, + "grad_norm": 0.20511706173419952, + "learning_rate": 0.0001329700272479564, + "loss": 0.964, + "step": 867 + }, + { + "epoch": 0.3373166229476343, + "grad_norm": 0.1855512261390686, + "learning_rate": 0.00013289217594394706, + "loss": 0.9721, + "step": 868 + }, + { + "epoch": 0.33770523656854173, + "grad_norm": 0.20010098814964294, + "learning_rate": 0.00013281432463993771, + "loss": 1.0411, + "step": 869 + }, + { + "epoch": 0.33809385018944915, + "grad_norm": 0.1991325318813324, + "learning_rate": 0.0001327364733359284, + "loss": 0.9658, + "step": 870 + }, + { + "epoch": 0.33848246381035657, + "grad_norm": 0.19895736873149872, + "learning_rate": 
0.00013265862203191905, + "loss": 1.0744, + "step": 871 + }, + { + "epoch": 0.338871077431264, + "grad_norm": 0.2091255635023117, + "learning_rate": 0.0001325807707279097, + "loss": 1.0375, + "step": 872 + }, + { + "epoch": 0.33925969105217135, + "grad_norm": 0.21355532109737396, + "learning_rate": 0.00013250291942390035, + "loss": 1.09, + "step": 873 + }, + { + "epoch": 0.33964830467307877, + "grad_norm": 0.21844851970672607, + "learning_rate": 0.00013242506811989103, + "loss": 1.0769, + "step": 874 + }, + { + "epoch": 0.3400369182939862, + "grad_norm": 0.1877543330192566, + "learning_rate": 0.00013234721681588168, + "loss": 1.0199, + "step": 875 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.2020038366317749, + "learning_rate": 0.00013226936551187233, + "loss": 1.0218, + "step": 876 + }, + { + "epoch": 0.340814145535801, + "grad_norm": 0.20682141184806824, + "learning_rate": 0.000132191514207863, + "loss": 1.0891, + "step": 877 + }, + { + "epoch": 0.34120275915670845, + "grad_norm": 0.21942824125289917, + "learning_rate": 0.00013211366290385366, + "loss": 0.9877, + "step": 878 + }, + { + "epoch": 0.34159137277761586, + "grad_norm": 0.21150313317775726, + "learning_rate": 0.00013203581159984432, + "loss": 1.0815, + "step": 879 + }, + { + "epoch": 0.3419799863985233, + "grad_norm": 0.2073293924331665, + "learning_rate": 0.00013195796029583497, + "loss": 1.0579, + "step": 880 + }, + { + "epoch": 0.3423686000194307, + "grad_norm": 0.221574068069458, + "learning_rate": 0.00013188010899182562, + "loss": 1.0279, + "step": 881 + }, + { + "epoch": 0.3427572136403381, + "grad_norm": 0.22334492206573486, + "learning_rate": 0.00013180225768781627, + "loss": 1.0837, + "step": 882 + }, + { + "epoch": 0.3431458272612455, + "grad_norm": 0.18817654252052307, + "learning_rate": 0.00013172440638380695, + "loss": 1.0262, + "step": 883 + }, + { + "epoch": 0.3435344408821529, + "grad_norm": 0.20126822590827942, + "learning_rate": 0.0001316465550797976, + "loss": 1.0679, + 
"step": 884 + }, + { + "epoch": 0.3439230545030603, + "grad_norm": 0.2128864973783493, + "learning_rate": 0.00013156870377578825, + "loss": 1.0316, + "step": 885 + }, + { + "epoch": 0.34431166812396774, + "grad_norm": 0.20054499804973602, + "learning_rate": 0.0001314908524717789, + "loss": 1.0024, + "step": 886 + }, + { + "epoch": 0.34470028174487516, + "grad_norm": 0.21358034014701843, + "learning_rate": 0.00013141300116776956, + "loss": 1.0475, + "step": 887 + }, + { + "epoch": 0.3450888953657826, + "grad_norm": 0.21377703547477722, + "learning_rate": 0.00013133514986376024, + "loss": 1.0957, + "step": 888 + }, + { + "epoch": 0.34547750898669, + "grad_norm": 0.20166514813899994, + "learning_rate": 0.0001312572985597509, + "loss": 1.0189, + "step": 889 + }, + { + "epoch": 0.3458661226075974, + "grad_norm": 0.20424878597259521, + "learning_rate": 0.00013117944725574154, + "loss": 1.0896, + "step": 890 + }, + { + "epoch": 0.34625473622850483, + "grad_norm": 0.19028648734092712, + "learning_rate": 0.0001311015959517322, + "loss": 0.9881, + "step": 891 + }, + { + "epoch": 0.3466433498494122, + "grad_norm": 0.20828665792942047, + "learning_rate": 0.00013102374464772285, + "loss": 0.9932, + "step": 892 + }, + { + "epoch": 0.3470319634703196, + "grad_norm": 0.20756572484970093, + "learning_rate": 0.00013094589334371353, + "loss": 1.0406, + "step": 893 + }, + { + "epoch": 0.34742057709122703, + "grad_norm": 0.20768921077251434, + "learning_rate": 0.00013086804203970418, + "loss": 0.9652, + "step": 894 + }, + { + "epoch": 0.34780919071213445, + "grad_norm": 0.20660027861595154, + "learning_rate": 0.00013079019073569483, + "loss": 1.0728, + "step": 895 + }, + { + "epoch": 0.34819780433304187, + "grad_norm": 0.20186837017536163, + "learning_rate": 0.00013071233943168548, + "loss": 1.0407, + "step": 896 + }, + { + "epoch": 0.3485864179539493, + "grad_norm": 0.20880667865276337, + "learning_rate": 0.00013063448812767613, + "loss": 1.0275, + "step": 897 + }, + { + "epoch": 
0.3489750315748567, + "grad_norm": 0.22212949395179749, + "learning_rate": 0.0001305566368236668, + "loss": 1.0293, + "step": 898 + }, + { + "epoch": 0.3493636451957641, + "grad_norm": 0.20552745461463928, + "learning_rate": 0.00013047878551965746, + "loss": 1.0434, + "step": 899 + }, + { + "epoch": 0.34975225881667155, + "grad_norm": 0.21239839494228363, + "learning_rate": 0.00013040093421564812, + "loss": 1.052, + "step": 900 + }, + { + "epoch": 0.3501408724375789, + "grad_norm": 0.22420544922351837, + "learning_rate": 0.00013032308291163877, + "loss": 1.0236, + "step": 901 + }, + { + "epoch": 0.35052948605848633, + "grad_norm": 0.23435090482234955, + "learning_rate": 0.00013024523160762942, + "loss": 1.0876, + "step": 902 + }, + { + "epoch": 0.35091809967939375, + "grad_norm": 0.22763386368751526, + "learning_rate": 0.0001301673803036201, + "loss": 1.0636, + "step": 903 + }, + { + "epoch": 0.35130671330030117, + "grad_norm": 0.20948883891105652, + "learning_rate": 0.00013008952899961075, + "loss": 1.0083, + "step": 904 + }, + { + "epoch": 0.3516953269212086, + "grad_norm": 0.20408779382705688, + "learning_rate": 0.0001300116776956014, + "loss": 1.039, + "step": 905 + }, + { + "epoch": 0.352083940542116, + "grad_norm": 0.2126050591468811, + "learning_rate": 0.00012993382639159206, + "loss": 1.0365, + "step": 906 + }, + { + "epoch": 0.3524725541630234, + "grad_norm": 0.20314334332942963, + "learning_rate": 0.0001298559750875827, + "loss": 1.0474, + "step": 907 + }, + { + "epoch": 0.35286116778393084, + "grad_norm": 0.23720984160900116, + "learning_rate": 0.0001297781237835734, + "loss": 1.0529, + "step": 908 + }, + { + "epoch": 0.35324978140483826, + "grad_norm": 0.22642800211906433, + "learning_rate": 0.00012970027247956404, + "loss": 1.0586, + "step": 909 + }, + { + "epoch": 0.3536383950257457, + "grad_norm": 0.20469972491264343, + "learning_rate": 0.0001296224211755547, + "loss": 1.0267, + "step": 910 + }, + { + "epoch": 0.35402700864665304, + "grad_norm": 
0.197368785738945, + "learning_rate": 0.00012954456987154534, + "loss": 1.0348, + "step": 911 + }, + { + "epoch": 0.35441562226756046, + "grad_norm": 0.21924498677253723, + "learning_rate": 0.000129466718567536, + "loss": 1.0861, + "step": 912 + }, + { + "epoch": 0.3548042358884679, + "grad_norm": 0.22006285190582275, + "learning_rate": 0.00012938886726352667, + "loss": 1.0545, + "step": 913 + }, + { + "epoch": 0.3551928495093753, + "grad_norm": 0.22419220209121704, + "learning_rate": 0.00012931101595951733, + "loss": 1.0716, + "step": 914 + }, + { + "epoch": 0.3555814631302827, + "grad_norm": 0.215990349650383, + "learning_rate": 0.00012923316465550798, + "loss": 1.0619, + "step": 915 + }, + { + "epoch": 0.35597007675119013, + "grad_norm": 0.20783264935016632, + "learning_rate": 0.00012915531335149863, + "loss": 1.0412, + "step": 916 + }, + { + "epoch": 0.35635869037209755, + "grad_norm": 0.24584618210792542, + "learning_rate": 0.00012907746204748928, + "loss": 1.1165, + "step": 917 + }, + { + "epoch": 0.35674730399300497, + "grad_norm": 0.23146122694015503, + "learning_rate": 0.00012899961074347996, + "loss": 1.1111, + "step": 918 + }, + { + "epoch": 0.3571359176139124, + "grad_norm": 0.19983729720115662, + "learning_rate": 0.00012892175943947061, + "loss": 1.0674, + "step": 919 + }, + { + "epoch": 0.35752453123481975, + "grad_norm": 0.2161000818014145, + "learning_rate": 0.00012884390813546127, + "loss": 1.076, + "step": 920 + }, + { + "epoch": 0.35791314485572717, + "grad_norm": 0.21042793989181519, + "learning_rate": 0.00012876605683145192, + "loss": 1.0535, + "step": 921 + }, + { + "epoch": 0.3583017584766346, + "grad_norm": 0.20135439932346344, + "learning_rate": 0.0001286882055274426, + "loss": 1.0059, + "step": 922 + }, + { + "epoch": 0.358690372097542, + "grad_norm": 0.19394971430301666, + "learning_rate": 0.00012861035422343325, + "loss": 1.0381, + "step": 923 + }, + { + "epoch": 0.35907898571844943, + "grad_norm": 0.21171030402183533, + "learning_rate": 
0.0001285325029194239, + "loss": 1.0513, + "step": 924 + }, + { + "epoch": 0.35946759933935685, + "grad_norm": 0.19476690888404846, + "learning_rate": 0.00012845465161541458, + "loss": 1.0003, + "step": 925 + }, + { + "epoch": 0.35985621296026427, + "grad_norm": 0.20468670129776, + "learning_rate": 0.00012837680031140523, + "loss": 1.0608, + "step": 926 + }, + { + "epoch": 0.3602448265811717, + "grad_norm": 0.21159446239471436, + "learning_rate": 0.00012829894900739588, + "loss": 1.0734, + "step": 927 + }, + { + "epoch": 0.3606334402020791, + "grad_norm": 0.21179519593715668, + "learning_rate": 0.00012822109770338654, + "loss": 1.0957, + "step": 928 + }, + { + "epoch": 0.3610220538229865, + "grad_norm": 0.20997527241706848, + "learning_rate": 0.00012814324639937722, + "loss": 1.0644, + "step": 929 + }, + { + "epoch": 0.3614106674438939, + "grad_norm": 0.21178296208381653, + "learning_rate": 0.00012806539509536787, + "loss": 1.0208, + "step": 930 + }, + { + "epoch": 0.3617992810648013, + "grad_norm": 0.20890356600284576, + "learning_rate": 0.00012798754379135852, + "loss": 1.0888, + "step": 931 + }, + { + "epoch": 0.3621878946857087, + "grad_norm": 0.20177409052848816, + "learning_rate": 0.00012790969248734917, + "loss": 0.9741, + "step": 932 + }, + { + "epoch": 0.36257650830661614, + "grad_norm": 0.23504556715488434, + "learning_rate": 0.00012783184118333982, + "loss": 1.1048, + "step": 933 + }, + { + "epoch": 0.36296512192752356, + "grad_norm": 0.22829356789588928, + "learning_rate": 0.0001277539898793305, + "loss": 1.0798, + "step": 934 + }, + { + "epoch": 0.363353735548431, + "grad_norm": 0.2068483531475067, + "learning_rate": 0.00012767613857532116, + "loss": 1.0452, + "step": 935 + }, + { + "epoch": 0.3637423491693384, + "grad_norm": 0.2093171775341034, + "learning_rate": 0.0001275982872713118, + "loss": 1.0742, + "step": 936 + }, + { + "epoch": 0.3641309627902458, + "grad_norm": 0.21478736400604248, + "learning_rate": 0.00012752043596730246, + "loss": 1.0572, 
+ "step": 937 + }, + { + "epoch": 0.36451957641115323, + "grad_norm": 0.1906953752040863, + "learning_rate": 0.0001274425846632931, + "loss": 1.0107, + "step": 938 + }, + { + "epoch": 0.3649081900320606, + "grad_norm": 0.20580604672431946, + "learning_rate": 0.0001273647333592838, + "loss": 1.0677, + "step": 939 + }, + { + "epoch": 0.365296803652968, + "grad_norm": 0.22586850821971893, + "learning_rate": 0.00012728688205527444, + "loss": 1.0389, + "step": 940 + }, + { + "epoch": 0.36568541727387543, + "grad_norm": 0.199899360537529, + "learning_rate": 0.0001272090307512651, + "loss": 1.0462, + "step": 941 + }, + { + "epoch": 0.36607403089478285, + "grad_norm": 0.19881689548492432, + "learning_rate": 0.00012713117944725575, + "loss": 1.0565, + "step": 942 + }, + { + "epoch": 0.3664626445156903, + "grad_norm": 0.21748925745487213, + "learning_rate": 0.0001270533281432464, + "loss": 1.0659, + "step": 943 + }, + { + "epoch": 0.3668512581365977, + "grad_norm": 0.19363689422607422, + "learning_rate": 0.00012697547683923708, + "loss": 1.0307, + "step": 944 + }, + { + "epoch": 0.3672398717575051, + "grad_norm": 0.21701784431934357, + "learning_rate": 0.00012689762553522773, + "loss": 1.0684, + "step": 945 + }, + { + "epoch": 0.36762848537841253, + "grad_norm": 0.21406958997249603, + "learning_rate": 0.00012681977423121838, + "loss": 1.0703, + "step": 946 + }, + { + "epoch": 0.36801709899931995, + "grad_norm": 0.23539729416370392, + "learning_rate": 0.00012674192292720903, + "loss": 1.1537, + "step": 947 + }, + { + "epoch": 0.36840571262022737, + "grad_norm": 0.2177354395389557, + "learning_rate": 0.00012666407162319969, + "loss": 1.0131, + "step": 948 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 0.255346417427063, + "learning_rate": 0.00012658622031919037, + "loss": 0.9807, + "step": 949 + }, + { + "epoch": 0.36918293986204215, + "grad_norm": 0.2139921486377716, + "learning_rate": 0.00012650836901518102, + "loss": 1.0392, + "step": 950 + }, + { + "epoch": 
0.36957155348294957, + "grad_norm": 0.22490833699703217, + "learning_rate": 0.00012643051771117167, + "loss": 1.0512, + "step": 951 + }, + { + "epoch": 0.369960167103857, + "grad_norm": 0.20698820054531097, + "learning_rate": 0.00012635266640716232, + "loss": 1.0391, + "step": 952 + }, + { + "epoch": 0.3703487807247644, + "grad_norm": 0.2276201844215393, + "learning_rate": 0.00012627481510315297, + "loss": 1.0513, + "step": 953 + }, + { + "epoch": 0.3707373943456718, + "grad_norm": 0.2493600994348526, + "learning_rate": 0.00012619696379914365, + "loss": 1.0136, + "step": 954 + }, + { + "epoch": 0.37112600796657924, + "grad_norm": 0.2155001014471054, + "learning_rate": 0.0001261191124951343, + "loss": 1.0523, + "step": 955 + }, + { + "epoch": 0.37151462158748666, + "grad_norm": 0.21571211516857147, + "learning_rate": 0.00012604126119112496, + "loss": 1.0288, + "step": 956 + }, + { + "epoch": 0.3719032352083941, + "grad_norm": 0.23238877952098846, + "learning_rate": 0.0001259634098871156, + "loss": 1.0638, + "step": 957 + }, + { + "epoch": 0.37229184882930144, + "grad_norm": 0.2002813220024109, + "learning_rate": 0.00012588555858310626, + "loss": 0.9665, + "step": 958 + }, + { + "epoch": 0.37268046245020886, + "grad_norm": 0.21712858974933624, + "learning_rate": 0.0001258077072790969, + "loss": 1.0469, + "step": 959 + }, + { + "epoch": 0.3730690760711163, + "grad_norm": 0.2178192287683487, + "learning_rate": 0.0001257298559750876, + "loss": 1.0267, + "step": 960 + }, + { + "epoch": 0.3734576896920237, + "grad_norm": 0.25488024950027466, + "learning_rate": 0.00012565200467107824, + "loss": 1.0153, + "step": 961 + }, + { + "epoch": 0.3738463033129311, + "grad_norm": 0.20070038735866547, + "learning_rate": 0.0001255741533670689, + "loss": 1.0279, + "step": 962 + }, + { + "epoch": 0.37423491693383854, + "grad_norm": 0.21885356307029724, + "learning_rate": 0.00012549630206305955, + "loss": 1.0395, + "step": 963 + }, + { + "epoch": 0.37462353055474595, + "grad_norm": 
0.2407921701669693, + "learning_rate": 0.0001254184507590502, + "loss": 1.0767, + "step": 964 + }, + { + "epoch": 0.3750121441756534, + "grad_norm": 0.20645053684711456, + "learning_rate": 0.00012534059945504088, + "loss": 1.0318, + "step": 965 + }, + { + "epoch": 0.3754007577965608, + "grad_norm": 0.21275092661380768, + "learning_rate": 0.00012526274815103153, + "loss": 1.0546, + "step": 966 + }, + { + "epoch": 0.3757893714174682, + "grad_norm": 0.21574917435646057, + "learning_rate": 0.00012518489684702218, + "loss": 1.032, + "step": 967 + }, + { + "epoch": 0.3761779850383756, + "grad_norm": 0.21589480340480804, + "learning_rate": 0.00012510704554301284, + "loss": 1.0834, + "step": 968 + }, + { + "epoch": 0.376566598659283, + "grad_norm": 0.19576796889305115, + "learning_rate": 0.0001250291942390035, + "loss": 1.0178, + "step": 969 + }, + { + "epoch": 0.3769552122801904, + "grad_norm": 0.20941287279129028, + "learning_rate": 0.00012495134293499417, + "loss": 1.0712, + "step": 970 + }, + { + "epoch": 0.37734382590109783, + "grad_norm": 0.22585494816303253, + "learning_rate": 0.00012487349163098482, + "loss": 1.0401, + "step": 971 + }, + { + "epoch": 0.37773243952200525, + "grad_norm": 0.21093420684337616, + "learning_rate": 0.00012479564032697547, + "loss": 1.0569, + "step": 972 + }, + { + "epoch": 0.37812105314291267, + "grad_norm": 0.22375014424324036, + "learning_rate": 0.00012471778902296612, + "loss": 1.0687, + "step": 973 + }, + { + "epoch": 0.3785096667638201, + "grad_norm": 0.19787487387657166, + "learning_rate": 0.0001246399377189568, + "loss": 1.0266, + "step": 974 + }, + { + "epoch": 0.3788982803847275, + "grad_norm": 0.20633013546466827, + "learning_rate": 0.00012456208641494745, + "loss": 0.9996, + "step": 975 + }, + { + "epoch": 0.3792868940056349, + "grad_norm": 0.21559873223304749, + "learning_rate": 0.0001244842351109381, + "loss": 1.0851, + "step": 976 + }, + { + "epoch": 0.3796755076265423, + "grad_norm": 0.2166333943605423, + "learning_rate": 
0.00012440638380692879, + "loss": 1.0859, + "step": 977 + }, + { + "epoch": 0.3800641212474497, + "grad_norm": 0.18558773398399353, + "learning_rate": 0.00012432853250291944, + "loss": 0.9534, + "step": 978 + }, + { + "epoch": 0.3804527348683571, + "grad_norm": 0.2086942344903946, + "learning_rate": 0.0001242506811989101, + "loss": 1.0786, + "step": 979 + }, + { + "epoch": 0.38084134848926454, + "grad_norm": 0.2207823544740677, + "learning_rate": 0.00012417282989490074, + "loss": 1.0626, + "step": 980 + }, + { + "epoch": 0.38122996211017196, + "grad_norm": 0.21255749464035034, + "learning_rate": 0.00012409497859089142, + "loss": 1.063, + "step": 981 + }, + { + "epoch": 0.3816185757310794, + "grad_norm": 0.20682042837142944, + "learning_rate": 0.00012401712728688207, + "loss": 1.034, + "step": 982 + }, + { + "epoch": 0.3820071893519868, + "grad_norm": 0.2084134966135025, + "learning_rate": 0.00012393927598287272, + "loss": 1.0481, + "step": 983 + }, + { + "epoch": 0.3823958029728942, + "grad_norm": 0.1922312080860138, + "learning_rate": 0.00012386142467886338, + "loss": 1.0461, + "step": 984 + }, + { + "epoch": 0.38278441659380164, + "grad_norm": 0.20893707871437073, + "learning_rate": 0.00012378357337485406, + "loss": 1.0797, + "step": 985 + }, + { + "epoch": 0.383173030214709, + "grad_norm": 0.19717541337013245, + "learning_rate": 0.0001237057220708447, + "loss": 1.0028, + "step": 986 + }, + { + "epoch": 0.3835616438356164, + "grad_norm": 0.20688053965568542, + "learning_rate": 0.00012362787076683536, + "loss": 0.989, + "step": 987 + }, + { + "epoch": 0.38395025745652384, + "grad_norm": 0.20580583810806274, + "learning_rate": 0.000123550019462826, + "loss": 1.06, + "step": 988 + }, + { + "epoch": 0.38433887107743125, + "grad_norm": 0.2151709794998169, + "learning_rate": 0.00012347216815881666, + "loss": 1.0685, + "step": 989 + }, + { + "epoch": 0.3847274846983387, + "grad_norm": 0.19573980569839478, + "learning_rate": 0.00012339431685480734, + "loss": 1.0072, + 
"step": 990 + }, + { + "epoch": 0.3851160983192461, + "grad_norm": 0.1949119120836258, + "learning_rate": 0.000123316465550798, + "loss": 0.9995, + "step": 991 + }, + { + "epoch": 0.3855047119401535, + "grad_norm": 0.2062375247478485, + "learning_rate": 0.00012323861424678865, + "loss": 1.0694, + "step": 992 + }, + { + "epoch": 0.38589332556106093, + "grad_norm": 0.2007209211587906, + "learning_rate": 0.0001231607629427793, + "loss": 1.0397, + "step": 993 + }, + { + "epoch": 0.38628193918196835, + "grad_norm": 0.2231544405221939, + "learning_rate": 0.00012308291163876995, + "loss": 1.0755, + "step": 994 + }, + { + "epoch": 0.38667055280287577, + "grad_norm": 0.2103337049484253, + "learning_rate": 0.0001230050603347606, + "loss": 1.0505, + "step": 995 + }, + { + "epoch": 0.38705916642378313, + "grad_norm": 0.20178386569023132, + "learning_rate": 0.00012292720903075128, + "loss": 1.0696, + "step": 996 + }, + { + "epoch": 0.38744778004469055, + "grad_norm": 0.21268007159233093, + "learning_rate": 0.00012284935772674193, + "loss": 1.0262, + "step": 997 + }, + { + "epoch": 0.38783639366559797, + "grad_norm": 0.21439722180366516, + "learning_rate": 0.0001227715064227326, + "loss": 1.0718, + "step": 998 + }, + { + "epoch": 0.3882250072865054, + "grad_norm": 0.19691336154937744, + "learning_rate": 0.00012269365511872324, + "loss": 0.9663, + "step": 999 + }, + { + "epoch": 0.3886136209074128, + "grad_norm": 0.2165926694869995, + "learning_rate": 0.0001226158038147139, + "loss": 1.0432, + "step": 1000 + }, + { + "epoch": 0.3890022345283202, + "grad_norm": 0.20730604231357574, + "learning_rate": 0.00012253795251070457, + "loss": 1.0386, + "step": 1001 + }, + { + "epoch": 0.38939084814922764, + "grad_norm": 0.2138068974018097, + "learning_rate": 0.00012246010120669522, + "loss": 1.0683, + "step": 1002 + }, + { + "epoch": 0.38977946177013506, + "grad_norm": 0.2118951678276062, + "learning_rate": 0.00012238224990268587, + "loss": 1.0393, + "step": 1003 + }, + { + "epoch": 
0.3901680753910425, + "grad_norm": 0.20879961550235748, + "learning_rate": 0.00012230439859867653, + "loss": 1.0349, + "step": 1004 + }, + { + "epoch": 0.39055668901194984, + "grad_norm": 0.19588464498519897, + "learning_rate": 0.00012222654729466718, + "loss": 1.0226, + "step": 1005 + }, + { + "epoch": 0.39094530263285726, + "grad_norm": 0.2059485912322998, + "learning_rate": 0.00012214869599065786, + "loss": 1.052, + "step": 1006 + }, + { + "epoch": 0.3913339162537647, + "grad_norm": 0.2299761176109314, + "learning_rate": 0.0001220708446866485, + "loss": 1.1055, + "step": 1007 + }, + { + "epoch": 0.3917225298746721, + "grad_norm": 0.20196737349033356, + "learning_rate": 0.00012199299338263916, + "loss": 1.0497, + "step": 1008 + }, + { + "epoch": 0.3921111434955795, + "grad_norm": 0.20615293085575104, + "learning_rate": 0.00012191514207862981, + "loss": 1.047, + "step": 1009 + }, + { + "epoch": 0.39249975711648694, + "grad_norm": 0.20265278220176697, + "learning_rate": 0.00012183729077462047, + "loss": 1.0035, + "step": 1010 + }, + { + "epoch": 0.39288837073739435, + "grad_norm": 0.20197926461696625, + "learning_rate": 0.00012175943947061114, + "loss": 0.9847, + "step": 1011 + }, + { + "epoch": 0.3932769843583018, + "grad_norm": 0.19974152743816376, + "learning_rate": 0.0001216815881666018, + "loss": 1.0669, + "step": 1012 + }, + { + "epoch": 0.3936655979792092, + "grad_norm": 0.21684005856513977, + "learning_rate": 0.00012160373686259245, + "loss": 1.0562, + "step": 1013 + }, + { + "epoch": 0.3940542116001166, + "grad_norm": 0.2030404955148697, + "learning_rate": 0.00012152588555858311, + "loss": 1.0159, + "step": 1014 + }, + { + "epoch": 0.394442825221024, + "grad_norm": 0.2123572677373886, + "learning_rate": 0.00012144803425457377, + "loss": 1.0757, + "step": 1015 + }, + { + "epoch": 0.3948314388419314, + "grad_norm": 0.20320011675357819, + "learning_rate": 0.00012137018295056443, + "loss": 1.038, + "step": 1016 + }, + { + "epoch": 0.3952200524628388, + 
"grad_norm": 0.20120739936828613, + "learning_rate": 0.00012129233164655508, + "loss": 1.1015, + "step": 1017 + }, + { + "epoch": 0.39560866608374623, + "grad_norm": 0.19862449169158936, + "learning_rate": 0.00012121448034254575, + "loss": 1.0328, + "step": 1018 + }, + { + "epoch": 0.39599727970465365, + "grad_norm": 0.19761312007904053, + "learning_rate": 0.0001211366290385364, + "loss": 0.997, + "step": 1019 + }, + { + "epoch": 0.39638589332556107, + "grad_norm": 0.1943569928407669, + "learning_rate": 0.00012105877773452705, + "loss": 1.0099, + "step": 1020 + }, + { + "epoch": 0.3967745069464685, + "grad_norm": 0.2109062373638153, + "learning_rate": 0.00012098092643051773, + "loss": 1.1039, + "step": 1021 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 0.20966266095638275, + "learning_rate": 0.00012090307512650839, + "loss": 1.1208, + "step": 1022 + }, + { + "epoch": 0.3975517341882833, + "grad_norm": 0.19208088517189026, + "learning_rate": 0.00012082522382249904, + "loss": 1.0147, + "step": 1023 + }, + { + "epoch": 0.3979403478091907, + "grad_norm": 0.21821236610412598, + "learning_rate": 0.00012074737251848969, + "loss": 1.0615, + "step": 1024 + }, + { + "epoch": 0.3983289614300981, + "grad_norm": 0.20031368732452393, + "learning_rate": 0.00012066952121448034, + "loss": 1.0303, + "step": 1025 + }, + { + "epoch": 0.3987175750510055, + "grad_norm": 0.22910597920417786, + "learning_rate": 0.00012059166991047102, + "loss": 1.0182, + "step": 1026 + }, + { + "epoch": 0.39910618867191294, + "grad_norm": 0.20816978812217712, + "learning_rate": 0.00012051381860646167, + "loss": 1.0142, + "step": 1027 + }, + { + "epoch": 0.39949480229282036, + "grad_norm": 0.20989780128002167, + "learning_rate": 0.00012043596730245232, + "loss": 1.0676, + "step": 1028 + }, + { + "epoch": 0.3998834159137278, + "grad_norm": 0.21894055604934692, + "learning_rate": 0.00012035811599844298, + "loss": 1.0222, + "step": 1029 + }, + { + "epoch": 0.4002720295346352, + "grad_norm": 
0.2170870155096054, + "learning_rate": 0.00012028026469443363, + "loss": 1.0319, + "step": 1030 + }, + { + "epoch": 0.4006606431555426, + "grad_norm": 0.20869679749011993, + "learning_rate": 0.00012020241339042428, + "loss": 1.055, + "step": 1031 + }, + { + "epoch": 0.40104925677645004, + "grad_norm": 0.18850640952587128, + "learning_rate": 0.00012012456208641496, + "loss": 0.9993, + "step": 1032 + }, + { + "epoch": 0.40143787039735745, + "grad_norm": 0.21462580561637878, + "learning_rate": 0.00012004671078240561, + "loss": 1.0115, + "step": 1033 + }, + { + "epoch": 0.4018264840182648, + "grad_norm": 0.2008499950170517, + "learning_rate": 0.00011996885947839626, + "loss": 1.0229, + "step": 1034 + }, + { + "epoch": 0.40221509763917224, + "grad_norm": 0.20063354074954987, + "learning_rate": 0.00011989100817438692, + "loss": 1.0295, + "step": 1035 + }, + { + "epoch": 0.40260371126007966, + "grad_norm": 0.20655786991119385, + "learning_rate": 0.00011981315687037757, + "loss": 1.0044, + "step": 1036 + }, + { + "epoch": 0.4029923248809871, + "grad_norm": 0.1985999196767807, + "learning_rate": 0.00011973530556636825, + "loss": 1.0063, + "step": 1037 + }, + { + "epoch": 0.4033809385018945, + "grad_norm": 0.2039060890674591, + "learning_rate": 0.0001196574542623589, + "loss": 1.044, + "step": 1038 + }, + { + "epoch": 0.4037695521228019, + "grad_norm": 0.21838189661502838, + "learning_rate": 0.00011957960295834955, + "loss": 1.1101, + "step": 1039 + }, + { + "epoch": 0.40415816574370933, + "grad_norm": 0.21508415043354034, + "learning_rate": 0.00011950175165434022, + "loss": 1.0764, + "step": 1040 + }, + { + "epoch": 0.40454677936461675, + "grad_norm": 0.2089119255542755, + "learning_rate": 0.00011942390035033087, + "loss": 0.9986, + "step": 1041 + }, + { + "epoch": 0.40493539298552417, + "grad_norm": 0.19859452545642853, + "learning_rate": 0.00011934604904632153, + "loss": 1.0122, + "step": 1042 + }, + { + "epoch": 0.40532400660643153, + "grad_norm": 0.2018653154373169, + 
"learning_rate": 0.00011926819774231219, + "loss": 1.0187, + "step": 1043 + }, + { + "epoch": 0.40571262022733895, + "grad_norm": 0.19892063736915588, + "learning_rate": 0.00011919034643830285, + "loss": 1.0029, + "step": 1044 + }, + { + "epoch": 0.40610123384824637, + "grad_norm": 0.20355650782585144, + "learning_rate": 0.0001191124951342935, + "loss": 1.0484, + "step": 1045 + }, + { + "epoch": 0.4064898474691538, + "grad_norm": 0.2033994495868683, + "learning_rate": 0.00011903464383028416, + "loss": 1.087, + "step": 1046 + }, + { + "epoch": 0.4068784610900612, + "grad_norm": 0.2047330141067505, + "learning_rate": 0.00011895679252627484, + "loss": 1.0774, + "step": 1047 + }, + { + "epoch": 0.4072670747109686, + "grad_norm": 0.21420112252235413, + "learning_rate": 0.00011887894122226549, + "loss": 1.0252, + "step": 1048 + }, + { + "epoch": 0.40765568833187604, + "grad_norm": 0.2030097395181656, + "learning_rate": 0.00011880108991825614, + "loss": 1.0501, + "step": 1049 + }, + { + "epoch": 0.40804430195278346, + "grad_norm": 0.2128026783466339, + "learning_rate": 0.00011872323861424679, + "loss": 1.1031, + "step": 1050 + }, + { + "epoch": 0.4084329155736909, + "grad_norm": 0.20724938809871674, + "learning_rate": 0.00011864538731023744, + "loss": 1.0327, + "step": 1051 + }, + { + "epoch": 0.40882152919459824, + "grad_norm": 0.20344072580337524, + "learning_rate": 0.00011856753600622812, + "loss": 1.0719, + "step": 1052 + }, + { + "epoch": 0.40921014281550566, + "grad_norm": 0.2145012468099594, + "learning_rate": 0.00011848968470221877, + "loss": 1.0582, + "step": 1053 + }, + { + "epoch": 0.4095987564364131, + "grad_norm": 0.220048725605011, + "learning_rate": 0.00011841183339820943, + "loss": 1.0825, + "step": 1054 + }, + { + "epoch": 0.4099873700573205, + "grad_norm": 0.19074465334415436, + "learning_rate": 0.00011833398209420008, + "loss": 0.9657, + "step": 1055 + }, + { + "epoch": 0.4103759836782279, + "grad_norm": 0.1958267241716385, + "learning_rate": 
0.00011825613079019073, + "loss": 0.9864, + "step": 1056 + }, + { + "epoch": 0.41076459729913534, + "grad_norm": 0.21768233180046082, + "learning_rate": 0.00011817827948618141, + "loss": 0.9997, + "step": 1057 + }, + { + "epoch": 0.41115321092004276, + "grad_norm": 0.20218704640865326, + "learning_rate": 0.00011810042818217206, + "loss": 1.072, + "step": 1058 + }, + { + "epoch": 0.4115418245409502, + "grad_norm": 0.2035023719072342, + "learning_rate": 0.00011802257687816271, + "loss": 1.0415, + "step": 1059 + }, + { + "epoch": 0.4119304381618576, + "grad_norm": 0.22603970766067505, + "learning_rate": 0.00011794472557415337, + "loss": 1.0751, + "step": 1060 + }, + { + "epoch": 0.412319051782765, + "grad_norm": 0.2125842273235321, + "learning_rate": 0.00011786687427014402, + "loss": 1.0727, + "step": 1061 + }, + { + "epoch": 0.4127076654036724, + "grad_norm": 0.2005981206893921, + "learning_rate": 0.0001177890229661347, + "loss": 1.0191, + "step": 1062 + }, + { + "epoch": 0.4130962790245798, + "grad_norm": 0.22252701222896576, + "learning_rate": 0.00011771117166212535, + "loss": 1.0591, + "step": 1063 + }, + { + "epoch": 0.4134848926454872, + "grad_norm": 0.22205251455307007, + "learning_rate": 0.000117633320358116, + "loss": 1.1198, + "step": 1064 + }, + { + "epoch": 0.41387350626639463, + "grad_norm": 0.20037783682346344, + "learning_rate": 0.00011755546905410665, + "loss": 1.0548, + "step": 1065 + }, + { + "epoch": 0.41426211988730205, + "grad_norm": 0.21737834811210632, + "learning_rate": 0.00011747761775009732, + "loss": 1.0922, + "step": 1066 + }, + { + "epoch": 0.41465073350820947, + "grad_norm": 0.19312533736228943, + "learning_rate": 0.00011739976644608798, + "loss": 0.9836, + "step": 1067 + }, + { + "epoch": 0.4150393471291169, + "grad_norm": 0.22055000066757202, + "learning_rate": 0.00011732191514207864, + "loss": 1.0383, + "step": 1068 + }, + { + "epoch": 0.4154279607500243, + "grad_norm": 0.22623857855796814, + "learning_rate": 0.0001172440638380693, + 
"loss": 1.0704, + "step": 1069 + }, + { + "epoch": 0.4158165743709317, + "grad_norm": 0.21481367945671082, + "learning_rate": 0.00011716621253405995, + "loss": 1.052, + "step": 1070 + }, + { + "epoch": 0.4162051879918391, + "grad_norm": 0.21022087335586548, + "learning_rate": 0.0001170883612300506, + "loss": 1.1021, + "step": 1071 + }, + { + "epoch": 0.4165938016127465, + "grad_norm": 0.2154620885848999, + "learning_rate": 0.00011701050992604126, + "loss": 1.0128, + "step": 1072 + }, + { + "epoch": 0.4169824152336539, + "grad_norm": 0.20545578002929688, + "learning_rate": 0.00011693265862203194, + "loss": 1.0058, + "step": 1073 + }, + { + "epoch": 0.41737102885456134, + "grad_norm": 0.21726195514202118, + "learning_rate": 0.00011685480731802259, + "loss": 1.0753, + "step": 1074 + }, + { + "epoch": 0.41775964247546876, + "grad_norm": 0.2067115604877472, + "learning_rate": 0.00011677695601401324, + "loss": 1.0594, + "step": 1075 + }, + { + "epoch": 0.4181482560963762, + "grad_norm": 0.23024648427963257, + "learning_rate": 0.0001166991047100039, + "loss": 1.1039, + "step": 1076 + }, + { + "epoch": 0.4185368697172836, + "grad_norm": 0.20692144334316254, + "learning_rate": 0.00011662125340599455, + "loss": 1.0598, + "step": 1077 + }, + { + "epoch": 0.418925483338191, + "grad_norm": 0.19839999079704285, + "learning_rate": 0.00011654340210198522, + "loss": 1.054, + "step": 1078 + }, + { + "epoch": 0.41931409695909844, + "grad_norm": 0.19227825105190277, + "learning_rate": 0.00011646555079797588, + "loss": 0.9453, + "step": 1079 + }, + { + "epoch": 0.41970271058000586, + "grad_norm": 0.2112567275762558, + "learning_rate": 0.00011638769949396653, + "loss": 1.023, + "step": 1080 + }, + { + "epoch": 0.4200913242009132, + "grad_norm": 0.185299351811409, + "learning_rate": 0.00011630984818995718, + "loss": 0.9752, + "step": 1081 + }, + { + "epoch": 0.42047993782182064, + "grad_norm": 0.20148858428001404, + "learning_rate": 0.00011623199688594783, + "loss": 1.0659, + "step": 
1082 + }, + { + "epoch": 0.42086855144272806, + "grad_norm": 0.1935974359512329, + "learning_rate": 0.00011615414558193851, + "loss": 1.0116, + "step": 1083 + }, + { + "epoch": 0.4212571650636355, + "grad_norm": 0.20433953404426575, + "learning_rate": 0.00011607629427792916, + "loss": 1.0671, + "step": 1084 + }, + { + "epoch": 0.4216457786845429, + "grad_norm": 0.20729799568653107, + "learning_rate": 0.00011599844297391982, + "loss": 1.0341, + "step": 1085 + }, + { + "epoch": 0.4220343923054503, + "grad_norm": 0.2126002460718155, + "learning_rate": 0.00011592059166991047, + "loss": 1.0188, + "step": 1086 + }, + { + "epoch": 0.42242300592635773, + "grad_norm": 0.19453707337379456, + "learning_rate": 0.00011584274036590112, + "loss": 1.0331, + "step": 1087 + }, + { + "epoch": 0.42281161954726515, + "grad_norm": 0.20909856259822845, + "learning_rate": 0.0001157648890618918, + "loss": 0.9984, + "step": 1088 + }, + { + "epoch": 0.42320023316817257, + "grad_norm": 0.19596272706985474, + "learning_rate": 0.00011568703775788245, + "loss": 1.0121, + "step": 1089 + }, + { + "epoch": 0.42358884678907993, + "grad_norm": 0.22045716643333435, + "learning_rate": 0.0001156091864538731, + "loss": 1.0591, + "step": 1090 + }, + { + "epoch": 0.42397746040998735, + "grad_norm": 0.22624897956848145, + "learning_rate": 0.00011553133514986376, + "loss": 1.0565, + "step": 1091 + }, + { + "epoch": 0.42436607403089477, + "grad_norm": 0.20263417065143585, + "learning_rate": 0.00011545348384585442, + "loss": 1.024, + "step": 1092 + }, + { + "epoch": 0.4247546876518022, + "grad_norm": 0.20179417729377747, + "learning_rate": 0.00011537563254184509, + "loss": 0.9806, + "step": 1093 + }, + { + "epoch": 0.4251433012727096, + "grad_norm": 0.30221593379974365, + "learning_rate": 0.00011529778123783574, + "loss": 1.0683, + "step": 1094 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.21195146441459656, + "learning_rate": 0.0001152199299338264, + "loss": 1.1283, + "step": 1095 + }, + { + 
"epoch": 0.42592052851452444, + "grad_norm": 0.21860192716121674, + "learning_rate": 0.00011514207862981706, + "loss": 1.0046, + "step": 1096 + }, + { + "epoch": 0.42630914213543186, + "grad_norm": 0.2234150469303131, + "learning_rate": 0.00011506422732580771, + "loss": 1.0461, + "step": 1097 + }, + { + "epoch": 0.4266977557563393, + "grad_norm": 0.21535125374794006, + "learning_rate": 0.00011498637602179837, + "loss": 1.0593, + "step": 1098 + }, + { + "epoch": 0.4270863693772467, + "grad_norm": 0.19313789904117584, + "learning_rate": 0.00011490852471778904, + "loss": 1.0357, + "step": 1099 + }, + { + "epoch": 0.42747498299815406, + "grad_norm": 0.19886989891529083, + "learning_rate": 0.00011483067341377969, + "loss": 0.9946, + "step": 1100 + }, + { + "epoch": 0.4278635966190615, + "grad_norm": 0.21028490364551544, + "learning_rate": 0.00011475282210977034, + "loss": 1.0765, + "step": 1101 + }, + { + "epoch": 0.4282522102399689, + "grad_norm": 0.2066621333360672, + "learning_rate": 0.000114674970805761, + "loss": 1.0405, + "step": 1102 + }, + { + "epoch": 0.4286408238608763, + "grad_norm": 0.18400220572948456, + "learning_rate": 0.00011459711950175168, + "loss": 0.9404, + "step": 1103 + }, + { + "epoch": 0.42902943748178374, + "grad_norm": 0.2058599591255188, + "learning_rate": 0.00011451926819774233, + "loss": 1.0505, + "step": 1104 + }, + { + "epoch": 0.42941805110269116, + "grad_norm": 0.19696786999702454, + "learning_rate": 0.00011444141689373298, + "loss": 1.032, + "step": 1105 + }, + { + "epoch": 0.4298066647235986, + "grad_norm": 0.2082854062318802, + "learning_rate": 0.00011436356558972363, + "loss": 1.0914, + "step": 1106 + }, + { + "epoch": 0.430195278344506, + "grad_norm": 0.20155015587806702, + "learning_rate": 0.00011428571428571428, + "loss": 1.0541, + "step": 1107 + }, + { + "epoch": 0.4305838919654134, + "grad_norm": 0.23419982194900513, + "learning_rate": 0.00011420786298170494, + "loss": 1.0684, + "step": 1108 + }, + { + "epoch": 
0.4309725055863208, + "grad_norm": 0.23493975400924683, + "learning_rate": 0.00011413001167769561, + "loss": 1.0509, + "step": 1109 + }, + { + "epoch": 0.4313611192072282, + "grad_norm": 0.2089843600988388, + "learning_rate": 0.00011405216037368627, + "loss": 1.0479, + "step": 1110 + }, + { + "epoch": 0.4317497328281356, + "grad_norm": 0.21076850593090057, + "learning_rate": 0.00011397430906967692, + "loss": 1.064, + "step": 1111 + }, + { + "epoch": 0.43213834644904303, + "grad_norm": 0.20307987928390503, + "learning_rate": 0.00011389645776566757, + "loss": 1.0416, + "step": 1112 + }, + { + "epoch": 0.43252696006995045, + "grad_norm": 0.20955562591552734, + "learning_rate": 0.00011381860646165822, + "loss": 1.0158, + "step": 1113 + }, + { + "epoch": 0.43291557369085787, + "grad_norm": 0.2074531465768814, + "learning_rate": 0.0001137407551576489, + "loss": 1.0486, + "step": 1114 + }, + { + "epoch": 0.4333041873117653, + "grad_norm": 0.20907235145568848, + "learning_rate": 0.00011366290385363955, + "loss": 1.0352, + "step": 1115 + }, + { + "epoch": 0.4336928009326727, + "grad_norm": 0.21726477146148682, + "learning_rate": 0.0001135850525496302, + "loss": 1.0068, + "step": 1116 + }, + { + "epoch": 0.4340814145535801, + "grad_norm": 0.20231984555721283, + "learning_rate": 0.00011350720124562086, + "loss": 0.9757, + "step": 1117 + }, + { + "epoch": 0.4344700281744875, + "grad_norm": 0.23485834896564484, + "learning_rate": 0.00011342934994161152, + "loss": 1.0681, + "step": 1118 + }, + { + "epoch": 0.4348586417953949, + "grad_norm": 0.21286556124687195, + "learning_rate": 0.00011335149863760219, + "loss": 1.0399, + "step": 1119 + }, + { + "epoch": 0.4352472554163023, + "grad_norm": 0.2097872495651245, + "learning_rate": 0.00011327364733359284, + "loss": 1.0435, + "step": 1120 + }, + { + "epoch": 0.43563586903720974, + "grad_norm": 0.2224377542734146, + "learning_rate": 0.00011319579602958351, + "loss": 1.1664, + "step": 1121 + }, + { + "epoch": 0.43602448265811716, + 
"grad_norm": 0.19213411211967468, + "learning_rate": 0.00011311794472557416, + "loss": 1.0424, + "step": 1122 + }, + { + "epoch": 0.4364130962790246, + "grad_norm": 0.20974959433078766, + "learning_rate": 0.00011304009342156481, + "loss": 1.0943, + "step": 1123 + }, + { + "epoch": 0.436801709899932, + "grad_norm": 0.19943708181381226, + "learning_rate": 0.00011296224211755549, + "loss": 1.0652, + "step": 1124 + }, + { + "epoch": 0.4371903235208394, + "grad_norm": 0.1832750141620636, + "learning_rate": 0.00011288439081354614, + "loss": 0.9883, + "step": 1125 + }, + { + "epoch": 0.43757893714174684, + "grad_norm": 0.2205052226781845, + "learning_rate": 0.0001128065395095368, + "loss": 1.0733, + "step": 1126 + }, + { + "epoch": 0.43796755076265426, + "grad_norm": 0.2082854062318802, + "learning_rate": 0.00011272868820552745, + "loss": 1.0141, + "step": 1127 + }, + { + "epoch": 0.4383561643835616, + "grad_norm": 0.22755026817321777, + "learning_rate": 0.0001126508369015181, + "loss": 1.0942, + "step": 1128 + }, + { + "epoch": 0.43874477800446904, + "grad_norm": 0.2098863571882248, + "learning_rate": 0.00011257298559750878, + "loss": 0.9987, + "step": 1129 + }, + { + "epoch": 0.43913339162537646, + "grad_norm": 0.20559263229370117, + "learning_rate": 0.00011249513429349943, + "loss": 1.0345, + "step": 1130 + }, + { + "epoch": 0.4395220052462839, + "grad_norm": 0.21955084800720215, + "learning_rate": 0.00011241728298949008, + "loss": 1.1068, + "step": 1131 + }, + { + "epoch": 0.4399106188671913, + "grad_norm": 0.21353478729724884, + "learning_rate": 0.00011233943168548073, + "loss": 1.0094, + "step": 1132 + }, + { + "epoch": 0.4402992324880987, + "grad_norm": 0.19822491705417633, + "learning_rate": 0.00011226158038147139, + "loss": 0.9758, + "step": 1133 + }, + { + "epoch": 0.44068784610900613, + "grad_norm": 0.20079441368579865, + "learning_rate": 0.00011218372907746206, + "loss": 1.0202, + "step": 1134 + }, + { + "epoch": 0.44107645972991355, + "grad_norm": 
0.2261926829814911, + "learning_rate": 0.00011210587777345272, + "loss": 0.9877, + "step": 1135 + }, + { + "epoch": 0.44146507335082097, + "grad_norm": 0.2264915257692337, + "learning_rate": 0.00011202802646944337, + "loss": 0.9887, + "step": 1136 + }, + { + "epoch": 0.44185368697172833, + "grad_norm": 0.21853779256343842, + "learning_rate": 0.00011195017516543402, + "loss": 1.0535, + "step": 1137 + }, + { + "epoch": 0.44224230059263575, + "grad_norm": 0.21332694590091705, + "learning_rate": 0.00011187232386142467, + "loss": 1.0824, + "step": 1138 + }, + { + "epoch": 0.44263091421354317, + "grad_norm": 0.21350236237049103, + "learning_rate": 0.00011179447255741535, + "loss": 1.0758, + "step": 1139 + }, + { + "epoch": 0.4430195278344506, + "grad_norm": 0.21305765211582184, + "learning_rate": 0.000111716621253406, + "loss": 1.035, + "step": 1140 + }, + { + "epoch": 0.443408141455358, + "grad_norm": 0.20486389100551605, + "learning_rate": 0.00011163876994939666, + "loss": 1.0413, + "step": 1141 + }, + { + "epoch": 0.4437967550762654, + "grad_norm": 0.19255472719669342, + "learning_rate": 0.00011156091864538731, + "loss": 0.9583, + "step": 1142 + }, + { + "epoch": 0.44418536869717284, + "grad_norm": 0.19824008643627167, + "learning_rate": 0.00011148306734137796, + "loss": 1.0331, + "step": 1143 + }, + { + "epoch": 0.44457398231808026, + "grad_norm": 0.20308080315589905, + "learning_rate": 0.00011140521603736863, + "loss": 1.0399, + "step": 1144 + }, + { + "epoch": 0.4449625959389877, + "grad_norm": 0.2193964123725891, + "learning_rate": 0.00011132736473335929, + "loss": 1.063, + "step": 1145 + }, + { + "epoch": 0.4453512095598951, + "grad_norm": 0.2151576578617096, + "learning_rate": 0.00011124951342934994, + "loss": 1.0795, + "step": 1146 + }, + { + "epoch": 0.44573982318080246, + "grad_norm": 0.23056697845458984, + "learning_rate": 0.00011117166212534061, + "loss": 1.0351, + "step": 1147 + }, + { + "epoch": 0.4461284368017099, + "grad_norm": 0.1973094493150711, + 
"learning_rate": 0.00011109381082133126, + "loss": 0.9866, + "step": 1148 + }, + { + "epoch": 0.4465170504226173, + "grad_norm": 0.2119562178850174, + "learning_rate": 0.00011101595951732191, + "loss": 1.0591, + "step": 1149 + }, + { + "epoch": 0.4469056640435247, + "grad_norm": 0.20407763123512268, + "learning_rate": 0.00011093810821331259, + "loss": 0.988, + "step": 1150 + }, + { + "epoch": 0.44729427766443214, + "grad_norm": 0.19474107027053833, + "learning_rate": 0.00011086025690930324, + "loss": 0.9729, + "step": 1151 + }, + { + "epoch": 0.44768289128533956, + "grad_norm": 0.2179928421974182, + "learning_rate": 0.0001107824056052939, + "loss": 1.0558, + "step": 1152 + }, + { + "epoch": 0.448071504906247, + "grad_norm": 0.44306451082229614, + "learning_rate": 0.00011070455430128455, + "loss": 1.0901, + "step": 1153 + }, + { + "epoch": 0.4484601185271544, + "grad_norm": 0.22060540318489075, + "learning_rate": 0.0001106267029972752, + "loss": 1.0009, + "step": 1154 + }, + { + "epoch": 0.4488487321480618, + "grad_norm": 0.20534972846508026, + "learning_rate": 0.00011054885169326588, + "loss": 0.9741, + "step": 1155 + }, + { + "epoch": 0.4492373457689692, + "grad_norm": 0.19488993287086487, + "learning_rate": 0.00011047100038925653, + "loss": 1.0, + "step": 1156 + }, + { + "epoch": 0.4496259593898766, + "grad_norm": 0.20462395250797272, + "learning_rate": 0.00011039314908524718, + "loss": 1.0309, + "step": 1157 + }, + { + "epoch": 0.450014573010784, + "grad_norm": 0.2170749306678772, + "learning_rate": 0.00011031529778123784, + "loss": 1.0726, + "step": 1158 + }, + { + "epoch": 0.45040318663169143, + "grad_norm": 0.2066730111837387, + "learning_rate": 0.00011023744647722849, + "loss": 1.0227, + "step": 1159 + }, + { + "epoch": 0.45079180025259885, + "grad_norm": 0.20625676214694977, + "learning_rate": 0.00011015959517321917, + "loss": 1.0287, + "step": 1160 + }, + { + "epoch": 0.45118041387350627, + "grad_norm": 0.19483047723770142, + "learning_rate": 
0.00011008174386920982, + "loss": 0.9639, + "step": 1161 + }, + { + "epoch": 0.4515690274944137, + "grad_norm": 0.24705417454242706, + "learning_rate": 0.00011000389256520047, + "loss": 0.9903, + "step": 1162 + }, + { + "epoch": 0.4519576411153211, + "grad_norm": 0.2109205424785614, + "learning_rate": 0.00010992604126119112, + "loss": 1.054, + "step": 1163 + }, + { + "epoch": 0.4523462547362285, + "grad_norm": 0.20904991030693054, + "learning_rate": 0.00010984818995718178, + "loss": 1.0416, + "step": 1164 + }, + { + "epoch": 0.45273486835713594, + "grad_norm": 0.19841328263282776, + "learning_rate": 0.00010977033865317245, + "loss": 0.9986, + "step": 1165 + }, + { + "epoch": 0.4531234819780433, + "grad_norm": 0.20545506477355957, + "learning_rate": 0.0001096924873491631, + "loss": 1.0337, + "step": 1166 + }, + { + "epoch": 0.4535120955989507, + "grad_norm": 0.208644837141037, + "learning_rate": 0.00010961463604515376, + "loss": 1.0304, + "step": 1167 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 0.2111911028623581, + "learning_rate": 0.00010953678474114441, + "loss": 1.0398, + "step": 1168 + }, + { + "epoch": 0.45428932284076556, + "grad_norm": 0.2600184381008148, + "learning_rate": 0.00010945893343713506, + "loss": 1.0509, + "step": 1169 + }, + { + "epoch": 0.454677936461673, + "grad_norm": 0.2059030532836914, + "learning_rate": 0.00010938108213312574, + "loss": 0.9347, + "step": 1170 + }, + { + "epoch": 0.4550665500825804, + "grad_norm": 0.19232551753520966, + "learning_rate": 0.0001093032308291164, + "loss": 1.0162, + "step": 1171 + }, + { + "epoch": 0.4554551637034878, + "grad_norm": 0.19147330522537231, + "learning_rate": 0.00010922537952510705, + "loss": 0.9872, + "step": 1172 + }, + { + "epoch": 0.45584377732439524, + "grad_norm": 0.2599676251411438, + "learning_rate": 0.00010914752822109771, + "loss": 1.0402, + "step": 1173 + }, + { + "epoch": 0.45623239094530266, + "grad_norm": 0.2159397304058075, + "learning_rate": 0.00010906967691708836, + 
"loss": 1.0411, + "step": 1174 + }, + { + "epoch": 0.45662100456621, + "grad_norm": 0.23864266276359558, + "learning_rate": 0.00010899182561307903, + "loss": 1.054, + "step": 1175 + }, + { + "epoch": 0.45700961818711744, + "grad_norm": 0.2027217596769333, + "learning_rate": 0.0001089139743090697, + "loss": 0.9713, + "step": 1176 + }, + { + "epoch": 0.45739823180802486, + "grad_norm": 0.1837588995695114, + "learning_rate": 0.00010883612300506035, + "loss": 0.9698, + "step": 1177 + }, + { + "epoch": 0.4577868454289323, + "grad_norm": 0.20038527250289917, + "learning_rate": 0.000108758271701051, + "loss": 1.0456, + "step": 1178 + }, + { + "epoch": 0.4581754590498397, + "grad_norm": 0.21525044739246368, + "learning_rate": 0.00010868042039704165, + "loss": 1.021, + "step": 1179 + }, + { + "epoch": 0.4585640726707471, + "grad_norm": 0.18813730776309967, + "learning_rate": 0.0001086025690930323, + "loss": 0.9673, + "step": 1180 + }, + { + "epoch": 0.45895268629165453, + "grad_norm": 0.2056179642677307, + "learning_rate": 0.00010852471778902298, + "loss": 1.0119, + "step": 1181 + }, + { + "epoch": 0.45934129991256195, + "grad_norm": 0.21599683165550232, + "learning_rate": 0.00010844686648501363, + "loss": 1.0537, + "step": 1182 + }, + { + "epoch": 0.45972991353346937, + "grad_norm": 0.19750265777111053, + "learning_rate": 0.00010836901518100429, + "loss": 1.0203, + "step": 1183 + }, + { + "epoch": 0.4601185271543768, + "grad_norm": 0.22186161577701569, + "learning_rate": 0.00010829116387699494, + "loss": 1.0583, + "step": 1184 + }, + { + "epoch": 0.46050714077528415, + "grad_norm": 0.2109905481338501, + "learning_rate": 0.00010821331257298559, + "loss": 1.0022, + "step": 1185 + }, + { + "epoch": 0.46089575439619157, + "grad_norm": 0.2032858431339264, + "learning_rate": 0.00010813546126897627, + "loss": 0.9774, + "step": 1186 + }, + { + "epoch": 0.461284368017099, + "grad_norm": 0.20381197333335876, + "learning_rate": 0.00010805760996496692, + "loss": 0.9768, + "step": 1187 
+ }, + { + "epoch": 0.4616729816380064, + "grad_norm": 0.20488987863063812, + "learning_rate": 0.00010797975866095757, + "loss": 1.0448, + "step": 1188 + }, + { + "epoch": 0.4620615952589138, + "grad_norm": 0.20257477462291718, + "learning_rate": 0.00010790190735694823, + "loss": 1.0157, + "step": 1189 + }, + { + "epoch": 0.46245020887982125, + "grad_norm": 0.20761239528656006, + "learning_rate": 0.00010782405605293888, + "loss": 1.0328, + "step": 1190 + }, + { + "epoch": 0.46283882250072866, + "grad_norm": 0.22062581777572632, + "learning_rate": 0.00010774620474892956, + "loss": 1.0362, + "step": 1191 + }, + { + "epoch": 0.4632274361216361, + "grad_norm": 0.19970272481441498, + "learning_rate": 0.00010766835344492021, + "loss": 1.0783, + "step": 1192 + }, + { + "epoch": 0.4636160497425435, + "grad_norm": 0.2221893072128296, + "learning_rate": 0.00010759050214091086, + "loss": 1.0136, + "step": 1193 + }, + { + "epoch": 0.46400466336345086, + "grad_norm": 0.2124665081501007, + "learning_rate": 0.00010751265083690151, + "loss": 1.0528, + "step": 1194 + }, + { + "epoch": 0.4643932769843583, + "grad_norm": 0.2001204937696457, + "learning_rate": 0.00010743479953289218, + "loss": 1.0495, + "step": 1195 + }, + { + "epoch": 0.4647818906052657, + "grad_norm": 0.20979635417461395, + "learning_rate": 0.00010735694822888284, + "loss": 1.0664, + "step": 1196 + }, + { + "epoch": 0.4651705042261731, + "grad_norm": 0.190982848405838, + "learning_rate": 0.0001072790969248735, + "loss": 1.0256, + "step": 1197 + }, + { + "epoch": 0.46555911784708054, + "grad_norm": 0.19910745322704315, + "learning_rate": 0.00010720124562086415, + "loss": 1.0263, + "step": 1198 + }, + { + "epoch": 0.46594773146798796, + "grad_norm": 0.21624085307121277, + "learning_rate": 0.00010712339431685481, + "loss": 1.0768, + "step": 1199 + }, + { + "epoch": 0.4663363450888954, + "grad_norm": 0.20857703685760498, + "learning_rate": 0.00010704554301284547, + "loss": 1.0892, + "step": 1200 + }, + { + "epoch": 
0.4667249587098028, + "grad_norm": 0.21897061169147491, + "learning_rate": 0.00010696769170883613, + "loss": 1.0873, + "step": 1201 + }, + { + "epoch": 0.4671135723307102, + "grad_norm": 0.1943386346101761, + "learning_rate": 0.0001068898404048268, + "loss": 1.0116, + "step": 1202 + }, + { + "epoch": 0.4675021859516176, + "grad_norm": 0.22607874870300293, + "learning_rate": 0.00010681198910081745, + "loss": 1.0328, + "step": 1203 + }, + { + "epoch": 0.467890799572525, + "grad_norm": 0.1898999959230423, + "learning_rate": 0.0001067341377968081, + "loss": 0.9791, + "step": 1204 + }, + { + "epoch": 0.4682794131934324, + "grad_norm": 0.2193334400653839, + "learning_rate": 0.00010665628649279875, + "loss": 1.0742, + "step": 1205 + }, + { + "epoch": 0.46866802681433983, + "grad_norm": 0.2096349149942398, + "learning_rate": 0.00010657843518878943, + "loss": 1.0683, + "step": 1206 + }, + { + "epoch": 0.46905664043524725, + "grad_norm": 0.2040576934814453, + "learning_rate": 0.00010650058388478008, + "loss": 1.0516, + "step": 1207 + }, + { + "epoch": 0.46944525405615467, + "grad_norm": 0.20619645714759827, + "learning_rate": 0.00010642273258077074, + "loss": 1.0429, + "step": 1208 + }, + { + "epoch": 0.4698338676770621, + "grad_norm": 0.19753660261631012, + "learning_rate": 0.00010634488127676139, + "loss": 1.0268, + "step": 1209 + }, + { + "epoch": 0.4702224812979695, + "grad_norm": 0.2201426476240158, + "learning_rate": 0.00010626702997275204, + "loss": 1.0879, + "step": 1210 + }, + { + "epoch": 0.4706110949188769, + "grad_norm": 0.21307805180549622, + "learning_rate": 0.00010618917866874272, + "loss": 1.0186, + "step": 1211 + }, + { + "epoch": 0.47099970853978435, + "grad_norm": 0.21142373979091644, + "learning_rate": 0.00010611132736473337, + "loss": 1.0417, + "step": 1212 + }, + { + "epoch": 0.4713883221606917, + "grad_norm": 0.20523706078529358, + "learning_rate": 0.00010603347606072402, + "loss": 1.0372, + "step": 1213 + }, + { + "epoch": 0.4717769357815991, + 
"grad_norm": 0.19843094050884247, + "learning_rate": 0.00010595562475671468, + "loss": 1.0062, + "step": 1214 + }, + { + "epoch": 0.47216554940250655, + "grad_norm": 0.2146739959716797, + "learning_rate": 0.00010587777345270533, + "loss": 1.0528, + "step": 1215 + }, + { + "epoch": 0.47255416302341396, + "grad_norm": 0.2136303037405014, + "learning_rate": 0.00010579992214869601, + "loss": 1.0521, + "step": 1216 + }, + { + "epoch": 0.4729427766443214, + "grad_norm": 0.21379397809505463, + "learning_rate": 0.00010572207084468666, + "loss": 1.0362, + "step": 1217 + }, + { + "epoch": 0.4733313902652288, + "grad_norm": 0.20459088683128357, + "learning_rate": 0.00010564421954067731, + "loss": 1.0455, + "step": 1218 + }, + { + "epoch": 0.4737200038861362, + "grad_norm": 0.20667988061904907, + "learning_rate": 0.00010556636823666796, + "loss": 1.0284, + "step": 1219 + }, + { + "epoch": 0.47410861750704364, + "grad_norm": 0.21820449829101562, + "learning_rate": 0.00010548851693265862, + "loss": 1.0584, + "step": 1220 + }, + { + "epoch": 0.47449723112795106, + "grad_norm": 0.19705156981945038, + "learning_rate": 0.00010541066562864928, + "loss": 1.004, + "step": 1221 + }, + { + "epoch": 0.4748858447488584, + "grad_norm": 0.19806528091430664, + "learning_rate": 0.00010533281432463995, + "loss": 1.0519, + "step": 1222 + }, + { + "epoch": 0.47527445836976584, + "grad_norm": 0.2006833702325821, + "learning_rate": 0.0001052549630206306, + "loss": 1.0119, + "step": 1223 + }, + { + "epoch": 0.47566307199067326, + "grad_norm": 0.21757058799266815, + "learning_rate": 0.00010517711171662125, + "loss": 1.0961, + "step": 1224 + }, + { + "epoch": 0.4760516856115807, + "grad_norm": 0.2015775889158249, + "learning_rate": 0.00010509926041261192, + "loss": 1.0419, + "step": 1225 + }, + { + "epoch": 0.4764402992324881, + "grad_norm": 0.19691923260688782, + "learning_rate": 0.00010502140910860257, + "loss": 1.0555, + "step": 1226 + }, + { + "epoch": 0.4768289128533955, + "grad_norm": 
0.19924800097942352, + "learning_rate": 0.00010494355780459323, + "loss": 1.0106, + "step": 1227 + }, + { + "epoch": 0.47721752647430293, + "grad_norm": 0.21416346728801727, + "learning_rate": 0.0001048657065005839, + "loss": 1.0741, + "step": 1228 + }, + { + "epoch": 0.47760614009521035, + "grad_norm": 0.21823547780513763, + "learning_rate": 0.00010478785519657455, + "loss": 1.023, + "step": 1229 + }, + { + "epoch": 0.47799475371611777, + "grad_norm": 0.2083735466003418, + "learning_rate": 0.0001047100038925652, + "loss": 1.0424, + "step": 1230 + }, + { + "epoch": 0.4783833673370252, + "grad_norm": 0.2219141572713852, + "learning_rate": 0.00010463215258855586, + "loss": 1.0839, + "step": 1231 + }, + { + "epoch": 0.47877198095793255, + "grad_norm": 0.21334600448608398, + "learning_rate": 0.00010455430128454653, + "loss": 0.9888, + "step": 1232 + }, + { + "epoch": 0.47916059457883997, + "grad_norm": 0.2140086442232132, + "learning_rate": 0.00010447644998053719, + "loss": 1.0119, + "step": 1233 + }, + { + "epoch": 0.4795492081997474, + "grad_norm": 0.25360551476478577, + "learning_rate": 0.00010439859867652784, + "loss": 1.0026, + "step": 1234 + }, + { + "epoch": 0.4799378218206548, + "grad_norm": 0.20200380682945251, + "learning_rate": 0.00010432074737251849, + "loss": 1.0, + "step": 1235 + }, + { + "epoch": 0.4803264354415622, + "grad_norm": 0.22641289234161377, + "learning_rate": 0.00010424289606850914, + "loss": 1.1022, + "step": 1236 + }, + { + "epoch": 0.48071504906246965, + "grad_norm": 0.20538561046123505, + "learning_rate": 0.00010416504476449982, + "loss": 0.9847, + "step": 1237 + }, + { + "epoch": 0.48110366268337706, + "grad_norm": 0.206883504986763, + "learning_rate": 0.00010408719346049047, + "loss": 1.0152, + "step": 1238 + }, + { + "epoch": 0.4814922763042845, + "grad_norm": 0.21584320068359375, + "learning_rate": 0.00010400934215648113, + "loss": 1.0361, + "step": 1239 + }, + { + "epoch": 0.4818808899251919, + "grad_norm": 0.20963703095912933, + 
"learning_rate": 0.00010393149085247178, + "loss": 1.0814, + "step": 1240 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 0.1965872198343277, + "learning_rate": 0.00010385363954846243, + "loss": 1.0365, + "step": 1241 + }, + { + "epoch": 0.4826581171670067, + "grad_norm": 0.2030191719532013, + "learning_rate": 0.00010377578824445311, + "loss": 1.0374, + "step": 1242 + }, + { + "epoch": 0.4830467307879141, + "grad_norm": 0.21448804438114166, + "learning_rate": 0.00010369793694044376, + "loss": 0.9686, + "step": 1243 + }, + { + "epoch": 0.4834353444088215, + "grad_norm": 0.2181752622127533, + "learning_rate": 0.00010362008563643441, + "loss": 1.0812, + "step": 1244 + }, + { + "epoch": 0.48382395802972894, + "grad_norm": 0.19887101650238037, + "learning_rate": 0.00010354223433242507, + "loss": 1.036, + "step": 1245 + }, + { + "epoch": 0.48421257165063636, + "grad_norm": 0.19007287919521332, + "learning_rate": 0.00010346438302841572, + "loss": 1.0292, + "step": 1246 + }, + { + "epoch": 0.4846011852715438, + "grad_norm": 0.21390347182750702, + "learning_rate": 0.0001033865317244064, + "loss": 1.0284, + "step": 1247 + }, + { + "epoch": 0.4849897988924512, + "grad_norm": 0.23822663724422455, + "learning_rate": 0.00010330868042039705, + "loss": 1.1044, + "step": 1248 + }, + { + "epoch": 0.4853784125133586, + "grad_norm": 0.20779070258140564, + "learning_rate": 0.0001032308291163877, + "loss": 1.0475, + "step": 1249 + }, + { + "epoch": 0.48576702613426603, + "grad_norm": 0.19232134521007538, + "learning_rate": 0.00010315297781237835, + "loss": 0.9945, + "step": 1250 + }, + { + "epoch": 0.4861556397551734, + "grad_norm": 0.22378556430339813, + "learning_rate": 0.00010307512650836902, + "loss": 1.0462, + "step": 1251 + }, + { + "epoch": 0.4865442533760808, + "grad_norm": 0.22156798839569092, + "learning_rate": 0.00010299727520435968, + "loss": 1.051, + "step": 1252 + }, + { + "epoch": 0.48693286699698823, + "grad_norm": 0.19885733723640442, + "learning_rate": 
0.00010291942390035034, + "loss": 1.0593, + "step": 1253 + }, + { + "epoch": 0.48732148061789565, + "grad_norm": 0.2172418236732483, + "learning_rate": 0.000102841572596341, + "loss": 1.0513, + "step": 1254 + }, + { + "epoch": 0.48771009423880307, + "grad_norm": 0.22136956453323364, + "learning_rate": 0.00010276372129233165, + "loss": 1.0438, + "step": 1255 + }, + { + "epoch": 0.4880987078597105, + "grad_norm": 0.21337302029132843, + "learning_rate": 0.0001026858699883223, + "loss": 1.0551, + "step": 1256 + }, + { + "epoch": 0.4884873214806179, + "grad_norm": 0.21376267075538635, + "learning_rate": 0.00010260801868431296, + "loss": 1.054, + "step": 1257 + }, + { + "epoch": 0.4888759351015253, + "grad_norm": 0.19498860836029053, + "learning_rate": 0.00010253016738030364, + "loss": 1.0045, + "step": 1258 + }, + { + "epoch": 0.48926454872243275, + "grad_norm": 0.22354961931705475, + "learning_rate": 0.00010245231607629429, + "loss": 1.096, + "step": 1259 + }, + { + "epoch": 0.4896531623433401, + "grad_norm": 0.2078939527273178, + "learning_rate": 0.00010237446477228494, + "loss": 1.0102, + "step": 1260 + }, + { + "epoch": 0.49004177596424753, + "grad_norm": 0.20992495119571686, + "learning_rate": 0.00010229661346827559, + "loss": 0.9814, + "step": 1261 + }, + { + "epoch": 0.49043038958515495, + "grad_norm": 0.2178875207901001, + "learning_rate": 0.00010221876216426625, + "loss": 1.0489, + "step": 1262 + }, + { + "epoch": 0.49081900320606237, + "grad_norm": 0.22152946889400482, + "learning_rate": 0.00010214091086025692, + "loss": 1.0808, + "step": 1263 + }, + { + "epoch": 0.4912076168269698, + "grad_norm": 0.21179009974002838, + "learning_rate": 0.00010206305955624758, + "loss": 1.0323, + "step": 1264 + }, + { + "epoch": 0.4915962304478772, + "grad_norm": 0.2126997411251068, + "learning_rate": 0.00010198520825223823, + "loss": 1.0093, + "step": 1265 + }, + { + "epoch": 0.4919848440687846, + "grad_norm": 0.20912809669971466, + "learning_rate": 0.00010190735694822888, + 
"loss": 1.0343, + "step": 1266 + }, + { + "epoch": 0.49237345768969204, + "grad_norm": 0.2231636494398117, + "learning_rate": 0.00010182950564421953, + "loss": 1.0587, + "step": 1267 + }, + { + "epoch": 0.49276207131059946, + "grad_norm": 0.1954583376646042, + "learning_rate": 0.00010175165434021021, + "loss": 0.9566, + "step": 1268 + }, + { + "epoch": 0.4931506849315068, + "grad_norm": 0.20520909130573273, + "learning_rate": 0.00010167380303620086, + "loss": 1.024, + "step": 1269 + }, + { + "epoch": 0.49353929855241424, + "grad_norm": 0.21736180782318115, + "learning_rate": 0.00010159595173219152, + "loss": 1.0434, + "step": 1270 + }, + { + "epoch": 0.49392791217332166, + "grad_norm": 0.2360561490058899, + "learning_rate": 0.00010151810042818217, + "loss": 1.114, + "step": 1271 + }, + { + "epoch": 0.4943165257942291, + "grad_norm": 0.20595967769622803, + "learning_rate": 0.00010144024912417282, + "loss": 0.9909, + "step": 1272 + }, + { + "epoch": 0.4947051394151365, + "grad_norm": 0.2161860466003418, + "learning_rate": 0.0001013623978201635, + "loss": 1.0536, + "step": 1273 + }, + { + "epoch": 0.4950937530360439, + "grad_norm": 0.19852355122566223, + "learning_rate": 0.00010128454651615415, + "loss": 1.0001, + "step": 1274 + }, + { + "epoch": 0.49548236665695133, + "grad_norm": 0.21081402897834778, + "learning_rate": 0.0001012066952121448, + "loss": 1.0151, + "step": 1275 + }, + { + "epoch": 0.49587098027785875, + "grad_norm": 0.2053362876176834, + "learning_rate": 0.00010112884390813547, + "loss": 1.018, + "step": 1276 + }, + { + "epoch": 0.49625959389876617, + "grad_norm": 0.21205593645572662, + "learning_rate": 0.00010105099260412612, + "loss": 0.9912, + "step": 1277 + }, + { + "epoch": 0.4966482075196736, + "grad_norm": 0.2005016952753067, + "learning_rate": 0.00010097314130011679, + "loss": 1.0069, + "step": 1278 + }, + { + "epoch": 0.49703682114058095, + "grad_norm": 0.21688181161880493, + "learning_rate": 0.00010089528999610744, + "loss": 1.0364, + "step": 
1279 + }, + { + "epoch": 0.49742543476148837, + "grad_norm": 0.20582237839698792, + "learning_rate": 0.0001008174386920981, + "loss": 1.0138, + "step": 1280 + }, + { + "epoch": 0.4978140483823958, + "grad_norm": 0.20824448764324188, + "learning_rate": 0.00010073958738808876, + "loss": 0.9941, + "step": 1281 + }, + { + "epoch": 0.4982026620033032, + "grad_norm": 0.20749075710773468, + "learning_rate": 0.00010066173608407941, + "loss": 1.0478, + "step": 1282 + }, + { + "epoch": 0.49859127562421063, + "grad_norm": 0.20012183487415314, + "learning_rate": 0.00010058388478007009, + "loss": 0.995, + "step": 1283 + }, + { + "epoch": 0.49897988924511805, + "grad_norm": 0.20275959372520447, + "learning_rate": 0.00010050603347606074, + "loss": 1.097, + "step": 1284 + }, + { + "epoch": 0.49936850286602547, + "grad_norm": 0.19588243961334229, + "learning_rate": 0.00010042818217205139, + "loss": 1.0, + "step": 1285 + }, + { + "epoch": 0.4997571164869329, + "grad_norm": 0.20693185925483704, + "learning_rate": 0.00010035033086804204, + "loss": 1.0527, + "step": 1286 + }, + { + "epoch": 0.5001457301078402, + "grad_norm": 0.20330573618412018, + "learning_rate": 0.0001002724795640327, + "loss": 1.0137, + "step": 1287 + }, + { + "epoch": 0.5005343437287477, + "grad_norm": 0.19123876094818115, + "learning_rate": 0.00010019462826002337, + "loss": 0.9688, + "step": 1288 + }, + { + "epoch": 0.5009229573496551, + "grad_norm": 0.2184276431798935, + "learning_rate": 0.00010011677695601403, + "loss": 1.0367, + "step": 1289 + }, + { + "epoch": 0.5013115709705626, + "grad_norm": 0.21642108261585236, + "learning_rate": 0.00010003892565200468, + "loss": 1.102, + "step": 1290 + }, + { + "epoch": 0.5017001845914699, + "grad_norm": 0.20351074635982513, + "learning_rate": 9.996107434799533e-05, + "loss": 1.0327, + "step": 1291 + }, + { + "epoch": 0.5020887982123774, + "grad_norm": 0.22771553695201874, + "learning_rate": 9.9883223043986e-05, + "loss": 1.104, + "step": 1292 + }, + { + "epoch": 
0.5024774118332848, + "grad_norm": 0.2271403968334198, + "learning_rate": 9.980537173997665e-05, + "loss": 1.1313, + "step": 1293 + }, + { + "epoch": 0.5028660254541921, + "grad_norm": 0.2157830148935318, + "learning_rate": 9.97275204359673e-05, + "loss": 1.0203, + "step": 1294 + }, + { + "epoch": 0.5032546390750996, + "grad_norm": 0.19555307924747467, + "learning_rate": 9.964966913195797e-05, + "loss": 1.0194, + "step": 1295 + }, + { + "epoch": 0.503643252696007, + "grad_norm": 0.1898549199104309, + "learning_rate": 9.957181782794862e-05, + "loss": 1.0034, + "step": 1296 + }, + { + "epoch": 0.5040318663169144, + "grad_norm": 0.23555906116962433, + "learning_rate": 9.949396652393928e-05, + "loss": 1.0298, + "step": 1297 + }, + { + "epoch": 0.5044204799378218, + "grad_norm": 0.20434850454330444, + "learning_rate": 9.941611521992994e-05, + "loss": 0.9999, + "step": 1298 + }, + { + "epoch": 0.5048090935587293, + "grad_norm": 0.21015289425849915, + "learning_rate": 9.933826391592059e-05, + "loss": 1.006, + "step": 1299 + }, + { + "epoch": 0.5051977071796366, + "grad_norm": 0.21147851645946503, + "learning_rate": 9.926041261191125e-05, + "loss": 1.0854, + "step": 1300 + }, + { + "epoch": 0.5055863208005441, + "grad_norm": 0.19666944444179535, + "learning_rate": 9.91825613079019e-05, + "loss": 1.0057, + "step": 1301 + }, + { + "epoch": 0.5059749344214515, + "grad_norm": 0.21233728528022766, + "learning_rate": 9.910471000389257e-05, + "loss": 1.0675, + "step": 1302 + }, + { + "epoch": 0.5063635480423588, + "grad_norm": 0.21905581653118134, + "learning_rate": 9.902685869988322e-05, + "loss": 1.0054, + "step": 1303 + }, + { + "epoch": 0.5067521616632663, + "grad_norm": 0.23434993624687195, + "learning_rate": 9.894900739587389e-05, + "loss": 0.9915, + "step": 1304 + }, + { + "epoch": 0.5071407752841737, + "grad_norm": 0.21684227883815765, + "learning_rate": 9.887115609186454e-05, + "loss": 1.1131, + "step": 1305 + }, + { + "epoch": 0.5075293889050811, + "grad_norm": 
0.21699552237987518, + "learning_rate": 9.87933047878552e-05, + "loss": 1.0782, + "step": 1306 + }, + { + "epoch": 0.5079180025259885, + "grad_norm": 0.2218221127986908, + "learning_rate": 9.871545348384586e-05, + "loss": 1.0388, + "step": 1307 + }, + { + "epoch": 0.508306616146896, + "grad_norm": 0.20104359090328217, + "learning_rate": 9.863760217983652e-05, + "loss": 1.0336, + "step": 1308 + }, + { + "epoch": 0.5086952297678033, + "grad_norm": 0.21907050907611847, + "learning_rate": 9.855975087582718e-05, + "loss": 1.0587, + "step": 1309 + }, + { + "epoch": 0.5090838433887108, + "grad_norm": 0.2140391767024994, + "learning_rate": 9.848189957181784e-05, + "loss": 1.0351, + "step": 1310 + }, + { + "epoch": 0.5094724570096182, + "grad_norm": 0.33287563920021057, + "learning_rate": 9.84040482678085e-05, + "loss": 0.9908, + "step": 1311 + }, + { + "epoch": 0.5098610706305255, + "grad_norm": 0.2706705927848816, + "learning_rate": 9.832619696379915e-05, + "loss": 1.0078, + "step": 1312 + }, + { + "epoch": 0.510249684251433, + "grad_norm": 0.20216278731822968, + "learning_rate": 9.824834565978981e-05, + "loss": 1.0253, + "step": 1313 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.20736576616764069, + "learning_rate": 9.817049435578046e-05, + "loss": 1.0217, + "step": 1314 + }, + { + "epoch": 0.5110269114932479, + "grad_norm": 0.2275344580411911, + "learning_rate": 9.809264305177113e-05, + "loss": 1.0139, + "step": 1315 + }, + { + "epoch": 0.5114155251141552, + "grad_norm": 0.22243620455265045, + "learning_rate": 9.801479174776178e-05, + "loss": 1.0427, + "step": 1316 + }, + { + "epoch": 0.5118041387350627, + "grad_norm": 0.198841854929924, + "learning_rate": 9.793694044375243e-05, + "loss": 1.0231, + "step": 1317 + }, + { + "epoch": 0.5121927523559701, + "grad_norm": 0.2031068503856659, + "learning_rate": 9.78590891397431e-05, + "loss": 1.0184, + "step": 1318 + }, + { + "epoch": 0.5125813659768775, + "grad_norm": 0.21712587773799896, + "learning_rate": 
9.778123783573375e-05, + "loss": 1.0205, + "step": 1319 + }, + { + "epoch": 0.5129699795977849, + "grad_norm": 0.19366060197353363, + "learning_rate": 9.77033865317244e-05, + "loss": 0.9623, + "step": 1320 + }, + { + "epoch": 0.5133585932186923, + "grad_norm": 0.19845952093601227, + "learning_rate": 9.762553522771507e-05, + "loss": 1.0209, + "step": 1321 + }, + { + "epoch": 0.5137472068395997, + "grad_norm": 0.19700276851654053, + "learning_rate": 9.754768392370572e-05, + "loss": 0.9506, + "step": 1322 + }, + { + "epoch": 0.5141358204605071, + "grad_norm": 0.19797460734844208, + "learning_rate": 9.746983261969639e-05, + "loss": 1.0928, + "step": 1323 + }, + { + "epoch": 0.5145244340814146, + "grad_norm": 0.20470699667930603, + "learning_rate": 9.739198131568704e-05, + "loss": 1.0835, + "step": 1324 + }, + { + "epoch": 0.5149130477023219, + "grad_norm": 0.19121742248535156, + "learning_rate": 9.731413001167769e-05, + "loss": 0.9877, + "step": 1325 + }, + { + "epoch": 0.5153016613232294, + "grad_norm": 0.20026616752147675, + "learning_rate": 9.723627870766836e-05, + "loss": 1.0094, + "step": 1326 + }, + { + "epoch": 0.5156902749441368, + "grad_norm": 0.2214539796113968, + "learning_rate": 9.715842740365901e-05, + "loss": 0.9867, + "step": 1327 + }, + { + "epoch": 0.5160788885650442, + "grad_norm": 0.22674603760242462, + "learning_rate": 9.708057609964967e-05, + "loss": 1.0738, + "step": 1328 + }, + { + "epoch": 0.5164675021859516, + "grad_norm": 0.21274834871292114, + "learning_rate": 9.700272479564033e-05, + "loss": 1.0458, + "step": 1329 + }, + { + "epoch": 0.5168561158068591, + "grad_norm": 0.20305052399635315, + "learning_rate": 9.692487349163099e-05, + "loss": 1.0041, + "step": 1330 + }, + { + "epoch": 0.5172447294277664, + "grad_norm": 0.1840772181749344, + "learning_rate": 9.684702218762166e-05, + "loss": 0.9498, + "step": 1331 + }, + { + "epoch": 0.5176333430486738, + "grad_norm": 0.2055782824754715, + "learning_rate": 9.676917088361231e-05, + "loss": 1.0223, 
+ "step": 1332 + }, + { + "epoch": 0.5180219566695813, + "grad_norm": 0.21826402842998505, + "learning_rate": 9.669131957960297e-05, + "loss": 1.1068, + "step": 1333 + }, + { + "epoch": 0.5184105702904886, + "grad_norm": 0.22516922652721405, + "learning_rate": 9.661346827559363e-05, + "loss": 1.0957, + "step": 1334 + }, + { + "epoch": 0.5187991839113961, + "grad_norm": 0.21044284105300903, + "learning_rate": 9.653561697158428e-05, + "loss": 1.0384, + "step": 1335 + }, + { + "epoch": 0.5191877975323035, + "grad_norm": 0.20275571942329407, + "learning_rate": 9.645776566757494e-05, + "loss": 0.9978, + "step": 1336 + }, + { + "epoch": 0.519576411153211, + "grad_norm": 0.2077122926712036, + "learning_rate": 9.63799143635656e-05, + "loss": 1.0418, + "step": 1337 + }, + { + "epoch": 0.5199650247741183, + "grad_norm": 0.19158867001533508, + "learning_rate": 9.630206305955625e-05, + "loss": 1.0527, + "step": 1338 + }, + { + "epoch": 0.5203536383950258, + "grad_norm": 0.1932496577501297, + "learning_rate": 9.622421175554691e-05, + "loss": 1.0039, + "step": 1339 + }, + { + "epoch": 0.5207422520159332, + "grad_norm": 0.21937766671180725, + "learning_rate": 9.614636045153757e-05, + "loss": 1.0373, + "step": 1340 + }, + { + "epoch": 0.5211308656368405, + "grad_norm": 0.2268432229757309, + "learning_rate": 9.606850914752823e-05, + "loss": 1.0815, + "step": 1341 + }, + { + "epoch": 0.521519479257748, + "grad_norm": 0.2147454470396042, + "learning_rate": 9.599065784351888e-05, + "loss": 1.0331, + "step": 1342 + }, + { + "epoch": 0.5219080928786554, + "grad_norm": 0.19899709522724152, + "learning_rate": 9.591280653950954e-05, + "loss": 1.032, + "step": 1343 + }, + { + "epoch": 0.5222967064995628, + "grad_norm": 0.19646069407463074, + "learning_rate": 9.58349552355002e-05, + "loss": 0.9788, + "step": 1344 + }, + { + "epoch": 0.5226853201204702, + "grad_norm": 0.2146075963973999, + "learning_rate": 9.575710393149085e-05, + "loss": 1.0201, + "step": 1345 + }, + { + "epoch": 
0.5230739337413777, + "grad_norm": 0.1968650370836258, + "learning_rate": 9.567925262748152e-05, + "loss": 0.9894, + "step": 1346 + }, + { + "epoch": 0.523462547362285, + "grad_norm": 0.21111296117305756, + "learning_rate": 9.560140132347217e-05, + "loss": 1.0961, + "step": 1347 + }, + { + "epoch": 0.5238511609831925, + "grad_norm": 0.20917272567749023, + "learning_rate": 9.552355001946282e-05, + "loss": 1.0435, + "step": 1348 + }, + { + "epoch": 0.5242397746040999, + "grad_norm": 0.2029752880334854, + "learning_rate": 9.544569871545349e-05, + "loss": 1.0328, + "step": 1349 + }, + { + "epoch": 0.5246283882250072, + "grad_norm": 0.20726613700389862, + "learning_rate": 9.536784741144414e-05, + "loss": 1.0465, + "step": 1350 + }, + { + "epoch": 0.5250170018459147, + "grad_norm": 0.19778740406036377, + "learning_rate": 9.52899961074348e-05, + "loss": 1.0058, + "step": 1351 + }, + { + "epoch": 0.5254056154668221, + "grad_norm": 0.19958540797233582, + "learning_rate": 9.521214480342546e-05, + "loss": 1.0164, + "step": 1352 + }, + { + "epoch": 0.5257942290877295, + "grad_norm": 0.2151395082473755, + "learning_rate": 9.513429349941611e-05, + "loss": 1.0703, + "step": 1353 + }, + { + "epoch": 0.5261828427086369, + "grad_norm": 0.2366979569196701, + "learning_rate": 9.505644219540678e-05, + "loss": 0.9832, + "step": 1354 + }, + { + "epoch": 0.5265714563295444, + "grad_norm": 0.22064165771007538, + "learning_rate": 9.497859089139743e-05, + "loss": 1.0181, + "step": 1355 + }, + { + "epoch": 0.5269600699504517, + "grad_norm": 0.20221936702728271, + "learning_rate": 9.49007395873881e-05, + "loss": 1.0424, + "step": 1356 + }, + { + "epoch": 0.5273486835713592, + "grad_norm": 0.19608759880065918, + "learning_rate": 9.482288828337876e-05, + "loss": 1.0074, + "step": 1357 + }, + { + "epoch": 0.5277372971922666, + "grad_norm": 0.20686689019203186, + "learning_rate": 9.474503697936941e-05, + "loss": 1.0213, + "step": 1358 + }, + { + "epoch": 0.528125910813174, + "grad_norm": 
0.223610520362854, + "learning_rate": 9.466718567536008e-05, + "loss": 1.05, + "step": 1359 + }, + { + "epoch": 0.5285145244340814, + "grad_norm": 0.2135966569185257, + "learning_rate": 9.458933437135073e-05, + "loss": 1.034, + "step": 1360 + }, + { + "epoch": 0.5289031380549888, + "grad_norm": 0.1933239996433258, + "learning_rate": 9.451148306734138e-05, + "loss": 0.9883, + "step": 1361 + }, + { + "epoch": 0.5292917516758963, + "grad_norm": 0.20794694125652313, + "learning_rate": 9.443363176333205e-05, + "loss": 1.0103, + "step": 1362 + }, + { + "epoch": 0.5296803652968036, + "grad_norm": 0.20128493010997772, + "learning_rate": 9.43557804593227e-05, + "loss": 1.015, + "step": 1363 + }, + { + "epoch": 0.5300689789177111, + "grad_norm": 0.2128933072090149, + "learning_rate": 9.427792915531336e-05, + "loss": 1.0038, + "step": 1364 + }, + { + "epoch": 0.5304575925386185, + "grad_norm": 0.2046983689069748, + "learning_rate": 9.420007785130402e-05, + "loss": 0.9948, + "step": 1365 + }, + { + "epoch": 0.5308462061595259, + "grad_norm": 0.20909680426120758, + "learning_rate": 9.412222654729467e-05, + "loss": 1.0308, + "step": 1366 + }, + { + "epoch": 0.5312348197804333, + "grad_norm": 0.2182164192199707, + "learning_rate": 9.404437524328533e-05, + "loss": 1.0018, + "step": 1367 + }, + { + "epoch": 0.5316234334013407, + "grad_norm": 0.2107028216123581, + "learning_rate": 9.396652393927599e-05, + "loss": 1.0419, + "step": 1368 + }, + { + "epoch": 0.5320120470222481, + "grad_norm": 0.24631445109844208, + "learning_rate": 9.388867263526665e-05, + "loss": 1.0171, + "step": 1369 + }, + { + "epoch": 0.5324006606431555, + "grad_norm": 0.20331013202667236, + "learning_rate": 9.38108213312573e-05, + "loss": 1.0592, + "step": 1370 + }, + { + "epoch": 0.532789274264063, + "grad_norm": 0.19266058504581451, + "learning_rate": 9.373297002724796e-05, + "loss": 0.9912, + "step": 1371 + }, + { + "epoch": 0.5331778878849703, + "grad_norm": 0.22874227166175842, + "learning_rate": 
9.365511872323862e-05, + "loss": 1.0533, + "step": 1372 + }, + { + "epoch": 0.5335665015058778, + "grad_norm": 0.2088235765695572, + "learning_rate": 9.357726741922927e-05, + "loss": 1.0464, + "step": 1373 + }, + { + "epoch": 0.5339551151267852, + "grad_norm": 0.2112397700548172, + "learning_rate": 9.349941611521994e-05, + "loss": 1.0503, + "step": 1374 + }, + { + "epoch": 0.5343437287476926, + "grad_norm": 0.20712170004844666, + "learning_rate": 9.342156481121059e-05, + "loss": 1.0237, + "step": 1375 + }, + { + "epoch": 0.5347323423686, + "grad_norm": 0.20077116787433624, + "learning_rate": 9.334371350720124e-05, + "loss": 1.0467, + "step": 1376 + }, + { + "epoch": 0.5351209559895075, + "grad_norm": 0.20394501090049744, + "learning_rate": 9.326586220319191e-05, + "loss": 1.0054, + "step": 1377 + }, + { + "epoch": 0.5355095696104148, + "grad_norm": 0.19459395110607147, + "learning_rate": 9.318801089918256e-05, + "loss": 0.9792, + "step": 1378 + }, + { + "epoch": 0.5358981832313222, + "grad_norm": 0.2116049826145172, + "learning_rate": 9.311015959517321e-05, + "loss": 1.0345, + "step": 1379 + }, + { + "epoch": 0.5362867968522297, + "grad_norm": 0.21672269701957703, + "learning_rate": 9.303230829116388e-05, + "loss": 1.0709, + "step": 1380 + }, + { + "epoch": 0.536675410473137, + "grad_norm": 0.20358407497406006, + "learning_rate": 9.295445698715453e-05, + "loss": 1.0534, + "step": 1381 + }, + { + "epoch": 0.5370640240940445, + "grad_norm": 0.19512853026390076, + "learning_rate": 9.28766056831452e-05, + "loss": 0.9397, + "step": 1382 + }, + { + "epoch": 0.5374526377149519, + "grad_norm": 0.2140122503042221, + "learning_rate": 9.279875437913586e-05, + "loss": 1.0164, + "step": 1383 + }, + { + "epoch": 0.5378412513358594, + "grad_norm": 0.20486049354076385, + "learning_rate": 9.272090307512651e-05, + "loss": 0.9892, + "step": 1384 + }, + { + "epoch": 0.5382298649567667, + "grad_norm": 0.20023222267627716, + "learning_rate": 9.264305177111718e-05, + "loss": 1.0019, + 
"step": 1385 + }, + { + "epoch": 0.5386184785776742, + "grad_norm": 0.20024439692497253, + "learning_rate": 9.256520046710783e-05, + "loss": 0.9717, + "step": 1386 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 0.21021386981010437, + "learning_rate": 9.24873491630985e-05, + "loss": 1.028, + "step": 1387 + }, + { + "epoch": 0.5393957058194889, + "grad_norm": 0.18508704006671906, + "learning_rate": 9.240949785908915e-05, + "loss": 1.0008, + "step": 1388 + }, + { + "epoch": 0.5397843194403964, + "grad_norm": 0.19351208209991455, + "learning_rate": 9.23316465550798e-05, + "loss": 0.9898, + "step": 1389 + }, + { + "epoch": 0.5401729330613038, + "grad_norm": 0.20341919362545013, + "learning_rate": 9.225379525107047e-05, + "loss": 1.0203, + "step": 1390 + }, + { + "epoch": 0.5405615466822112, + "grad_norm": 0.1942797303199768, + "learning_rate": 9.217594394706112e-05, + "loss": 1.003, + "step": 1391 + }, + { + "epoch": 0.5409501603031186, + "grad_norm": 0.2056138813495636, + "learning_rate": 9.209809264305178e-05, + "loss": 1.0149, + "step": 1392 + }, + { + "epoch": 0.5413387739240261, + "grad_norm": 0.21572062373161316, + "learning_rate": 9.202024133904244e-05, + "loss": 0.9808, + "step": 1393 + }, + { + "epoch": 0.5417273875449334, + "grad_norm": 0.19841499626636505, + "learning_rate": 9.194239003503309e-05, + "loss": 1.0467, + "step": 1394 + }, + { + "epoch": 0.5421160011658409, + "grad_norm": 0.20452147722244263, + "learning_rate": 9.186453873102375e-05, + "loss": 1.0378, + "step": 1395 + }, + { + "epoch": 0.5425046147867483, + "grad_norm": 0.2090451419353485, + "learning_rate": 9.17866874270144e-05, + "loss": 1.0823, + "step": 1396 + }, + { + "epoch": 0.5428932284076556, + "grad_norm": 0.215814009308815, + "learning_rate": 9.170883612300506e-05, + "loss": 1.0994, + "step": 1397 + }, + { + "epoch": 0.5432818420285631, + "grad_norm": 0.19924724102020264, + "learning_rate": 9.163098481899572e-05, + "loss": 1.0099, + "step": 1398 + }, + { + "epoch": 
0.5436704556494705, + "grad_norm": 0.20074865221977234, + "learning_rate": 9.155313351498638e-05, + "loss": 1.0163, + "step": 1399 + }, + { + "epoch": 0.544059069270378, + "grad_norm": 0.21737203001976013, + "learning_rate": 9.147528221097704e-05, + "loss": 1.0527, + "step": 1400 + }, + { + "epoch": 0.5444476828912853, + "grad_norm": 0.2036885768175125, + "learning_rate": 9.139743090696769e-05, + "loss": 1.0208, + "step": 1401 + }, + { + "epoch": 0.5448362965121928, + "grad_norm": 0.20861585438251495, + "learning_rate": 9.131957960295835e-05, + "loss": 1.0175, + "step": 1402 + }, + { + "epoch": 0.5452249101331001, + "grad_norm": 0.23425570130348206, + "learning_rate": 9.124172829894901e-05, + "loss": 1.053, + "step": 1403 + }, + { + "epoch": 0.5456135237540076, + "grad_norm": 0.20389291644096375, + "learning_rate": 9.116387699493966e-05, + "loss": 1.0479, + "step": 1404 + }, + { + "epoch": 0.546002137374915, + "grad_norm": 0.20166678726673126, + "learning_rate": 9.108602569093033e-05, + "loss": 1.0064, + "step": 1405 + }, + { + "epoch": 0.5463907509958223, + "grad_norm": 0.21419203281402588, + "learning_rate": 9.100817438692098e-05, + "loss": 1.0122, + "step": 1406 + }, + { + "epoch": 0.5467793646167298, + "grad_norm": 0.20541758835315704, + "learning_rate": 9.093032308291165e-05, + "loss": 1.0355, + "step": 1407 + }, + { + "epoch": 0.5471679782376372, + "grad_norm": 0.21865367889404297, + "learning_rate": 9.08524717789023e-05, + "loss": 1.0201, + "step": 1408 + }, + { + "epoch": 0.5475565918585447, + "grad_norm": 0.21181468665599823, + "learning_rate": 9.077462047489296e-05, + "loss": 1.0501, + "step": 1409 + }, + { + "epoch": 0.547945205479452, + "grad_norm": 0.21016767621040344, + "learning_rate": 9.069676917088362e-05, + "loss": 1.0452, + "step": 1410 + }, + { + "epoch": 0.5483338191003595, + "grad_norm": 0.21119755506515503, + "learning_rate": 9.061891786687428e-05, + "loss": 1.0935, + "step": 1411 + }, + { + "epoch": 0.5487224327212669, + "grad_norm": 
0.20688095688819885, + "learning_rate": 9.054106656286493e-05, + "loss": 1.0526, + "step": 1412 + }, + { + "epoch": 0.5491110463421743, + "grad_norm": 0.21857528388500214, + "learning_rate": 9.04632152588556e-05, + "loss": 1.0067, + "step": 1413 + }, + { + "epoch": 0.5494996599630817, + "grad_norm": 0.2196548581123352, + "learning_rate": 9.038536395484625e-05, + "loss": 1.0263, + "step": 1414 + }, + { + "epoch": 0.5498882735839892, + "grad_norm": 0.21952040493488312, + "learning_rate": 9.03075126508369e-05, + "loss": 1.0009, + "step": 1415 + }, + { + "epoch": 0.5502768872048965, + "grad_norm": 0.20059294998645782, + "learning_rate": 9.022966134682757e-05, + "loss": 1.0481, + "step": 1416 + }, + { + "epoch": 0.5506655008258039, + "grad_norm": 0.1960824728012085, + "learning_rate": 9.015181004281822e-05, + "loss": 1.0003, + "step": 1417 + }, + { + "epoch": 0.5510541144467114, + "grad_norm": 0.19051724672317505, + "learning_rate": 9.007395873880889e-05, + "loss": 0.9556, + "step": 1418 + }, + { + "epoch": 0.5514427280676187, + "grad_norm": 0.21008028090000153, + "learning_rate": 8.999610743479954e-05, + "loss": 1.0457, + "step": 1419 + }, + { + "epoch": 0.5518313416885262, + "grad_norm": 0.21465444564819336, + "learning_rate": 8.991825613079019e-05, + "loss": 1.0196, + "step": 1420 + }, + { + "epoch": 0.5522199553094336, + "grad_norm": 0.2062770277261734, + "learning_rate": 8.984040482678086e-05, + "loss": 1.0501, + "step": 1421 + }, + { + "epoch": 0.552608568930341, + "grad_norm": 0.21400012075901031, + "learning_rate": 8.976255352277151e-05, + "loss": 1.0711, + "step": 1422 + }, + { + "epoch": 0.5529971825512484, + "grad_norm": 0.19617624580860138, + "learning_rate": 8.968470221876217e-05, + "loss": 0.9858, + "step": 1423 + }, + { + "epoch": 0.5533857961721559, + "grad_norm": 0.20835624635219574, + "learning_rate": 8.960685091475283e-05, + "loss": 1.0122, + "step": 1424 + }, + { + "epoch": 0.5537744097930632, + "grad_norm": 0.21708111464977264, + "learning_rate": 
8.952899961074348e-05, + "loss": 1.0108, + "step": 1425 + }, + { + "epoch": 0.5541630234139706, + "grad_norm": 0.20877864956855774, + "learning_rate": 8.945114830673414e-05, + "loss": 1.0389, + "step": 1426 + }, + { + "epoch": 0.5545516370348781, + "grad_norm": 0.1924441158771515, + "learning_rate": 8.93732970027248e-05, + "loss": 1.0088, + "step": 1427 + }, + { + "epoch": 0.5549402506557854, + "grad_norm": 0.20288826525211334, + "learning_rate": 8.929544569871546e-05, + "loss": 1.0296, + "step": 1428 + }, + { + "epoch": 0.5553288642766929, + "grad_norm": 0.2008143663406372, + "learning_rate": 8.921759439470611e-05, + "loss": 1.0521, + "step": 1429 + }, + { + "epoch": 0.5557174778976003, + "grad_norm": 0.24407047033309937, + "learning_rate": 8.913974309069677e-05, + "loss": 1.1038, + "step": 1430 + }, + { + "epoch": 0.5561060915185078, + "grad_norm": 0.2172536998987198, + "learning_rate": 8.906189178668743e-05, + "loss": 1.0811, + "step": 1431 + }, + { + "epoch": 0.5564947051394151, + "grad_norm": 0.21712054312229156, + "learning_rate": 8.898404048267808e-05, + "loss": 1.0642, + "step": 1432 + }, + { + "epoch": 0.5568833187603226, + "grad_norm": 0.22482797503471375, + "learning_rate": 8.890618917866875e-05, + "loss": 1.0742, + "step": 1433 + }, + { + "epoch": 0.55727193238123, + "grad_norm": 0.1974876970052719, + "learning_rate": 8.88283378746594e-05, + "loss": 0.9954, + "step": 1434 + }, + { + "epoch": 0.5576605460021373, + "grad_norm": 0.19162166118621826, + "learning_rate": 8.875048657065007e-05, + "loss": 1.0074, + "step": 1435 + }, + { + "epoch": 0.5580491596230448, + "grad_norm": 0.20439045131206512, + "learning_rate": 8.867263526664072e-05, + "loss": 1.026, + "step": 1436 + }, + { + "epoch": 0.5584377732439522, + "grad_norm": 0.1947651207447052, + "learning_rate": 8.859478396263138e-05, + "loss": 0.9848, + "step": 1437 + }, + { + "epoch": 0.5588263868648596, + "grad_norm": 0.21434316039085388, + "learning_rate": 8.851693265862204e-05, + "loss": 1.0843, + 
"step": 1438 + }, + { + "epoch": 0.559215000485767, + "grad_norm": 1.3314417600631714, + "learning_rate": 8.84390813546127e-05, + "loss": 1.0356, + "step": 1439 + }, + { + "epoch": 0.5596036141066745, + "grad_norm": 0.20131289958953857, + "learning_rate": 8.836123005060335e-05, + "loss": 1.0214, + "step": 1440 + }, + { + "epoch": 0.5599922277275818, + "grad_norm": 0.21596461534500122, + "learning_rate": 8.828337874659402e-05, + "loss": 1.0962, + "step": 1441 + }, + { + "epoch": 0.5603808413484893, + "grad_norm": 0.20477193593978882, + "learning_rate": 8.820552744258467e-05, + "loss": 1.0643, + "step": 1442 + }, + { + "epoch": 0.5607694549693967, + "grad_norm": 0.1978107988834381, + "learning_rate": 8.812767613857532e-05, + "loss": 1.0054, + "step": 1443 + }, + { + "epoch": 0.561158068590304, + "grad_norm": 0.219422847032547, + "learning_rate": 8.804982483456599e-05, + "loss": 1.0009, + "step": 1444 + }, + { + "epoch": 0.5615466822112115, + "grad_norm": 0.21489015221595764, + "learning_rate": 8.797197353055664e-05, + "loss": 1.052, + "step": 1445 + }, + { + "epoch": 0.5619352958321189, + "grad_norm": 0.2235930860042572, + "learning_rate": 8.78941222265473e-05, + "loss": 1.037, + "step": 1446 + }, + { + "epoch": 0.5623239094530263, + "grad_norm": 0.19922038912773132, + "learning_rate": 8.781627092253796e-05, + "loss": 1.0006, + "step": 1447 + }, + { + "epoch": 0.5627125230739337, + "grad_norm": 0.24740247428417206, + "learning_rate": 8.773841961852861e-05, + "loss": 1.0753, + "step": 1448 + }, + { + "epoch": 0.5631011366948412, + "grad_norm": 0.2148803174495697, + "learning_rate": 8.766056831451928e-05, + "loss": 1.0712, + "step": 1449 + }, + { + "epoch": 0.5634897503157485, + "grad_norm": 0.19838745892047882, + "learning_rate": 8.758271701050993e-05, + "loss": 1.027, + "step": 1450 + }, + { + "epoch": 0.563878363936656, + "grad_norm": 0.20328201353549957, + "learning_rate": 8.750486570650058e-05, + "loss": 1.0117, + "step": 1451 + }, + { + "epoch": 
0.5642669775575634, + "grad_norm": 0.21230114996433258, + "learning_rate": 8.742701440249125e-05, + "loss": 1.0658, + "step": 1452 + }, + { + "epoch": 0.5646555911784708, + "grad_norm": 0.2030259519815445, + "learning_rate": 8.73491630984819e-05, + "loss": 1.0002, + "step": 1453 + }, + { + "epoch": 0.5650442047993782, + "grad_norm": 0.21404659748077393, + "learning_rate": 8.727131179447256e-05, + "loss": 1.0572, + "step": 1454 + }, + { + "epoch": 0.5654328184202856, + "grad_norm": 0.2148464322090149, + "learning_rate": 8.719346049046322e-05, + "loss": 1.0164, + "step": 1455 + }, + { + "epoch": 0.5658214320411931, + "grad_norm": 0.22083118557929993, + "learning_rate": 8.711560918645387e-05, + "loss": 0.9704, + "step": 1456 + }, + { + "epoch": 0.5662100456621004, + "grad_norm": 0.19305935502052307, + "learning_rate": 8.703775788244453e-05, + "loss": 1.0034, + "step": 1457 + }, + { + "epoch": 0.5665986592830079, + "grad_norm": 0.2100098729133606, + "learning_rate": 8.695990657843518e-05, + "loss": 1.0907, + "step": 1458 + }, + { + "epoch": 0.5669872729039153, + "grad_norm": 0.18947799503803253, + "learning_rate": 8.688205527442585e-05, + "loss": 0.9664, + "step": 1459 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 0.22341710329055786, + "learning_rate": 8.68042039704165e-05, + "loss": 1.0551, + "step": 1460 + }, + { + "epoch": 0.5677645001457301, + "grad_norm": 0.219679057598114, + "learning_rate": 8.672635266640717e-05, + "loss": 1.0398, + "step": 1461 + }, + { + "epoch": 0.5681531137666376, + "grad_norm": 0.22389841079711914, + "learning_rate": 8.664850136239782e-05, + "loss": 1.0472, + "step": 1462 + }, + { + "epoch": 0.5685417273875449, + "grad_norm": 0.21402975916862488, + "learning_rate": 8.657065005838849e-05, + "loss": 1.0224, + "step": 1463 + }, + { + "epoch": 0.5689303410084523, + "grad_norm": 0.20917154848575592, + "learning_rate": 8.649279875437915e-05, + "loss": 1.0526, + "step": 1464 + }, + { + "epoch": 0.5693189546293598, + "grad_norm": 
0.2252056896686554, + "learning_rate": 8.64149474503698e-05, + "loss": 1.1064, + "step": 1465 + }, + { + "epoch": 0.5697075682502671, + "grad_norm": 0.21834802627563477, + "learning_rate": 8.633709614636046e-05, + "loss": 1.0318, + "step": 1466 + }, + { + "epoch": 0.5700961818711746, + "grad_norm": 0.21882353723049164, + "learning_rate": 8.625924484235112e-05, + "loss": 1.0285, + "step": 1467 + }, + { + "epoch": 0.570484795492082, + "grad_norm": 0.2028426229953766, + "learning_rate": 8.618139353834177e-05, + "loss": 1.0356, + "step": 1468 + }, + { + "epoch": 0.5708734091129894, + "grad_norm": 0.22297166287899017, + "learning_rate": 8.610354223433243e-05, + "loss": 1.0804, + "step": 1469 + }, + { + "epoch": 0.5712620227338968, + "grad_norm": 0.21775268018245697, + "learning_rate": 8.602569093032309e-05, + "loss": 0.9978, + "step": 1470 + }, + { + "epoch": 0.5716506363548043, + "grad_norm": 0.20362353324890137, + "learning_rate": 8.594783962631374e-05, + "loss": 0.9982, + "step": 1471 + }, + { + "epoch": 0.5720392499757117, + "grad_norm": 0.21854591369628906, + "learning_rate": 8.586998832230441e-05, + "loss": 1.0465, + "step": 1472 + }, + { + "epoch": 0.572427863596619, + "grad_norm": 0.20501428842544556, + "learning_rate": 8.579213701829506e-05, + "loss": 1.0468, + "step": 1473 + }, + { + "epoch": 0.5728164772175265, + "grad_norm": 0.21606214344501495, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0477, + "step": 1474 + }, + { + "epoch": 0.5732050908384339, + "grad_norm": 0.2100660502910614, + "learning_rate": 8.563643441027638e-05, + "loss": 1.0071, + "step": 1475 + }, + { + "epoch": 0.5735937044593413, + "grad_norm": 0.21008896827697754, + "learning_rate": 8.555858310626703e-05, + "loss": 0.9914, + "step": 1476 + }, + { + "epoch": 0.5739823180802487, + "grad_norm": 0.22192159295082092, + "learning_rate": 8.54807318022577e-05, + "loss": 1.0385, + "step": 1477 + }, + { + "epoch": 0.5743709317011562, + "grad_norm": 0.20123356580734253, + "learning_rate": 
8.540288049824835e-05, + "loss": 1.0062, + "step": 1478 + }, + { + "epoch": 0.5747595453220635, + "grad_norm": 0.201947420835495, + "learning_rate": 8.5325029194239e-05, + "loss": 1.0218, + "step": 1479 + }, + { + "epoch": 0.575148158942971, + "grad_norm": 0.22804415225982666, + "learning_rate": 8.524717789022967e-05, + "loss": 1.0445, + "step": 1480 + }, + { + "epoch": 0.5755367725638784, + "grad_norm": 0.20527036488056183, + "learning_rate": 8.516932658622032e-05, + "loss": 0.9972, + "step": 1481 + }, + { + "epoch": 0.5759253861847857, + "grad_norm": 0.20298773050308228, + "learning_rate": 8.509147528221098e-05, + "loss": 1.0272, + "step": 1482 + }, + { + "epoch": 0.5763139998056932, + "grad_norm": 0.22500957548618317, + "learning_rate": 8.501362397820164e-05, + "loss": 1.0982, + "step": 1483 + }, + { + "epoch": 0.5767026134266006, + "grad_norm": 0.1950521320104599, + "learning_rate": 8.493577267419229e-05, + "loss": 0.9848, + "step": 1484 + }, + { + "epoch": 0.577091227047508, + "grad_norm": 0.21087585389614105, + "learning_rate": 8.485792137018295e-05, + "loss": 1.0125, + "step": 1485 + }, + { + "epoch": 0.5774798406684154, + "grad_norm": 0.20122238993644714, + "learning_rate": 8.47800700661736e-05, + "loss": 1.0533, + "step": 1486 + }, + { + "epoch": 0.5778684542893229, + "grad_norm": 0.20149008929729462, + "learning_rate": 8.470221876216427e-05, + "loss": 1.0719, + "step": 1487 + }, + { + "epoch": 0.5782570679102302, + "grad_norm": 0.21307213604450226, + "learning_rate": 8.462436745815494e-05, + "loss": 1.0522, + "step": 1488 + }, + { + "epoch": 0.5786456815311377, + "grad_norm": 0.21828554570674896, + "learning_rate": 8.454651615414559e-05, + "loss": 1.0184, + "step": 1489 + }, + { + "epoch": 0.5790342951520451, + "grad_norm": 0.22002705931663513, + "learning_rate": 8.446866485013625e-05, + "loss": 1.0101, + "step": 1490 + }, + { + "epoch": 0.5794229087729524, + "grad_norm": 0.19479142129421234, + "learning_rate": 8.43908135461269e-05, + "loss": 0.9889, + 
"step": 1491 + }, + { + "epoch": 0.5798115223938599, + "grad_norm": 0.21346086263656616, + "learning_rate": 8.431296224211756e-05, + "loss": 1.0373, + "step": 1492 + }, + { + "epoch": 0.5802001360147673, + "grad_norm": 0.20177558064460754, + "learning_rate": 8.423511093810822e-05, + "loss": 1.0215, + "step": 1493 + }, + { + "epoch": 0.5805887496356748, + "grad_norm": 0.2117915153503418, + "learning_rate": 8.415725963409888e-05, + "loss": 1.0321, + "step": 1494 + }, + { + "epoch": 0.5809773632565821, + "grad_norm": 0.21304374933242798, + "learning_rate": 8.407940833008954e-05, + "loss": 1.0123, + "step": 1495 + }, + { + "epoch": 0.5813659768774896, + "grad_norm": 0.21173715591430664, + "learning_rate": 8.400155702608019e-05, + "loss": 1.0696, + "step": 1496 + }, + { + "epoch": 0.581754590498397, + "grad_norm": 0.20407019555568695, + "learning_rate": 8.392370572207085e-05, + "loss": 1.0086, + "step": 1497 + }, + { + "epoch": 0.5821432041193044, + "grad_norm": 0.209481880068779, + "learning_rate": 8.384585441806151e-05, + "loss": 0.9975, + "step": 1498 + }, + { + "epoch": 0.5825318177402118, + "grad_norm": 0.22184531390666962, + "learning_rate": 8.376800311405216e-05, + "loss": 1.0956, + "step": 1499 + }, + { + "epoch": 0.5829204313611193, + "grad_norm": 0.21344684064388275, + "learning_rate": 8.369015181004283e-05, + "loss": 1.0685, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 2574, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2833762852661166e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/outputs/checkpoint-2000/README.md b/outputs/checkpoint-2000/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7 --- /dev/null +++ b/outputs/checkpoint-2000/README.md @@ -0,0 +1,209 @@ +--- +base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit +library_name: peft +tags: +- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/outputs/checkpoint-2000/adapter_config.json b/outputs/checkpoint-2000/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7 --- /dev/null +++ b/outputs/checkpoint-2000/adapter_config.json @@ -0,0 +1,45 @@ +{ + "alpha_pattern": {}, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "up_proj", + "down_proj", + "gate_proj", + "k_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/outputs/checkpoint-2000/chat_template.jinja b/outputs/checkpoint-2000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316 --- /dev/null +++ b/outputs/checkpoint-2000/chat_template.jinja @@ -0,0 +1,315 @@ +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". 
+ #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if 
variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties -%} + {{- "(_: " }} + {{- "{\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {{- "// " + param_spec.description + "\n" }} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" 
}} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- "\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the 
most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. 
Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}} + {%- else %} + {{- model_identity }} + {%- endif %} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools is defined %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools is defined -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." 
}} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {%- endif %} + {%- if tools -%} + {{- "\n\n" }} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {%- if "tool_calls" in message %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." 
+ tool_call.name + "<|channel|>commentary json<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif "thinking" in message and loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- set last_tool_call.name = none %} + {%- elif "thinking" in message %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- elif loop.last and not add_generation_prompt %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }} + {%- else %} + {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." 
+ last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- else -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} \ No newline at end of file diff --git a/outputs/checkpoint-2000/special_tokens_map.json b/outputs/checkpoint-2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639 --- /dev/null +++ b/outputs/checkpoint-2000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/outputs/checkpoint-2000/tokenizer.json b/outputs/checkpoint-2000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/outputs/checkpoint-2000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/outputs/checkpoint-2000/tokenizer_config.json b/outputs/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45 --- /dev/null +++ b/outputs/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,185 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|reserved_200017|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/outputs/checkpoint-2000/trainer_state.json b/outputs/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d64370de502ee9836f14148d772ef056a5aed5d9 --- /dev/null +++ b/outputs/checkpoint-2000/trainer_state.json @@ -0,0 +1,14034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7772272418148256, + "eval_steps": 500, + 
"global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 13.684800148010254, + "learning_rate": 0.0, + "loss": 2.3276, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 13.660787582397461, + "learning_rate": 4e-05, + "loss": 2.2792, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 13.35280704498291, + "learning_rate": 8e-05, + "loss": 2.4151, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 6.15027379989624, + "learning_rate": 0.00012, + "loss": 1.7812, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 1.3168226480484009, + "learning_rate": 0.00016, + "loss": 1.4536, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.9872580170631409, + "learning_rate": 0.0002, + "loss": 1.4171, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7496100664138794, + "learning_rate": 0.00019935064935064936, + "loss": 1.4168, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7376005053520203, + "learning_rate": 0.00019870129870129872, + "loss": 1.3659, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5281137824058533, + "learning_rate": 0.00019805194805194807, + "loss": 1.2566, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5485746264457703, + "learning_rate": 0.00019740259740259742, + "loss": 1.3761, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5506592392921448, + "learning_rate": 0.00019675324675324675, + "loss": 1.3327, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.49382686614990234, + "learning_rate": 0.00019610389610389613, + "loss": 1.3727, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.36203011870384216, + "learning_rate": 0.00019545454545454548, + "loss": 1.1515, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3528599739074707, + "learning_rate": 0.0001948051948051948, + "loss": 1.2636, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 
0.31244418025016785, + "learning_rate": 0.00019415584415584416, + "loss": 1.1873, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.3379523754119873, + "learning_rate": 0.00019350649350649354, + "loss": 1.2657, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3025083839893341, + "learning_rate": 0.00019285714285714286, + "loss": 1.2846, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.2560190260410309, + "learning_rate": 0.00019220779220779222, + "loss": 1.1587, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.2554129958152771, + "learning_rate": 0.00019155844155844157, + "loss": 1.2812, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.22662702202796936, + "learning_rate": 0.00019090909090909092, + "loss": 1.1664, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.2515714168548584, + "learning_rate": 0.00019025974025974027, + "loss": 1.2177, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.24396637082099915, + "learning_rate": 0.00018961038961038963, + "loss": 1.2053, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.24488303065299988, + "learning_rate": 0.00018896103896103895, + "loss": 1.2074, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.2168620079755783, + "learning_rate": 0.00018831168831168833, + "loss": 1.1284, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.24021224677562714, + "learning_rate": 0.00018766233766233769, + "loss": 1.2169, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.20057056844234467, + "learning_rate": 0.000187012987012987, + "loss": 1.1031, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.19900795817375183, + "learning_rate": 0.00018636363636363636, + "loss": 1.1004, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.2019268423318863, + "learning_rate": 0.00018571428571428572, + "loss": 1.1476, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.1996479034423828, + "learning_rate": 0.00018506493506493507, + "loss": 
1.1455, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.25262022018432617, + "learning_rate": 0.00018441558441558442, + "loss": 1.1025, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.225438192486763, + "learning_rate": 0.00018376623376623378, + "loss": 1.1954, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.17834505438804626, + "learning_rate": 0.00018311688311688313, + "loss": 1.0934, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.20071206986904144, + "learning_rate": 0.00018246753246753248, + "loss": 1.0488, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.1920139640569687, + "learning_rate": 0.00018181818181818183, + "loss": 1.123, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.18714852631092072, + "learning_rate": 0.0001811688311688312, + "loss": 1.0798, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.18315713107585907, + "learning_rate": 0.00018051948051948054, + "loss": 1.1107, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.19156870245933533, + "learning_rate": 0.00017987012987012987, + "loss": 1.1125, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.21527768671512604, + "learning_rate": 0.00017922077922077922, + "loss": 1.1346, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.1871163249015808, + "learning_rate": 0.0001785714285714286, + "loss": 1.0742, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.17750784754753113, + "learning_rate": 0.00017792207792207792, + "loss": 1.1323, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.177419051527977, + "learning_rate": 0.00017727272727272728, + "loss": 1.1405, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.16714292764663696, + "learning_rate": 0.00017662337662337663, + "loss": 1.1084, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.1610356718301773, + "learning_rate": 0.00017597402597402598, + "loss": 1.1125, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 
0.2548656761646271, + "learning_rate": 0.00017532467532467534, + "loss": 1.1114, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.1731044203042984, + "learning_rate": 0.0001746753246753247, + "loss": 1.1197, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.1739533394575119, + "learning_rate": 0.00017402597402597401, + "loss": 1.1777, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.2178352177143097, + "learning_rate": 0.0001733766233766234, + "loss": 1.1111, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.17247150838375092, + "learning_rate": 0.00017272727272727275, + "loss": 1.1253, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.18075324594974518, + "learning_rate": 0.00017207792207792207, + "loss": 1.1358, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.15898071229457855, + "learning_rate": 0.00017142857142857143, + "loss": 1.0606, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.16518613696098328, + "learning_rate": 0.0001707792207792208, + "loss": 1.0944, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.16035063564777374, + "learning_rate": 0.00017012987012987013, + "loss": 1.0554, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.1686483472585678, + "learning_rate": 0.00016948051948051948, + "loss": 1.0384, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.16575631499290466, + "learning_rate": 0.00016883116883116884, + "loss": 1.0243, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.16840039193630219, + "learning_rate": 0.0001681818181818182, + "loss": 1.117, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.17616064846515656, + "learning_rate": 0.00016753246753246754, + "loss": 1.0743, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.168218195438385, + "learning_rate": 0.0001668831168831169, + "loss": 1.0627, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.17026656866073608, + "learning_rate": 0.00016623376623376625, + "loss": 1.0059, + 
"step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.16454458236694336, + "learning_rate": 0.0001655844155844156, + "loss": 0.9943, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.17185136675834656, + "learning_rate": 0.00016493506493506495, + "loss": 1.1545, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.17822986841201782, + "learning_rate": 0.00016428571428571428, + "loss": 1.073, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.1676608771085739, + "learning_rate": 0.00016363636363636366, + "loss": 1.0886, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.1727771908044815, + "learning_rate": 0.000162987012987013, + "loss": 1.0432, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.17827573418617249, + "learning_rate": 0.00016233766233766234, + "loss": 1.083, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.19807517528533936, + "learning_rate": 0.0001616883116883117, + "loss": 1.1208, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.17693684995174408, + "learning_rate": 0.00016103896103896104, + "loss": 1.089, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.15489234030246735, + "learning_rate": 0.0001603896103896104, + "loss": 0.9707, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.16443990170955658, + "learning_rate": 0.00015974025974025975, + "loss": 1.0643, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.2051103413105011, + "learning_rate": 0.0001590909090909091, + "loss": 1.1246, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.18824075162410736, + "learning_rate": 0.00015844155844155845, + "loss": 1.0855, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.18659448623657227, + "learning_rate": 0.0001577922077922078, + "loss": 1.1412, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.1854114979505539, + "learning_rate": 0.00015714285714285716, + "loss": 1.0249, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.1876193732023239, + 
"learning_rate": 0.00015649350649350649, + "loss": 1.1029, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.1888684630393982, + "learning_rate": 0.00015584415584415587, + "loss": 1.0789, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.20240606367588043, + "learning_rate": 0.0001551948051948052, + "loss": 1.0495, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.232120081782341, + "learning_rate": 0.00015454545454545454, + "loss": 1.0735, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.16897843778133392, + "learning_rate": 0.0001538961038961039, + "loss": 1.0164, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.18796634674072266, + "learning_rate": 0.00015324675324675325, + "loss": 1.0676, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.19574032723903656, + "learning_rate": 0.0001525974025974026, + "loss": 1.0456, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.18007811903953552, + "learning_rate": 0.00015194805194805196, + "loss": 1.0894, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.18932929635047913, + "learning_rate": 0.0001512987012987013, + "loss": 1.0729, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.20614288747310638, + "learning_rate": 0.00015064935064935066, + "loss": 1.0854, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.19291089475154877, + "learning_rate": 0.00015000000000000001, + "loss": 1.1217, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.18916529417037964, + "learning_rate": 0.00014935064935064934, + "loss": 1.0963, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.20306220650672913, + "learning_rate": 0.00014870129870129872, + "loss": 1.0898, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.17870067059993744, + "learning_rate": 0.00014805194805194807, + "loss": 1.0213, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.18411923944950104, + "learning_rate": 0.0001474025974025974, + "loss": 1.0844, + "step": 87 + }, + 
{ + "epoch": 0.2816, + "grad_norm": 0.18788227438926697, + "learning_rate": 0.00014675324675324675, + "loss": 1.0338, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.23874884843826294, + "learning_rate": 0.00014610389610389613, + "loss": 1.1118, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.19380499422550201, + "learning_rate": 0.00014545454545454546, + "loss": 1.0464, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.18968750536441803, + "learning_rate": 0.0001448051948051948, + "loss": 1.0569, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.19545753300189972, + "learning_rate": 0.00014415584415584416, + "loss": 1.1225, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.19170494377613068, + "learning_rate": 0.00014350649350649352, + "loss": 1.0602, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.17953918874263763, + "learning_rate": 0.00014285714285714287, + "loss": 1.032, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.1822536289691925, + "learning_rate": 0.00014220779220779222, + "loss": 1.0559, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.18591298162937164, + "learning_rate": 0.00014155844155844155, + "loss": 1.031, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.2129002958536148, + "learning_rate": 0.00014090909090909093, + "loss": 1.1391, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.18386681377887726, + "learning_rate": 0.00014025974025974028, + "loss": 0.9919, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.18314239382743835, + "learning_rate": 0.0001396103896103896, + "loss": 1.0445, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.1999066174030304, + "learning_rate": 0.00013896103896103896, + "loss": 1.0538, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.18741188943386078, + "learning_rate": 0.00013831168831168834, + "loss": 1.0722, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.19351010024547577, + "learning_rate": 
0.00013766233766233766, + "loss": 1.0491, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.18859203159809113, + "learning_rate": 0.00013701298701298702, + "loss": 1.0593, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.1962767392396927, + "learning_rate": 0.00013636363636363637, + "loss": 1.1344, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.20819440484046936, + "learning_rate": 0.00013571428571428572, + "loss": 1.1137, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.19590184092521667, + "learning_rate": 0.00013506493506493507, + "loss": 1.0624, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.18631424009799957, + "learning_rate": 0.00013441558441558443, + "loss": 1.0587, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.19572143256664276, + "learning_rate": 0.00013376623376623375, + "loss": 1.0494, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.1910988837480545, + "learning_rate": 0.00013311688311688313, + "loss": 1.0481, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.19455869495868683, + "learning_rate": 0.00013246753246753249, + "loss": 1.029, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.18669827282428741, + "learning_rate": 0.0001318181818181818, + "loss": 1.0513, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.17523664236068726, + "learning_rate": 0.0001311688311688312, + "loss": 1.0126, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.17929129302501678, + "learning_rate": 0.00013051948051948052, + "loss": 1.0717, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.19380168616771698, + "learning_rate": 0.00012987012987012987, + "loss": 1.0324, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.18090228736400604, + "learning_rate": 0.00012922077922077922, + "loss": 1.0515, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.2067340910434723, + "learning_rate": 0.00012857142857142858, + "loss": 1.0939, + "step": 116 + }, + 
{ + "epoch": 0.3744, + "grad_norm": 0.1880485862493515, + "learning_rate": 0.00012792207792207793, + "loss": 1.0986, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.182168647646904, + "learning_rate": 0.00012727272727272728, + "loss": 1.0109, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.20187129080295563, + "learning_rate": 0.00012662337662337663, + "loss": 1.0668, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.2082669734954834, + "learning_rate": 0.000125974025974026, + "loss": 1.054, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.18294434249401093, + "learning_rate": 0.00012532467532467534, + "loss": 1.0397, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.20515067875385284, + "learning_rate": 0.00012467532467532467, + "loss": 1.1092, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.1758790761232376, + "learning_rate": 0.00012402597402597402, + "loss": 0.9755, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.2170792669057846, + "learning_rate": 0.0001233766233766234, + "loss": 1.0434, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.202157124876976, + "learning_rate": 0.00012272727272727272, + "loss": 1.1129, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.18556398153305054, + "learning_rate": 0.00012207792207792208, + "loss": 1.0665, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.20196087658405304, + "learning_rate": 0.00012142857142857143, + "loss": 1.1, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.1921566128730774, + "learning_rate": 0.0001207792207792208, + "loss": 1.0918, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.18866224586963654, + "learning_rate": 0.00012012987012987014, + "loss": 1.0014, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.207601398229599, + "learning_rate": 0.00011948051948051949, + "loss": 1.0726, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.21592366695404053, + "learning_rate": 
0.00011883116883116883, + "loss": 1.1379, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2016124576330185, + "learning_rate": 0.0001181818181818182, + "loss": 1.1428, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.20478437840938568, + "learning_rate": 0.00011753246753246753, + "loss": 1.121, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.22730594873428345, + "learning_rate": 0.00011688311688311689, + "loss": 1.0319, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.22592711448669434, + "learning_rate": 0.00011623376623376625, + "loss": 1.1264, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.20035041868686676, + "learning_rate": 0.00011558441558441559, + "loss": 1.0686, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.20648567378520966, + "learning_rate": 0.00011493506493506494, + "loss": 1.0817, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.21222743391990662, + "learning_rate": 0.00011428571428571428, + "loss": 1.0678, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.2075391560792923, + "learning_rate": 0.00011363636363636365, + "loss": 1.0897, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.1964101791381836, + "learning_rate": 0.000112987012987013, + "loss": 1.0906, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.22406511008739471, + "learning_rate": 0.00011233766233766234, + "loss": 1.0594, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.23787978291511536, + "learning_rate": 0.00011168831168831168, + "loss": 1.1053, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.21196185052394867, + "learning_rate": 0.00011103896103896105, + "loss": 1.0923, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.21042804419994354, + "learning_rate": 0.0001103896103896104, + "loss": 1.0381, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.2267436534166336, + "learning_rate": 0.00010974025974025974, + "loss": 1.0818, + "step": 145 + }, + { 
+ "epoch": 0.4672, + "grad_norm": 0.23742735385894775, + "learning_rate": 0.00010909090909090909, + "loss": 1.0872, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.17787213623523712, + "learning_rate": 0.00010844155844155846, + "loss": 1.03, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.22422832250595093, + "learning_rate": 0.0001077922077922078, + "loss": 1.0738, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.22946301102638245, + "learning_rate": 0.00010714285714285715, + "loss": 1.0274, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.2137996405363083, + "learning_rate": 0.00010649350649350649, + "loss": 1.0539, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.1748756766319275, + "learning_rate": 0.00010584415584415586, + "loss": 1.0355, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.22275175154209137, + "learning_rate": 0.0001051948051948052, + "loss": 1.1696, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.20996077358722687, + "learning_rate": 0.00010454545454545455, + "loss": 1.0303, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.1945938766002655, + "learning_rate": 0.00010389610389610389, + "loss": 0.9747, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.1970377266407013, + "learning_rate": 0.00010324675324675325, + "loss": 1.0358, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.18814732134342194, + "learning_rate": 0.00010259740259740261, + "loss": 0.9612, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.2153233289718628, + "learning_rate": 0.00010194805194805195, + "loss": 1.0749, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.21788008511066437, + "learning_rate": 0.0001012987012987013, + "loss": 1.0883, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.214650496840477, + "learning_rate": 0.00010064935064935067, + "loss": 1.0539, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.19312834739685059, + 
"learning_rate": 0.0001, + "loss": 1.0657, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.19916598498821259, + "learning_rate": 9.935064935064936e-05, + "loss": 1.0478, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2057606726884842, + "learning_rate": 9.870129870129871e-05, + "loss": 1.0094, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.22159607708454132, + "learning_rate": 9.805194805194806e-05, + "loss": 1.0952, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.18274275958538055, + "learning_rate": 9.74025974025974e-05, + "loss": 1.0065, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.19835162162780762, + "learning_rate": 9.675324675324677e-05, + "loss": 1.0742, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2114904820919037, + "learning_rate": 9.610389610389611e-05, + "loss": 1.1109, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.21488523483276367, + "learning_rate": 9.545454545454546e-05, + "loss": 1.0465, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.19870303571224213, + "learning_rate": 9.480519480519481e-05, + "loss": 1.0318, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.20413029193878174, + "learning_rate": 9.415584415584417e-05, + "loss": 1.0817, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.1847231239080429, + "learning_rate": 9.35064935064935e-05, + "loss": 1.0144, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.2715964913368225, + "learning_rate": 9.285714285714286e-05, + "loss": 0.9832, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.2225002497434616, + "learning_rate": 9.220779220779221e-05, + "loss": 1.1051, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.22931510210037231, + "learning_rate": 9.155844155844156e-05, + "loss": 1.1042, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.21848627924919128, + "learning_rate": 9.090909090909092e-05, + "loss": 1.1151, + "step": 174 + }, + { + "epoch": 
0.56, + "grad_norm": 0.19852259755134583, + "learning_rate": 9.025974025974027e-05, + "loss": 1.0889, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2080363780260086, + "learning_rate": 8.961038961038961e-05, + "loss": 1.0777, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.22391024231910706, + "learning_rate": 8.896103896103896e-05, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.21793846786022186, + "learning_rate": 8.831168831168831e-05, + "loss": 1.044, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.2009749859571457, + "learning_rate": 8.766233766233767e-05, + "loss": 1.0198, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.19432318210601807, + "learning_rate": 8.701298701298701e-05, + "loss": 1.075, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.18634547293186188, + "learning_rate": 8.636363636363637e-05, + "loss": 0.9964, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.1947103589773178, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0025, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.23098671436309814, + "learning_rate": 8.506493506493507e-05, + "loss": 1.0562, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.19686414301395416, + "learning_rate": 8.441558441558442e-05, + "loss": 1.0285, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.19852428138256073, + "learning_rate": 8.376623376623377e-05, + "loss": 1.0054, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.21483510732650757, + "learning_rate": 8.311688311688312e-05, + "loss": 1.108, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.23313644528388977, + "learning_rate": 8.246753246753248e-05, + "loss": 1.1383, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.21453145146369934, + "learning_rate": 8.181818181818183e-05, + "loss": 1.0911, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.20268195867538452, + "learning_rate": 
8.116883116883117e-05, + "loss": 1.0145, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.20576398074626923, + "learning_rate": 8.051948051948052e-05, + "loss": 1.0829, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.21732626855373383, + "learning_rate": 7.987012987012987e-05, + "loss": 1.0152, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.22046895325183868, + "learning_rate": 7.922077922077923e-05, + "loss": 1.1311, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.19727715849876404, + "learning_rate": 7.857142857142858e-05, + "loss": 1.0364, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.20861488580703735, + "learning_rate": 7.792207792207793e-05, + "loss": 1.0435, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.18545083701610565, + "learning_rate": 7.727272727272727e-05, + "loss": 1.0299, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.19965052604675293, + "learning_rate": 7.662337662337662e-05, + "loss": 1.0511, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.23673909902572632, + "learning_rate": 7.597402597402598e-05, + "loss": 1.081, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.17583179473876953, + "learning_rate": 7.532467532467533e-05, + "loss": 0.9808, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2129366099834442, + "learning_rate": 7.467532467532467e-05, + "loss": 1.0522, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.21679140627384186, + "learning_rate": 7.402597402597404e-05, + "loss": 1.0567, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.2032000720500946, + "learning_rate": 7.337662337662338e-05, + "loss": 1.0466, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.1887970268726349, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0329, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.21060192584991455, + "learning_rate": 7.207792207792208e-05, + "loss": 1.1021, + "step": 203 + }, + { + "epoch": 
0.6528, + "grad_norm": 0.21191425621509552, + "learning_rate": 7.142857142857143e-05, + "loss": 0.99, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.1995989829301834, + "learning_rate": 7.077922077922077e-05, + "loss": 1.0526, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.1849513053894043, + "learning_rate": 7.012987012987014e-05, + "loss": 0.9998, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.1948779672384262, + "learning_rate": 6.948051948051948e-05, + "loss": 1.075, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.20374052226543427, + "learning_rate": 6.883116883116883e-05, + "loss": 1.0933, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.2102465033531189, + "learning_rate": 6.818181818181818e-05, + "loss": 1.1123, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.21376173198223114, + "learning_rate": 6.753246753246754e-05, + "loss": 1.1233, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.20934203267097473, + "learning_rate": 6.688311688311688e-05, + "loss": 1.1374, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.18604128062725067, + "learning_rate": 6.623376623376624e-05, + "loss": 1.0213, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.19644233584403992, + "learning_rate": 6.55844155844156e-05, + "loss": 1.0046, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.18479463458061218, + "learning_rate": 6.493506493506494e-05, + "loss": 0.9792, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.1945149153470993, + "learning_rate": 6.428571428571429e-05, + "loss": 1.0584, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2070147544145584, + "learning_rate": 6.363636363636364e-05, + "loss": 1.071, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.19645985960960388, + "learning_rate": 6.2987012987013e-05, + "loss": 1.0721, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.1960117667913437, + "learning_rate": 
6.233766233766233e-05, + "loss": 1.071, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.20168261229991913, + "learning_rate": 6.16883116883117e-05, + "loss": 1.0808, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.21254412829875946, + "learning_rate": 6.103896103896104e-05, + "loss": 1.0287, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.21271063387393951, + "learning_rate": 6.03896103896104e-05, + "loss": 1.0605, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.2081408053636551, + "learning_rate": 5.9740259740259744e-05, + "loss": 1.091, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.21113798022270203, + "learning_rate": 5.90909090909091e-05, + "loss": 1.1323, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.20670844614505768, + "learning_rate": 5.844155844155844e-05, + "loss": 1.0955, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.2010120451450348, + "learning_rate": 5.7792207792207796e-05, + "loss": 1.1068, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.20379121601581573, + "learning_rate": 5.714285714285714e-05, + "loss": 1.0419, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.22799807786941528, + "learning_rate": 5.64935064935065e-05, + "loss": 1.0904, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.2005995213985443, + "learning_rate": 5.584415584415584e-05, + "loss": 1.078, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.20329605042934418, + "learning_rate": 5.51948051948052e-05, + "loss": 1.0245, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.19283504784107208, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.0367, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.20624355971813202, + "learning_rate": 5.38961038961039e-05, + "loss": 1.1046, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.21362991631031036, + "learning_rate": 5.3246753246753245e-05, + "loss": 1.1104, + "step": 232 + }, + { + "epoch": 
0.7456, + "grad_norm": 0.20447863638401031, + "learning_rate": 5.25974025974026e-05, + "loss": 1.0514, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.1974381059408188, + "learning_rate": 5.1948051948051944e-05, + "loss": 1.0048, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.21237170696258545, + "learning_rate": 5.1298701298701304e-05, + "loss": 1.1299, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.21224971115589142, + "learning_rate": 5.064935064935065e-05, + "loss": 1.05, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.19865018129348755, + "learning_rate": 5e-05, + "loss": 1.0665, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.19199275970458984, + "learning_rate": 4.9350649350649355e-05, + "loss": 0.9531, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.19573214650154114, + "learning_rate": 4.87012987012987e-05, + "loss": 1.0318, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.21338805556297302, + "learning_rate": 4.8051948051948054e-05, + "loss": 1.0343, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.2254691869020462, + "learning_rate": 4.740259740259741e-05, + "loss": 1.0472, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.18101665377616882, + "learning_rate": 4.675324675324675e-05, + "loss": 1.017, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.22090592980384827, + "learning_rate": 4.6103896103896106e-05, + "loss": 1.0389, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.20865507423877716, + "learning_rate": 4.545454545454546e-05, + "loss": 1.0369, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.21619610488414764, + "learning_rate": 4.4805194805194805e-05, + "loss": 1.109, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.21694771945476532, + "learning_rate": 4.415584415584416e-05, + "loss": 1.0525, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.2182662934064865, + "learning_rate": 4.3506493506493503e-05, + 
"loss": 1.0331, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.2026486098766327, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.027, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.19606547057628632, + "learning_rate": 4.220779220779221e-05, + "loss": 1.0242, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.22107470035552979, + "learning_rate": 4.155844155844156e-05, + "loss": 1.0924, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.19960008561611176, + "learning_rate": 4.0909090909090915e-05, + "loss": 1.0384, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.1945488154888153, + "learning_rate": 4.025974025974026e-05, + "loss": 1.0673, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.22067414224147797, + "learning_rate": 3.9610389610389614e-05, + "loss": 1.0426, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.19010980427265167, + "learning_rate": 3.8961038961038966e-05, + "loss": 1.0617, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.18781176209449768, + "learning_rate": 3.831168831168831e-05, + "loss": 1.0243, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.20388829708099365, + "learning_rate": 3.7662337662337665e-05, + "loss": 1.0476, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.19911155104637146, + "learning_rate": 3.701298701298702e-05, + "loss": 1.0324, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.19884039461612701, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.0242, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.19036105275154114, + "learning_rate": 3.571428571428572e-05, + "loss": 1.0323, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.20039844512939453, + "learning_rate": 3.506493506493507e-05, + "loss": 1.0749, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.1899934560060501, + "learning_rate": 3.4415584415584416e-05, + "loss": 1.0115, + "step": 261 + }, + { + "epoch": 0.8384, + 
"grad_norm": 0.20019090175628662, + "learning_rate": 3.376623376623377e-05, + "loss": 1.0782, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.2020583152770996, + "learning_rate": 3.311688311688312e-05, + "loss": 1.0687, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.21407337486743927, + "learning_rate": 3.246753246753247e-05, + "loss": 1.1015, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.1871640682220459, + "learning_rate": 3.181818181818182e-05, + "loss": 0.9637, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.21622811257839203, + "learning_rate": 3.1168831168831166e-05, + "loss": 1.1222, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.22504661977291107, + "learning_rate": 3.051948051948052e-05, + "loss": 1.132, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.19177629053592682, + "learning_rate": 2.9870129870129872e-05, + "loss": 1.0281, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.1970544159412384, + "learning_rate": 2.922077922077922e-05, + "loss": 1.0393, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.21554522216320038, + "learning_rate": 2.857142857142857e-05, + "loss": 1.074, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.21131229400634766, + "learning_rate": 2.792207792207792e-05, + "loss": 1.054, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.19816523790359497, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.0456, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.21075209975242615, + "learning_rate": 2.6623376623376623e-05, + "loss": 1.0758, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2296527624130249, + "learning_rate": 2.5974025974025972e-05, + "loss": 1.0917, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.19722610712051392, + "learning_rate": 2.5324675324675325e-05, + "loss": 1.0704, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.18721099197864532, + "learning_rate": 
2.4675324675324678e-05, + "loss": 0.9919, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.20244193077087402, + "learning_rate": 2.4025974025974027e-05, + "loss": 1.0368, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.19518914818763733, + "learning_rate": 2.3376623376623376e-05, + "loss": 1.0436, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.19650357961654663, + "learning_rate": 2.272727272727273e-05, + "loss": 1.0306, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.20320096611976624, + "learning_rate": 2.207792207792208e-05, + "loss": 1.0941, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.18296951055526733, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.9802, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.21357610821723938, + "learning_rate": 2.077922077922078e-05, + "loss": 1.0449, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.193921759724617, + "learning_rate": 2.012987012987013e-05, + "loss": 1.0116, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.1953902244567871, + "learning_rate": 1.9480519480519483e-05, + "loss": 1.0105, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.19440975785255432, + "learning_rate": 1.8831168831168833e-05, + "loss": 0.9952, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.21054105460643768, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0701, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.18844804167747498, + "learning_rate": 1.7532467532467535e-05, + "loss": 1.0146, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.2067311704158783, + "learning_rate": 1.6883116883116884e-05, + "loss": 1.0781, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.1941213756799698, + "learning_rate": 1.6233766233766234e-05, + "loss": 0.9814, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.22726193070411682, + "learning_rate": 1.5584415584415583e-05, + "loss": 1.1431, + "step": 290 + }, + { 
+ "epoch": 0.9312, + "grad_norm": 0.18025581538677216, + "learning_rate": 1.4935064935064936e-05, + "loss": 0.9649, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.21535000205039978, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.0441, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.20014546811580658, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.0166, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.22738787531852722, + "learning_rate": 1.2987012987012986e-05, + "loss": 1.0564, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.2020861804485321, + "learning_rate": 1.2337662337662339e-05, + "loss": 1.1241, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.19888809323310852, + "learning_rate": 1.1688311688311688e-05, + "loss": 1.1114, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.20912377536296844, + "learning_rate": 1.103896103896104e-05, + "loss": 1.0971, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.21206621825695038, + "learning_rate": 1.038961038961039e-05, + "loss": 1.0601, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.18667680025100708, + "learning_rate": 9.740259740259742e-06, + "loss": 1.0291, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.21125559508800507, + "learning_rate": 9.090909090909091e-06, + "loss": 1.0483, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.21776145696640015, + "learning_rate": 8.441558441558442e-06, + "loss": 0.9912, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.20144303143024445, + "learning_rate": 7.792207792207792e-06, + "loss": 1.0357, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.1984029859304428, + "learning_rate": 7.142857142857143e-06, + "loss": 1.0648, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.17972829937934875, + "learning_rate": 6.493506493506493e-06, + "loss": 1.0033, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.1818286031484604, + 
"learning_rate": 5.844155844155844e-06, + "loss": 0.997, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.19670912623405457, + "learning_rate": 5.194805194805195e-06, + "loss": 1.0256, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.20527283847332, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0348, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.19025909900665283, + "learning_rate": 3.896103896103896e-06, + "loss": 1.0682, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.19544818997383118, + "learning_rate": 3.2467532467532465e-06, + "loss": 0.9872, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.22112183272838593, + "learning_rate": 2.5974025974025976e-06, + "loss": 1.0661, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.23328153789043427, + "learning_rate": 1.948051948051948e-06, + "loss": 1.0691, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.20181375741958618, + "learning_rate": 1.2987012987012988e-06, + "loss": 0.9416, + "step": 312 + }, + { + "epoch": 1.0, + "grad_norm": 0.29312625527381897, + "learning_rate": 6.493506493506494e-07, + "loss": 1.1216, + "step": 313 + }, + { + "epoch": 0.12202467696492762, + "grad_norm": 0.2231415957212448, + "learning_rate": 0.0, + "loss": 1.0468, + "step": 314 + }, + { + "epoch": 0.12241329058583503, + "grad_norm": 0.22263288497924805, + "learning_rate": 0.00017594394706111328, + "loss": 1.0399, + "step": 315 + }, + { + "epoch": 0.12280190420674245, + "grad_norm": 0.22909891605377197, + "learning_rate": 0.00017586609575710393, + "loss": 1.1069, + "step": 316 + }, + { + "epoch": 0.12319051782764986, + "grad_norm": 0.23951445519924164, + "learning_rate": 0.0001757882444530946, + "loss": 1.1036, + "step": 317 + }, + { + "epoch": 0.12357913144855727, + "grad_norm": 0.2409268021583557, + "learning_rate": 0.00017571039314908526, + "loss": 1.1114, + "step": 318 + }, + { + "epoch": 0.12396774506946469, + "grad_norm": 0.23753899335861206, + 
"learning_rate": 0.00017563254184507592, + "loss": 1.1297, + "step": 319 + }, + { + "epoch": 0.12435635869037209, + "grad_norm": 0.2823902666568756, + "learning_rate": 0.00017555469054106657, + "loss": 1.1293, + "step": 320 + }, + { + "epoch": 0.12474497231127951, + "grad_norm": 0.24093545973300934, + "learning_rate": 0.00017547683923705722, + "loss": 1.0678, + "step": 321 + }, + { + "epoch": 0.12513358593218693, + "grad_norm": 0.22565563023090363, + "learning_rate": 0.0001753989879330479, + "loss": 1.1408, + "step": 322 + }, + { + "epoch": 0.12552219955309435, + "grad_norm": 0.22569572925567627, + "learning_rate": 0.00017532113662903855, + "loss": 1.0543, + "step": 323 + }, + { + "epoch": 0.12591081317400174, + "grad_norm": 0.24962866306304932, + "learning_rate": 0.0001752432853250292, + "loss": 1.0818, + "step": 324 + }, + { + "epoch": 0.12629942679490916, + "grad_norm": 0.22184576094150543, + "learning_rate": 0.00017516543402101986, + "loss": 1.0835, + "step": 325 + }, + { + "epoch": 0.12668804041581658, + "grad_norm": 0.2572194039821625, + "learning_rate": 0.0001750875827170105, + "loss": 1.0767, + "step": 326 + }, + { + "epoch": 0.127076654036724, + "grad_norm": 0.24131342768669128, + "learning_rate": 0.00017500973141300116, + "loss": 1.0981, + "step": 327 + }, + { + "epoch": 0.1274652676576314, + "grad_norm": 0.2386389970779419, + "learning_rate": 0.00017493188010899184, + "loss": 1.0828, + "step": 328 + }, + { + "epoch": 0.1278538812785388, + "grad_norm": 0.2654125690460205, + "learning_rate": 0.0001748540288049825, + "loss": 1.1266, + "step": 329 + }, + { + "epoch": 0.12824249489944622, + "grad_norm": 0.2925739884376526, + "learning_rate": 0.00017477617750097314, + "loss": 1.0983, + "step": 330 + }, + { + "epoch": 0.12863110852035364, + "grad_norm": 0.26589342951774597, + "learning_rate": 0.0001746983261969638, + "loss": 1.1029, + "step": 331 + }, + { + "epoch": 0.12901972214126106, + "grad_norm": 0.24565957486629486, + "learning_rate": 
0.00017462047489295445, + "loss": 1.0975, + "step": 332 + }, + { + "epoch": 0.12940833576216845, + "grad_norm": 0.2459682673215866, + "learning_rate": 0.00017454262358894513, + "loss": 1.0566, + "step": 333 + }, + { + "epoch": 0.12979694938307587, + "grad_norm": 0.23349183797836304, + "learning_rate": 0.00017446477228493578, + "loss": 1.0833, + "step": 334 + }, + { + "epoch": 0.1301855630039833, + "grad_norm": 0.26166337728500366, + "learning_rate": 0.00017438692098092643, + "loss": 1.1598, + "step": 335 + }, + { + "epoch": 0.1305741766248907, + "grad_norm": 0.24188168346881866, + "learning_rate": 0.00017430906967691708, + "loss": 1.0728, + "step": 336 + }, + { + "epoch": 0.13096279024579813, + "grad_norm": 0.22922398149967194, + "learning_rate": 0.00017423121837290773, + "loss": 1.0311, + "step": 337 + }, + { + "epoch": 0.13135140386670552, + "grad_norm": 0.2652754485607147, + "learning_rate": 0.00017415336706889841, + "loss": 1.1096, + "step": 338 + }, + { + "epoch": 0.13174001748761294, + "grad_norm": 0.2355881780385971, + "learning_rate": 0.00017407551576488907, + "loss": 1.0964, + "step": 339 + }, + { + "epoch": 0.13212863110852036, + "grad_norm": 0.244523823261261, + "learning_rate": 0.00017399766446087972, + "loss": 1.142, + "step": 340 + }, + { + "epoch": 0.13251724472942777, + "grad_norm": 0.24705976247787476, + "learning_rate": 0.00017391981315687037, + "loss": 1.0943, + "step": 341 + }, + { + "epoch": 0.13290585835033517, + "grad_norm": 0.22817552089691162, + "learning_rate": 0.00017384196185286102, + "loss": 1.0621, + "step": 342 + }, + { + "epoch": 0.13329447197124258, + "grad_norm": 0.22605225443840027, + "learning_rate": 0.0001737641105488517, + "loss": 1.0714, + "step": 343 + }, + { + "epoch": 0.13368308559215, + "grad_norm": 0.2584545314311981, + "learning_rate": 0.00017368625924484235, + "loss": 1.1367, + "step": 344 + }, + { + "epoch": 0.13407169921305742, + "grad_norm": 0.2248220443725586, + "learning_rate": 0.000173608407940833, + "loss": 
1.0872, + "step": 345 + }, + { + "epoch": 0.13446031283396484, + "grad_norm": 0.2141868770122528, + "learning_rate": 0.00017353055663682368, + "loss": 1.0572, + "step": 346 + }, + { + "epoch": 0.13484892645487223, + "grad_norm": 0.2615523934364319, + "learning_rate": 0.00017345270533281434, + "loss": 1.1048, + "step": 347 + }, + { + "epoch": 0.13523754007577965, + "grad_norm": 0.22990448772907257, + "learning_rate": 0.000173374854028805, + "loss": 1.0528, + "step": 348 + }, + { + "epoch": 0.13562615369668707, + "grad_norm": 0.2132262885570526, + "learning_rate": 0.00017329700272479564, + "loss": 1.0476, + "step": 349 + }, + { + "epoch": 0.1360147673175945, + "grad_norm": 0.2578272819519043, + "learning_rate": 0.00017321915142078632, + "loss": 1.0852, + "step": 350 + }, + { + "epoch": 0.1364033809385019, + "grad_norm": 0.22881457209587097, + "learning_rate": 0.00017314130011677697, + "loss": 1.1017, + "step": 351 + }, + { + "epoch": 0.1367919945594093, + "grad_norm": 0.21067696809768677, + "learning_rate": 0.00017306344881276762, + "loss": 1.0444, + "step": 352 + }, + { + "epoch": 0.13718060818031672, + "grad_norm": 0.2304215282201767, + "learning_rate": 0.0001729855975087583, + "loss": 1.0737, + "step": 353 + }, + { + "epoch": 0.13756922180122413, + "grad_norm": 0.2031925916671753, + "learning_rate": 0.00017290774620474895, + "loss": 1.0036, + "step": 354 + }, + { + "epoch": 0.13795783542213155, + "grad_norm": 0.27281051874160767, + "learning_rate": 0.0001728298949007396, + "loss": 1.148, + "step": 355 + }, + { + "epoch": 0.13834644904303897, + "grad_norm": 0.204191654920578, + "learning_rate": 0.00017275204359673026, + "loss": 0.9607, + "step": 356 + }, + { + "epoch": 0.13873506266394636, + "grad_norm": 0.221976637840271, + "learning_rate": 0.0001726741922927209, + "loss": 1.1068, + "step": 357 + }, + { + "epoch": 0.13912367628485378, + "grad_norm": 0.20831729471683502, + "learning_rate": 0.0001725963409887116, + "loss": 1.034, + "step": 358 + }, + { + "epoch": 
0.1395122899057612, + "grad_norm": 0.21639779210090637, + "learning_rate": 0.00017251848968470224, + "loss": 1.0613, + "step": 359 + }, + { + "epoch": 0.13990090352666862, + "grad_norm": 0.1959424465894699, + "learning_rate": 0.0001724406383806929, + "loss": 1.0506, + "step": 360 + }, + { + "epoch": 0.140289517147576, + "grad_norm": 0.2044398933649063, + "learning_rate": 0.00017236278707668355, + "loss": 1.0316, + "step": 361 + }, + { + "epoch": 0.14067813076848343, + "grad_norm": 0.21483004093170166, + "learning_rate": 0.0001722849357726742, + "loss": 1.0361, + "step": 362 + }, + { + "epoch": 0.14106674438939085, + "grad_norm": 0.237701416015625, + "learning_rate": 0.00017220708446866485, + "loss": 1.1264, + "step": 363 + }, + { + "epoch": 0.14145535801029827, + "grad_norm": 0.20750795304775238, + "learning_rate": 0.00017212923316465553, + "loss": 1.0523, + "step": 364 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 0.2252965271472931, + "learning_rate": 0.00017205138186064618, + "loss": 1.0764, + "step": 365 + }, + { + "epoch": 0.14223258525211308, + "grad_norm": 0.2033565789461136, + "learning_rate": 0.00017197353055663683, + "loss": 1.064, + "step": 366 + }, + { + "epoch": 0.1426211988730205, + "grad_norm": 0.21123190224170685, + "learning_rate": 0.00017189567925262749, + "loss": 1.0515, + "step": 367 + }, + { + "epoch": 0.1430098124939279, + "grad_norm": 0.20646221935749054, + "learning_rate": 0.00017181782794861814, + "loss": 1.0617, + "step": 368 + }, + { + "epoch": 0.14339842611483533, + "grad_norm": 0.2079589068889618, + "learning_rate": 0.00017173997664460882, + "loss": 1.0569, + "step": 369 + }, + { + "epoch": 0.14378703973574275, + "grad_norm": 0.216246098279953, + "learning_rate": 0.00017166212534059947, + "loss": 1.0986, + "step": 370 + }, + { + "epoch": 0.14417565335665014, + "grad_norm": 0.20711806416511536, + "learning_rate": 0.00017158427403659012, + "loss": 1.1342, + "step": 371 + }, + { + "epoch": 0.14456426697755756, + "grad_norm": 
0.235435351729393, + "learning_rate": 0.00017150642273258077, + "loss": 1.1082, + "step": 372 + }, + { + "epoch": 0.14495288059846498, + "grad_norm": 0.2273191511631012, + "learning_rate": 0.00017142857142857143, + "loss": 1.1064, + "step": 373 + }, + { + "epoch": 0.1453414942193724, + "grad_norm": 0.2075672745704651, + "learning_rate": 0.0001713507201245621, + "loss": 1.0536, + "step": 374 + }, + { + "epoch": 0.14573010784027982, + "grad_norm": 0.20764274895191193, + "learning_rate": 0.00017127286882055276, + "loss": 1.0673, + "step": 375 + }, + { + "epoch": 0.1461187214611872, + "grad_norm": 0.2441243678331375, + "learning_rate": 0.0001711950175165434, + "loss": 1.1271, + "step": 376 + }, + { + "epoch": 0.14650733508209463, + "grad_norm": 0.2383374124765396, + "learning_rate": 0.00017111716621253406, + "loss": 1.083, + "step": 377 + }, + { + "epoch": 0.14689594870300204, + "grad_norm": 0.2172410786151886, + "learning_rate": 0.0001710393149085247, + "loss": 1.0605, + "step": 378 + }, + { + "epoch": 0.14728456232390946, + "grad_norm": 0.22591541707515717, + "learning_rate": 0.0001709614636045154, + "loss": 1.0931, + "step": 379 + }, + { + "epoch": 0.14767317594481685, + "grad_norm": 0.23099495470523834, + "learning_rate": 0.00017088361230050604, + "loss": 1.1021, + "step": 380 + }, + { + "epoch": 0.14806178956572427, + "grad_norm": 0.21461094915866852, + "learning_rate": 0.0001708057609964967, + "loss": 1.0959, + "step": 381 + }, + { + "epoch": 0.1484504031866317, + "grad_norm": 0.21557241678237915, + "learning_rate": 0.00017072790969248735, + "loss": 1.0155, + "step": 382 + }, + { + "epoch": 0.1488390168075391, + "grad_norm": 0.234396293759346, + "learning_rate": 0.000170650058388478, + "loss": 1.1289, + "step": 383 + }, + { + "epoch": 0.14922763042844653, + "grad_norm": 0.22895503044128418, + "learning_rate": 0.00017057220708446868, + "loss": 0.9919, + "step": 384 + }, + { + "epoch": 0.14961624404935392, + "grad_norm": 0.2054683268070221, + "learning_rate": 
0.00017049435578045933, + "loss": 1.0607, + "step": 385 + }, + { + "epoch": 0.15000485767026134, + "grad_norm": 0.25569215416908264, + "learning_rate": 0.00017041650447644998, + "loss": 1.0517, + "step": 386 + }, + { + "epoch": 0.15039347129116876, + "grad_norm": 0.2222641259431839, + "learning_rate": 0.00017033865317244064, + "loss": 1.0404, + "step": 387 + }, + { + "epoch": 0.15078208491207618, + "grad_norm": 0.20501169562339783, + "learning_rate": 0.0001702608018684313, + "loss": 0.9897, + "step": 388 + }, + { + "epoch": 0.1511706985329836, + "grad_norm": 0.22080403566360474, + "learning_rate": 0.00017018295056442197, + "loss": 1.1013, + "step": 389 + }, + { + "epoch": 0.15155931215389098, + "grad_norm": 0.21218529343605042, + "learning_rate": 0.00017010509926041262, + "loss": 1.0541, + "step": 390 + }, + { + "epoch": 0.1519479257747984, + "grad_norm": 0.23064807057380676, + "learning_rate": 0.00017002724795640327, + "loss": 1.037, + "step": 391 + }, + { + "epoch": 0.15233653939570582, + "grad_norm": 0.21164493262767792, + "learning_rate": 0.00016994939665239392, + "loss": 1.0769, + "step": 392 + }, + { + "epoch": 0.15272515301661324, + "grad_norm": 0.22565549612045288, + "learning_rate": 0.00016987154534838457, + "loss": 1.0638, + "step": 393 + }, + { + "epoch": 0.15311376663752063, + "grad_norm": 0.22492647171020508, + "learning_rate": 0.00016979369404437525, + "loss": 1.063, + "step": 394 + }, + { + "epoch": 0.15350238025842805, + "grad_norm": 0.22335395216941833, + "learning_rate": 0.0001697158427403659, + "loss": 1.1032, + "step": 395 + }, + { + "epoch": 0.15389099387933547, + "grad_norm": 0.2164154201745987, + "learning_rate": 0.00016963799143635656, + "loss": 1.1275, + "step": 396 + }, + { + "epoch": 0.1542796075002429, + "grad_norm": 0.22547736763954163, + "learning_rate": 0.0001695601401323472, + "loss": 1.1324, + "step": 397 + }, + { + "epoch": 0.1546682211211503, + "grad_norm": 0.2028045952320099, + "learning_rate": 0.0001694822888283379, + "loss": 
1.0057, + "step": 398 + }, + { + "epoch": 0.1550568347420577, + "grad_norm": 0.20770573616027832, + "learning_rate": 0.00016940443752432854, + "loss": 1.0311, + "step": 399 + }, + { + "epoch": 0.15544544836296512, + "grad_norm": 0.2231476902961731, + "learning_rate": 0.0001693265862203192, + "loss": 1.0535, + "step": 400 + }, + { + "epoch": 0.15583406198387253, + "grad_norm": 0.21618099510669708, + "learning_rate": 0.00016924873491630987, + "loss": 1.0616, + "step": 401 + }, + { + "epoch": 0.15622267560477995, + "grad_norm": 0.24024419486522675, + "learning_rate": 0.00016917088361230052, + "loss": 1.1324, + "step": 402 + }, + { + "epoch": 0.15661128922568737, + "grad_norm": 0.2002171128988266, + "learning_rate": 0.00016909303230829118, + "loss": 1.015, + "step": 403 + }, + { + "epoch": 0.15699990284659476, + "grad_norm": 0.21771477162837982, + "learning_rate": 0.00016901518100428183, + "loss": 1.0817, + "step": 404 + }, + { + "epoch": 0.15738851646750218, + "grad_norm": 0.22052259743213654, + "learning_rate": 0.0001689373297002725, + "loss": 1.0836, + "step": 405 + }, + { + "epoch": 0.1577771300884096, + "grad_norm": 0.1964062750339508, + "learning_rate": 0.00016885947839626316, + "loss": 1.0505, + "step": 406 + }, + { + "epoch": 0.15816574370931702, + "grad_norm": 0.22714298963546753, + "learning_rate": 0.0001687816270922538, + "loss": 1.0702, + "step": 407 + }, + { + "epoch": 0.15855435733022444, + "grad_norm": 0.20647728443145752, + "learning_rate": 0.00016870377578824446, + "loss": 1.0349, + "step": 408 + }, + { + "epoch": 0.15894297095113183, + "grad_norm": 0.2355160117149353, + "learning_rate": 0.00016862592448423512, + "loss": 1.0305, + "step": 409 + }, + { + "epoch": 0.15933158457203925, + "grad_norm": 0.22890770435333252, + "learning_rate": 0.0001685480731802258, + "loss": 1.0854, + "step": 410 + }, + { + "epoch": 0.15972019819294667, + "grad_norm": 0.21947838366031647, + "learning_rate": 0.00016847022187621645, + "loss": 1.0948, + "step": 411 + }, + { + 
"epoch": 0.16010881181385409, + "grad_norm": 0.22334899008274078, + "learning_rate": 0.0001683923705722071, + "loss": 1.006, + "step": 412 + }, + { + "epoch": 0.16049742543476148, + "grad_norm": 0.22324936091899872, + "learning_rate": 0.00016831451926819775, + "loss": 1.0402, + "step": 413 + }, + { + "epoch": 0.1608860390556689, + "grad_norm": 0.21462097764015198, + "learning_rate": 0.0001682366679641884, + "loss": 1.077, + "step": 414 + }, + { + "epoch": 0.1612746526765763, + "grad_norm": 0.24567006528377533, + "learning_rate": 0.00016815881666017908, + "loss": 1.15, + "step": 415 + }, + { + "epoch": 0.16166326629748373, + "grad_norm": 0.26437243819236755, + "learning_rate": 0.00016808096535616973, + "loss": 1.1251, + "step": 416 + }, + { + "epoch": 0.16205187991839115, + "grad_norm": 0.2217959761619568, + "learning_rate": 0.00016800311405216039, + "loss": 1.1103, + "step": 417 + }, + { + "epoch": 0.16244049353929854, + "grad_norm": 0.24402475357055664, + "learning_rate": 0.00016792526274815104, + "loss": 1.0672, + "step": 418 + }, + { + "epoch": 0.16282910716020596, + "grad_norm": 0.21609526872634888, + "learning_rate": 0.0001678474114441417, + "loss": 1.0291, + "step": 419 + }, + { + "epoch": 0.16321772078111338, + "grad_norm": 0.20054642856121063, + "learning_rate": 0.00016776956014013237, + "loss": 1.0704, + "step": 420 + }, + { + "epoch": 0.1636063344020208, + "grad_norm": 0.22864869236946106, + "learning_rate": 0.00016769170883612302, + "loss": 1.0612, + "step": 421 + }, + { + "epoch": 0.16399494802292822, + "grad_norm": 0.22651974856853485, + "learning_rate": 0.00016761385753211367, + "loss": 1.0749, + "step": 422 + }, + { + "epoch": 0.1643835616438356, + "grad_norm": 0.21587328612804413, + "learning_rate": 0.00016753600622810433, + "loss": 1.0398, + "step": 423 + }, + { + "epoch": 0.16477217526474303, + "grad_norm": 0.1953774094581604, + "learning_rate": 0.00016745815492409498, + "loss": 1.0275, + "step": 424 + }, + { + "epoch": 0.16516078888565044, + 
"grad_norm": 0.21803410351276398, + "learning_rate": 0.00016738030362008566, + "loss": 1.1219, + "step": 425 + }, + { + "epoch": 0.16554940250655786, + "grad_norm": 0.2034682035446167, + "learning_rate": 0.0001673024523160763, + "loss": 1.0342, + "step": 426 + }, + { + "epoch": 0.16593801612746525, + "grad_norm": 0.20135951042175293, + "learning_rate": 0.00016722460101206696, + "loss": 0.9802, + "step": 427 + }, + { + "epoch": 0.16632662974837267, + "grad_norm": 0.23310376703739166, + "learning_rate": 0.0001671467497080576, + "loss": 1.0789, + "step": 428 + }, + { + "epoch": 0.1667152433692801, + "grad_norm": 0.21475404500961304, + "learning_rate": 0.00016706889840404827, + "loss": 1.0416, + "step": 429 + }, + { + "epoch": 0.1671038569901875, + "grad_norm": 0.21661072969436646, + "learning_rate": 0.00016699104710003894, + "loss": 1.0568, + "step": 430 + }, + { + "epoch": 0.16749247061109493, + "grad_norm": 0.20310629904270172, + "learning_rate": 0.0001669131957960296, + "loss": 0.9968, + "step": 431 + }, + { + "epoch": 0.16788108423200232, + "grad_norm": 0.2596947252750397, + "learning_rate": 0.00016683534449202025, + "loss": 1.0478, + "step": 432 + }, + { + "epoch": 0.16826969785290974, + "grad_norm": 0.22226987779140472, + "learning_rate": 0.0001667574931880109, + "loss": 1.0898, + "step": 433 + }, + { + "epoch": 0.16865831147381716, + "grad_norm": 0.22499911487102509, + "learning_rate": 0.00016667964188400155, + "loss": 1.07, + "step": 434 + }, + { + "epoch": 0.16904692509472458, + "grad_norm": 0.2717292308807373, + "learning_rate": 0.0001666017905799922, + "loss": 1.0562, + "step": 435 + }, + { + "epoch": 0.169435538715632, + "grad_norm": 0.22052323818206787, + "learning_rate": 0.00016652393927598288, + "loss": 1.0732, + "step": 436 + }, + { + "epoch": 0.16982415233653939, + "grad_norm": 0.21741728484630585, + "learning_rate": 0.00016644608797197354, + "loss": 1.0409, + "step": 437 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.20701193809509277, + 
"learning_rate": 0.0001663682366679642, + "loss": 1.0731, + "step": 438 + }, + { + "epoch": 0.17060137957835422, + "grad_norm": 0.22071130573749542, + "learning_rate": 0.00016629038536395484, + "loss": 1.0992, + "step": 439 + }, + { + "epoch": 0.17098999319926164, + "grad_norm": 0.20261412858963013, + "learning_rate": 0.0001662125340599455, + "loss": 1.0051, + "step": 440 + }, + { + "epoch": 0.17137860682016906, + "grad_norm": 0.2082947939634323, + "learning_rate": 0.00016613468275593617, + "loss": 1.0477, + "step": 441 + }, + { + "epoch": 0.17176722044107645, + "grad_norm": 0.22534717619419098, + "learning_rate": 0.00016605683145192682, + "loss": 1.041, + "step": 442 + }, + { + "epoch": 0.17215583406198387, + "grad_norm": 0.21547731757164001, + "learning_rate": 0.00016597898014791748, + "loss": 1.0528, + "step": 443 + }, + { + "epoch": 0.1725444476828913, + "grad_norm": 0.24141089618206024, + "learning_rate": 0.00016590112884390813, + "loss": 1.0928, + "step": 444 + }, + { + "epoch": 0.1729330613037987, + "grad_norm": 0.21910884976387024, + "learning_rate": 0.00016582327753989878, + "loss": 1.063, + "step": 445 + }, + { + "epoch": 0.1733216749247061, + "grad_norm": 0.21782316267490387, + "learning_rate": 0.00016574542623588946, + "loss": 1.0976, + "step": 446 + }, + { + "epoch": 0.17371028854561352, + "grad_norm": 0.21771778166294098, + "learning_rate": 0.0001656675749318801, + "loss": 1.0677, + "step": 447 + }, + { + "epoch": 0.17409890216652094, + "grad_norm": 0.22117659449577332, + "learning_rate": 0.00016558972362787076, + "loss": 1.0669, + "step": 448 + }, + { + "epoch": 0.17448751578742835, + "grad_norm": 0.21918092668056488, + "learning_rate": 0.00016551187232386141, + "loss": 1.0955, + "step": 449 + }, + { + "epoch": 0.17487612940833577, + "grad_norm": 0.22027818858623505, + "learning_rate": 0.0001654340210198521, + "loss": 1.0201, + "step": 450 + }, + { + "epoch": 0.17526474302924316, + "grad_norm": 0.2042885720729828, + "learning_rate": 
0.00016535616971584275, + "loss": 1.0881, + "step": 451 + }, + { + "epoch": 0.17565335665015058, + "grad_norm": 0.21788261830806732, + "learning_rate": 0.0001652783184118334, + "loss": 1.0918, + "step": 452 + }, + { + "epoch": 0.176041970271058, + "grad_norm": 0.23332571983337402, + "learning_rate": 0.00016520046710782408, + "loss": 1.091, + "step": 453 + }, + { + "epoch": 0.17643058389196542, + "grad_norm": 0.20204192399978638, + "learning_rate": 0.00016512261580381473, + "loss": 1.0366, + "step": 454 + }, + { + "epoch": 0.17681919751287284, + "grad_norm": 0.21761906147003174, + "learning_rate": 0.00016504476449980538, + "loss": 1.0131, + "step": 455 + }, + { + "epoch": 0.17720781113378023, + "grad_norm": 0.2152051478624344, + "learning_rate": 0.00016496691319579606, + "loss": 1.0868, + "step": 456 + }, + { + "epoch": 0.17759642475468765, + "grad_norm": 0.22776494920253754, + "learning_rate": 0.0001648890618917867, + "loss": 1.0807, + "step": 457 + }, + { + "epoch": 0.17798503837559507, + "grad_norm": 0.2171342968940735, + "learning_rate": 0.00016481121058777736, + "loss": 1.0537, + "step": 458 + }, + { + "epoch": 0.17837365199650249, + "grad_norm": 0.2046273946762085, + "learning_rate": 0.00016473335928376802, + "loss": 1.0097, + "step": 459 + }, + { + "epoch": 0.17876226561740988, + "grad_norm": 0.2047681361436844, + "learning_rate": 0.00016465550797975867, + "loss": 1.0204, + "step": 460 + }, + { + "epoch": 0.1791508792383173, + "grad_norm": 0.1876862645149231, + "learning_rate": 0.00016457765667574935, + "loss": 0.9383, + "step": 461 + }, + { + "epoch": 0.17953949285922471, + "grad_norm": 0.218430757522583, + "learning_rate": 0.00016449980537174, + "loss": 1.0721, + "step": 462 + }, + { + "epoch": 0.17992810648013213, + "grad_norm": 0.2245480865240097, + "learning_rate": 0.00016442195406773065, + "loss": 1.0859, + "step": 463 + }, + { + "epoch": 0.18031672010103955, + "grad_norm": 0.22577151656150818, + "learning_rate": 0.0001643441027637213, + "loss": 1.0825, 
+ "step": 464 + }, + { + "epoch": 0.18070533372194694, + "grad_norm": 0.20132745802402496, + "learning_rate": 0.00016426625145971196, + "loss": 1.0615, + "step": 465 + }, + { + "epoch": 0.18109394734285436, + "grad_norm": 0.2277505248785019, + "learning_rate": 0.00016418840015570263, + "loss": 1.0426, + "step": 466 + }, + { + "epoch": 0.18148256096376178, + "grad_norm": 0.22540105879306793, + "learning_rate": 0.0001641105488516933, + "loss": 1.0481, + "step": 467 + }, + { + "epoch": 0.1818711745846692, + "grad_norm": 0.20358088612556458, + "learning_rate": 0.00016403269754768394, + "loss": 1.0286, + "step": 468 + }, + { + "epoch": 0.18225978820557662, + "grad_norm": 0.22534145414829254, + "learning_rate": 0.0001639548462436746, + "loss": 1.1183, + "step": 469 + }, + { + "epoch": 0.182648401826484, + "grad_norm": 0.2188873142004013, + "learning_rate": 0.00016387699493966524, + "loss": 1.0439, + "step": 470 + }, + { + "epoch": 0.18303701544739143, + "grad_norm": 0.2128048539161682, + "learning_rate": 0.00016379914363565592, + "loss": 1.027, + "step": 471 + }, + { + "epoch": 0.18342562906829885, + "grad_norm": 0.2518141567707062, + "learning_rate": 0.00016372129233164657, + "loss": 1.0468, + "step": 472 + }, + { + "epoch": 0.18381424268920626, + "grad_norm": 0.2189142256975174, + "learning_rate": 0.00016364344102763723, + "loss": 1.0581, + "step": 473 + }, + { + "epoch": 0.18420285631011368, + "grad_norm": 0.31266725063323975, + "learning_rate": 0.00016356558972362788, + "loss": 1.0554, + "step": 474 + }, + { + "epoch": 0.18459146993102107, + "grad_norm": 0.21343916654586792, + "learning_rate": 0.00016348773841961853, + "loss": 1.0795, + "step": 475 + }, + { + "epoch": 0.1849800835519285, + "grad_norm": 0.22907280921936035, + "learning_rate": 0.00016340988711560918, + "loss": 1.0304, + "step": 476 + }, + { + "epoch": 0.1853686971728359, + "grad_norm": 0.2105257511138916, + "learning_rate": 0.00016333203581159986, + "loss": 1.0231, + "step": 477 + }, + { + "epoch": 
0.18575731079374333, + "grad_norm": 0.19537831842899323, + "learning_rate": 0.00016325418450759051, + "loss": 1.0103, + "step": 478 + }, + { + "epoch": 0.18614592441465072, + "grad_norm": 0.20522372424602509, + "learning_rate": 0.00016317633320358117, + "loss": 1.0196, + "step": 479 + }, + { + "epoch": 0.18653453803555814, + "grad_norm": 0.21646477282047272, + "learning_rate": 0.00016309848189957182, + "loss": 1.0579, + "step": 480 + }, + { + "epoch": 0.18692315165646556, + "grad_norm": 0.21077193319797516, + "learning_rate": 0.00016302063059556247, + "loss": 1.0638, + "step": 481 + }, + { + "epoch": 0.18731176527737298, + "grad_norm": 0.20357473194599152, + "learning_rate": 0.00016294277929155315, + "loss": 1.0635, + "step": 482 + }, + { + "epoch": 0.1877003788982804, + "grad_norm": 0.2188001275062561, + "learning_rate": 0.0001628649279875438, + "loss": 1.0267, + "step": 483 + }, + { + "epoch": 0.1880889925191878, + "grad_norm": 0.2128928154706955, + "learning_rate": 0.00016278707668353445, + "loss": 0.9706, + "step": 484 + }, + { + "epoch": 0.1884776061400952, + "grad_norm": 0.22081372141838074, + "learning_rate": 0.0001627092253795251, + "loss": 1.08, + "step": 485 + }, + { + "epoch": 0.18886621976100262, + "grad_norm": 0.2250615805387497, + "learning_rate": 0.00016263137407551576, + "loss": 1.1451, + "step": 486 + }, + { + "epoch": 0.18925483338191004, + "grad_norm": 0.1984967589378357, + "learning_rate": 0.00016255352277150644, + "loss": 1.0744, + "step": 487 + }, + { + "epoch": 0.18964344700281746, + "grad_norm": 0.20778900384902954, + "learning_rate": 0.0001624756714674971, + "loss": 1.0623, + "step": 488 + }, + { + "epoch": 0.19003206062372485, + "grad_norm": 0.2026563137769699, + "learning_rate": 0.00016239782016348774, + "loss": 1.0714, + "step": 489 + }, + { + "epoch": 0.19042067424463227, + "grad_norm": 0.21598374843597412, + "learning_rate": 0.0001623199688594784, + "loss": 1.0869, + "step": 490 + }, + { + "epoch": 0.1908092878655397, + "grad_norm": 
0.18944978713989258, + "learning_rate": 0.00016224211755546904, + "loss": 1.055, + "step": 491 + }, + { + "epoch": 0.1911979014864471, + "grad_norm": 0.20698946714401245, + "learning_rate": 0.00016216426625145972, + "loss": 1.0392, + "step": 492 + }, + { + "epoch": 0.1915865151073545, + "grad_norm": 0.22395353019237518, + "learning_rate": 0.00016208641494745038, + "loss": 1.0681, + "step": 493 + }, + { + "epoch": 0.19197512872826192, + "grad_norm": 0.22372962534427643, + "learning_rate": 0.00016200856364344103, + "loss": 1.0767, + "step": 494 + }, + { + "epoch": 0.19236374234916934, + "grad_norm": 0.2066701054573059, + "learning_rate": 0.00016193071233943168, + "loss": 1.0061, + "step": 495 + }, + { + "epoch": 0.19275235597007676, + "grad_norm": 0.19716408848762512, + "learning_rate": 0.00016185286103542233, + "loss": 1.039, + "step": 496 + }, + { + "epoch": 0.19314096959098417, + "grad_norm": 0.22159601747989655, + "learning_rate": 0.000161775009731413, + "loss": 1.0832, + "step": 497 + }, + { + "epoch": 0.19352958321189156, + "grad_norm": 0.21509626507759094, + "learning_rate": 0.00016169715842740366, + "loss": 1.0264, + "step": 498 + }, + { + "epoch": 0.19391819683279898, + "grad_norm": 0.21598199009895325, + "learning_rate": 0.00016161930712339431, + "loss": 1.049, + "step": 499 + }, + { + "epoch": 0.1943068104537064, + "grad_norm": 0.20279590785503387, + "learning_rate": 0.00016154145581938497, + "loss": 1.0505, + "step": 500 + }, + { + "epoch": 0.19469542407461382, + "grad_norm": 0.21796855330467224, + "learning_rate": 0.00016146360451537565, + "loss": 1.0885, + "step": 501 + }, + { + "epoch": 0.19508403769552124, + "grad_norm": 0.22128933668136597, + "learning_rate": 0.0001613857532113663, + "loss": 1.0903, + "step": 502 + }, + { + "epoch": 0.19547265131642863, + "grad_norm": 0.2032536417245865, + "learning_rate": 0.00016130790190735695, + "loss": 1.0285, + "step": 503 + }, + { + "epoch": 0.19586126493733605, + "grad_norm": 0.23738974332809448, + 
"learning_rate": 0.0001612300506033476, + "loss": 1.1188, + "step": 504 + }, + { + "epoch": 0.19624987855824347, + "grad_norm": 0.19614790380001068, + "learning_rate": 0.00016115219929933828, + "loss": 1.04, + "step": 505 + }, + { + "epoch": 0.1966384921791509, + "grad_norm": 0.2198178917169571, + "learning_rate": 0.00016107434799532893, + "loss": 1.0696, + "step": 506 + }, + { + "epoch": 0.1970271058000583, + "grad_norm": 0.18814648687839508, + "learning_rate": 0.00016099649669131959, + "loss": 1.0203, + "step": 507 + }, + { + "epoch": 0.1974157194209657, + "grad_norm": 0.20699037611484528, + "learning_rate": 0.00016091864538731026, + "loss": 1.1074, + "step": 508 + }, + { + "epoch": 0.19780433304187311, + "grad_norm": 0.21490445733070374, + "learning_rate": 0.00016084079408330092, + "loss": 1.0682, + "step": 509 + }, + { + "epoch": 0.19819294666278053, + "grad_norm": 0.2363848090171814, + "learning_rate": 0.00016076294277929157, + "loss": 1.0408, + "step": 510 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 0.20186659693717957, + "learning_rate": 0.00016068509147528222, + "loss": 1.026, + "step": 511 + }, + { + "epoch": 0.19897017390459534, + "grad_norm": 0.21564024686813354, + "learning_rate": 0.00016060724017127287, + "loss": 1.0418, + "step": 512 + }, + { + "epoch": 0.19935878752550276, + "grad_norm": 0.19151560962200165, + "learning_rate": 0.00016052938886726355, + "loss": 1.0037, + "step": 513 + }, + { + "epoch": 0.19974740114641018, + "grad_norm": 0.21038194000720978, + "learning_rate": 0.0001604515375632542, + "loss": 1.0545, + "step": 514 + }, + { + "epoch": 0.2001360147673176, + "grad_norm": 0.20496582984924316, + "learning_rate": 0.00016037368625924486, + "loss": 1.0543, + "step": 515 + }, + { + "epoch": 0.20052462838822502, + "grad_norm": 0.20689113438129425, + "learning_rate": 0.0001602958349552355, + "loss": 1.0905, + "step": 516 + }, + { + "epoch": 0.2009132420091324, + "grad_norm": 0.2284041792154312, + "learning_rate": 
0.00016021798365122616, + "loss": 1.0717, + "step": 517 + }, + { + "epoch": 0.20130185563003983, + "grad_norm": 0.23457761108875275, + "learning_rate": 0.00016014013234721684, + "loss": 1.106, + "step": 518 + }, + { + "epoch": 0.20169046925094725, + "grad_norm": 0.2088528722524643, + "learning_rate": 0.0001600622810432075, + "loss": 1.0428, + "step": 519 + }, + { + "epoch": 0.20207908287185467, + "grad_norm": 0.2170068770647049, + "learning_rate": 0.00015998442973919814, + "loss": 0.9875, + "step": 520 + }, + { + "epoch": 0.20246769649276208, + "grad_norm": 0.2270561158657074, + "learning_rate": 0.0001599065784351888, + "loss": 1.0676, + "step": 521 + }, + { + "epoch": 0.20285631011366947, + "grad_norm": 0.2151324599981308, + "learning_rate": 0.00015982872713117945, + "loss": 1.0675, + "step": 522 + }, + { + "epoch": 0.2032449237345769, + "grad_norm": 0.23113249242305756, + "learning_rate": 0.00015975087582717013, + "loss": 1.0608, + "step": 523 + }, + { + "epoch": 0.2036335373554843, + "grad_norm": 0.2587106227874756, + "learning_rate": 0.00015967302452316078, + "loss": 1.0867, + "step": 524 + }, + { + "epoch": 0.20402215097639173, + "grad_norm": 0.21842992305755615, + "learning_rate": 0.00015959517321915143, + "loss": 1.0726, + "step": 525 + }, + { + "epoch": 0.20441076459729912, + "grad_norm": 0.20867805182933807, + "learning_rate": 0.00015951732191514208, + "loss": 1.0578, + "step": 526 + }, + { + "epoch": 0.20479937821820654, + "grad_norm": 0.2396962195634842, + "learning_rate": 0.00015943947061113273, + "loss": 1.0292, + "step": 527 + }, + { + "epoch": 0.20518799183911396, + "grad_norm": 0.221155047416687, + "learning_rate": 0.00015936161930712341, + "loss": 1.0019, + "step": 528 + }, + { + "epoch": 0.20557660546002138, + "grad_norm": 0.20032119750976562, + "learning_rate": 0.00015928376800311407, + "loss": 1.0435, + "step": 529 + }, + { + "epoch": 0.2059652190809288, + "grad_norm": 0.24095888435840607, + "learning_rate": 0.00015920591669910472, + "loss": 
1.0355, + "step": 530 + }, + { + "epoch": 0.2063538327018362, + "grad_norm": 0.2286604344844818, + "learning_rate": 0.00015912806539509537, + "loss": 0.9989, + "step": 531 + }, + { + "epoch": 0.2067424463227436, + "grad_norm": 0.21537137031555176, + "learning_rate": 0.00015905021409108602, + "loss": 1.0642, + "step": 532 + }, + { + "epoch": 0.20713105994365102, + "grad_norm": 0.22447925806045532, + "learning_rate": 0.0001589723627870767, + "loss": 1.1244, + "step": 533 + }, + { + "epoch": 0.20751967356455844, + "grad_norm": 0.21077273786067963, + "learning_rate": 0.00015889451148306735, + "loss": 1.0167, + "step": 534 + }, + { + "epoch": 0.20790828718546586, + "grad_norm": 0.22340558469295502, + "learning_rate": 0.000158816660179058, + "loss": 1.0991, + "step": 535 + }, + { + "epoch": 0.20829690080637325, + "grad_norm": 0.223599374294281, + "learning_rate": 0.00015873880887504866, + "loss": 1.086, + "step": 536 + }, + { + "epoch": 0.20868551442728067, + "grad_norm": 0.2615208923816681, + "learning_rate": 0.0001586609575710393, + "loss": 1.0584, + "step": 537 + }, + { + "epoch": 0.2090741280481881, + "grad_norm": 0.2085907757282257, + "learning_rate": 0.00015858310626703, + "loss": 1.0994, + "step": 538 + }, + { + "epoch": 0.2094627416690955, + "grad_norm": 0.2170211672782898, + "learning_rate": 0.00015850525496302064, + "loss": 1.1105, + "step": 539 + }, + { + "epoch": 0.20985135529000293, + "grad_norm": 0.21978625655174255, + "learning_rate": 0.0001584274036590113, + "loss": 1.002, + "step": 540 + }, + { + "epoch": 0.21023996891091032, + "grad_norm": 0.23684021830558777, + "learning_rate": 0.00015834955235500194, + "loss": 1.1216, + "step": 541 + }, + { + "epoch": 0.21062858253181774, + "grad_norm": 0.220269113779068, + "learning_rate": 0.0001582717010509926, + "loss": 1.0773, + "step": 542 + }, + { + "epoch": 0.21101719615272516, + "grad_norm": 0.22447973489761353, + "learning_rate": 0.00015819384974698328, + "loss": 1.0941, + "step": 543 + }, + { + "epoch": 
0.21140580977363257, + "grad_norm": 0.22435730695724487, + "learning_rate": 0.00015811599844297393, + "loss": 1.0138, + "step": 544 + }, + { + "epoch": 0.21179442339453997, + "grad_norm": 0.2230793684720993, + "learning_rate": 0.00015803814713896458, + "loss": 1.0343, + "step": 545 + }, + { + "epoch": 0.21218303701544738, + "grad_norm": 0.23491905629634857, + "learning_rate": 0.00015796029583495523, + "loss": 1.11, + "step": 546 + }, + { + "epoch": 0.2125716506363548, + "grad_norm": 0.213560551404953, + "learning_rate": 0.00015788244453094588, + "loss": 1.0615, + "step": 547 + }, + { + "epoch": 0.21296026425726222, + "grad_norm": 0.21392837166786194, + "learning_rate": 0.00015780459322693654, + "loss": 1.0872, + "step": 548 + }, + { + "epoch": 0.21334887787816964, + "grad_norm": 0.20007692277431488, + "learning_rate": 0.00015772674192292722, + "loss": 1.0394, + "step": 549 + }, + { + "epoch": 0.21373749149907703, + "grad_norm": 0.1969841718673706, + "learning_rate": 0.00015764889061891787, + "loss": 1.0381, + "step": 550 + }, + { + "epoch": 0.21412610511998445, + "grad_norm": 0.21874025464057922, + "learning_rate": 0.00015757103931490852, + "loss": 1.0822, + "step": 551 + }, + { + "epoch": 0.21451471874089187, + "grad_norm": 0.21824273467063904, + "learning_rate": 0.00015749318801089917, + "loss": 1.0802, + "step": 552 + }, + { + "epoch": 0.2149033323617993, + "grad_norm": 0.20942047238349915, + "learning_rate": 0.00015741533670688985, + "loss": 1.0634, + "step": 553 + }, + { + "epoch": 0.2152919459827067, + "grad_norm": 0.1940152943134308, + "learning_rate": 0.0001573374854028805, + "loss": 1.0264, + "step": 554 + }, + { + "epoch": 0.2156805596036141, + "grad_norm": 0.19859059154987335, + "learning_rate": 0.00015725963409887115, + "loss": 0.9701, + "step": 555 + }, + { + "epoch": 0.21606917322452152, + "grad_norm": 0.22239404916763306, + "learning_rate": 0.0001571817827948618, + "loss": 1.1282, + "step": 556 + }, + { + "epoch": 0.21645778684542893, + "grad_norm": 
0.23820599913597107, + "learning_rate": 0.00015710393149085249, + "loss": 1.1123, + "step": 557 + }, + { + "epoch": 0.21684640046633635, + "grad_norm": 0.21279917657375336, + "learning_rate": 0.00015702608018684314, + "loss": 1.0542, + "step": 558 + }, + { + "epoch": 0.21723501408724374, + "grad_norm": 0.2065514773130417, + "learning_rate": 0.0001569482288828338, + "loss": 1.0685, + "step": 559 + }, + { + "epoch": 0.21762362770815116, + "grad_norm": 0.20130831003189087, + "learning_rate": 0.00015687037757882447, + "loss": 0.9869, + "step": 560 + }, + { + "epoch": 0.21801224132905858, + "grad_norm": 0.2187541127204895, + "learning_rate": 0.00015679252627481512, + "loss": 1.1095, + "step": 561 + }, + { + "epoch": 0.218400854949966, + "grad_norm": 0.21028277277946472, + "learning_rate": 0.00015671467497080577, + "loss": 1.0804, + "step": 562 + }, + { + "epoch": 0.21878946857087342, + "grad_norm": 0.8187636733055115, + "learning_rate": 0.00015663682366679643, + "loss": 1.0782, + "step": 563 + }, + { + "epoch": 0.2191780821917808, + "grad_norm": 0.20059974491596222, + "learning_rate": 0.0001565589723627871, + "loss": 1.0279, + "step": 564 + }, + { + "epoch": 0.21956669581268823, + "grad_norm": 0.20440839231014252, + "learning_rate": 0.00015648112105877776, + "loss": 0.9863, + "step": 565 + }, + { + "epoch": 0.21995530943359565, + "grad_norm": 0.21423624455928802, + "learning_rate": 0.0001564032697547684, + "loss": 1.0685, + "step": 566 + }, + { + "epoch": 0.22034392305450307, + "grad_norm": 0.22430062294006348, + "learning_rate": 0.00015632541845075906, + "loss": 1.0761, + "step": 567 + }, + { + "epoch": 0.22073253667541048, + "grad_norm": 0.22782258689403534, + "learning_rate": 0.0001562475671467497, + "loss": 1.1024, + "step": 568 + }, + { + "epoch": 0.22112115029631788, + "grad_norm": 0.21150320768356323, + "learning_rate": 0.0001561697158427404, + "loss": 1.0621, + "step": 569 + }, + { + "epoch": 0.2215097639172253, + "grad_norm": 0.20342351496219635, + 
"learning_rate": 0.00015609186453873104, + "loss": 1.0667, + "step": 570 + }, + { + "epoch": 0.2218983775381327, + "grad_norm": 0.22866711020469666, + "learning_rate": 0.0001560140132347217, + "loss": 1.0631, + "step": 571 + }, + { + "epoch": 0.22228699115904013, + "grad_norm": 0.2200063169002533, + "learning_rate": 0.00015593616193071235, + "loss": 1.0448, + "step": 572 + }, + { + "epoch": 0.22267560477994755, + "grad_norm": 0.19440248608589172, + "learning_rate": 0.000155858310626703, + "loss": 1.037, + "step": 573 + }, + { + "epoch": 0.22306421840085494, + "grad_norm": 0.205752432346344, + "learning_rate": 0.00015578045932269368, + "loss": 1.0465, + "step": 574 + }, + { + "epoch": 0.22345283202176236, + "grad_norm": 0.22247998416423798, + "learning_rate": 0.00015570260801868433, + "loss": 0.997, + "step": 575 + }, + { + "epoch": 0.22384144564266978, + "grad_norm": 0.22199274599552155, + "learning_rate": 0.00015562475671467498, + "loss": 1.0178, + "step": 576 + }, + { + "epoch": 0.2242300592635772, + "grad_norm": 0.2114989310503006, + "learning_rate": 0.00015554690541066564, + "loss": 1.0457, + "step": 577 + }, + { + "epoch": 0.2246186728844846, + "grad_norm": 0.24248506128787994, + "learning_rate": 0.0001554690541066563, + "loss": 1.002, + "step": 578 + }, + { + "epoch": 0.225007286505392, + "grad_norm": 0.2565505802631378, + "learning_rate": 0.00015539120280264697, + "loss": 1.0541, + "step": 579 + }, + { + "epoch": 0.22539590012629943, + "grad_norm": 0.22799409925937653, + "learning_rate": 0.00015531335149863762, + "loss": 1.0788, + "step": 580 + }, + { + "epoch": 0.22578451374720684, + "grad_norm": 0.2196080982685089, + "learning_rate": 0.00015523550019462827, + "loss": 1.0877, + "step": 581 + }, + { + "epoch": 0.22617312736811426, + "grad_norm": 0.21992824971675873, + "learning_rate": 0.00015515764889061892, + "loss": 1.0213, + "step": 582 + }, + { + "epoch": 0.22656174098902165, + "grad_norm": 0.22793298959732056, + "learning_rate": 0.00015507979758660957, 
+ "loss": 1.0633, + "step": 583 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 0.21707972884178162, + "learning_rate": 0.00015500194628260023, + "loss": 1.081, + "step": 584 + }, + { + "epoch": 0.2273389682308365, + "grad_norm": 0.220685675740242, + "learning_rate": 0.0001549240949785909, + "loss": 1.0658, + "step": 585 + }, + { + "epoch": 0.2277275818517439, + "grad_norm": 0.22576668858528137, + "learning_rate": 0.00015484624367458156, + "loss": 1.0795, + "step": 586 + }, + { + "epoch": 0.22811619547265133, + "grad_norm": 0.21778982877731323, + "learning_rate": 0.0001547683923705722, + "loss": 1.033, + "step": 587 + }, + { + "epoch": 0.22850480909355872, + "grad_norm": 0.22748610377311707, + "learning_rate": 0.00015469054106656286, + "loss": 1.0948, + "step": 588 + }, + { + "epoch": 0.22889342271446614, + "grad_norm": 0.21561284363269806, + "learning_rate": 0.00015461268976255351, + "loss": 1.0022, + "step": 589 + }, + { + "epoch": 0.22928203633537356, + "grad_norm": 0.2419756054878235, + "learning_rate": 0.0001545348384585442, + "loss": 1.0786, + "step": 590 + }, + { + "epoch": 0.22967064995628098, + "grad_norm": 0.20479315519332886, + "learning_rate": 0.00015445698715453485, + "loss": 1.027, + "step": 591 + }, + { + "epoch": 0.2300592635771884, + "grad_norm": 0.21365883946418762, + "learning_rate": 0.0001543791358505255, + "loss": 1.0773, + "step": 592 + }, + { + "epoch": 0.23044787719809579, + "grad_norm": 0.23133166134357452, + "learning_rate": 0.00015430128454651615, + "loss": 1.0877, + "step": 593 + }, + { + "epoch": 0.2308364908190032, + "grad_norm": 0.2110515981912613, + "learning_rate": 0.0001542234332425068, + "loss": 1.0509, + "step": 594 + }, + { + "epoch": 0.23122510443991062, + "grad_norm": 0.20658442378044128, + "learning_rate": 0.00015414558193849748, + "loss": 1.0623, + "step": 595 + }, + { + "epoch": 0.23161371806081804, + "grad_norm": 0.21831996738910675, + "learning_rate": 0.00015406773063448813, + "loss": 1.021, + "step": 596 + }, + { 
+ "epoch": 0.23200233168172543, + "grad_norm": 0.23015642166137695, + "learning_rate": 0.00015398987933047878, + "loss": 1.0358, + "step": 597 + }, + { + "epoch": 0.23239094530263285, + "grad_norm": 0.23071645200252533, + "learning_rate": 0.00015391202802646944, + "loss": 1.1255, + "step": 598 + }, + { + "epoch": 0.23277955892354027, + "grad_norm": 0.19513486325740814, + "learning_rate": 0.0001538341767224601, + "loss": 1.0189, + "step": 599 + }, + { + "epoch": 0.2331681725444477, + "grad_norm": 0.20821452140808105, + "learning_rate": 0.00015375632541845077, + "loss": 1.0843, + "step": 600 + }, + { + "epoch": 0.2335567861653551, + "grad_norm": 0.20563223958015442, + "learning_rate": 0.00015367847411444142, + "loss": 1.0012, + "step": 601 + }, + { + "epoch": 0.2339453997862625, + "grad_norm": 0.22674202919006348, + "learning_rate": 0.00015360062281043207, + "loss": 1.0371, + "step": 602 + }, + { + "epoch": 0.23433401340716992, + "grad_norm": 0.20744135975837708, + "learning_rate": 0.00015352277150642272, + "loss": 1.0466, + "step": 603 + }, + { + "epoch": 0.23472262702807734, + "grad_norm": 0.22103577852249146, + "learning_rate": 0.00015344492020241338, + "loss": 1.0942, + "step": 604 + }, + { + "epoch": 0.23511124064898475, + "grad_norm": 0.20643098652362823, + "learning_rate": 0.00015336706889840406, + "loss": 1.0682, + "step": 605 + }, + { + "epoch": 0.23549985426989217, + "grad_norm": 0.23436777293682098, + "learning_rate": 0.0001532892175943947, + "loss": 1.0613, + "step": 606 + }, + { + "epoch": 0.23588846789079956, + "grad_norm": 0.21898899972438812, + "learning_rate": 0.00015321136629038536, + "loss": 1.0571, + "step": 607 + }, + { + "epoch": 0.23627708151170698, + "grad_norm": 0.20569247007369995, + "learning_rate": 0.00015313351498637604, + "loss": 1.061, + "step": 608 + }, + { + "epoch": 0.2366656951326144, + "grad_norm": 0.2099207490682602, + "learning_rate": 0.0001530556636823667, + "loss": 1.0776, + "step": 609 + }, + { + "epoch": 0.23705430875352182, 
+ "grad_norm": 0.20078738033771515, + "learning_rate": 0.00015297781237835734, + "loss": 1.0341, + "step": 610 + }, + { + "epoch": 0.2374429223744292, + "grad_norm": 0.20327065885066986, + "learning_rate": 0.000152899961074348, + "loss": 1.0168, + "step": 611 + }, + { + "epoch": 0.23783153599533663, + "grad_norm": 0.21741214394569397, + "learning_rate": 0.00015282210977033867, + "loss": 1.0726, + "step": 612 + }, + { + "epoch": 0.23822014961624405, + "grad_norm": 0.2065727263689041, + "learning_rate": 0.00015274425846632933, + "loss": 1.0474, + "step": 613 + }, + { + "epoch": 0.23860876323715147, + "grad_norm": 0.21241194009780884, + "learning_rate": 0.00015266640716231998, + "loss": 1.0666, + "step": 614 + }, + { + "epoch": 0.23899737685805889, + "grad_norm": 0.2194201797246933, + "learning_rate": 0.00015258855585831066, + "loss": 1.1411, + "step": 615 + }, + { + "epoch": 0.23938599047896628, + "grad_norm": 0.21537193655967712, + "learning_rate": 0.0001525107045543013, + "loss": 1.081, + "step": 616 + }, + { + "epoch": 0.2397746040998737, + "grad_norm": 0.21125951409339905, + "learning_rate": 0.00015243285325029196, + "loss": 1.0679, + "step": 617 + }, + { + "epoch": 0.2401632177207811, + "grad_norm": 0.21342721581459045, + "learning_rate": 0.0001523550019462826, + "loss": 1.0564, + "step": 618 + }, + { + "epoch": 0.24055183134168853, + "grad_norm": 0.2223503291606903, + "learning_rate": 0.00015227715064227327, + "loss": 1.1163, + "step": 619 + }, + { + "epoch": 0.24094044496259595, + "grad_norm": 0.21626527607440948, + "learning_rate": 0.00015219929933826394, + "loss": 1.0793, + "step": 620 + }, + { + "epoch": 0.24132905858350334, + "grad_norm": 0.21899500489234924, + "learning_rate": 0.0001521214480342546, + "loss": 1.0864, + "step": 621 + }, + { + "epoch": 0.24171767220441076, + "grad_norm": 0.2499915212392807, + "learning_rate": 0.00015204359673024525, + "loss": 1.1381, + "step": 622 + }, + { + "epoch": 0.24210628582531818, + "grad_norm": 0.2108345925807953, + 
"learning_rate": 0.0001519657454262359, + "loss": 1.0534, + "step": 623 + }, + { + "epoch": 0.2424948994462256, + "grad_norm": 0.2224910855293274, + "learning_rate": 0.00015188789412222655, + "loss": 1.0235, + "step": 624 + }, + { + "epoch": 0.24288351306713302, + "grad_norm": 0.22163094580173492, + "learning_rate": 0.0001518100428182172, + "loss": 1.0143, + "step": 625 + }, + { + "epoch": 0.2432721266880404, + "grad_norm": 0.20709283649921417, + "learning_rate": 0.00015173219151420788, + "loss": 1.0506, + "step": 626 + }, + { + "epoch": 0.24366074030894783, + "grad_norm": 0.2112802267074585, + "learning_rate": 0.00015165434021019854, + "loss": 1.0692, + "step": 627 + }, + { + "epoch": 0.24404935392985525, + "grad_norm": 0.23622830212116241, + "learning_rate": 0.0001515764889061892, + "loss": 1.0769, + "step": 628 + }, + { + "epoch": 0.24443796755076266, + "grad_norm": 0.23328271508216858, + "learning_rate": 0.00015149863760217984, + "loss": 1.1158, + "step": 629 + }, + { + "epoch": 0.24482658117167005, + "grad_norm": 0.2071760892868042, + "learning_rate": 0.0001514207862981705, + "loss": 1.0133, + "step": 630 + }, + { + "epoch": 0.24521519479257747, + "grad_norm": 0.21428920328617096, + "learning_rate": 0.00015134293499416117, + "loss": 1.0342, + "step": 631 + }, + { + "epoch": 0.2456038084134849, + "grad_norm": 0.22225375473499298, + "learning_rate": 0.00015126508369015182, + "loss": 1.1054, + "step": 632 + }, + { + "epoch": 0.2459924220343923, + "grad_norm": 0.2096671611070633, + "learning_rate": 0.00015118723238614248, + "loss": 1.0229, + "step": 633 + }, + { + "epoch": 0.24638103565529973, + "grad_norm": 0.21473252773284912, + "learning_rate": 0.00015110938108213313, + "loss": 1.0915, + "step": 634 + }, + { + "epoch": 0.24676964927620712, + "grad_norm": 0.2071562111377716, + "learning_rate": 0.00015103152977812378, + "loss": 1.047, + "step": 635 + }, + { + "epoch": 0.24715826289711454, + "grad_norm": 0.19868609309196472, + "learning_rate": 
0.00015095367847411446, + "loss": 1.0073, + "step": 636 + }, + { + "epoch": 0.24754687651802196, + "grad_norm": 0.20937366783618927, + "learning_rate": 0.0001508758271701051, + "loss": 1.0155, + "step": 637 + }, + { + "epoch": 0.24793549013892938, + "grad_norm": 0.19225911796092987, + "learning_rate": 0.00015079797586609576, + "loss": 1.0163, + "step": 638 + }, + { + "epoch": 0.2483241037598368, + "grad_norm": 0.20427283644676208, + "learning_rate": 0.00015072012456208641, + "loss": 1.062, + "step": 639 + }, + { + "epoch": 0.24871271738074419, + "grad_norm": 0.21640253067016602, + "learning_rate": 0.00015064227325807707, + "loss": 1.025, + "step": 640 + }, + { + "epoch": 0.2491013310016516, + "grad_norm": 0.20416739583015442, + "learning_rate": 0.00015056442195406775, + "loss": 1.0635, + "step": 641 + }, + { + "epoch": 0.24948994462255902, + "grad_norm": 0.1990521252155304, + "learning_rate": 0.0001504865706500584, + "loss": 1.0757, + "step": 642 + }, + { + "epoch": 0.24987855824346644, + "grad_norm": 0.21636444330215454, + "learning_rate": 0.00015040871934604905, + "loss": 1.0441, + "step": 643 + }, + { + "epoch": 0.25026717186437386, + "grad_norm": 0.21253719925880432, + "learning_rate": 0.0001503308680420397, + "loss": 1.0574, + "step": 644 + }, + { + "epoch": 0.2506557854852813, + "grad_norm": 0.2134159356355667, + "learning_rate": 0.00015025301673803035, + "loss": 1.0396, + "step": 645 + }, + { + "epoch": 0.2510443991061887, + "grad_norm": 0.2018527239561081, + "learning_rate": 0.00015017516543402103, + "loss": 1.0606, + "step": 646 + }, + { + "epoch": 0.25143301272709606, + "grad_norm": 0.20320741832256317, + "learning_rate": 0.00015009731413001169, + "loss": 1.0093, + "step": 647 + }, + { + "epoch": 0.2518216263480035, + "grad_norm": 0.21007056534290314, + "learning_rate": 0.00015001946282600234, + "loss": 1.0284, + "step": 648 + }, + { + "epoch": 0.2522102399689109, + "grad_norm": 0.22453372180461884, + "learning_rate": 0.000149941611521993, + "loss": 
1.0271, + "step": 649 + }, + { + "epoch": 0.2525988535898183, + "grad_norm": 0.19889335334300995, + "learning_rate": 0.00014986376021798364, + "loss": 1.0238, + "step": 650 + }, + { + "epoch": 0.25298746721072574, + "grad_norm": 0.19339965283870697, + "learning_rate": 0.00014978590891397432, + "loss": 1.024, + "step": 651 + }, + { + "epoch": 0.25337608083163315, + "grad_norm": 0.22362011671066284, + "learning_rate": 0.00014970805760996497, + "loss": 1.0722, + "step": 652 + }, + { + "epoch": 0.2537646944525406, + "grad_norm": 0.2110588103532791, + "learning_rate": 0.00014963020630595562, + "loss": 1.0541, + "step": 653 + }, + { + "epoch": 0.254153308073448, + "grad_norm": 0.203025683760643, + "learning_rate": 0.00014955235500194628, + "loss": 1.0335, + "step": 654 + }, + { + "epoch": 0.2545419216943554, + "grad_norm": 0.20884902775287628, + "learning_rate": 0.00014947450369793693, + "loss": 1.0507, + "step": 655 + }, + { + "epoch": 0.2549305353152628, + "grad_norm": 0.21234256029129028, + "learning_rate": 0.0001493966523939276, + "loss": 1.0372, + "step": 656 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.1984352171421051, + "learning_rate": 0.00014931880108991826, + "loss": 0.9979, + "step": 657 + }, + { + "epoch": 0.2557077625570776, + "grad_norm": 0.18848282098770142, + "learning_rate": 0.0001492409497859089, + "loss": 0.9973, + "step": 658 + }, + { + "epoch": 0.25609637617798503, + "grad_norm": 0.2201709896326065, + "learning_rate": 0.00014916309848189956, + "loss": 1.0386, + "step": 659 + }, + { + "epoch": 0.25648498979889245, + "grad_norm": 0.23094095289707184, + "learning_rate": 0.00014908524717789024, + "loss": 1.1205, + "step": 660 + }, + { + "epoch": 0.25687360341979987, + "grad_norm": 0.21087734401226044, + "learning_rate": 0.0001490073958738809, + "loss": 1.0231, + "step": 661 + }, + { + "epoch": 0.2572622170407073, + "grad_norm": 0.24970979988574982, + "learning_rate": 0.00014892954456987155, + "loss": 1.0421, + "step": 662 + }, + { + "epoch": 
0.2576508306616147, + "grad_norm": 0.22024711966514587, + "learning_rate": 0.00014885169326586223, + "loss": 1.1033, + "step": 663 + }, + { + "epoch": 0.2580394442825221, + "grad_norm": 0.2195248156785965, + "learning_rate": 0.00014877384196185288, + "loss": 1.089, + "step": 664 + }, + { + "epoch": 0.25842805790342954, + "grad_norm": 0.20236417651176453, + "learning_rate": 0.00014869599065784353, + "loss": 1.0196, + "step": 665 + }, + { + "epoch": 0.2588166715243369, + "grad_norm": 0.21973329782485962, + "learning_rate": 0.00014861813935383418, + "loss": 1.0844, + "step": 666 + }, + { + "epoch": 0.2592052851452443, + "grad_norm": 0.2069879174232483, + "learning_rate": 0.00014854028804982486, + "loss": 1.0312, + "step": 667 + }, + { + "epoch": 0.25959389876615174, + "grad_norm": 0.2037455290555954, + "learning_rate": 0.00014846243674581551, + "loss": 1.0018, + "step": 668 + }, + { + "epoch": 0.25998251238705916, + "grad_norm": 0.24176378548145294, + "learning_rate": 0.00014838458544180617, + "loss": 1.0749, + "step": 669 + }, + { + "epoch": 0.2603711260079666, + "grad_norm": 0.2007879763841629, + "learning_rate": 0.00014830673413779682, + "loss": 1.0443, + "step": 670 + }, + { + "epoch": 0.260759739628874, + "grad_norm": 0.23503245413303375, + "learning_rate": 0.00014822888283378747, + "loss": 1.0674, + "step": 671 + }, + { + "epoch": 0.2611483532497814, + "grad_norm": 0.2166167050600052, + "learning_rate": 0.00014815103152977815, + "loss": 1.079, + "step": 672 + }, + { + "epoch": 0.26153696687068884, + "grad_norm": 0.2293982058763504, + "learning_rate": 0.0001480731802257688, + "loss": 1.0517, + "step": 673 + }, + { + "epoch": 0.26192558049159625, + "grad_norm": 0.21040330827236176, + "learning_rate": 0.00014799532892175945, + "loss": 1.0475, + "step": 674 + }, + { + "epoch": 0.2623141941125036, + "grad_norm": 0.20750463008880615, + "learning_rate": 0.0001479174776177501, + "loss": 1.025, + "step": 675 + }, + { + "epoch": 0.26270280773341104, + "grad_norm": 
0.2748873233795166, + "learning_rate": 0.00014783962631374076, + "loss": 1.0212, + "step": 676 + }, + { + "epoch": 0.26309142135431846, + "grad_norm": 0.19212333858013153, + "learning_rate": 0.00014776177500973144, + "loss": 1.0049, + "step": 677 + }, + { + "epoch": 0.2634800349752259, + "grad_norm": 0.207731693983078, + "learning_rate": 0.0001476839237057221, + "loss": 1.0062, + "step": 678 + }, + { + "epoch": 0.2638686485961333, + "grad_norm": 0.2177981585264206, + "learning_rate": 0.00014760607240171274, + "loss": 1.0489, + "step": 679 + }, + { + "epoch": 0.2642572622170407, + "grad_norm": 0.23239290714263916, + "learning_rate": 0.0001475282210977034, + "loss": 1.0856, + "step": 680 + }, + { + "epoch": 0.26464587583794813, + "grad_norm": 0.2033151388168335, + "learning_rate": 0.00014745036979369404, + "loss": 1.0389, + "step": 681 + }, + { + "epoch": 0.26503448945885555, + "grad_norm": 0.20917408168315887, + "learning_rate": 0.00014737251848968472, + "loss": 1.1208, + "step": 682 + }, + { + "epoch": 0.26542310307976297, + "grad_norm": 0.22075454890727997, + "learning_rate": 0.00014729466718567538, + "loss": 1.0435, + "step": 683 + }, + { + "epoch": 0.26581171670067033, + "grad_norm": 0.23094993829727173, + "learning_rate": 0.00014721681588166603, + "loss": 1.0649, + "step": 684 + }, + { + "epoch": 0.26620033032157775, + "grad_norm": 0.21209536492824554, + "learning_rate": 0.00014713896457765668, + "loss": 1.0578, + "step": 685 + }, + { + "epoch": 0.26658894394248517, + "grad_norm": 0.21412219107151031, + "learning_rate": 0.00014706111327364733, + "loss": 1.1137, + "step": 686 + }, + { + "epoch": 0.2669775575633926, + "grad_norm": 0.21175475418567657, + "learning_rate": 0.000146983261969638, + "loss": 1.023, + "step": 687 + }, + { + "epoch": 0.2673661711843, + "grad_norm": 0.21968993544578552, + "learning_rate": 0.00014690541066562866, + "loss": 1.1183, + "step": 688 + }, + { + "epoch": 0.2677547848052074, + "grad_norm": 0.20414218306541443, + "learning_rate": 
0.00014682755936161932, + "loss": 1.078, + "step": 689 + }, + { + "epoch": 0.26814339842611484, + "grad_norm": 0.18986597657203674, + "learning_rate": 0.00014674970805760997, + "loss": 1.0029, + "step": 690 + }, + { + "epoch": 0.26853201204702226, + "grad_norm": 0.21215832233428955, + "learning_rate": 0.00014667185675360062, + "loss": 1.0759, + "step": 691 + }, + { + "epoch": 0.2689206256679297, + "grad_norm": 0.2113744169473648, + "learning_rate": 0.0001465940054495913, + "loss": 1.1027, + "step": 692 + }, + { + "epoch": 0.2693092392888371, + "grad_norm": 0.22010880708694458, + "learning_rate": 0.00014651615414558195, + "loss": 1.0984, + "step": 693 + }, + { + "epoch": 0.26969785290974446, + "grad_norm": 0.203857421875, + "learning_rate": 0.0001464383028415726, + "loss": 1.0407, + "step": 694 + }, + { + "epoch": 0.2700864665306519, + "grad_norm": 0.21120867133140564, + "learning_rate": 0.00014636045153756325, + "loss": 1.0521, + "step": 695 + }, + { + "epoch": 0.2704750801515593, + "grad_norm": 0.20039112865924835, + "learning_rate": 0.0001462826002335539, + "loss": 1.0897, + "step": 696 + }, + { + "epoch": 0.2708636937724667, + "grad_norm": 0.22893202304840088, + "learning_rate": 0.00014620474892954456, + "loss": 1.0903, + "step": 697 + }, + { + "epoch": 0.27125230739337414, + "grad_norm": 0.19886267185211182, + "learning_rate": 0.00014612689762553524, + "loss": 1.0889, + "step": 698 + }, + { + "epoch": 0.27164092101428156, + "grad_norm": 0.18892349302768707, + "learning_rate": 0.0001460490463215259, + "loss": 0.981, + "step": 699 + }, + { + "epoch": 0.272029534635189, + "grad_norm": 0.20602507889270782, + "learning_rate": 0.00014597119501751654, + "loss": 1.0223, + "step": 700 + }, + { + "epoch": 0.2724181482560964, + "grad_norm": 0.21480505168437958, + "learning_rate": 0.0001458933437135072, + "loss": 1.0355, + "step": 701 + }, + { + "epoch": 0.2728067618770038, + "grad_norm": 0.21011753380298615, + "learning_rate": 0.00014581549240949785, + "loss": 1.0613, + 
"step": 702 + }, + { + "epoch": 0.2731953754979112, + "grad_norm": 0.19350819289684296, + "learning_rate": 0.00014573764110548853, + "loss": 1.0144, + "step": 703 + }, + { + "epoch": 0.2735839891188186, + "grad_norm": 0.207548126578331, + "learning_rate": 0.00014565978980147918, + "loss": 1.0465, + "step": 704 + }, + { + "epoch": 0.273972602739726, + "grad_norm": 0.22220565378665924, + "learning_rate": 0.00014558193849746983, + "loss": 1.1073, + "step": 705 + }, + { + "epoch": 0.27436121636063343, + "grad_norm": 0.193622425198555, + "learning_rate": 0.00014550408719346048, + "loss": 1.0357, + "step": 706 + }, + { + "epoch": 0.27474982998154085, + "grad_norm": 0.2067158818244934, + "learning_rate": 0.00014542623588945113, + "loss": 1.0502, + "step": 707 + }, + { + "epoch": 0.27513844360244827, + "grad_norm": 0.2218742072582245, + "learning_rate": 0.0001453483845854418, + "loss": 0.9934, + "step": 708 + }, + { + "epoch": 0.2755270572233557, + "grad_norm": 0.22316142916679382, + "learning_rate": 0.00014527053328143246, + "loss": 1.0707, + "step": 709 + }, + { + "epoch": 0.2759156708442631, + "grad_norm": 0.21004025638103485, + "learning_rate": 0.00014519268197742312, + "loss": 1.0543, + "step": 710 + }, + { + "epoch": 0.2763042844651705, + "grad_norm": 0.22070440649986267, + "learning_rate": 0.00014511483067341377, + "loss": 1.0467, + "step": 711 + }, + { + "epoch": 0.27669289808607794, + "grad_norm": 0.21463747322559357, + "learning_rate": 0.00014503697936940445, + "loss": 1.0793, + "step": 712 + }, + { + "epoch": 0.2770815117069853, + "grad_norm": 0.23452533781528473, + "learning_rate": 0.0001449591280653951, + "loss": 1.043, + "step": 713 + }, + { + "epoch": 0.2774701253278927, + "grad_norm": 0.2405795156955719, + "learning_rate": 0.00014488127676138575, + "loss": 1.0752, + "step": 714 + }, + { + "epoch": 0.27785873894880014, + "grad_norm": 0.21546585857868195, + "learning_rate": 0.00014480342545737643, + "loss": 1.0834, + "step": 715 + }, + { + "epoch": 
0.27824735256970756, + "grad_norm": 0.22675828635692596, + "learning_rate": 0.00014472557415336708, + "loss": 1.055, + "step": 716 + }, + { + "epoch": 0.278635966190615, + "grad_norm": 0.2117871195077896, + "learning_rate": 0.00014464772284935774, + "loss": 1.03, + "step": 717 + }, + { + "epoch": 0.2790245798115224, + "grad_norm": 0.2193155735731125, + "learning_rate": 0.00014456987154534841, + "loss": 1.0073, + "step": 718 + }, + { + "epoch": 0.2794131934324298, + "grad_norm": 0.21447965502738953, + "learning_rate": 0.00014449202024133907, + "loss": 1.0174, + "step": 719 + }, + { + "epoch": 0.27980180705333724, + "grad_norm": 0.22867532074451447, + "learning_rate": 0.00014441416893732972, + "loss": 1.0948, + "step": 720 + }, + { + "epoch": 0.28019042067424466, + "grad_norm": 0.21570557355880737, + "learning_rate": 0.00014433631763332037, + "loss": 1.0105, + "step": 721 + }, + { + "epoch": 0.280579034295152, + "grad_norm": 0.20787014067173004, + "learning_rate": 0.00014425846632931102, + "loss": 1.0384, + "step": 722 + }, + { + "epoch": 0.28096764791605944, + "grad_norm": 0.19924762845039368, + "learning_rate": 0.0001441806150253017, + "loss": 1.0653, + "step": 723 + }, + { + "epoch": 0.28135626153696686, + "grad_norm": 0.1996215283870697, + "learning_rate": 0.00014410276372129235, + "loss": 1.0439, + "step": 724 + }, + { + "epoch": 0.2817448751578743, + "grad_norm": 0.2054813802242279, + "learning_rate": 0.000144024912417283, + "loss": 0.9895, + "step": 725 + }, + { + "epoch": 0.2821334887787817, + "grad_norm": 0.2268310785293579, + "learning_rate": 0.00014394706111327366, + "loss": 1.0993, + "step": 726 + }, + { + "epoch": 0.2825221023996891, + "grad_norm": 0.19867680966854095, + "learning_rate": 0.0001438692098092643, + "loss": 0.985, + "step": 727 + }, + { + "epoch": 0.28291071602059653, + "grad_norm": 0.21099598705768585, + "learning_rate": 0.000143791358505255, + "loss": 1.0333, + "step": 728 + }, + { + "epoch": 0.28329932964150395, + "grad_norm": 
0.22479215264320374, + "learning_rate": 0.00014371350720124564, + "loss": 1.0449, + "step": 729 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 0.22717688977718353, + "learning_rate": 0.0001436356558972363, + "loss": 1.0482, + "step": 730 + }, + { + "epoch": 0.2840765568833188, + "grad_norm": 0.20389345288276672, + "learning_rate": 0.00014355780459322695, + "loss": 0.956, + "step": 731 + }, + { + "epoch": 0.28446517050422615, + "grad_norm": 0.21583619713783264, + "learning_rate": 0.0001434799532892176, + "loss": 1.0154, + "step": 732 + }, + { + "epoch": 0.28485378412513357, + "grad_norm": 0.2219148874282837, + "learning_rate": 0.00014340210198520825, + "loss": 1.0553, + "step": 733 + }, + { + "epoch": 0.285242397746041, + "grad_norm": 0.19920189678668976, + "learning_rate": 0.00014332425068119893, + "loss": 0.9881, + "step": 734 + }, + { + "epoch": 0.2856310113669484, + "grad_norm": 0.2295670360326767, + "learning_rate": 0.00014324639937718958, + "loss": 1.0529, + "step": 735 + }, + { + "epoch": 0.2860196249878558, + "grad_norm": 0.21271567046642303, + "learning_rate": 0.00014316854807318023, + "loss": 1.037, + "step": 736 + }, + { + "epoch": 0.28640823860876324, + "grad_norm": 0.21304361522197723, + "learning_rate": 0.00014309069676917088, + "loss": 1.048, + "step": 737 + }, + { + "epoch": 0.28679685222967066, + "grad_norm": 0.19902732968330383, + "learning_rate": 0.00014301284546516154, + "loss": 1.0306, + "step": 738 + }, + { + "epoch": 0.2871854658505781, + "grad_norm": 0.1995929330587387, + "learning_rate": 0.00014293499416115222, + "loss": 1.0394, + "step": 739 + }, + { + "epoch": 0.2875740794714855, + "grad_norm": 0.20426060259342194, + "learning_rate": 0.00014285714285714287, + "loss": 1.0052, + "step": 740 + }, + { + "epoch": 0.28796269309239286, + "grad_norm": 0.20284566283226013, + "learning_rate": 0.00014277929155313352, + "loss": 1.0115, + "step": 741 + }, + { + "epoch": 0.2883513067133003, + "grad_norm": 0.2041557878255844, + "learning_rate": 
0.00014270144024912417, + "loss": 1.0473, + "step": 742 + }, + { + "epoch": 0.2887399203342077, + "grad_norm": 0.2152249962091446, + "learning_rate": 0.00014262358894511482, + "loss": 1.0802, + "step": 743 + }, + { + "epoch": 0.2891285339551151, + "grad_norm": 0.20569871366024017, + "learning_rate": 0.0001425457376411055, + "loss": 1.0203, + "step": 744 + }, + { + "epoch": 0.28951714757602254, + "grad_norm": 0.21128378808498383, + "learning_rate": 0.00014246788633709616, + "loss": 1.108, + "step": 745 + }, + { + "epoch": 0.28990576119692996, + "grad_norm": 0.19587135314941406, + "learning_rate": 0.0001423900350330868, + "loss": 1.0427, + "step": 746 + }, + { + "epoch": 0.2902943748178374, + "grad_norm": 0.22052550315856934, + "learning_rate": 0.00014231218372907746, + "loss": 1.055, + "step": 747 + }, + { + "epoch": 0.2906829884387448, + "grad_norm": 0.21291717886924744, + "learning_rate": 0.0001422343324250681, + "loss": 1.0591, + "step": 748 + }, + { + "epoch": 0.2910716020596522, + "grad_norm": 0.20634084939956665, + "learning_rate": 0.0001421564811210588, + "loss": 1.0527, + "step": 749 + }, + { + "epoch": 0.29146021568055963, + "grad_norm": 0.2075488269329071, + "learning_rate": 0.00014207862981704944, + "loss": 1.0786, + "step": 750 + }, + { + "epoch": 0.291848829301467, + "grad_norm": 0.19780080020427704, + "learning_rate": 0.0001420007785130401, + "loss": 1.059, + "step": 751 + }, + { + "epoch": 0.2922374429223744, + "grad_norm": 0.21212074160575867, + "learning_rate": 0.00014192292720903075, + "loss": 1.0346, + "step": 752 + }, + { + "epoch": 0.29262605654328183, + "grad_norm": 0.2218451350927353, + "learning_rate": 0.0001418450759050214, + "loss": 1.0908, + "step": 753 + }, + { + "epoch": 0.29301467016418925, + "grad_norm": 0.20107759535312653, + "learning_rate": 0.00014176722460101208, + "loss": 1.0202, + "step": 754 + }, + { + "epoch": 0.29340328378509667, + "grad_norm": 0.20933273434638977, + "learning_rate": 0.00014168937329700273, + "loss": 1.0719, + 
"step": 755 + }, + { + "epoch": 0.2937918974060041, + "grad_norm": 0.22369107604026794, + "learning_rate": 0.00014161152199299338, + "loss": 1.0433, + "step": 756 + }, + { + "epoch": 0.2941805110269115, + "grad_norm": 0.2113707810640335, + "learning_rate": 0.00014153367068898403, + "loss": 1.0637, + "step": 757 + }, + { + "epoch": 0.2945691246478189, + "grad_norm": 0.21105700731277466, + "learning_rate": 0.00014145581938497469, + "loss": 1.0468, + "step": 758 + }, + { + "epoch": 0.29495773826872634, + "grad_norm": 0.20189693570137024, + "learning_rate": 0.00014137796808096537, + "loss": 1.0281, + "step": 759 + }, + { + "epoch": 0.2953463518896337, + "grad_norm": 0.1954152137041092, + "learning_rate": 0.00014130011677695602, + "loss": 1.0519, + "step": 760 + }, + { + "epoch": 0.2957349655105411, + "grad_norm": 0.24295592308044434, + "learning_rate": 0.00014122226547294667, + "loss": 1.1303, + "step": 761 + }, + { + "epoch": 0.29612357913144854, + "grad_norm": 0.20158620178699493, + "learning_rate": 0.00014114441416893732, + "loss": 1.0367, + "step": 762 + }, + { + "epoch": 0.29651219275235596, + "grad_norm": 0.20734666287899017, + "learning_rate": 0.00014106656286492797, + "loss": 1.0392, + "step": 763 + }, + { + "epoch": 0.2969008063732634, + "grad_norm": 0.2177533656358719, + "learning_rate": 0.00014098871156091865, + "loss": 1.0619, + "step": 764 + }, + { + "epoch": 0.2972894199941708, + "grad_norm": 0.1961720883846283, + "learning_rate": 0.0001409108602569093, + "loss": 0.9872, + "step": 765 + }, + { + "epoch": 0.2976780336150782, + "grad_norm": 0.21530941128730774, + "learning_rate": 0.00014083300895289996, + "loss": 1.1246, + "step": 766 + }, + { + "epoch": 0.29806664723598564, + "grad_norm": 0.2039783000946045, + "learning_rate": 0.00014075515764889064, + "loss": 1.0789, + "step": 767 + }, + { + "epoch": 0.29845526085689306, + "grad_norm": 0.20641569793224335, + "learning_rate": 0.0001406773063448813, + "loss": 1.05, + "step": 768 + }, + { + "epoch": 
0.2988438744778004, + "grad_norm": 0.2071225494146347, + "learning_rate": 0.00014059945504087194, + "loss": 1.047, + "step": 769 + }, + { + "epoch": 0.29923248809870784, + "grad_norm": 0.20367531478405, + "learning_rate": 0.00014052160373686262, + "loss": 1.0734, + "step": 770 + }, + { + "epoch": 0.29962110171961526, + "grad_norm": 0.21718619763851166, + "learning_rate": 0.00014044375243285327, + "loss": 1.0613, + "step": 771 + }, + { + "epoch": 0.3000097153405227, + "grad_norm": 0.21649087965488434, + "learning_rate": 0.00014036590112884392, + "loss": 1.0671, + "step": 772 + }, + { + "epoch": 0.3003983289614301, + "grad_norm": 0.22223225235939026, + "learning_rate": 0.00014028804982483458, + "loss": 1.0977, + "step": 773 + }, + { + "epoch": 0.3007869425823375, + "grad_norm": 0.23101870715618134, + "learning_rate": 0.00014021019852082523, + "loss": 1.1236, + "step": 774 + }, + { + "epoch": 0.30117555620324493, + "grad_norm": 0.22855506837368011, + "learning_rate": 0.0001401323472168159, + "loss": 1.0517, + "step": 775 + }, + { + "epoch": 0.30156416982415235, + "grad_norm": 0.20862117409706116, + "learning_rate": 0.00014005449591280656, + "loss": 1.0493, + "step": 776 + }, + { + "epoch": 0.30195278344505977, + "grad_norm": 0.21692048013210297, + "learning_rate": 0.0001399766446087972, + "loss": 1.0681, + "step": 777 + }, + { + "epoch": 0.3023413970659672, + "grad_norm": 0.21541331708431244, + "learning_rate": 0.00013989879330478786, + "loss": 1.0775, + "step": 778 + }, + { + "epoch": 0.30273001068687455, + "grad_norm": 0.21221749484539032, + "learning_rate": 0.00013982094200077851, + "loss": 1.0421, + "step": 779 + }, + { + "epoch": 0.30311862430778197, + "grad_norm": 0.22497743368148804, + "learning_rate": 0.0001397430906967692, + "loss": 1.1115, + "step": 780 + }, + { + "epoch": 0.3035072379286894, + "grad_norm": 0.1974119246006012, + "learning_rate": 0.00013966523939275985, + "loss": 1.0264, + "step": 781 + }, + { + "epoch": 0.3038958515495968, + "grad_norm": 
0.20349323749542236, + "learning_rate": 0.0001395873880887505, + "loss": 1.0512, + "step": 782 + }, + { + "epoch": 0.3042844651705042, + "grad_norm": 0.21116937696933746, + "learning_rate": 0.00013950953678474115, + "loss": 1.0135, + "step": 783 + }, + { + "epoch": 0.30467307879141164, + "grad_norm": 0.2133677899837494, + "learning_rate": 0.0001394316854807318, + "loss": 1.0694, + "step": 784 + }, + { + "epoch": 0.30506169241231906, + "grad_norm": 0.20406191051006317, + "learning_rate": 0.00013935383417672248, + "loss": 1.0179, + "step": 785 + }, + { + "epoch": 0.3054503060332265, + "grad_norm": 0.21428678929805756, + "learning_rate": 0.00013927598287271313, + "loss": 1.0577, + "step": 786 + }, + { + "epoch": 0.3058389196541339, + "grad_norm": 0.20878921449184418, + "learning_rate": 0.00013919813156870379, + "loss": 1.0311, + "step": 787 + }, + { + "epoch": 0.30622753327504126, + "grad_norm": 0.19033175706863403, + "learning_rate": 0.00013912028026469444, + "loss": 0.976, + "step": 788 + }, + { + "epoch": 0.3066161468959487, + "grad_norm": 0.22138020396232605, + "learning_rate": 0.0001390424289606851, + "loss": 1.0438, + "step": 789 + }, + { + "epoch": 0.3070047605168561, + "grad_norm": 0.20765596628189087, + "learning_rate": 0.00013896457765667577, + "loss": 1.0865, + "step": 790 + }, + { + "epoch": 0.3073933741377635, + "grad_norm": 0.209733247756958, + "learning_rate": 0.00013888672635266642, + "loss": 1.0648, + "step": 791 + }, + { + "epoch": 0.30778198775867094, + "grad_norm": 0.1896686851978302, + "learning_rate": 0.00013880887504865707, + "loss": 1.0133, + "step": 792 + }, + { + "epoch": 0.30817060137957836, + "grad_norm": 0.21651998162269592, + "learning_rate": 0.00013873102374464772, + "loss": 1.0729, + "step": 793 + }, + { + "epoch": 0.3085592150004858, + "grad_norm": 0.21751996874809265, + "learning_rate": 0.00013865317244063838, + "loss": 1.0444, + "step": 794 + }, + { + "epoch": 0.3089478286213932, + "grad_norm": 0.20593520998954773, + "learning_rate": 
0.00013857532113662906, + "loss": 1.0304, + "step": 795 + }, + { + "epoch": 0.3093364422423006, + "grad_norm": 0.19937261939048767, + "learning_rate": 0.0001384974698326197, + "loss": 1.0017, + "step": 796 + }, + { + "epoch": 0.30972505586320803, + "grad_norm": 0.18901696801185608, + "learning_rate": 0.00013841961852861036, + "loss": 1.0362, + "step": 797 + }, + { + "epoch": 0.3101136694841154, + "grad_norm": 0.2079760730266571, + "learning_rate": 0.000138341767224601, + "loss": 1.0784, + "step": 798 + }, + { + "epoch": 0.3105022831050228, + "grad_norm": 0.24873265624046326, + "learning_rate": 0.00013826391592059166, + "loss": 1.1026, + "step": 799 + }, + { + "epoch": 0.31089089672593023, + "grad_norm": 0.20185396075248718, + "learning_rate": 0.00013818606461658234, + "loss": 1.0235, + "step": 800 + }, + { + "epoch": 0.31127951034683765, + "grad_norm": 0.211393803358078, + "learning_rate": 0.000138108213312573, + "loss": 1.0999, + "step": 801 + }, + { + "epoch": 0.31166812396774507, + "grad_norm": 0.19948823750019073, + "learning_rate": 0.00013803036200856365, + "loss": 1.0242, + "step": 802 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 0.21470944583415985, + "learning_rate": 0.0001379525107045543, + "loss": 1.0736, + "step": 803 + }, + { + "epoch": 0.3124453512095599, + "grad_norm": 0.2195902317762375, + "learning_rate": 0.00013787465940054495, + "loss": 1.0368, + "step": 804 + }, + { + "epoch": 0.3128339648304673, + "grad_norm": 0.22142355144023895, + "learning_rate": 0.00013779680809653563, + "loss": 1.1022, + "step": 805 + }, + { + "epoch": 0.31322257845137474, + "grad_norm": 0.20487886667251587, + "learning_rate": 0.00013771895679252628, + "loss": 1.0478, + "step": 806 + }, + { + "epoch": 0.3136111920722821, + "grad_norm": 0.217549130320549, + "learning_rate": 0.00013764110548851693, + "loss": 1.0526, + "step": 807 + }, + { + "epoch": 0.3139998056931895, + "grad_norm": 0.20199982821941376, + "learning_rate": 0.0001375632541845076, + "loss": 0.9992, + 
"step": 808 + }, + { + "epoch": 0.31438841931409695, + "grad_norm": 0.19496634602546692, + "learning_rate": 0.00013748540288049824, + "loss": 1.0179, + "step": 809 + }, + { + "epoch": 0.31477703293500436, + "grad_norm": 0.21999460458755493, + "learning_rate": 0.0001374075515764889, + "loss": 1.0547, + "step": 810 + }, + { + "epoch": 0.3151656465559118, + "grad_norm": 0.21421074867248535, + "learning_rate": 0.00013732970027247957, + "loss": 1.0283, + "step": 811 + }, + { + "epoch": 0.3155542601768192, + "grad_norm": 0.1913364827632904, + "learning_rate": 0.00013725184896847022, + "loss": 0.9826, + "step": 812 + }, + { + "epoch": 0.3159428737977266, + "grad_norm": 0.20509806275367737, + "learning_rate": 0.00013717399766446087, + "loss": 1.0303, + "step": 813 + }, + { + "epoch": 0.31633148741863404, + "grad_norm": 0.20309868454933167, + "learning_rate": 0.00013709614636045153, + "loss": 1.0479, + "step": 814 + }, + { + "epoch": 0.31672010103954146, + "grad_norm": 0.2274443656206131, + "learning_rate": 0.0001370182950564422, + "loss": 1.1311, + "step": 815 + }, + { + "epoch": 0.3171087146604489, + "grad_norm": 0.22785170376300812, + "learning_rate": 0.00013694044375243286, + "loss": 1.1009, + "step": 816 + }, + { + "epoch": 0.31749732828135624, + "grad_norm": 0.2105439007282257, + "learning_rate": 0.0001368625924484235, + "loss": 1.0251, + "step": 817 + }, + { + "epoch": 0.31788594190226366, + "grad_norm": 0.20583970844745636, + "learning_rate": 0.00013678474114441416, + "loss": 1.0833, + "step": 818 + }, + { + "epoch": 0.3182745555231711, + "grad_norm": 0.21091191470623016, + "learning_rate": 0.00013670688984040484, + "loss": 1.071, + "step": 819 + }, + { + "epoch": 0.3186631691440785, + "grad_norm": 0.20645928382873535, + "learning_rate": 0.0001366290385363955, + "loss": 1.0605, + "step": 820 + }, + { + "epoch": 0.3190517827649859, + "grad_norm": 0.1990513950586319, + "learning_rate": 0.00013655118723238614, + "loss": 1.0461, + "step": 821 + }, + { + "epoch": 
0.31944039638589333, + "grad_norm": 0.2192249745130539, + "learning_rate": 0.00013647333592837682, + "loss": 1.0975, + "step": 822 + }, + { + "epoch": 0.31982901000680075, + "grad_norm": 0.2157617211341858, + "learning_rate": 0.00013639548462436748, + "loss": 1.091, + "step": 823 + }, + { + "epoch": 0.32021762362770817, + "grad_norm": 0.21964526176452637, + "learning_rate": 0.00013631763332035813, + "loss": 1.0286, + "step": 824 + }, + { + "epoch": 0.3206062372486156, + "grad_norm": 0.2079797089099884, + "learning_rate": 0.00013623978201634878, + "loss": 1.0257, + "step": 825 + }, + { + "epoch": 0.32099485086952295, + "grad_norm": 0.21220168471336365, + "learning_rate": 0.00013616193071233946, + "loss": 1.0046, + "step": 826 + }, + { + "epoch": 0.32138346449043037, + "grad_norm": 0.2885231673717499, + "learning_rate": 0.0001360840794083301, + "loss": 1.1442, + "step": 827 + }, + { + "epoch": 0.3217720781113378, + "grad_norm": 0.2096511274576187, + "learning_rate": 0.00013600622810432076, + "loss": 1.0209, + "step": 828 + }, + { + "epoch": 0.3221606917322452, + "grad_norm": 0.2179451286792755, + "learning_rate": 0.00013592837680031142, + "loss": 1.0548, + "step": 829 + }, + { + "epoch": 0.3225493053531526, + "grad_norm": 0.2096329927444458, + "learning_rate": 0.00013585052549630207, + "loss": 1.0279, + "step": 830 + }, + { + "epoch": 0.32293791897406005, + "grad_norm": 0.22531811892986298, + "learning_rate": 0.00013577267419229275, + "loss": 1.0463, + "step": 831 + }, + { + "epoch": 0.32332653259496746, + "grad_norm": 0.22516901791095734, + "learning_rate": 0.0001356948228882834, + "loss": 1.1127, + "step": 832 + }, + { + "epoch": 0.3237151462158749, + "grad_norm": 0.22487780451774597, + "learning_rate": 0.00013561697158427405, + "loss": 1.0707, + "step": 833 + }, + { + "epoch": 0.3241037598367823, + "grad_norm": 0.20976543426513672, + "learning_rate": 0.0001355391202802647, + "loss": 1.0217, + "step": 834 + }, + { + "epoch": 0.32449237345768966, + "grad_norm": 
0.19849295914173126, + "learning_rate": 0.00013546126897625535, + "loss": 1.021, + "step": 835 + }, + { + "epoch": 0.3248809870785971, + "grad_norm": 0.21772268414497375, + "learning_rate": 0.00013538341767224603, + "loss": 1.0605, + "step": 836 + }, + { + "epoch": 0.3252696006995045, + "grad_norm": 0.19670265913009644, + "learning_rate": 0.00013530556636823669, + "loss": 1.0165, + "step": 837 + }, + { + "epoch": 0.3256582143204119, + "grad_norm": 0.19339734315872192, + "learning_rate": 0.00013522771506422734, + "loss": 1.0203, + "step": 838 + }, + { + "epoch": 0.32604682794131934, + "grad_norm": 0.21289557218551636, + "learning_rate": 0.000135149863760218, + "loss": 1.0252, + "step": 839 + }, + { + "epoch": 0.32643544156222676, + "grad_norm": 0.1964789777994156, + "learning_rate": 0.00013507201245620864, + "loss": 1.0392, + "step": 840 + }, + { + "epoch": 0.3268240551831342, + "grad_norm": 0.20783716440200806, + "learning_rate": 0.00013499416115219932, + "loss": 1.0569, + "step": 841 + }, + { + "epoch": 0.3272126688040416, + "grad_norm": 0.22782161831855774, + "learning_rate": 0.00013491630984818997, + "loss": 1.0555, + "step": 842 + }, + { + "epoch": 0.327601282424949, + "grad_norm": 0.22771142423152924, + "learning_rate": 0.00013483845854418063, + "loss": 1.085, + "step": 843 + }, + { + "epoch": 0.32798989604585643, + "grad_norm": 0.19773711264133453, + "learning_rate": 0.00013476060724017128, + "loss": 1.008, + "step": 844 + }, + { + "epoch": 0.3283785096667638, + "grad_norm": 0.22399166226387024, + "learning_rate": 0.00013468275593616193, + "loss": 1.0511, + "step": 845 + }, + { + "epoch": 0.3287671232876712, + "grad_norm": 0.20488236844539642, + "learning_rate": 0.00013460490463215258, + "loss": 1.0883, + "step": 846 + }, + { + "epoch": 0.32915573690857863, + "grad_norm": 0.21387654542922974, + "learning_rate": 0.00013452705332814326, + "loss": 1.0808, + "step": 847 + }, + { + "epoch": 0.32954435052948605, + "grad_norm": 0.1972568780183792, + "learning_rate": 
0.0001344492020241339, + "loss": 1.0555, + "step": 848 + }, + { + "epoch": 0.32993296415039347, + "grad_norm": 0.20835663378238678, + "learning_rate": 0.00013437135072012456, + "loss": 1.0473, + "step": 849 + }, + { + "epoch": 0.3303215777713009, + "grad_norm": 0.19707520306110382, + "learning_rate": 0.00013429349941611522, + "loss": 0.9585, + "step": 850 + }, + { + "epoch": 0.3307101913922083, + "grad_norm": 0.19163411855697632, + "learning_rate": 0.00013421564811210587, + "loss": 1.0025, + "step": 851 + }, + { + "epoch": 0.3310988050131157, + "grad_norm": 0.19730083644390106, + "learning_rate": 0.00013413779680809655, + "loss": 1.0696, + "step": 852 + }, + { + "epoch": 0.33148741863402315, + "grad_norm": 0.19537493586540222, + "learning_rate": 0.0001340599455040872, + "loss": 1.0466, + "step": 853 + }, + { + "epoch": 0.3318760322549305, + "grad_norm": 0.2255164235830307, + "learning_rate": 0.00013398209420007785, + "loss": 1.0659, + "step": 854 + }, + { + "epoch": 0.3322646458758379, + "grad_norm": 0.19774770736694336, + "learning_rate": 0.0001339042428960685, + "loss": 1.0326, + "step": 855 + }, + { + "epoch": 0.33265325949674535, + "grad_norm": 0.2004510909318924, + "learning_rate": 0.00013382639159205916, + "loss": 1.0327, + "step": 856 + }, + { + "epoch": 0.33304187311765276, + "grad_norm": 0.19187591969966888, + "learning_rate": 0.00013374854028804984, + "loss": 1.0069, + "step": 857 + }, + { + "epoch": 0.3334304867385602, + "grad_norm": 0.18775832653045654, + "learning_rate": 0.0001336706889840405, + "loss": 1.0083, + "step": 858 + }, + { + "epoch": 0.3338191003594676, + "grad_norm": 0.2005717158317566, + "learning_rate": 0.00013359283768003114, + "loss": 1.0398, + "step": 859 + }, + { + "epoch": 0.334207713980375, + "grad_norm": 0.19705893099308014, + "learning_rate": 0.0001335149863760218, + "loss": 1.0031, + "step": 860 + }, + { + "epoch": 0.33459632760128244, + "grad_norm": 0.19589562714099884, + "learning_rate": 0.00013343713507201244, + "loss": 
0.9831, + "step": 861 + }, + { + "epoch": 0.33498494122218986, + "grad_norm": 0.19302591681480408, + "learning_rate": 0.00013335928376800312, + "loss": 1.0009, + "step": 862 + }, + { + "epoch": 0.3353735548430973, + "grad_norm": 0.20499618351459503, + "learning_rate": 0.00013328143246399377, + "loss": 1.0205, + "step": 863 + }, + { + "epoch": 0.33576216846400464, + "grad_norm": 0.20514456927776337, + "learning_rate": 0.00013320358115998443, + "loss": 1.0837, + "step": 864 + }, + { + "epoch": 0.33615078208491206, + "grad_norm": 0.19285848736763, + "learning_rate": 0.00013312572985597508, + "loss": 1.0167, + "step": 865 + }, + { + "epoch": 0.3365393957058195, + "grad_norm": 0.20891553163528442, + "learning_rate": 0.00013304787855196573, + "loss": 1.0127, + "step": 866 + }, + { + "epoch": 0.3369280093267269, + "grad_norm": 0.20511706173419952, + "learning_rate": 0.0001329700272479564, + "loss": 0.964, + "step": 867 + }, + { + "epoch": 0.3373166229476343, + "grad_norm": 0.1855512261390686, + "learning_rate": 0.00013289217594394706, + "loss": 0.9721, + "step": 868 + }, + { + "epoch": 0.33770523656854173, + "grad_norm": 0.20010098814964294, + "learning_rate": 0.00013281432463993771, + "loss": 1.0411, + "step": 869 + }, + { + "epoch": 0.33809385018944915, + "grad_norm": 0.1991325318813324, + "learning_rate": 0.0001327364733359284, + "loss": 0.9658, + "step": 870 + }, + { + "epoch": 0.33848246381035657, + "grad_norm": 0.19895736873149872, + "learning_rate": 0.00013265862203191905, + "loss": 1.0744, + "step": 871 + }, + { + "epoch": 0.338871077431264, + "grad_norm": 0.2091255635023117, + "learning_rate": 0.0001325807707279097, + "loss": 1.0375, + "step": 872 + }, + { + "epoch": 0.33925969105217135, + "grad_norm": 0.21355532109737396, + "learning_rate": 0.00013250291942390035, + "loss": 1.09, + "step": 873 + }, + { + "epoch": 0.33964830467307877, + "grad_norm": 0.21844851970672607, + "learning_rate": 0.00013242506811989103, + "loss": 1.0769, + "step": 874 + }, + { + "epoch": 
0.3400369182939862, + "grad_norm": 0.1877543330192566, + "learning_rate": 0.00013234721681588168, + "loss": 1.0199, + "step": 875 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.2020038366317749, + "learning_rate": 0.00013226936551187233, + "loss": 1.0218, + "step": 876 + }, + { + "epoch": 0.340814145535801, + "grad_norm": 0.20682141184806824, + "learning_rate": 0.000132191514207863, + "loss": 1.0891, + "step": 877 + }, + { + "epoch": 0.34120275915670845, + "grad_norm": 0.21942824125289917, + "learning_rate": 0.00013211366290385366, + "loss": 0.9877, + "step": 878 + }, + { + "epoch": 0.34159137277761586, + "grad_norm": 0.21150313317775726, + "learning_rate": 0.00013203581159984432, + "loss": 1.0815, + "step": 879 + }, + { + "epoch": 0.3419799863985233, + "grad_norm": 0.2073293924331665, + "learning_rate": 0.00013195796029583497, + "loss": 1.0579, + "step": 880 + }, + { + "epoch": 0.3423686000194307, + "grad_norm": 0.221574068069458, + "learning_rate": 0.00013188010899182562, + "loss": 1.0279, + "step": 881 + }, + { + "epoch": 0.3427572136403381, + "grad_norm": 0.22334492206573486, + "learning_rate": 0.00013180225768781627, + "loss": 1.0837, + "step": 882 + }, + { + "epoch": 0.3431458272612455, + "grad_norm": 0.18817654252052307, + "learning_rate": 0.00013172440638380695, + "loss": 1.0262, + "step": 883 + }, + { + "epoch": 0.3435344408821529, + "grad_norm": 0.20126822590827942, + "learning_rate": 0.0001316465550797976, + "loss": 1.0679, + "step": 884 + }, + { + "epoch": 0.3439230545030603, + "grad_norm": 0.2128864973783493, + "learning_rate": 0.00013156870377578825, + "loss": 1.0316, + "step": 885 + }, + { + "epoch": 0.34431166812396774, + "grad_norm": 0.20054499804973602, + "learning_rate": 0.0001314908524717789, + "loss": 1.0024, + "step": 886 + }, + { + "epoch": 0.34470028174487516, + "grad_norm": 0.21358034014701843, + "learning_rate": 0.00013141300116776956, + "loss": 1.0475, + "step": 887 + }, + { + "epoch": 0.3450888953657826, + "grad_norm": 
0.21377703547477722, + "learning_rate": 0.00013133514986376024, + "loss": 1.0957, + "step": 888 + }, + { + "epoch": 0.34547750898669, + "grad_norm": 0.20166514813899994, + "learning_rate": 0.0001312572985597509, + "loss": 1.0189, + "step": 889 + }, + { + "epoch": 0.3458661226075974, + "grad_norm": 0.20424878597259521, + "learning_rate": 0.00013117944725574154, + "loss": 1.0896, + "step": 890 + }, + { + "epoch": 0.34625473622850483, + "grad_norm": 0.19028648734092712, + "learning_rate": 0.0001311015959517322, + "loss": 0.9881, + "step": 891 + }, + { + "epoch": 0.3466433498494122, + "grad_norm": 0.20828665792942047, + "learning_rate": 0.00013102374464772285, + "loss": 0.9932, + "step": 892 + }, + { + "epoch": 0.3470319634703196, + "grad_norm": 0.20756572484970093, + "learning_rate": 0.00013094589334371353, + "loss": 1.0406, + "step": 893 + }, + { + "epoch": 0.34742057709122703, + "grad_norm": 0.20768921077251434, + "learning_rate": 0.00013086804203970418, + "loss": 0.9652, + "step": 894 + }, + { + "epoch": 0.34780919071213445, + "grad_norm": 0.20660027861595154, + "learning_rate": 0.00013079019073569483, + "loss": 1.0728, + "step": 895 + }, + { + "epoch": 0.34819780433304187, + "grad_norm": 0.20186837017536163, + "learning_rate": 0.00013071233943168548, + "loss": 1.0407, + "step": 896 + }, + { + "epoch": 0.3485864179539493, + "grad_norm": 0.20880667865276337, + "learning_rate": 0.00013063448812767613, + "loss": 1.0275, + "step": 897 + }, + { + "epoch": 0.3489750315748567, + "grad_norm": 0.22212949395179749, + "learning_rate": 0.0001305566368236668, + "loss": 1.0293, + "step": 898 + }, + { + "epoch": 0.3493636451957641, + "grad_norm": 0.20552745461463928, + "learning_rate": 0.00013047878551965746, + "loss": 1.0434, + "step": 899 + }, + { + "epoch": 0.34975225881667155, + "grad_norm": 0.21239839494228363, + "learning_rate": 0.00013040093421564812, + "loss": 1.052, + "step": 900 + }, + { + "epoch": 0.3501408724375789, + "grad_norm": 0.22420544922351837, + 
"learning_rate": 0.00013032308291163877, + "loss": 1.0236, + "step": 901 + }, + { + "epoch": 0.35052948605848633, + "grad_norm": 0.23435090482234955, + "learning_rate": 0.00013024523160762942, + "loss": 1.0876, + "step": 902 + }, + { + "epoch": 0.35091809967939375, + "grad_norm": 0.22763386368751526, + "learning_rate": 0.0001301673803036201, + "loss": 1.0636, + "step": 903 + }, + { + "epoch": 0.35130671330030117, + "grad_norm": 0.20948883891105652, + "learning_rate": 0.00013008952899961075, + "loss": 1.0083, + "step": 904 + }, + { + "epoch": 0.3516953269212086, + "grad_norm": 0.20408779382705688, + "learning_rate": 0.0001300116776956014, + "loss": 1.039, + "step": 905 + }, + { + "epoch": 0.352083940542116, + "grad_norm": 0.2126050591468811, + "learning_rate": 0.00012993382639159206, + "loss": 1.0365, + "step": 906 + }, + { + "epoch": 0.3524725541630234, + "grad_norm": 0.20314334332942963, + "learning_rate": 0.0001298559750875827, + "loss": 1.0474, + "step": 907 + }, + { + "epoch": 0.35286116778393084, + "grad_norm": 0.23720984160900116, + "learning_rate": 0.0001297781237835734, + "loss": 1.0529, + "step": 908 + }, + { + "epoch": 0.35324978140483826, + "grad_norm": 0.22642800211906433, + "learning_rate": 0.00012970027247956404, + "loss": 1.0586, + "step": 909 + }, + { + "epoch": 0.3536383950257457, + "grad_norm": 0.20469972491264343, + "learning_rate": 0.0001296224211755547, + "loss": 1.0267, + "step": 910 + }, + { + "epoch": 0.35402700864665304, + "grad_norm": 0.197368785738945, + "learning_rate": 0.00012954456987154534, + "loss": 1.0348, + "step": 911 + }, + { + "epoch": 0.35441562226756046, + "grad_norm": 0.21924498677253723, + "learning_rate": 0.000129466718567536, + "loss": 1.0861, + "step": 912 + }, + { + "epoch": 0.3548042358884679, + "grad_norm": 0.22006285190582275, + "learning_rate": 0.00012938886726352667, + "loss": 1.0545, + "step": 913 + }, + { + "epoch": 0.3551928495093753, + "grad_norm": 0.22419220209121704, + "learning_rate": 0.00012931101595951733, 
+ "loss": 1.0716, + "step": 914 + }, + { + "epoch": 0.3555814631302827, + "grad_norm": 0.215990349650383, + "learning_rate": 0.00012923316465550798, + "loss": 1.0619, + "step": 915 + }, + { + "epoch": 0.35597007675119013, + "grad_norm": 0.20783264935016632, + "learning_rate": 0.00012915531335149863, + "loss": 1.0412, + "step": 916 + }, + { + "epoch": 0.35635869037209755, + "grad_norm": 0.24584618210792542, + "learning_rate": 0.00012907746204748928, + "loss": 1.1165, + "step": 917 + }, + { + "epoch": 0.35674730399300497, + "grad_norm": 0.23146122694015503, + "learning_rate": 0.00012899961074347996, + "loss": 1.1111, + "step": 918 + }, + { + "epoch": 0.3571359176139124, + "grad_norm": 0.19983729720115662, + "learning_rate": 0.00012892175943947061, + "loss": 1.0674, + "step": 919 + }, + { + "epoch": 0.35752453123481975, + "grad_norm": 0.2161000818014145, + "learning_rate": 0.00012884390813546127, + "loss": 1.076, + "step": 920 + }, + { + "epoch": 0.35791314485572717, + "grad_norm": 0.21042793989181519, + "learning_rate": 0.00012876605683145192, + "loss": 1.0535, + "step": 921 + }, + { + "epoch": 0.3583017584766346, + "grad_norm": 0.20135439932346344, + "learning_rate": 0.0001286882055274426, + "loss": 1.0059, + "step": 922 + }, + { + "epoch": 0.358690372097542, + "grad_norm": 0.19394971430301666, + "learning_rate": 0.00012861035422343325, + "loss": 1.0381, + "step": 923 + }, + { + "epoch": 0.35907898571844943, + "grad_norm": 0.21171030402183533, + "learning_rate": 0.0001285325029194239, + "loss": 1.0513, + "step": 924 + }, + { + "epoch": 0.35946759933935685, + "grad_norm": 0.19476690888404846, + "learning_rate": 0.00012845465161541458, + "loss": 1.0003, + "step": 925 + }, + { + "epoch": 0.35985621296026427, + "grad_norm": 0.20468670129776, + "learning_rate": 0.00012837680031140523, + "loss": 1.0608, + "step": 926 + }, + { + "epoch": 0.3602448265811717, + "grad_norm": 0.21159446239471436, + "learning_rate": 0.00012829894900739588, + "loss": 1.0734, + "step": 927 + }, + 
{ + "epoch": 0.3606334402020791, + "grad_norm": 0.21179519593715668, + "learning_rate": 0.00012822109770338654, + "loss": 1.0957, + "step": 928 + }, + { + "epoch": 0.3610220538229865, + "grad_norm": 0.20997527241706848, + "learning_rate": 0.00012814324639937722, + "loss": 1.0644, + "step": 929 + }, + { + "epoch": 0.3614106674438939, + "grad_norm": 0.21178296208381653, + "learning_rate": 0.00012806539509536787, + "loss": 1.0208, + "step": 930 + }, + { + "epoch": 0.3617992810648013, + "grad_norm": 0.20890356600284576, + "learning_rate": 0.00012798754379135852, + "loss": 1.0888, + "step": 931 + }, + { + "epoch": 0.3621878946857087, + "grad_norm": 0.20177409052848816, + "learning_rate": 0.00012790969248734917, + "loss": 0.9741, + "step": 932 + }, + { + "epoch": 0.36257650830661614, + "grad_norm": 0.23504556715488434, + "learning_rate": 0.00012783184118333982, + "loss": 1.1048, + "step": 933 + }, + { + "epoch": 0.36296512192752356, + "grad_norm": 0.22829356789588928, + "learning_rate": 0.0001277539898793305, + "loss": 1.0798, + "step": 934 + }, + { + "epoch": 0.363353735548431, + "grad_norm": 0.2068483531475067, + "learning_rate": 0.00012767613857532116, + "loss": 1.0452, + "step": 935 + }, + { + "epoch": 0.3637423491693384, + "grad_norm": 0.2093171775341034, + "learning_rate": 0.0001275982872713118, + "loss": 1.0742, + "step": 936 + }, + { + "epoch": 0.3641309627902458, + "grad_norm": 0.21478736400604248, + "learning_rate": 0.00012752043596730246, + "loss": 1.0572, + "step": 937 + }, + { + "epoch": 0.36451957641115323, + "grad_norm": 0.1906953752040863, + "learning_rate": 0.0001274425846632931, + "loss": 1.0107, + "step": 938 + }, + { + "epoch": 0.3649081900320606, + "grad_norm": 0.20580604672431946, + "learning_rate": 0.0001273647333592838, + "loss": 1.0677, + "step": 939 + }, + { + "epoch": 0.365296803652968, + "grad_norm": 0.22586850821971893, + "learning_rate": 0.00012728688205527444, + "loss": 1.0389, + "step": 940 + }, + { + "epoch": 0.36568541727387543, + 
"grad_norm": 0.199899360537529, + "learning_rate": 0.0001272090307512651, + "loss": 1.0462, + "step": 941 + }, + { + "epoch": 0.36607403089478285, + "grad_norm": 0.19881689548492432, + "learning_rate": 0.00012713117944725575, + "loss": 1.0565, + "step": 942 + }, + { + "epoch": 0.3664626445156903, + "grad_norm": 0.21748925745487213, + "learning_rate": 0.0001270533281432464, + "loss": 1.0659, + "step": 943 + }, + { + "epoch": 0.3668512581365977, + "grad_norm": 0.19363689422607422, + "learning_rate": 0.00012697547683923708, + "loss": 1.0307, + "step": 944 + }, + { + "epoch": 0.3672398717575051, + "grad_norm": 0.21701784431934357, + "learning_rate": 0.00012689762553522773, + "loss": 1.0684, + "step": 945 + }, + { + "epoch": 0.36762848537841253, + "grad_norm": 0.21406958997249603, + "learning_rate": 0.00012681977423121838, + "loss": 1.0703, + "step": 946 + }, + { + "epoch": 0.36801709899931995, + "grad_norm": 0.23539729416370392, + "learning_rate": 0.00012674192292720903, + "loss": 1.1537, + "step": 947 + }, + { + "epoch": 0.36840571262022737, + "grad_norm": 0.2177354395389557, + "learning_rate": 0.00012666407162319969, + "loss": 1.0131, + "step": 948 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 0.255346417427063, + "learning_rate": 0.00012658622031919037, + "loss": 0.9807, + "step": 949 + }, + { + "epoch": 0.36918293986204215, + "grad_norm": 0.2139921486377716, + "learning_rate": 0.00012650836901518102, + "loss": 1.0392, + "step": 950 + }, + { + "epoch": 0.36957155348294957, + "grad_norm": 0.22490833699703217, + "learning_rate": 0.00012643051771117167, + "loss": 1.0512, + "step": 951 + }, + { + "epoch": 0.369960167103857, + "grad_norm": 0.20698820054531097, + "learning_rate": 0.00012635266640716232, + "loss": 1.0391, + "step": 952 + }, + { + "epoch": 0.3703487807247644, + "grad_norm": 0.2276201844215393, + "learning_rate": 0.00012627481510315297, + "loss": 1.0513, + "step": 953 + }, + { + "epoch": 0.3707373943456718, + "grad_norm": 0.2493600994348526, + 
"learning_rate": 0.00012619696379914365, + "loss": 1.0136, + "step": 954 + }, + { + "epoch": 0.37112600796657924, + "grad_norm": 0.2155001014471054, + "learning_rate": 0.0001261191124951343, + "loss": 1.0523, + "step": 955 + }, + { + "epoch": 0.37151462158748666, + "grad_norm": 0.21571211516857147, + "learning_rate": 0.00012604126119112496, + "loss": 1.0288, + "step": 956 + }, + { + "epoch": 0.3719032352083941, + "grad_norm": 0.23238877952098846, + "learning_rate": 0.0001259634098871156, + "loss": 1.0638, + "step": 957 + }, + { + "epoch": 0.37229184882930144, + "grad_norm": 0.2002813220024109, + "learning_rate": 0.00012588555858310626, + "loss": 0.9665, + "step": 958 + }, + { + "epoch": 0.37268046245020886, + "grad_norm": 0.21712858974933624, + "learning_rate": 0.0001258077072790969, + "loss": 1.0469, + "step": 959 + }, + { + "epoch": 0.3730690760711163, + "grad_norm": 0.2178192287683487, + "learning_rate": 0.0001257298559750876, + "loss": 1.0267, + "step": 960 + }, + { + "epoch": 0.3734576896920237, + "grad_norm": 0.25488024950027466, + "learning_rate": 0.00012565200467107824, + "loss": 1.0153, + "step": 961 + }, + { + "epoch": 0.3738463033129311, + "grad_norm": 0.20070038735866547, + "learning_rate": 0.0001255741533670689, + "loss": 1.0279, + "step": 962 + }, + { + "epoch": 0.37423491693383854, + "grad_norm": 0.21885356307029724, + "learning_rate": 0.00012549630206305955, + "loss": 1.0395, + "step": 963 + }, + { + "epoch": 0.37462353055474595, + "grad_norm": 0.2407921701669693, + "learning_rate": 0.0001254184507590502, + "loss": 1.0767, + "step": 964 + }, + { + "epoch": 0.3750121441756534, + "grad_norm": 0.20645053684711456, + "learning_rate": 0.00012534059945504088, + "loss": 1.0318, + "step": 965 + }, + { + "epoch": 0.3754007577965608, + "grad_norm": 0.21275092661380768, + "learning_rate": 0.00012526274815103153, + "loss": 1.0546, + "step": 966 + }, + { + "epoch": 0.3757893714174682, + "grad_norm": 0.21574917435646057, + "learning_rate": 0.00012518489684702218, 
+ "loss": 1.032, + "step": 967 + }, + { + "epoch": 0.3761779850383756, + "grad_norm": 0.21589480340480804, + "learning_rate": 0.00012510704554301284, + "loss": 1.0834, + "step": 968 + }, + { + "epoch": 0.376566598659283, + "grad_norm": 0.19576796889305115, + "learning_rate": 0.0001250291942390035, + "loss": 1.0178, + "step": 969 + }, + { + "epoch": 0.3769552122801904, + "grad_norm": 0.20941287279129028, + "learning_rate": 0.00012495134293499417, + "loss": 1.0712, + "step": 970 + }, + { + "epoch": 0.37734382590109783, + "grad_norm": 0.22585494816303253, + "learning_rate": 0.00012487349163098482, + "loss": 1.0401, + "step": 971 + }, + { + "epoch": 0.37773243952200525, + "grad_norm": 0.21093420684337616, + "learning_rate": 0.00012479564032697547, + "loss": 1.0569, + "step": 972 + }, + { + "epoch": 0.37812105314291267, + "grad_norm": 0.22375014424324036, + "learning_rate": 0.00012471778902296612, + "loss": 1.0687, + "step": 973 + }, + { + "epoch": 0.3785096667638201, + "grad_norm": 0.19787487387657166, + "learning_rate": 0.0001246399377189568, + "loss": 1.0266, + "step": 974 + }, + { + "epoch": 0.3788982803847275, + "grad_norm": 0.20633013546466827, + "learning_rate": 0.00012456208641494745, + "loss": 0.9996, + "step": 975 + }, + { + "epoch": 0.3792868940056349, + "grad_norm": 0.21559873223304749, + "learning_rate": 0.0001244842351109381, + "loss": 1.0851, + "step": 976 + }, + { + "epoch": 0.3796755076265423, + "grad_norm": 0.2166333943605423, + "learning_rate": 0.00012440638380692879, + "loss": 1.0859, + "step": 977 + }, + { + "epoch": 0.3800641212474497, + "grad_norm": 0.18558773398399353, + "learning_rate": 0.00012432853250291944, + "loss": 0.9534, + "step": 978 + }, + { + "epoch": 0.3804527348683571, + "grad_norm": 0.2086942344903946, + "learning_rate": 0.0001242506811989101, + "loss": 1.0786, + "step": 979 + }, + { + "epoch": 0.38084134848926454, + "grad_norm": 0.2207823544740677, + "learning_rate": 0.00012417282989490074, + "loss": 1.0626, + "step": 980 + }, + { 
+ "epoch": 0.38122996211017196, + "grad_norm": 0.21255749464035034, + "learning_rate": 0.00012409497859089142, + "loss": 1.063, + "step": 981 + }, + { + "epoch": 0.3816185757310794, + "grad_norm": 0.20682042837142944, + "learning_rate": 0.00012401712728688207, + "loss": 1.034, + "step": 982 + }, + { + "epoch": 0.3820071893519868, + "grad_norm": 0.2084134966135025, + "learning_rate": 0.00012393927598287272, + "loss": 1.0481, + "step": 983 + }, + { + "epoch": 0.3823958029728942, + "grad_norm": 0.1922312080860138, + "learning_rate": 0.00012386142467886338, + "loss": 1.0461, + "step": 984 + }, + { + "epoch": 0.38278441659380164, + "grad_norm": 0.20893707871437073, + "learning_rate": 0.00012378357337485406, + "loss": 1.0797, + "step": 985 + }, + { + "epoch": 0.383173030214709, + "grad_norm": 0.19717541337013245, + "learning_rate": 0.0001237057220708447, + "loss": 1.0028, + "step": 986 + }, + { + "epoch": 0.3835616438356164, + "grad_norm": 0.20688053965568542, + "learning_rate": 0.00012362787076683536, + "loss": 0.989, + "step": 987 + }, + { + "epoch": 0.38395025745652384, + "grad_norm": 0.20580583810806274, + "learning_rate": 0.000123550019462826, + "loss": 1.06, + "step": 988 + }, + { + "epoch": 0.38433887107743125, + "grad_norm": 0.2151709794998169, + "learning_rate": 0.00012347216815881666, + "loss": 1.0685, + "step": 989 + }, + { + "epoch": 0.3847274846983387, + "grad_norm": 0.19573980569839478, + "learning_rate": 0.00012339431685480734, + "loss": 1.0072, + "step": 990 + }, + { + "epoch": 0.3851160983192461, + "grad_norm": 0.1949119120836258, + "learning_rate": 0.000123316465550798, + "loss": 0.9995, + "step": 991 + }, + { + "epoch": 0.3855047119401535, + "grad_norm": 0.2062375247478485, + "learning_rate": 0.00012323861424678865, + "loss": 1.0694, + "step": 992 + }, + { + "epoch": 0.38589332556106093, + "grad_norm": 0.2007209211587906, + "learning_rate": 0.0001231607629427793, + "loss": 1.0397, + "step": 993 + }, + { + "epoch": 0.38628193918196835, + "grad_norm": 
0.2231544405221939, + "learning_rate": 0.00012308291163876995, + "loss": 1.0755, + "step": 994 + }, + { + "epoch": 0.38667055280287577, + "grad_norm": 0.2103337049484253, + "learning_rate": 0.0001230050603347606, + "loss": 1.0505, + "step": 995 + }, + { + "epoch": 0.38705916642378313, + "grad_norm": 0.20178386569023132, + "learning_rate": 0.00012292720903075128, + "loss": 1.0696, + "step": 996 + }, + { + "epoch": 0.38744778004469055, + "grad_norm": 0.21268007159233093, + "learning_rate": 0.00012284935772674193, + "loss": 1.0262, + "step": 997 + }, + { + "epoch": 0.38783639366559797, + "grad_norm": 0.21439722180366516, + "learning_rate": 0.0001227715064227326, + "loss": 1.0718, + "step": 998 + }, + { + "epoch": 0.3882250072865054, + "grad_norm": 0.19691336154937744, + "learning_rate": 0.00012269365511872324, + "loss": 0.9663, + "step": 999 + }, + { + "epoch": 0.3886136209074128, + "grad_norm": 0.2165926694869995, + "learning_rate": 0.0001226158038147139, + "loss": 1.0432, + "step": 1000 + }, + { + "epoch": 0.3890022345283202, + "grad_norm": 0.20730604231357574, + "learning_rate": 0.00012253795251070457, + "loss": 1.0386, + "step": 1001 + }, + { + "epoch": 0.38939084814922764, + "grad_norm": 0.2138068974018097, + "learning_rate": 0.00012246010120669522, + "loss": 1.0683, + "step": 1002 + }, + { + "epoch": 0.38977946177013506, + "grad_norm": 0.2118951678276062, + "learning_rate": 0.00012238224990268587, + "loss": 1.0393, + "step": 1003 + }, + { + "epoch": 0.3901680753910425, + "grad_norm": 0.20879961550235748, + "learning_rate": 0.00012230439859867653, + "loss": 1.0349, + "step": 1004 + }, + { + "epoch": 0.39055668901194984, + "grad_norm": 0.19588464498519897, + "learning_rate": 0.00012222654729466718, + "loss": 1.0226, + "step": 1005 + }, + { + "epoch": 0.39094530263285726, + "grad_norm": 0.2059485912322998, + "learning_rate": 0.00012214869599065786, + "loss": 1.052, + "step": 1006 + }, + { + "epoch": 0.3913339162537647, + "grad_norm": 0.2299761176109314, + 
"learning_rate": 0.0001220708446866485, + "loss": 1.1055, + "step": 1007 + }, + { + "epoch": 0.3917225298746721, + "grad_norm": 0.20196737349033356, + "learning_rate": 0.00012199299338263916, + "loss": 1.0497, + "step": 1008 + }, + { + "epoch": 0.3921111434955795, + "grad_norm": 0.20615293085575104, + "learning_rate": 0.00012191514207862981, + "loss": 1.047, + "step": 1009 + }, + { + "epoch": 0.39249975711648694, + "grad_norm": 0.20265278220176697, + "learning_rate": 0.00012183729077462047, + "loss": 1.0035, + "step": 1010 + }, + { + "epoch": 0.39288837073739435, + "grad_norm": 0.20197926461696625, + "learning_rate": 0.00012175943947061114, + "loss": 0.9847, + "step": 1011 + }, + { + "epoch": 0.3932769843583018, + "grad_norm": 0.19974152743816376, + "learning_rate": 0.0001216815881666018, + "loss": 1.0669, + "step": 1012 + }, + { + "epoch": 0.3936655979792092, + "grad_norm": 0.21684005856513977, + "learning_rate": 0.00012160373686259245, + "loss": 1.0562, + "step": 1013 + }, + { + "epoch": 0.3940542116001166, + "grad_norm": 0.2030404955148697, + "learning_rate": 0.00012152588555858311, + "loss": 1.0159, + "step": 1014 + }, + { + "epoch": 0.394442825221024, + "grad_norm": 0.2123572677373886, + "learning_rate": 0.00012144803425457377, + "loss": 1.0757, + "step": 1015 + }, + { + "epoch": 0.3948314388419314, + "grad_norm": 0.20320011675357819, + "learning_rate": 0.00012137018295056443, + "loss": 1.038, + "step": 1016 + }, + { + "epoch": 0.3952200524628388, + "grad_norm": 0.20120739936828613, + "learning_rate": 0.00012129233164655508, + "loss": 1.1015, + "step": 1017 + }, + { + "epoch": 0.39560866608374623, + "grad_norm": 0.19862449169158936, + "learning_rate": 0.00012121448034254575, + "loss": 1.0328, + "step": 1018 + }, + { + "epoch": 0.39599727970465365, + "grad_norm": 0.19761312007904053, + "learning_rate": 0.0001211366290385364, + "loss": 0.997, + "step": 1019 + }, + { + "epoch": 0.39638589332556107, + "grad_norm": 0.1943569928407669, + "learning_rate": 
0.00012105877773452705, + "loss": 1.0099, + "step": 1020 + }, + { + "epoch": 0.3967745069464685, + "grad_norm": 0.2109062373638153, + "learning_rate": 0.00012098092643051773, + "loss": 1.1039, + "step": 1021 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 0.20966266095638275, + "learning_rate": 0.00012090307512650839, + "loss": 1.1208, + "step": 1022 + }, + { + "epoch": 0.3975517341882833, + "grad_norm": 0.19208088517189026, + "learning_rate": 0.00012082522382249904, + "loss": 1.0147, + "step": 1023 + }, + { + "epoch": 0.3979403478091907, + "grad_norm": 0.21821236610412598, + "learning_rate": 0.00012074737251848969, + "loss": 1.0615, + "step": 1024 + }, + { + "epoch": 0.3983289614300981, + "grad_norm": 0.20031368732452393, + "learning_rate": 0.00012066952121448034, + "loss": 1.0303, + "step": 1025 + }, + { + "epoch": 0.3987175750510055, + "grad_norm": 0.22910597920417786, + "learning_rate": 0.00012059166991047102, + "loss": 1.0182, + "step": 1026 + }, + { + "epoch": 0.39910618867191294, + "grad_norm": 0.20816978812217712, + "learning_rate": 0.00012051381860646167, + "loss": 1.0142, + "step": 1027 + }, + { + "epoch": 0.39949480229282036, + "grad_norm": 0.20989780128002167, + "learning_rate": 0.00012043596730245232, + "loss": 1.0676, + "step": 1028 + }, + { + "epoch": 0.3998834159137278, + "grad_norm": 0.21894055604934692, + "learning_rate": 0.00012035811599844298, + "loss": 1.0222, + "step": 1029 + }, + { + "epoch": 0.4002720295346352, + "grad_norm": 0.2170870155096054, + "learning_rate": 0.00012028026469443363, + "loss": 1.0319, + "step": 1030 + }, + { + "epoch": 0.4006606431555426, + "grad_norm": 0.20869679749011993, + "learning_rate": 0.00012020241339042428, + "loss": 1.055, + "step": 1031 + }, + { + "epoch": 0.40104925677645004, + "grad_norm": 0.18850640952587128, + "learning_rate": 0.00012012456208641496, + "loss": 0.9993, + "step": 1032 + }, + { + "epoch": 0.40143787039735745, + "grad_norm": 0.21462580561637878, + "learning_rate": 
0.00012004671078240561, + "loss": 1.0115, + "step": 1033 + }, + { + "epoch": 0.4018264840182648, + "grad_norm": 0.2008499950170517, + "learning_rate": 0.00011996885947839626, + "loss": 1.0229, + "step": 1034 + }, + { + "epoch": 0.40221509763917224, + "grad_norm": 0.20063354074954987, + "learning_rate": 0.00011989100817438692, + "loss": 1.0295, + "step": 1035 + }, + { + "epoch": 0.40260371126007966, + "grad_norm": 0.20655786991119385, + "learning_rate": 0.00011981315687037757, + "loss": 1.0044, + "step": 1036 + }, + { + "epoch": 0.4029923248809871, + "grad_norm": 0.1985999196767807, + "learning_rate": 0.00011973530556636825, + "loss": 1.0063, + "step": 1037 + }, + { + "epoch": 0.4033809385018945, + "grad_norm": 0.2039060890674591, + "learning_rate": 0.0001196574542623589, + "loss": 1.044, + "step": 1038 + }, + { + "epoch": 0.4037695521228019, + "grad_norm": 0.21838189661502838, + "learning_rate": 0.00011957960295834955, + "loss": 1.1101, + "step": 1039 + }, + { + "epoch": 0.40415816574370933, + "grad_norm": 0.21508415043354034, + "learning_rate": 0.00011950175165434022, + "loss": 1.0764, + "step": 1040 + }, + { + "epoch": 0.40454677936461675, + "grad_norm": 0.2089119255542755, + "learning_rate": 0.00011942390035033087, + "loss": 0.9986, + "step": 1041 + }, + { + "epoch": 0.40493539298552417, + "grad_norm": 0.19859452545642853, + "learning_rate": 0.00011934604904632153, + "loss": 1.0122, + "step": 1042 + }, + { + "epoch": 0.40532400660643153, + "grad_norm": 0.2018653154373169, + "learning_rate": 0.00011926819774231219, + "loss": 1.0187, + "step": 1043 + }, + { + "epoch": 0.40571262022733895, + "grad_norm": 0.19892063736915588, + "learning_rate": 0.00011919034643830285, + "loss": 1.0029, + "step": 1044 + }, + { + "epoch": 0.40610123384824637, + "grad_norm": 0.20355650782585144, + "learning_rate": 0.0001191124951342935, + "loss": 1.0484, + "step": 1045 + }, + { + "epoch": 0.4064898474691538, + "grad_norm": 0.2033994495868683, + "learning_rate": 0.00011903464383028416, 
+ "loss": 1.087, + "step": 1046 + }, + { + "epoch": 0.4068784610900612, + "grad_norm": 0.2047330141067505, + "learning_rate": 0.00011895679252627484, + "loss": 1.0774, + "step": 1047 + }, + { + "epoch": 0.4072670747109686, + "grad_norm": 0.21420112252235413, + "learning_rate": 0.00011887894122226549, + "loss": 1.0252, + "step": 1048 + }, + { + "epoch": 0.40765568833187604, + "grad_norm": 0.2030097395181656, + "learning_rate": 0.00011880108991825614, + "loss": 1.0501, + "step": 1049 + }, + { + "epoch": 0.40804430195278346, + "grad_norm": 0.2128026783466339, + "learning_rate": 0.00011872323861424679, + "loss": 1.1031, + "step": 1050 + }, + { + "epoch": 0.4084329155736909, + "grad_norm": 0.20724938809871674, + "learning_rate": 0.00011864538731023744, + "loss": 1.0327, + "step": 1051 + }, + { + "epoch": 0.40882152919459824, + "grad_norm": 0.20344072580337524, + "learning_rate": 0.00011856753600622812, + "loss": 1.0719, + "step": 1052 + }, + { + "epoch": 0.40921014281550566, + "grad_norm": 0.2145012468099594, + "learning_rate": 0.00011848968470221877, + "loss": 1.0582, + "step": 1053 + }, + { + "epoch": 0.4095987564364131, + "grad_norm": 0.220048725605011, + "learning_rate": 0.00011841183339820943, + "loss": 1.0825, + "step": 1054 + }, + { + "epoch": 0.4099873700573205, + "grad_norm": 0.19074465334415436, + "learning_rate": 0.00011833398209420008, + "loss": 0.9657, + "step": 1055 + }, + { + "epoch": 0.4103759836782279, + "grad_norm": 0.1958267241716385, + "learning_rate": 0.00011825613079019073, + "loss": 0.9864, + "step": 1056 + }, + { + "epoch": 0.41076459729913534, + "grad_norm": 0.21768233180046082, + "learning_rate": 0.00011817827948618141, + "loss": 0.9997, + "step": 1057 + }, + { + "epoch": 0.41115321092004276, + "grad_norm": 0.20218704640865326, + "learning_rate": 0.00011810042818217206, + "loss": 1.072, + "step": 1058 + }, + { + "epoch": 0.4115418245409502, + "grad_norm": 0.2035023719072342, + "learning_rate": 0.00011802257687816271, + "loss": 1.0415, + "step": 
1059 + }, + { + "epoch": 0.4119304381618576, + "grad_norm": 0.22603970766067505, + "learning_rate": 0.00011794472557415337, + "loss": 1.0751, + "step": 1060 + }, + { + "epoch": 0.412319051782765, + "grad_norm": 0.2125842273235321, + "learning_rate": 0.00011786687427014402, + "loss": 1.0727, + "step": 1061 + }, + { + "epoch": 0.4127076654036724, + "grad_norm": 0.2005981206893921, + "learning_rate": 0.0001177890229661347, + "loss": 1.0191, + "step": 1062 + }, + { + "epoch": 0.4130962790245798, + "grad_norm": 0.22252701222896576, + "learning_rate": 0.00011771117166212535, + "loss": 1.0591, + "step": 1063 + }, + { + "epoch": 0.4134848926454872, + "grad_norm": 0.22205251455307007, + "learning_rate": 0.000117633320358116, + "loss": 1.1198, + "step": 1064 + }, + { + "epoch": 0.41387350626639463, + "grad_norm": 0.20037783682346344, + "learning_rate": 0.00011755546905410665, + "loss": 1.0548, + "step": 1065 + }, + { + "epoch": 0.41426211988730205, + "grad_norm": 0.21737834811210632, + "learning_rate": 0.00011747761775009732, + "loss": 1.0922, + "step": 1066 + }, + { + "epoch": 0.41465073350820947, + "grad_norm": 0.19312533736228943, + "learning_rate": 0.00011739976644608798, + "loss": 0.9836, + "step": 1067 + }, + { + "epoch": 0.4150393471291169, + "grad_norm": 0.22055000066757202, + "learning_rate": 0.00011732191514207864, + "loss": 1.0383, + "step": 1068 + }, + { + "epoch": 0.4154279607500243, + "grad_norm": 0.22623857855796814, + "learning_rate": 0.0001172440638380693, + "loss": 1.0704, + "step": 1069 + }, + { + "epoch": 0.4158165743709317, + "grad_norm": 0.21481367945671082, + "learning_rate": 0.00011716621253405995, + "loss": 1.052, + "step": 1070 + }, + { + "epoch": 0.4162051879918391, + "grad_norm": 0.21022087335586548, + "learning_rate": 0.0001170883612300506, + "loss": 1.1021, + "step": 1071 + }, + { + "epoch": 0.4165938016127465, + "grad_norm": 0.2154620885848999, + "learning_rate": 0.00011701050992604126, + "loss": 1.0128, + "step": 1072 + }, + { + "epoch": 
0.4169824152336539, + "grad_norm": 0.20545578002929688, + "learning_rate": 0.00011693265862203194, + "loss": 1.0058, + "step": 1073 + }, + { + "epoch": 0.41737102885456134, + "grad_norm": 0.21726195514202118, + "learning_rate": 0.00011685480731802259, + "loss": 1.0753, + "step": 1074 + }, + { + "epoch": 0.41775964247546876, + "grad_norm": 0.2067115604877472, + "learning_rate": 0.00011677695601401324, + "loss": 1.0594, + "step": 1075 + }, + { + "epoch": 0.4181482560963762, + "grad_norm": 0.23024648427963257, + "learning_rate": 0.0001166991047100039, + "loss": 1.1039, + "step": 1076 + }, + { + "epoch": 0.4185368697172836, + "grad_norm": 0.20692144334316254, + "learning_rate": 0.00011662125340599455, + "loss": 1.0598, + "step": 1077 + }, + { + "epoch": 0.418925483338191, + "grad_norm": 0.19839999079704285, + "learning_rate": 0.00011654340210198522, + "loss": 1.054, + "step": 1078 + }, + { + "epoch": 0.41931409695909844, + "grad_norm": 0.19227825105190277, + "learning_rate": 0.00011646555079797588, + "loss": 0.9453, + "step": 1079 + }, + { + "epoch": 0.41970271058000586, + "grad_norm": 0.2112567275762558, + "learning_rate": 0.00011638769949396653, + "loss": 1.023, + "step": 1080 + }, + { + "epoch": 0.4200913242009132, + "grad_norm": 0.185299351811409, + "learning_rate": 0.00011630984818995718, + "loss": 0.9752, + "step": 1081 + }, + { + "epoch": 0.42047993782182064, + "grad_norm": 0.20148858428001404, + "learning_rate": 0.00011623199688594783, + "loss": 1.0659, + "step": 1082 + }, + { + "epoch": 0.42086855144272806, + "grad_norm": 0.1935974359512329, + "learning_rate": 0.00011615414558193851, + "loss": 1.0116, + "step": 1083 + }, + { + "epoch": 0.4212571650636355, + "grad_norm": 0.20433953404426575, + "learning_rate": 0.00011607629427792916, + "loss": 1.0671, + "step": 1084 + }, + { + "epoch": 0.4216457786845429, + "grad_norm": 0.20729799568653107, + "learning_rate": 0.00011599844297391982, + "loss": 1.0341, + "step": 1085 + }, + { + "epoch": 0.4220343923054503, + 
"grad_norm": 0.2126002460718155, + "learning_rate": 0.00011592059166991047, + "loss": 1.0188, + "step": 1086 + }, + { + "epoch": 0.42242300592635773, + "grad_norm": 0.19453707337379456, + "learning_rate": 0.00011584274036590112, + "loss": 1.0331, + "step": 1087 + }, + { + "epoch": 0.42281161954726515, + "grad_norm": 0.20909856259822845, + "learning_rate": 0.0001157648890618918, + "loss": 0.9984, + "step": 1088 + }, + { + "epoch": 0.42320023316817257, + "grad_norm": 0.19596272706985474, + "learning_rate": 0.00011568703775788245, + "loss": 1.0121, + "step": 1089 + }, + { + "epoch": 0.42358884678907993, + "grad_norm": 0.22045716643333435, + "learning_rate": 0.0001156091864538731, + "loss": 1.0591, + "step": 1090 + }, + { + "epoch": 0.42397746040998735, + "grad_norm": 0.22624897956848145, + "learning_rate": 0.00011553133514986376, + "loss": 1.0565, + "step": 1091 + }, + { + "epoch": 0.42436607403089477, + "grad_norm": 0.20263417065143585, + "learning_rate": 0.00011545348384585442, + "loss": 1.024, + "step": 1092 + }, + { + "epoch": 0.4247546876518022, + "grad_norm": 0.20179417729377747, + "learning_rate": 0.00011537563254184509, + "loss": 0.9806, + "step": 1093 + }, + { + "epoch": 0.4251433012727096, + "grad_norm": 0.30221593379974365, + "learning_rate": 0.00011529778123783574, + "loss": 1.0683, + "step": 1094 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.21195146441459656, + "learning_rate": 0.0001152199299338264, + "loss": 1.1283, + "step": 1095 + }, + { + "epoch": 0.42592052851452444, + "grad_norm": 0.21860192716121674, + "learning_rate": 0.00011514207862981706, + "loss": 1.0046, + "step": 1096 + }, + { + "epoch": 0.42630914213543186, + "grad_norm": 0.2234150469303131, + "learning_rate": 0.00011506422732580771, + "loss": 1.0461, + "step": 1097 + }, + { + "epoch": 0.4266977557563393, + "grad_norm": 0.21535125374794006, + "learning_rate": 0.00011498637602179837, + "loss": 1.0593, + "step": 1098 + }, + { + "epoch": 0.4270863693772467, + "grad_norm": 
0.19313789904117584, + "learning_rate": 0.00011490852471778904, + "loss": 1.0357, + "step": 1099 + }, + { + "epoch": 0.42747498299815406, + "grad_norm": 0.19886989891529083, + "learning_rate": 0.00011483067341377969, + "loss": 0.9946, + "step": 1100 + }, + { + "epoch": 0.4278635966190615, + "grad_norm": 0.21028490364551544, + "learning_rate": 0.00011475282210977034, + "loss": 1.0765, + "step": 1101 + }, + { + "epoch": 0.4282522102399689, + "grad_norm": 0.2066621333360672, + "learning_rate": 0.000114674970805761, + "loss": 1.0405, + "step": 1102 + }, + { + "epoch": 0.4286408238608763, + "grad_norm": 0.18400220572948456, + "learning_rate": 0.00011459711950175168, + "loss": 0.9404, + "step": 1103 + }, + { + "epoch": 0.42902943748178374, + "grad_norm": 0.2058599591255188, + "learning_rate": 0.00011451926819774233, + "loss": 1.0505, + "step": 1104 + }, + { + "epoch": 0.42941805110269116, + "grad_norm": 0.19696786999702454, + "learning_rate": 0.00011444141689373298, + "loss": 1.032, + "step": 1105 + }, + { + "epoch": 0.4298066647235986, + "grad_norm": 0.2082854062318802, + "learning_rate": 0.00011436356558972363, + "loss": 1.0914, + "step": 1106 + }, + { + "epoch": 0.430195278344506, + "grad_norm": 0.20155015587806702, + "learning_rate": 0.00011428571428571428, + "loss": 1.0541, + "step": 1107 + }, + { + "epoch": 0.4305838919654134, + "grad_norm": 0.23419982194900513, + "learning_rate": 0.00011420786298170494, + "loss": 1.0684, + "step": 1108 + }, + { + "epoch": 0.4309725055863208, + "grad_norm": 0.23493975400924683, + "learning_rate": 0.00011413001167769561, + "loss": 1.0509, + "step": 1109 + }, + { + "epoch": 0.4313611192072282, + "grad_norm": 0.2089843600988388, + "learning_rate": 0.00011405216037368627, + "loss": 1.0479, + "step": 1110 + }, + { + "epoch": 0.4317497328281356, + "grad_norm": 0.21076850593090057, + "learning_rate": 0.00011397430906967692, + "loss": 1.064, + "step": 1111 + }, + { + "epoch": 0.43213834644904303, + "grad_norm": 0.20307987928390503, + 
"learning_rate": 0.00011389645776566757, + "loss": 1.0416, + "step": 1112 + }, + { + "epoch": 0.43252696006995045, + "grad_norm": 0.20955562591552734, + "learning_rate": 0.00011381860646165822, + "loss": 1.0158, + "step": 1113 + }, + { + "epoch": 0.43291557369085787, + "grad_norm": 0.2074531465768814, + "learning_rate": 0.0001137407551576489, + "loss": 1.0486, + "step": 1114 + }, + { + "epoch": 0.4333041873117653, + "grad_norm": 0.20907235145568848, + "learning_rate": 0.00011366290385363955, + "loss": 1.0352, + "step": 1115 + }, + { + "epoch": 0.4336928009326727, + "grad_norm": 0.21726477146148682, + "learning_rate": 0.0001135850525496302, + "loss": 1.0068, + "step": 1116 + }, + { + "epoch": 0.4340814145535801, + "grad_norm": 0.20231984555721283, + "learning_rate": 0.00011350720124562086, + "loss": 0.9757, + "step": 1117 + }, + { + "epoch": 0.4344700281744875, + "grad_norm": 0.23485834896564484, + "learning_rate": 0.00011342934994161152, + "loss": 1.0681, + "step": 1118 + }, + { + "epoch": 0.4348586417953949, + "grad_norm": 0.21286556124687195, + "learning_rate": 0.00011335149863760219, + "loss": 1.0399, + "step": 1119 + }, + { + "epoch": 0.4352472554163023, + "grad_norm": 0.2097872495651245, + "learning_rate": 0.00011327364733359284, + "loss": 1.0435, + "step": 1120 + }, + { + "epoch": 0.43563586903720974, + "grad_norm": 0.2224377542734146, + "learning_rate": 0.00011319579602958351, + "loss": 1.1664, + "step": 1121 + }, + { + "epoch": 0.43602448265811716, + "grad_norm": 0.19213411211967468, + "learning_rate": 0.00011311794472557416, + "loss": 1.0424, + "step": 1122 + }, + { + "epoch": 0.4364130962790246, + "grad_norm": 0.20974959433078766, + "learning_rate": 0.00011304009342156481, + "loss": 1.0943, + "step": 1123 + }, + { + "epoch": 0.436801709899932, + "grad_norm": 0.19943708181381226, + "learning_rate": 0.00011296224211755549, + "loss": 1.0652, + "step": 1124 + }, + { + "epoch": 0.4371903235208394, + "grad_norm": 0.1832750141620636, + "learning_rate": 
0.00011288439081354614, + "loss": 0.9883, + "step": 1125 + }, + { + "epoch": 0.43757893714174684, + "grad_norm": 0.2205052226781845, + "learning_rate": 0.0001128065395095368, + "loss": 1.0733, + "step": 1126 + }, + { + "epoch": 0.43796755076265426, + "grad_norm": 0.2082854062318802, + "learning_rate": 0.00011272868820552745, + "loss": 1.0141, + "step": 1127 + }, + { + "epoch": 0.4383561643835616, + "grad_norm": 0.22755026817321777, + "learning_rate": 0.0001126508369015181, + "loss": 1.0942, + "step": 1128 + }, + { + "epoch": 0.43874477800446904, + "grad_norm": 0.2098863571882248, + "learning_rate": 0.00011257298559750878, + "loss": 0.9987, + "step": 1129 + }, + { + "epoch": 0.43913339162537646, + "grad_norm": 0.20559263229370117, + "learning_rate": 0.00011249513429349943, + "loss": 1.0345, + "step": 1130 + }, + { + "epoch": 0.4395220052462839, + "grad_norm": 0.21955084800720215, + "learning_rate": 0.00011241728298949008, + "loss": 1.1068, + "step": 1131 + }, + { + "epoch": 0.4399106188671913, + "grad_norm": 0.21353478729724884, + "learning_rate": 0.00011233943168548073, + "loss": 1.0094, + "step": 1132 + }, + { + "epoch": 0.4402992324880987, + "grad_norm": 0.19822491705417633, + "learning_rate": 0.00011226158038147139, + "loss": 0.9758, + "step": 1133 + }, + { + "epoch": 0.44068784610900613, + "grad_norm": 0.20079441368579865, + "learning_rate": 0.00011218372907746206, + "loss": 1.0202, + "step": 1134 + }, + { + "epoch": 0.44107645972991355, + "grad_norm": 0.2261926829814911, + "learning_rate": 0.00011210587777345272, + "loss": 0.9877, + "step": 1135 + }, + { + "epoch": 0.44146507335082097, + "grad_norm": 0.2264915257692337, + "learning_rate": 0.00011202802646944337, + "loss": 0.9887, + "step": 1136 + }, + { + "epoch": 0.44185368697172833, + "grad_norm": 0.21853779256343842, + "learning_rate": 0.00011195017516543402, + "loss": 1.0535, + "step": 1137 + }, + { + "epoch": 0.44224230059263575, + "grad_norm": 0.21332694590091705, + "learning_rate": 
0.00011187232386142467, + "loss": 1.0824, + "step": 1138 + }, + { + "epoch": 0.44263091421354317, + "grad_norm": 0.21350236237049103, + "learning_rate": 0.00011179447255741535, + "loss": 1.0758, + "step": 1139 + }, + { + "epoch": 0.4430195278344506, + "grad_norm": 0.21305765211582184, + "learning_rate": 0.000111716621253406, + "loss": 1.035, + "step": 1140 + }, + { + "epoch": 0.443408141455358, + "grad_norm": 0.20486389100551605, + "learning_rate": 0.00011163876994939666, + "loss": 1.0413, + "step": 1141 + }, + { + "epoch": 0.4437967550762654, + "grad_norm": 0.19255472719669342, + "learning_rate": 0.00011156091864538731, + "loss": 0.9583, + "step": 1142 + }, + { + "epoch": 0.44418536869717284, + "grad_norm": 0.19824008643627167, + "learning_rate": 0.00011148306734137796, + "loss": 1.0331, + "step": 1143 + }, + { + "epoch": 0.44457398231808026, + "grad_norm": 0.20308080315589905, + "learning_rate": 0.00011140521603736863, + "loss": 1.0399, + "step": 1144 + }, + { + "epoch": 0.4449625959389877, + "grad_norm": 0.2193964123725891, + "learning_rate": 0.00011132736473335929, + "loss": 1.063, + "step": 1145 + }, + { + "epoch": 0.4453512095598951, + "grad_norm": 0.2151576578617096, + "learning_rate": 0.00011124951342934994, + "loss": 1.0795, + "step": 1146 + }, + { + "epoch": 0.44573982318080246, + "grad_norm": 0.23056697845458984, + "learning_rate": 0.00011117166212534061, + "loss": 1.0351, + "step": 1147 + }, + { + "epoch": 0.4461284368017099, + "grad_norm": 0.1973094493150711, + "learning_rate": 0.00011109381082133126, + "loss": 0.9866, + "step": 1148 + }, + { + "epoch": 0.4465170504226173, + "grad_norm": 0.2119562178850174, + "learning_rate": 0.00011101595951732191, + "loss": 1.0591, + "step": 1149 + }, + { + "epoch": 0.4469056640435247, + "grad_norm": 0.20407763123512268, + "learning_rate": 0.00011093810821331259, + "loss": 0.988, + "step": 1150 + }, + { + "epoch": 0.44729427766443214, + "grad_norm": 0.19474107027053833, + "learning_rate": 0.00011086025690930324, + 
"loss": 0.9729, + "step": 1151 + }, + { + "epoch": 0.44768289128533956, + "grad_norm": 0.2179928421974182, + "learning_rate": 0.0001107824056052939, + "loss": 1.0558, + "step": 1152 + }, + { + "epoch": 0.448071504906247, + "grad_norm": 0.44306451082229614, + "learning_rate": 0.00011070455430128455, + "loss": 1.0901, + "step": 1153 + }, + { + "epoch": 0.4484601185271544, + "grad_norm": 0.22060540318489075, + "learning_rate": 0.0001106267029972752, + "loss": 1.0009, + "step": 1154 + }, + { + "epoch": 0.4488487321480618, + "grad_norm": 0.20534972846508026, + "learning_rate": 0.00011054885169326588, + "loss": 0.9741, + "step": 1155 + }, + { + "epoch": 0.4492373457689692, + "grad_norm": 0.19488993287086487, + "learning_rate": 0.00011047100038925653, + "loss": 1.0, + "step": 1156 + }, + { + "epoch": 0.4496259593898766, + "grad_norm": 0.20462395250797272, + "learning_rate": 0.00011039314908524718, + "loss": 1.0309, + "step": 1157 + }, + { + "epoch": 0.450014573010784, + "grad_norm": 0.2170749306678772, + "learning_rate": 0.00011031529778123784, + "loss": 1.0726, + "step": 1158 + }, + { + "epoch": 0.45040318663169143, + "grad_norm": 0.2066730111837387, + "learning_rate": 0.00011023744647722849, + "loss": 1.0227, + "step": 1159 + }, + { + "epoch": 0.45079180025259885, + "grad_norm": 0.20625676214694977, + "learning_rate": 0.00011015959517321917, + "loss": 1.0287, + "step": 1160 + }, + { + "epoch": 0.45118041387350627, + "grad_norm": 0.19483047723770142, + "learning_rate": 0.00011008174386920982, + "loss": 0.9639, + "step": 1161 + }, + { + "epoch": 0.4515690274944137, + "grad_norm": 0.24705417454242706, + "learning_rate": 0.00011000389256520047, + "loss": 0.9903, + "step": 1162 + }, + { + "epoch": 0.4519576411153211, + "grad_norm": 0.2109205424785614, + "learning_rate": 0.00010992604126119112, + "loss": 1.054, + "step": 1163 + }, + { + "epoch": 0.4523462547362285, + "grad_norm": 0.20904991030693054, + "learning_rate": 0.00010984818995718178, + "loss": 1.0416, + "step": 1164 
+ }, + { + "epoch": 0.45273486835713594, + "grad_norm": 0.19841328263282776, + "learning_rate": 0.00010977033865317245, + "loss": 0.9986, + "step": 1165 + }, + { + "epoch": 0.4531234819780433, + "grad_norm": 0.20545506477355957, + "learning_rate": 0.0001096924873491631, + "loss": 1.0337, + "step": 1166 + }, + { + "epoch": 0.4535120955989507, + "grad_norm": 0.208644837141037, + "learning_rate": 0.00010961463604515376, + "loss": 1.0304, + "step": 1167 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 0.2111911028623581, + "learning_rate": 0.00010953678474114441, + "loss": 1.0398, + "step": 1168 + }, + { + "epoch": 0.45428932284076556, + "grad_norm": 0.2600184381008148, + "learning_rate": 0.00010945893343713506, + "loss": 1.0509, + "step": 1169 + }, + { + "epoch": 0.454677936461673, + "grad_norm": 0.2059030532836914, + "learning_rate": 0.00010938108213312574, + "loss": 0.9347, + "step": 1170 + }, + { + "epoch": 0.4550665500825804, + "grad_norm": 0.19232551753520966, + "learning_rate": 0.0001093032308291164, + "loss": 1.0162, + "step": 1171 + }, + { + "epoch": 0.4554551637034878, + "grad_norm": 0.19147330522537231, + "learning_rate": 0.00010922537952510705, + "loss": 0.9872, + "step": 1172 + }, + { + "epoch": 0.45584377732439524, + "grad_norm": 0.2599676251411438, + "learning_rate": 0.00010914752822109771, + "loss": 1.0402, + "step": 1173 + }, + { + "epoch": 0.45623239094530266, + "grad_norm": 0.2159397304058075, + "learning_rate": 0.00010906967691708836, + "loss": 1.0411, + "step": 1174 + }, + { + "epoch": 0.45662100456621, + "grad_norm": 0.23864266276359558, + "learning_rate": 0.00010899182561307903, + "loss": 1.054, + "step": 1175 + }, + { + "epoch": 0.45700961818711744, + "grad_norm": 0.2027217596769333, + "learning_rate": 0.0001089139743090697, + "loss": 0.9713, + "step": 1176 + }, + { + "epoch": 0.45739823180802486, + "grad_norm": 0.1837588995695114, + "learning_rate": 0.00010883612300506035, + "loss": 0.9698, + "step": 1177 + }, + { + "epoch": 
0.4577868454289323, + "grad_norm": 0.20038527250289917, + "learning_rate": 0.000108758271701051, + "loss": 1.0456, + "step": 1178 + }, + { + "epoch": 0.4581754590498397, + "grad_norm": 0.21525044739246368, + "learning_rate": 0.00010868042039704165, + "loss": 1.021, + "step": 1179 + }, + { + "epoch": 0.4585640726707471, + "grad_norm": 0.18813730776309967, + "learning_rate": 0.0001086025690930323, + "loss": 0.9673, + "step": 1180 + }, + { + "epoch": 0.45895268629165453, + "grad_norm": 0.2056179642677307, + "learning_rate": 0.00010852471778902298, + "loss": 1.0119, + "step": 1181 + }, + { + "epoch": 0.45934129991256195, + "grad_norm": 0.21599683165550232, + "learning_rate": 0.00010844686648501363, + "loss": 1.0537, + "step": 1182 + }, + { + "epoch": 0.45972991353346937, + "grad_norm": 0.19750265777111053, + "learning_rate": 0.00010836901518100429, + "loss": 1.0203, + "step": 1183 + }, + { + "epoch": 0.4601185271543768, + "grad_norm": 0.22186161577701569, + "learning_rate": 0.00010829116387699494, + "loss": 1.0583, + "step": 1184 + }, + { + "epoch": 0.46050714077528415, + "grad_norm": 0.2109905481338501, + "learning_rate": 0.00010821331257298559, + "loss": 1.0022, + "step": 1185 + }, + { + "epoch": 0.46089575439619157, + "grad_norm": 0.2032858431339264, + "learning_rate": 0.00010813546126897627, + "loss": 0.9774, + "step": 1186 + }, + { + "epoch": 0.461284368017099, + "grad_norm": 0.20381197333335876, + "learning_rate": 0.00010805760996496692, + "loss": 0.9768, + "step": 1187 + }, + { + "epoch": 0.4616729816380064, + "grad_norm": 0.20488987863063812, + "learning_rate": 0.00010797975866095757, + "loss": 1.0448, + "step": 1188 + }, + { + "epoch": 0.4620615952589138, + "grad_norm": 0.20257477462291718, + "learning_rate": 0.00010790190735694823, + "loss": 1.0157, + "step": 1189 + }, + { + "epoch": 0.46245020887982125, + "grad_norm": 0.20761239528656006, + "learning_rate": 0.00010782405605293888, + "loss": 1.0328, + "step": 1190 + }, + { + "epoch": 0.46283882250072866, + 
"grad_norm": 0.22062581777572632, + "learning_rate": 0.00010774620474892956, + "loss": 1.0362, + "step": 1191 + }, + { + "epoch": 0.4632274361216361, + "grad_norm": 0.19970272481441498, + "learning_rate": 0.00010766835344492021, + "loss": 1.0783, + "step": 1192 + }, + { + "epoch": 0.4636160497425435, + "grad_norm": 0.2221893072128296, + "learning_rate": 0.00010759050214091086, + "loss": 1.0136, + "step": 1193 + }, + { + "epoch": 0.46400466336345086, + "grad_norm": 0.2124665081501007, + "learning_rate": 0.00010751265083690151, + "loss": 1.0528, + "step": 1194 + }, + { + "epoch": 0.4643932769843583, + "grad_norm": 0.2001204937696457, + "learning_rate": 0.00010743479953289218, + "loss": 1.0495, + "step": 1195 + }, + { + "epoch": 0.4647818906052657, + "grad_norm": 0.20979635417461395, + "learning_rate": 0.00010735694822888284, + "loss": 1.0664, + "step": 1196 + }, + { + "epoch": 0.4651705042261731, + "grad_norm": 0.190982848405838, + "learning_rate": 0.0001072790969248735, + "loss": 1.0256, + "step": 1197 + }, + { + "epoch": 0.46555911784708054, + "grad_norm": 0.19910745322704315, + "learning_rate": 0.00010720124562086415, + "loss": 1.0263, + "step": 1198 + }, + { + "epoch": 0.46594773146798796, + "grad_norm": 0.21624085307121277, + "learning_rate": 0.00010712339431685481, + "loss": 1.0768, + "step": 1199 + }, + { + "epoch": 0.4663363450888954, + "grad_norm": 0.20857703685760498, + "learning_rate": 0.00010704554301284547, + "loss": 1.0892, + "step": 1200 + }, + { + "epoch": 0.4667249587098028, + "grad_norm": 0.21897061169147491, + "learning_rate": 0.00010696769170883613, + "loss": 1.0873, + "step": 1201 + }, + { + "epoch": 0.4671135723307102, + "grad_norm": 0.1943386346101761, + "learning_rate": 0.0001068898404048268, + "loss": 1.0116, + "step": 1202 + }, + { + "epoch": 0.4675021859516176, + "grad_norm": 0.22607874870300293, + "learning_rate": 0.00010681198910081745, + "loss": 1.0328, + "step": 1203 + }, + { + "epoch": 0.467890799572525, + "grad_norm": 
0.1898999959230423, + "learning_rate": 0.0001067341377968081, + "loss": 0.9791, + "step": 1204 + }, + { + "epoch": 0.4682794131934324, + "grad_norm": 0.2193334400653839, + "learning_rate": 0.00010665628649279875, + "loss": 1.0742, + "step": 1205 + }, + { + "epoch": 0.46866802681433983, + "grad_norm": 0.2096349149942398, + "learning_rate": 0.00010657843518878943, + "loss": 1.0683, + "step": 1206 + }, + { + "epoch": 0.46905664043524725, + "grad_norm": 0.2040576934814453, + "learning_rate": 0.00010650058388478008, + "loss": 1.0516, + "step": 1207 + }, + { + "epoch": 0.46944525405615467, + "grad_norm": 0.20619645714759827, + "learning_rate": 0.00010642273258077074, + "loss": 1.0429, + "step": 1208 + }, + { + "epoch": 0.4698338676770621, + "grad_norm": 0.19753660261631012, + "learning_rate": 0.00010634488127676139, + "loss": 1.0268, + "step": 1209 + }, + { + "epoch": 0.4702224812979695, + "grad_norm": 0.2201426476240158, + "learning_rate": 0.00010626702997275204, + "loss": 1.0879, + "step": 1210 + }, + { + "epoch": 0.4706110949188769, + "grad_norm": 0.21307805180549622, + "learning_rate": 0.00010618917866874272, + "loss": 1.0186, + "step": 1211 + }, + { + "epoch": 0.47099970853978435, + "grad_norm": 0.21142373979091644, + "learning_rate": 0.00010611132736473337, + "loss": 1.0417, + "step": 1212 + }, + { + "epoch": 0.4713883221606917, + "grad_norm": 0.20523706078529358, + "learning_rate": 0.00010603347606072402, + "loss": 1.0372, + "step": 1213 + }, + { + "epoch": 0.4717769357815991, + "grad_norm": 0.19843094050884247, + "learning_rate": 0.00010595562475671468, + "loss": 1.0062, + "step": 1214 + }, + { + "epoch": 0.47216554940250655, + "grad_norm": 0.2146739959716797, + "learning_rate": 0.00010587777345270533, + "loss": 1.0528, + "step": 1215 + }, + { + "epoch": 0.47255416302341396, + "grad_norm": 0.2136303037405014, + "learning_rate": 0.00010579992214869601, + "loss": 1.0521, + "step": 1216 + }, + { + "epoch": 0.4729427766443214, + "grad_norm": 0.21379397809505463, + 
"learning_rate": 0.00010572207084468666, + "loss": 1.0362, + "step": 1217 + }, + { + "epoch": 0.4733313902652288, + "grad_norm": 0.20459088683128357, + "learning_rate": 0.00010564421954067731, + "loss": 1.0455, + "step": 1218 + }, + { + "epoch": 0.4737200038861362, + "grad_norm": 0.20667988061904907, + "learning_rate": 0.00010556636823666796, + "loss": 1.0284, + "step": 1219 + }, + { + "epoch": 0.47410861750704364, + "grad_norm": 0.21820449829101562, + "learning_rate": 0.00010548851693265862, + "loss": 1.0584, + "step": 1220 + }, + { + "epoch": 0.47449723112795106, + "grad_norm": 0.19705156981945038, + "learning_rate": 0.00010541066562864928, + "loss": 1.004, + "step": 1221 + }, + { + "epoch": 0.4748858447488584, + "grad_norm": 0.19806528091430664, + "learning_rate": 0.00010533281432463995, + "loss": 1.0519, + "step": 1222 + }, + { + "epoch": 0.47527445836976584, + "grad_norm": 0.2006833702325821, + "learning_rate": 0.0001052549630206306, + "loss": 1.0119, + "step": 1223 + }, + { + "epoch": 0.47566307199067326, + "grad_norm": 0.21757058799266815, + "learning_rate": 0.00010517711171662125, + "loss": 1.0961, + "step": 1224 + }, + { + "epoch": 0.4760516856115807, + "grad_norm": 0.2015775889158249, + "learning_rate": 0.00010509926041261192, + "loss": 1.0419, + "step": 1225 + }, + { + "epoch": 0.4764402992324881, + "grad_norm": 0.19691923260688782, + "learning_rate": 0.00010502140910860257, + "loss": 1.0555, + "step": 1226 + }, + { + "epoch": 0.4768289128533955, + "grad_norm": 0.19924800097942352, + "learning_rate": 0.00010494355780459323, + "loss": 1.0106, + "step": 1227 + }, + { + "epoch": 0.47721752647430293, + "grad_norm": 0.21416346728801727, + "learning_rate": 0.0001048657065005839, + "loss": 1.0741, + "step": 1228 + }, + { + "epoch": 0.47760614009521035, + "grad_norm": 0.21823547780513763, + "learning_rate": 0.00010478785519657455, + "loss": 1.023, + "step": 1229 + }, + { + "epoch": 0.47799475371611777, + "grad_norm": 0.2083735466003418, + "learning_rate": 
0.0001047100038925652, + "loss": 1.0424, + "step": 1230 + }, + { + "epoch": 0.4783833673370252, + "grad_norm": 0.2219141572713852, + "learning_rate": 0.00010463215258855586, + "loss": 1.0839, + "step": 1231 + }, + { + "epoch": 0.47877198095793255, + "grad_norm": 0.21334600448608398, + "learning_rate": 0.00010455430128454653, + "loss": 0.9888, + "step": 1232 + }, + { + "epoch": 0.47916059457883997, + "grad_norm": 0.2140086442232132, + "learning_rate": 0.00010447644998053719, + "loss": 1.0119, + "step": 1233 + }, + { + "epoch": 0.4795492081997474, + "grad_norm": 0.25360551476478577, + "learning_rate": 0.00010439859867652784, + "loss": 1.0026, + "step": 1234 + }, + { + "epoch": 0.4799378218206548, + "grad_norm": 0.20200380682945251, + "learning_rate": 0.00010432074737251849, + "loss": 1.0, + "step": 1235 + }, + { + "epoch": 0.4803264354415622, + "grad_norm": 0.22641289234161377, + "learning_rate": 0.00010424289606850914, + "loss": 1.1022, + "step": 1236 + }, + { + "epoch": 0.48071504906246965, + "grad_norm": 0.20538561046123505, + "learning_rate": 0.00010416504476449982, + "loss": 0.9847, + "step": 1237 + }, + { + "epoch": 0.48110366268337706, + "grad_norm": 0.206883504986763, + "learning_rate": 0.00010408719346049047, + "loss": 1.0152, + "step": 1238 + }, + { + "epoch": 0.4814922763042845, + "grad_norm": 0.21584320068359375, + "learning_rate": 0.00010400934215648113, + "loss": 1.0361, + "step": 1239 + }, + { + "epoch": 0.4818808899251919, + "grad_norm": 0.20963703095912933, + "learning_rate": 0.00010393149085247178, + "loss": 1.0814, + "step": 1240 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 0.1965872198343277, + "learning_rate": 0.00010385363954846243, + "loss": 1.0365, + "step": 1241 + }, + { + "epoch": 0.4826581171670067, + "grad_norm": 0.2030191719532013, + "learning_rate": 0.00010377578824445311, + "loss": 1.0374, + "step": 1242 + }, + { + "epoch": 0.4830467307879141, + "grad_norm": 0.21448804438114166, + "learning_rate": 0.00010369793694044376, + 
"loss": 0.9686, + "step": 1243 + }, + { + "epoch": 0.4834353444088215, + "grad_norm": 0.2181752622127533, + "learning_rate": 0.00010362008563643441, + "loss": 1.0812, + "step": 1244 + }, + { + "epoch": 0.48382395802972894, + "grad_norm": 0.19887101650238037, + "learning_rate": 0.00010354223433242507, + "loss": 1.036, + "step": 1245 + }, + { + "epoch": 0.48421257165063636, + "grad_norm": 0.19007287919521332, + "learning_rate": 0.00010346438302841572, + "loss": 1.0292, + "step": 1246 + }, + { + "epoch": 0.4846011852715438, + "grad_norm": 0.21390347182750702, + "learning_rate": 0.0001033865317244064, + "loss": 1.0284, + "step": 1247 + }, + { + "epoch": 0.4849897988924512, + "grad_norm": 0.23822663724422455, + "learning_rate": 0.00010330868042039705, + "loss": 1.1044, + "step": 1248 + }, + { + "epoch": 0.4853784125133586, + "grad_norm": 0.20779070258140564, + "learning_rate": 0.0001032308291163877, + "loss": 1.0475, + "step": 1249 + }, + { + "epoch": 0.48576702613426603, + "grad_norm": 0.19232134521007538, + "learning_rate": 0.00010315297781237835, + "loss": 0.9945, + "step": 1250 + }, + { + "epoch": 0.4861556397551734, + "grad_norm": 0.22378556430339813, + "learning_rate": 0.00010307512650836902, + "loss": 1.0462, + "step": 1251 + }, + { + "epoch": 0.4865442533760808, + "grad_norm": 0.22156798839569092, + "learning_rate": 0.00010299727520435968, + "loss": 1.051, + "step": 1252 + }, + { + "epoch": 0.48693286699698823, + "grad_norm": 0.19885733723640442, + "learning_rate": 0.00010291942390035034, + "loss": 1.0593, + "step": 1253 + }, + { + "epoch": 0.48732148061789565, + "grad_norm": 0.2172418236732483, + "learning_rate": 0.000102841572596341, + "loss": 1.0513, + "step": 1254 + }, + { + "epoch": 0.48771009423880307, + "grad_norm": 0.22136956453323364, + "learning_rate": 0.00010276372129233165, + "loss": 1.0438, + "step": 1255 + }, + { + "epoch": 0.4880987078597105, + "grad_norm": 0.21337302029132843, + "learning_rate": 0.0001026858699883223, + "loss": 1.0551, + "step": 
1256 + }, + { + "epoch": 0.4884873214806179, + "grad_norm": 0.21376267075538635, + "learning_rate": 0.00010260801868431296, + "loss": 1.054, + "step": 1257 + }, + { + "epoch": 0.4888759351015253, + "grad_norm": 0.19498860836029053, + "learning_rate": 0.00010253016738030364, + "loss": 1.0045, + "step": 1258 + }, + { + "epoch": 0.48926454872243275, + "grad_norm": 0.22354961931705475, + "learning_rate": 0.00010245231607629429, + "loss": 1.096, + "step": 1259 + }, + { + "epoch": 0.4896531623433401, + "grad_norm": 0.2078939527273178, + "learning_rate": 0.00010237446477228494, + "loss": 1.0102, + "step": 1260 + }, + { + "epoch": 0.49004177596424753, + "grad_norm": 0.20992495119571686, + "learning_rate": 0.00010229661346827559, + "loss": 0.9814, + "step": 1261 + }, + { + "epoch": 0.49043038958515495, + "grad_norm": 0.2178875207901001, + "learning_rate": 0.00010221876216426625, + "loss": 1.0489, + "step": 1262 + }, + { + "epoch": 0.49081900320606237, + "grad_norm": 0.22152946889400482, + "learning_rate": 0.00010214091086025692, + "loss": 1.0808, + "step": 1263 + }, + { + "epoch": 0.4912076168269698, + "grad_norm": 0.21179009974002838, + "learning_rate": 0.00010206305955624758, + "loss": 1.0323, + "step": 1264 + }, + { + "epoch": 0.4915962304478772, + "grad_norm": 0.2126997411251068, + "learning_rate": 0.00010198520825223823, + "loss": 1.0093, + "step": 1265 + }, + { + "epoch": 0.4919848440687846, + "grad_norm": 0.20912809669971466, + "learning_rate": 0.00010190735694822888, + "loss": 1.0343, + "step": 1266 + }, + { + "epoch": 0.49237345768969204, + "grad_norm": 0.2231636494398117, + "learning_rate": 0.00010182950564421953, + "loss": 1.0587, + "step": 1267 + }, + { + "epoch": 0.49276207131059946, + "grad_norm": 0.1954583376646042, + "learning_rate": 0.00010175165434021021, + "loss": 0.9566, + "step": 1268 + }, + { + "epoch": 0.4931506849315068, + "grad_norm": 0.20520909130573273, + "learning_rate": 0.00010167380303620086, + "loss": 1.024, + "step": 1269 + }, + { + "epoch": 
0.49353929855241424, + "grad_norm": 0.21736180782318115, + "learning_rate": 0.00010159595173219152, + "loss": 1.0434, + "step": 1270 + }, + { + "epoch": 0.49392791217332166, + "grad_norm": 0.2360561490058899, + "learning_rate": 0.00010151810042818217, + "loss": 1.114, + "step": 1271 + }, + { + "epoch": 0.4943165257942291, + "grad_norm": 0.20595967769622803, + "learning_rate": 0.00010144024912417282, + "loss": 0.9909, + "step": 1272 + }, + { + "epoch": 0.4947051394151365, + "grad_norm": 0.2161860466003418, + "learning_rate": 0.0001013623978201635, + "loss": 1.0536, + "step": 1273 + }, + { + "epoch": 0.4950937530360439, + "grad_norm": 0.19852355122566223, + "learning_rate": 0.00010128454651615415, + "loss": 1.0001, + "step": 1274 + }, + { + "epoch": 0.49548236665695133, + "grad_norm": 0.21081402897834778, + "learning_rate": 0.0001012066952121448, + "loss": 1.0151, + "step": 1275 + }, + { + "epoch": 0.49587098027785875, + "grad_norm": 0.2053362876176834, + "learning_rate": 0.00010112884390813547, + "loss": 1.018, + "step": 1276 + }, + { + "epoch": 0.49625959389876617, + "grad_norm": 0.21205593645572662, + "learning_rate": 0.00010105099260412612, + "loss": 0.9912, + "step": 1277 + }, + { + "epoch": 0.4966482075196736, + "grad_norm": 0.2005016952753067, + "learning_rate": 0.00010097314130011679, + "loss": 1.0069, + "step": 1278 + }, + { + "epoch": 0.49703682114058095, + "grad_norm": 0.21688181161880493, + "learning_rate": 0.00010089528999610744, + "loss": 1.0364, + "step": 1279 + }, + { + "epoch": 0.49742543476148837, + "grad_norm": 0.20582237839698792, + "learning_rate": 0.0001008174386920981, + "loss": 1.0138, + "step": 1280 + }, + { + "epoch": 0.4978140483823958, + "grad_norm": 0.20824448764324188, + "learning_rate": 0.00010073958738808876, + "loss": 0.9941, + "step": 1281 + }, + { + "epoch": 0.4982026620033032, + "grad_norm": 0.20749075710773468, + "learning_rate": 0.00010066173608407941, + "loss": 1.0478, + "step": 1282 + }, + { + "epoch": 0.49859127562421063, + 
"grad_norm": 0.20012183487415314, + "learning_rate": 0.00010058388478007009, + "loss": 0.995, + "step": 1283 + }, + { + "epoch": 0.49897988924511805, + "grad_norm": 0.20275959372520447, + "learning_rate": 0.00010050603347606074, + "loss": 1.097, + "step": 1284 + }, + { + "epoch": 0.49936850286602547, + "grad_norm": 0.19588243961334229, + "learning_rate": 0.00010042818217205139, + "loss": 1.0, + "step": 1285 + }, + { + "epoch": 0.4997571164869329, + "grad_norm": 0.20693185925483704, + "learning_rate": 0.00010035033086804204, + "loss": 1.0527, + "step": 1286 + }, + { + "epoch": 0.5001457301078402, + "grad_norm": 0.20330573618412018, + "learning_rate": 0.0001002724795640327, + "loss": 1.0137, + "step": 1287 + }, + { + "epoch": 0.5005343437287477, + "grad_norm": 0.19123876094818115, + "learning_rate": 0.00010019462826002337, + "loss": 0.9688, + "step": 1288 + }, + { + "epoch": 0.5009229573496551, + "grad_norm": 0.2184276431798935, + "learning_rate": 0.00010011677695601403, + "loss": 1.0367, + "step": 1289 + }, + { + "epoch": 0.5013115709705626, + "grad_norm": 0.21642108261585236, + "learning_rate": 0.00010003892565200468, + "loss": 1.102, + "step": 1290 + }, + { + "epoch": 0.5017001845914699, + "grad_norm": 0.20351074635982513, + "learning_rate": 9.996107434799533e-05, + "loss": 1.0327, + "step": 1291 + }, + { + "epoch": 0.5020887982123774, + "grad_norm": 0.22771553695201874, + "learning_rate": 9.9883223043986e-05, + "loss": 1.104, + "step": 1292 + }, + { + "epoch": 0.5024774118332848, + "grad_norm": 0.2271403968334198, + "learning_rate": 9.980537173997665e-05, + "loss": 1.1313, + "step": 1293 + }, + { + "epoch": 0.5028660254541921, + "grad_norm": 0.2157830148935318, + "learning_rate": 9.97275204359673e-05, + "loss": 1.0203, + "step": 1294 + }, + { + "epoch": 0.5032546390750996, + "grad_norm": 0.19555307924747467, + "learning_rate": 9.964966913195797e-05, + "loss": 1.0194, + "step": 1295 + }, + { + "epoch": 0.503643252696007, + "grad_norm": 0.1898549199104309, + 
"learning_rate": 9.957181782794862e-05, + "loss": 1.0034, + "step": 1296 + }, + { + "epoch": 0.5040318663169144, + "grad_norm": 0.23555906116962433, + "learning_rate": 9.949396652393928e-05, + "loss": 1.0298, + "step": 1297 + }, + { + "epoch": 0.5044204799378218, + "grad_norm": 0.20434850454330444, + "learning_rate": 9.941611521992994e-05, + "loss": 0.9999, + "step": 1298 + }, + { + "epoch": 0.5048090935587293, + "grad_norm": 0.21015289425849915, + "learning_rate": 9.933826391592059e-05, + "loss": 1.006, + "step": 1299 + }, + { + "epoch": 0.5051977071796366, + "grad_norm": 0.21147851645946503, + "learning_rate": 9.926041261191125e-05, + "loss": 1.0854, + "step": 1300 + }, + { + "epoch": 0.5055863208005441, + "grad_norm": 0.19666944444179535, + "learning_rate": 9.91825613079019e-05, + "loss": 1.0057, + "step": 1301 + }, + { + "epoch": 0.5059749344214515, + "grad_norm": 0.21233728528022766, + "learning_rate": 9.910471000389257e-05, + "loss": 1.0675, + "step": 1302 + }, + { + "epoch": 0.5063635480423588, + "grad_norm": 0.21905581653118134, + "learning_rate": 9.902685869988322e-05, + "loss": 1.0054, + "step": 1303 + }, + { + "epoch": 0.5067521616632663, + "grad_norm": 0.23434993624687195, + "learning_rate": 9.894900739587389e-05, + "loss": 0.9915, + "step": 1304 + }, + { + "epoch": 0.5071407752841737, + "grad_norm": 0.21684227883815765, + "learning_rate": 9.887115609186454e-05, + "loss": 1.1131, + "step": 1305 + }, + { + "epoch": 0.5075293889050811, + "grad_norm": 0.21699552237987518, + "learning_rate": 9.87933047878552e-05, + "loss": 1.0782, + "step": 1306 + }, + { + "epoch": 0.5079180025259885, + "grad_norm": 0.2218221127986908, + "learning_rate": 9.871545348384586e-05, + "loss": 1.0388, + "step": 1307 + }, + { + "epoch": 0.508306616146896, + "grad_norm": 0.20104359090328217, + "learning_rate": 9.863760217983652e-05, + "loss": 1.0336, + "step": 1308 + }, + { + "epoch": 0.5086952297678033, + "grad_norm": 0.21907050907611847, + "learning_rate": 9.855975087582718e-05, + 
"loss": 1.0587, + "step": 1309 + }, + { + "epoch": 0.5090838433887108, + "grad_norm": 0.2140391767024994, + "learning_rate": 9.848189957181784e-05, + "loss": 1.0351, + "step": 1310 + }, + { + "epoch": 0.5094724570096182, + "grad_norm": 0.33287563920021057, + "learning_rate": 9.84040482678085e-05, + "loss": 0.9908, + "step": 1311 + }, + { + "epoch": 0.5098610706305255, + "grad_norm": 0.2706705927848816, + "learning_rate": 9.832619696379915e-05, + "loss": 1.0078, + "step": 1312 + }, + { + "epoch": 0.510249684251433, + "grad_norm": 0.20216278731822968, + "learning_rate": 9.824834565978981e-05, + "loss": 1.0253, + "step": 1313 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.20736576616764069, + "learning_rate": 9.817049435578046e-05, + "loss": 1.0217, + "step": 1314 + }, + { + "epoch": 0.5110269114932479, + "grad_norm": 0.2275344580411911, + "learning_rate": 9.809264305177113e-05, + "loss": 1.0139, + "step": 1315 + }, + { + "epoch": 0.5114155251141552, + "grad_norm": 0.22243620455265045, + "learning_rate": 9.801479174776178e-05, + "loss": 1.0427, + "step": 1316 + }, + { + "epoch": 0.5118041387350627, + "grad_norm": 0.198841854929924, + "learning_rate": 9.793694044375243e-05, + "loss": 1.0231, + "step": 1317 + }, + { + "epoch": 0.5121927523559701, + "grad_norm": 0.2031068503856659, + "learning_rate": 9.78590891397431e-05, + "loss": 1.0184, + "step": 1318 + }, + { + "epoch": 0.5125813659768775, + "grad_norm": 0.21712587773799896, + "learning_rate": 9.778123783573375e-05, + "loss": 1.0205, + "step": 1319 + }, + { + "epoch": 0.5129699795977849, + "grad_norm": 0.19366060197353363, + "learning_rate": 9.77033865317244e-05, + "loss": 0.9623, + "step": 1320 + }, + { + "epoch": 0.5133585932186923, + "grad_norm": 0.19845952093601227, + "learning_rate": 9.762553522771507e-05, + "loss": 1.0209, + "step": 1321 + }, + { + "epoch": 0.5137472068395997, + "grad_norm": 0.19700276851654053, + "learning_rate": 9.754768392370572e-05, + "loss": 0.9506, + "step": 1322 + }, + { + 
"epoch": 0.5141358204605071, + "grad_norm": 0.19797460734844208, + "learning_rate": 9.746983261969639e-05, + "loss": 1.0928, + "step": 1323 + }, + { + "epoch": 0.5145244340814146, + "grad_norm": 0.20470699667930603, + "learning_rate": 9.739198131568704e-05, + "loss": 1.0835, + "step": 1324 + }, + { + "epoch": 0.5149130477023219, + "grad_norm": 0.19121742248535156, + "learning_rate": 9.731413001167769e-05, + "loss": 0.9877, + "step": 1325 + }, + { + "epoch": 0.5153016613232294, + "grad_norm": 0.20026616752147675, + "learning_rate": 9.723627870766836e-05, + "loss": 1.0094, + "step": 1326 + }, + { + "epoch": 0.5156902749441368, + "grad_norm": 0.2214539796113968, + "learning_rate": 9.715842740365901e-05, + "loss": 0.9867, + "step": 1327 + }, + { + "epoch": 0.5160788885650442, + "grad_norm": 0.22674603760242462, + "learning_rate": 9.708057609964967e-05, + "loss": 1.0738, + "step": 1328 + }, + { + "epoch": 0.5164675021859516, + "grad_norm": 0.21274834871292114, + "learning_rate": 9.700272479564033e-05, + "loss": 1.0458, + "step": 1329 + }, + { + "epoch": 0.5168561158068591, + "grad_norm": 0.20305052399635315, + "learning_rate": 9.692487349163099e-05, + "loss": 1.0041, + "step": 1330 + }, + { + "epoch": 0.5172447294277664, + "grad_norm": 0.1840772181749344, + "learning_rate": 9.684702218762166e-05, + "loss": 0.9498, + "step": 1331 + }, + { + "epoch": 0.5176333430486738, + "grad_norm": 0.2055782824754715, + "learning_rate": 9.676917088361231e-05, + "loss": 1.0223, + "step": 1332 + }, + { + "epoch": 0.5180219566695813, + "grad_norm": 0.21826402842998505, + "learning_rate": 9.669131957960297e-05, + "loss": 1.1068, + "step": 1333 + }, + { + "epoch": 0.5184105702904886, + "grad_norm": 0.22516922652721405, + "learning_rate": 9.661346827559363e-05, + "loss": 1.0957, + "step": 1334 + }, + { + "epoch": 0.5187991839113961, + "grad_norm": 0.21044284105300903, + "learning_rate": 9.653561697158428e-05, + "loss": 1.0384, + "step": 1335 + }, + { + "epoch": 0.5191877975323035, + 
"grad_norm": 0.20275571942329407, + "learning_rate": 9.645776566757494e-05, + "loss": 0.9978, + "step": 1336 + }, + { + "epoch": 0.519576411153211, + "grad_norm": 0.2077122926712036, + "learning_rate": 9.63799143635656e-05, + "loss": 1.0418, + "step": 1337 + }, + { + "epoch": 0.5199650247741183, + "grad_norm": 0.19158867001533508, + "learning_rate": 9.630206305955625e-05, + "loss": 1.0527, + "step": 1338 + }, + { + "epoch": 0.5203536383950258, + "grad_norm": 0.1932496577501297, + "learning_rate": 9.622421175554691e-05, + "loss": 1.0039, + "step": 1339 + }, + { + "epoch": 0.5207422520159332, + "grad_norm": 0.21937766671180725, + "learning_rate": 9.614636045153757e-05, + "loss": 1.0373, + "step": 1340 + }, + { + "epoch": 0.5211308656368405, + "grad_norm": 0.2268432229757309, + "learning_rate": 9.606850914752823e-05, + "loss": 1.0815, + "step": 1341 + }, + { + "epoch": 0.521519479257748, + "grad_norm": 0.2147454470396042, + "learning_rate": 9.599065784351888e-05, + "loss": 1.0331, + "step": 1342 + }, + { + "epoch": 0.5219080928786554, + "grad_norm": 0.19899709522724152, + "learning_rate": 9.591280653950954e-05, + "loss": 1.032, + "step": 1343 + }, + { + "epoch": 0.5222967064995628, + "grad_norm": 0.19646069407463074, + "learning_rate": 9.58349552355002e-05, + "loss": 0.9788, + "step": 1344 + }, + { + "epoch": 0.5226853201204702, + "grad_norm": 0.2146075963973999, + "learning_rate": 9.575710393149085e-05, + "loss": 1.0201, + "step": 1345 + }, + { + "epoch": 0.5230739337413777, + "grad_norm": 0.1968650370836258, + "learning_rate": 9.567925262748152e-05, + "loss": 0.9894, + "step": 1346 + }, + { + "epoch": 0.523462547362285, + "grad_norm": 0.21111296117305756, + "learning_rate": 9.560140132347217e-05, + "loss": 1.0961, + "step": 1347 + }, + { + "epoch": 0.5238511609831925, + "grad_norm": 0.20917272567749023, + "learning_rate": 9.552355001946282e-05, + "loss": 1.0435, + "step": 1348 + }, + { + "epoch": 0.5242397746040999, + "grad_norm": 0.2029752880334854, + 
"learning_rate": 9.544569871545349e-05, + "loss": 1.0328, + "step": 1349 + }, + { + "epoch": 0.5246283882250072, + "grad_norm": 0.20726613700389862, + "learning_rate": 9.536784741144414e-05, + "loss": 1.0465, + "step": 1350 + }, + { + "epoch": 0.5250170018459147, + "grad_norm": 0.19778740406036377, + "learning_rate": 9.52899961074348e-05, + "loss": 1.0058, + "step": 1351 + }, + { + "epoch": 0.5254056154668221, + "grad_norm": 0.19958540797233582, + "learning_rate": 9.521214480342546e-05, + "loss": 1.0164, + "step": 1352 + }, + { + "epoch": 0.5257942290877295, + "grad_norm": 0.2151395082473755, + "learning_rate": 9.513429349941611e-05, + "loss": 1.0703, + "step": 1353 + }, + { + "epoch": 0.5261828427086369, + "grad_norm": 0.2366979569196701, + "learning_rate": 9.505644219540678e-05, + "loss": 0.9832, + "step": 1354 + }, + { + "epoch": 0.5265714563295444, + "grad_norm": 0.22064165771007538, + "learning_rate": 9.497859089139743e-05, + "loss": 1.0181, + "step": 1355 + }, + { + "epoch": 0.5269600699504517, + "grad_norm": 0.20221936702728271, + "learning_rate": 9.49007395873881e-05, + "loss": 1.0424, + "step": 1356 + }, + { + "epoch": 0.5273486835713592, + "grad_norm": 0.19608759880065918, + "learning_rate": 9.482288828337876e-05, + "loss": 1.0074, + "step": 1357 + }, + { + "epoch": 0.5277372971922666, + "grad_norm": 0.20686689019203186, + "learning_rate": 9.474503697936941e-05, + "loss": 1.0213, + "step": 1358 + }, + { + "epoch": 0.528125910813174, + "grad_norm": 0.223610520362854, + "learning_rate": 9.466718567536008e-05, + "loss": 1.05, + "step": 1359 + }, + { + "epoch": 0.5285145244340814, + "grad_norm": 0.2135966569185257, + "learning_rate": 9.458933437135073e-05, + "loss": 1.034, + "step": 1360 + }, + { + "epoch": 0.5289031380549888, + "grad_norm": 0.1933239996433258, + "learning_rate": 9.451148306734138e-05, + "loss": 0.9883, + "step": 1361 + }, + { + "epoch": 0.5292917516758963, + "grad_norm": 0.20794694125652313, + "learning_rate": 9.443363176333205e-05, + 
"loss": 1.0103, + "step": 1362 + }, + { + "epoch": 0.5296803652968036, + "grad_norm": 0.20128493010997772, + "learning_rate": 9.43557804593227e-05, + "loss": 1.015, + "step": 1363 + }, + { + "epoch": 0.5300689789177111, + "grad_norm": 0.2128933072090149, + "learning_rate": 9.427792915531336e-05, + "loss": 1.0038, + "step": 1364 + }, + { + "epoch": 0.5304575925386185, + "grad_norm": 0.2046983689069748, + "learning_rate": 9.420007785130402e-05, + "loss": 0.9948, + "step": 1365 + }, + { + "epoch": 0.5308462061595259, + "grad_norm": 0.20909680426120758, + "learning_rate": 9.412222654729467e-05, + "loss": 1.0308, + "step": 1366 + }, + { + "epoch": 0.5312348197804333, + "grad_norm": 0.2182164192199707, + "learning_rate": 9.404437524328533e-05, + "loss": 1.0018, + "step": 1367 + }, + { + "epoch": 0.5316234334013407, + "grad_norm": 0.2107028216123581, + "learning_rate": 9.396652393927599e-05, + "loss": 1.0419, + "step": 1368 + }, + { + "epoch": 0.5320120470222481, + "grad_norm": 0.24631445109844208, + "learning_rate": 9.388867263526665e-05, + "loss": 1.0171, + "step": 1369 + }, + { + "epoch": 0.5324006606431555, + "grad_norm": 0.20331013202667236, + "learning_rate": 9.38108213312573e-05, + "loss": 1.0592, + "step": 1370 + }, + { + "epoch": 0.532789274264063, + "grad_norm": 0.19266058504581451, + "learning_rate": 9.373297002724796e-05, + "loss": 0.9912, + "step": 1371 + }, + { + "epoch": 0.5331778878849703, + "grad_norm": 0.22874227166175842, + "learning_rate": 9.365511872323862e-05, + "loss": 1.0533, + "step": 1372 + }, + { + "epoch": 0.5335665015058778, + "grad_norm": 0.2088235765695572, + "learning_rate": 9.357726741922927e-05, + "loss": 1.0464, + "step": 1373 + }, + { + "epoch": 0.5339551151267852, + "grad_norm": 0.2112397700548172, + "learning_rate": 9.349941611521994e-05, + "loss": 1.0503, + "step": 1374 + }, + { + "epoch": 0.5343437287476926, + "grad_norm": 0.20712170004844666, + "learning_rate": 9.342156481121059e-05, + "loss": 1.0237, + "step": 1375 + }, + { + 
"epoch": 0.5347323423686, + "grad_norm": 0.20077116787433624, + "learning_rate": 9.334371350720124e-05, + "loss": 1.0467, + "step": 1376 + }, + { + "epoch": 0.5351209559895075, + "grad_norm": 0.20394501090049744, + "learning_rate": 9.326586220319191e-05, + "loss": 1.0054, + "step": 1377 + }, + { + "epoch": 0.5355095696104148, + "grad_norm": 0.19459395110607147, + "learning_rate": 9.318801089918256e-05, + "loss": 0.9792, + "step": 1378 + }, + { + "epoch": 0.5358981832313222, + "grad_norm": 0.2116049826145172, + "learning_rate": 9.311015959517321e-05, + "loss": 1.0345, + "step": 1379 + }, + { + "epoch": 0.5362867968522297, + "grad_norm": 0.21672269701957703, + "learning_rate": 9.303230829116388e-05, + "loss": 1.0709, + "step": 1380 + }, + { + "epoch": 0.536675410473137, + "grad_norm": 0.20358407497406006, + "learning_rate": 9.295445698715453e-05, + "loss": 1.0534, + "step": 1381 + }, + { + "epoch": 0.5370640240940445, + "grad_norm": 0.19512853026390076, + "learning_rate": 9.28766056831452e-05, + "loss": 0.9397, + "step": 1382 + }, + { + "epoch": 0.5374526377149519, + "grad_norm": 0.2140122503042221, + "learning_rate": 9.279875437913586e-05, + "loss": 1.0164, + "step": 1383 + }, + { + "epoch": 0.5378412513358594, + "grad_norm": 0.20486049354076385, + "learning_rate": 9.272090307512651e-05, + "loss": 0.9892, + "step": 1384 + }, + { + "epoch": 0.5382298649567667, + "grad_norm": 0.20023222267627716, + "learning_rate": 9.264305177111718e-05, + "loss": 1.0019, + "step": 1385 + }, + { + "epoch": 0.5386184785776742, + "grad_norm": 0.20024439692497253, + "learning_rate": 9.256520046710783e-05, + "loss": 0.9717, + "step": 1386 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 0.21021386981010437, + "learning_rate": 9.24873491630985e-05, + "loss": 1.028, + "step": 1387 + }, + { + "epoch": 0.5393957058194889, + "grad_norm": 0.18508704006671906, + "learning_rate": 9.240949785908915e-05, + "loss": 1.0008, + "step": 1388 + }, + { + "epoch": 0.5397843194403964, + "grad_norm": 
0.19351208209991455, + "learning_rate": 9.23316465550798e-05, + "loss": 0.9898, + "step": 1389 + }, + { + "epoch": 0.5401729330613038, + "grad_norm": 0.20341919362545013, + "learning_rate": 9.225379525107047e-05, + "loss": 1.0203, + "step": 1390 + }, + { + "epoch": 0.5405615466822112, + "grad_norm": 0.1942797303199768, + "learning_rate": 9.217594394706112e-05, + "loss": 1.003, + "step": 1391 + }, + { + "epoch": 0.5409501603031186, + "grad_norm": 0.2056138813495636, + "learning_rate": 9.209809264305178e-05, + "loss": 1.0149, + "step": 1392 + }, + { + "epoch": 0.5413387739240261, + "grad_norm": 0.21572062373161316, + "learning_rate": 9.202024133904244e-05, + "loss": 0.9808, + "step": 1393 + }, + { + "epoch": 0.5417273875449334, + "grad_norm": 0.19841499626636505, + "learning_rate": 9.194239003503309e-05, + "loss": 1.0467, + "step": 1394 + }, + { + "epoch": 0.5421160011658409, + "grad_norm": 0.20452147722244263, + "learning_rate": 9.186453873102375e-05, + "loss": 1.0378, + "step": 1395 + }, + { + "epoch": 0.5425046147867483, + "grad_norm": 0.2090451419353485, + "learning_rate": 9.17866874270144e-05, + "loss": 1.0823, + "step": 1396 + }, + { + "epoch": 0.5428932284076556, + "grad_norm": 0.215814009308815, + "learning_rate": 9.170883612300506e-05, + "loss": 1.0994, + "step": 1397 + }, + { + "epoch": 0.5432818420285631, + "grad_norm": 0.19924724102020264, + "learning_rate": 9.163098481899572e-05, + "loss": 1.0099, + "step": 1398 + }, + { + "epoch": 0.5436704556494705, + "grad_norm": 0.20074865221977234, + "learning_rate": 9.155313351498638e-05, + "loss": 1.0163, + "step": 1399 + }, + { + "epoch": 0.544059069270378, + "grad_norm": 0.21737203001976013, + "learning_rate": 9.147528221097704e-05, + "loss": 1.0527, + "step": 1400 + }, + { + "epoch": 0.5444476828912853, + "grad_norm": 0.2036885768175125, + "learning_rate": 9.139743090696769e-05, + "loss": 1.0208, + "step": 1401 + }, + { + "epoch": 0.5448362965121928, + "grad_norm": 0.20861585438251495, + "learning_rate": 
9.131957960295835e-05, + "loss": 1.0175, + "step": 1402 + }, + { + "epoch": 0.5452249101331001, + "grad_norm": 0.23425570130348206, + "learning_rate": 9.124172829894901e-05, + "loss": 1.053, + "step": 1403 + }, + { + "epoch": 0.5456135237540076, + "grad_norm": 0.20389291644096375, + "learning_rate": 9.116387699493966e-05, + "loss": 1.0479, + "step": 1404 + }, + { + "epoch": 0.546002137374915, + "grad_norm": 0.20166678726673126, + "learning_rate": 9.108602569093033e-05, + "loss": 1.0064, + "step": 1405 + }, + { + "epoch": 0.5463907509958223, + "grad_norm": 0.21419203281402588, + "learning_rate": 9.100817438692098e-05, + "loss": 1.0122, + "step": 1406 + }, + { + "epoch": 0.5467793646167298, + "grad_norm": 0.20541758835315704, + "learning_rate": 9.093032308291165e-05, + "loss": 1.0355, + "step": 1407 + }, + { + "epoch": 0.5471679782376372, + "grad_norm": 0.21865367889404297, + "learning_rate": 9.08524717789023e-05, + "loss": 1.0201, + "step": 1408 + }, + { + "epoch": 0.5475565918585447, + "grad_norm": 0.21181468665599823, + "learning_rate": 9.077462047489296e-05, + "loss": 1.0501, + "step": 1409 + }, + { + "epoch": 0.547945205479452, + "grad_norm": 0.21016767621040344, + "learning_rate": 9.069676917088362e-05, + "loss": 1.0452, + "step": 1410 + }, + { + "epoch": 0.5483338191003595, + "grad_norm": 0.21119755506515503, + "learning_rate": 9.061891786687428e-05, + "loss": 1.0935, + "step": 1411 + }, + { + "epoch": 0.5487224327212669, + "grad_norm": 0.20688095688819885, + "learning_rate": 9.054106656286493e-05, + "loss": 1.0526, + "step": 1412 + }, + { + "epoch": 0.5491110463421743, + "grad_norm": 0.21857528388500214, + "learning_rate": 9.04632152588556e-05, + "loss": 1.0067, + "step": 1413 + }, + { + "epoch": 0.5494996599630817, + "grad_norm": 0.2196548581123352, + "learning_rate": 9.038536395484625e-05, + "loss": 1.0263, + "step": 1414 + }, + { + "epoch": 0.5498882735839892, + "grad_norm": 0.21952040493488312, + "learning_rate": 9.03075126508369e-05, + "loss": 1.0009, + 
"step": 1415 + }, + { + "epoch": 0.5502768872048965, + "grad_norm": 0.20059294998645782, + "learning_rate": 9.022966134682757e-05, + "loss": 1.0481, + "step": 1416 + }, + { + "epoch": 0.5506655008258039, + "grad_norm": 0.1960824728012085, + "learning_rate": 9.015181004281822e-05, + "loss": 1.0003, + "step": 1417 + }, + { + "epoch": 0.5510541144467114, + "grad_norm": 0.19051724672317505, + "learning_rate": 9.007395873880889e-05, + "loss": 0.9556, + "step": 1418 + }, + { + "epoch": 0.5514427280676187, + "grad_norm": 0.21008028090000153, + "learning_rate": 8.999610743479954e-05, + "loss": 1.0457, + "step": 1419 + }, + { + "epoch": 0.5518313416885262, + "grad_norm": 0.21465444564819336, + "learning_rate": 8.991825613079019e-05, + "loss": 1.0196, + "step": 1420 + }, + { + "epoch": 0.5522199553094336, + "grad_norm": 0.2062770277261734, + "learning_rate": 8.984040482678086e-05, + "loss": 1.0501, + "step": 1421 + }, + { + "epoch": 0.552608568930341, + "grad_norm": 0.21400012075901031, + "learning_rate": 8.976255352277151e-05, + "loss": 1.0711, + "step": 1422 + }, + { + "epoch": 0.5529971825512484, + "grad_norm": 0.19617624580860138, + "learning_rate": 8.968470221876217e-05, + "loss": 0.9858, + "step": 1423 + }, + { + "epoch": 0.5533857961721559, + "grad_norm": 0.20835624635219574, + "learning_rate": 8.960685091475283e-05, + "loss": 1.0122, + "step": 1424 + }, + { + "epoch": 0.5537744097930632, + "grad_norm": 0.21708111464977264, + "learning_rate": 8.952899961074348e-05, + "loss": 1.0108, + "step": 1425 + }, + { + "epoch": 0.5541630234139706, + "grad_norm": 0.20877864956855774, + "learning_rate": 8.945114830673414e-05, + "loss": 1.0389, + "step": 1426 + }, + { + "epoch": 0.5545516370348781, + "grad_norm": 0.1924441158771515, + "learning_rate": 8.93732970027248e-05, + "loss": 1.0088, + "step": 1427 + }, + { + "epoch": 0.5549402506557854, + "grad_norm": 0.20288826525211334, + "learning_rate": 8.929544569871546e-05, + "loss": 1.0296, + "step": 1428 + }, + { + "epoch": 
0.5553288642766929, + "grad_norm": 0.2008143663406372, + "learning_rate": 8.921759439470611e-05, + "loss": 1.0521, + "step": 1429 + }, + { + "epoch": 0.5557174778976003, + "grad_norm": 0.24407047033309937, + "learning_rate": 8.913974309069677e-05, + "loss": 1.1038, + "step": 1430 + }, + { + "epoch": 0.5561060915185078, + "grad_norm": 0.2172536998987198, + "learning_rate": 8.906189178668743e-05, + "loss": 1.0811, + "step": 1431 + }, + { + "epoch": 0.5564947051394151, + "grad_norm": 0.21712054312229156, + "learning_rate": 8.898404048267808e-05, + "loss": 1.0642, + "step": 1432 + }, + { + "epoch": 0.5568833187603226, + "grad_norm": 0.22482797503471375, + "learning_rate": 8.890618917866875e-05, + "loss": 1.0742, + "step": 1433 + }, + { + "epoch": 0.55727193238123, + "grad_norm": 0.1974876970052719, + "learning_rate": 8.88283378746594e-05, + "loss": 0.9954, + "step": 1434 + }, + { + "epoch": 0.5576605460021373, + "grad_norm": 0.19162166118621826, + "learning_rate": 8.875048657065007e-05, + "loss": 1.0074, + "step": 1435 + }, + { + "epoch": 0.5580491596230448, + "grad_norm": 0.20439045131206512, + "learning_rate": 8.867263526664072e-05, + "loss": 1.026, + "step": 1436 + }, + { + "epoch": 0.5584377732439522, + "grad_norm": 0.1947651207447052, + "learning_rate": 8.859478396263138e-05, + "loss": 0.9848, + "step": 1437 + }, + { + "epoch": 0.5588263868648596, + "grad_norm": 0.21434316039085388, + "learning_rate": 8.851693265862204e-05, + "loss": 1.0843, + "step": 1438 + }, + { + "epoch": 0.559215000485767, + "grad_norm": 1.3314417600631714, + "learning_rate": 8.84390813546127e-05, + "loss": 1.0356, + "step": 1439 + }, + { + "epoch": 0.5596036141066745, + "grad_norm": 0.20131289958953857, + "learning_rate": 8.836123005060335e-05, + "loss": 1.0214, + "step": 1440 + }, + { + "epoch": 0.5599922277275818, + "grad_norm": 0.21596461534500122, + "learning_rate": 8.828337874659402e-05, + "loss": 1.0962, + "step": 1441 + }, + { + "epoch": 0.5603808413484893, + "grad_norm": 
0.20477193593978882, + "learning_rate": 8.820552744258467e-05, + "loss": 1.0643, + "step": 1442 + }, + { + "epoch": 0.5607694549693967, + "grad_norm": 0.1978107988834381, + "learning_rate": 8.812767613857532e-05, + "loss": 1.0054, + "step": 1443 + }, + { + "epoch": 0.561158068590304, + "grad_norm": 0.219422847032547, + "learning_rate": 8.804982483456599e-05, + "loss": 1.0009, + "step": 1444 + }, + { + "epoch": 0.5615466822112115, + "grad_norm": 0.21489015221595764, + "learning_rate": 8.797197353055664e-05, + "loss": 1.052, + "step": 1445 + }, + { + "epoch": 0.5619352958321189, + "grad_norm": 0.2235930860042572, + "learning_rate": 8.78941222265473e-05, + "loss": 1.037, + "step": 1446 + }, + { + "epoch": 0.5623239094530263, + "grad_norm": 0.19922038912773132, + "learning_rate": 8.781627092253796e-05, + "loss": 1.0006, + "step": 1447 + }, + { + "epoch": 0.5627125230739337, + "grad_norm": 0.24740247428417206, + "learning_rate": 8.773841961852861e-05, + "loss": 1.0753, + "step": 1448 + }, + { + "epoch": 0.5631011366948412, + "grad_norm": 0.2148803174495697, + "learning_rate": 8.766056831451928e-05, + "loss": 1.0712, + "step": 1449 + }, + { + "epoch": 0.5634897503157485, + "grad_norm": 0.19838745892047882, + "learning_rate": 8.758271701050993e-05, + "loss": 1.027, + "step": 1450 + }, + { + "epoch": 0.563878363936656, + "grad_norm": 0.20328201353549957, + "learning_rate": 8.750486570650058e-05, + "loss": 1.0117, + "step": 1451 + }, + { + "epoch": 0.5642669775575634, + "grad_norm": 0.21230114996433258, + "learning_rate": 8.742701440249125e-05, + "loss": 1.0658, + "step": 1452 + }, + { + "epoch": 0.5646555911784708, + "grad_norm": 0.2030259519815445, + "learning_rate": 8.73491630984819e-05, + "loss": 1.0002, + "step": 1453 + }, + { + "epoch": 0.5650442047993782, + "grad_norm": 0.21404659748077393, + "learning_rate": 8.727131179447256e-05, + "loss": 1.0572, + "step": 1454 + }, + { + "epoch": 0.5654328184202856, + "grad_norm": 0.2148464322090149, + "learning_rate": 
8.719346049046322e-05, + "loss": 1.0164, + "step": 1455 + }, + { + "epoch": 0.5658214320411931, + "grad_norm": 0.22083118557929993, + "learning_rate": 8.711560918645387e-05, + "loss": 0.9704, + "step": 1456 + }, + { + "epoch": 0.5662100456621004, + "grad_norm": 0.19305935502052307, + "learning_rate": 8.703775788244453e-05, + "loss": 1.0034, + "step": 1457 + }, + { + "epoch": 0.5665986592830079, + "grad_norm": 0.2100098729133606, + "learning_rate": 8.695990657843518e-05, + "loss": 1.0907, + "step": 1458 + }, + { + "epoch": 0.5669872729039153, + "grad_norm": 0.18947799503803253, + "learning_rate": 8.688205527442585e-05, + "loss": 0.9664, + "step": 1459 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 0.22341710329055786, + "learning_rate": 8.68042039704165e-05, + "loss": 1.0551, + "step": 1460 + }, + { + "epoch": 0.5677645001457301, + "grad_norm": 0.219679057598114, + "learning_rate": 8.672635266640717e-05, + "loss": 1.0398, + "step": 1461 + }, + { + "epoch": 0.5681531137666376, + "grad_norm": 0.22389841079711914, + "learning_rate": 8.664850136239782e-05, + "loss": 1.0472, + "step": 1462 + }, + { + "epoch": 0.5685417273875449, + "grad_norm": 0.21402975916862488, + "learning_rate": 8.657065005838849e-05, + "loss": 1.0224, + "step": 1463 + }, + { + "epoch": 0.5689303410084523, + "grad_norm": 0.20917154848575592, + "learning_rate": 8.649279875437915e-05, + "loss": 1.0526, + "step": 1464 + }, + { + "epoch": 0.5693189546293598, + "grad_norm": 0.2252056896686554, + "learning_rate": 8.64149474503698e-05, + "loss": 1.1064, + "step": 1465 + }, + { + "epoch": 0.5697075682502671, + "grad_norm": 0.21834802627563477, + "learning_rate": 8.633709614636046e-05, + "loss": 1.0318, + "step": 1466 + }, + { + "epoch": 0.5700961818711746, + "grad_norm": 0.21882353723049164, + "learning_rate": 8.625924484235112e-05, + "loss": 1.0285, + "step": 1467 + }, + { + "epoch": 0.570484795492082, + "grad_norm": 0.2028426229953766, + "learning_rate": 8.618139353834177e-05, + "loss": 1.0356, + 
"step": 1468 + }, + { + "epoch": 0.5708734091129894, + "grad_norm": 0.22297166287899017, + "learning_rate": 8.610354223433243e-05, + "loss": 1.0804, + "step": 1469 + }, + { + "epoch": 0.5712620227338968, + "grad_norm": 0.21775268018245697, + "learning_rate": 8.602569093032309e-05, + "loss": 0.9978, + "step": 1470 + }, + { + "epoch": 0.5716506363548043, + "grad_norm": 0.20362353324890137, + "learning_rate": 8.594783962631374e-05, + "loss": 0.9982, + "step": 1471 + }, + { + "epoch": 0.5720392499757117, + "grad_norm": 0.21854591369628906, + "learning_rate": 8.586998832230441e-05, + "loss": 1.0465, + "step": 1472 + }, + { + "epoch": 0.572427863596619, + "grad_norm": 0.20501428842544556, + "learning_rate": 8.579213701829506e-05, + "loss": 1.0468, + "step": 1473 + }, + { + "epoch": 0.5728164772175265, + "grad_norm": 0.21606214344501495, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0477, + "step": 1474 + }, + { + "epoch": 0.5732050908384339, + "grad_norm": 0.2100660502910614, + "learning_rate": 8.563643441027638e-05, + "loss": 1.0071, + "step": 1475 + }, + { + "epoch": 0.5735937044593413, + "grad_norm": 0.21008896827697754, + "learning_rate": 8.555858310626703e-05, + "loss": 0.9914, + "step": 1476 + }, + { + "epoch": 0.5739823180802487, + "grad_norm": 0.22192159295082092, + "learning_rate": 8.54807318022577e-05, + "loss": 1.0385, + "step": 1477 + }, + { + "epoch": 0.5743709317011562, + "grad_norm": 0.20123356580734253, + "learning_rate": 8.540288049824835e-05, + "loss": 1.0062, + "step": 1478 + }, + { + "epoch": 0.5747595453220635, + "grad_norm": 0.201947420835495, + "learning_rate": 8.5325029194239e-05, + "loss": 1.0218, + "step": 1479 + }, + { + "epoch": 0.575148158942971, + "grad_norm": 0.22804415225982666, + "learning_rate": 8.524717789022967e-05, + "loss": 1.0445, + "step": 1480 + }, + { + "epoch": 0.5755367725638784, + "grad_norm": 0.20527036488056183, + "learning_rate": 8.516932658622032e-05, + "loss": 0.9972, + "step": 1481 + }, + { + "epoch": 
0.5759253861847857, + "grad_norm": 0.20298773050308228, + "learning_rate": 8.509147528221098e-05, + "loss": 1.0272, + "step": 1482 + }, + { + "epoch": 0.5763139998056932, + "grad_norm": 0.22500957548618317, + "learning_rate": 8.501362397820164e-05, + "loss": 1.0982, + "step": 1483 + }, + { + "epoch": 0.5767026134266006, + "grad_norm": 0.1950521320104599, + "learning_rate": 8.493577267419229e-05, + "loss": 0.9848, + "step": 1484 + }, + { + "epoch": 0.577091227047508, + "grad_norm": 0.21087585389614105, + "learning_rate": 8.485792137018295e-05, + "loss": 1.0125, + "step": 1485 + }, + { + "epoch": 0.5774798406684154, + "grad_norm": 0.20122238993644714, + "learning_rate": 8.47800700661736e-05, + "loss": 1.0533, + "step": 1486 + }, + { + "epoch": 0.5778684542893229, + "grad_norm": 0.20149008929729462, + "learning_rate": 8.470221876216427e-05, + "loss": 1.0719, + "step": 1487 + }, + { + "epoch": 0.5782570679102302, + "grad_norm": 0.21307213604450226, + "learning_rate": 8.462436745815494e-05, + "loss": 1.0522, + "step": 1488 + }, + { + "epoch": 0.5786456815311377, + "grad_norm": 0.21828554570674896, + "learning_rate": 8.454651615414559e-05, + "loss": 1.0184, + "step": 1489 + }, + { + "epoch": 0.5790342951520451, + "grad_norm": 0.22002705931663513, + "learning_rate": 8.446866485013625e-05, + "loss": 1.0101, + "step": 1490 + }, + { + "epoch": 0.5794229087729524, + "grad_norm": 0.19479142129421234, + "learning_rate": 8.43908135461269e-05, + "loss": 0.9889, + "step": 1491 + }, + { + "epoch": 0.5798115223938599, + "grad_norm": 0.21346086263656616, + "learning_rate": 8.431296224211756e-05, + "loss": 1.0373, + "step": 1492 + }, + { + "epoch": 0.5802001360147673, + "grad_norm": 0.20177558064460754, + "learning_rate": 8.423511093810822e-05, + "loss": 1.0215, + "step": 1493 + }, + { + "epoch": 0.5805887496356748, + "grad_norm": 0.2117915153503418, + "learning_rate": 8.415725963409888e-05, + "loss": 1.0321, + "step": 1494 + }, + { + "epoch": 0.5809773632565821, + "grad_norm": 
0.21304374933242798, + "learning_rate": 8.407940833008954e-05, + "loss": 1.0123, + "step": 1495 + }, + { + "epoch": 0.5813659768774896, + "grad_norm": 0.21173715591430664, + "learning_rate": 8.400155702608019e-05, + "loss": 1.0696, + "step": 1496 + }, + { + "epoch": 0.581754590498397, + "grad_norm": 0.20407019555568695, + "learning_rate": 8.392370572207085e-05, + "loss": 1.0086, + "step": 1497 + }, + { + "epoch": 0.5821432041193044, + "grad_norm": 0.209481880068779, + "learning_rate": 8.384585441806151e-05, + "loss": 0.9975, + "step": 1498 + }, + { + "epoch": 0.5825318177402118, + "grad_norm": 0.22184531390666962, + "learning_rate": 8.376800311405216e-05, + "loss": 1.0956, + "step": 1499 + }, + { + "epoch": 0.5829204313611193, + "grad_norm": 0.21344684064388275, + "learning_rate": 8.369015181004283e-05, + "loss": 1.0685, + "step": 1500 + }, + { + "epoch": 0.5833090449820266, + "grad_norm": 0.19837221503257751, + "learning_rate": 8.361230050603348e-05, + "loss": 1.0149, + "step": 1501 + }, + { + "epoch": 0.583697658602934, + "grad_norm": 0.2133672833442688, + "learning_rate": 8.353444920202413e-05, + "loss": 1.0453, + "step": 1502 + }, + { + "epoch": 0.5840862722238415, + "grad_norm": 0.21944090723991394, + "learning_rate": 8.34565978980148e-05, + "loss": 1.04, + "step": 1503 + }, + { + "epoch": 0.5844748858447488, + "grad_norm": 0.1983667016029358, + "learning_rate": 8.337874659400545e-05, + "loss": 0.9919, + "step": 1504 + }, + { + "epoch": 0.5848634994656563, + "grad_norm": 0.2025303989648819, + "learning_rate": 8.33008952899961e-05, + "loss": 1.0021, + "step": 1505 + }, + { + "epoch": 0.5852521130865637, + "grad_norm": 0.2015170007944107, + "learning_rate": 8.322304398598677e-05, + "loss": 0.9945, + "step": 1506 + }, + { + "epoch": 0.5856407267074711, + "grad_norm": 0.20768272876739502, + "learning_rate": 8.314519268197742e-05, + "loss": 1.0465, + "step": 1507 + }, + { + "epoch": 0.5860293403283785, + "grad_norm": 0.20513412356376648, + "learning_rate": 
8.306734137796809e-05, + "loss": 1.0124, + "step": 1508 + }, + { + "epoch": 0.586417953949286, + "grad_norm": 0.20268471539020538, + "learning_rate": 8.298949007395874e-05, + "loss": 1.0586, + "step": 1509 + }, + { + "epoch": 0.5868065675701933, + "grad_norm": 0.20915938913822174, + "learning_rate": 8.291163876994939e-05, + "loss": 1.0047, + "step": 1510 + }, + { + "epoch": 0.5871951811911007, + "grad_norm": 0.2161451131105423, + "learning_rate": 8.283378746594006e-05, + "loss": 1.0184, + "step": 1511 + }, + { + "epoch": 0.5875837948120082, + "grad_norm": 0.1915571093559265, + "learning_rate": 8.275593616193071e-05, + "loss": 1.0187, + "step": 1512 + }, + { + "epoch": 0.5879724084329155, + "grad_norm": 0.20907992124557495, + "learning_rate": 8.267808485792137e-05, + "loss": 1.0212, + "step": 1513 + }, + { + "epoch": 0.588361022053823, + "grad_norm": 0.20140786468982697, + "learning_rate": 8.260023355391204e-05, + "loss": 1.014, + "step": 1514 + }, + { + "epoch": 0.5887496356747304, + "grad_norm": 0.208252415060997, + "learning_rate": 8.252238224990269e-05, + "loss": 1.0806, + "step": 1515 + }, + { + "epoch": 0.5891382492956379, + "grad_norm": 0.20596125721931458, + "learning_rate": 8.244453094589336e-05, + "loss": 0.9823, + "step": 1516 + }, + { + "epoch": 0.5895268629165452, + "grad_norm": 0.18832452595233917, + "learning_rate": 8.236667964188401e-05, + "loss": 0.9925, + "step": 1517 + }, + { + "epoch": 0.5899154765374527, + "grad_norm": 0.2078334391117096, + "learning_rate": 8.228882833787467e-05, + "loss": 1.0587, + "step": 1518 + }, + { + "epoch": 0.59030409015836, + "grad_norm": 0.20121365785598755, + "learning_rate": 8.221097703386533e-05, + "loss": 1.0607, + "step": 1519 + }, + { + "epoch": 0.5906927037792674, + "grad_norm": 0.19666099548339844, + "learning_rate": 8.213312572985598e-05, + "loss": 1.0124, + "step": 1520 + }, + { + "epoch": 0.5910813174001749, + "grad_norm": 0.20176006853580475, + "learning_rate": 8.205527442584664e-05, + "loss": 1.0297, + 
"step": 1521 + }, + { + "epoch": 0.5914699310210823, + "grad_norm": 0.2038574516773224, + "learning_rate": 8.19774231218373e-05, + "loss": 1.0311, + "step": 1522 + }, + { + "epoch": 0.5918585446419897, + "grad_norm": 0.19517424702644348, + "learning_rate": 8.189957181782796e-05, + "loss": 0.9945, + "step": 1523 + }, + { + "epoch": 0.5922471582628971, + "grad_norm": 0.19599094986915588, + "learning_rate": 8.182172051381861e-05, + "loss": 1.0255, + "step": 1524 + }, + { + "epoch": 0.5926357718838046, + "grad_norm": 0.21409402787685394, + "learning_rate": 8.174386920980927e-05, + "loss": 1.0868, + "step": 1525 + }, + { + "epoch": 0.5930243855047119, + "grad_norm": 0.19567830860614777, + "learning_rate": 8.166601790579993e-05, + "loss": 0.9654, + "step": 1526 + }, + { + "epoch": 0.5934129991256194, + "grad_norm": 0.2275007963180542, + "learning_rate": 8.158816660179058e-05, + "loss": 1.0867, + "step": 1527 + }, + { + "epoch": 0.5938016127465268, + "grad_norm": 0.19826427102088928, + "learning_rate": 8.151031529778123e-05, + "loss": 1.0301, + "step": 1528 + }, + { + "epoch": 0.5941902263674341, + "grad_norm": 0.2051352709531784, + "learning_rate": 8.14324639937719e-05, + "loss": 1.023, + "step": 1529 + }, + { + "epoch": 0.5945788399883416, + "grad_norm": 0.19492043554782867, + "learning_rate": 8.135461268976255e-05, + "loss": 0.9608, + "step": 1530 + }, + { + "epoch": 0.594967453609249, + "grad_norm": 0.21521608531475067, + "learning_rate": 8.127676138575322e-05, + "loss": 1.0612, + "step": 1531 + }, + { + "epoch": 0.5953560672301564, + "grad_norm": 0.22739367187023163, + "learning_rate": 8.119891008174387e-05, + "loss": 1.0603, + "step": 1532 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.20334595441818237, + "learning_rate": 8.112105877773452e-05, + "loss": 1.0191, + "step": 1533 + }, + { + "epoch": 0.5961332944719713, + "grad_norm": 0.20985397696495056, + "learning_rate": 8.104320747372519e-05, + "loss": 1.0721, + "step": 1534 + }, + { + "epoch": 
0.5965219080928786, + "grad_norm": 0.20472954213619232, + "learning_rate": 8.096535616971584e-05, + "loss": 1.0556, + "step": 1535 + }, + { + "epoch": 0.5969105217137861, + "grad_norm": 0.2112964689731598, + "learning_rate": 8.08875048657065e-05, + "loss": 1.0016, + "step": 1536 + }, + { + "epoch": 0.5972991353346935, + "grad_norm": 0.21330617368221283, + "learning_rate": 8.080965356169716e-05, + "loss": 1.0783, + "step": 1537 + }, + { + "epoch": 0.5976877489556008, + "grad_norm": 0.20907814800739288, + "learning_rate": 8.073180225768782e-05, + "loss": 1.071, + "step": 1538 + }, + { + "epoch": 0.5980763625765083, + "grad_norm": 0.2038964033126831, + "learning_rate": 8.065395095367848e-05, + "loss": 1.0039, + "step": 1539 + }, + { + "epoch": 0.5984649761974157, + "grad_norm": 0.2175542712211609, + "learning_rate": 8.057609964966914e-05, + "loss": 1.0015, + "step": 1540 + }, + { + "epoch": 0.5988535898183232, + "grad_norm": 0.21474529802799225, + "learning_rate": 8.049824834565979e-05, + "loss": 1.0273, + "step": 1541 + }, + { + "epoch": 0.5992422034392305, + "grad_norm": 0.21428482234477997, + "learning_rate": 8.042039704165046e-05, + "loss": 1.0767, + "step": 1542 + }, + { + "epoch": 0.599630817060138, + "grad_norm": 0.20287524163722992, + "learning_rate": 8.034254573764111e-05, + "loss": 1.064, + "step": 1543 + }, + { + "epoch": 0.6000194306810454, + "grad_norm": 0.20689848065376282, + "learning_rate": 8.026469443363178e-05, + "loss": 1.0084, + "step": 1544 + }, + { + "epoch": 0.6004080443019528, + "grad_norm": 0.22451332211494446, + "learning_rate": 8.018684312962243e-05, + "loss": 1.1039, + "step": 1545 + }, + { + "epoch": 0.6007966579228602, + "grad_norm": 0.21381956338882446, + "learning_rate": 8.010899182561308e-05, + "loss": 1.0551, + "step": 1546 + }, + { + "epoch": 0.6011852715437677, + "grad_norm": 0.20108483731746674, + "learning_rate": 8.003114052160375e-05, + "loss": 1.0326, + "step": 1547 + }, + { + "epoch": 0.601573885164675, + "grad_norm": 
0.19739678502082825, + "learning_rate": 7.99532892175944e-05, + "loss": 1.0319, + "step": 1548 + }, + { + "epoch": 0.6019624987855824, + "grad_norm": 0.21635359525680542, + "learning_rate": 7.987543791358506e-05, + "loss": 1.0465, + "step": 1549 + }, + { + "epoch": 0.6023511124064899, + "grad_norm": 0.1949319988489151, + "learning_rate": 7.979758660957572e-05, + "loss": 1.0026, + "step": 1550 + }, + { + "epoch": 0.6027397260273972, + "grad_norm": 0.1989699900150299, + "learning_rate": 7.971973530556637e-05, + "loss": 1.021, + "step": 1551 + }, + { + "epoch": 0.6031283396483047, + "grad_norm": 0.24031391739845276, + "learning_rate": 7.964188400155703e-05, + "loss": 1.0293, + "step": 1552 + }, + { + "epoch": 0.6035169532692121, + "grad_norm": 0.21247251331806183, + "learning_rate": 7.956403269754769e-05, + "loss": 1.023, + "step": 1553 + }, + { + "epoch": 0.6039055668901195, + "grad_norm": 0.21565628051757812, + "learning_rate": 7.948618139353835e-05, + "loss": 1.1027, + "step": 1554 + }, + { + "epoch": 0.6042941805110269, + "grad_norm": 0.21207931637763977, + "learning_rate": 7.9408330089529e-05, + "loss": 1.0634, + "step": 1555 + }, + { + "epoch": 0.6046827941319344, + "grad_norm": 0.21354155242443085, + "learning_rate": 7.933047878551965e-05, + "loss": 1.0433, + "step": 1556 + }, + { + "epoch": 0.6050714077528417, + "grad_norm": 0.21708370745182037, + "learning_rate": 7.925262748151032e-05, + "loss": 1.0499, + "step": 1557 + }, + { + "epoch": 0.6054600213737491, + "grad_norm": 0.2051447182893753, + "learning_rate": 7.917477617750097e-05, + "loss": 1.0042, + "step": 1558 + }, + { + "epoch": 0.6058486349946566, + "grad_norm": 0.18768000602722168, + "learning_rate": 7.909692487349164e-05, + "loss": 1.009, + "step": 1559 + }, + { + "epoch": 0.6062372486155639, + "grad_norm": 0.2142931967973709, + "learning_rate": 7.901907356948229e-05, + "loss": 1.0458, + "step": 1560 + }, + { + "epoch": 0.6066258622364714, + "grad_norm": 0.21006444096565247, + "learning_rate": 
7.894122226547294e-05, + "loss": 1.0286, + "step": 1561 + }, + { + "epoch": 0.6070144758573788, + "grad_norm": 0.2187039703130722, + "learning_rate": 7.886337096146361e-05, + "loss": 1.0103, + "step": 1562 + }, + { + "epoch": 0.6074030894782863, + "grad_norm": 0.19863669574260712, + "learning_rate": 7.878551965745426e-05, + "loss": 0.9925, + "step": 1563 + }, + { + "epoch": 0.6077917030991936, + "grad_norm": 0.21771976351737976, + "learning_rate": 7.870766835344493e-05, + "loss": 0.9853, + "step": 1564 + }, + { + "epoch": 0.6081803167201011, + "grad_norm": 0.21714983880519867, + "learning_rate": 7.862981704943558e-05, + "loss": 1.0123, + "step": 1565 + }, + { + "epoch": 0.6085689303410085, + "grad_norm": 0.2251398265361786, + "learning_rate": 7.855196574542624e-05, + "loss": 1.0265, + "step": 1566 + }, + { + "epoch": 0.6089575439619158, + "grad_norm": 0.22089716792106628, + "learning_rate": 7.84741144414169e-05, + "loss": 1.0689, + "step": 1567 + }, + { + "epoch": 0.6093461575828233, + "grad_norm": 0.2453841269016266, + "learning_rate": 7.839626313740756e-05, + "loss": 1.0185, + "step": 1568 + }, + { + "epoch": 0.6097347712037307, + "grad_norm": 0.21866528689861298, + "learning_rate": 7.831841183339821e-05, + "loss": 1.0361, + "step": 1569 + }, + { + "epoch": 0.6101233848246381, + "grad_norm": 0.22421486675739288, + "learning_rate": 7.824056052938888e-05, + "loss": 1.024, + "step": 1570 + }, + { + "epoch": 0.6105119984455455, + "grad_norm": 0.21107137203216553, + "learning_rate": 7.816270922537953e-05, + "loss": 1.0335, + "step": 1571 + }, + { + "epoch": 0.610900612066453, + "grad_norm": 0.20731772482395172, + "learning_rate": 7.80848579213702e-05, + "loss": 1.0563, + "step": 1572 + }, + { + "epoch": 0.6112892256873603, + "grad_norm": 0.19535884261131287, + "learning_rate": 7.800700661736085e-05, + "loss": 0.9698, + "step": 1573 + }, + { + "epoch": 0.6116778393082678, + "grad_norm": 0.20449021458625793, + "learning_rate": 7.79291553133515e-05, + "loss": 1.0125, + 
"step": 1574 + }, + { + "epoch": 0.6120664529291752, + "grad_norm": 0.19576509296894073, + "learning_rate": 7.785130400934217e-05, + "loss": 0.9326, + "step": 1575 + }, + { + "epoch": 0.6124550665500825, + "grad_norm": 0.18914124369621277, + "learning_rate": 7.777345270533282e-05, + "loss": 0.9939, + "step": 1576 + }, + { + "epoch": 0.61284368017099, + "grad_norm": 0.21239091455936432, + "learning_rate": 7.769560140132348e-05, + "loss": 1.0271, + "step": 1577 + }, + { + "epoch": 0.6132322937918974, + "grad_norm": 0.22204811871051788, + "learning_rate": 7.761775009731414e-05, + "loss": 1.0524, + "step": 1578 + }, + { + "epoch": 0.6136209074128048, + "grad_norm": 0.20047850906848907, + "learning_rate": 7.753989879330479e-05, + "loss": 1.0076, + "step": 1579 + }, + { + "epoch": 0.6140095210337122, + "grad_norm": 0.22619746625423431, + "learning_rate": 7.746204748929545e-05, + "loss": 1.0611, + "step": 1580 + }, + { + "epoch": 0.6143981346546197, + "grad_norm": 0.2500879466533661, + "learning_rate": 7.73841961852861e-05, + "loss": 1.0364, + "step": 1581 + }, + { + "epoch": 0.614786748275527, + "grad_norm": 0.23486928641796112, + "learning_rate": 7.730634488127676e-05, + "loss": 1.0472, + "step": 1582 + }, + { + "epoch": 0.6151753618964345, + "grad_norm": 0.19849038124084473, + "learning_rate": 7.722849357726742e-05, + "loss": 0.9847, + "step": 1583 + }, + { + "epoch": 0.6155639755173419, + "grad_norm": 0.21516263484954834, + "learning_rate": 7.715064227325807e-05, + "loss": 1.0351, + "step": 1584 + }, + { + "epoch": 0.6159525891382492, + "grad_norm": 0.20137760043144226, + "learning_rate": 7.707279096924874e-05, + "loss": 0.9879, + "step": 1585 + }, + { + "epoch": 0.6163412027591567, + "grad_norm": 0.2146228402853012, + "learning_rate": 7.699493966523939e-05, + "loss": 1.0792, + "step": 1586 + }, + { + "epoch": 0.6167298163800641, + "grad_norm": 0.19929760694503784, + "learning_rate": 7.691708836123004e-05, + "loss": 1.0313, + "step": 1587 + }, + { + "epoch": 
0.6171184300009716, + "grad_norm": 0.201123908162117, + "learning_rate": 7.683923705722071e-05, + "loss": 1.0279, + "step": 1588 + }, + { + "epoch": 0.6175070436218789, + "grad_norm": 0.2154105007648468, + "learning_rate": 7.676138575321136e-05, + "loss": 1.075, + "step": 1589 + }, + { + "epoch": 0.6178956572427864, + "grad_norm": 0.2028442770242691, + "learning_rate": 7.668353444920203e-05, + "loss": 0.9771, + "step": 1590 + }, + { + "epoch": 0.6182842708636938, + "grad_norm": 0.18003074824810028, + "learning_rate": 7.660568314519268e-05, + "loss": 0.9677, + "step": 1591 + }, + { + "epoch": 0.6186728844846012, + "grad_norm": 0.23250891268253326, + "learning_rate": 7.652783184118335e-05, + "loss": 1.015, + "step": 1592 + }, + { + "epoch": 0.6190614981055086, + "grad_norm": 0.2047244906425476, + "learning_rate": 7.6449980537174e-05, + "loss": 1.0044, + "step": 1593 + }, + { + "epoch": 0.6194501117264161, + "grad_norm": 0.20011259615421295, + "learning_rate": 7.637212923316466e-05, + "loss": 1.0089, + "step": 1594 + }, + { + "epoch": 0.6198387253473234, + "grad_norm": 0.2212608903646469, + "learning_rate": 7.629427792915533e-05, + "loss": 1.0457, + "step": 1595 + }, + { + "epoch": 0.6202273389682308, + "grad_norm": 0.22725115716457367, + "learning_rate": 7.621642662514598e-05, + "loss": 1.1198, + "step": 1596 + }, + { + "epoch": 0.6206159525891383, + "grad_norm": 0.2065306007862091, + "learning_rate": 7.613857532113663e-05, + "loss": 1.0572, + "step": 1597 + }, + { + "epoch": 0.6210045662100456, + "grad_norm": 0.2132783830165863, + "learning_rate": 7.60607240171273e-05, + "loss": 1.0332, + "step": 1598 + }, + { + "epoch": 0.6213931798309531, + "grad_norm": 0.20527103543281555, + "learning_rate": 7.598287271311795e-05, + "loss": 1.0156, + "step": 1599 + }, + { + "epoch": 0.6217817934518605, + "grad_norm": 0.23608024418354034, + "learning_rate": 7.59050214091086e-05, + "loss": 1.0379, + "step": 1600 + }, + { + "epoch": 0.6221704070727679, + "grad_norm": 
0.22227297723293304, + "learning_rate": 7.582717010509927e-05, + "loss": 1.0507, + "step": 1601 + }, + { + "epoch": 0.6225590206936753, + "grad_norm": 0.22359615564346313, + "learning_rate": 7.574931880108992e-05, + "loss": 1.0705, + "step": 1602 + }, + { + "epoch": 0.6229476343145828, + "grad_norm": 0.20478755235671997, + "learning_rate": 7.567146749708059e-05, + "loss": 1.0309, + "step": 1603 + }, + { + "epoch": 0.6233362479354901, + "grad_norm": 0.2223423272371292, + "learning_rate": 7.559361619307124e-05, + "loss": 1.0386, + "step": 1604 + }, + { + "epoch": 0.6237248615563975, + "grad_norm": 0.21232105791568756, + "learning_rate": 7.551576488906189e-05, + "loss": 1.0353, + "step": 1605 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 0.22431129217147827, + "learning_rate": 7.543791358505256e-05, + "loss": 1.1017, + "step": 1606 + }, + { + "epoch": 0.6245020887982123, + "grad_norm": 0.20826031267642975, + "learning_rate": 7.536006228104321e-05, + "loss": 1.0172, + "step": 1607 + }, + { + "epoch": 0.6248907024191198, + "grad_norm": 0.2803161144256592, + "learning_rate": 7.528221097703387e-05, + "loss": 1.0554, + "step": 1608 + }, + { + "epoch": 0.6252793160400272, + "grad_norm": 0.2185174971818924, + "learning_rate": 7.520435967302453e-05, + "loss": 0.9842, + "step": 1609 + }, + { + "epoch": 0.6256679296609347, + "grad_norm": 0.2091478854417801, + "learning_rate": 7.512650836901518e-05, + "loss": 0.9783, + "step": 1610 + }, + { + "epoch": 0.626056543281842, + "grad_norm": 0.22342967987060547, + "learning_rate": 7.504865706500584e-05, + "loss": 0.9891, + "step": 1611 + }, + { + "epoch": 0.6264451569027495, + "grad_norm": 0.195283442735672, + "learning_rate": 7.49708057609965e-05, + "loss": 0.9654, + "step": 1612 + }, + { + "epoch": 0.6268337705236569, + "grad_norm": 0.21048255264759064, + "learning_rate": 7.489295445698716e-05, + "loss": 1.0112, + "step": 1613 + }, + { + "epoch": 0.6272223841445642, + "grad_norm": 0.21405541896820068, + "learning_rate": 
7.481510315297781e-05, + "loss": 1.0498, + "step": 1614 + }, + { + "epoch": 0.6276109977654717, + "grad_norm": 0.2144453227519989, + "learning_rate": 7.473725184896846e-05, + "loss": 1.0487, + "step": 1615 + }, + { + "epoch": 0.627999611386379, + "grad_norm": 0.21963326632976532, + "learning_rate": 7.465940054495913e-05, + "loss": 1.0634, + "step": 1616 + }, + { + "epoch": 0.6283882250072865, + "grad_norm": 0.20100601017475128, + "learning_rate": 7.458154924094978e-05, + "loss": 1.0407, + "step": 1617 + }, + { + "epoch": 0.6287768386281939, + "grad_norm": 0.19469478726387024, + "learning_rate": 7.450369793694045e-05, + "loss": 0.9923, + "step": 1618 + }, + { + "epoch": 0.6291654522491014, + "grad_norm": 0.2114047408103943, + "learning_rate": 7.442584663293111e-05, + "loss": 1.0263, + "step": 1619 + }, + { + "epoch": 0.6295540658700087, + "grad_norm": 0.21080389618873596, + "learning_rate": 7.434799532892177e-05, + "loss": 1.0012, + "step": 1620 + }, + { + "epoch": 0.6299426794909162, + "grad_norm": 0.20366831123828888, + "learning_rate": 7.427014402491243e-05, + "loss": 1.0254, + "step": 1621 + }, + { + "epoch": 0.6303312931118236, + "grad_norm": 0.209821879863739, + "learning_rate": 7.419229272090308e-05, + "loss": 0.9416, + "step": 1622 + }, + { + "epoch": 0.6307199067327309, + "grad_norm": 0.2228868007659912, + "learning_rate": 7.411444141689374e-05, + "loss": 1.0128, + "step": 1623 + }, + { + "epoch": 0.6311085203536384, + "grad_norm": 0.19673995673656464, + "learning_rate": 7.40365901128844e-05, + "loss": 0.9709, + "step": 1624 + }, + { + "epoch": 0.6314971339745458, + "grad_norm": 0.21590839326381683, + "learning_rate": 7.395873880887505e-05, + "loss": 1.0251, + "step": 1625 + }, + { + "epoch": 0.6318857475954532, + "grad_norm": 0.20200593769550323, + "learning_rate": 7.388088750486572e-05, + "loss": 1.0307, + "step": 1626 + }, + { + "epoch": 0.6322743612163606, + "grad_norm": 0.19623909890651703, + "learning_rate": 7.380303620085637e-05, + "loss": 1.0375, + 
"step": 1627 + }, + { + "epoch": 0.6326629748372681, + "grad_norm": 0.19878128170967102, + "learning_rate": 7.372518489684702e-05, + "loss": 0.9844, + "step": 1628 + }, + { + "epoch": 0.6330515884581754, + "grad_norm": 0.21292422711849213, + "learning_rate": 7.364733359283769e-05, + "loss": 1.0228, + "step": 1629 + }, + { + "epoch": 0.6334402020790829, + "grad_norm": 0.1915559619665146, + "learning_rate": 7.356948228882834e-05, + "loss": 0.9818, + "step": 1630 + }, + { + "epoch": 0.6338288156999903, + "grad_norm": 0.2264430969953537, + "learning_rate": 7.3491630984819e-05, + "loss": 1.146, + "step": 1631 + }, + { + "epoch": 0.6342174293208978, + "grad_norm": 0.19332270324230194, + "learning_rate": 7.341377968080966e-05, + "loss": 1.0007, + "step": 1632 + }, + { + "epoch": 0.6346060429418051, + "grad_norm": 0.217147096991539, + "learning_rate": 7.333592837680031e-05, + "loss": 1.0498, + "step": 1633 + }, + { + "epoch": 0.6349946565627125, + "grad_norm": 0.22200679779052734, + "learning_rate": 7.325807707279098e-05, + "loss": 1.0358, + "step": 1634 + }, + { + "epoch": 0.63538327018362, + "grad_norm": 0.19485117495059967, + "learning_rate": 7.318022576878163e-05, + "loss": 0.9717, + "step": 1635 + }, + { + "epoch": 0.6357718838045273, + "grad_norm": 0.20595680177211761, + "learning_rate": 7.310237446477228e-05, + "loss": 1.0195, + "step": 1636 + }, + { + "epoch": 0.6361604974254348, + "grad_norm": 0.21184709668159485, + "learning_rate": 7.302452316076294e-05, + "loss": 1.0354, + "step": 1637 + }, + { + "epoch": 0.6365491110463422, + "grad_norm": 0.22607794404029846, + "learning_rate": 7.29466718567536e-05, + "loss": 1.0217, + "step": 1638 + }, + { + "epoch": 0.6369377246672496, + "grad_norm": 0.20236065983772278, + "learning_rate": 7.286882055274426e-05, + "loss": 1.0441, + "step": 1639 + }, + { + "epoch": 0.637326338288157, + "grad_norm": 0.19979622960090637, + "learning_rate": 7.279096924873491e-05, + "loss": 1.0105, + "step": 1640 + }, + { + "epoch": 
0.6377149519090645, + "grad_norm": 0.2655459940433502, + "learning_rate": 7.271311794472557e-05, + "loss": 1.0726, + "step": 1641 + }, + { + "epoch": 0.6381035655299718, + "grad_norm": 0.25107496976852417, + "learning_rate": 7.263526664071623e-05, + "loss": 1.037, + "step": 1642 + }, + { + "epoch": 0.6384921791508792, + "grad_norm": 0.19250229001045227, + "learning_rate": 7.255741533670688e-05, + "loss": 0.9741, + "step": 1643 + }, + { + "epoch": 0.6388807927717867, + "grad_norm": 0.19324181973934174, + "learning_rate": 7.247956403269755e-05, + "loss": 1.0333, + "step": 1644 + }, + { + "epoch": 0.639269406392694, + "grad_norm": 0.22267483174800873, + "learning_rate": 7.240171272868822e-05, + "loss": 1.0313, + "step": 1645 + }, + { + "epoch": 0.6396580200136015, + "grad_norm": 0.2775348722934723, + "learning_rate": 7.232386142467887e-05, + "loss": 1.0686, + "step": 1646 + }, + { + "epoch": 0.6400466336345089, + "grad_norm": 0.1886623501777649, + "learning_rate": 7.224601012066953e-05, + "loss": 1.0029, + "step": 1647 + }, + { + "epoch": 0.6404352472554163, + "grad_norm": 0.20303374528884888, + "learning_rate": 7.216815881666019e-05, + "loss": 1.0346, + "step": 1648 + }, + { + "epoch": 0.6408238608763237, + "grad_norm": 0.20815756916999817, + "learning_rate": 7.209030751265085e-05, + "loss": 1.0258, + "step": 1649 + }, + { + "epoch": 0.6412124744972312, + "grad_norm": 0.22055703401565552, + "learning_rate": 7.20124562086415e-05, + "loss": 1.0215, + "step": 1650 + }, + { + "epoch": 0.6416010881181385, + "grad_norm": 0.20248562097549438, + "learning_rate": 7.193460490463215e-05, + "loss": 0.9979, + "step": 1651 + }, + { + "epoch": 0.6419897017390459, + "grad_norm": 0.2093247026205063, + "learning_rate": 7.185675360062282e-05, + "loss": 1.0605, + "step": 1652 + }, + { + "epoch": 0.6423783153599534, + "grad_norm": 0.22276204824447632, + "learning_rate": 7.177890229661347e-05, + "loss": 1.0788, + "step": 1653 + }, + { + "epoch": 0.6427669289808607, + "grad_norm": 
0.19959624111652374, + "learning_rate": 7.170105099260412e-05, + "loss": 0.9954, + "step": 1654 + }, + { + "epoch": 0.6431555426017682, + "grad_norm": 0.20173248648643494, + "learning_rate": 7.162319968859479e-05, + "loss": 1.003, + "step": 1655 + }, + { + "epoch": 0.6435441562226756, + "grad_norm": 0.207533061504364, + "learning_rate": 7.154534838458544e-05, + "loss": 1.043, + "step": 1656 + }, + { + "epoch": 0.643932769843583, + "grad_norm": 0.21928350627422333, + "learning_rate": 7.146749708057611e-05, + "loss": 1.0472, + "step": 1657 + }, + { + "epoch": 0.6443213834644904, + "grad_norm": 0.2567078173160553, + "learning_rate": 7.138964577656676e-05, + "loss": 1.0946, + "step": 1658 + }, + { + "epoch": 0.6447099970853979, + "grad_norm": 0.19454176723957062, + "learning_rate": 7.131179447255741e-05, + "loss": 0.9437, + "step": 1659 + }, + { + "epoch": 0.6450986107063053, + "grad_norm": 0.19198423624038696, + "learning_rate": 7.123394316854808e-05, + "loss": 0.9976, + "step": 1660 + }, + { + "epoch": 0.6454872243272126, + "grad_norm": 0.1929445117712021, + "learning_rate": 7.115609186453873e-05, + "loss": 1.0279, + "step": 1661 + }, + { + "epoch": 0.6458758379481201, + "grad_norm": 0.2041027694940567, + "learning_rate": 7.10782405605294e-05, + "loss": 1.0458, + "step": 1662 + }, + { + "epoch": 0.6462644515690275, + "grad_norm": 0.23750995099544525, + "learning_rate": 7.100038925652005e-05, + "loss": 1.0916, + "step": 1663 + }, + { + "epoch": 0.6466530651899349, + "grad_norm": 0.1971994787454605, + "learning_rate": 7.09225379525107e-05, + "loss": 0.951, + "step": 1664 + }, + { + "epoch": 0.6470416788108423, + "grad_norm": 0.20459246635437012, + "learning_rate": 7.084468664850136e-05, + "loss": 0.9653, + "step": 1665 + }, + { + "epoch": 0.6474302924317498, + "grad_norm": 0.2137187272310257, + "learning_rate": 7.076683534449202e-05, + "loss": 1.0291, + "step": 1666 + }, + { + "epoch": 0.6478189060526571, + "grad_norm": 0.21235258877277374, + "learning_rate": 
7.068898404048268e-05, + "loss": 1.0104, + "step": 1667 + }, + { + "epoch": 0.6482075196735646, + "grad_norm": 0.23120944201946259, + "learning_rate": 7.061113273647333e-05, + "loss": 1.0693, + "step": 1668 + }, + { + "epoch": 0.648596133294472, + "grad_norm": 1.38257896900177, + "learning_rate": 7.053328143246399e-05, + "loss": 1.0339, + "step": 1669 + }, + { + "epoch": 0.6489847469153793, + "grad_norm": 0.20898790657520294, + "learning_rate": 7.045543012845465e-05, + "loss": 1.004, + "step": 1670 + }, + { + "epoch": 0.6493733605362868, + "grad_norm": 0.20251236855983734, + "learning_rate": 7.037757882444532e-05, + "loss": 0.9992, + "step": 1671 + }, + { + "epoch": 0.6497619741571942, + "grad_norm": 0.2358030527830124, + "learning_rate": 7.029972752043597e-05, + "loss": 0.9854, + "step": 1672 + }, + { + "epoch": 0.6501505877781016, + "grad_norm": 0.18945704400539398, + "learning_rate": 7.022187621642664e-05, + "loss": 0.9677, + "step": 1673 + }, + { + "epoch": 0.650539201399009, + "grad_norm": 0.1965213567018509, + "learning_rate": 7.014402491241729e-05, + "loss": 1.0118, + "step": 1674 + }, + { + "epoch": 0.6509278150199165, + "grad_norm": 0.2340148687362671, + "learning_rate": 7.006617360840795e-05, + "loss": 1.0312, + "step": 1675 + }, + { + "epoch": 0.6513164286408238, + "grad_norm": 0.1992296278476715, + "learning_rate": 6.99883223043986e-05, + "loss": 1.0155, + "step": 1676 + }, + { + "epoch": 0.6517050422617313, + "grad_norm": 0.20410223305225372, + "learning_rate": 6.991047100038926e-05, + "loss": 1.0646, + "step": 1677 + }, + { + "epoch": 0.6520936558826387, + "grad_norm": 0.19254536926746368, + "learning_rate": 6.983261969637992e-05, + "loss": 0.9538, + "step": 1678 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 0.19980847835540771, + "learning_rate": 6.975476839237057e-05, + "loss": 0.9912, + "step": 1679 + }, + { + "epoch": 0.6528708831244535, + "grad_norm": 0.19503261148929596, + "learning_rate": 6.967691708836124e-05, + "loss": 0.9844, + 
"step": 1680 + }, + { + "epoch": 0.6532594967453609, + "grad_norm": 0.22375883162021637, + "learning_rate": 6.959906578435189e-05, + "loss": 1.1266, + "step": 1681 + }, + { + "epoch": 0.6536481103662684, + "grad_norm": 0.21456514298915863, + "learning_rate": 6.952121448034254e-05, + "loss": 1.0902, + "step": 1682 + }, + { + "epoch": 0.6540367239871757, + "grad_norm": 0.20348122715950012, + "learning_rate": 6.944336317633321e-05, + "loss": 1.0228, + "step": 1683 + }, + { + "epoch": 0.6544253376080832, + "grad_norm": 0.21647393703460693, + "learning_rate": 6.936551187232386e-05, + "loss": 1.0653, + "step": 1684 + }, + { + "epoch": 0.6548139512289906, + "grad_norm": 0.20160923898220062, + "learning_rate": 6.928766056831453e-05, + "loss": 1.0249, + "step": 1685 + }, + { + "epoch": 0.655202564849898, + "grad_norm": 0.20070499181747437, + "learning_rate": 6.920980926430518e-05, + "loss": 1.0585, + "step": 1686 + }, + { + "epoch": 0.6555911784708054, + "grad_norm": 0.2656902074813843, + "learning_rate": 6.913195796029583e-05, + "loss": 1.0042, + "step": 1687 + }, + { + "epoch": 0.6559797920917129, + "grad_norm": 0.1934545785188675, + "learning_rate": 6.90541066562865e-05, + "loss": 0.9831, + "step": 1688 + }, + { + "epoch": 0.6563684057126202, + "grad_norm": 0.21719245612621307, + "learning_rate": 6.897625535227715e-05, + "loss": 0.9934, + "step": 1689 + }, + { + "epoch": 0.6567570193335276, + "grad_norm": 0.20906969904899597, + "learning_rate": 6.889840404826782e-05, + "loss": 1.023, + "step": 1690 + }, + { + "epoch": 0.6571456329544351, + "grad_norm": 0.225227490067482, + "learning_rate": 6.882055274425847e-05, + "loss": 1.0265, + "step": 1691 + }, + { + "epoch": 0.6575342465753424, + "grad_norm": 0.22766710817813873, + "learning_rate": 6.874270144024912e-05, + "loss": 1.0306, + "step": 1692 + }, + { + "epoch": 0.6579228601962499, + "grad_norm": 0.20964065194129944, + "learning_rate": 6.866485013623978e-05, + "loss": 0.9431, + "step": 1693 + }, + { + "epoch": 
0.6583114738171573, + "grad_norm": 0.19821231067180634, + "learning_rate": 6.858699883223044e-05, + "loss": 0.9959, + "step": 1694 + }, + { + "epoch": 0.6587000874380647, + "grad_norm": 0.2071307748556137, + "learning_rate": 6.85091475282211e-05, + "loss": 1.0332, + "step": 1695 + }, + { + "epoch": 0.6590887010589721, + "grad_norm": 0.27962490916252136, + "learning_rate": 6.843129622421175e-05, + "loss": 0.9755, + "step": 1696 + }, + { + "epoch": 0.6594773146798796, + "grad_norm": 0.21582698822021484, + "learning_rate": 6.835344492020242e-05, + "loss": 1.0305, + "step": 1697 + }, + { + "epoch": 0.6598659283007869, + "grad_norm": 0.1872921586036682, + "learning_rate": 6.827559361619307e-05, + "loss": 0.9693, + "step": 1698 + }, + { + "epoch": 0.6602545419216943, + "grad_norm": 0.27033379673957825, + "learning_rate": 6.819774231218374e-05, + "loss": 1.0756, + "step": 1699 + }, + { + "epoch": 0.6606431555426018, + "grad_norm": 0.2010008543729782, + "learning_rate": 6.811989100817439e-05, + "loss": 1.0077, + "step": 1700 + }, + { + "epoch": 0.6610317691635091, + "grad_norm": 0.20637495815753937, + "learning_rate": 6.804203970416506e-05, + "loss": 1.0208, + "step": 1701 + }, + { + "epoch": 0.6614203827844166, + "grad_norm": 0.21331818401813507, + "learning_rate": 6.796418840015571e-05, + "loss": 1.0242, + "step": 1702 + }, + { + "epoch": 0.661808996405324, + "grad_norm": 0.2092941552400589, + "learning_rate": 6.788633709614637e-05, + "loss": 1.0949, + "step": 1703 + }, + { + "epoch": 0.6621976100262315, + "grad_norm": 0.22332265973091125, + "learning_rate": 6.780848579213703e-05, + "loss": 1.1068, + "step": 1704 + }, + { + "epoch": 0.6625862236471388, + "grad_norm": 0.20077067613601685, + "learning_rate": 6.773063448812768e-05, + "loss": 0.9801, + "step": 1705 + }, + { + "epoch": 0.6629748372680463, + "grad_norm": 0.2057008296251297, + "learning_rate": 6.765278318411834e-05, + "loss": 1.0058, + "step": 1706 + }, + { + "epoch": 0.6633634508889537, + "grad_norm": 
0.20337353646755219, + "learning_rate": 6.7574931880109e-05, + "loss": 1.0141, + "step": 1707 + }, + { + "epoch": 0.663752064509861, + "grad_norm": 0.22756130993366241, + "learning_rate": 6.749708057609966e-05, + "loss": 1.0287, + "step": 1708 + }, + { + "epoch": 0.6641406781307685, + "grad_norm": 0.2052423506975174, + "learning_rate": 6.741922927209031e-05, + "loss": 1.0069, + "step": 1709 + }, + { + "epoch": 0.6645292917516759, + "grad_norm": 0.1988023817539215, + "learning_rate": 6.734137796808096e-05, + "loss": 0.9761, + "step": 1710 + }, + { + "epoch": 0.6649179053725833, + "grad_norm": 0.20491188764572144, + "learning_rate": 6.726352666407163e-05, + "loss": 0.9767, + "step": 1711 + }, + { + "epoch": 0.6653065189934907, + "grad_norm": 0.18790274858474731, + "learning_rate": 6.718567536006228e-05, + "loss": 0.9944, + "step": 1712 + }, + { + "epoch": 0.6656951326143982, + "grad_norm": 0.19979891180992126, + "learning_rate": 6.710782405605293e-05, + "loss": 1.0842, + "step": 1713 + }, + { + "epoch": 0.6660837462353055, + "grad_norm": 0.22204813361167908, + "learning_rate": 6.70299727520436e-05, + "loss": 1.0561, + "step": 1714 + }, + { + "epoch": 0.666472359856213, + "grad_norm": 0.20182965695858002, + "learning_rate": 6.695212144803425e-05, + "loss": 1.0015, + "step": 1715 + }, + { + "epoch": 0.6668609734771204, + "grad_norm": 0.20719997584819794, + "learning_rate": 6.687427014402492e-05, + "loss": 1.0144, + "step": 1716 + }, + { + "epoch": 0.6672495870980278, + "grad_norm": 0.1944626122713089, + "learning_rate": 6.679641884001557e-05, + "loss": 1.0083, + "step": 1717 + }, + { + "epoch": 0.6676382007189352, + "grad_norm": 0.2072264701128006, + "learning_rate": 6.671856753600622e-05, + "loss": 1.0246, + "step": 1718 + }, + { + "epoch": 0.6680268143398426, + "grad_norm": 0.2134973257780075, + "learning_rate": 6.664071623199689e-05, + "loss": 1.0926, + "step": 1719 + }, + { + "epoch": 0.66841542796075, + "grad_norm": 0.2119186669588089, + "learning_rate": 
6.656286492798754e-05, + "loss": 1.0129, + "step": 1720 + }, + { + "epoch": 0.6688040415816574, + "grad_norm": 0.21205540001392365, + "learning_rate": 6.64850136239782e-05, + "loss": 1.0611, + "step": 1721 + }, + { + "epoch": 0.6691926552025649, + "grad_norm": 0.21632088720798492, + "learning_rate": 6.640716231996886e-05, + "loss": 1.0821, + "step": 1722 + }, + { + "epoch": 0.6695812688234722, + "grad_norm": 0.21734434366226196, + "learning_rate": 6.632931101595952e-05, + "loss": 1.0821, + "step": 1723 + }, + { + "epoch": 0.6699698824443797, + "grad_norm": 0.2030603289604187, + "learning_rate": 6.625145971195017e-05, + "loss": 0.9976, + "step": 1724 + }, + { + "epoch": 0.6703584960652871, + "grad_norm": 0.19921456277370453, + "learning_rate": 6.617360840794084e-05, + "loss": 0.9187, + "step": 1725 + }, + { + "epoch": 0.6707471096861946, + "grad_norm": 0.20548826456069946, + "learning_rate": 6.60957571039315e-05, + "loss": 1.0486, + "step": 1726 + }, + { + "epoch": 0.6711357233071019, + "grad_norm": 0.21784676611423492, + "learning_rate": 6.601790579992216e-05, + "loss": 1.1089, + "step": 1727 + }, + { + "epoch": 0.6715243369280093, + "grad_norm": 0.2137753963470459, + "learning_rate": 6.594005449591281e-05, + "loss": 1.0075, + "step": 1728 + }, + { + "epoch": 0.6719129505489168, + "grad_norm": 0.20200639963150024, + "learning_rate": 6.586220319190348e-05, + "loss": 0.9915, + "step": 1729 + }, + { + "epoch": 0.6723015641698241, + "grad_norm": 0.20898796617984772, + "learning_rate": 6.578435188789413e-05, + "loss": 1.0292, + "step": 1730 + }, + { + "epoch": 0.6726901777907316, + "grad_norm": 0.22515977919101715, + "learning_rate": 6.570650058388478e-05, + "loss": 1.0118, + "step": 1731 + }, + { + "epoch": 0.673078791411639, + "grad_norm": 0.2132793813943863, + "learning_rate": 6.562864927987545e-05, + "loss": 1.1097, + "step": 1732 + }, + { + "epoch": 0.6734674050325464, + "grad_norm": 0.20358797907829285, + "learning_rate": 6.55507979758661e-05, + "loss": 1.0241, + 
"step": 1733 + }, + { + "epoch": 0.6738560186534538, + "grad_norm": 0.21155016124248505, + "learning_rate": 6.547294667185676e-05, + "loss": 1.0235, + "step": 1734 + }, + { + "epoch": 0.6742446322743613, + "grad_norm": 0.198009192943573, + "learning_rate": 6.539509536784741e-05, + "loss": 0.9542, + "step": 1735 + }, + { + "epoch": 0.6746332458952686, + "grad_norm": 0.20318005979061127, + "learning_rate": 6.531724406383807e-05, + "loss": 0.9993, + "step": 1736 + }, + { + "epoch": 0.675021859516176, + "grad_norm": 0.21384860575199127, + "learning_rate": 6.523939275982873e-05, + "loss": 1.1188, + "step": 1737 + }, + { + "epoch": 0.6754104731370835, + "grad_norm": 0.18736955523490906, + "learning_rate": 6.516154145581938e-05, + "loss": 0.9832, + "step": 1738 + }, + { + "epoch": 0.6757990867579908, + "grad_norm": 0.2002391368150711, + "learning_rate": 6.508369015181005e-05, + "loss": 1.0288, + "step": 1739 + }, + { + "epoch": 0.6761877003788983, + "grad_norm": 0.20011006295681, + "learning_rate": 6.50058388478007e-05, + "loss": 0.9588, + "step": 1740 + }, + { + "epoch": 0.6765763139998057, + "grad_norm": 0.20782291889190674, + "learning_rate": 6.492798754379135e-05, + "loss": 1.0033, + "step": 1741 + }, + { + "epoch": 0.6769649276207131, + "grad_norm": 0.2056814581155777, + "learning_rate": 6.485013623978202e-05, + "loss": 1.0648, + "step": 1742 + }, + { + "epoch": 0.6773535412416205, + "grad_norm": 0.2207457572221756, + "learning_rate": 6.477228493577267e-05, + "loss": 1.0758, + "step": 1743 + }, + { + "epoch": 0.677742154862528, + "grad_norm": 0.20437198877334595, + "learning_rate": 6.469443363176334e-05, + "loss": 1.0253, + "step": 1744 + }, + { + "epoch": 0.6781307684834353, + "grad_norm": 0.198721781373024, + "learning_rate": 6.461658232775399e-05, + "loss": 1.0087, + "step": 1745 + }, + { + "epoch": 0.6785193821043427, + "grad_norm": 0.22781015932559967, + "learning_rate": 6.453873102374464e-05, + "loss": 1.0692, + "step": 1746 + }, + { + "epoch": 
0.6789079957252502, + "grad_norm": 0.21826857328414917, + "learning_rate": 6.446087971973531e-05, + "loss": 1.0232, + "step": 1747 + }, + { + "epoch": 0.6792966093461575, + "grad_norm": 0.2156928926706314, + "learning_rate": 6.438302841572596e-05, + "loss": 1.0686, + "step": 1748 + }, + { + "epoch": 0.679685222967065, + "grad_norm": 0.2161693125963211, + "learning_rate": 6.430517711171662e-05, + "loss": 1.0298, + "step": 1749 + }, + { + "epoch": 0.6800738365879724, + "grad_norm": 0.19139425456523895, + "learning_rate": 6.422732580770729e-05, + "loss": 0.9545, + "step": 1750 + }, + { + "epoch": 0.6804624502088799, + "grad_norm": 0.22626161575317383, + "learning_rate": 6.414947450369794e-05, + "loss": 1.0669, + "step": 1751 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.2135801464319229, + "learning_rate": 6.407162319968861e-05, + "loss": 1.0187, + "step": 1752 + }, + { + "epoch": 0.6812396774506947, + "grad_norm": 0.20803681015968323, + "learning_rate": 6.399377189567926e-05, + "loss": 1.0856, + "step": 1753 + }, + { + "epoch": 0.681628291071602, + "grad_norm": 0.21317154169082642, + "learning_rate": 6.391592059166991e-05, + "loss": 1.1018, + "step": 1754 + }, + { + "epoch": 0.6820169046925094, + "grad_norm": 0.20877891778945923, + "learning_rate": 6.383806928766058e-05, + "loss": 1.0383, + "step": 1755 + }, + { + "epoch": 0.6824055183134169, + "grad_norm": 0.20769146084785461, + "learning_rate": 6.376021798365123e-05, + "loss": 1.0852, + "step": 1756 + }, + { + "epoch": 0.6827941319343243, + "grad_norm": 0.2252657413482666, + "learning_rate": 6.36823666796419e-05, + "loss": 1.0749, + "step": 1757 + }, + { + "epoch": 0.6831827455552317, + "grad_norm": 0.24453257024288177, + "learning_rate": 6.360451537563255e-05, + "loss": 1.1042, + "step": 1758 + }, + { + "epoch": 0.6835713591761391, + "grad_norm": 0.2082965075969696, + "learning_rate": 6.35266640716232e-05, + "loss": 1.0729, + "step": 1759 + }, + { + "epoch": 0.6839599727970466, + "grad_norm": 
0.20121856033802032, + "learning_rate": 6.344881276761387e-05, + "loss": 1.038, + "step": 1760 + }, + { + "epoch": 0.6843485864179539, + "grad_norm": 0.20096386969089508, + "learning_rate": 6.337096146360452e-05, + "loss": 0.9655, + "step": 1761 + }, + { + "epoch": 0.6847372000388614, + "grad_norm": 0.20015959441661835, + "learning_rate": 6.329311015959518e-05, + "loss": 1.0187, + "step": 1762 + }, + { + "epoch": 0.6851258136597688, + "grad_norm": 0.21056395769119263, + "learning_rate": 6.321525885558583e-05, + "loss": 1.0567, + "step": 1763 + }, + { + "epoch": 0.6855144272806762, + "grad_norm": 0.2211030125617981, + "learning_rate": 6.313740755157649e-05, + "loss": 1.0588, + "step": 1764 + }, + { + "epoch": 0.6859030409015836, + "grad_norm": 0.20809797942638397, + "learning_rate": 6.305955624756715e-05, + "loss": 0.9488, + "step": 1765 + }, + { + "epoch": 0.686291654522491, + "grad_norm": 0.2331530600786209, + "learning_rate": 6.29817049435578e-05, + "loss": 1.0789, + "step": 1766 + }, + { + "epoch": 0.6866802681433984, + "grad_norm": 0.21708674728870392, + "learning_rate": 6.290385363954846e-05, + "loss": 1.0518, + "step": 1767 + }, + { + "epoch": 0.6870688817643058, + "grad_norm": 0.2088184356689453, + "learning_rate": 6.282600233553912e-05, + "loss": 1.0178, + "step": 1768 + }, + { + "epoch": 0.6874574953852133, + "grad_norm": 0.20285943150520325, + "learning_rate": 6.274815103152977e-05, + "loss": 1.018, + "step": 1769 + }, + { + "epoch": 0.6878461090061206, + "grad_norm": 0.211436927318573, + "learning_rate": 6.267029972752044e-05, + "loss": 1.0572, + "step": 1770 + }, + { + "epoch": 0.6882347226270281, + "grad_norm": 0.21108384430408478, + "learning_rate": 6.259244842351109e-05, + "loss": 1.0227, + "step": 1771 + }, + { + "epoch": 0.6886233362479355, + "grad_norm": 0.2060437649488449, + "learning_rate": 6.251459711950174e-05, + "loss": 1.0251, + "step": 1772 + }, + { + "epoch": 0.689011949868843, + "grad_norm": 0.20819245278835297, + "learning_rate": 
6.243674581549241e-05, + "loss": 1.0643, + "step": 1773 + }, + { + "epoch": 0.6894005634897503, + "grad_norm": 0.2172113060951233, + "learning_rate": 6.235889451148306e-05, + "loss": 1.0869, + "step": 1774 + }, + { + "epoch": 0.6897891771106577, + "grad_norm": 0.2087356299161911, + "learning_rate": 6.228104320747373e-05, + "loss": 1.0622, + "step": 1775 + }, + { + "epoch": 0.6901777907315652, + "grad_norm": 0.1958473175764084, + "learning_rate": 6.220319190346439e-05, + "loss": 0.9542, + "step": 1776 + }, + { + "epoch": 0.6905664043524725, + "grad_norm": 0.23630915582180023, + "learning_rate": 6.212534059945504e-05, + "loss": 1.0535, + "step": 1777 + }, + { + "epoch": 0.69095501797338, + "grad_norm": 0.2127649188041687, + "learning_rate": 6.204748929544571e-05, + "loss": 0.972, + "step": 1778 + }, + { + "epoch": 0.6913436315942874, + "grad_norm": 0.19873055815696716, + "learning_rate": 6.196963799143636e-05, + "loss": 0.9969, + "step": 1779 + }, + { + "epoch": 0.6917322452151948, + "grad_norm": 0.2013067901134491, + "learning_rate": 6.189178668742703e-05, + "loss": 1.0399, + "step": 1780 + }, + { + "epoch": 0.6921208588361022, + "grad_norm": 0.21300987899303436, + "learning_rate": 6.181393538341768e-05, + "loss": 1.0377, + "step": 1781 + }, + { + "epoch": 0.6925094724570097, + "grad_norm": 0.21665994822978973, + "learning_rate": 6.173608407940833e-05, + "loss": 1.008, + "step": 1782 + }, + { + "epoch": 0.692898086077917, + "grad_norm": 0.21622590720653534, + "learning_rate": 6.1658232775399e-05, + "loss": 1.1128, + "step": 1783 + }, + { + "epoch": 0.6932866996988244, + "grad_norm": 0.2000272423028946, + "learning_rate": 6.158038147138965e-05, + "loss": 1.0115, + "step": 1784 + }, + { + "epoch": 0.6936753133197319, + "grad_norm": 0.20774856209754944, + "learning_rate": 6.15025301673803e-05, + "loss": 1.066, + "step": 1785 + }, + { + "epoch": 0.6940639269406392, + "grad_norm": 0.18497461080551147, + "learning_rate": 6.142467886337097e-05, + "loss": 0.9608, + "step": 
1786 + }, + { + "epoch": 0.6944525405615467, + "grad_norm": 0.19819007813930511, + "learning_rate": 6.134682755936162e-05, + "loss": 1.0114, + "step": 1787 + }, + { + "epoch": 0.6948411541824541, + "grad_norm": 0.22013314068317413, + "learning_rate": 6.126897625535229e-05, + "loss": 0.976, + "step": 1788 + }, + { + "epoch": 0.6952297678033615, + "grad_norm": 0.2066160887479782, + "learning_rate": 6.119112495134294e-05, + "loss": 1.0585, + "step": 1789 + }, + { + "epoch": 0.6956183814242689, + "grad_norm": 0.21364475786685944, + "learning_rate": 6.111327364733359e-05, + "loss": 1.0842, + "step": 1790 + }, + { + "epoch": 0.6960069950451764, + "grad_norm": 0.19731444120407104, + "learning_rate": 6.103542234332425e-05, + "loss": 0.9936, + "step": 1791 + }, + { + "epoch": 0.6963956086660837, + "grad_norm": 0.2162671983242035, + "learning_rate": 6.095757103931491e-05, + "loss": 1.0446, + "step": 1792 + }, + { + "epoch": 0.6967842222869911, + "grad_norm": 0.21486608684062958, + "learning_rate": 6.087971973530557e-05, + "loss": 1.0441, + "step": 1793 + }, + { + "epoch": 0.6971728359078986, + "grad_norm": 0.20850563049316406, + "learning_rate": 6.0801868431296224e-05, + "loss": 1.0431, + "step": 1794 + }, + { + "epoch": 0.6975614495288059, + "grad_norm": 0.20492027699947357, + "learning_rate": 6.072401712728688e-05, + "loss": 0.9845, + "step": 1795 + }, + { + "epoch": 0.6979500631497134, + "grad_norm": 0.1986648142337799, + "learning_rate": 6.064616582327754e-05, + "loss": 0.9855, + "step": 1796 + }, + { + "epoch": 0.6983386767706208, + "grad_norm": 0.20606310665607452, + "learning_rate": 6.05683145192682e-05, + "loss": 1.0608, + "step": 1797 + }, + { + "epoch": 0.6987272903915283, + "grad_norm": 0.20496073365211487, + "learning_rate": 6.0490463215258867e-05, + "loss": 1.0311, + "step": 1798 + }, + { + "epoch": 0.6991159040124356, + "grad_norm": 0.2153409719467163, + "learning_rate": 6.041261191124952e-05, + "loss": 1.0394, + "step": 1799 + }, + { + "epoch": 
0.6995045176333431, + "grad_norm": 0.21410655975341797, + "learning_rate": 6.033476060724017e-05, + "loss": 1.0229, + "step": 1800 + }, + { + "epoch": 0.6998931312542505, + "grad_norm": 0.20418782532215118, + "learning_rate": 6.0256909303230836e-05, + "loss": 1.0382, + "step": 1801 + }, + { + "epoch": 0.7002817448751578, + "grad_norm": 0.19154146313667297, + "learning_rate": 6.017905799922149e-05, + "loss": 0.9891, + "step": 1802 + }, + { + "epoch": 0.7006703584960653, + "grad_norm": 0.19138328731060028, + "learning_rate": 6.010120669521214e-05, + "loss": 0.9638, + "step": 1803 + }, + { + "epoch": 0.7010589721169727, + "grad_norm": 0.19704872369766235, + "learning_rate": 6.0023355391202806e-05, + "loss": 0.9835, + "step": 1804 + }, + { + "epoch": 0.7014475857378801, + "grad_norm": 0.2175600379705429, + "learning_rate": 5.994550408719346e-05, + "loss": 1.1192, + "step": 1805 + }, + { + "epoch": 0.7018361993587875, + "grad_norm": 0.21614274382591248, + "learning_rate": 5.9867652783184124e-05, + "loss": 1.0877, + "step": 1806 + }, + { + "epoch": 0.702224812979695, + "grad_norm": 0.20461414754390717, + "learning_rate": 5.9789801479174776e-05, + "loss": 0.9706, + "step": 1807 + }, + { + "epoch": 0.7026134266006023, + "grad_norm": 0.1989748477935791, + "learning_rate": 5.9711950175165434e-05, + "loss": 1.0004, + "step": 1808 + }, + { + "epoch": 0.7030020402215098, + "grad_norm": 0.21304792165756226, + "learning_rate": 5.963409887115609e-05, + "loss": 1.0177, + "step": 1809 + }, + { + "epoch": 0.7033906538424172, + "grad_norm": 0.19023855030536652, + "learning_rate": 5.955624756714675e-05, + "loss": 0.9759, + "step": 1810 + }, + { + "epoch": 0.7037792674633246, + "grad_norm": 0.21915188431739807, + "learning_rate": 5.947839626313742e-05, + "loss": 1.0621, + "step": 1811 + }, + { + "epoch": 0.704167881084232, + "grad_norm": 0.21626822650432587, + "learning_rate": 5.940054495912807e-05, + "loss": 1.0144, + "step": 1812 + }, + { + "epoch": 0.7045564947051394, + "grad_norm": 
0.20742040872573853, + "learning_rate": 5.932269365511872e-05, + "loss": 0.9778, + "step": 1813 + }, + { + "epoch": 0.7049451083260468, + "grad_norm": 0.2172158658504486, + "learning_rate": 5.924484235110939e-05, + "loss": 1.0416, + "step": 1814 + }, + { + "epoch": 0.7053337219469542, + "grad_norm": 0.209465891122818, + "learning_rate": 5.916699104710004e-05, + "loss": 1.0378, + "step": 1815 + }, + { + "epoch": 0.7057223355678617, + "grad_norm": 0.2097882628440857, + "learning_rate": 5.9089139743090705e-05, + "loss": 1.0166, + "step": 1816 + }, + { + "epoch": 0.706110949188769, + "grad_norm": 0.2251904308795929, + "learning_rate": 5.901128843908136e-05, + "loss": 1.0783, + "step": 1817 + }, + { + "epoch": 0.7064995628096765, + "grad_norm": 0.1952916979789734, + "learning_rate": 5.893343713507201e-05, + "loss": 0.993, + "step": 1818 + }, + { + "epoch": 0.7068881764305839, + "grad_norm": 0.20997455716133118, + "learning_rate": 5.8855585831062675e-05, + "loss": 1.0448, + "step": 1819 + }, + { + "epoch": 0.7072767900514914, + "grad_norm": 0.20070020854473114, + "learning_rate": 5.877773452705333e-05, + "loss": 0.9603, + "step": 1820 + }, + { + "epoch": 0.7076654036723987, + "grad_norm": 0.25765034556388855, + "learning_rate": 5.869988322304399e-05, + "loss": 1.0361, + "step": 1821 + }, + { + "epoch": 0.7080540172933061, + "grad_norm": 0.21948982775211334, + "learning_rate": 5.862203191903465e-05, + "loss": 1.0668, + "step": 1822 + }, + { + "epoch": 0.7084426309142136, + "grad_norm": 0.1867108792066574, + "learning_rate": 5.85441806150253e-05, + "loss": 0.9372, + "step": 1823 + }, + { + "epoch": 0.7088312445351209, + "grad_norm": 0.2037520408630371, + "learning_rate": 5.846632931101597e-05, + "loss": 0.9905, + "step": 1824 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 0.21352072060108185, + "learning_rate": 5.838847800700662e-05, + "loss": 1.0514, + "step": 1825 + }, + { + "epoch": 0.7096084717769358, + "grad_norm": 0.1949845850467682, + "learning_rate": 
5.831062670299727e-05, + "loss": 0.9636, + "step": 1826 + }, + { + "epoch": 0.7099970853978432, + "grad_norm": 0.2092294692993164, + "learning_rate": 5.823277539898794e-05, + "loss": 1.0361, + "step": 1827 + }, + { + "epoch": 0.7103856990187506, + "grad_norm": 0.20054267346858978, + "learning_rate": 5.815492409497859e-05, + "loss": 1.0195, + "step": 1828 + }, + { + "epoch": 0.7107743126396581, + "grad_norm": 0.2202107012271881, + "learning_rate": 5.8077072790969256e-05, + "loss": 1.0918, + "step": 1829 + }, + { + "epoch": 0.7111629262605654, + "grad_norm": 0.2001042366027832, + "learning_rate": 5.799922148695991e-05, + "loss": 1.0142, + "step": 1830 + }, + { + "epoch": 0.7115515398814728, + "grad_norm": 0.2102631777524948, + "learning_rate": 5.792137018295056e-05, + "loss": 1.0231, + "step": 1831 + }, + { + "epoch": 0.7119401535023803, + "grad_norm": 0.21717461943626404, + "learning_rate": 5.7843518878941226e-05, + "loss": 1.0295, + "step": 1832 + }, + { + "epoch": 0.7123287671232876, + "grad_norm": 0.2001933753490448, + "learning_rate": 5.776566757493188e-05, + "loss": 1.022, + "step": 1833 + }, + { + "epoch": 0.7127173807441951, + "grad_norm": 0.2218201756477356, + "learning_rate": 5.7687816270922544e-05, + "loss": 1.0762, + "step": 1834 + }, + { + "epoch": 0.7131059943651025, + "grad_norm": 0.20680001378059387, + "learning_rate": 5.76099649669132e-05, + "loss": 1.0017, + "step": 1835 + }, + { + "epoch": 0.7134946079860099, + "grad_norm": 0.21511508524417877, + "learning_rate": 5.7532113662903854e-05, + "loss": 1.048, + "step": 1836 + }, + { + "epoch": 0.7138832216069173, + "grad_norm": 0.19720061123371124, + "learning_rate": 5.745426235889452e-05, + "loss": 0.9983, + "step": 1837 + }, + { + "epoch": 0.7142718352278248, + "grad_norm": 0.2005409449338913, + "learning_rate": 5.737641105488517e-05, + "loss": 0.9941, + "step": 1838 + }, + { + "epoch": 0.7146604488487321, + "grad_norm": 0.2222924679517746, + "learning_rate": 5.729855975087584e-05, + "loss": 1.0476, + 
"step": 1839 + }, + { + "epoch": 0.7150490624696395, + "grad_norm": 0.21131208539009094, + "learning_rate": 5.722070844686649e-05, + "loss": 1.0208, + "step": 1840 + }, + { + "epoch": 0.715437676090547, + "grad_norm": 0.2307305932044983, + "learning_rate": 5.714285714285714e-05, + "loss": 0.9867, + "step": 1841 + }, + { + "epoch": 0.7158262897114543, + "grad_norm": 0.1974973827600479, + "learning_rate": 5.706500583884781e-05, + "loss": 1.0285, + "step": 1842 + }, + { + "epoch": 0.7162149033323618, + "grad_norm": 0.2006559520959854, + "learning_rate": 5.698715453483846e-05, + "loss": 1.024, + "step": 1843 + }, + { + "epoch": 0.7166035169532692, + "grad_norm": 0.21160584688186646, + "learning_rate": 5.690930323082911e-05, + "loss": 1.0256, + "step": 1844 + }, + { + "epoch": 0.7169921305741767, + "grad_norm": 0.28184664249420166, + "learning_rate": 5.683145192681978e-05, + "loss": 1.0443, + "step": 1845 + }, + { + "epoch": 0.717380744195084, + "grad_norm": 0.2206653356552124, + "learning_rate": 5.675360062281043e-05, + "loss": 1.0458, + "step": 1846 + }, + { + "epoch": 0.7177693578159915, + "grad_norm": 0.21346066892147064, + "learning_rate": 5.6675749318801095e-05, + "loss": 1.0106, + "step": 1847 + }, + { + "epoch": 0.7181579714368989, + "grad_norm": 0.20931747555732727, + "learning_rate": 5.6597898014791753e-05, + "loss": 0.9831, + "step": 1848 + }, + { + "epoch": 0.7185465850578063, + "grad_norm": 0.2026771456003189, + "learning_rate": 5.6520046710782406e-05, + "loss": 1.0162, + "step": 1849 + }, + { + "epoch": 0.7189351986787137, + "grad_norm": 0.21388716995716095, + "learning_rate": 5.644219540677307e-05, + "loss": 1.0867, + "step": 1850 + }, + { + "epoch": 0.7193238122996211, + "grad_norm": 0.2039308398962021, + "learning_rate": 5.636434410276372e-05, + "loss": 1.0325, + "step": 1851 + }, + { + "epoch": 0.7197124259205285, + "grad_norm": 0.21741114556789398, + "learning_rate": 5.628649279875439e-05, + "loss": 1.0251, + "step": 1852 + }, + { + "epoch": 
0.7201010395414359, + "grad_norm": 0.21343208849430084, + "learning_rate": 5.620864149474504e-05, + "loss": 1.0766, + "step": 1853 + }, + { + "epoch": 0.7204896531623434, + "grad_norm": 0.21712560951709747, + "learning_rate": 5.613079019073569e-05, + "loss": 1.0643, + "step": 1854 + }, + { + "epoch": 0.7208782667832507, + "grad_norm": 0.2176978886127472, + "learning_rate": 5.605293888672636e-05, + "loss": 1.0375, + "step": 1855 + }, + { + "epoch": 0.7212668804041582, + "grad_norm": 0.2065533846616745, + "learning_rate": 5.597508758271701e-05, + "loss": 1.0385, + "step": 1856 + }, + { + "epoch": 0.7216554940250656, + "grad_norm": 0.2169170081615448, + "learning_rate": 5.5897236278707676e-05, + "loss": 1.0197, + "step": 1857 + }, + { + "epoch": 0.722044107645973, + "grad_norm": 0.2047201544046402, + "learning_rate": 5.581938497469833e-05, + "loss": 0.9794, + "step": 1858 + }, + { + "epoch": 0.7224327212668804, + "grad_norm": 0.20898981392383575, + "learning_rate": 5.574153367068898e-05, + "loss": 1.032, + "step": 1859 + }, + { + "epoch": 0.7228213348877878, + "grad_norm": 0.2090533971786499, + "learning_rate": 5.5663682366679646e-05, + "loss": 1.0694, + "step": 1860 + }, + { + "epoch": 0.7232099485086952, + "grad_norm": 0.21963149309158325, + "learning_rate": 5.5585831062670305e-05, + "loss": 1.0367, + "step": 1861 + }, + { + "epoch": 0.7235985621296026, + "grad_norm": 0.1974373459815979, + "learning_rate": 5.550797975866096e-05, + "loss": 1.0402, + "step": 1862 + }, + { + "epoch": 0.7239871757505101, + "grad_norm": 0.1924194097518921, + "learning_rate": 5.543012845465162e-05, + "loss": 0.9647, + "step": 1863 + }, + { + "epoch": 0.7243757893714174, + "grad_norm": 0.21366077661514282, + "learning_rate": 5.5352277150642274e-05, + "loss": 1.0139, + "step": 1864 + }, + { + "epoch": 0.7247644029923249, + "grad_norm": 0.21722929179668427, + "learning_rate": 5.527442584663294e-05, + "loss": 1.0366, + "step": 1865 + }, + { + "epoch": 0.7251530166132323, + "grad_norm": 
0.20646587014198303, + "learning_rate": 5.519657454262359e-05, + "loss": 1.0465, + "step": 1866 + }, + { + "epoch": 0.7255416302341398, + "grad_norm": 0.19144394993782043, + "learning_rate": 5.5118723238614244e-05, + "loss": 0.9645, + "step": 1867 + }, + { + "epoch": 0.7259302438550471, + "grad_norm": 0.19553838670253754, + "learning_rate": 5.504087193460491e-05, + "loss": 0.98, + "step": 1868 + }, + { + "epoch": 0.7263188574759545, + "grad_norm": 0.21739792823791504, + "learning_rate": 5.496302063059556e-05, + "loss": 1.002, + "step": 1869 + }, + { + "epoch": 0.726707471096862, + "grad_norm": 0.1910562962293625, + "learning_rate": 5.488516932658623e-05, + "loss": 0.985, + "step": 1870 + }, + { + "epoch": 0.7270960847177693, + "grad_norm": 0.2133384346961975, + "learning_rate": 5.480731802257688e-05, + "loss": 1.0325, + "step": 1871 + }, + { + "epoch": 0.7274846983386768, + "grad_norm": 0.21884119510650635, + "learning_rate": 5.472946671856753e-05, + "loss": 1.0412, + "step": 1872 + }, + { + "epoch": 0.7278733119595842, + "grad_norm": 0.21069306135177612, + "learning_rate": 5.46516154145582e-05, + "loss": 1.0474, + "step": 1873 + }, + { + "epoch": 0.7282619255804916, + "grad_norm": 0.19266243278980255, + "learning_rate": 5.4573764110548856e-05, + "loss": 0.9941, + "step": 1874 + }, + { + "epoch": 0.728650539201399, + "grad_norm": 0.21255099773406982, + "learning_rate": 5.4495912806539515e-05, + "loss": 1.0211, + "step": 1875 + }, + { + "epoch": 0.7290391528223065, + "grad_norm": 0.1924402117729187, + "learning_rate": 5.4418061502530173e-05, + "loss": 1.0117, + "step": 1876 + }, + { + "epoch": 0.7294277664432138, + "grad_norm": 0.2019895315170288, + "learning_rate": 5.4340210198520825e-05, + "loss": 0.9921, + "step": 1877 + }, + { + "epoch": 0.7298163800641212, + "grad_norm": 0.20398026704788208, + "learning_rate": 5.426235889451149e-05, + "loss": 1.0423, + "step": 1878 + }, + { + "epoch": 0.7302049936850287, + "grad_norm": 0.20153217017650604, + "learning_rate": 
5.418450759050214e-05, + "loss": 1.0333, + "step": 1879 + }, + { + "epoch": 0.730593607305936, + "grad_norm": 0.21259640157222748, + "learning_rate": 5.4106656286492795e-05, + "loss": 1.0689, + "step": 1880 + }, + { + "epoch": 0.7309822209268435, + "grad_norm": 0.2037276029586792, + "learning_rate": 5.402880498248346e-05, + "loss": 1.0203, + "step": 1881 + }, + { + "epoch": 0.7313708345477509, + "grad_norm": 0.19976729154586792, + "learning_rate": 5.395095367847411e-05, + "loss": 1.0173, + "step": 1882 + }, + { + "epoch": 0.7317594481686583, + "grad_norm": 0.20481806993484497, + "learning_rate": 5.387310237446478e-05, + "loss": 0.9864, + "step": 1883 + }, + { + "epoch": 0.7321480617895657, + "grad_norm": 0.21900932490825653, + "learning_rate": 5.379525107045543e-05, + "loss": 1.0519, + "step": 1884 + }, + { + "epoch": 0.7325366754104732, + "grad_norm": 0.200319305062294, + "learning_rate": 5.371739976644609e-05, + "loss": 1.0834, + "step": 1885 + }, + { + "epoch": 0.7329252890313805, + "grad_norm": 0.19662296772003174, + "learning_rate": 5.363954846243675e-05, + "loss": 0.9794, + "step": 1886 + }, + { + "epoch": 0.7333139026522879, + "grad_norm": 0.2113952785730362, + "learning_rate": 5.356169715842741e-05, + "loss": 1.0763, + "step": 1887 + }, + { + "epoch": 0.7337025162731954, + "grad_norm": 0.21348755061626434, + "learning_rate": 5.3483845854418066e-05, + "loss": 1.0781, + "step": 1888 + }, + { + "epoch": 0.7340911298941027, + "grad_norm": 0.20673702657222748, + "learning_rate": 5.3405994550408725e-05, + "loss": 1.0513, + "step": 1889 + }, + { + "epoch": 0.7344797435150102, + "grad_norm": 0.210855171084404, + "learning_rate": 5.332814324639938e-05, + "loss": 0.9972, + "step": 1890 + }, + { + "epoch": 0.7348683571359176, + "grad_norm": 0.2136204093694687, + "learning_rate": 5.325029194239004e-05, + "loss": 1.03, + "step": 1891 + }, + { + "epoch": 0.7352569707568251, + "grad_norm": 0.20035260915756226, + "learning_rate": 5.3172440638380694e-05, + "loss": 0.9739, + 
"step": 1892 + }, + { + "epoch": 0.7356455843777324, + "grad_norm": 0.1943352371454239, + "learning_rate": 5.309458933437136e-05, + "loss": 0.9411, + "step": 1893 + }, + { + "epoch": 0.7360341979986399, + "grad_norm": 0.3994326889514923, + "learning_rate": 5.301673803036201e-05, + "loss": 1.0714, + "step": 1894 + }, + { + "epoch": 0.7364228116195473, + "grad_norm": 0.21691356599330902, + "learning_rate": 5.2938886726352664e-05, + "loss": 1.0648, + "step": 1895 + }, + { + "epoch": 0.7368114252404547, + "grad_norm": 0.19853095710277557, + "learning_rate": 5.286103542234333e-05, + "loss": 0.983, + "step": 1896 + }, + { + "epoch": 0.7372000388613621, + "grad_norm": 0.21836897730827332, + "learning_rate": 5.278318411833398e-05, + "loss": 1.0396, + "step": 1897 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 0.19596605002880096, + "learning_rate": 5.270533281432464e-05, + "loss": 0.9593, + "step": 1898 + }, + { + "epoch": 0.7379772661031769, + "grad_norm": 0.2141752541065216, + "learning_rate": 5.26274815103153e-05, + "loss": 1.0373, + "step": 1899 + }, + { + "epoch": 0.7383658797240843, + "grad_norm": 0.20552939176559448, + "learning_rate": 5.254963020630596e-05, + "loss": 1.0352, + "step": 1900 + }, + { + "epoch": 0.7387544933449918, + "grad_norm": 0.2095794975757599, + "learning_rate": 5.247177890229662e-05, + "loss": 1.0632, + "step": 1901 + }, + { + "epoch": 0.7391431069658991, + "grad_norm": 0.19894710183143616, + "learning_rate": 5.2393927598287276e-05, + "loss": 0.9886, + "step": 1902 + }, + { + "epoch": 0.7395317205868066, + "grad_norm": 0.22996319830417633, + "learning_rate": 5.231607629427793e-05, + "loss": 1.0826, + "step": 1903 + }, + { + "epoch": 0.739920334207714, + "grad_norm": 0.21416957676410675, + "learning_rate": 5.2238224990268593e-05, + "loss": 1.0161, + "step": 1904 + }, + { + "epoch": 0.7403089478286214, + "grad_norm": 0.21819345653057098, + "learning_rate": 5.2160373686259245e-05, + "loss": 1.0458, + "step": 1905 + }, + { + "epoch": 
0.7406975614495288, + "grad_norm": 0.21327044069766998, + "learning_rate": 5.208252238224991e-05, + "loss": 1.0721, + "step": 1906 + }, + { + "epoch": 0.7410861750704362, + "grad_norm": 0.21436645090579987, + "learning_rate": 5.200467107824056e-05, + "loss": 1.0743, + "step": 1907 + }, + { + "epoch": 0.7414747886913436, + "grad_norm": 0.215640127658844, + "learning_rate": 5.1926819774231215e-05, + "loss": 1.0274, + "step": 1908 + }, + { + "epoch": 0.741863402312251, + "grad_norm": 0.2043589949607849, + "learning_rate": 5.184896847022188e-05, + "loss": 1.0618, + "step": 1909 + }, + { + "epoch": 0.7422520159331585, + "grad_norm": 0.2014230340719223, + "learning_rate": 5.177111716621253e-05, + "loss": 0.9892, + "step": 1910 + }, + { + "epoch": 0.7426406295540658, + "grad_norm": 0.19954468309879303, + "learning_rate": 5.16932658622032e-05, + "loss": 0.9815, + "step": 1911 + }, + { + "epoch": 0.7430292431749733, + "grad_norm": 0.23119708895683289, + "learning_rate": 5.161541455819385e-05, + "loss": 1.0783, + "step": 1912 + }, + { + "epoch": 0.7434178567958807, + "grad_norm": 0.20650482177734375, + "learning_rate": 5.153756325418451e-05, + "loss": 1.0162, + "step": 1913 + }, + { + "epoch": 0.7438064704167882, + "grad_norm": 0.20021970570087433, + "learning_rate": 5.145971195017517e-05, + "loss": 1.0062, + "step": 1914 + }, + { + "epoch": 0.7441950840376955, + "grad_norm": 0.23300811648368835, + "learning_rate": 5.138186064616583e-05, + "loss": 1.0049, + "step": 1915 + }, + { + "epoch": 0.7445836976586029, + "grad_norm": 0.23268327116966248, + "learning_rate": 5.130400934215648e-05, + "loss": 1.0138, + "step": 1916 + }, + { + "epoch": 0.7449723112795104, + "grad_norm": 0.20413407683372498, + "learning_rate": 5.1226158038147145e-05, + "loss": 0.9903, + "step": 1917 + }, + { + "epoch": 0.7453609249004177, + "grad_norm": 0.20714978873729706, + "learning_rate": 5.1148306734137797e-05, + "loss": 1.0374, + "step": 1918 + }, + { + "epoch": 0.7457495385213252, + "grad_norm": 
0.2000850886106491, + "learning_rate": 5.107045543012846e-05, + "loss": 0.9885, + "step": 1919 + }, + { + "epoch": 0.7461381521422326, + "grad_norm": 0.2054719179868698, + "learning_rate": 5.0992604126119114e-05, + "loss": 1.0551, + "step": 1920 + }, + { + "epoch": 0.74652676576314, + "grad_norm": 0.2351357489824295, + "learning_rate": 5.0914752822109766e-05, + "loss": 1.0693, + "step": 1921 + }, + { + "epoch": 0.7469153793840474, + "grad_norm": 0.22370338439941406, + "learning_rate": 5.083690151810043e-05, + "loss": 0.9781, + "step": 1922 + }, + { + "epoch": 0.7473039930049549, + "grad_norm": 0.18734332919120789, + "learning_rate": 5.0759050214091084e-05, + "loss": 0.9329, + "step": 1923 + }, + { + "epoch": 0.7476926066258622, + "grad_norm": 0.22099906206130981, + "learning_rate": 5.068119891008175e-05, + "loss": 1.0498, + "step": 1924 + }, + { + "epoch": 0.7480812202467696, + "grad_norm": 0.20144490897655487, + "learning_rate": 5.06033476060724e-05, + "loss": 0.9865, + "step": 1925 + }, + { + "epoch": 0.7484698338676771, + "grad_norm": 0.21770039200782776, + "learning_rate": 5.052549630206306e-05, + "loss": 1.0867, + "step": 1926 + }, + { + "epoch": 0.7488584474885844, + "grad_norm": 0.19649921357631683, + "learning_rate": 5.044764499805372e-05, + "loss": 0.9887, + "step": 1927 + }, + { + "epoch": 0.7492470611094919, + "grad_norm": 0.1940620392560959, + "learning_rate": 5.036979369404438e-05, + "loss": 1.0073, + "step": 1928 + }, + { + "epoch": 0.7496356747303993, + "grad_norm": 0.20987650752067566, + "learning_rate": 5.0291942390035044e-05, + "loss": 1.046, + "step": 1929 + }, + { + "epoch": 0.7500242883513067, + "grad_norm": 0.2116398960351944, + "learning_rate": 5.0214091086025696e-05, + "loss": 1.0423, + "step": 1930 + }, + { + "epoch": 0.7504129019722141, + "grad_norm": 0.18996965885162354, + "learning_rate": 5.013623978201635e-05, + "loss": 0.9822, + "step": 1931 + }, + { + "epoch": 0.7508015155931216, + "grad_norm": 0.20942547917366028, + "learning_rate": 
5.005838847800701e-05, + "loss": 1.0472, + "step": 1932 + }, + { + "epoch": 0.751190129214029, + "grad_norm": 0.19006839394569397, + "learning_rate": 4.9980537173997665e-05, + "loss": 0.993, + "step": 1933 + }, + { + "epoch": 0.7515787428349364, + "grad_norm": 0.21508941054344177, + "learning_rate": 4.9902685869988324e-05, + "loss": 1.0406, + "step": 1934 + }, + { + "epoch": 0.7519673564558438, + "grad_norm": 0.1989334225654602, + "learning_rate": 4.982483456597898e-05, + "loss": 0.9997, + "step": 1935 + }, + { + "epoch": 0.7523559700767511, + "grad_norm": 0.19993600249290466, + "learning_rate": 4.974698326196964e-05, + "loss": 1.0139, + "step": 1936 + }, + { + "epoch": 0.7527445836976586, + "grad_norm": 0.20927831530570984, + "learning_rate": 4.9669131957960294e-05, + "loss": 0.995, + "step": 1937 + }, + { + "epoch": 0.753133197318566, + "grad_norm": 0.20963850617408752, + "learning_rate": 4.959128065395095e-05, + "loss": 1.0678, + "step": 1938 + }, + { + "epoch": 0.7535218109394735, + "grad_norm": 0.19523034989833832, + "learning_rate": 4.951342934994161e-05, + "loss": 0.9883, + "step": 1939 + }, + { + "epoch": 0.7539104245603808, + "grad_norm": 0.21588142216205597, + "learning_rate": 4.943557804593227e-05, + "loss": 1.0398, + "step": 1940 + }, + { + "epoch": 0.7542990381812883, + "grad_norm": 0.19894704222679138, + "learning_rate": 4.935772674192293e-05, + "loss": 1.0125, + "step": 1941 + }, + { + "epoch": 0.7546876518021957, + "grad_norm": 0.2155168056488037, + "learning_rate": 4.927987543791359e-05, + "loss": 1.0447, + "step": 1942 + }, + { + "epoch": 0.7550762654231031, + "grad_norm": 0.212605819106102, + "learning_rate": 4.920202413390425e-05, + "loss": 1.077, + "step": 1943 + }, + { + "epoch": 0.7554648790440105, + "grad_norm": 0.2168148010969162, + "learning_rate": 4.9124172829894906e-05, + "loss": 1.0029, + "step": 1944 + }, + { + "epoch": 0.7558534926649179, + "grad_norm": 0.2020149528980255, + "learning_rate": 4.9046321525885565e-05, + "loss": 1.0684, + 
"step": 1945 + }, + { + "epoch": 0.7562421062858253, + "grad_norm": 0.21063408255577087, + "learning_rate": 4.8968470221876217e-05, + "loss": 1.0147, + "step": 1946 + }, + { + "epoch": 0.7566307199067327, + "grad_norm": 0.19599388539791107, + "learning_rate": 4.8890618917866875e-05, + "loss": 0.9719, + "step": 1947 + }, + { + "epoch": 0.7570193335276402, + "grad_norm": 0.2158602923154831, + "learning_rate": 4.8812767613857534e-05, + "loss": 1.0439, + "step": 1948 + }, + { + "epoch": 0.7574079471485475, + "grad_norm": 0.21013815701007843, + "learning_rate": 4.873491630984819e-05, + "loss": 1.0319, + "step": 1949 + }, + { + "epoch": 0.757796560769455, + "grad_norm": 0.2020798772573471, + "learning_rate": 4.8657065005838845e-05, + "loss": 1.0037, + "step": 1950 + }, + { + "epoch": 0.7581851743903624, + "grad_norm": 0.21202047169208527, + "learning_rate": 4.8579213701829504e-05, + "loss": 0.9823, + "step": 1951 + }, + { + "epoch": 0.7585737880112698, + "grad_norm": 0.20750083029270172, + "learning_rate": 4.850136239782016e-05, + "loss": 1.0073, + "step": 1952 + }, + { + "epoch": 0.7589624016321772, + "grad_norm": 0.20938372611999512, + "learning_rate": 4.842351109381083e-05, + "loss": 1.0326, + "step": 1953 + }, + { + "epoch": 0.7593510152530846, + "grad_norm": 0.21984544396400452, + "learning_rate": 4.834565978980149e-05, + "loss": 1.0363, + "step": 1954 + }, + { + "epoch": 0.759739628873992, + "grad_norm": 0.20306189358234406, + "learning_rate": 4.826780848579214e-05, + "loss": 1.0374, + "step": 1955 + }, + { + "epoch": 0.7601282424948994, + "grad_norm": 0.20631705224514008, + "learning_rate": 4.81899571817828e-05, + "loss": 1.0985, + "step": 1956 + }, + { + "epoch": 0.7605168561158069, + "grad_norm": 0.22092190384864807, + "learning_rate": 4.811210587777346e-05, + "loss": 1.0216, + "step": 1957 + }, + { + "epoch": 0.7609054697367142, + "grad_norm": 0.21419481933116913, + "learning_rate": 4.8034254573764116e-05, + "loss": 1.0327, + "step": 1958 + }, + { + "epoch": 
0.7612940833576217, + "grad_norm": 0.1954476237297058, + "learning_rate": 4.795640326975477e-05, + "loss": 1.0139, + "step": 1959 + }, + { + "epoch": 0.7616826969785291, + "grad_norm": 0.21092113852500916, + "learning_rate": 4.7878551965745427e-05, + "loss": 1.0934, + "step": 1960 + }, + { + "epoch": 0.7620713105994366, + "grad_norm": 0.1998988837003708, + "learning_rate": 4.7800700661736085e-05, + "loss": 0.9782, + "step": 1961 + }, + { + "epoch": 0.7624599242203439, + "grad_norm": 0.20410674810409546, + "learning_rate": 4.7722849357726744e-05, + "loss": 1.0186, + "step": 1962 + }, + { + "epoch": 0.7628485378412513, + "grad_norm": 0.25312289595603943, + "learning_rate": 4.76449980537174e-05, + "loss": 1.0103, + "step": 1963 + }, + { + "epoch": 0.7632371514621588, + "grad_norm": 0.20648318529129028, + "learning_rate": 4.7567146749708055e-05, + "loss": 1.0314, + "step": 1964 + }, + { + "epoch": 0.7636257650830661, + "grad_norm": 0.20513702929019928, + "learning_rate": 4.7489295445698714e-05, + "loss": 0.981, + "step": 1965 + }, + { + "epoch": 0.7640143787039736, + "grad_norm": 0.20063039660453796, + "learning_rate": 4.741144414168938e-05, + "loss": 1.0218, + "step": 1966 + }, + { + "epoch": 0.764402992324881, + "grad_norm": 0.20328521728515625, + "learning_rate": 4.733359283768004e-05, + "loss": 1.0614, + "step": 1967 + }, + { + "epoch": 0.7647916059457884, + "grad_norm": 0.2209623008966446, + "learning_rate": 4.725574153367069e-05, + "loss": 1.0478, + "step": 1968 + }, + { + "epoch": 0.7651802195666958, + "grad_norm": 0.2023559957742691, + "learning_rate": 4.717789022966135e-05, + "loss": 1.0455, + "step": 1969 + }, + { + "epoch": 0.7655688331876033, + "grad_norm": 0.20461297035217285, + "learning_rate": 4.710003892565201e-05, + "loss": 0.9427, + "step": 1970 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.2108335793018341, + "learning_rate": 4.702218762164267e-05, + "loss": 1.0344, + "step": 1971 + }, + { + "epoch": 0.766346060429418, + "grad_norm": 
0.20883473753929138, + "learning_rate": 4.6944336317633326e-05, + "loss": 1.0336, + "step": 1972 + }, + { + "epoch": 0.7667346740503255, + "grad_norm": 0.20144741237163544, + "learning_rate": 4.686648501362398e-05, + "loss": 1.0101, + "step": 1973 + }, + { + "epoch": 0.7671232876712328, + "grad_norm": 0.21269328892230988, + "learning_rate": 4.6788633709614637e-05, + "loss": 0.9989, + "step": 1974 + }, + { + "epoch": 0.7675119012921403, + "grad_norm": 0.20673738420009613, + "learning_rate": 4.6710782405605295e-05, + "loss": 1.0235, + "step": 1975 + }, + { + "epoch": 0.7679005149130477, + "grad_norm": 0.1966594159603119, + "learning_rate": 4.6632931101595954e-05, + "loss": 1.0081, + "step": 1976 + }, + { + "epoch": 0.7682891285339551, + "grad_norm": 0.22186829149723053, + "learning_rate": 4.6555079797586606e-05, + "loss": 1.0081, + "step": 1977 + }, + { + "epoch": 0.7686777421548625, + "grad_norm": 0.20602557063102722, + "learning_rate": 4.6477228493577265e-05, + "loss": 1.0381, + "step": 1978 + }, + { + "epoch": 0.76906635577577, + "grad_norm": 0.19581305980682373, + "learning_rate": 4.639937718956793e-05, + "loss": 1.0196, + "step": 1979 + }, + { + "epoch": 0.7694549693966773, + "grad_norm": 0.20162086188793182, + "learning_rate": 4.632152588555859e-05, + "loss": 1.0168, + "step": 1980 + }, + { + "epoch": 0.7698435830175848, + "grad_norm": 0.21967145800590515, + "learning_rate": 4.624367458154925e-05, + "loss": 1.0339, + "step": 1981 + }, + { + "epoch": 0.7702321966384922, + "grad_norm": 0.20245851576328278, + "learning_rate": 4.61658232775399e-05, + "loss": 1.0349, + "step": 1982 + }, + { + "epoch": 0.7706208102593995, + "grad_norm": 0.20409934222698212, + "learning_rate": 4.608797197353056e-05, + "loss": 1.0296, + "step": 1983 + }, + { + "epoch": 0.771009423880307, + "grad_norm": 0.19757163524627686, + "learning_rate": 4.601012066952122e-05, + "loss": 1.0443, + "step": 1984 + }, + { + "epoch": 0.7713980375012144, + "grad_norm": 0.20038221776485443, + 
"learning_rate": 4.593226936551188e-05, + "loss": 1.0431, + "step": 1985 + }, + { + "epoch": 0.7717866511221219, + "grad_norm": 0.2112458199262619, + "learning_rate": 4.585441806150253e-05, + "loss": 1.0553, + "step": 1986 + }, + { + "epoch": 0.7721752647430292, + "grad_norm": 0.21868042647838593, + "learning_rate": 4.577656675749319e-05, + "loss": 1.0061, + "step": 1987 + }, + { + "epoch": 0.7725638783639367, + "grad_norm": 0.22484582662582397, + "learning_rate": 4.5698715453483846e-05, + "loss": 1.0831, + "step": 1988 + }, + { + "epoch": 0.7729524919848441, + "grad_norm": 0.20265011489391327, + "learning_rate": 4.5620864149474505e-05, + "loss": 1.0206, + "step": 1989 + }, + { + "epoch": 0.7733411056057515, + "grad_norm": 0.2052810937166214, + "learning_rate": 4.5543012845465164e-05, + "loss": 1.0366, + "step": 1990 + }, + { + "epoch": 0.7737297192266589, + "grad_norm": 0.21016088128089905, + "learning_rate": 4.546516154145582e-05, + "loss": 0.9963, + "step": 1991 + }, + { + "epoch": 0.7741183328475663, + "grad_norm": 0.19719412922859192, + "learning_rate": 4.538731023744648e-05, + "loss": 0.9853, + "step": 1992 + }, + { + "epoch": 0.7745069464684737, + "grad_norm": 0.20447245240211487, + "learning_rate": 4.530945893343714e-05, + "loss": 0.9977, + "step": 1993 + }, + { + "epoch": 0.7748955600893811, + "grad_norm": 0.21796588599681854, + "learning_rate": 4.52316076294278e-05, + "loss": 1.0949, + "step": 1994 + }, + { + "epoch": 0.7752841737102886, + "grad_norm": 0.2041284590959549, + "learning_rate": 4.515375632541845e-05, + "loss": 1.0034, + "step": 1995 + }, + { + "epoch": 0.7756727873311959, + "grad_norm": 0.21134726703166962, + "learning_rate": 4.507590502140911e-05, + "loss": 1.0076, + "step": 1996 + }, + { + "epoch": 0.7760614009521034, + "grad_norm": 0.20730996131896973, + "learning_rate": 4.499805371739977e-05, + "loss": 1.0456, + "step": 1997 + }, + { + "epoch": 0.7764500145730108, + "grad_norm": 0.22316931188106537, + "learning_rate": 
4.492020241339043e-05, + "loss": 0.9418, + "step": 1998 + }, + { + "epoch": 0.7768386281939182, + "grad_norm": 0.21494819223880768, + "learning_rate": 4.484235110938109e-05, + "loss": 1.0597, + "step": 1999 + }, + { + "epoch": 0.7772272418148256, + "grad_norm": 0.20344491302967072, + "learning_rate": 4.476449980537174e-05, + "loss": 0.9749, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 2574, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7079644445254222e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/outputs/checkpoint-2500/README.md b/outputs/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7 --- /dev/null +++ b/outputs/checkpoint-2500/README.md @@ -0,0 +1,209 @@ +--- +base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit +library_name: peft +tags: +- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### 
Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/outputs/checkpoint-2500/adapter_config.json b/outputs/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7 --- /dev/null +++ b/outputs/checkpoint-2500/adapter_config.json @@ -0,0 +1,45 @@ +{ + "alpha_pattern": {}, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + 
"target_modules": [ + "o_proj", + "v_proj", + "up_proj", + "down_proj", + "gate_proj", + "k_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/outputs/checkpoint-2500/chat_template.jinja b/outputs/checkpoint-2500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316 --- /dev/null +++ b/outputs/checkpoint-2500/chat_template.jinja @@ -0,0 +1,315 @@ +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". 
+ #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if 
variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties -%} + {{- "(_: " }} + {{- "{\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {{- "// " + param_spec.description + "\n" }} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" 
}} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- "\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the 
most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. 
Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}} + {%- else %} + {{- model_identity }} + {%- endif %} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools is defined %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools is defined -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." 
}} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {%- endif %} + {%- if tools -%} + {{- "\n\n" }} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {%- if "tool_calls" in message %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." 
+ tool_call.name + "<|channel|>commentary json<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif "thinking" in message and loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- set last_tool_call.name = none %} + {%- elif "thinking" in message %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- elif loop.last and not add_generation_prompt %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }} + {%- else %} + {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." 
+ last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- else -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} \ No newline at end of file diff --git a/outputs/checkpoint-2500/optimizer.pt b/outputs/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b35ff1d3f5514a357050a8186bbc57a31ead7aff --- /dev/null +++ b/outputs/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7e45070f968f70d7a53d726615310d86734504e3bce33d45c6aeee13b2a6a00 +size 16894883 diff --git a/outputs/checkpoint-2500/rng_state.pth b/outputs/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0 --- /dev/null +++ b/outputs/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33 +size 14645 diff --git a/outputs/checkpoint-2500/special_tokens_map.json b/outputs/checkpoint-2500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639 --- /dev/null +++ b/outputs/checkpoint-2500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false + } +} diff --git a/outputs/checkpoint-2500/tokenizer.json b/outputs/checkpoint-2500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/outputs/checkpoint-2500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/outputs/checkpoint-2500/tokenizer_config.json b/outputs/checkpoint-2500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45 --- /dev/null +++ b/outputs/checkpoint-2500/tokenizer_config.json @@ -0,0 +1,185 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true 
+ }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|reserved_200017|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/outputs/checkpoint-2500/trainer_state.json b/outputs/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ec05f859eef78c5a2160e814e1eb1c901377dfec --- /dev/null +++ b/outputs/checkpoint-2500/trainer_state.json @@ -0,0 +1,17534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9715340522685321, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 13.684800148010254, + "learning_rate": 0.0, + "loss": 2.3276, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 13.660787582397461, + "learning_rate": 4e-05, + "loss": 2.2792, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 13.35280704498291, + "learning_rate": 8e-05, + "loss": 2.4151, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 6.15027379989624, + "learning_rate": 0.00012, + "loss": 1.7812, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 1.3168226480484009, + "learning_rate": 0.00016, + "loss": 1.4536, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.9872580170631409, + "learning_rate": 0.0002, + "loss": 1.4171, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7496100664138794, + "learning_rate": 0.00019935064935064936, + "loss": 1.4168, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7376005053520203, + "learning_rate": 0.00019870129870129872, + "loss": 1.3659, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 
0.5281137824058533, + "learning_rate": 0.00019805194805194807, + "loss": 1.2566, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5485746264457703, + "learning_rate": 0.00019740259740259742, + "loss": 1.3761, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5506592392921448, + "learning_rate": 0.00019675324675324675, + "loss": 1.3327, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.49382686614990234, + "learning_rate": 0.00019610389610389613, + "loss": 1.3727, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.36203011870384216, + "learning_rate": 0.00019545454545454548, + "loss": 1.1515, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3528599739074707, + "learning_rate": 0.0001948051948051948, + "loss": 1.2636, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.31244418025016785, + "learning_rate": 0.00019415584415584416, + "loss": 1.1873, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.3379523754119873, + "learning_rate": 0.00019350649350649354, + "loss": 1.2657, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3025083839893341, + "learning_rate": 0.00019285714285714286, + "loss": 1.2846, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.2560190260410309, + "learning_rate": 0.00019220779220779222, + "loss": 1.1587, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.2554129958152771, + "learning_rate": 0.00019155844155844157, + "loss": 1.2812, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.22662702202796936, + "learning_rate": 0.00019090909090909092, + "loss": 1.1664, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.2515714168548584, + "learning_rate": 0.00019025974025974027, + "loss": 1.2177, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.24396637082099915, + "learning_rate": 0.00018961038961038963, + "loss": 1.2053, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.24488303065299988, + "learning_rate": 0.00018896103896103895, + "loss": 1.2074, 
+ "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.2168620079755783, + "learning_rate": 0.00018831168831168833, + "loss": 1.1284, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.24021224677562714, + "learning_rate": 0.00018766233766233769, + "loss": 1.2169, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.20057056844234467, + "learning_rate": 0.000187012987012987, + "loss": 1.1031, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.19900795817375183, + "learning_rate": 0.00018636363636363636, + "loss": 1.1004, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.2019268423318863, + "learning_rate": 0.00018571428571428572, + "loss": 1.1476, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.1996479034423828, + "learning_rate": 0.00018506493506493507, + "loss": 1.1455, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.25262022018432617, + "learning_rate": 0.00018441558441558442, + "loss": 1.1025, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.225438192486763, + "learning_rate": 0.00018376623376623378, + "loss": 1.1954, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.17834505438804626, + "learning_rate": 0.00018311688311688313, + "loss": 1.0934, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.20071206986904144, + "learning_rate": 0.00018246753246753248, + "loss": 1.0488, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.1920139640569687, + "learning_rate": 0.00018181818181818183, + "loss": 1.123, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.18714852631092072, + "learning_rate": 0.0001811688311688312, + "loss": 1.0798, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.18315713107585907, + "learning_rate": 0.00018051948051948054, + "loss": 1.1107, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.19156870245933533, + "learning_rate": 0.00017987012987012987, + "loss": 1.1125, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.21527768671512604, + 
"learning_rate": 0.00017922077922077922, + "loss": 1.1346, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.1871163249015808, + "learning_rate": 0.0001785714285714286, + "loss": 1.0742, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.17750784754753113, + "learning_rate": 0.00017792207792207792, + "loss": 1.1323, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.177419051527977, + "learning_rate": 0.00017727272727272728, + "loss": 1.1405, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.16714292764663696, + "learning_rate": 0.00017662337662337663, + "loss": 1.1084, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.1610356718301773, + "learning_rate": 0.00017597402597402598, + "loss": 1.1125, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.2548656761646271, + "learning_rate": 0.00017532467532467534, + "loss": 1.1114, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.1731044203042984, + "learning_rate": 0.0001746753246753247, + "loss": 1.1197, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.1739533394575119, + "learning_rate": 0.00017402597402597401, + "loss": 1.1777, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.2178352177143097, + "learning_rate": 0.0001733766233766234, + "loss": 1.1111, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.17247150838375092, + "learning_rate": 0.00017272727272727275, + "loss": 1.1253, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.18075324594974518, + "learning_rate": 0.00017207792207792207, + "loss": 1.1358, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.15898071229457855, + "learning_rate": 0.00017142857142857143, + "loss": 1.0606, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.16518613696098328, + "learning_rate": 0.0001707792207792208, + "loss": 1.0944, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.16035063564777374, + "learning_rate": 0.00017012987012987013, + "loss": 1.0554, + "step": 52 + }, + { + 
"epoch": 0.1696, + "grad_norm": 0.1686483472585678, + "learning_rate": 0.00016948051948051948, + "loss": 1.0384, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.16575631499290466, + "learning_rate": 0.00016883116883116884, + "loss": 1.0243, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.16840039193630219, + "learning_rate": 0.0001681818181818182, + "loss": 1.117, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.17616064846515656, + "learning_rate": 0.00016753246753246754, + "loss": 1.0743, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.168218195438385, + "learning_rate": 0.0001668831168831169, + "loss": 1.0627, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.17026656866073608, + "learning_rate": 0.00016623376623376625, + "loss": 1.0059, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.16454458236694336, + "learning_rate": 0.0001655844155844156, + "loss": 0.9943, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.17185136675834656, + "learning_rate": 0.00016493506493506495, + "loss": 1.1545, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.17822986841201782, + "learning_rate": 0.00016428571428571428, + "loss": 1.073, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.1676608771085739, + "learning_rate": 0.00016363636363636366, + "loss": 1.0886, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.1727771908044815, + "learning_rate": 0.000162987012987013, + "loss": 1.0432, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.17827573418617249, + "learning_rate": 0.00016233766233766234, + "loss": 1.083, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.19807517528533936, + "learning_rate": 0.0001616883116883117, + "loss": 1.1208, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.17693684995174408, + "learning_rate": 0.00016103896103896104, + "loss": 1.089, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.15489234030246735, + "learning_rate": 
0.0001603896103896104, + "loss": 0.9707, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.16443990170955658, + "learning_rate": 0.00015974025974025975, + "loss": 1.0643, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.2051103413105011, + "learning_rate": 0.0001590909090909091, + "loss": 1.1246, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.18824075162410736, + "learning_rate": 0.00015844155844155845, + "loss": 1.0855, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.18659448623657227, + "learning_rate": 0.0001577922077922078, + "loss": 1.1412, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.1854114979505539, + "learning_rate": 0.00015714285714285716, + "loss": 1.0249, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.1876193732023239, + "learning_rate": 0.00015649350649350649, + "loss": 1.1029, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.1888684630393982, + "learning_rate": 0.00015584415584415587, + "loss": 1.0789, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.20240606367588043, + "learning_rate": 0.0001551948051948052, + "loss": 1.0495, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.232120081782341, + "learning_rate": 0.00015454545454545454, + "loss": 1.0735, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.16897843778133392, + "learning_rate": 0.0001538961038961039, + "loss": 1.0164, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.18796634674072266, + "learning_rate": 0.00015324675324675325, + "loss": 1.0676, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.19574032723903656, + "learning_rate": 0.0001525974025974026, + "loss": 1.0456, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.18007811903953552, + "learning_rate": 0.00015194805194805196, + "loss": 1.0894, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.18932929635047913, + "learning_rate": 0.0001512987012987013, + "loss": 1.0729, + "step": 81 + }, + { + "epoch": 0.2624, + 
"grad_norm": 0.20614288747310638, + "learning_rate": 0.00015064935064935066, + "loss": 1.0854, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.19291089475154877, + "learning_rate": 0.00015000000000000001, + "loss": 1.1217, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.18916529417037964, + "learning_rate": 0.00014935064935064934, + "loss": 1.0963, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.20306220650672913, + "learning_rate": 0.00014870129870129872, + "loss": 1.0898, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.17870067059993744, + "learning_rate": 0.00014805194805194807, + "loss": 1.0213, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.18411923944950104, + "learning_rate": 0.0001474025974025974, + "loss": 1.0844, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.18788227438926697, + "learning_rate": 0.00014675324675324675, + "loss": 1.0338, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.23874884843826294, + "learning_rate": 0.00014610389610389613, + "loss": 1.1118, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.19380499422550201, + "learning_rate": 0.00014545454545454546, + "loss": 1.0464, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.18968750536441803, + "learning_rate": 0.0001448051948051948, + "loss": 1.0569, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.19545753300189972, + "learning_rate": 0.00014415584415584416, + "loss": 1.1225, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.19170494377613068, + "learning_rate": 0.00014350649350649352, + "loss": 1.0602, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.17953918874263763, + "learning_rate": 0.00014285714285714287, + "loss": 1.032, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.1822536289691925, + "learning_rate": 0.00014220779220779222, + "loss": 1.0559, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.18591298162937164, + "learning_rate": 
0.00014155844155844155, + "loss": 1.031, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.2129002958536148, + "learning_rate": 0.00014090909090909093, + "loss": 1.1391, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.18386681377887726, + "learning_rate": 0.00014025974025974028, + "loss": 0.9919, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.18314239382743835, + "learning_rate": 0.0001396103896103896, + "loss": 1.0445, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.1999066174030304, + "learning_rate": 0.00013896103896103896, + "loss": 1.0538, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.18741188943386078, + "learning_rate": 0.00013831168831168834, + "loss": 1.0722, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.19351010024547577, + "learning_rate": 0.00013766233766233766, + "loss": 1.0491, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.18859203159809113, + "learning_rate": 0.00013701298701298702, + "loss": 1.0593, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.1962767392396927, + "learning_rate": 0.00013636363636363637, + "loss": 1.1344, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.20819440484046936, + "learning_rate": 0.00013571428571428572, + "loss": 1.1137, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.19590184092521667, + "learning_rate": 0.00013506493506493507, + "loss": 1.0624, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.18631424009799957, + "learning_rate": 0.00013441558441558443, + "loss": 1.0587, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.19572143256664276, + "learning_rate": 0.00013376623376623375, + "loss": 1.0494, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.1910988837480545, + "learning_rate": 0.00013311688311688313, + "loss": 1.0481, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.19455869495868683, + "learning_rate": 0.00013246753246753249, + "loss": 1.029, + "step": 110 + }, + { + 
"epoch": 0.3552, + "grad_norm": 0.18669827282428741, + "learning_rate": 0.0001318181818181818, + "loss": 1.0513, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.17523664236068726, + "learning_rate": 0.0001311688311688312, + "loss": 1.0126, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.17929129302501678, + "learning_rate": 0.00013051948051948052, + "loss": 1.0717, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.19380168616771698, + "learning_rate": 0.00012987012987012987, + "loss": 1.0324, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.18090228736400604, + "learning_rate": 0.00012922077922077922, + "loss": 1.0515, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.2067340910434723, + "learning_rate": 0.00012857142857142858, + "loss": 1.0939, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.1880485862493515, + "learning_rate": 0.00012792207792207793, + "loss": 1.0986, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.182168647646904, + "learning_rate": 0.00012727272727272728, + "loss": 1.0109, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.20187129080295563, + "learning_rate": 0.00012662337662337663, + "loss": 1.0668, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.2082669734954834, + "learning_rate": 0.000125974025974026, + "loss": 1.054, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.18294434249401093, + "learning_rate": 0.00012532467532467534, + "loss": 1.0397, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.20515067875385284, + "learning_rate": 0.00012467532467532467, + "loss": 1.1092, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.1758790761232376, + "learning_rate": 0.00012402597402597402, + "loss": 0.9755, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.2170792669057846, + "learning_rate": 0.0001233766233766234, + "loss": 1.0434, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.202157124876976, + "learning_rate": 
0.00012272727272727272, + "loss": 1.1129, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.18556398153305054, + "learning_rate": 0.00012207792207792208, + "loss": 1.0665, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.20196087658405304, + "learning_rate": 0.00012142857142857143, + "loss": 1.1, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.1921566128730774, + "learning_rate": 0.0001207792207792208, + "loss": 1.0918, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.18866224586963654, + "learning_rate": 0.00012012987012987014, + "loss": 1.0014, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.207601398229599, + "learning_rate": 0.00011948051948051949, + "loss": 1.0726, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.21592366695404053, + "learning_rate": 0.00011883116883116883, + "loss": 1.1379, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2016124576330185, + "learning_rate": 0.0001181818181818182, + "loss": 1.1428, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.20478437840938568, + "learning_rate": 0.00011753246753246753, + "loss": 1.121, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.22730594873428345, + "learning_rate": 0.00011688311688311689, + "loss": 1.0319, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.22592711448669434, + "learning_rate": 0.00011623376623376625, + "loss": 1.1264, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.20035041868686676, + "learning_rate": 0.00011558441558441559, + "loss": 1.0686, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.20648567378520966, + "learning_rate": 0.00011493506493506494, + "loss": 1.0817, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.21222743391990662, + "learning_rate": 0.00011428571428571428, + "loss": 1.0678, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.2075391560792923, + "learning_rate": 0.00011363636363636365, + "loss": 1.0897, + "step": 139 + }, + { + 
"epoch": 0.448, + "grad_norm": 0.1964101791381836, + "learning_rate": 0.000112987012987013, + "loss": 1.0906, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.22406511008739471, + "learning_rate": 0.00011233766233766234, + "loss": 1.0594, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.23787978291511536, + "learning_rate": 0.00011168831168831168, + "loss": 1.1053, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.21196185052394867, + "learning_rate": 0.00011103896103896105, + "loss": 1.0923, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.21042804419994354, + "learning_rate": 0.0001103896103896104, + "loss": 1.0381, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.2267436534166336, + "learning_rate": 0.00010974025974025974, + "loss": 1.0818, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.23742735385894775, + "learning_rate": 0.00010909090909090909, + "loss": 1.0872, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.17787213623523712, + "learning_rate": 0.00010844155844155846, + "loss": 1.03, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.22422832250595093, + "learning_rate": 0.0001077922077922078, + "loss": 1.0738, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.22946301102638245, + "learning_rate": 0.00010714285714285715, + "loss": 1.0274, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.2137996405363083, + "learning_rate": 0.00010649350649350649, + "loss": 1.0539, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.1748756766319275, + "learning_rate": 0.00010584415584415586, + "loss": 1.0355, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.22275175154209137, + "learning_rate": 0.0001051948051948052, + "loss": 1.1696, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.20996077358722687, + "learning_rate": 0.00010454545454545455, + "loss": 1.0303, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.1945938766002655, + 
"learning_rate": 0.00010389610389610389, + "loss": 0.9747, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.1970377266407013, + "learning_rate": 0.00010324675324675325, + "loss": 1.0358, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.18814732134342194, + "learning_rate": 0.00010259740259740261, + "loss": 0.9612, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.2153233289718628, + "learning_rate": 0.00010194805194805195, + "loss": 1.0749, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.21788008511066437, + "learning_rate": 0.0001012987012987013, + "loss": 1.0883, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.214650496840477, + "learning_rate": 0.00010064935064935067, + "loss": 1.0539, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.19312834739685059, + "learning_rate": 0.0001, + "loss": 1.0657, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.19916598498821259, + "learning_rate": 9.935064935064936e-05, + "loss": 1.0478, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2057606726884842, + "learning_rate": 9.870129870129871e-05, + "loss": 1.0094, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.22159607708454132, + "learning_rate": 9.805194805194806e-05, + "loss": 1.0952, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.18274275958538055, + "learning_rate": 9.74025974025974e-05, + "loss": 1.0065, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.19835162162780762, + "learning_rate": 9.675324675324677e-05, + "loss": 1.0742, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2114904820919037, + "learning_rate": 9.610389610389611e-05, + "loss": 1.1109, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.21488523483276367, + "learning_rate": 9.545454545454546e-05, + "loss": 1.0465, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.19870303571224213, + "learning_rate": 9.480519480519481e-05, + "loss": 1.0318, + "step": 168 + }, + { + 
"epoch": 0.5408, + "grad_norm": 0.20413029193878174, + "learning_rate": 9.415584415584417e-05, + "loss": 1.0817, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.1847231239080429, + "learning_rate": 9.35064935064935e-05, + "loss": 1.0144, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.2715964913368225, + "learning_rate": 9.285714285714286e-05, + "loss": 0.9832, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.2225002497434616, + "learning_rate": 9.220779220779221e-05, + "loss": 1.1051, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.22931510210037231, + "learning_rate": 9.155844155844156e-05, + "loss": 1.1042, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.21848627924919128, + "learning_rate": 9.090909090909092e-05, + "loss": 1.1151, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.19852259755134583, + "learning_rate": 9.025974025974027e-05, + "loss": 1.0889, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2080363780260086, + "learning_rate": 8.961038961038961e-05, + "loss": 1.0777, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.22391024231910706, + "learning_rate": 8.896103896103896e-05, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.21793846786022186, + "learning_rate": 8.831168831168831e-05, + "loss": 1.044, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.2009749859571457, + "learning_rate": 8.766233766233767e-05, + "loss": 1.0198, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.19432318210601807, + "learning_rate": 8.701298701298701e-05, + "loss": 1.075, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.18634547293186188, + "learning_rate": 8.636363636363637e-05, + "loss": 0.9964, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.1947103589773178, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0025, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.23098671436309814, + "learning_rate": 
8.506493506493507e-05, + "loss": 1.0562, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.19686414301395416, + "learning_rate": 8.441558441558442e-05, + "loss": 1.0285, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.19852428138256073, + "learning_rate": 8.376623376623377e-05, + "loss": 1.0054, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.21483510732650757, + "learning_rate": 8.311688311688312e-05, + "loss": 1.108, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.23313644528388977, + "learning_rate": 8.246753246753248e-05, + "loss": 1.1383, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.21453145146369934, + "learning_rate": 8.181818181818183e-05, + "loss": 1.0911, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.20268195867538452, + "learning_rate": 8.116883116883117e-05, + "loss": 1.0145, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.20576398074626923, + "learning_rate": 8.051948051948052e-05, + "loss": 1.0829, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.21732626855373383, + "learning_rate": 7.987012987012987e-05, + "loss": 1.0152, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.22046895325183868, + "learning_rate": 7.922077922077923e-05, + "loss": 1.1311, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.19727715849876404, + "learning_rate": 7.857142857142858e-05, + "loss": 1.0364, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.20861488580703735, + "learning_rate": 7.792207792207793e-05, + "loss": 1.0435, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.18545083701610565, + "learning_rate": 7.727272727272727e-05, + "loss": 1.0299, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.19965052604675293, + "learning_rate": 7.662337662337662e-05, + "loss": 1.0511, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.23673909902572632, + "learning_rate": 7.597402597402598e-05, + "loss": 1.081, + "step": 197 + }, + { + 
"epoch": 0.6336, + "grad_norm": 0.17583179473876953, + "learning_rate": 7.532467532467533e-05, + "loss": 0.9808, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2129366099834442, + "learning_rate": 7.467532467532467e-05, + "loss": 1.0522, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.21679140627384186, + "learning_rate": 7.402597402597404e-05, + "loss": 1.0567, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.2032000720500946, + "learning_rate": 7.337662337662338e-05, + "loss": 1.0466, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.1887970268726349, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0329, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.21060192584991455, + "learning_rate": 7.207792207792208e-05, + "loss": 1.1021, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.21191425621509552, + "learning_rate": 7.142857142857143e-05, + "loss": 0.99, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.1995989829301834, + "learning_rate": 7.077922077922077e-05, + "loss": 1.0526, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.1849513053894043, + "learning_rate": 7.012987012987014e-05, + "loss": 0.9998, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.1948779672384262, + "learning_rate": 6.948051948051948e-05, + "loss": 1.075, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.20374052226543427, + "learning_rate": 6.883116883116883e-05, + "loss": 1.0933, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.2102465033531189, + "learning_rate": 6.818181818181818e-05, + "loss": 1.1123, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.21376173198223114, + "learning_rate": 6.753246753246754e-05, + "loss": 1.1233, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.20934203267097473, + "learning_rate": 6.688311688311688e-05, + "loss": 1.1374, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.18604128062725067, + "learning_rate": 
6.623376623376624e-05, + "loss": 1.0213, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.19644233584403992, + "learning_rate": 6.55844155844156e-05, + "loss": 1.0046, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.18479463458061218, + "learning_rate": 6.493506493506494e-05, + "loss": 0.9792, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.1945149153470993, + "learning_rate": 6.428571428571429e-05, + "loss": 1.0584, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2070147544145584, + "learning_rate": 6.363636363636364e-05, + "loss": 1.071, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.19645985960960388, + "learning_rate": 6.2987012987013e-05, + "loss": 1.0721, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.1960117667913437, + "learning_rate": 6.233766233766233e-05, + "loss": 1.071, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.20168261229991913, + "learning_rate": 6.16883116883117e-05, + "loss": 1.0808, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.21254412829875946, + "learning_rate": 6.103896103896104e-05, + "loss": 1.0287, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.21271063387393951, + "learning_rate": 6.03896103896104e-05, + "loss": 1.0605, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.2081408053636551, + "learning_rate": 5.9740259740259744e-05, + "loss": 1.091, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.21113798022270203, + "learning_rate": 5.90909090909091e-05, + "loss": 1.1323, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.20670844614505768, + "learning_rate": 5.844155844155844e-05, + "loss": 1.0955, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.2010120451450348, + "learning_rate": 5.7792207792207796e-05, + "loss": 1.1068, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.20379121601581573, + "learning_rate": 5.714285714285714e-05, + "loss": 1.0419, + "step": 226 + }, + { + "epoch": 0.7264, 
+ "grad_norm": 0.22799807786941528, + "learning_rate": 5.64935064935065e-05, + "loss": 1.0904, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.2005995213985443, + "learning_rate": 5.584415584415584e-05, + "loss": 1.078, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.20329605042934418, + "learning_rate": 5.51948051948052e-05, + "loss": 1.0245, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.19283504784107208, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.0367, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.20624355971813202, + "learning_rate": 5.38961038961039e-05, + "loss": 1.1046, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.21362991631031036, + "learning_rate": 5.3246753246753245e-05, + "loss": 1.1104, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.20447863638401031, + "learning_rate": 5.25974025974026e-05, + "loss": 1.0514, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.1974381059408188, + "learning_rate": 5.1948051948051944e-05, + "loss": 1.0048, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.21237170696258545, + "learning_rate": 5.1298701298701304e-05, + "loss": 1.1299, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.21224971115589142, + "learning_rate": 5.064935064935065e-05, + "loss": 1.05, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.19865018129348755, + "learning_rate": 5e-05, + "loss": 1.0665, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.19199275970458984, + "learning_rate": 4.9350649350649355e-05, + "loss": 0.9531, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.19573214650154114, + "learning_rate": 4.87012987012987e-05, + "loss": 1.0318, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.21338805556297302, + "learning_rate": 4.8051948051948054e-05, + "loss": 1.0343, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.2254691869020462, + "learning_rate": 4.740259740259741e-05, + "loss": 
1.0472, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.18101665377616882, + "learning_rate": 4.675324675324675e-05, + "loss": 1.017, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.22090592980384827, + "learning_rate": 4.6103896103896106e-05, + "loss": 1.0389, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.20865507423877716, + "learning_rate": 4.545454545454546e-05, + "loss": 1.0369, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.21619610488414764, + "learning_rate": 4.4805194805194805e-05, + "loss": 1.109, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.21694771945476532, + "learning_rate": 4.415584415584416e-05, + "loss": 1.0525, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.2182662934064865, + "learning_rate": 4.3506493506493503e-05, + "loss": 1.0331, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.2026486098766327, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.027, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.19606547057628632, + "learning_rate": 4.220779220779221e-05, + "loss": 1.0242, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.22107470035552979, + "learning_rate": 4.155844155844156e-05, + "loss": 1.0924, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.19960008561611176, + "learning_rate": 4.0909090909090915e-05, + "loss": 1.0384, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.1945488154888153, + "learning_rate": 4.025974025974026e-05, + "loss": 1.0673, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.22067414224147797, + "learning_rate": 3.9610389610389614e-05, + "loss": 1.0426, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.19010980427265167, + "learning_rate": 3.8961038961038966e-05, + "loss": 1.0617, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.18781176209449768, + "learning_rate": 3.831168831168831e-05, + "loss": 1.0243, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 
0.20388829708099365, + "learning_rate": 3.7662337662337665e-05, + "loss": 1.0476, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.19911155104637146, + "learning_rate": 3.701298701298702e-05, + "loss": 1.0324, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.19884039461612701, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.0242, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.19036105275154114, + "learning_rate": 3.571428571428572e-05, + "loss": 1.0323, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.20039844512939453, + "learning_rate": 3.506493506493507e-05, + "loss": 1.0749, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.1899934560060501, + "learning_rate": 3.4415584415584416e-05, + "loss": 1.0115, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.20019090175628662, + "learning_rate": 3.376623376623377e-05, + "loss": 1.0782, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.2020583152770996, + "learning_rate": 3.311688311688312e-05, + "loss": 1.0687, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.21407337486743927, + "learning_rate": 3.246753246753247e-05, + "loss": 1.1015, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.1871640682220459, + "learning_rate": 3.181818181818182e-05, + "loss": 0.9637, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.21622811257839203, + "learning_rate": 3.1168831168831166e-05, + "loss": 1.1222, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.22504661977291107, + "learning_rate": 3.051948051948052e-05, + "loss": 1.132, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.19177629053592682, + "learning_rate": 2.9870129870129872e-05, + "loss": 1.0281, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.1970544159412384, + "learning_rate": 2.922077922077922e-05, + "loss": 1.0393, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.21554522216320038, + "learning_rate": 2.857142857142857e-05, + 
"loss": 1.074, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.21131229400634766, + "learning_rate": 2.792207792207792e-05, + "loss": 1.054, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.19816523790359497, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.0456, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.21075209975242615, + "learning_rate": 2.6623376623376623e-05, + "loss": 1.0758, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2296527624130249, + "learning_rate": 2.5974025974025972e-05, + "loss": 1.0917, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.19722610712051392, + "learning_rate": 2.5324675324675325e-05, + "loss": 1.0704, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.18721099197864532, + "learning_rate": 2.4675324675324678e-05, + "loss": 0.9919, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.20244193077087402, + "learning_rate": 2.4025974025974027e-05, + "loss": 1.0368, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.19518914818763733, + "learning_rate": 2.3376623376623376e-05, + "loss": 1.0436, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.19650357961654663, + "learning_rate": 2.272727272727273e-05, + "loss": 1.0306, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.20320096611976624, + "learning_rate": 2.207792207792208e-05, + "loss": 1.0941, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.18296951055526733, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.9802, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.21357610821723938, + "learning_rate": 2.077922077922078e-05, + "loss": 1.0449, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.193921759724617, + "learning_rate": 2.012987012987013e-05, + "loss": 1.0116, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.1953902244567871, + "learning_rate": 1.9480519480519483e-05, + "loss": 1.0105, + "step": 284 + }, + { + "epoch": 0.912, + 
"grad_norm": 0.19440975785255432, + "learning_rate": 1.8831168831168833e-05, + "loss": 0.9952, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.21054105460643768, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0701, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.18844804167747498, + "learning_rate": 1.7532467532467535e-05, + "loss": 1.0146, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.2067311704158783, + "learning_rate": 1.6883116883116884e-05, + "loss": 1.0781, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.1941213756799698, + "learning_rate": 1.6233766233766234e-05, + "loss": 0.9814, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.22726193070411682, + "learning_rate": 1.5584415584415583e-05, + "loss": 1.1431, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.18025581538677216, + "learning_rate": 1.4935064935064936e-05, + "loss": 0.9649, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.21535000205039978, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.0441, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.20014546811580658, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.0166, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.22738787531852722, + "learning_rate": 1.2987012987012986e-05, + "loss": 1.0564, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.2020861804485321, + "learning_rate": 1.2337662337662339e-05, + "loss": 1.1241, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.19888809323310852, + "learning_rate": 1.1688311688311688e-05, + "loss": 1.1114, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.20912377536296844, + "learning_rate": 1.103896103896104e-05, + "loss": 1.0971, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.21206621825695038, + "learning_rate": 1.038961038961039e-05, + "loss": 1.0601, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.18667680025100708, + "learning_rate": 
9.740259740259742e-06, + "loss": 1.0291, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.21125559508800507, + "learning_rate": 9.090909090909091e-06, + "loss": 1.0483, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.21776145696640015, + "learning_rate": 8.441558441558442e-06, + "loss": 0.9912, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.20144303143024445, + "learning_rate": 7.792207792207792e-06, + "loss": 1.0357, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.1984029859304428, + "learning_rate": 7.142857142857143e-06, + "loss": 1.0648, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.17972829937934875, + "learning_rate": 6.493506493506493e-06, + "loss": 1.0033, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.1818286031484604, + "learning_rate": 5.844155844155844e-06, + "loss": 0.997, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.19670912623405457, + "learning_rate": 5.194805194805195e-06, + "loss": 1.0256, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.20527283847332, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0348, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.19025909900665283, + "learning_rate": 3.896103896103896e-06, + "loss": 1.0682, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.19544818997383118, + "learning_rate": 3.2467532467532465e-06, + "loss": 0.9872, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.22112183272838593, + "learning_rate": 2.5974025974025976e-06, + "loss": 1.0661, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.23328153789043427, + "learning_rate": 1.948051948051948e-06, + "loss": 1.0691, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.20181375741958618, + "learning_rate": 1.2987012987012988e-06, + "loss": 0.9416, + "step": 312 + }, + { + "epoch": 1.0, + "grad_norm": 0.29312625527381897, + "learning_rate": 6.493506493506494e-07, + "loss": 1.1216, + "step": 313 + }, + { + "epoch": 
0.12202467696492762, + "grad_norm": 0.2231415957212448, + "learning_rate": 0.0, + "loss": 1.0468, + "step": 314 + }, + { + "epoch": 0.12241329058583503, + "grad_norm": 0.22263288497924805, + "learning_rate": 0.00017594394706111328, + "loss": 1.0399, + "step": 315 + }, + { + "epoch": 0.12280190420674245, + "grad_norm": 0.22909891605377197, + "learning_rate": 0.00017586609575710393, + "loss": 1.1069, + "step": 316 + }, + { + "epoch": 0.12319051782764986, + "grad_norm": 0.23951445519924164, + "learning_rate": 0.0001757882444530946, + "loss": 1.1036, + "step": 317 + }, + { + "epoch": 0.12357913144855727, + "grad_norm": 0.2409268021583557, + "learning_rate": 0.00017571039314908526, + "loss": 1.1114, + "step": 318 + }, + { + "epoch": 0.12396774506946469, + "grad_norm": 0.23753899335861206, + "learning_rate": 0.00017563254184507592, + "loss": 1.1297, + "step": 319 + }, + { + "epoch": 0.12435635869037209, + "grad_norm": 0.2823902666568756, + "learning_rate": 0.00017555469054106657, + "loss": 1.1293, + "step": 320 + }, + { + "epoch": 0.12474497231127951, + "grad_norm": 0.24093545973300934, + "learning_rate": 0.00017547683923705722, + "loss": 1.0678, + "step": 321 + }, + { + "epoch": 0.12513358593218693, + "grad_norm": 0.22565563023090363, + "learning_rate": 0.0001753989879330479, + "loss": 1.1408, + "step": 322 + }, + { + "epoch": 0.12552219955309435, + "grad_norm": 0.22569572925567627, + "learning_rate": 0.00017532113662903855, + "loss": 1.0543, + "step": 323 + }, + { + "epoch": 0.12591081317400174, + "grad_norm": 0.24962866306304932, + "learning_rate": 0.0001752432853250292, + "loss": 1.0818, + "step": 324 + }, + { + "epoch": 0.12629942679490916, + "grad_norm": 0.22184576094150543, + "learning_rate": 0.00017516543402101986, + "loss": 1.0835, + "step": 325 + }, + { + "epoch": 0.12668804041581658, + "grad_norm": 0.2572194039821625, + "learning_rate": 0.0001750875827170105, + "loss": 1.0767, + "step": 326 + }, + { + "epoch": 0.127076654036724, + "grad_norm": 
0.24131342768669128, + "learning_rate": 0.00017500973141300116, + "loss": 1.0981, + "step": 327 + }, + { + "epoch": 0.1274652676576314, + "grad_norm": 0.2386389970779419, + "learning_rate": 0.00017493188010899184, + "loss": 1.0828, + "step": 328 + }, + { + "epoch": 0.1278538812785388, + "grad_norm": 0.2654125690460205, + "learning_rate": 0.0001748540288049825, + "loss": 1.1266, + "step": 329 + }, + { + "epoch": 0.12824249489944622, + "grad_norm": 0.2925739884376526, + "learning_rate": 0.00017477617750097314, + "loss": 1.0983, + "step": 330 + }, + { + "epoch": 0.12863110852035364, + "grad_norm": 0.26589342951774597, + "learning_rate": 0.0001746983261969638, + "loss": 1.1029, + "step": 331 + }, + { + "epoch": 0.12901972214126106, + "grad_norm": 0.24565957486629486, + "learning_rate": 0.00017462047489295445, + "loss": 1.0975, + "step": 332 + }, + { + "epoch": 0.12940833576216845, + "grad_norm": 0.2459682673215866, + "learning_rate": 0.00017454262358894513, + "loss": 1.0566, + "step": 333 + }, + { + "epoch": 0.12979694938307587, + "grad_norm": 0.23349183797836304, + "learning_rate": 0.00017446477228493578, + "loss": 1.0833, + "step": 334 + }, + { + "epoch": 0.1301855630039833, + "grad_norm": 0.26166337728500366, + "learning_rate": 0.00017438692098092643, + "loss": 1.1598, + "step": 335 + }, + { + "epoch": 0.1305741766248907, + "grad_norm": 0.24188168346881866, + "learning_rate": 0.00017430906967691708, + "loss": 1.0728, + "step": 336 + }, + { + "epoch": 0.13096279024579813, + "grad_norm": 0.22922398149967194, + "learning_rate": 0.00017423121837290773, + "loss": 1.0311, + "step": 337 + }, + { + "epoch": 0.13135140386670552, + "grad_norm": 0.2652754485607147, + "learning_rate": 0.00017415336706889841, + "loss": 1.1096, + "step": 338 + }, + { + "epoch": 0.13174001748761294, + "grad_norm": 0.2355881780385971, + "learning_rate": 0.00017407551576488907, + "loss": 1.0964, + "step": 339 + }, + { + "epoch": 0.13212863110852036, + "grad_norm": 0.244523823261261, + 
"learning_rate": 0.00017399766446087972, + "loss": 1.142, + "step": 340 + }, + { + "epoch": 0.13251724472942777, + "grad_norm": 0.24705976247787476, + "learning_rate": 0.00017391981315687037, + "loss": 1.0943, + "step": 341 + }, + { + "epoch": 0.13290585835033517, + "grad_norm": 0.22817552089691162, + "learning_rate": 0.00017384196185286102, + "loss": 1.0621, + "step": 342 + }, + { + "epoch": 0.13329447197124258, + "grad_norm": 0.22605225443840027, + "learning_rate": 0.0001737641105488517, + "loss": 1.0714, + "step": 343 + }, + { + "epoch": 0.13368308559215, + "grad_norm": 0.2584545314311981, + "learning_rate": 0.00017368625924484235, + "loss": 1.1367, + "step": 344 + }, + { + "epoch": 0.13407169921305742, + "grad_norm": 0.2248220443725586, + "learning_rate": 0.000173608407940833, + "loss": 1.0872, + "step": 345 + }, + { + "epoch": 0.13446031283396484, + "grad_norm": 0.2141868770122528, + "learning_rate": 0.00017353055663682368, + "loss": 1.0572, + "step": 346 + }, + { + "epoch": 0.13484892645487223, + "grad_norm": 0.2615523934364319, + "learning_rate": 0.00017345270533281434, + "loss": 1.1048, + "step": 347 + }, + { + "epoch": 0.13523754007577965, + "grad_norm": 0.22990448772907257, + "learning_rate": 0.000173374854028805, + "loss": 1.0528, + "step": 348 + }, + { + "epoch": 0.13562615369668707, + "grad_norm": 0.2132262885570526, + "learning_rate": 0.00017329700272479564, + "loss": 1.0476, + "step": 349 + }, + { + "epoch": 0.1360147673175945, + "grad_norm": 0.2578272819519043, + "learning_rate": 0.00017321915142078632, + "loss": 1.0852, + "step": 350 + }, + { + "epoch": 0.1364033809385019, + "grad_norm": 0.22881457209587097, + "learning_rate": 0.00017314130011677697, + "loss": 1.1017, + "step": 351 + }, + { + "epoch": 0.1367919945594093, + "grad_norm": 0.21067696809768677, + "learning_rate": 0.00017306344881276762, + "loss": 1.0444, + "step": 352 + }, + { + "epoch": 0.13718060818031672, + "grad_norm": 0.2304215282201767, + "learning_rate": 0.0001729855975087583, + 
"loss": 1.0737, + "step": 353 + }, + { + "epoch": 0.13756922180122413, + "grad_norm": 0.2031925916671753, + "learning_rate": 0.00017290774620474895, + "loss": 1.0036, + "step": 354 + }, + { + "epoch": 0.13795783542213155, + "grad_norm": 0.27281051874160767, + "learning_rate": 0.0001728298949007396, + "loss": 1.148, + "step": 355 + }, + { + "epoch": 0.13834644904303897, + "grad_norm": 0.204191654920578, + "learning_rate": 0.00017275204359673026, + "loss": 0.9607, + "step": 356 + }, + { + "epoch": 0.13873506266394636, + "grad_norm": 0.221976637840271, + "learning_rate": 0.0001726741922927209, + "loss": 1.1068, + "step": 357 + }, + { + "epoch": 0.13912367628485378, + "grad_norm": 0.20831729471683502, + "learning_rate": 0.0001725963409887116, + "loss": 1.034, + "step": 358 + }, + { + "epoch": 0.1395122899057612, + "grad_norm": 0.21639779210090637, + "learning_rate": 0.00017251848968470224, + "loss": 1.0613, + "step": 359 + }, + { + "epoch": 0.13990090352666862, + "grad_norm": 0.1959424465894699, + "learning_rate": 0.0001724406383806929, + "loss": 1.0506, + "step": 360 + }, + { + "epoch": 0.140289517147576, + "grad_norm": 0.2044398933649063, + "learning_rate": 0.00017236278707668355, + "loss": 1.0316, + "step": 361 + }, + { + "epoch": 0.14067813076848343, + "grad_norm": 0.21483004093170166, + "learning_rate": 0.0001722849357726742, + "loss": 1.0361, + "step": 362 + }, + { + "epoch": 0.14106674438939085, + "grad_norm": 0.237701416015625, + "learning_rate": 0.00017220708446866485, + "loss": 1.1264, + "step": 363 + }, + { + "epoch": 0.14145535801029827, + "grad_norm": 0.20750795304775238, + "learning_rate": 0.00017212923316465553, + "loss": 1.0523, + "step": 364 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 0.2252965271472931, + "learning_rate": 0.00017205138186064618, + "loss": 1.0764, + "step": 365 + }, + { + "epoch": 0.14223258525211308, + "grad_norm": 0.2033565789461136, + "learning_rate": 0.00017197353055663683, + "loss": 1.064, + "step": 366 + }, + { + 
"epoch": 0.1426211988730205, + "grad_norm": 0.21123190224170685, + "learning_rate": 0.00017189567925262749, + "loss": 1.0515, + "step": 367 + }, + { + "epoch": 0.1430098124939279, + "grad_norm": 0.20646221935749054, + "learning_rate": 0.00017181782794861814, + "loss": 1.0617, + "step": 368 + }, + { + "epoch": 0.14339842611483533, + "grad_norm": 0.2079589068889618, + "learning_rate": 0.00017173997664460882, + "loss": 1.0569, + "step": 369 + }, + { + "epoch": 0.14378703973574275, + "grad_norm": 0.216246098279953, + "learning_rate": 0.00017166212534059947, + "loss": 1.0986, + "step": 370 + }, + { + "epoch": 0.14417565335665014, + "grad_norm": 0.20711806416511536, + "learning_rate": 0.00017158427403659012, + "loss": 1.1342, + "step": 371 + }, + { + "epoch": 0.14456426697755756, + "grad_norm": 0.235435351729393, + "learning_rate": 0.00017150642273258077, + "loss": 1.1082, + "step": 372 + }, + { + "epoch": 0.14495288059846498, + "grad_norm": 0.2273191511631012, + "learning_rate": 0.00017142857142857143, + "loss": 1.1064, + "step": 373 + }, + { + "epoch": 0.1453414942193724, + "grad_norm": 0.2075672745704651, + "learning_rate": 0.0001713507201245621, + "loss": 1.0536, + "step": 374 + }, + { + "epoch": 0.14573010784027982, + "grad_norm": 0.20764274895191193, + "learning_rate": 0.00017127286882055276, + "loss": 1.0673, + "step": 375 + }, + { + "epoch": 0.1461187214611872, + "grad_norm": 0.2441243678331375, + "learning_rate": 0.0001711950175165434, + "loss": 1.1271, + "step": 376 + }, + { + "epoch": 0.14650733508209463, + "grad_norm": 0.2383374124765396, + "learning_rate": 0.00017111716621253406, + "loss": 1.083, + "step": 377 + }, + { + "epoch": 0.14689594870300204, + "grad_norm": 0.2172410786151886, + "learning_rate": 0.0001710393149085247, + "loss": 1.0605, + "step": 378 + }, + { + "epoch": 0.14728456232390946, + "grad_norm": 0.22591541707515717, + "learning_rate": 0.0001709614636045154, + "loss": 1.0931, + "step": 379 + }, + { + "epoch": 0.14767317594481685, + 
"grad_norm": 0.23099495470523834, + "learning_rate": 0.00017088361230050604, + "loss": 1.1021, + "step": 380 + }, + { + "epoch": 0.14806178956572427, + "grad_norm": 0.21461094915866852, + "learning_rate": 0.0001708057609964967, + "loss": 1.0959, + "step": 381 + }, + { + "epoch": 0.1484504031866317, + "grad_norm": 0.21557241678237915, + "learning_rate": 0.00017072790969248735, + "loss": 1.0155, + "step": 382 + }, + { + "epoch": 0.1488390168075391, + "grad_norm": 0.234396293759346, + "learning_rate": 0.000170650058388478, + "loss": 1.1289, + "step": 383 + }, + { + "epoch": 0.14922763042844653, + "grad_norm": 0.22895503044128418, + "learning_rate": 0.00017057220708446868, + "loss": 0.9919, + "step": 384 + }, + { + "epoch": 0.14961624404935392, + "grad_norm": 0.2054683268070221, + "learning_rate": 0.00017049435578045933, + "loss": 1.0607, + "step": 385 + }, + { + "epoch": 0.15000485767026134, + "grad_norm": 0.25569215416908264, + "learning_rate": 0.00017041650447644998, + "loss": 1.0517, + "step": 386 + }, + { + "epoch": 0.15039347129116876, + "grad_norm": 0.2222641259431839, + "learning_rate": 0.00017033865317244064, + "loss": 1.0404, + "step": 387 + }, + { + "epoch": 0.15078208491207618, + "grad_norm": 0.20501169562339783, + "learning_rate": 0.0001702608018684313, + "loss": 0.9897, + "step": 388 + }, + { + "epoch": 0.1511706985329836, + "grad_norm": 0.22080403566360474, + "learning_rate": 0.00017018295056442197, + "loss": 1.1013, + "step": 389 + }, + { + "epoch": 0.15155931215389098, + "grad_norm": 0.21218529343605042, + "learning_rate": 0.00017010509926041262, + "loss": 1.0541, + "step": 390 + }, + { + "epoch": 0.1519479257747984, + "grad_norm": 0.23064807057380676, + "learning_rate": 0.00017002724795640327, + "loss": 1.037, + "step": 391 + }, + { + "epoch": 0.15233653939570582, + "grad_norm": 0.21164493262767792, + "learning_rate": 0.00016994939665239392, + "loss": 1.0769, + "step": 392 + }, + { + "epoch": 0.15272515301661324, + "grad_norm": 0.22565549612045288, + 
"learning_rate": 0.00016987154534838457, + "loss": 1.0638, + "step": 393 + }, + { + "epoch": 0.15311376663752063, + "grad_norm": 0.22492647171020508, + "learning_rate": 0.00016979369404437525, + "loss": 1.063, + "step": 394 + }, + { + "epoch": 0.15350238025842805, + "grad_norm": 0.22335395216941833, + "learning_rate": 0.0001697158427403659, + "loss": 1.1032, + "step": 395 + }, + { + "epoch": 0.15389099387933547, + "grad_norm": 0.2164154201745987, + "learning_rate": 0.00016963799143635656, + "loss": 1.1275, + "step": 396 + }, + { + "epoch": 0.1542796075002429, + "grad_norm": 0.22547736763954163, + "learning_rate": 0.0001695601401323472, + "loss": 1.1324, + "step": 397 + }, + { + "epoch": 0.1546682211211503, + "grad_norm": 0.2028045952320099, + "learning_rate": 0.0001694822888283379, + "loss": 1.0057, + "step": 398 + }, + { + "epoch": 0.1550568347420577, + "grad_norm": 0.20770573616027832, + "learning_rate": 0.00016940443752432854, + "loss": 1.0311, + "step": 399 + }, + { + "epoch": 0.15544544836296512, + "grad_norm": 0.2231476902961731, + "learning_rate": 0.0001693265862203192, + "loss": 1.0535, + "step": 400 + }, + { + "epoch": 0.15583406198387253, + "grad_norm": 0.21618099510669708, + "learning_rate": 0.00016924873491630987, + "loss": 1.0616, + "step": 401 + }, + { + "epoch": 0.15622267560477995, + "grad_norm": 0.24024419486522675, + "learning_rate": 0.00016917088361230052, + "loss": 1.1324, + "step": 402 + }, + { + "epoch": 0.15661128922568737, + "grad_norm": 0.2002171128988266, + "learning_rate": 0.00016909303230829118, + "loss": 1.015, + "step": 403 + }, + { + "epoch": 0.15699990284659476, + "grad_norm": 0.21771477162837982, + "learning_rate": 0.00016901518100428183, + "loss": 1.0817, + "step": 404 + }, + { + "epoch": 0.15738851646750218, + "grad_norm": 0.22052259743213654, + "learning_rate": 0.0001689373297002725, + "loss": 1.0836, + "step": 405 + }, + { + "epoch": 0.1577771300884096, + "grad_norm": 0.1964062750339508, + "learning_rate": 
0.00016885947839626316, + "loss": 1.0505, + "step": 406 + }, + { + "epoch": 0.15816574370931702, + "grad_norm": 0.22714298963546753, + "learning_rate": 0.0001687816270922538, + "loss": 1.0702, + "step": 407 + }, + { + "epoch": 0.15855435733022444, + "grad_norm": 0.20647728443145752, + "learning_rate": 0.00016870377578824446, + "loss": 1.0349, + "step": 408 + }, + { + "epoch": 0.15894297095113183, + "grad_norm": 0.2355160117149353, + "learning_rate": 0.00016862592448423512, + "loss": 1.0305, + "step": 409 + }, + { + "epoch": 0.15933158457203925, + "grad_norm": 0.22890770435333252, + "learning_rate": 0.0001685480731802258, + "loss": 1.0854, + "step": 410 + }, + { + "epoch": 0.15972019819294667, + "grad_norm": 0.21947838366031647, + "learning_rate": 0.00016847022187621645, + "loss": 1.0948, + "step": 411 + }, + { + "epoch": 0.16010881181385409, + "grad_norm": 0.22334899008274078, + "learning_rate": 0.0001683923705722071, + "loss": 1.006, + "step": 412 + }, + { + "epoch": 0.16049742543476148, + "grad_norm": 0.22324936091899872, + "learning_rate": 0.00016831451926819775, + "loss": 1.0402, + "step": 413 + }, + { + "epoch": 0.1608860390556689, + "grad_norm": 0.21462097764015198, + "learning_rate": 0.0001682366679641884, + "loss": 1.077, + "step": 414 + }, + { + "epoch": 0.1612746526765763, + "grad_norm": 0.24567006528377533, + "learning_rate": 0.00016815881666017908, + "loss": 1.15, + "step": 415 + }, + { + "epoch": 0.16166326629748373, + "grad_norm": 0.26437243819236755, + "learning_rate": 0.00016808096535616973, + "loss": 1.1251, + "step": 416 + }, + { + "epoch": 0.16205187991839115, + "grad_norm": 0.2217959761619568, + "learning_rate": 0.00016800311405216039, + "loss": 1.1103, + "step": 417 + }, + { + "epoch": 0.16244049353929854, + "grad_norm": 0.24402475357055664, + "learning_rate": 0.00016792526274815104, + "loss": 1.0672, + "step": 418 + }, + { + "epoch": 0.16282910716020596, + "grad_norm": 0.21609526872634888, + "learning_rate": 0.0001678474114441417, + "loss": 
1.0291, + "step": 419 + }, + { + "epoch": 0.16321772078111338, + "grad_norm": 0.20054642856121063, + "learning_rate": 0.00016776956014013237, + "loss": 1.0704, + "step": 420 + }, + { + "epoch": 0.1636063344020208, + "grad_norm": 0.22864869236946106, + "learning_rate": 0.00016769170883612302, + "loss": 1.0612, + "step": 421 + }, + { + "epoch": 0.16399494802292822, + "grad_norm": 0.22651974856853485, + "learning_rate": 0.00016761385753211367, + "loss": 1.0749, + "step": 422 + }, + { + "epoch": 0.1643835616438356, + "grad_norm": 0.21587328612804413, + "learning_rate": 0.00016753600622810433, + "loss": 1.0398, + "step": 423 + }, + { + "epoch": 0.16477217526474303, + "grad_norm": 0.1953774094581604, + "learning_rate": 0.00016745815492409498, + "loss": 1.0275, + "step": 424 + }, + { + "epoch": 0.16516078888565044, + "grad_norm": 0.21803410351276398, + "learning_rate": 0.00016738030362008566, + "loss": 1.1219, + "step": 425 + }, + { + "epoch": 0.16554940250655786, + "grad_norm": 0.2034682035446167, + "learning_rate": 0.0001673024523160763, + "loss": 1.0342, + "step": 426 + }, + { + "epoch": 0.16593801612746525, + "grad_norm": 0.20135951042175293, + "learning_rate": 0.00016722460101206696, + "loss": 0.9802, + "step": 427 + }, + { + "epoch": 0.16632662974837267, + "grad_norm": 0.23310376703739166, + "learning_rate": 0.0001671467497080576, + "loss": 1.0789, + "step": 428 + }, + { + "epoch": 0.1667152433692801, + "grad_norm": 0.21475404500961304, + "learning_rate": 0.00016706889840404827, + "loss": 1.0416, + "step": 429 + }, + { + "epoch": 0.1671038569901875, + "grad_norm": 0.21661072969436646, + "learning_rate": 0.00016699104710003894, + "loss": 1.0568, + "step": 430 + }, + { + "epoch": 0.16749247061109493, + "grad_norm": 0.20310629904270172, + "learning_rate": 0.0001669131957960296, + "loss": 0.9968, + "step": 431 + }, + { + "epoch": 0.16788108423200232, + "grad_norm": 0.2596947252750397, + "learning_rate": 0.00016683534449202025, + "loss": 1.0478, + "step": 432 + }, + { + 
"epoch": 0.16826969785290974, + "grad_norm": 0.22226987779140472, + "learning_rate": 0.0001667574931880109, + "loss": 1.0898, + "step": 433 + }, + { + "epoch": 0.16865831147381716, + "grad_norm": 0.22499911487102509, + "learning_rate": 0.00016667964188400155, + "loss": 1.07, + "step": 434 + }, + { + "epoch": 0.16904692509472458, + "grad_norm": 0.2717292308807373, + "learning_rate": 0.0001666017905799922, + "loss": 1.0562, + "step": 435 + }, + { + "epoch": 0.169435538715632, + "grad_norm": 0.22052323818206787, + "learning_rate": 0.00016652393927598288, + "loss": 1.0732, + "step": 436 + }, + { + "epoch": 0.16982415233653939, + "grad_norm": 0.21741728484630585, + "learning_rate": 0.00016644608797197354, + "loss": 1.0409, + "step": 437 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.20701193809509277, + "learning_rate": 0.0001663682366679642, + "loss": 1.0731, + "step": 438 + }, + { + "epoch": 0.17060137957835422, + "grad_norm": 0.22071130573749542, + "learning_rate": 0.00016629038536395484, + "loss": 1.0992, + "step": 439 + }, + { + "epoch": 0.17098999319926164, + "grad_norm": 0.20261412858963013, + "learning_rate": 0.0001662125340599455, + "loss": 1.0051, + "step": 440 + }, + { + "epoch": 0.17137860682016906, + "grad_norm": 0.2082947939634323, + "learning_rate": 0.00016613468275593617, + "loss": 1.0477, + "step": 441 + }, + { + "epoch": 0.17176722044107645, + "grad_norm": 0.22534717619419098, + "learning_rate": 0.00016605683145192682, + "loss": 1.041, + "step": 442 + }, + { + "epoch": 0.17215583406198387, + "grad_norm": 0.21547731757164001, + "learning_rate": 0.00016597898014791748, + "loss": 1.0528, + "step": 443 + }, + { + "epoch": 0.1725444476828913, + "grad_norm": 0.24141089618206024, + "learning_rate": 0.00016590112884390813, + "loss": 1.0928, + "step": 444 + }, + { + "epoch": 0.1729330613037987, + "grad_norm": 0.21910884976387024, + "learning_rate": 0.00016582327753989878, + "loss": 1.063, + "step": 445 + }, + { + "epoch": 0.1733216749247061, + 
"grad_norm": 0.21782316267490387, + "learning_rate": 0.00016574542623588946, + "loss": 1.0976, + "step": 446 + }, + { + "epoch": 0.17371028854561352, + "grad_norm": 0.21771778166294098, + "learning_rate": 0.0001656675749318801, + "loss": 1.0677, + "step": 447 + }, + { + "epoch": 0.17409890216652094, + "grad_norm": 0.22117659449577332, + "learning_rate": 0.00016558972362787076, + "loss": 1.0669, + "step": 448 + }, + { + "epoch": 0.17448751578742835, + "grad_norm": 0.21918092668056488, + "learning_rate": 0.00016551187232386141, + "loss": 1.0955, + "step": 449 + }, + { + "epoch": 0.17487612940833577, + "grad_norm": 0.22027818858623505, + "learning_rate": 0.0001654340210198521, + "loss": 1.0201, + "step": 450 + }, + { + "epoch": 0.17526474302924316, + "grad_norm": 0.2042885720729828, + "learning_rate": 0.00016535616971584275, + "loss": 1.0881, + "step": 451 + }, + { + "epoch": 0.17565335665015058, + "grad_norm": 0.21788261830806732, + "learning_rate": 0.0001652783184118334, + "loss": 1.0918, + "step": 452 + }, + { + "epoch": 0.176041970271058, + "grad_norm": 0.23332571983337402, + "learning_rate": 0.00016520046710782408, + "loss": 1.091, + "step": 453 + }, + { + "epoch": 0.17643058389196542, + "grad_norm": 0.20204192399978638, + "learning_rate": 0.00016512261580381473, + "loss": 1.0366, + "step": 454 + }, + { + "epoch": 0.17681919751287284, + "grad_norm": 0.21761906147003174, + "learning_rate": 0.00016504476449980538, + "loss": 1.0131, + "step": 455 + }, + { + "epoch": 0.17720781113378023, + "grad_norm": 0.2152051478624344, + "learning_rate": 0.00016496691319579606, + "loss": 1.0868, + "step": 456 + }, + { + "epoch": 0.17759642475468765, + "grad_norm": 0.22776494920253754, + "learning_rate": 0.0001648890618917867, + "loss": 1.0807, + "step": 457 + }, + { + "epoch": 0.17798503837559507, + "grad_norm": 0.2171342968940735, + "learning_rate": 0.00016481121058777736, + "loss": 1.0537, + "step": 458 + }, + { + "epoch": 0.17837365199650249, + "grad_norm": 0.2046273946762085, 
+ "learning_rate": 0.00016473335928376802, + "loss": 1.0097, + "step": 459 + }, + { + "epoch": 0.17876226561740988, + "grad_norm": 0.2047681361436844, + "learning_rate": 0.00016465550797975867, + "loss": 1.0204, + "step": 460 + }, + { + "epoch": 0.1791508792383173, + "grad_norm": 0.1876862645149231, + "learning_rate": 0.00016457765667574935, + "loss": 0.9383, + "step": 461 + }, + { + "epoch": 0.17953949285922471, + "grad_norm": 0.218430757522583, + "learning_rate": 0.00016449980537174, + "loss": 1.0721, + "step": 462 + }, + { + "epoch": 0.17992810648013213, + "grad_norm": 0.2245480865240097, + "learning_rate": 0.00016442195406773065, + "loss": 1.0859, + "step": 463 + }, + { + "epoch": 0.18031672010103955, + "grad_norm": 0.22577151656150818, + "learning_rate": 0.0001643441027637213, + "loss": 1.0825, + "step": 464 + }, + { + "epoch": 0.18070533372194694, + "grad_norm": 0.20132745802402496, + "learning_rate": 0.00016426625145971196, + "loss": 1.0615, + "step": 465 + }, + { + "epoch": 0.18109394734285436, + "grad_norm": 0.2277505248785019, + "learning_rate": 0.00016418840015570263, + "loss": 1.0426, + "step": 466 + }, + { + "epoch": 0.18148256096376178, + "grad_norm": 0.22540105879306793, + "learning_rate": 0.0001641105488516933, + "loss": 1.0481, + "step": 467 + }, + { + "epoch": 0.1818711745846692, + "grad_norm": 0.20358088612556458, + "learning_rate": 0.00016403269754768394, + "loss": 1.0286, + "step": 468 + }, + { + "epoch": 0.18225978820557662, + "grad_norm": 0.22534145414829254, + "learning_rate": 0.0001639548462436746, + "loss": 1.1183, + "step": 469 + }, + { + "epoch": 0.182648401826484, + "grad_norm": 0.2188873142004013, + "learning_rate": 0.00016387699493966524, + "loss": 1.0439, + "step": 470 + }, + { + "epoch": 0.18303701544739143, + "grad_norm": 0.2128048539161682, + "learning_rate": 0.00016379914363565592, + "loss": 1.027, + "step": 471 + }, + { + "epoch": 0.18342562906829885, + "grad_norm": 0.2518141567707062, + "learning_rate": 0.00016372129233164657, 
+ "loss": 1.0468, + "step": 472 + }, + { + "epoch": 0.18381424268920626, + "grad_norm": 0.2189142256975174, + "learning_rate": 0.00016364344102763723, + "loss": 1.0581, + "step": 473 + }, + { + "epoch": 0.18420285631011368, + "grad_norm": 0.31266725063323975, + "learning_rate": 0.00016356558972362788, + "loss": 1.0554, + "step": 474 + }, + { + "epoch": 0.18459146993102107, + "grad_norm": 0.21343916654586792, + "learning_rate": 0.00016348773841961853, + "loss": 1.0795, + "step": 475 + }, + { + "epoch": 0.1849800835519285, + "grad_norm": 0.22907280921936035, + "learning_rate": 0.00016340988711560918, + "loss": 1.0304, + "step": 476 + }, + { + "epoch": 0.1853686971728359, + "grad_norm": 0.2105257511138916, + "learning_rate": 0.00016333203581159986, + "loss": 1.0231, + "step": 477 + }, + { + "epoch": 0.18575731079374333, + "grad_norm": 0.19537831842899323, + "learning_rate": 0.00016325418450759051, + "loss": 1.0103, + "step": 478 + }, + { + "epoch": 0.18614592441465072, + "grad_norm": 0.20522372424602509, + "learning_rate": 0.00016317633320358117, + "loss": 1.0196, + "step": 479 + }, + { + "epoch": 0.18653453803555814, + "grad_norm": 0.21646477282047272, + "learning_rate": 0.00016309848189957182, + "loss": 1.0579, + "step": 480 + }, + { + "epoch": 0.18692315165646556, + "grad_norm": 0.21077193319797516, + "learning_rate": 0.00016302063059556247, + "loss": 1.0638, + "step": 481 + }, + { + "epoch": 0.18731176527737298, + "grad_norm": 0.20357473194599152, + "learning_rate": 0.00016294277929155315, + "loss": 1.0635, + "step": 482 + }, + { + "epoch": 0.1877003788982804, + "grad_norm": 0.2188001275062561, + "learning_rate": 0.0001628649279875438, + "loss": 1.0267, + "step": 483 + }, + { + "epoch": 0.1880889925191878, + "grad_norm": 0.2128928154706955, + "learning_rate": 0.00016278707668353445, + "loss": 0.9706, + "step": 484 + }, + { + "epoch": 0.1884776061400952, + "grad_norm": 0.22081372141838074, + "learning_rate": 0.0001627092253795251, + "loss": 1.08, + "step": 485 + }, 
+ { + "epoch": 0.18886621976100262, + "grad_norm": 0.2250615805387497, + "learning_rate": 0.00016263137407551576, + "loss": 1.1451, + "step": 486 + }, + { + "epoch": 0.18925483338191004, + "grad_norm": 0.1984967589378357, + "learning_rate": 0.00016255352277150644, + "loss": 1.0744, + "step": 487 + }, + { + "epoch": 0.18964344700281746, + "grad_norm": 0.20778900384902954, + "learning_rate": 0.0001624756714674971, + "loss": 1.0623, + "step": 488 + }, + { + "epoch": 0.19003206062372485, + "grad_norm": 0.2026563137769699, + "learning_rate": 0.00016239782016348774, + "loss": 1.0714, + "step": 489 + }, + { + "epoch": 0.19042067424463227, + "grad_norm": 0.21598374843597412, + "learning_rate": 0.0001623199688594784, + "loss": 1.0869, + "step": 490 + }, + { + "epoch": 0.1908092878655397, + "grad_norm": 0.18944978713989258, + "learning_rate": 0.00016224211755546904, + "loss": 1.055, + "step": 491 + }, + { + "epoch": 0.1911979014864471, + "grad_norm": 0.20698946714401245, + "learning_rate": 0.00016216426625145972, + "loss": 1.0392, + "step": 492 + }, + { + "epoch": 0.1915865151073545, + "grad_norm": 0.22395353019237518, + "learning_rate": 0.00016208641494745038, + "loss": 1.0681, + "step": 493 + }, + { + "epoch": 0.19197512872826192, + "grad_norm": 0.22372962534427643, + "learning_rate": 0.00016200856364344103, + "loss": 1.0767, + "step": 494 + }, + { + "epoch": 0.19236374234916934, + "grad_norm": 0.2066701054573059, + "learning_rate": 0.00016193071233943168, + "loss": 1.0061, + "step": 495 + }, + { + "epoch": 0.19275235597007676, + "grad_norm": 0.19716408848762512, + "learning_rate": 0.00016185286103542233, + "loss": 1.039, + "step": 496 + }, + { + "epoch": 0.19314096959098417, + "grad_norm": 0.22159601747989655, + "learning_rate": 0.000161775009731413, + "loss": 1.0832, + "step": 497 + }, + { + "epoch": 0.19352958321189156, + "grad_norm": 0.21509626507759094, + "learning_rate": 0.00016169715842740366, + "loss": 1.0264, + "step": 498 + }, + { + "epoch": 0.19391819683279898, 
+ "grad_norm": 0.21598199009895325, + "learning_rate": 0.00016161930712339431, + "loss": 1.049, + "step": 499 + }, + { + "epoch": 0.1943068104537064, + "grad_norm": 0.20279590785503387, + "learning_rate": 0.00016154145581938497, + "loss": 1.0505, + "step": 500 + }, + { + "epoch": 0.19469542407461382, + "grad_norm": 0.21796855330467224, + "learning_rate": 0.00016146360451537565, + "loss": 1.0885, + "step": 501 + }, + { + "epoch": 0.19508403769552124, + "grad_norm": 0.22128933668136597, + "learning_rate": 0.0001613857532113663, + "loss": 1.0903, + "step": 502 + }, + { + "epoch": 0.19547265131642863, + "grad_norm": 0.2032536417245865, + "learning_rate": 0.00016130790190735695, + "loss": 1.0285, + "step": 503 + }, + { + "epoch": 0.19586126493733605, + "grad_norm": 0.23738974332809448, + "learning_rate": 0.0001612300506033476, + "loss": 1.1188, + "step": 504 + }, + { + "epoch": 0.19624987855824347, + "grad_norm": 0.19614790380001068, + "learning_rate": 0.00016115219929933828, + "loss": 1.04, + "step": 505 + }, + { + "epoch": 0.1966384921791509, + "grad_norm": 0.2198178917169571, + "learning_rate": 0.00016107434799532893, + "loss": 1.0696, + "step": 506 + }, + { + "epoch": 0.1970271058000583, + "grad_norm": 0.18814648687839508, + "learning_rate": 0.00016099649669131959, + "loss": 1.0203, + "step": 507 + }, + { + "epoch": 0.1974157194209657, + "grad_norm": 0.20699037611484528, + "learning_rate": 0.00016091864538731026, + "loss": 1.1074, + "step": 508 + }, + { + "epoch": 0.19780433304187311, + "grad_norm": 0.21490445733070374, + "learning_rate": 0.00016084079408330092, + "loss": 1.0682, + "step": 509 + }, + { + "epoch": 0.19819294666278053, + "grad_norm": 0.2363848090171814, + "learning_rate": 0.00016076294277929157, + "loss": 1.0408, + "step": 510 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 0.20186659693717957, + "learning_rate": 0.00016068509147528222, + "loss": 1.026, + "step": 511 + }, + { + "epoch": 0.19897017390459534, + "grad_norm": 0.21564024686813354, 
+ "learning_rate": 0.00016060724017127287, + "loss": 1.0418, + "step": 512 + }, + { + "epoch": 0.19935878752550276, + "grad_norm": 0.19151560962200165, + "learning_rate": 0.00016052938886726355, + "loss": 1.0037, + "step": 513 + }, + { + "epoch": 0.19974740114641018, + "grad_norm": 0.21038194000720978, + "learning_rate": 0.0001604515375632542, + "loss": 1.0545, + "step": 514 + }, + { + "epoch": 0.2001360147673176, + "grad_norm": 0.20496582984924316, + "learning_rate": 0.00016037368625924486, + "loss": 1.0543, + "step": 515 + }, + { + "epoch": 0.20052462838822502, + "grad_norm": 0.20689113438129425, + "learning_rate": 0.0001602958349552355, + "loss": 1.0905, + "step": 516 + }, + { + "epoch": 0.2009132420091324, + "grad_norm": 0.2284041792154312, + "learning_rate": 0.00016021798365122616, + "loss": 1.0717, + "step": 517 + }, + { + "epoch": 0.20130185563003983, + "grad_norm": 0.23457761108875275, + "learning_rate": 0.00016014013234721684, + "loss": 1.106, + "step": 518 + }, + { + "epoch": 0.20169046925094725, + "grad_norm": 0.2088528722524643, + "learning_rate": 0.0001600622810432075, + "loss": 1.0428, + "step": 519 + }, + { + "epoch": 0.20207908287185467, + "grad_norm": 0.2170068770647049, + "learning_rate": 0.00015998442973919814, + "loss": 0.9875, + "step": 520 + }, + { + "epoch": 0.20246769649276208, + "grad_norm": 0.2270561158657074, + "learning_rate": 0.0001599065784351888, + "loss": 1.0676, + "step": 521 + }, + { + "epoch": 0.20285631011366947, + "grad_norm": 0.2151324599981308, + "learning_rate": 0.00015982872713117945, + "loss": 1.0675, + "step": 522 + }, + { + "epoch": 0.2032449237345769, + "grad_norm": 0.23113249242305756, + "learning_rate": 0.00015975087582717013, + "loss": 1.0608, + "step": 523 + }, + { + "epoch": 0.2036335373554843, + "grad_norm": 0.2587106227874756, + "learning_rate": 0.00015967302452316078, + "loss": 1.0867, + "step": 524 + }, + { + "epoch": 0.20402215097639173, + "grad_norm": 0.21842992305755615, + "learning_rate": 
0.00015959517321915143, + "loss": 1.0726, + "step": 525 + }, + { + "epoch": 0.20441076459729912, + "grad_norm": 0.20867805182933807, + "learning_rate": 0.00015951732191514208, + "loss": 1.0578, + "step": 526 + }, + { + "epoch": 0.20479937821820654, + "grad_norm": 0.2396962195634842, + "learning_rate": 0.00015943947061113273, + "loss": 1.0292, + "step": 527 + }, + { + "epoch": 0.20518799183911396, + "grad_norm": 0.221155047416687, + "learning_rate": 0.00015936161930712341, + "loss": 1.0019, + "step": 528 + }, + { + "epoch": 0.20557660546002138, + "grad_norm": 0.20032119750976562, + "learning_rate": 0.00015928376800311407, + "loss": 1.0435, + "step": 529 + }, + { + "epoch": 0.2059652190809288, + "grad_norm": 0.24095888435840607, + "learning_rate": 0.00015920591669910472, + "loss": 1.0355, + "step": 530 + }, + { + "epoch": 0.2063538327018362, + "grad_norm": 0.2286604344844818, + "learning_rate": 0.00015912806539509537, + "loss": 0.9989, + "step": 531 + }, + { + "epoch": 0.2067424463227436, + "grad_norm": 0.21537137031555176, + "learning_rate": 0.00015905021409108602, + "loss": 1.0642, + "step": 532 + }, + { + "epoch": 0.20713105994365102, + "grad_norm": 0.22447925806045532, + "learning_rate": 0.0001589723627870767, + "loss": 1.1244, + "step": 533 + }, + { + "epoch": 0.20751967356455844, + "grad_norm": 0.21077273786067963, + "learning_rate": 0.00015889451148306735, + "loss": 1.0167, + "step": 534 + }, + { + "epoch": 0.20790828718546586, + "grad_norm": 0.22340558469295502, + "learning_rate": 0.000158816660179058, + "loss": 1.0991, + "step": 535 + }, + { + "epoch": 0.20829690080637325, + "grad_norm": 0.223599374294281, + "learning_rate": 0.00015873880887504866, + "loss": 1.086, + "step": 536 + }, + { + "epoch": 0.20868551442728067, + "grad_norm": 0.2615208923816681, + "learning_rate": 0.0001586609575710393, + "loss": 1.0584, + "step": 537 + }, + { + "epoch": 0.2090741280481881, + "grad_norm": 0.2085907757282257, + "learning_rate": 0.00015858310626703, + "loss": 1.0994, + 
"step": 538 + }, + { + "epoch": 0.2094627416690955, + "grad_norm": 0.2170211672782898, + "learning_rate": 0.00015850525496302064, + "loss": 1.1105, + "step": 539 + }, + { + "epoch": 0.20985135529000293, + "grad_norm": 0.21978625655174255, + "learning_rate": 0.0001584274036590113, + "loss": 1.002, + "step": 540 + }, + { + "epoch": 0.21023996891091032, + "grad_norm": 0.23684021830558777, + "learning_rate": 0.00015834955235500194, + "loss": 1.1216, + "step": 541 + }, + { + "epoch": 0.21062858253181774, + "grad_norm": 0.220269113779068, + "learning_rate": 0.0001582717010509926, + "loss": 1.0773, + "step": 542 + }, + { + "epoch": 0.21101719615272516, + "grad_norm": 0.22447973489761353, + "learning_rate": 0.00015819384974698328, + "loss": 1.0941, + "step": 543 + }, + { + "epoch": 0.21140580977363257, + "grad_norm": 0.22435730695724487, + "learning_rate": 0.00015811599844297393, + "loss": 1.0138, + "step": 544 + }, + { + "epoch": 0.21179442339453997, + "grad_norm": 0.2230793684720993, + "learning_rate": 0.00015803814713896458, + "loss": 1.0343, + "step": 545 + }, + { + "epoch": 0.21218303701544738, + "grad_norm": 0.23491905629634857, + "learning_rate": 0.00015796029583495523, + "loss": 1.11, + "step": 546 + }, + { + "epoch": 0.2125716506363548, + "grad_norm": 0.213560551404953, + "learning_rate": 0.00015788244453094588, + "loss": 1.0615, + "step": 547 + }, + { + "epoch": 0.21296026425726222, + "grad_norm": 0.21392837166786194, + "learning_rate": 0.00015780459322693654, + "loss": 1.0872, + "step": 548 + }, + { + "epoch": 0.21334887787816964, + "grad_norm": 0.20007692277431488, + "learning_rate": 0.00015772674192292722, + "loss": 1.0394, + "step": 549 + }, + { + "epoch": 0.21373749149907703, + "grad_norm": 0.1969841718673706, + "learning_rate": 0.00015764889061891787, + "loss": 1.0381, + "step": 550 + }, + { + "epoch": 0.21412610511998445, + "grad_norm": 0.21874025464057922, + "learning_rate": 0.00015757103931490852, + "loss": 1.0822, + "step": 551 + }, + { + "epoch": 
0.21451471874089187, + "grad_norm": 0.21824273467063904, + "learning_rate": 0.00015749318801089917, + "loss": 1.0802, + "step": 552 + }, + { + "epoch": 0.2149033323617993, + "grad_norm": 0.20942047238349915, + "learning_rate": 0.00015741533670688985, + "loss": 1.0634, + "step": 553 + }, + { + "epoch": 0.2152919459827067, + "grad_norm": 0.1940152943134308, + "learning_rate": 0.0001573374854028805, + "loss": 1.0264, + "step": 554 + }, + { + "epoch": 0.2156805596036141, + "grad_norm": 0.19859059154987335, + "learning_rate": 0.00015725963409887115, + "loss": 0.9701, + "step": 555 + }, + { + "epoch": 0.21606917322452152, + "grad_norm": 0.22239404916763306, + "learning_rate": 0.0001571817827948618, + "loss": 1.1282, + "step": 556 + }, + { + "epoch": 0.21645778684542893, + "grad_norm": 0.23820599913597107, + "learning_rate": 0.00015710393149085249, + "loss": 1.1123, + "step": 557 + }, + { + "epoch": 0.21684640046633635, + "grad_norm": 0.21279917657375336, + "learning_rate": 0.00015702608018684314, + "loss": 1.0542, + "step": 558 + }, + { + "epoch": 0.21723501408724374, + "grad_norm": 0.2065514773130417, + "learning_rate": 0.0001569482288828338, + "loss": 1.0685, + "step": 559 + }, + { + "epoch": 0.21762362770815116, + "grad_norm": 0.20130831003189087, + "learning_rate": 0.00015687037757882447, + "loss": 0.9869, + "step": 560 + }, + { + "epoch": 0.21801224132905858, + "grad_norm": 0.2187541127204895, + "learning_rate": 0.00015679252627481512, + "loss": 1.1095, + "step": 561 + }, + { + "epoch": 0.218400854949966, + "grad_norm": 0.21028277277946472, + "learning_rate": 0.00015671467497080577, + "loss": 1.0804, + "step": 562 + }, + { + "epoch": 0.21878946857087342, + "grad_norm": 0.8187636733055115, + "learning_rate": 0.00015663682366679643, + "loss": 1.0782, + "step": 563 + }, + { + "epoch": 0.2191780821917808, + "grad_norm": 0.20059974491596222, + "learning_rate": 0.0001565589723627871, + "loss": 1.0279, + "step": 564 + }, + { + "epoch": 0.21956669581268823, + "grad_norm": 
0.20440839231014252, + "learning_rate": 0.00015648112105877776, + "loss": 0.9863, + "step": 565 + }, + { + "epoch": 0.21995530943359565, + "grad_norm": 0.21423624455928802, + "learning_rate": 0.0001564032697547684, + "loss": 1.0685, + "step": 566 + }, + { + "epoch": 0.22034392305450307, + "grad_norm": 0.22430062294006348, + "learning_rate": 0.00015632541845075906, + "loss": 1.0761, + "step": 567 + }, + { + "epoch": 0.22073253667541048, + "grad_norm": 0.22782258689403534, + "learning_rate": 0.0001562475671467497, + "loss": 1.1024, + "step": 568 + }, + { + "epoch": 0.22112115029631788, + "grad_norm": 0.21150320768356323, + "learning_rate": 0.0001561697158427404, + "loss": 1.0621, + "step": 569 + }, + { + "epoch": 0.2215097639172253, + "grad_norm": 0.20342351496219635, + "learning_rate": 0.00015609186453873104, + "loss": 1.0667, + "step": 570 + }, + { + "epoch": 0.2218983775381327, + "grad_norm": 0.22866711020469666, + "learning_rate": 0.0001560140132347217, + "loss": 1.0631, + "step": 571 + }, + { + "epoch": 0.22228699115904013, + "grad_norm": 0.2200063169002533, + "learning_rate": 0.00015593616193071235, + "loss": 1.0448, + "step": 572 + }, + { + "epoch": 0.22267560477994755, + "grad_norm": 0.19440248608589172, + "learning_rate": 0.000155858310626703, + "loss": 1.037, + "step": 573 + }, + { + "epoch": 0.22306421840085494, + "grad_norm": 0.205752432346344, + "learning_rate": 0.00015578045932269368, + "loss": 1.0465, + "step": 574 + }, + { + "epoch": 0.22345283202176236, + "grad_norm": 0.22247998416423798, + "learning_rate": 0.00015570260801868433, + "loss": 0.997, + "step": 575 + }, + { + "epoch": 0.22384144564266978, + "grad_norm": 0.22199274599552155, + "learning_rate": 0.00015562475671467498, + "loss": 1.0178, + "step": 576 + }, + { + "epoch": 0.2242300592635772, + "grad_norm": 0.2114989310503006, + "learning_rate": 0.00015554690541066564, + "loss": 1.0457, + "step": 577 + }, + { + "epoch": 0.2246186728844846, + "grad_norm": 0.24248506128787994, + "learning_rate": 
0.0001554690541066563, + "loss": 1.002, + "step": 578 + }, + { + "epoch": 0.225007286505392, + "grad_norm": 0.2565505802631378, + "learning_rate": 0.00015539120280264697, + "loss": 1.0541, + "step": 579 + }, + { + "epoch": 0.22539590012629943, + "grad_norm": 0.22799409925937653, + "learning_rate": 0.00015531335149863762, + "loss": 1.0788, + "step": 580 + }, + { + "epoch": 0.22578451374720684, + "grad_norm": 0.2196080982685089, + "learning_rate": 0.00015523550019462827, + "loss": 1.0877, + "step": 581 + }, + { + "epoch": 0.22617312736811426, + "grad_norm": 0.21992824971675873, + "learning_rate": 0.00015515764889061892, + "loss": 1.0213, + "step": 582 + }, + { + "epoch": 0.22656174098902165, + "grad_norm": 0.22793298959732056, + "learning_rate": 0.00015507979758660957, + "loss": 1.0633, + "step": 583 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 0.21707972884178162, + "learning_rate": 0.00015500194628260023, + "loss": 1.081, + "step": 584 + }, + { + "epoch": 0.2273389682308365, + "grad_norm": 0.220685675740242, + "learning_rate": 0.0001549240949785909, + "loss": 1.0658, + "step": 585 + }, + { + "epoch": 0.2277275818517439, + "grad_norm": 0.22576668858528137, + "learning_rate": 0.00015484624367458156, + "loss": 1.0795, + "step": 586 + }, + { + "epoch": 0.22811619547265133, + "grad_norm": 0.21778982877731323, + "learning_rate": 0.0001547683923705722, + "loss": 1.033, + "step": 587 + }, + { + "epoch": 0.22850480909355872, + "grad_norm": 0.22748610377311707, + "learning_rate": 0.00015469054106656286, + "loss": 1.0948, + "step": 588 + }, + { + "epoch": 0.22889342271446614, + "grad_norm": 0.21561284363269806, + "learning_rate": 0.00015461268976255351, + "loss": 1.0022, + "step": 589 + }, + { + "epoch": 0.22928203633537356, + "grad_norm": 0.2419756054878235, + "learning_rate": 0.0001545348384585442, + "loss": 1.0786, + "step": 590 + }, + { + "epoch": 0.22967064995628098, + "grad_norm": 0.20479315519332886, + "learning_rate": 0.00015445698715453485, + "loss": 
1.027, + "step": 591 + }, + { + "epoch": 0.2300592635771884, + "grad_norm": 0.21365883946418762, + "learning_rate": 0.0001543791358505255, + "loss": 1.0773, + "step": 592 + }, + { + "epoch": 0.23044787719809579, + "grad_norm": 0.23133166134357452, + "learning_rate": 0.00015430128454651615, + "loss": 1.0877, + "step": 593 + }, + { + "epoch": 0.2308364908190032, + "grad_norm": 0.2110515981912613, + "learning_rate": 0.0001542234332425068, + "loss": 1.0509, + "step": 594 + }, + { + "epoch": 0.23122510443991062, + "grad_norm": 0.20658442378044128, + "learning_rate": 0.00015414558193849748, + "loss": 1.0623, + "step": 595 + }, + { + "epoch": 0.23161371806081804, + "grad_norm": 0.21831996738910675, + "learning_rate": 0.00015406773063448813, + "loss": 1.021, + "step": 596 + }, + { + "epoch": 0.23200233168172543, + "grad_norm": 0.23015642166137695, + "learning_rate": 0.00015398987933047878, + "loss": 1.0358, + "step": 597 + }, + { + "epoch": 0.23239094530263285, + "grad_norm": 0.23071645200252533, + "learning_rate": 0.00015391202802646944, + "loss": 1.1255, + "step": 598 + }, + { + "epoch": 0.23277955892354027, + "grad_norm": 0.19513486325740814, + "learning_rate": 0.0001538341767224601, + "loss": 1.0189, + "step": 599 + }, + { + "epoch": 0.2331681725444477, + "grad_norm": 0.20821452140808105, + "learning_rate": 0.00015375632541845077, + "loss": 1.0843, + "step": 600 + }, + { + "epoch": 0.2335567861653551, + "grad_norm": 0.20563223958015442, + "learning_rate": 0.00015367847411444142, + "loss": 1.0012, + "step": 601 + }, + { + "epoch": 0.2339453997862625, + "grad_norm": 0.22674202919006348, + "learning_rate": 0.00015360062281043207, + "loss": 1.0371, + "step": 602 + }, + { + "epoch": 0.23433401340716992, + "grad_norm": 0.20744135975837708, + "learning_rate": 0.00015352277150642272, + "loss": 1.0466, + "step": 603 + }, + { + "epoch": 0.23472262702807734, + "grad_norm": 0.22103577852249146, + "learning_rate": 0.00015344492020241338, + "loss": 1.0942, + "step": 604 + }, + { + 
"epoch": 0.23511124064898475, + "grad_norm": 0.20643098652362823, + "learning_rate": 0.00015336706889840406, + "loss": 1.0682, + "step": 605 + }, + { + "epoch": 0.23549985426989217, + "grad_norm": 0.23436777293682098, + "learning_rate": 0.0001532892175943947, + "loss": 1.0613, + "step": 606 + }, + { + "epoch": 0.23588846789079956, + "grad_norm": 0.21898899972438812, + "learning_rate": 0.00015321136629038536, + "loss": 1.0571, + "step": 607 + }, + { + "epoch": 0.23627708151170698, + "grad_norm": 0.20569247007369995, + "learning_rate": 0.00015313351498637604, + "loss": 1.061, + "step": 608 + }, + { + "epoch": 0.2366656951326144, + "grad_norm": 0.2099207490682602, + "learning_rate": 0.0001530556636823667, + "loss": 1.0776, + "step": 609 + }, + { + "epoch": 0.23705430875352182, + "grad_norm": 0.20078738033771515, + "learning_rate": 0.00015297781237835734, + "loss": 1.0341, + "step": 610 + }, + { + "epoch": 0.2374429223744292, + "grad_norm": 0.20327065885066986, + "learning_rate": 0.000152899961074348, + "loss": 1.0168, + "step": 611 + }, + { + "epoch": 0.23783153599533663, + "grad_norm": 0.21741214394569397, + "learning_rate": 0.00015282210977033867, + "loss": 1.0726, + "step": 612 + }, + { + "epoch": 0.23822014961624405, + "grad_norm": 0.2065727263689041, + "learning_rate": 0.00015274425846632933, + "loss": 1.0474, + "step": 613 + }, + { + "epoch": 0.23860876323715147, + "grad_norm": 0.21241194009780884, + "learning_rate": 0.00015266640716231998, + "loss": 1.0666, + "step": 614 + }, + { + "epoch": 0.23899737685805889, + "grad_norm": 0.2194201797246933, + "learning_rate": 0.00015258855585831066, + "loss": 1.1411, + "step": 615 + }, + { + "epoch": 0.23938599047896628, + "grad_norm": 0.21537193655967712, + "learning_rate": 0.0001525107045543013, + "loss": 1.081, + "step": 616 + }, + { + "epoch": 0.2397746040998737, + "grad_norm": 0.21125951409339905, + "learning_rate": 0.00015243285325029196, + "loss": 1.0679, + "step": 617 + }, + { + "epoch": 0.2401632177207811, + 
"grad_norm": 0.21342721581459045, + "learning_rate": 0.0001523550019462826, + "loss": 1.0564, + "step": 618 + }, + { + "epoch": 0.24055183134168853, + "grad_norm": 0.2223503291606903, + "learning_rate": 0.00015227715064227327, + "loss": 1.1163, + "step": 619 + }, + { + "epoch": 0.24094044496259595, + "grad_norm": 0.21626527607440948, + "learning_rate": 0.00015219929933826394, + "loss": 1.0793, + "step": 620 + }, + { + "epoch": 0.24132905858350334, + "grad_norm": 0.21899500489234924, + "learning_rate": 0.0001521214480342546, + "loss": 1.0864, + "step": 621 + }, + { + "epoch": 0.24171767220441076, + "grad_norm": 0.2499915212392807, + "learning_rate": 0.00015204359673024525, + "loss": 1.1381, + "step": 622 + }, + { + "epoch": 0.24210628582531818, + "grad_norm": 0.2108345925807953, + "learning_rate": 0.0001519657454262359, + "loss": 1.0534, + "step": 623 + }, + { + "epoch": 0.2424948994462256, + "grad_norm": 0.2224910855293274, + "learning_rate": 0.00015188789412222655, + "loss": 1.0235, + "step": 624 + }, + { + "epoch": 0.24288351306713302, + "grad_norm": 0.22163094580173492, + "learning_rate": 0.0001518100428182172, + "loss": 1.0143, + "step": 625 + }, + { + "epoch": 0.2432721266880404, + "grad_norm": 0.20709283649921417, + "learning_rate": 0.00015173219151420788, + "loss": 1.0506, + "step": 626 + }, + { + "epoch": 0.24366074030894783, + "grad_norm": 0.2112802267074585, + "learning_rate": 0.00015165434021019854, + "loss": 1.0692, + "step": 627 + }, + { + "epoch": 0.24404935392985525, + "grad_norm": 0.23622830212116241, + "learning_rate": 0.0001515764889061892, + "loss": 1.0769, + "step": 628 + }, + { + "epoch": 0.24443796755076266, + "grad_norm": 0.23328271508216858, + "learning_rate": 0.00015149863760217984, + "loss": 1.1158, + "step": 629 + }, + { + "epoch": 0.24482658117167005, + "grad_norm": 0.2071760892868042, + "learning_rate": 0.0001514207862981705, + "loss": 1.0133, + "step": 630 + }, + { + "epoch": 0.24521519479257747, + "grad_norm": 0.21428920328617096, + 
"learning_rate": 0.00015134293499416117, + "loss": 1.0342, + "step": 631 + }, + { + "epoch": 0.2456038084134849, + "grad_norm": 0.22225375473499298, + "learning_rate": 0.00015126508369015182, + "loss": 1.1054, + "step": 632 + }, + { + "epoch": 0.2459924220343923, + "grad_norm": 0.2096671611070633, + "learning_rate": 0.00015118723238614248, + "loss": 1.0229, + "step": 633 + }, + { + "epoch": 0.24638103565529973, + "grad_norm": 0.21473252773284912, + "learning_rate": 0.00015110938108213313, + "loss": 1.0915, + "step": 634 + }, + { + "epoch": 0.24676964927620712, + "grad_norm": 0.2071562111377716, + "learning_rate": 0.00015103152977812378, + "loss": 1.047, + "step": 635 + }, + { + "epoch": 0.24715826289711454, + "grad_norm": 0.19868609309196472, + "learning_rate": 0.00015095367847411446, + "loss": 1.0073, + "step": 636 + }, + { + "epoch": 0.24754687651802196, + "grad_norm": 0.20937366783618927, + "learning_rate": 0.0001508758271701051, + "loss": 1.0155, + "step": 637 + }, + { + "epoch": 0.24793549013892938, + "grad_norm": 0.19225911796092987, + "learning_rate": 0.00015079797586609576, + "loss": 1.0163, + "step": 638 + }, + { + "epoch": 0.2483241037598368, + "grad_norm": 0.20427283644676208, + "learning_rate": 0.00015072012456208641, + "loss": 1.062, + "step": 639 + }, + { + "epoch": 0.24871271738074419, + "grad_norm": 0.21640253067016602, + "learning_rate": 0.00015064227325807707, + "loss": 1.025, + "step": 640 + }, + { + "epoch": 0.2491013310016516, + "grad_norm": 0.20416739583015442, + "learning_rate": 0.00015056442195406775, + "loss": 1.0635, + "step": 641 + }, + { + "epoch": 0.24948994462255902, + "grad_norm": 0.1990521252155304, + "learning_rate": 0.0001504865706500584, + "loss": 1.0757, + "step": 642 + }, + { + "epoch": 0.24987855824346644, + "grad_norm": 0.21636444330215454, + "learning_rate": 0.00015040871934604905, + "loss": 1.0441, + "step": 643 + }, + { + "epoch": 0.25026717186437386, + "grad_norm": 0.21253719925880432, + "learning_rate": 
0.0001503308680420397, + "loss": 1.0574, + "step": 644 + }, + { + "epoch": 0.2506557854852813, + "grad_norm": 0.2134159356355667, + "learning_rate": 0.00015025301673803035, + "loss": 1.0396, + "step": 645 + }, + { + "epoch": 0.2510443991061887, + "grad_norm": 0.2018527239561081, + "learning_rate": 0.00015017516543402103, + "loss": 1.0606, + "step": 646 + }, + { + "epoch": 0.25143301272709606, + "grad_norm": 0.20320741832256317, + "learning_rate": 0.00015009731413001169, + "loss": 1.0093, + "step": 647 + }, + { + "epoch": 0.2518216263480035, + "grad_norm": 0.21007056534290314, + "learning_rate": 0.00015001946282600234, + "loss": 1.0284, + "step": 648 + }, + { + "epoch": 0.2522102399689109, + "grad_norm": 0.22453372180461884, + "learning_rate": 0.000149941611521993, + "loss": 1.0271, + "step": 649 + }, + { + "epoch": 0.2525988535898183, + "grad_norm": 0.19889335334300995, + "learning_rate": 0.00014986376021798364, + "loss": 1.0238, + "step": 650 + }, + { + "epoch": 0.25298746721072574, + "grad_norm": 0.19339965283870697, + "learning_rate": 0.00014978590891397432, + "loss": 1.024, + "step": 651 + }, + { + "epoch": 0.25337608083163315, + "grad_norm": 0.22362011671066284, + "learning_rate": 0.00014970805760996497, + "loss": 1.0722, + "step": 652 + }, + { + "epoch": 0.2537646944525406, + "grad_norm": 0.2110588103532791, + "learning_rate": 0.00014963020630595562, + "loss": 1.0541, + "step": 653 + }, + { + "epoch": 0.254153308073448, + "grad_norm": 0.203025683760643, + "learning_rate": 0.00014955235500194628, + "loss": 1.0335, + "step": 654 + }, + { + "epoch": 0.2545419216943554, + "grad_norm": 0.20884902775287628, + "learning_rate": 0.00014947450369793693, + "loss": 1.0507, + "step": 655 + }, + { + "epoch": 0.2549305353152628, + "grad_norm": 0.21234256029129028, + "learning_rate": 0.0001493966523939276, + "loss": 1.0372, + "step": 656 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.1984352171421051, + "learning_rate": 0.00014931880108991826, + "loss": 0.9979, + 
"step": 657 + }, + { + "epoch": 0.2557077625570776, + "grad_norm": 0.18848282098770142, + "learning_rate": 0.0001492409497859089, + "loss": 0.9973, + "step": 658 + }, + { + "epoch": 0.25609637617798503, + "grad_norm": 0.2201709896326065, + "learning_rate": 0.00014916309848189956, + "loss": 1.0386, + "step": 659 + }, + { + "epoch": 0.25648498979889245, + "grad_norm": 0.23094095289707184, + "learning_rate": 0.00014908524717789024, + "loss": 1.1205, + "step": 660 + }, + { + "epoch": 0.25687360341979987, + "grad_norm": 0.21087734401226044, + "learning_rate": 0.0001490073958738809, + "loss": 1.0231, + "step": 661 + }, + { + "epoch": 0.2572622170407073, + "grad_norm": 0.24970979988574982, + "learning_rate": 0.00014892954456987155, + "loss": 1.0421, + "step": 662 + }, + { + "epoch": 0.2576508306616147, + "grad_norm": 0.22024711966514587, + "learning_rate": 0.00014885169326586223, + "loss": 1.1033, + "step": 663 + }, + { + "epoch": 0.2580394442825221, + "grad_norm": 0.2195248156785965, + "learning_rate": 0.00014877384196185288, + "loss": 1.089, + "step": 664 + }, + { + "epoch": 0.25842805790342954, + "grad_norm": 0.20236417651176453, + "learning_rate": 0.00014869599065784353, + "loss": 1.0196, + "step": 665 + }, + { + "epoch": 0.2588166715243369, + "grad_norm": 0.21973329782485962, + "learning_rate": 0.00014861813935383418, + "loss": 1.0844, + "step": 666 + }, + { + "epoch": 0.2592052851452443, + "grad_norm": 0.2069879174232483, + "learning_rate": 0.00014854028804982486, + "loss": 1.0312, + "step": 667 + }, + { + "epoch": 0.25959389876615174, + "grad_norm": 0.2037455290555954, + "learning_rate": 0.00014846243674581551, + "loss": 1.0018, + "step": 668 + }, + { + "epoch": 0.25998251238705916, + "grad_norm": 0.24176378548145294, + "learning_rate": 0.00014838458544180617, + "loss": 1.0749, + "step": 669 + }, + { + "epoch": 0.2603711260079666, + "grad_norm": 0.2007879763841629, + "learning_rate": 0.00014830673413779682, + "loss": 1.0443, + "step": 670 + }, + { + "epoch": 
0.260759739628874, + "grad_norm": 0.23503245413303375, + "learning_rate": 0.00014822888283378747, + "loss": 1.0674, + "step": 671 + }, + { + "epoch": 0.2611483532497814, + "grad_norm": 0.2166167050600052, + "learning_rate": 0.00014815103152977815, + "loss": 1.079, + "step": 672 + }, + { + "epoch": 0.26153696687068884, + "grad_norm": 0.2293982058763504, + "learning_rate": 0.0001480731802257688, + "loss": 1.0517, + "step": 673 + }, + { + "epoch": 0.26192558049159625, + "grad_norm": 0.21040330827236176, + "learning_rate": 0.00014799532892175945, + "loss": 1.0475, + "step": 674 + }, + { + "epoch": 0.2623141941125036, + "grad_norm": 0.20750463008880615, + "learning_rate": 0.0001479174776177501, + "loss": 1.025, + "step": 675 + }, + { + "epoch": 0.26270280773341104, + "grad_norm": 0.2748873233795166, + "learning_rate": 0.00014783962631374076, + "loss": 1.0212, + "step": 676 + }, + { + "epoch": 0.26309142135431846, + "grad_norm": 0.19212333858013153, + "learning_rate": 0.00014776177500973144, + "loss": 1.0049, + "step": 677 + }, + { + "epoch": 0.2634800349752259, + "grad_norm": 0.207731693983078, + "learning_rate": 0.0001476839237057221, + "loss": 1.0062, + "step": 678 + }, + { + "epoch": 0.2638686485961333, + "grad_norm": 0.2177981585264206, + "learning_rate": 0.00014760607240171274, + "loss": 1.0489, + "step": 679 + }, + { + "epoch": 0.2642572622170407, + "grad_norm": 0.23239290714263916, + "learning_rate": 0.0001475282210977034, + "loss": 1.0856, + "step": 680 + }, + { + "epoch": 0.26464587583794813, + "grad_norm": 0.2033151388168335, + "learning_rate": 0.00014745036979369404, + "loss": 1.0389, + "step": 681 + }, + { + "epoch": 0.26503448945885555, + "grad_norm": 0.20917408168315887, + "learning_rate": 0.00014737251848968472, + "loss": 1.1208, + "step": 682 + }, + { + "epoch": 0.26542310307976297, + "grad_norm": 0.22075454890727997, + "learning_rate": 0.00014729466718567538, + "loss": 1.0435, + "step": 683 + }, + { + "epoch": 0.26581171670067033, + "grad_norm": 
0.23094993829727173, + "learning_rate": 0.00014721681588166603, + "loss": 1.0649, + "step": 684 + }, + { + "epoch": 0.26620033032157775, + "grad_norm": 0.21209536492824554, + "learning_rate": 0.00014713896457765668, + "loss": 1.0578, + "step": 685 + }, + { + "epoch": 0.26658894394248517, + "grad_norm": 0.21412219107151031, + "learning_rate": 0.00014706111327364733, + "loss": 1.1137, + "step": 686 + }, + { + "epoch": 0.2669775575633926, + "grad_norm": 0.21175475418567657, + "learning_rate": 0.000146983261969638, + "loss": 1.023, + "step": 687 + }, + { + "epoch": 0.2673661711843, + "grad_norm": 0.21968993544578552, + "learning_rate": 0.00014690541066562866, + "loss": 1.1183, + "step": 688 + }, + { + "epoch": 0.2677547848052074, + "grad_norm": 0.20414218306541443, + "learning_rate": 0.00014682755936161932, + "loss": 1.078, + "step": 689 + }, + { + "epoch": 0.26814339842611484, + "grad_norm": 0.18986597657203674, + "learning_rate": 0.00014674970805760997, + "loss": 1.0029, + "step": 690 + }, + { + "epoch": 0.26853201204702226, + "grad_norm": 0.21215832233428955, + "learning_rate": 0.00014667185675360062, + "loss": 1.0759, + "step": 691 + }, + { + "epoch": 0.2689206256679297, + "grad_norm": 0.2113744169473648, + "learning_rate": 0.0001465940054495913, + "loss": 1.1027, + "step": 692 + }, + { + "epoch": 0.2693092392888371, + "grad_norm": 0.22010880708694458, + "learning_rate": 0.00014651615414558195, + "loss": 1.0984, + "step": 693 + }, + { + "epoch": 0.26969785290974446, + "grad_norm": 0.203857421875, + "learning_rate": 0.0001464383028415726, + "loss": 1.0407, + "step": 694 + }, + { + "epoch": 0.2700864665306519, + "grad_norm": 0.21120867133140564, + "learning_rate": 0.00014636045153756325, + "loss": 1.0521, + "step": 695 + }, + { + "epoch": 0.2704750801515593, + "grad_norm": 0.20039112865924835, + "learning_rate": 0.0001462826002335539, + "loss": 1.0897, + "step": 696 + }, + { + "epoch": 0.2708636937724667, + "grad_norm": 0.22893202304840088, + "learning_rate": 
0.00014620474892954456, + "loss": 1.0903, + "step": 697 + }, + { + "epoch": 0.27125230739337414, + "grad_norm": 0.19886267185211182, + "learning_rate": 0.00014612689762553524, + "loss": 1.0889, + "step": 698 + }, + { + "epoch": 0.27164092101428156, + "grad_norm": 0.18892349302768707, + "learning_rate": 0.0001460490463215259, + "loss": 0.981, + "step": 699 + }, + { + "epoch": 0.272029534635189, + "grad_norm": 0.20602507889270782, + "learning_rate": 0.00014597119501751654, + "loss": 1.0223, + "step": 700 + }, + { + "epoch": 0.2724181482560964, + "grad_norm": 0.21480505168437958, + "learning_rate": 0.0001458933437135072, + "loss": 1.0355, + "step": 701 + }, + { + "epoch": 0.2728067618770038, + "grad_norm": 0.21011753380298615, + "learning_rate": 0.00014581549240949785, + "loss": 1.0613, + "step": 702 + }, + { + "epoch": 0.2731953754979112, + "grad_norm": 0.19350819289684296, + "learning_rate": 0.00014573764110548853, + "loss": 1.0144, + "step": 703 + }, + { + "epoch": 0.2735839891188186, + "grad_norm": 0.207548126578331, + "learning_rate": 0.00014565978980147918, + "loss": 1.0465, + "step": 704 + }, + { + "epoch": 0.273972602739726, + "grad_norm": 0.22220565378665924, + "learning_rate": 0.00014558193849746983, + "loss": 1.1073, + "step": 705 + }, + { + "epoch": 0.27436121636063343, + "grad_norm": 0.193622425198555, + "learning_rate": 0.00014550408719346048, + "loss": 1.0357, + "step": 706 + }, + { + "epoch": 0.27474982998154085, + "grad_norm": 0.2067158818244934, + "learning_rate": 0.00014542623588945113, + "loss": 1.0502, + "step": 707 + }, + { + "epoch": 0.27513844360244827, + "grad_norm": 0.2218742072582245, + "learning_rate": 0.0001453483845854418, + "loss": 0.9934, + "step": 708 + }, + { + "epoch": 0.2755270572233557, + "grad_norm": 0.22316142916679382, + "learning_rate": 0.00014527053328143246, + "loss": 1.0707, + "step": 709 + }, + { + "epoch": 0.2759156708442631, + "grad_norm": 0.21004025638103485, + "learning_rate": 0.00014519268197742312, + "loss": 1.0543, + 
"step": 710 + }, + { + "epoch": 0.2763042844651705, + "grad_norm": 0.22070440649986267, + "learning_rate": 0.00014511483067341377, + "loss": 1.0467, + "step": 711 + }, + { + "epoch": 0.27669289808607794, + "grad_norm": 0.21463747322559357, + "learning_rate": 0.00014503697936940445, + "loss": 1.0793, + "step": 712 + }, + { + "epoch": 0.2770815117069853, + "grad_norm": 0.23452533781528473, + "learning_rate": 0.0001449591280653951, + "loss": 1.043, + "step": 713 + }, + { + "epoch": 0.2774701253278927, + "grad_norm": 0.2405795156955719, + "learning_rate": 0.00014488127676138575, + "loss": 1.0752, + "step": 714 + }, + { + "epoch": 0.27785873894880014, + "grad_norm": 0.21546585857868195, + "learning_rate": 0.00014480342545737643, + "loss": 1.0834, + "step": 715 + }, + { + "epoch": 0.27824735256970756, + "grad_norm": 0.22675828635692596, + "learning_rate": 0.00014472557415336708, + "loss": 1.055, + "step": 716 + }, + { + "epoch": 0.278635966190615, + "grad_norm": 0.2117871195077896, + "learning_rate": 0.00014464772284935774, + "loss": 1.03, + "step": 717 + }, + { + "epoch": 0.2790245798115224, + "grad_norm": 0.2193155735731125, + "learning_rate": 0.00014456987154534841, + "loss": 1.0073, + "step": 718 + }, + { + "epoch": 0.2794131934324298, + "grad_norm": 0.21447965502738953, + "learning_rate": 0.00014449202024133907, + "loss": 1.0174, + "step": 719 + }, + { + "epoch": 0.27980180705333724, + "grad_norm": 0.22867532074451447, + "learning_rate": 0.00014441416893732972, + "loss": 1.0948, + "step": 720 + }, + { + "epoch": 0.28019042067424466, + "grad_norm": 0.21570557355880737, + "learning_rate": 0.00014433631763332037, + "loss": 1.0105, + "step": 721 + }, + { + "epoch": 0.280579034295152, + "grad_norm": 0.20787014067173004, + "learning_rate": 0.00014425846632931102, + "loss": 1.0384, + "step": 722 + }, + { + "epoch": 0.28096764791605944, + "grad_norm": 0.19924762845039368, + "learning_rate": 0.0001441806150253017, + "loss": 1.0653, + "step": 723 + }, + { + "epoch": 
0.28135626153696686, + "grad_norm": 0.1996215283870697, + "learning_rate": 0.00014410276372129235, + "loss": 1.0439, + "step": 724 + }, + { + "epoch": 0.2817448751578743, + "grad_norm": 0.2054813802242279, + "learning_rate": 0.000144024912417283, + "loss": 0.9895, + "step": 725 + }, + { + "epoch": 0.2821334887787817, + "grad_norm": 0.2268310785293579, + "learning_rate": 0.00014394706111327366, + "loss": 1.0993, + "step": 726 + }, + { + "epoch": 0.2825221023996891, + "grad_norm": 0.19867680966854095, + "learning_rate": 0.0001438692098092643, + "loss": 0.985, + "step": 727 + }, + { + "epoch": 0.28291071602059653, + "grad_norm": 0.21099598705768585, + "learning_rate": 0.000143791358505255, + "loss": 1.0333, + "step": 728 + }, + { + "epoch": 0.28329932964150395, + "grad_norm": 0.22479215264320374, + "learning_rate": 0.00014371350720124564, + "loss": 1.0449, + "step": 729 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 0.22717688977718353, + "learning_rate": 0.0001436356558972363, + "loss": 1.0482, + "step": 730 + }, + { + "epoch": 0.2840765568833188, + "grad_norm": 0.20389345288276672, + "learning_rate": 0.00014355780459322695, + "loss": 0.956, + "step": 731 + }, + { + "epoch": 0.28446517050422615, + "grad_norm": 0.21583619713783264, + "learning_rate": 0.0001434799532892176, + "loss": 1.0154, + "step": 732 + }, + { + "epoch": 0.28485378412513357, + "grad_norm": 0.2219148874282837, + "learning_rate": 0.00014340210198520825, + "loss": 1.0553, + "step": 733 + }, + { + "epoch": 0.285242397746041, + "grad_norm": 0.19920189678668976, + "learning_rate": 0.00014332425068119893, + "loss": 0.9881, + "step": 734 + }, + { + "epoch": 0.2856310113669484, + "grad_norm": 0.2295670360326767, + "learning_rate": 0.00014324639937718958, + "loss": 1.0529, + "step": 735 + }, + { + "epoch": 0.2860196249878558, + "grad_norm": 0.21271567046642303, + "learning_rate": 0.00014316854807318023, + "loss": 1.037, + "step": 736 + }, + { + "epoch": 0.28640823860876324, + "grad_norm": 
0.21304361522197723, + "learning_rate": 0.00014309069676917088, + "loss": 1.048, + "step": 737 + }, + { + "epoch": 0.28679685222967066, + "grad_norm": 0.19902732968330383, + "learning_rate": 0.00014301284546516154, + "loss": 1.0306, + "step": 738 + }, + { + "epoch": 0.2871854658505781, + "grad_norm": 0.1995929330587387, + "learning_rate": 0.00014293499416115222, + "loss": 1.0394, + "step": 739 + }, + { + "epoch": 0.2875740794714855, + "grad_norm": 0.20426060259342194, + "learning_rate": 0.00014285714285714287, + "loss": 1.0052, + "step": 740 + }, + { + "epoch": 0.28796269309239286, + "grad_norm": 0.20284566283226013, + "learning_rate": 0.00014277929155313352, + "loss": 1.0115, + "step": 741 + }, + { + "epoch": 0.2883513067133003, + "grad_norm": 0.2041557878255844, + "learning_rate": 0.00014270144024912417, + "loss": 1.0473, + "step": 742 + }, + { + "epoch": 0.2887399203342077, + "grad_norm": 0.2152249962091446, + "learning_rate": 0.00014262358894511482, + "loss": 1.0802, + "step": 743 + }, + { + "epoch": 0.2891285339551151, + "grad_norm": 0.20569871366024017, + "learning_rate": 0.0001425457376411055, + "loss": 1.0203, + "step": 744 + }, + { + "epoch": 0.28951714757602254, + "grad_norm": 0.21128378808498383, + "learning_rate": 0.00014246788633709616, + "loss": 1.108, + "step": 745 + }, + { + "epoch": 0.28990576119692996, + "grad_norm": 0.19587135314941406, + "learning_rate": 0.0001423900350330868, + "loss": 1.0427, + "step": 746 + }, + { + "epoch": 0.2902943748178374, + "grad_norm": 0.22052550315856934, + "learning_rate": 0.00014231218372907746, + "loss": 1.055, + "step": 747 + }, + { + "epoch": 0.2906829884387448, + "grad_norm": 0.21291717886924744, + "learning_rate": 0.0001422343324250681, + "loss": 1.0591, + "step": 748 + }, + { + "epoch": 0.2910716020596522, + "grad_norm": 0.20634084939956665, + "learning_rate": 0.0001421564811210588, + "loss": 1.0527, + "step": 749 + }, + { + "epoch": 0.29146021568055963, + "grad_norm": 0.2075488269329071, + "learning_rate": 
0.00014207862981704944, + "loss": 1.0786, + "step": 750 + }, + { + "epoch": 0.291848829301467, + "grad_norm": 0.19780080020427704, + "learning_rate": 0.0001420007785130401, + "loss": 1.059, + "step": 751 + }, + { + "epoch": 0.2922374429223744, + "grad_norm": 0.21212074160575867, + "learning_rate": 0.00014192292720903075, + "loss": 1.0346, + "step": 752 + }, + { + "epoch": 0.29262605654328183, + "grad_norm": 0.2218451350927353, + "learning_rate": 0.0001418450759050214, + "loss": 1.0908, + "step": 753 + }, + { + "epoch": 0.29301467016418925, + "grad_norm": 0.20107759535312653, + "learning_rate": 0.00014176722460101208, + "loss": 1.0202, + "step": 754 + }, + { + "epoch": 0.29340328378509667, + "grad_norm": 0.20933273434638977, + "learning_rate": 0.00014168937329700273, + "loss": 1.0719, + "step": 755 + }, + { + "epoch": 0.2937918974060041, + "grad_norm": 0.22369107604026794, + "learning_rate": 0.00014161152199299338, + "loss": 1.0433, + "step": 756 + }, + { + "epoch": 0.2941805110269115, + "grad_norm": 0.2113707810640335, + "learning_rate": 0.00014153367068898403, + "loss": 1.0637, + "step": 757 + }, + { + "epoch": 0.2945691246478189, + "grad_norm": 0.21105700731277466, + "learning_rate": 0.00014145581938497469, + "loss": 1.0468, + "step": 758 + }, + { + "epoch": 0.29495773826872634, + "grad_norm": 0.20189693570137024, + "learning_rate": 0.00014137796808096537, + "loss": 1.0281, + "step": 759 + }, + { + "epoch": 0.2953463518896337, + "grad_norm": 0.1954152137041092, + "learning_rate": 0.00014130011677695602, + "loss": 1.0519, + "step": 760 + }, + { + "epoch": 0.2957349655105411, + "grad_norm": 0.24295592308044434, + "learning_rate": 0.00014122226547294667, + "loss": 1.1303, + "step": 761 + }, + { + "epoch": 0.29612357913144854, + "grad_norm": 0.20158620178699493, + "learning_rate": 0.00014114441416893732, + "loss": 1.0367, + "step": 762 + }, + { + "epoch": 0.29651219275235596, + "grad_norm": 0.20734666287899017, + "learning_rate": 0.00014106656286492797, + "loss": 
1.0392, + "step": 763 + }, + { + "epoch": 0.2969008063732634, + "grad_norm": 0.2177533656358719, + "learning_rate": 0.00014098871156091865, + "loss": 1.0619, + "step": 764 + }, + { + "epoch": 0.2972894199941708, + "grad_norm": 0.1961720883846283, + "learning_rate": 0.0001409108602569093, + "loss": 0.9872, + "step": 765 + }, + { + "epoch": 0.2976780336150782, + "grad_norm": 0.21530941128730774, + "learning_rate": 0.00014083300895289996, + "loss": 1.1246, + "step": 766 + }, + { + "epoch": 0.29806664723598564, + "grad_norm": 0.2039783000946045, + "learning_rate": 0.00014075515764889064, + "loss": 1.0789, + "step": 767 + }, + { + "epoch": 0.29845526085689306, + "grad_norm": 0.20641569793224335, + "learning_rate": 0.0001406773063448813, + "loss": 1.05, + "step": 768 + }, + { + "epoch": 0.2988438744778004, + "grad_norm": 0.2071225494146347, + "learning_rate": 0.00014059945504087194, + "loss": 1.047, + "step": 769 + }, + { + "epoch": 0.29923248809870784, + "grad_norm": 0.20367531478405, + "learning_rate": 0.00014052160373686262, + "loss": 1.0734, + "step": 770 + }, + { + "epoch": 0.29962110171961526, + "grad_norm": 0.21718619763851166, + "learning_rate": 0.00014044375243285327, + "loss": 1.0613, + "step": 771 + }, + { + "epoch": 0.3000097153405227, + "grad_norm": 0.21649087965488434, + "learning_rate": 0.00014036590112884392, + "loss": 1.0671, + "step": 772 + }, + { + "epoch": 0.3003983289614301, + "grad_norm": 0.22223225235939026, + "learning_rate": 0.00014028804982483458, + "loss": 1.0977, + "step": 773 + }, + { + "epoch": 0.3007869425823375, + "grad_norm": 0.23101870715618134, + "learning_rate": 0.00014021019852082523, + "loss": 1.1236, + "step": 774 + }, + { + "epoch": 0.30117555620324493, + "grad_norm": 0.22855506837368011, + "learning_rate": 0.0001401323472168159, + "loss": 1.0517, + "step": 775 + }, + { + "epoch": 0.30156416982415235, + "grad_norm": 0.20862117409706116, + "learning_rate": 0.00014005449591280656, + "loss": 1.0493, + "step": 776 + }, + { + "epoch": 
0.30195278344505977, + "grad_norm": 0.21692048013210297, + "learning_rate": 0.0001399766446087972, + "loss": 1.0681, + "step": 777 + }, + { + "epoch": 0.3023413970659672, + "grad_norm": 0.21541331708431244, + "learning_rate": 0.00013989879330478786, + "loss": 1.0775, + "step": 778 + }, + { + "epoch": 0.30273001068687455, + "grad_norm": 0.21221749484539032, + "learning_rate": 0.00013982094200077851, + "loss": 1.0421, + "step": 779 + }, + { + "epoch": 0.30311862430778197, + "grad_norm": 0.22497743368148804, + "learning_rate": 0.0001397430906967692, + "loss": 1.1115, + "step": 780 + }, + { + "epoch": 0.3035072379286894, + "grad_norm": 0.1974119246006012, + "learning_rate": 0.00013966523939275985, + "loss": 1.0264, + "step": 781 + }, + { + "epoch": 0.3038958515495968, + "grad_norm": 0.20349323749542236, + "learning_rate": 0.0001395873880887505, + "loss": 1.0512, + "step": 782 + }, + { + "epoch": 0.3042844651705042, + "grad_norm": 0.21116937696933746, + "learning_rate": 0.00013950953678474115, + "loss": 1.0135, + "step": 783 + }, + { + "epoch": 0.30467307879141164, + "grad_norm": 0.2133677899837494, + "learning_rate": 0.0001394316854807318, + "loss": 1.0694, + "step": 784 + }, + { + "epoch": 0.30506169241231906, + "grad_norm": 0.20406191051006317, + "learning_rate": 0.00013935383417672248, + "loss": 1.0179, + "step": 785 + }, + { + "epoch": 0.3054503060332265, + "grad_norm": 0.21428678929805756, + "learning_rate": 0.00013927598287271313, + "loss": 1.0577, + "step": 786 + }, + { + "epoch": 0.3058389196541339, + "grad_norm": 0.20878921449184418, + "learning_rate": 0.00013919813156870379, + "loss": 1.0311, + "step": 787 + }, + { + "epoch": 0.30622753327504126, + "grad_norm": 0.19033175706863403, + "learning_rate": 0.00013912028026469444, + "loss": 0.976, + "step": 788 + }, + { + "epoch": 0.3066161468959487, + "grad_norm": 0.22138020396232605, + "learning_rate": 0.0001390424289606851, + "loss": 1.0438, + "step": 789 + }, + { + "epoch": 0.3070047605168561, + "grad_norm": 
0.20765596628189087, + "learning_rate": 0.00013896457765667577, + "loss": 1.0865, + "step": 790 + }, + { + "epoch": 0.3073933741377635, + "grad_norm": 0.209733247756958, + "learning_rate": 0.00013888672635266642, + "loss": 1.0648, + "step": 791 + }, + { + "epoch": 0.30778198775867094, + "grad_norm": 0.1896686851978302, + "learning_rate": 0.00013880887504865707, + "loss": 1.0133, + "step": 792 + }, + { + "epoch": 0.30817060137957836, + "grad_norm": 0.21651998162269592, + "learning_rate": 0.00013873102374464772, + "loss": 1.0729, + "step": 793 + }, + { + "epoch": 0.3085592150004858, + "grad_norm": 0.21751996874809265, + "learning_rate": 0.00013865317244063838, + "loss": 1.0444, + "step": 794 + }, + { + "epoch": 0.3089478286213932, + "grad_norm": 0.20593520998954773, + "learning_rate": 0.00013857532113662906, + "loss": 1.0304, + "step": 795 + }, + { + "epoch": 0.3093364422423006, + "grad_norm": 0.19937261939048767, + "learning_rate": 0.0001384974698326197, + "loss": 1.0017, + "step": 796 + }, + { + "epoch": 0.30972505586320803, + "grad_norm": 0.18901696801185608, + "learning_rate": 0.00013841961852861036, + "loss": 1.0362, + "step": 797 + }, + { + "epoch": 0.3101136694841154, + "grad_norm": 0.2079760730266571, + "learning_rate": 0.000138341767224601, + "loss": 1.0784, + "step": 798 + }, + { + "epoch": 0.3105022831050228, + "grad_norm": 0.24873265624046326, + "learning_rate": 0.00013826391592059166, + "loss": 1.1026, + "step": 799 + }, + { + "epoch": 0.31089089672593023, + "grad_norm": 0.20185396075248718, + "learning_rate": 0.00013818606461658234, + "loss": 1.0235, + "step": 800 + }, + { + "epoch": 0.31127951034683765, + "grad_norm": 0.211393803358078, + "learning_rate": 0.000138108213312573, + "loss": 1.0999, + "step": 801 + }, + { + "epoch": 0.31166812396774507, + "grad_norm": 0.19948823750019073, + "learning_rate": 0.00013803036200856365, + "loss": 1.0242, + "step": 802 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 0.21470944583415985, + "learning_rate": 
0.0001379525107045543, + "loss": 1.0736, + "step": 803 + }, + { + "epoch": 0.3124453512095599, + "grad_norm": 0.2195902317762375, + "learning_rate": 0.00013787465940054495, + "loss": 1.0368, + "step": 804 + }, + { + "epoch": 0.3128339648304673, + "grad_norm": 0.22142355144023895, + "learning_rate": 0.00013779680809653563, + "loss": 1.1022, + "step": 805 + }, + { + "epoch": 0.31322257845137474, + "grad_norm": 0.20487886667251587, + "learning_rate": 0.00013771895679252628, + "loss": 1.0478, + "step": 806 + }, + { + "epoch": 0.3136111920722821, + "grad_norm": 0.217549130320549, + "learning_rate": 0.00013764110548851693, + "loss": 1.0526, + "step": 807 + }, + { + "epoch": 0.3139998056931895, + "grad_norm": 0.20199982821941376, + "learning_rate": 0.0001375632541845076, + "loss": 0.9992, + "step": 808 + }, + { + "epoch": 0.31438841931409695, + "grad_norm": 0.19496634602546692, + "learning_rate": 0.00013748540288049824, + "loss": 1.0179, + "step": 809 + }, + { + "epoch": 0.31477703293500436, + "grad_norm": 0.21999460458755493, + "learning_rate": 0.0001374075515764889, + "loss": 1.0547, + "step": 810 + }, + { + "epoch": 0.3151656465559118, + "grad_norm": 0.21421074867248535, + "learning_rate": 0.00013732970027247957, + "loss": 1.0283, + "step": 811 + }, + { + "epoch": 0.3155542601768192, + "grad_norm": 0.1913364827632904, + "learning_rate": 0.00013725184896847022, + "loss": 0.9826, + "step": 812 + }, + { + "epoch": 0.3159428737977266, + "grad_norm": 0.20509806275367737, + "learning_rate": 0.00013717399766446087, + "loss": 1.0303, + "step": 813 + }, + { + "epoch": 0.31633148741863404, + "grad_norm": 0.20309868454933167, + "learning_rate": 0.00013709614636045153, + "loss": 1.0479, + "step": 814 + }, + { + "epoch": 0.31672010103954146, + "grad_norm": 0.2274443656206131, + "learning_rate": 0.0001370182950564422, + "loss": 1.1311, + "step": 815 + }, + { + "epoch": 0.3171087146604489, + "grad_norm": 0.22785170376300812, + "learning_rate": 0.00013694044375243286, + "loss": 
1.1009, + "step": 816 + }, + { + "epoch": 0.31749732828135624, + "grad_norm": 0.2105439007282257, + "learning_rate": 0.0001368625924484235, + "loss": 1.0251, + "step": 817 + }, + { + "epoch": 0.31788594190226366, + "grad_norm": 0.20583970844745636, + "learning_rate": 0.00013678474114441416, + "loss": 1.0833, + "step": 818 + }, + { + "epoch": 0.3182745555231711, + "grad_norm": 0.21091191470623016, + "learning_rate": 0.00013670688984040484, + "loss": 1.071, + "step": 819 + }, + { + "epoch": 0.3186631691440785, + "grad_norm": 0.20645928382873535, + "learning_rate": 0.0001366290385363955, + "loss": 1.0605, + "step": 820 + }, + { + "epoch": 0.3190517827649859, + "grad_norm": 0.1990513950586319, + "learning_rate": 0.00013655118723238614, + "loss": 1.0461, + "step": 821 + }, + { + "epoch": 0.31944039638589333, + "grad_norm": 0.2192249745130539, + "learning_rate": 0.00013647333592837682, + "loss": 1.0975, + "step": 822 + }, + { + "epoch": 0.31982901000680075, + "grad_norm": 0.2157617211341858, + "learning_rate": 0.00013639548462436748, + "loss": 1.091, + "step": 823 + }, + { + "epoch": 0.32021762362770817, + "grad_norm": 0.21964526176452637, + "learning_rate": 0.00013631763332035813, + "loss": 1.0286, + "step": 824 + }, + { + "epoch": 0.3206062372486156, + "grad_norm": 0.2079797089099884, + "learning_rate": 0.00013623978201634878, + "loss": 1.0257, + "step": 825 + }, + { + "epoch": 0.32099485086952295, + "grad_norm": 0.21220168471336365, + "learning_rate": 0.00013616193071233946, + "loss": 1.0046, + "step": 826 + }, + { + "epoch": 0.32138346449043037, + "grad_norm": 0.2885231673717499, + "learning_rate": 0.0001360840794083301, + "loss": 1.1442, + "step": 827 + }, + { + "epoch": 0.3217720781113378, + "grad_norm": 0.2096511274576187, + "learning_rate": 0.00013600622810432076, + "loss": 1.0209, + "step": 828 + }, + { + "epoch": 0.3221606917322452, + "grad_norm": 0.2179451286792755, + "learning_rate": 0.00013592837680031142, + "loss": 1.0548, + "step": 829 + }, + { + "epoch": 
0.3225493053531526, + "grad_norm": 0.2096329927444458, + "learning_rate": 0.00013585052549630207, + "loss": 1.0279, + "step": 830 + }, + { + "epoch": 0.32293791897406005, + "grad_norm": 0.22531811892986298, + "learning_rate": 0.00013577267419229275, + "loss": 1.0463, + "step": 831 + }, + { + "epoch": 0.32332653259496746, + "grad_norm": 0.22516901791095734, + "learning_rate": 0.0001356948228882834, + "loss": 1.1127, + "step": 832 + }, + { + "epoch": 0.3237151462158749, + "grad_norm": 0.22487780451774597, + "learning_rate": 0.00013561697158427405, + "loss": 1.0707, + "step": 833 + }, + { + "epoch": 0.3241037598367823, + "grad_norm": 0.20976543426513672, + "learning_rate": 0.0001355391202802647, + "loss": 1.0217, + "step": 834 + }, + { + "epoch": 0.32449237345768966, + "grad_norm": 0.19849295914173126, + "learning_rate": 0.00013546126897625535, + "loss": 1.021, + "step": 835 + }, + { + "epoch": 0.3248809870785971, + "grad_norm": 0.21772268414497375, + "learning_rate": 0.00013538341767224603, + "loss": 1.0605, + "step": 836 + }, + { + "epoch": 0.3252696006995045, + "grad_norm": 0.19670265913009644, + "learning_rate": 0.00013530556636823669, + "loss": 1.0165, + "step": 837 + }, + { + "epoch": 0.3256582143204119, + "grad_norm": 0.19339734315872192, + "learning_rate": 0.00013522771506422734, + "loss": 1.0203, + "step": 838 + }, + { + "epoch": 0.32604682794131934, + "grad_norm": 0.21289557218551636, + "learning_rate": 0.000135149863760218, + "loss": 1.0252, + "step": 839 + }, + { + "epoch": 0.32643544156222676, + "grad_norm": 0.1964789777994156, + "learning_rate": 0.00013507201245620864, + "loss": 1.0392, + "step": 840 + }, + { + "epoch": 0.3268240551831342, + "grad_norm": 0.20783716440200806, + "learning_rate": 0.00013499416115219932, + "loss": 1.0569, + "step": 841 + }, + { + "epoch": 0.3272126688040416, + "grad_norm": 0.22782161831855774, + "learning_rate": 0.00013491630984818997, + "loss": 1.0555, + "step": 842 + }, + { + "epoch": 0.327601282424949, + "grad_norm": 
0.22771142423152924, + "learning_rate": 0.00013483845854418063, + "loss": 1.085, + "step": 843 + }, + { + "epoch": 0.32798989604585643, + "grad_norm": 0.19773711264133453, + "learning_rate": 0.00013476060724017128, + "loss": 1.008, + "step": 844 + }, + { + "epoch": 0.3283785096667638, + "grad_norm": 0.22399166226387024, + "learning_rate": 0.00013468275593616193, + "loss": 1.0511, + "step": 845 + }, + { + "epoch": 0.3287671232876712, + "grad_norm": 0.20488236844539642, + "learning_rate": 0.00013460490463215258, + "loss": 1.0883, + "step": 846 + }, + { + "epoch": 0.32915573690857863, + "grad_norm": 0.21387654542922974, + "learning_rate": 0.00013452705332814326, + "loss": 1.0808, + "step": 847 + }, + { + "epoch": 0.32954435052948605, + "grad_norm": 0.1972568780183792, + "learning_rate": 0.0001344492020241339, + "loss": 1.0555, + "step": 848 + }, + { + "epoch": 0.32993296415039347, + "grad_norm": 0.20835663378238678, + "learning_rate": 0.00013437135072012456, + "loss": 1.0473, + "step": 849 + }, + { + "epoch": 0.3303215777713009, + "grad_norm": 0.19707520306110382, + "learning_rate": 0.00013429349941611522, + "loss": 0.9585, + "step": 850 + }, + { + "epoch": 0.3307101913922083, + "grad_norm": 0.19163411855697632, + "learning_rate": 0.00013421564811210587, + "loss": 1.0025, + "step": 851 + }, + { + "epoch": 0.3310988050131157, + "grad_norm": 0.19730083644390106, + "learning_rate": 0.00013413779680809655, + "loss": 1.0696, + "step": 852 + }, + { + "epoch": 0.33148741863402315, + "grad_norm": 0.19537493586540222, + "learning_rate": 0.0001340599455040872, + "loss": 1.0466, + "step": 853 + }, + { + "epoch": 0.3318760322549305, + "grad_norm": 0.2255164235830307, + "learning_rate": 0.00013398209420007785, + "loss": 1.0659, + "step": 854 + }, + { + "epoch": 0.3322646458758379, + "grad_norm": 0.19774770736694336, + "learning_rate": 0.0001339042428960685, + "loss": 1.0326, + "step": 855 + }, + { + "epoch": 0.33265325949674535, + "grad_norm": 0.2004510909318924, + 
"learning_rate": 0.00013382639159205916, + "loss": 1.0327, + "step": 856 + }, + { + "epoch": 0.33304187311765276, + "grad_norm": 0.19187591969966888, + "learning_rate": 0.00013374854028804984, + "loss": 1.0069, + "step": 857 + }, + { + "epoch": 0.3334304867385602, + "grad_norm": 0.18775832653045654, + "learning_rate": 0.0001336706889840405, + "loss": 1.0083, + "step": 858 + }, + { + "epoch": 0.3338191003594676, + "grad_norm": 0.2005717158317566, + "learning_rate": 0.00013359283768003114, + "loss": 1.0398, + "step": 859 + }, + { + "epoch": 0.334207713980375, + "grad_norm": 0.19705893099308014, + "learning_rate": 0.0001335149863760218, + "loss": 1.0031, + "step": 860 + }, + { + "epoch": 0.33459632760128244, + "grad_norm": 0.19589562714099884, + "learning_rate": 0.00013343713507201244, + "loss": 0.9831, + "step": 861 + }, + { + "epoch": 0.33498494122218986, + "grad_norm": 0.19302591681480408, + "learning_rate": 0.00013335928376800312, + "loss": 1.0009, + "step": 862 + }, + { + "epoch": 0.3353735548430973, + "grad_norm": 0.20499618351459503, + "learning_rate": 0.00013328143246399377, + "loss": 1.0205, + "step": 863 + }, + { + "epoch": 0.33576216846400464, + "grad_norm": 0.20514456927776337, + "learning_rate": 0.00013320358115998443, + "loss": 1.0837, + "step": 864 + }, + { + "epoch": 0.33615078208491206, + "grad_norm": 0.19285848736763, + "learning_rate": 0.00013312572985597508, + "loss": 1.0167, + "step": 865 + }, + { + "epoch": 0.3365393957058195, + "grad_norm": 0.20891553163528442, + "learning_rate": 0.00013304787855196573, + "loss": 1.0127, + "step": 866 + }, + { + "epoch": 0.3369280093267269, + "grad_norm": 0.20511706173419952, + "learning_rate": 0.0001329700272479564, + "loss": 0.964, + "step": 867 + }, + { + "epoch": 0.3373166229476343, + "grad_norm": 0.1855512261390686, + "learning_rate": 0.00013289217594394706, + "loss": 0.9721, + "step": 868 + }, + { + "epoch": 0.33770523656854173, + "grad_norm": 0.20010098814964294, + "learning_rate": 0.00013281432463993771, 
+ "loss": 1.0411, + "step": 869 + }, + { + "epoch": 0.33809385018944915, + "grad_norm": 0.1991325318813324, + "learning_rate": 0.0001327364733359284, + "loss": 0.9658, + "step": 870 + }, + { + "epoch": 0.33848246381035657, + "grad_norm": 0.19895736873149872, + "learning_rate": 0.00013265862203191905, + "loss": 1.0744, + "step": 871 + }, + { + "epoch": 0.338871077431264, + "grad_norm": 0.2091255635023117, + "learning_rate": 0.0001325807707279097, + "loss": 1.0375, + "step": 872 + }, + { + "epoch": 0.33925969105217135, + "grad_norm": 0.21355532109737396, + "learning_rate": 0.00013250291942390035, + "loss": 1.09, + "step": 873 + }, + { + "epoch": 0.33964830467307877, + "grad_norm": 0.21844851970672607, + "learning_rate": 0.00013242506811989103, + "loss": 1.0769, + "step": 874 + }, + { + "epoch": 0.3400369182939862, + "grad_norm": 0.1877543330192566, + "learning_rate": 0.00013234721681588168, + "loss": 1.0199, + "step": 875 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.2020038366317749, + "learning_rate": 0.00013226936551187233, + "loss": 1.0218, + "step": 876 + }, + { + "epoch": 0.340814145535801, + "grad_norm": 0.20682141184806824, + "learning_rate": 0.000132191514207863, + "loss": 1.0891, + "step": 877 + }, + { + "epoch": 0.34120275915670845, + "grad_norm": 0.21942824125289917, + "learning_rate": 0.00013211366290385366, + "loss": 0.9877, + "step": 878 + }, + { + "epoch": 0.34159137277761586, + "grad_norm": 0.21150313317775726, + "learning_rate": 0.00013203581159984432, + "loss": 1.0815, + "step": 879 + }, + { + "epoch": 0.3419799863985233, + "grad_norm": 0.2073293924331665, + "learning_rate": 0.00013195796029583497, + "loss": 1.0579, + "step": 880 + }, + { + "epoch": 0.3423686000194307, + "grad_norm": 0.221574068069458, + "learning_rate": 0.00013188010899182562, + "loss": 1.0279, + "step": 881 + }, + { + "epoch": 0.3427572136403381, + "grad_norm": 0.22334492206573486, + "learning_rate": 0.00013180225768781627, + "loss": 1.0837, + "step": 882 + }, + { + 
"epoch": 0.3431458272612455, + "grad_norm": 0.18817654252052307, + "learning_rate": 0.00013172440638380695, + "loss": 1.0262, + "step": 883 + }, + { + "epoch": 0.3435344408821529, + "grad_norm": 0.20126822590827942, + "learning_rate": 0.0001316465550797976, + "loss": 1.0679, + "step": 884 + }, + { + "epoch": 0.3439230545030603, + "grad_norm": 0.2128864973783493, + "learning_rate": 0.00013156870377578825, + "loss": 1.0316, + "step": 885 + }, + { + "epoch": 0.34431166812396774, + "grad_norm": 0.20054499804973602, + "learning_rate": 0.0001314908524717789, + "loss": 1.0024, + "step": 886 + }, + { + "epoch": 0.34470028174487516, + "grad_norm": 0.21358034014701843, + "learning_rate": 0.00013141300116776956, + "loss": 1.0475, + "step": 887 + }, + { + "epoch": 0.3450888953657826, + "grad_norm": 0.21377703547477722, + "learning_rate": 0.00013133514986376024, + "loss": 1.0957, + "step": 888 + }, + { + "epoch": 0.34547750898669, + "grad_norm": 0.20166514813899994, + "learning_rate": 0.0001312572985597509, + "loss": 1.0189, + "step": 889 + }, + { + "epoch": 0.3458661226075974, + "grad_norm": 0.20424878597259521, + "learning_rate": 0.00013117944725574154, + "loss": 1.0896, + "step": 890 + }, + { + "epoch": 0.34625473622850483, + "grad_norm": 0.19028648734092712, + "learning_rate": 0.0001311015959517322, + "loss": 0.9881, + "step": 891 + }, + { + "epoch": 0.3466433498494122, + "grad_norm": 0.20828665792942047, + "learning_rate": 0.00013102374464772285, + "loss": 0.9932, + "step": 892 + }, + { + "epoch": 0.3470319634703196, + "grad_norm": 0.20756572484970093, + "learning_rate": 0.00013094589334371353, + "loss": 1.0406, + "step": 893 + }, + { + "epoch": 0.34742057709122703, + "grad_norm": 0.20768921077251434, + "learning_rate": 0.00013086804203970418, + "loss": 0.9652, + "step": 894 + }, + { + "epoch": 0.34780919071213445, + "grad_norm": 0.20660027861595154, + "learning_rate": 0.00013079019073569483, + "loss": 1.0728, + "step": 895 + }, + { + "epoch": 0.34819780433304187, + 
"grad_norm": 0.20186837017536163, + "learning_rate": 0.00013071233943168548, + "loss": 1.0407, + "step": 896 + }, + { + "epoch": 0.3485864179539493, + "grad_norm": 0.20880667865276337, + "learning_rate": 0.00013063448812767613, + "loss": 1.0275, + "step": 897 + }, + { + "epoch": 0.3489750315748567, + "grad_norm": 0.22212949395179749, + "learning_rate": 0.0001305566368236668, + "loss": 1.0293, + "step": 898 + }, + { + "epoch": 0.3493636451957641, + "grad_norm": 0.20552745461463928, + "learning_rate": 0.00013047878551965746, + "loss": 1.0434, + "step": 899 + }, + { + "epoch": 0.34975225881667155, + "grad_norm": 0.21239839494228363, + "learning_rate": 0.00013040093421564812, + "loss": 1.052, + "step": 900 + }, + { + "epoch": 0.3501408724375789, + "grad_norm": 0.22420544922351837, + "learning_rate": 0.00013032308291163877, + "loss": 1.0236, + "step": 901 + }, + { + "epoch": 0.35052948605848633, + "grad_norm": 0.23435090482234955, + "learning_rate": 0.00013024523160762942, + "loss": 1.0876, + "step": 902 + }, + { + "epoch": 0.35091809967939375, + "grad_norm": 0.22763386368751526, + "learning_rate": 0.0001301673803036201, + "loss": 1.0636, + "step": 903 + }, + { + "epoch": 0.35130671330030117, + "grad_norm": 0.20948883891105652, + "learning_rate": 0.00013008952899961075, + "loss": 1.0083, + "step": 904 + }, + { + "epoch": 0.3516953269212086, + "grad_norm": 0.20408779382705688, + "learning_rate": 0.0001300116776956014, + "loss": 1.039, + "step": 905 + }, + { + "epoch": 0.352083940542116, + "grad_norm": 0.2126050591468811, + "learning_rate": 0.00012993382639159206, + "loss": 1.0365, + "step": 906 + }, + { + "epoch": 0.3524725541630234, + "grad_norm": 0.20314334332942963, + "learning_rate": 0.0001298559750875827, + "loss": 1.0474, + "step": 907 + }, + { + "epoch": 0.35286116778393084, + "grad_norm": 0.23720984160900116, + "learning_rate": 0.0001297781237835734, + "loss": 1.0529, + "step": 908 + }, + { + "epoch": 0.35324978140483826, + "grad_norm": 0.22642800211906433, + 
"learning_rate": 0.00012970027247956404, + "loss": 1.0586, + "step": 909 + }, + { + "epoch": 0.3536383950257457, + "grad_norm": 0.20469972491264343, + "learning_rate": 0.0001296224211755547, + "loss": 1.0267, + "step": 910 + }, + { + "epoch": 0.35402700864665304, + "grad_norm": 0.197368785738945, + "learning_rate": 0.00012954456987154534, + "loss": 1.0348, + "step": 911 + }, + { + "epoch": 0.35441562226756046, + "grad_norm": 0.21924498677253723, + "learning_rate": 0.000129466718567536, + "loss": 1.0861, + "step": 912 + }, + { + "epoch": 0.3548042358884679, + "grad_norm": 0.22006285190582275, + "learning_rate": 0.00012938886726352667, + "loss": 1.0545, + "step": 913 + }, + { + "epoch": 0.3551928495093753, + "grad_norm": 0.22419220209121704, + "learning_rate": 0.00012931101595951733, + "loss": 1.0716, + "step": 914 + }, + { + "epoch": 0.3555814631302827, + "grad_norm": 0.215990349650383, + "learning_rate": 0.00012923316465550798, + "loss": 1.0619, + "step": 915 + }, + { + "epoch": 0.35597007675119013, + "grad_norm": 0.20783264935016632, + "learning_rate": 0.00012915531335149863, + "loss": 1.0412, + "step": 916 + }, + { + "epoch": 0.35635869037209755, + "grad_norm": 0.24584618210792542, + "learning_rate": 0.00012907746204748928, + "loss": 1.1165, + "step": 917 + }, + { + "epoch": 0.35674730399300497, + "grad_norm": 0.23146122694015503, + "learning_rate": 0.00012899961074347996, + "loss": 1.1111, + "step": 918 + }, + { + "epoch": 0.3571359176139124, + "grad_norm": 0.19983729720115662, + "learning_rate": 0.00012892175943947061, + "loss": 1.0674, + "step": 919 + }, + { + "epoch": 0.35752453123481975, + "grad_norm": 0.2161000818014145, + "learning_rate": 0.00012884390813546127, + "loss": 1.076, + "step": 920 + }, + { + "epoch": 0.35791314485572717, + "grad_norm": 0.21042793989181519, + "learning_rate": 0.00012876605683145192, + "loss": 1.0535, + "step": 921 + }, + { + "epoch": 0.3583017584766346, + "grad_norm": 0.20135439932346344, + "learning_rate": 
0.0001286882055274426, + "loss": 1.0059, + "step": 922 + }, + { + "epoch": 0.358690372097542, + "grad_norm": 0.19394971430301666, + "learning_rate": 0.00012861035422343325, + "loss": 1.0381, + "step": 923 + }, + { + "epoch": 0.35907898571844943, + "grad_norm": 0.21171030402183533, + "learning_rate": 0.0001285325029194239, + "loss": 1.0513, + "step": 924 + }, + { + "epoch": 0.35946759933935685, + "grad_norm": 0.19476690888404846, + "learning_rate": 0.00012845465161541458, + "loss": 1.0003, + "step": 925 + }, + { + "epoch": 0.35985621296026427, + "grad_norm": 0.20468670129776, + "learning_rate": 0.00012837680031140523, + "loss": 1.0608, + "step": 926 + }, + { + "epoch": 0.3602448265811717, + "grad_norm": 0.21159446239471436, + "learning_rate": 0.00012829894900739588, + "loss": 1.0734, + "step": 927 + }, + { + "epoch": 0.3606334402020791, + "grad_norm": 0.21179519593715668, + "learning_rate": 0.00012822109770338654, + "loss": 1.0957, + "step": 928 + }, + { + "epoch": 0.3610220538229865, + "grad_norm": 0.20997527241706848, + "learning_rate": 0.00012814324639937722, + "loss": 1.0644, + "step": 929 + }, + { + "epoch": 0.3614106674438939, + "grad_norm": 0.21178296208381653, + "learning_rate": 0.00012806539509536787, + "loss": 1.0208, + "step": 930 + }, + { + "epoch": 0.3617992810648013, + "grad_norm": 0.20890356600284576, + "learning_rate": 0.00012798754379135852, + "loss": 1.0888, + "step": 931 + }, + { + "epoch": 0.3621878946857087, + "grad_norm": 0.20177409052848816, + "learning_rate": 0.00012790969248734917, + "loss": 0.9741, + "step": 932 + }, + { + "epoch": 0.36257650830661614, + "grad_norm": 0.23504556715488434, + "learning_rate": 0.00012783184118333982, + "loss": 1.1048, + "step": 933 + }, + { + "epoch": 0.36296512192752356, + "grad_norm": 0.22829356789588928, + "learning_rate": 0.0001277539898793305, + "loss": 1.0798, + "step": 934 + }, + { + "epoch": 0.363353735548431, + "grad_norm": 0.2068483531475067, + "learning_rate": 0.00012767613857532116, + "loss": 
1.0452, + "step": 935 + }, + { + "epoch": 0.3637423491693384, + "grad_norm": 0.2093171775341034, + "learning_rate": 0.0001275982872713118, + "loss": 1.0742, + "step": 936 + }, + { + "epoch": 0.3641309627902458, + "grad_norm": 0.21478736400604248, + "learning_rate": 0.00012752043596730246, + "loss": 1.0572, + "step": 937 + }, + { + "epoch": 0.36451957641115323, + "grad_norm": 0.1906953752040863, + "learning_rate": 0.0001274425846632931, + "loss": 1.0107, + "step": 938 + }, + { + "epoch": 0.3649081900320606, + "grad_norm": 0.20580604672431946, + "learning_rate": 0.0001273647333592838, + "loss": 1.0677, + "step": 939 + }, + { + "epoch": 0.365296803652968, + "grad_norm": 0.22586850821971893, + "learning_rate": 0.00012728688205527444, + "loss": 1.0389, + "step": 940 + }, + { + "epoch": 0.36568541727387543, + "grad_norm": 0.199899360537529, + "learning_rate": 0.0001272090307512651, + "loss": 1.0462, + "step": 941 + }, + { + "epoch": 0.36607403089478285, + "grad_norm": 0.19881689548492432, + "learning_rate": 0.00012713117944725575, + "loss": 1.0565, + "step": 942 + }, + { + "epoch": 0.3664626445156903, + "grad_norm": 0.21748925745487213, + "learning_rate": 0.0001270533281432464, + "loss": 1.0659, + "step": 943 + }, + { + "epoch": 0.3668512581365977, + "grad_norm": 0.19363689422607422, + "learning_rate": 0.00012697547683923708, + "loss": 1.0307, + "step": 944 + }, + { + "epoch": 0.3672398717575051, + "grad_norm": 0.21701784431934357, + "learning_rate": 0.00012689762553522773, + "loss": 1.0684, + "step": 945 + }, + { + "epoch": 0.36762848537841253, + "grad_norm": 0.21406958997249603, + "learning_rate": 0.00012681977423121838, + "loss": 1.0703, + "step": 946 + }, + { + "epoch": 0.36801709899931995, + "grad_norm": 0.23539729416370392, + "learning_rate": 0.00012674192292720903, + "loss": 1.1537, + "step": 947 + }, + { + "epoch": 0.36840571262022737, + "grad_norm": 0.2177354395389557, + "learning_rate": 0.00012666407162319969, + "loss": 1.0131, + "step": 948 + }, + { + "epoch": 
0.36879432624113473, + "grad_norm": 0.255346417427063, + "learning_rate": 0.00012658622031919037, + "loss": 0.9807, + "step": 949 + }, + { + "epoch": 0.36918293986204215, + "grad_norm": 0.2139921486377716, + "learning_rate": 0.00012650836901518102, + "loss": 1.0392, + "step": 950 + }, + { + "epoch": 0.36957155348294957, + "grad_norm": 0.22490833699703217, + "learning_rate": 0.00012643051771117167, + "loss": 1.0512, + "step": 951 + }, + { + "epoch": 0.369960167103857, + "grad_norm": 0.20698820054531097, + "learning_rate": 0.00012635266640716232, + "loss": 1.0391, + "step": 952 + }, + { + "epoch": 0.3703487807247644, + "grad_norm": 0.2276201844215393, + "learning_rate": 0.00012627481510315297, + "loss": 1.0513, + "step": 953 + }, + { + "epoch": 0.3707373943456718, + "grad_norm": 0.2493600994348526, + "learning_rate": 0.00012619696379914365, + "loss": 1.0136, + "step": 954 + }, + { + "epoch": 0.37112600796657924, + "grad_norm": 0.2155001014471054, + "learning_rate": 0.0001261191124951343, + "loss": 1.0523, + "step": 955 + }, + { + "epoch": 0.37151462158748666, + "grad_norm": 0.21571211516857147, + "learning_rate": 0.00012604126119112496, + "loss": 1.0288, + "step": 956 + }, + { + "epoch": 0.3719032352083941, + "grad_norm": 0.23238877952098846, + "learning_rate": 0.0001259634098871156, + "loss": 1.0638, + "step": 957 + }, + { + "epoch": 0.37229184882930144, + "grad_norm": 0.2002813220024109, + "learning_rate": 0.00012588555858310626, + "loss": 0.9665, + "step": 958 + }, + { + "epoch": 0.37268046245020886, + "grad_norm": 0.21712858974933624, + "learning_rate": 0.0001258077072790969, + "loss": 1.0469, + "step": 959 + }, + { + "epoch": 0.3730690760711163, + "grad_norm": 0.2178192287683487, + "learning_rate": 0.0001257298559750876, + "loss": 1.0267, + "step": 960 + }, + { + "epoch": 0.3734576896920237, + "grad_norm": 0.25488024950027466, + "learning_rate": 0.00012565200467107824, + "loss": 1.0153, + "step": 961 + }, + { + "epoch": 0.3738463033129311, + "grad_norm": 
0.20070038735866547, + "learning_rate": 0.0001255741533670689, + "loss": 1.0279, + "step": 962 + }, + { + "epoch": 0.37423491693383854, + "grad_norm": 0.21885356307029724, + "learning_rate": 0.00012549630206305955, + "loss": 1.0395, + "step": 963 + }, + { + "epoch": 0.37462353055474595, + "grad_norm": 0.2407921701669693, + "learning_rate": 0.0001254184507590502, + "loss": 1.0767, + "step": 964 + }, + { + "epoch": 0.3750121441756534, + "grad_norm": 0.20645053684711456, + "learning_rate": 0.00012534059945504088, + "loss": 1.0318, + "step": 965 + }, + { + "epoch": 0.3754007577965608, + "grad_norm": 0.21275092661380768, + "learning_rate": 0.00012526274815103153, + "loss": 1.0546, + "step": 966 + }, + { + "epoch": 0.3757893714174682, + "grad_norm": 0.21574917435646057, + "learning_rate": 0.00012518489684702218, + "loss": 1.032, + "step": 967 + }, + { + "epoch": 0.3761779850383756, + "grad_norm": 0.21589480340480804, + "learning_rate": 0.00012510704554301284, + "loss": 1.0834, + "step": 968 + }, + { + "epoch": 0.376566598659283, + "grad_norm": 0.19576796889305115, + "learning_rate": 0.0001250291942390035, + "loss": 1.0178, + "step": 969 + }, + { + "epoch": 0.3769552122801904, + "grad_norm": 0.20941287279129028, + "learning_rate": 0.00012495134293499417, + "loss": 1.0712, + "step": 970 + }, + { + "epoch": 0.37734382590109783, + "grad_norm": 0.22585494816303253, + "learning_rate": 0.00012487349163098482, + "loss": 1.0401, + "step": 971 + }, + { + "epoch": 0.37773243952200525, + "grad_norm": 0.21093420684337616, + "learning_rate": 0.00012479564032697547, + "loss": 1.0569, + "step": 972 + }, + { + "epoch": 0.37812105314291267, + "grad_norm": 0.22375014424324036, + "learning_rate": 0.00012471778902296612, + "loss": 1.0687, + "step": 973 + }, + { + "epoch": 0.3785096667638201, + "grad_norm": 0.19787487387657166, + "learning_rate": 0.0001246399377189568, + "loss": 1.0266, + "step": 974 + }, + { + "epoch": 0.3788982803847275, + "grad_norm": 0.20633013546466827, + 
"learning_rate": 0.00012456208641494745, + "loss": 0.9996, + "step": 975 + }, + { + "epoch": 0.3792868940056349, + "grad_norm": 0.21559873223304749, + "learning_rate": 0.0001244842351109381, + "loss": 1.0851, + "step": 976 + }, + { + "epoch": 0.3796755076265423, + "grad_norm": 0.2166333943605423, + "learning_rate": 0.00012440638380692879, + "loss": 1.0859, + "step": 977 + }, + { + "epoch": 0.3800641212474497, + "grad_norm": 0.18558773398399353, + "learning_rate": 0.00012432853250291944, + "loss": 0.9534, + "step": 978 + }, + { + "epoch": 0.3804527348683571, + "grad_norm": 0.2086942344903946, + "learning_rate": 0.0001242506811989101, + "loss": 1.0786, + "step": 979 + }, + { + "epoch": 0.38084134848926454, + "grad_norm": 0.2207823544740677, + "learning_rate": 0.00012417282989490074, + "loss": 1.0626, + "step": 980 + }, + { + "epoch": 0.38122996211017196, + "grad_norm": 0.21255749464035034, + "learning_rate": 0.00012409497859089142, + "loss": 1.063, + "step": 981 + }, + { + "epoch": 0.3816185757310794, + "grad_norm": 0.20682042837142944, + "learning_rate": 0.00012401712728688207, + "loss": 1.034, + "step": 982 + }, + { + "epoch": 0.3820071893519868, + "grad_norm": 0.2084134966135025, + "learning_rate": 0.00012393927598287272, + "loss": 1.0481, + "step": 983 + }, + { + "epoch": 0.3823958029728942, + "grad_norm": 0.1922312080860138, + "learning_rate": 0.00012386142467886338, + "loss": 1.0461, + "step": 984 + }, + { + "epoch": 0.38278441659380164, + "grad_norm": 0.20893707871437073, + "learning_rate": 0.00012378357337485406, + "loss": 1.0797, + "step": 985 + }, + { + "epoch": 0.383173030214709, + "grad_norm": 0.19717541337013245, + "learning_rate": 0.0001237057220708447, + "loss": 1.0028, + "step": 986 + }, + { + "epoch": 0.3835616438356164, + "grad_norm": 0.20688053965568542, + "learning_rate": 0.00012362787076683536, + "loss": 0.989, + "step": 987 + }, + { + "epoch": 0.38395025745652384, + "grad_norm": 0.20580583810806274, + "learning_rate": 0.000123550019462826, + 
"loss": 1.06, + "step": 988 + }, + { + "epoch": 0.38433887107743125, + "grad_norm": 0.2151709794998169, + "learning_rate": 0.00012347216815881666, + "loss": 1.0685, + "step": 989 + }, + { + "epoch": 0.3847274846983387, + "grad_norm": 0.19573980569839478, + "learning_rate": 0.00012339431685480734, + "loss": 1.0072, + "step": 990 + }, + { + "epoch": 0.3851160983192461, + "grad_norm": 0.1949119120836258, + "learning_rate": 0.000123316465550798, + "loss": 0.9995, + "step": 991 + }, + { + "epoch": 0.3855047119401535, + "grad_norm": 0.2062375247478485, + "learning_rate": 0.00012323861424678865, + "loss": 1.0694, + "step": 992 + }, + { + "epoch": 0.38589332556106093, + "grad_norm": 0.2007209211587906, + "learning_rate": 0.0001231607629427793, + "loss": 1.0397, + "step": 993 + }, + { + "epoch": 0.38628193918196835, + "grad_norm": 0.2231544405221939, + "learning_rate": 0.00012308291163876995, + "loss": 1.0755, + "step": 994 + }, + { + "epoch": 0.38667055280287577, + "grad_norm": 0.2103337049484253, + "learning_rate": 0.0001230050603347606, + "loss": 1.0505, + "step": 995 + }, + { + "epoch": 0.38705916642378313, + "grad_norm": 0.20178386569023132, + "learning_rate": 0.00012292720903075128, + "loss": 1.0696, + "step": 996 + }, + { + "epoch": 0.38744778004469055, + "grad_norm": 0.21268007159233093, + "learning_rate": 0.00012284935772674193, + "loss": 1.0262, + "step": 997 + }, + { + "epoch": 0.38783639366559797, + "grad_norm": 0.21439722180366516, + "learning_rate": 0.0001227715064227326, + "loss": 1.0718, + "step": 998 + }, + { + "epoch": 0.3882250072865054, + "grad_norm": 0.19691336154937744, + "learning_rate": 0.00012269365511872324, + "loss": 0.9663, + "step": 999 + }, + { + "epoch": 0.3886136209074128, + "grad_norm": 0.2165926694869995, + "learning_rate": 0.0001226158038147139, + "loss": 1.0432, + "step": 1000 + }, + { + "epoch": 0.3890022345283202, + "grad_norm": 0.20730604231357574, + "learning_rate": 0.00012253795251070457, + "loss": 1.0386, + "step": 1001 + }, + { + 
"epoch": 0.38939084814922764, + "grad_norm": 0.2138068974018097, + "learning_rate": 0.00012246010120669522, + "loss": 1.0683, + "step": 1002 + }, + { + "epoch": 0.38977946177013506, + "grad_norm": 0.2118951678276062, + "learning_rate": 0.00012238224990268587, + "loss": 1.0393, + "step": 1003 + }, + { + "epoch": 0.3901680753910425, + "grad_norm": 0.20879961550235748, + "learning_rate": 0.00012230439859867653, + "loss": 1.0349, + "step": 1004 + }, + { + "epoch": 0.39055668901194984, + "grad_norm": 0.19588464498519897, + "learning_rate": 0.00012222654729466718, + "loss": 1.0226, + "step": 1005 + }, + { + "epoch": 0.39094530263285726, + "grad_norm": 0.2059485912322998, + "learning_rate": 0.00012214869599065786, + "loss": 1.052, + "step": 1006 + }, + { + "epoch": 0.3913339162537647, + "grad_norm": 0.2299761176109314, + "learning_rate": 0.0001220708446866485, + "loss": 1.1055, + "step": 1007 + }, + { + "epoch": 0.3917225298746721, + "grad_norm": 0.20196737349033356, + "learning_rate": 0.00012199299338263916, + "loss": 1.0497, + "step": 1008 + }, + { + "epoch": 0.3921111434955795, + "grad_norm": 0.20615293085575104, + "learning_rate": 0.00012191514207862981, + "loss": 1.047, + "step": 1009 + }, + { + "epoch": 0.39249975711648694, + "grad_norm": 0.20265278220176697, + "learning_rate": 0.00012183729077462047, + "loss": 1.0035, + "step": 1010 + }, + { + "epoch": 0.39288837073739435, + "grad_norm": 0.20197926461696625, + "learning_rate": 0.00012175943947061114, + "loss": 0.9847, + "step": 1011 + }, + { + "epoch": 0.3932769843583018, + "grad_norm": 0.19974152743816376, + "learning_rate": 0.0001216815881666018, + "loss": 1.0669, + "step": 1012 + }, + { + "epoch": 0.3936655979792092, + "grad_norm": 0.21684005856513977, + "learning_rate": 0.00012160373686259245, + "loss": 1.0562, + "step": 1013 + }, + { + "epoch": 0.3940542116001166, + "grad_norm": 0.2030404955148697, + "learning_rate": 0.00012152588555858311, + "loss": 1.0159, + "step": 1014 + }, + { + "epoch": 
0.394442825221024, + "grad_norm": 0.2123572677373886, + "learning_rate": 0.00012144803425457377, + "loss": 1.0757, + "step": 1015 + }, + { + "epoch": 0.3948314388419314, + "grad_norm": 0.20320011675357819, + "learning_rate": 0.00012137018295056443, + "loss": 1.038, + "step": 1016 + }, + { + "epoch": 0.3952200524628388, + "grad_norm": 0.20120739936828613, + "learning_rate": 0.00012129233164655508, + "loss": 1.1015, + "step": 1017 + }, + { + "epoch": 0.39560866608374623, + "grad_norm": 0.19862449169158936, + "learning_rate": 0.00012121448034254575, + "loss": 1.0328, + "step": 1018 + }, + { + "epoch": 0.39599727970465365, + "grad_norm": 0.19761312007904053, + "learning_rate": 0.0001211366290385364, + "loss": 0.997, + "step": 1019 + }, + { + "epoch": 0.39638589332556107, + "grad_norm": 0.1943569928407669, + "learning_rate": 0.00012105877773452705, + "loss": 1.0099, + "step": 1020 + }, + { + "epoch": 0.3967745069464685, + "grad_norm": 0.2109062373638153, + "learning_rate": 0.00012098092643051773, + "loss": 1.1039, + "step": 1021 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 0.20966266095638275, + "learning_rate": 0.00012090307512650839, + "loss": 1.1208, + "step": 1022 + }, + { + "epoch": 0.3975517341882833, + "grad_norm": 0.19208088517189026, + "learning_rate": 0.00012082522382249904, + "loss": 1.0147, + "step": 1023 + }, + { + "epoch": 0.3979403478091907, + "grad_norm": 0.21821236610412598, + "learning_rate": 0.00012074737251848969, + "loss": 1.0615, + "step": 1024 + }, + { + "epoch": 0.3983289614300981, + "grad_norm": 0.20031368732452393, + "learning_rate": 0.00012066952121448034, + "loss": 1.0303, + "step": 1025 + }, + { + "epoch": 0.3987175750510055, + "grad_norm": 0.22910597920417786, + "learning_rate": 0.00012059166991047102, + "loss": 1.0182, + "step": 1026 + }, + { + "epoch": 0.39910618867191294, + "grad_norm": 0.20816978812217712, + "learning_rate": 0.00012051381860646167, + "loss": 1.0142, + "step": 1027 + }, + { + "epoch": 0.39949480229282036, + 
"grad_norm": 0.20989780128002167, + "learning_rate": 0.00012043596730245232, + "loss": 1.0676, + "step": 1028 + }, + { + "epoch": 0.3998834159137278, + "grad_norm": 0.21894055604934692, + "learning_rate": 0.00012035811599844298, + "loss": 1.0222, + "step": 1029 + }, + { + "epoch": 0.4002720295346352, + "grad_norm": 0.2170870155096054, + "learning_rate": 0.00012028026469443363, + "loss": 1.0319, + "step": 1030 + }, + { + "epoch": 0.4006606431555426, + "grad_norm": 0.20869679749011993, + "learning_rate": 0.00012020241339042428, + "loss": 1.055, + "step": 1031 + }, + { + "epoch": 0.40104925677645004, + "grad_norm": 0.18850640952587128, + "learning_rate": 0.00012012456208641496, + "loss": 0.9993, + "step": 1032 + }, + { + "epoch": 0.40143787039735745, + "grad_norm": 0.21462580561637878, + "learning_rate": 0.00012004671078240561, + "loss": 1.0115, + "step": 1033 + }, + { + "epoch": 0.4018264840182648, + "grad_norm": 0.2008499950170517, + "learning_rate": 0.00011996885947839626, + "loss": 1.0229, + "step": 1034 + }, + { + "epoch": 0.40221509763917224, + "grad_norm": 0.20063354074954987, + "learning_rate": 0.00011989100817438692, + "loss": 1.0295, + "step": 1035 + }, + { + "epoch": 0.40260371126007966, + "grad_norm": 0.20655786991119385, + "learning_rate": 0.00011981315687037757, + "loss": 1.0044, + "step": 1036 + }, + { + "epoch": 0.4029923248809871, + "grad_norm": 0.1985999196767807, + "learning_rate": 0.00011973530556636825, + "loss": 1.0063, + "step": 1037 + }, + { + "epoch": 0.4033809385018945, + "grad_norm": 0.2039060890674591, + "learning_rate": 0.0001196574542623589, + "loss": 1.044, + "step": 1038 + }, + { + "epoch": 0.4037695521228019, + "grad_norm": 0.21838189661502838, + "learning_rate": 0.00011957960295834955, + "loss": 1.1101, + "step": 1039 + }, + { + "epoch": 0.40415816574370933, + "grad_norm": 0.21508415043354034, + "learning_rate": 0.00011950175165434022, + "loss": 1.0764, + "step": 1040 + }, + { + "epoch": 0.40454677936461675, + "grad_norm": 
0.2089119255542755, + "learning_rate": 0.00011942390035033087, + "loss": 0.9986, + "step": 1041 + }, + { + "epoch": 0.40493539298552417, + "grad_norm": 0.19859452545642853, + "learning_rate": 0.00011934604904632153, + "loss": 1.0122, + "step": 1042 + }, + { + "epoch": 0.40532400660643153, + "grad_norm": 0.2018653154373169, + "learning_rate": 0.00011926819774231219, + "loss": 1.0187, + "step": 1043 + }, + { + "epoch": 0.40571262022733895, + "grad_norm": 0.19892063736915588, + "learning_rate": 0.00011919034643830285, + "loss": 1.0029, + "step": 1044 + }, + { + "epoch": 0.40610123384824637, + "grad_norm": 0.20355650782585144, + "learning_rate": 0.0001191124951342935, + "loss": 1.0484, + "step": 1045 + }, + { + "epoch": 0.4064898474691538, + "grad_norm": 0.2033994495868683, + "learning_rate": 0.00011903464383028416, + "loss": 1.087, + "step": 1046 + }, + { + "epoch": 0.4068784610900612, + "grad_norm": 0.2047330141067505, + "learning_rate": 0.00011895679252627484, + "loss": 1.0774, + "step": 1047 + }, + { + "epoch": 0.4072670747109686, + "grad_norm": 0.21420112252235413, + "learning_rate": 0.00011887894122226549, + "loss": 1.0252, + "step": 1048 + }, + { + "epoch": 0.40765568833187604, + "grad_norm": 0.2030097395181656, + "learning_rate": 0.00011880108991825614, + "loss": 1.0501, + "step": 1049 + }, + { + "epoch": 0.40804430195278346, + "grad_norm": 0.2128026783466339, + "learning_rate": 0.00011872323861424679, + "loss": 1.1031, + "step": 1050 + }, + { + "epoch": 0.4084329155736909, + "grad_norm": 0.20724938809871674, + "learning_rate": 0.00011864538731023744, + "loss": 1.0327, + "step": 1051 + }, + { + "epoch": 0.40882152919459824, + "grad_norm": 0.20344072580337524, + "learning_rate": 0.00011856753600622812, + "loss": 1.0719, + "step": 1052 + }, + { + "epoch": 0.40921014281550566, + "grad_norm": 0.2145012468099594, + "learning_rate": 0.00011848968470221877, + "loss": 1.0582, + "step": 1053 + }, + { + "epoch": 0.4095987564364131, + "grad_norm": 0.220048725605011, + 
"learning_rate": 0.00011841183339820943, + "loss": 1.0825, + "step": 1054 + }, + { + "epoch": 0.4099873700573205, + "grad_norm": 0.19074465334415436, + "learning_rate": 0.00011833398209420008, + "loss": 0.9657, + "step": 1055 + }, + { + "epoch": 0.4103759836782279, + "grad_norm": 0.1958267241716385, + "learning_rate": 0.00011825613079019073, + "loss": 0.9864, + "step": 1056 + }, + { + "epoch": 0.41076459729913534, + "grad_norm": 0.21768233180046082, + "learning_rate": 0.00011817827948618141, + "loss": 0.9997, + "step": 1057 + }, + { + "epoch": 0.41115321092004276, + "grad_norm": 0.20218704640865326, + "learning_rate": 0.00011810042818217206, + "loss": 1.072, + "step": 1058 + }, + { + "epoch": 0.4115418245409502, + "grad_norm": 0.2035023719072342, + "learning_rate": 0.00011802257687816271, + "loss": 1.0415, + "step": 1059 + }, + { + "epoch": 0.4119304381618576, + "grad_norm": 0.22603970766067505, + "learning_rate": 0.00011794472557415337, + "loss": 1.0751, + "step": 1060 + }, + { + "epoch": 0.412319051782765, + "grad_norm": 0.2125842273235321, + "learning_rate": 0.00011786687427014402, + "loss": 1.0727, + "step": 1061 + }, + { + "epoch": 0.4127076654036724, + "grad_norm": 0.2005981206893921, + "learning_rate": 0.0001177890229661347, + "loss": 1.0191, + "step": 1062 + }, + { + "epoch": 0.4130962790245798, + "grad_norm": 0.22252701222896576, + "learning_rate": 0.00011771117166212535, + "loss": 1.0591, + "step": 1063 + }, + { + "epoch": 0.4134848926454872, + "grad_norm": 0.22205251455307007, + "learning_rate": 0.000117633320358116, + "loss": 1.1198, + "step": 1064 + }, + { + "epoch": 0.41387350626639463, + "grad_norm": 0.20037783682346344, + "learning_rate": 0.00011755546905410665, + "loss": 1.0548, + "step": 1065 + }, + { + "epoch": 0.41426211988730205, + "grad_norm": 0.21737834811210632, + "learning_rate": 0.00011747761775009732, + "loss": 1.0922, + "step": 1066 + }, + { + "epoch": 0.41465073350820947, + "grad_norm": 0.19312533736228943, + "learning_rate": 
0.00011739976644608798, + "loss": 0.9836, + "step": 1067 + }, + { + "epoch": 0.4150393471291169, + "grad_norm": 0.22055000066757202, + "learning_rate": 0.00011732191514207864, + "loss": 1.0383, + "step": 1068 + }, + { + "epoch": 0.4154279607500243, + "grad_norm": 0.22623857855796814, + "learning_rate": 0.0001172440638380693, + "loss": 1.0704, + "step": 1069 + }, + { + "epoch": 0.4158165743709317, + "grad_norm": 0.21481367945671082, + "learning_rate": 0.00011716621253405995, + "loss": 1.052, + "step": 1070 + }, + { + "epoch": 0.4162051879918391, + "grad_norm": 0.21022087335586548, + "learning_rate": 0.0001170883612300506, + "loss": 1.1021, + "step": 1071 + }, + { + "epoch": 0.4165938016127465, + "grad_norm": 0.2154620885848999, + "learning_rate": 0.00011701050992604126, + "loss": 1.0128, + "step": 1072 + }, + { + "epoch": 0.4169824152336539, + "grad_norm": 0.20545578002929688, + "learning_rate": 0.00011693265862203194, + "loss": 1.0058, + "step": 1073 + }, + { + "epoch": 0.41737102885456134, + "grad_norm": 0.21726195514202118, + "learning_rate": 0.00011685480731802259, + "loss": 1.0753, + "step": 1074 + }, + { + "epoch": 0.41775964247546876, + "grad_norm": 0.2067115604877472, + "learning_rate": 0.00011677695601401324, + "loss": 1.0594, + "step": 1075 + }, + { + "epoch": 0.4181482560963762, + "grad_norm": 0.23024648427963257, + "learning_rate": 0.0001166991047100039, + "loss": 1.1039, + "step": 1076 + }, + { + "epoch": 0.4185368697172836, + "grad_norm": 0.20692144334316254, + "learning_rate": 0.00011662125340599455, + "loss": 1.0598, + "step": 1077 + }, + { + "epoch": 0.418925483338191, + "grad_norm": 0.19839999079704285, + "learning_rate": 0.00011654340210198522, + "loss": 1.054, + "step": 1078 + }, + { + "epoch": 0.41931409695909844, + "grad_norm": 0.19227825105190277, + "learning_rate": 0.00011646555079797588, + "loss": 0.9453, + "step": 1079 + }, + { + "epoch": 0.41970271058000586, + "grad_norm": 0.2112567275762558, + "learning_rate": 0.00011638769949396653, + 
"loss": 1.023, + "step": 1080 + }, + { + "epoch": 0.4200913242009132, + "grad_norm": 0.185299351811409, + "learning_rate": 0.00011630984818995718, + "loss": 0.9752, + "step": 1081 + }, + { + "epoch": 0.42047993782182064, + "grad_norm": 0.20148858428001404, + "learning_rate": 0.00011623199688594783, + "loss": 1.0659, + "step": 1082 + }, + { + "epoch": 0.42086855144272806, + "grad_norm": 0.1935974359512329, + "learning_rate": 0.00011615414558193851, + "loss": 1.0116, + "step": 1083 + }, + { + "epoch": 0.4212571650636355, + "grad_norm": 0.20433953404426575, + "learning_rate": 0.00011607629427792916, + "loss": 1.0671, + "step": 1084 + }, + { + "epoch": 0.4216457786845429, + "grad_norm": 0.20729799568653107, + "learning_rate": 0.00011599844297391982, + "loss": 1.0341, + "step": 1085 + }, + { + "epoch": 0.4220343923054503, + "grad_norm": 0.2126002460718155, + "learning_rate": 0.00011592059166991047, + "loss": 1.0188, + "step": 1086 + }, + { + "epoch": 0.42242300592635773, + "grad_norm": 0.19453707337379456, + "learning_rate": 0.00011584274036590112, + "loss": 1.0331, + "step": 1087 + }, + { + "epoch": 0.42281161954726515, + "grad_norm": 0.20909856259822845, + "learning_rate": 0.0001157648890618918, + "loss": 0.9984, + "step": 1088 + }, + { + "epoch": 0.42320023316817257, + "grad_norm": 0.19596272706985474, + "learning_rate": 0.00011568703775788245, + "loss": 1.0121, + "step": 1089 + }, + { + "epoch": 0.42358884678907993, + "grad_norm": 0.22045716643333435, + "learning_rate": 0.0001156091864538731, + "loss": 1.0591, + "step": 1090 + }, + { + "epoch": 0.42397746040998735, + "grad_norm": 0.22624897956848145, + "learning_rate": 0.00011553133514986376, + "loss": 1.0565, + "step": 1091 + }, + { + "epoch": 0.42436607403089477, + "grad_norm": 0.20263417065143585, + "learning_rate": 0.00011545348384585442, + "loss": 1.024, + "step": 1092 + }, + { + "epoch": 0.4247546876518022, + "grad_norm": 0.20179417729377747, + "learning_rate": 0.00011537563254184509, + "loss": 0.9806, + 
"step": 1093 + }, + { + "epoch": 0.4251433012727096, + "grad_norm": 0.30221593379974365, + "learning_rate": 0.00011529778123783574, + "loss": 1.0683, + "step": 1094 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.21195146441459656, + "learning_rate": 0.0001152199299338264, + "loss": 1.1283, + "step": 1095 + }, + { + "epoch": 0.42592052851452444, + "grad_norm": 0.21860192716121674, + "learning_rate": 0.00011514207862981706, + "loss": 1.0046, + "step": 1096 + }, + { + "epoch": 0.42630914213543186, + "grad_norm": 0.2234150469303131, + "learning_rate": 0.00011506422732580771, + "loss": 1.0461, + "step": 1097 + }, + { + "epoch": 0.4266977557563393, + "grad_norm": 0.21535125374794006, + "learning_rate": 0.00011498637602179837, + "loss": 1.0593, + "step": 1098 + }, + { + "epoch": 0.4270863693772467, + "grad_norm": 0.19313789904117584, + "learning_rate": 0.00011490852471778904, + "loss": 1.0357, + "step": 1099 + }, + { + "epoch": 0.42747498299815406, + "grad_norm": 0.19886989891529083, + "learning_rate": 0.00011483067341377969, + "loss": 0.9946, + "step": 1100 + }, + { + "epoch": 0.4278635966190615, + "grad_norm": 0.21028490364551544, + "learning_rate": 0.00011475282210977034, + "loss": 1.0765, + "step": 1101 + }, + { + "epoch": 0.4282522102399689, + "grad_norm": 0.2066621333360672, + "learning_rate": 0.000114674970805761, + "loss": 1.0405, + "step": 1102 + }, + { + "epoch": 0.4286408238608763, + "grad_norm": 0.18400220572948456, + "learning_rate": 0.00011459711950175168, + "loss": 0.9404, + "step": 1103 + }, + { + "epoch": 0.42902943748178374, + "grad_norm": 0.2058599591255188, + "learning_rate": 0.00011451926819774233, + "loss": 1.0505, + "step": 1104 + }, + { + "epoch": 0.42941805110269116, + "grad_norm": 0.19696786999702454, + "learning_rate": 0.00011444141689373298, + "loss": 1.032, + "step": 1105 + }, + { + "epoch": 0.4298066647235986, + "grad_norm": 0.2082854062318802, + "learning_rate": 0.00011436356558972363, + "loss": 1.0914, + "step": 1106 + }, + { + 
"epoch": 0.430195278344506, + "grad_norm": 0.20155015587806702, + "learning_rate": 0.00011428571428571428, + "loss": 1.0541, + "step": 1107 + }, + { + "epoch": 0.4305838919654134, + "grad_norm": 0.23419982194900513, + "learning_rate": 0.00011420786298170494, + "loss": 1.0684, + "step": 1108 + }, + { + "epoch": 0.4309725055863208, + "grad_norm": 0.23493975400924683, + "learning_rate": 0.00011413001167769561, + "loss": 1.0509, + "step": 1109 + }, + { + "epoch": 0.4313611192072282, + "grad_norm": 0.2089843600988388, + "learning_rate": 0.00011405216037368627, + "loss": 1.0479, + "step": 1110 + }, + { + "epoch": 0.4317497328281356, + "grad_norm": 0.21076850593090057, + "learning_rate": 0.00011397430906967692, + "loss": 1.064, + "step": 1111 + }, + { + "epoch": 0.43213834644904303, + "grad_norm": 0.20307987928390503, + "learning_rate": 0.00011389645776566757, + "loss": 1.0416, + "step": 1112 + }, + { + "epoch": 0.43252696006995045, + "grad_norm": 0.20955562591552734, + "learning_rate": 0.00011381860646165822, + "loss": 1.0158, + "step": 1113 + }, + { + "epoch": 0.43291557369085787, + "grad_norm": 0.2074531465768814, + "learning_rate": 0.0001137407551576489, + "loss": 1.0486, + "step": 1114 + }, + { + "epoch": 0.4333041873117653, + "grad_norm": 0.20907235145568848, + "learning_rate": 0.00011366290385363955, + "loss": 1.0352, + "step": 1115 + }, + { + "epoch": 0.4336928009326727, + "grad_norm": 0.21726477146148682, + "learning_rate": 0.0001135850525496302, + "loss": 1.0068, + "step": 1116 + }, + { + "epoch": 0.4340814145535801, + "grad_norm": 0.20231984555721283, + "learning_rate": 0.00011350720124562086, + "loss": 0.9757, + "step": 1117 + }, + { + "epoch": 0.4344700281744875, + "grad_norm": 0.23485834896564484, + "learning_rate": 0.00011342934994161152, + "loss": 1.0681, + "step": 1118 + }, + { + "epoch": 0.4348586417953949, + "grad_norm": 0.21286556124687195, + "learning_rate": 0.00011335149863760219, + "loss": 1.0399, + "step": 1119 + }, + { + "epoch": 
0.4352472554163023, + "grad_norm": 0.2097872495651245, + "learning_rate": 0.00011327364733359284, + "loss": 1.0435, + "step": 1120 + }, + { + "epoch": 0.43563586903720974, + "grad_norm": 0.2224377542734146, + "learning_rate": 0.00011319579602958351, + "loss": 1.1664, + "step": 1121 + }, + { + "epoch": 0.43602448265811716, + "grad_norm": 0.19213411211967468, + "learning_rate": 0.00011311794472557416, + "loss": 1.0424, + "step": 1122 + }, + { + "epoch": 0.4364130962790246, + "grad_norm": 0.20974959433078766, + "learning_rate": 0.00011304009342156481, + "loss": 1.0943, + "step": 1123 + }, + { + "epoch": 0.436801709899932, + "grad_norm": 0.19943708181381226, + "learning_rate": 0.00011296224211755549, + "loss": 1.0652, + "step": 1124 + }, + { + "epoch": 0.4371903235208394, + "grad_norm": 0.1832750141620636, + "learning_rate": 0.00011288439081354614, + "loss": 0.9883, + "step": 1125 + }, + { + "epoch": 0.43757893714174684, + "grad_norm": 0.2205052226781845, + "learning_rate": 0.0001128065395095368, + "loss": 1.0733, + "step": 1126 + }, + { + "epoch": 0.43796755076265426, + "grad_norm": 0.2082854062318802, + "learning_rate": 0.00011272868820552745, + "loss": 1.0141, + "step": 1127 + }, + { + "epoch": 0.4383561643835616, + "grad_norm": 0.22755026817321777, + "learning_rate": 0.0001126508369015181, + "loss": 1.0942, + "step": 1128 + }, + { + "epoch": 0.43874477800446904, + "grad_norm": 0.2098863571882248, + "learning_rate": 0.00011257298559750878, + "loss": 0.9987, + "step": 1129 + }, + { + "epoch": 0.43913339162537646, + "grad_norm": 0.20559263229370117, + "learning_rate": 0.00011249513429349943, + "loss": 1.0345, + "step": 1130 + }, + { + "epoch": 0.4395220052462839, + "grad_norm": 0.21955084800720215, + "learning_rate": 0.00011241728298949008, + "loss": 1.1068, + "step": 1131 + }, + { + "epoch": 0.4399106188671913, + "grad_norm": 0.21353478729724884, + "learning_rate": 0.00011233943168548073, + "loss": 1.0094, + "step": 1132 + }, + { + "epoch": 0.4402992324880987, + 
"grad_norm": 0.19822491705417633, + "learning_rate": 0.00011226158038147139, + "loss": 0.9758, + "step": 1133 + }, + { + "epoch": 0.44068784610900613, + "grad_norm": 0.20079441368579865, + "learning_rate": 0.00011218372907746206, + "loss": 1.0202, + "step": 1134 + }, + { + "epoch": 0.44107645972991355, + "grad_norm": 0.2261926829814911, + "learning_rate": 0.00011210587777345272, + "loss": 0.9877, + "step": 1135 + }, + { + "epoch": 0.44146507335082097, + "grad_norm": 0.2264915257692337, + "learning_rate": 0.00011202802646944337, + "loss": 0.9887, + "step": 1136 + }, + { + "epoch": 0.44185368697172833, + "grad_norm": 0.21853779256343842, + "learning_rate": 0.00011195017516543402, + "loss": 1.0535, + "step": 1137 + }, + { + "epoch": 0.44224230059263575, + "grad_norm": 0.21332694590091705, + "learning_rate": 0.00011187232386142467, + "loss": 1.0824, + "step": 1138 + }, + { + "epoch": 0.44263091421354317, + "grad_norm": 0.21350236237049103, + "learning_rate": 0.00011179447255741535, + "loss": 1.0758, + "step": 1139 + }, + { + "epoch": 0.4430195278344506, + "grad_norm": 0.21305765211582184, + "learning_rate": 0.000111716621253406, + "loss": 1.035, + "step": 1140 + }, + { + "epoch": 0.443408141455358, + "grad_norm": 0.20486389100551605, + "learning_rate": 0.00011163876994939666, + "loss": 1.0413, + "step": 1141 + }, + { + "epoch": 0.4437967550762654, + "grad_norm": 0.19255472719669342, + "learning_rate": 0.00011156091864538731, + "loss": 0.9583, + "step": 1142 + }, + { + "epoch": 0.44418536869717284, + "grad_norm": 0.19824008643627167, + "learning_rate": 0.00011148306734137796, + "loss": 1.0331, + "step": 1143 + }, + { + "epoch": 0.44457398231808026, + "grad_norm": 0.20308080315589905, + "learning_rate": 0.00011140521603736863, + "loss": 1.0399, + "step": 1144 + }, + { + "epoch": 0.4449625959389877, + "grad_norm": 0.2193964123725891, + "learning_rate": 0.00011132736473335929, + "loss": 1.063, + "step": 1145 + }, + { + "epoch": 0.4453512095598951, + "grad_norm": 
0.2151576578617096, + "learning_rate": 0.00011124951342934994, + "loss": 1.0795, + "step": 1146 + }, + { + "epoch": 0.44573982318080246, + "grad_norm": 0.23056697845458984, + "learning_rate": 0.00011117166212534061, + "loss": 1.0351, + "step": 1147 + }, + { + "epoch": 0.4461284368017099, + "grad_norm": 0.1973094493150711, + "learning_rate": 0.00011109381082133126, + "loss": 0.9866, + "step": 1148 + }, + { + "epoch": 0.4465170504226173, + "grad_norm": 0.2119562178850174, + "learning_rate": 0.00011101595951732191, + "loss": 1.0591, + "step": 1149 + }, + { + "epoch": 0.4469056640435247, + "grad_norm": 0.20407763123512268, + "learning_rate": 0.00011093810821331259, + "loss": 0.988, + "step": 1150 + }, + { + "epoch": 0.44729427766443214, + "grad_norm": 0.19474107027053833, + "learning_rate": 0.00011086025690930324, + "loss": 0.9729, + "step": 1151 + }, + { + "epoch": 0.44768289128533956, + "grad_norm": 0.2179928421974182, + "learning_rate": 0.0001107824056052939, + "loss": 1.0558, + "step": 1152 + }, + { + "epoch": 0.448071504906247, + "grad_norm": 0.44306451082229614, + "learning_rate": 0.00011070455430128455, + "loss": 1.0901, + "step": 1153 + }, + { + "epoch": 0.4484601185271544, + "grad_norm": 0.22060540318489075, + "learning_rate": 0.0001106267029972752, + "loss": 1.0009, + "step": 1154 + }, + { + "epoch": 0.4488487321480618, + "grad_norm": 0.20534972846508026, + "learning_rate": 0.00011054885169326588, + "loss": 0.9741, + "step": 1155 + }, + { + "epoch": 0.4492373457689692, + "grad_norm": 0.19488993287086487, + "learning_rate": 0.00011047100038925653, + "loss": 1.0, + "step": 1156 + }, + { + "epoch": 0.4496259593898766, + "grad_norm": 0.20462395250797272, + "learning_rate": 0.00011039314908524718, + "loss": 1.0309, + "step": 1157 + }, + { + "epoch": 0.450014573010784, + "grad_norm": 0.2170749306678772, + "learning_rate": 0.00011031529778123784, + "loss": 1.0726, + "step": 1158 + }, + { + "epoch": 0.45040318663169143, + "grad_norm": 0.2066730111837387, + 
"learning_rate": 0.00011023744647722849, + "loss": 1.0227, + "step": 1159 + }, + { + "epoch": 0.45079180025259885, + "grad_norm": 0.20625676214694977, + "learning_rate": 0.00011015959517321917, + "loss": 1.0287, + "step": 1160 + }, + { + "epoch": 0.45118041387350627, + "grad_norm": 0.19483047723770142, + "learning_rate": 0.00011008174386920982, + "loss": 0.9639, + "step": 1161 + }, + { + "epoch": 0.4515690274944137, + "grad_norm": 0.24705417454242706, + "learning_rate": 0.00011000389256520047, + "loss": 0.9903, + "step": 1162 + }, + { + "epoch": 0.4519576411153211, + "grad_norm": 0.2109205424785614, + "learning_rate": 0.00010992604126119112, + "loss": 1.054, + "step": 1163 + }, + { + "epoch": 0.4523462547362285, + "grad_norm": 0.20904991030693054, + "learning_rate": 0.00010984818995718178, + "loss": 1.0416, + "step": 1164 + }, + { + "epoch": 0.45273486835713594, + "grad_norm": 0.19841328263282776, + "learning_rate": 0.00010977033865317245, + "loss": 0.9986, + "step": 1165 + }, + { + "epoch": 0.4531234819780433, + "grad_norm": 0.20545506477355957, + "learning_rate": 0.0001096924873491631, + "loss": 1.0337, + "step": 1166 + }, + { + "epoch": 0.4535120955989507, + "grad_norm": 0.208644837141037, + "learning_rate": 0.00010961463604515376, + "loss": 1.0304, + "step": 1167 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 0.2111911028623581, + "learning_rate": 0.00010953678474114441, + "loss": 1.0398, + "step": 1168 + }, + { + "epoch": 0.45428932284076556, + "grad_norm": 0.2600184381008148, + "learning_rate": 0.00010945893343713506, + "loss": 1.0509, + "step": 1169 + }, + { + "epoch": 0.454677936461673, + "grad_norm": 0.2059030532836914, + "learning_rate": 0.00010938108213312574, + "loss": 0.9347, + "step": 1170 + }, + { + "epoch": 0.4550665500825804, + "grad_norm": 0.19232551753520966, + "learning_rate": 0.0001093032308291164, + "loss": 1.0162, + "step": 1171 + }, + { + "epoch": 0.4554551637034878, + "grad_norm": 0.19147330522537231, + "learning_rate": 
0.00010922537952510705, + "loss": 0.9872, + "step": 1172 + }, + { + "epoch": 0.45584377732439524, + "grad_norm": 0.2599676251411438, + "learning_rate": 0.00010914752822109771, + "loss": 1.0402, + "step": 1173 + }, + { + "epoch": 0.45623239094530266, + "grad_norm": 0.2159397304058075, + "learning_rate": 0.00010906967691708836, + "loss": 1.0411, + "step": 1174 + }, + { + "epoch": 0.45662100456621, + "grad_norm": 0.23864266276359558, + "learning_rate": 0.00010899182561307903, + "loss": 1.054, + "step": 1175 + }, + { + "epoch": 0.45700961818711744, + "grad_norm": 0.2027217596769333, + "learning_rate": 0.0001089139743090697, + "loss": 0.9713, + "step": 1176 + }, + { + "epoch": 0.45739823180802486, + "grad_norm": 0.1837588995695114, + "learning_rate": 0.00010883612300506035, + "loss": 0.9698, + "step": 1177 + }, + { + "epoch": 0.4577868454289323, + "grad_norm": 0.20038527250289917, + "learning_rate": 0.000108758271701051, + "loss": 1.0456, + "step": 1178 + }, + { + "epoch": 0.4581754590498397, + "grad_norm": 0.21525044739246368, + "learning_rate": 0.00010868042039704165, + "loss": 1.021, + "step": 1179 + }, + { + "epoch": 0.4585640726707471, + "grad_norm": 0.18813730776309967, + "learning_rate": 0.0001086025690930323, + "loss": 0.9673, + "step": 1180 + }, + { + "epoch": 0.45895268629165453, + "grad_norm": 0.2056179642677307, + "learning_rate": 0.00010852471778902298, + "loss": 1.0119, + "step": 1181 + }, + { + "epoch": 0.45934129991256195, + "grad_norm": 0.21599683165550232, + "learning_rate": 0.00010844686648501363, + "loss": 1.0537, + "step": 1182 + }, + { + "epoch": 0.45972991353346937, + "grad_norm": 0.19750265777111053, + "learning_rate": 0.00010836901518100429, + "loss": 1.0203, + "step": 1183 + }, + { + "epoch": 0.4601185271543768, + "grad_norm": 0.22186161577701569, + "learning_rate": 0.00010829116387699494, + "loss": 1.0583, + "step": 1184 + }, + { + "epoch": 0.46050714077528415, + "grad_norm": 0.2109905481338501, + "learning_rate": 0.00010821331257298559, + 
"loss": 1.0022, + "step": 1185 + }, + { + "epoch": 0.46089575439619157, + "grad_norm": 0.2032858431339264, + "learning_rate": 0.00010813546126897627, + "loss": 0.9774, + "step": 1186 + }, + { + "epoch": 0.461284368017099, + "grad_norm": 0.20381197333335876, + "learning_rate": 0.00010805760996496692, + "loss": 0.9768, + "step": 1187 + }, + { + "epoch": 0.4616729816380064, + "grad_norm": 0.20488987863063812, + "learning_rate": 0.00010797975866095757, + "loss": 1.0448, + "step": 1188 + }, + { + "epoch": 0.4620615952589138, + "grad_norm": 0.20257477462291718, + "learning_rate": 0.00010790190735694823, + "loss": 1.0157, + "step": 1189 + }, + { + "epoch": 0.46245020887982125, + "grad_norm": 0.20761239528656006, + "learning_rate": 0.00010782405605293888, + "loss": 1.0328, + "step": 1190 + }, + { + "epoch": 0.46283882250072866, + "grad_norm": 0.22062581777572632, + "learning_rate": 0.00010774620474892956, + "loss": 1.0362, + "step": 1191 + }, + { + "epoch": 0.4632274361216361, + "grad_norm": 0.19970272481441498, + "learning_rate": 0.00010766835344492021, + "loss": 1.0783, + "step": 1192 + }, + { + "epoch": 0.4636160497425435, + "grad_norm": 0.2221893072128296, + "learning_rate": 0.00010759050214091086, + "loss": 1.0136, + "step": 1193 + }, + { + "epoch": 0.46400466336345086, + "grad_norm": 0.2124665081501007, + "learning_rate": 0.00010751265083690151, + "loss": 1.0528, + "step": 1194 + }, + { + "epoch": 0.4643932769843583, + "grad_norm": 0.2001204937696457, + "learning_rate": 0.00010743479953289218, + "loss": 1.0495, + "step": 1195 + }, + { + "epoch": 0.4647818906052657, + "grad_norm": 0.20979635417461395, + "learning_rate": 0.00010735694822888284, + "loss": 1.0664, + "step": 1196 + }, + { + "epoch": 0.4651705042261731, + "grad_norm": 0.190982848405838, + "learning_rate": 0.0001072790969248735, + "loss": 1.0256, + "step": 1197 + }, + { + "epoch": 0.46555911784708054, + "grad_norm": 0.19910745322704315, + "learning_rate": 0.00010720124562086415, + "loss": 1.0263, + "step": 
1198 + }, + { + "epoch": 0.46594773146798796, + "grad_norm": 0.21624085307121277, + "learning_rate": 0.00010712339431685481, + "loss": 1.0768, + "step": 1199 + }, + { + "epoch": 0.4663363450888954, + "grad_norm": 0.20857703685760498, + "learning_rate": 0.00010704554301284547, + "loss": 1.0892, + "step": 1200 + }, + { + "epoch": 0.4667249587098028, + "grad_norm": 0.21897061169147491, + "learning_rate": 0.00010696769170883613, + "loss": 1.0873, + "step": 1201 + }, + { + "epoch": 0.4671135723307102, + "grad_norm": 0.1943386346101761, + "learning_rate": 0.0001068898404048268, + "loss": 1.0116, + "step": 1202 + }, + { + "epoch": 0.4675021859516176, + "grad_norm": 0.22607874870300293, + "learning_rate": 0.00010681198910081745, + "loss": 1.0328, + "step": 1203 + }, + { + "epoch": 0.467890799572525, + "grad_norm": 0.1898999959230423, + "learning_rate": 0.0001067341377968081, + "loss": 0.9791, + "step": 1204 + }, + { + "epoch": 0.4682794131934324, + "grad_norm": 0.2193334400653839, + "learning_rate": 0.00010665628649279875, + "loss": 1.0742, + "step": 1205 + }, + { + "epoch": 0.46866802681433983, + "grad_norm": 0.2096349149942398, + "learning_rate": 0.00010657843518878943, + "loss": 1.0683, + "step": 1206 + }, + { + "epoch": 0.46905664043524725, + "grad_norm": 0.2040576934814453, + "learning_rate": 0.00010650058388478008, + "loss": 1.0516, + "step": 1207 + }, + { + "epoch": 0.46944525405615467, + "grad_norm": 0.20619645714759827, + "learning_rate": 0.00010642273258077074, + "loss": 1.0429, + "step": 1208 + }, + { + "epoch": 0.4698338676770621, + "grad_norm": 0.19753660261631012, + "learning_rate": 0.00010634488127676139, + "loss": 1.0268, + "step": 1209 + }, + { + "epoch": 0.4702224812979695, + "grad_norm": 0.2201426476240158, + "learning_rate": 0.00010626702997275204, + "loss": 1.0879, + "step": 1210 + }, + { + "epoch": 0.4706110949188769, + "grad_norm": 0.21307805180549622, + "learning_rate": 0.00010618917866874272, + "loss": 1.0186, + "step": 1211 + }, + { + "epoch": 
0.47099970853978435, + "grad_norm": 0.21142373979091644, + "learning_rate": 0.00010611132736473337, + "loss": 1.0417, + "step": 1212 + }, + { + "epoch": 0.4713883221606917, + "grad_norm": 0.20523706078529358, + "learning_rate": 0.00010603347606072402, + "loss": 1.0372, + "step": 1213 + }, + { + "epoch": 0.4717769357815991, + "grad_norm": 0.19843094050884247, + "learning_rate": 0.00010595562475671468, + "loss": 1.0062, + "step": 1214 + }, + { + "epoch": 0.47216554940250655, + "grad_norm": 0.2146739959716797, + "learning_rate": 0.00010587777345270533, + "loss": 1.0528, + "step": 1215 + }, + { + "epoch": 0.47255416302341396, + "grad_norm": 0.2136303037405014, + "learning_rate": 0.00010579992214869601, + "loss": 1.0521, + "step": 1216 + }, + { + "epoch": 0.4729427766443214, + "grad_norm": 0.21379397809505463, + "learning_rate": 0.00010572207084468666, + "loss": 1.0362, + "step": 1217 + }, + { + "epoch": 0.4733313902652288, + "grad_norm": 0.20459088683128357, + "learning_rate": 0.00010564421954067731, + "loss": 1.0455, + "step": 1218 + }, + { + "epoch": 0.4737200038861362, + "grad_norm": 0.20667988061904907, + "learning_rate": 0.00010556636823666796, + "loss": 1.0284, + "step": 1219 + }, + { + "epoch": 0.47410861750704364, + "grad_norm": 0.21820449829101562, + "learning_rate": 0.00010548851693265862, + "loss": 1.0584, + "step": 1220 + }, + { + "epoch": 0.47449723112795106, + "grad_norm": 0.19705156981945038, + "learning_rate": 0.00010541066562864928, + "loss": 1.004, + "step": 1221 + }, + { + "epoch": 0.4748858447488584, + "grad_norm": 0.19806528091430664, + "learning_rate": 0.00010533281432463995, + "loss": 1.0519, + "step": 1222 + }, + { + "epoch": 0.47527445836976584, + "grad_norm": 0.2006833702325821, + "learning_rate": 0.0001052549630206306, + "loss": 1.0119, + "step": 1223 + }, + { + "epoch": 0.47566307199067326, + "grad_norm": 0.21757058799266815, + "learning_rate": 0.00010517711171662125, + "loss": 1.0961, + "step": 1224 + }, + { + "epoch": 0.4760516856115807, + 
"grad_norm": 0.2015775889158249, + "learning_rate": 0.00010509926041261192, + "loss": 1.0419, + "step": 1225 + }, + { + "epoch": 0.4764402992324881, + "grad_norm": 0.19691923260688782, + "learning_rate": 0.00010502140910860257, + "loss": 1.0555, + "step": 1226 + }, + { + "epoch": 0.4768289128533955, + "grad_norm": 0.19924800097942352, + "learning_rate": 0.00010494355780459323, + "loss": 1.0106, + "step": 1227 + }, + { + "epoch": 0.47721752647430293, + "grad_norm": 0.21416346728801727, + "learning_rate": 0.0001048657065005839, + "loss": 1.0741, + "step": 1228 + }, + { + "epoch": 0.47760614009521035, + "grad_norm": 0.21823547780513763, + "learning_rate": 0.00010478785519657455, + "loss": 1.023, + "step": 1229 + }, + { + "epoch": 0.47799475371611777, + "grad_norm": 0.2083735466003418, + "learning_rate": 0.0001047100038925652, + "loss": 1.0424, + "step": 1230 + }, + { + "epoch": 0.4783833673370252, + "grad_norm": 0.2219141572713852, + "learning_rate": 0.00010463215258855586, + "loss": 1.0839, + "step": 1231 + }, + { + "epoch": 0.47877198095793255, + "grad_norm": 0.21334600448608398, + "learning_rate": 0.00010455430128454653, + "loss": 0.9888, + "step": 1232 + }, + { + "epoch": 0.47916059457883997, + "grad_norm": 0.2140086442232132, + "learning_rate": 0.00010447644998053719, + "loss": 1.0119, + "step": 1233 + }, + { + "epoch": 0.4795492081997474, + "grad_norm": 0.25360551476478577, + "learning_rate": 0.00010439859867652784, + "loss": 1.0026, + "step": 1234 + }, + { + "epoch": 0.4799378218206548, + "grad_norm": 0.20200380682945251, + "learning_rate": 0.00010432074737251849, + "loss": 1.0, + "step": 1235 + }, + { + "epoch": 0.4803264354415622, + "grad_norm": 0.22641289234161377, + "learning_rate": 0.00010424289606850914, + "loss": 1.1022, + "step": 1236 + }, + { + "epoch": 0.48071504906246965, + "grad_norm": 0.20538561046123505, + "learning_rate": 0.00010416504476449982, + "loss": 0.9847, + "step": 1237 + }, + { + "epoch": 0.48110366268337706, + "grad_norm": 
0.206883504986763, + "learning_rate": 0.00010408719346049047, + "loss": 1.0152, + "step": 1238 + }, + { + "epoch": 0.4814922763042845, + "grad_norm": 0.21584320068359375, + "learning_rate": 0.00010400934215648113, + "loss": 1.0361, + "step": 1239 + }, + { + "epoch": 0.4818808899251919, + "grad_norm": 0.20963703095912933, + "learning_rate": 0.00010393149085247178, + "loss": 1.0814, + "step": 1240 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 0.1965872198343277, + "learning_rate": 0.00010385363954846243, + "loss": 1.0365, + "step": 1241 + }, + { + "epoch": 0.4826581171670067, + "grad_norm": 0.2030191719532013, + "learning_rate": 0.00010377578824445311, + "loss": 1.0374, + "step": 1242 + }, + { + "epoch": 0.4830467307879141, + "grad_norm": 0.21448804438114166, + "learning_rate": 0.00010369793694044376, + "loss": 0.9686, + "step": 1243 + }, + { + "epoch": 0.4834353444088215, + "grad_norm": 0.2181752622127533, + "learning_rate": 0.00010362008563643441, + "loss": 1.0812, + "step": 1244 + }, + { + "epoch": 0.48382395802972894, + "grad_norm": 0.19887101650238037, + "learning_rate": 0.00010354223433242507, + "loss": 1.036, + "step": 1245 + }, + { + "epoch": 0.48421257165063636, + "grad_norm": 0.19007287919521332, + "learning_rate": 0.00010346438302841572, + "loss": 1.0292, + "step": 1246 + }, + { + "epoch": 0.4846011852715438, + "grad_norm": 0.21390347182750702, + "learning_rate": 0.0001033865317244064, + "loss": 1.0284, + "step": 1247 + }, + { + "epoch": 0.4849897988924512, + "grad_norm": 0.23822663724422455, + "learning_rate": 0.00010330868042039705, + "loss": 1.1044, + "step": 1248 + }, + { + "epoch": 0.4853784125133586, + "grad_norm": 0.20779070258140564, + "learning_rate": 0.0001032308291163877, + "loss": 1.0475, + "step": 1249 + }, + { + "epoch": 0.48576702613426603, + "grad_norm": 0.19232134521007538, + "learning_rate": 0.00010315297781237835, + "loss": 0.9945, + "step": 1250 + }, + { + "epoch": 0.4861556397551734, + "grad_norm": 0.22378556430339813, + 
"learning_rate": 0.00010307512650836902, + "loss": 1.0462, + "step": 1251 + }, + { + "epoch": 0.4865442533760808, + "grad_norm": 0.22156798839569092, + "learning_rate": 0.00010299727520435968, + "loss": 1.051, + "step": 1252 + }, + { + "epoch": 0.48693286699698823, + "grad_norm": 0.19885733723640442, + "learning_rate": 0.00010291942390035034, + "loss": 1.0593, + "step": 1253 + }, + { + "epoch": 0.48732148061789565, + "grad_norm": 0.2172418236732483, + "learning_rate": 0.000102841572596341, + "loss": 1.0513, + "step": 1254 + }, + { + "epoch": 0.48771009423880307, + "grad_norm": 0.22136956453323364, + "learning_rate": 0.00010276372129233165, + "loss": 1.0438, + "step": 1255 + }, + { + "epoch": 0.4880987078597105, + "grad_norm": 0.21337302029132843, + "learning_rate": 0.0001026858699883223, + "loss": 1.0551, + "step": 1256 + }, + { + "epoch": 0.4884873214806179, + "grad_norm": 0.21376267075538635, + "learning_rate": 0.00010260801868431296, + "loss": 1.054, + "step": 1257 + }, + { + "epoch": 0.4888759351015253, + "grad_norm": 0.19498860836029053, + "learning_rate": 0.00010253016738030364, + "loss": 1.0045, + "step": 1258 + }, + { + "epoch": 0.48926454872243275, + "grad_norm": 0.22354961931705475, + "learning_rate": 0.00010245231607629429, + "loss": 1.096, + "step": 1259 + }, + { + "epoch": 0.4896531623433401, + "grad_norm": 0.2078939527273178, + "learning_rate": 0.00010237446477228494, + "loss": 1.0102, + "step": 1260 + }, + { + "epoch": 0.49004177596424753, + "grad_norm": 0.20992495119571686, + "learning_rate": 0.00010229661346827559, + "loss": 0.9814, + "step": 1261 + }, + { + "epoch": 0.49043038958515495, + "grad_norm": 0.2178875207901001, + "learning_rate": 0.00010221876216426625, + "loss": 1.0489, + "step": 1262 + }, + { + "epoch": 0.49081900320606237, + "grad_norm": 0.22152946889400482, + "learning_rate": 0.00010214091086025692, + "loss": 1.0808, + "step": 1263 + }, + { + "epoch": 0.4912076168269698, + "grad_norm": 0.21179009974002838, + "learning_rate": 
0.00010206305955624758, + "loss": 1.0323, + "step": 1264 + }, + { + "epoch": 0.4915962304478772, + "grad_norm": 0.2126997411251068, + "learning_rate": 0.00010198520825223823, + "loss": 1.0093, + "step": 1265 + }, + { + "epoch": 0.4919848440687846, + "grad_norm": 0.20912809669971466, + "learning_rate": 0.00010190735694822888, + "loss": 1.0343, + "step": 1266 + }, + { + "epoch": 0.49237345768969204, + "grad_norm": 0.2231636494398117, + "learning_rate": 0.00010182950564421953, + "loss": 1.0587, + "step": 1267 + }, + { + "epoch": 0.49276207131059946, + "grad_norm": 0.1954583376646042, + "learning_rate": 0.00010175165434021021, + "loss": 0.9566, + "step": 1268 + }, + { + "epoch": 0.4931506849315068, + "grad_norm": 0.20520909130573273, + "learning_rate": 0.00010167380303620086, + "loss": 1.024, + "step": 1269 + }, + { + "epoch": 0.49353929855241424, + "grad_norm": 0.21736180782318115, + "learning_rate": 0.00010159595173219152, + "loss": 1.0434, + "step": 1270 + }, + { + "epoch": 0.49392791217332166, + "grad_norm": 0.2360561490058899, + "learning_rate": 0.00010151810042818217, + "loss": 1.114, + "step": 1271 + }, + { + "epoch": 0.4943165257942291, + "grad_norm": 0.20595967769622803, + "learning_rate": 0.00010144024912417282, + "loss": 0.9909, + "step": 1272 + }, + { + "epoch": 0.4947051394151365, + "grad_norm": 0.2161860466003418, + "learning_rate": 0.0001013623978201635, + "loss": 1.0536, + "step": 1273 + }, + { + "epoch": 0.4950937530360439, + "grad_norm": 0.19852355122566223, + "learning_rate": 0.00010128454651615415, + "loss": 1.0001, + "step": 1274 + }, + { + "epoch": 0.49548236665695133, + "grad_norm": 0.21081402897834778, + "learning_rate": 0.0001012066952121448, + "loss": 1.0151, + "step": 1275 + }, + { + "epoch": 0.49587098027785875, + "grad_norm": 0.2053362876176834, + "learning_rate": 0.00010112884390813547, + "loss": 1.018, + "step": 1276 + }, + { + "epoch": 0.49625959389876617, + "grad_norm": 0.21205593645572662, + "learning_rate": 0.00010105099260412612, + 
"loss": 0.9912, + "step": 1277 + }, + { + "epoch": 0.4966482075196736, + "grad_norm": 0.2005016952753067, + "learning_rate": 0.00010097314130011679, + "loss": 1.0069, + "step": 1278 + }, + { + "epoch": 0.49703682114058095, + "grad_norm": 0.21688181161880493, + "learning_rate": 0.00010089528999610744, + "loss": 1.0364, + "step": 1279 + }, + { + "epoch": 0.49742543476148837, + "grad_norm": 0.20582237839698792, + "learning_rate": 0.0001008174386920981, + "loss": 1.0138, + "step": 1280 + }, + { + "epoch": 0.4978140483823958, + "grad_norm": 0.20824448764324188, + "learning_rate": 0.00010073958738808876, + "loss": 0.9941, + "step": 1281 + }, + { + "epoch": 0.4982026620033032, + "grad_norm": 0.20749075710773468, + "learning_rate": 0.00010066173608407941, + "loss": 1.0478, + "step": 1282 + }, + { + "epoch": 0.49859127562421063, + "grad_norm": 0.20012183487415314, + "learning_rate": 0.00010058388478007009, + "loss": 0.995, + "step": 1283 + }, + { + "epoch": 0.49897988924511805, + "grad_norm": 0.20275959372520447, + "learning_rate": 0.00010050603347606074, + "loss": 1.097, + "step": 1284 + }, + { + "epoch": 0.49936850286602547, + "grad_norm": 0.19588243961334229, + "learning_rate": 0.00010042818217205139, + "loss": 1.0, + "step": 1285 + }, + { + "epoch": 0.4997571164869329, + "grad_norm": 0.20693185925483704, + "learning_rate": 0.00010035033086804204, + "loss": 1.0527, + "step": 1286 + }, + { + "epoch": 0.5001457301078402, + "grad_norm": 0.20330573618412018, + "learning_rate": 0.0001002724795640327, + "loss": 1.0137, + "step": 1287 + }, + { + "epoch": 0.5005343437287477, + "grad_norm": 0.19123876094818115, + "learning_rate": 0.00010019462826002337, + "loss": 0.9688, + "step": 1288 + }, + { + "epoch": 0.5009229573496551, + "grad_norm": 0.2184276431798935, + "learning_rate": 0.00010011677695601403, + "loss": 1.0367, + "step": 1289 + }, + { + "epoch": 0.5013115709705626, + "grad_norm": 0.21642108261585236, + "learning_rate": 0.00010003892565200468, + "loss": 1.102, + "step": 
1290 + }, + { + "epoch": 0.5017001845914699, + "grad_norm": 0.20351074635982513, + "learning_rate": 9.996107434799533e-05, + "loss": 1.0327, + "step": 1291 + }, + { + "epoch": 0.5020887982123774, + "grad_norm": 0.22771553695201874, + "learning_rate": 9.9883223043986e-05, + "loss": 1.104, + "step": 1292 + }, + { + "epoch": 0.5024774118332848, + "grad_norm": 0.2271403968334198, + "learning_rate": 9.980537173997665e-05, + "loss": 1.1313, + "step": 1293 + }, + { + "epoch": 0.5028660254541921, + "grad_norm": 0.2157830148935318, + "learning_rate": 9.97275204359673e-05, + "loss": 1.0203, + "step": 1294 + }, + { + "epoch": 0.5032546390750996, + "grad_norm": 0.19555307924747467, + "learning_rate": 9.964966913195797e-05, + "loss": 1.0194, + "step": 1295 + }, + { + "epoch": 0.503643252696007, + "grad_norm": 0.1898549199104309, + "learning_rate": 9.957181782794862e-05, + "loss": 1.0034, + "step": 1296 + }, + { + "epoch": 0.5040318663169144, + "grad_norm": 0.23555906116962433, + "learning_rate": 9.949396652393928e-05, + "loss": 1.0298, + "step": 1297 + }, + { + "epoch": 0.5044204799378218, + "grad_norm": 0.20434850454330444, + "learning_rate": 9.941611521992994e-05, + "loss": 0.9999, + "step": 1298 + }, + { + "epoch": 0.5048090935587293, + "grad_norm": 0.21015289425849915, + "learning_rate": 9.933826391592059e-05, + "loss": 1.006, + "step": 1299 + }, + { + "epoch": 0.5051977071796366, + "grad_norm": 0.21147851645946503, + "learning_rate": 9.926041261191125e-05, + "loss": 1.0854, + "step": 1300 + }, + { + "epoch": 0.5055863208005441, + "grad_norm": 0.19666944444179535, + "learning_rate": 9.91825613079019e-05, + "loss": 1.0057, + "step": 1301 + }, + { + "epoch": 0.5059749344214515, + "grad_norm": 0.21233728528022766, + "learning_rate": 9.910471000389257e-05, + "loss": 1.0675, + "step": 1302 + }, + { + "epoch": 0.5063635480423588, + "grad_norm": 0.21905581653118134, + "learning_rate": 9.902685869988322e-05, + "loss": 1.0054, + "step": 1303 + }, + { + "epoch": 0.5067521616632663, + 
"grad_norm": 0.23434993624687195, + "learning_rate": 9.894900739587389e-05, + "loss": 0.9915, + "step": 1304 + }, + { + "epoch": 0.5071407752841737, + "grad_norm": 0.21684227883815765, + "learning_rate": 9.887115609186454e-05, + "loss": 1.1131, + "step": 1305 + }, + { + "epoch": 0.5075293889050811, + "grad_norm": 0.21699552237987518, + "learning_rate": 9.87933047878552e-05, + "loss": 1.0782, + "step": 1306 + }, + { + "epoch": 0.5079180025259885, + "grad_norm": 0.2218221127986908, + "learning_rate": 9.871545348384586e-05, + "loss": 1.0388, + "step": 1307 + }, + { + "epoch": 0.508306616146896, + "grad_norm": 0.20104359090328217, + "learning_rate": 9.863760217983652e-05, + "loss": 1.0336, + "step": 1308 + }, + { + "epoch": 0.5086952297678033, + "grad_norm": 0.21907050907611847, + "learning_rate": 9.855975087582718e-05, + "loss": 1.0587, + "step": 1309 + }, + { + "epoch": 0.5090838433887108, + "grad_norm": 0.2140391767024994, + "learning_rate": 9.848189957181784e-05, + "loss": 1.0351, + "step": 1310 + }, + { + "epoch": 0.5094724570096182, + "grad_norm": 0.33287563920021057, + "learning_rate": 9.84040482678085e-05, + "loss": 0.9908, + "step": 1311 + }, + { + "epoch": 0.5098610706305255, + "grad_norm": 0.2706705927848816, + "learning_rate": 9.832619696379915e-05, + "loss": 1.0078, + "step": 1312 + }, + { + "epoch": 0.510249684251433, + "grad_norm": 0.20216278731822968, + "learning_rate": 9.824834565978981e-05, + "loss": 1.0253, + "step": 1313 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.20736576616764069, + "learning_rate": 9.817049435578046e-05, + "loss": 1.0217, + "step": 1314 + }, + { + "epoch": 0.5110269114932479, + "grad_norm": 0.2275344580411911, + "learning_rate": 9.809264305177113e-05, + "loss": 1.0139, + "step": 1315 + }, + { + "epoch": 0.5114155251141552, + "grad_norm": 0.22243620455265045, + "learning_rate": 9.801479174776178e-05, + "loss": 1.0427, + "step": 1316 + }, + { + "epoch": 0.5118041387350627, + "grad_norm": 0.198841854929924, + 
"learning_rate": 9.793694044375243e-05, + "loss": 1.0231, + "step": 1317 + }, + { + "epoch": 0.5121927523559701, + "grad_norm": 0.2031068503856659, + "learning_rate": 9.78590891397431e-05, + "loss": 1.0184, + "step": 1318 + }, + { + "epoch": 0.5125813659768775, + "grad_norm": 0.21712587773799896, + "learning_rate": 9.778123783573375e-05, + "loss": 1.0205, + "step": 1319 + }, + { + "epoch": 0.5129699795977849, + "grad_norm": 0.19366060197353363, + "learning_rate": 9.77033865317244e-05, + "loss": 0.9623, + "step": 1320 + }, + { + "epoch": 0.5133585932186923, + "grad_norm": 0.19845952093601227, + "learning_rate": 9.762553522771507e-05, + "loss": 1.0209, + "step": 1321 + }, + { + "epoch": 0.5137472068395997, + "grad_norm": 0.19700276851654053, + "learning_rate": 9.754768392370572e-05, + "loss": 0.9506, + "step": 1322 + }, + { + "epoch": 0.5141358204605071, + "grad_norm": 0.19797460734844208, + "learning_rate": 9.746983261969639e-05, + "loss": 1.0928, + "step": 1323 + }, + { + "epoch": 0.5145244340814146, + "grad_norm": 0.20470699667930603, + "learning_rate": 9.739198131568704e-05, + "loss": 1.0835, + "step": 1324 + }, + { + "epoch": 0.5149130477023219, + "grad_norm": 0.19121742248535156, + "learning_rate": 9.731413001167769e-05, + "loss": 0.9877, + "step": 1325 + }, + { + "epoch": 0.5153016613232294, + "grad_norm": 0.20026616752147675, + "learning_rate": 9.723627870766836e-05, + "loss": 1.0094, + "step": 1326 + }, + { + "epoch": 0.5156902749441368, + "grad_norm": 0.2214539796113968, + "learning_rate": 9.715842740365901e-05, + "loss": 0.9867, + "step": 1327 + }, + { + "epoch": 0.5160788885650442, + "grad_norm": 0.22674603760242462, + "learning_rate": 9.708057609964967e-05, + "loss": 1.0738, + "step": 1328 + }, + { + "epoch": 0.5164675021859516, + "grad_norm": 0.21274834871292114, + "learning_rate": 9.700272479564033e-05, + "loss": 1.0458, + "step": 1329 + }, + { + "epoch": 0.5168561158068591, + "grad_norm": 0.20305052399635315, + "learning_rate": 9.692487349163099e-05, 
+ "loss": 1.0041, + "step": 1330 + }, + { + "epoch": 0.5172447294277664, + "grad_norm": 0.1840772181749344, + "learning_rate": 9.684702218762166e-05, + "loss": 0.9498, + "step": 1331 + }, + { + "epoch": 0.5176333430486738, + "grad_norm": 0.2055782824754715, + "learning_rate": 9.676917088361231e-05, + "loss": 1.0223, + "step": 1332 + }, + { + "epoch": 0.5180219566695813, + "grad_norm": 0.21826402842998505, + "learning_rate": 9.669131957960297e-05, + "loss": 1.1068, + "step": 1333 + }, + { + "epoch": 0.5184105702904886, + "grad_norm": 0.22516922652721405, + "learning_rate": 9.661346827559363e-05, + "loss": 1.0957, + "step": 1334 + }, + { + "epoch": 0.5187991839113961, + "grad_norm": 0.21044284105300903, + "learning_rate": 9.653561697158428e-05, + "loss": 1.0384, + "step": 1335 + }, + { + "epoch": 0.5191877975323035, + "grad_norm": 0.20275571942329407, + "learning_rate": 9.645776566757494e-05, + "loss": 0.9978, + "step": 1336 + }, + { + "epoch": 0.519576411153211, + "grad_norm": 0.2077122926712036, + "learning_rate": 9.63799143635656e-05, + "loss": 1.0418, + "step": 1337 + }, + { + "epoch": 0.5199650247741183, + "grad_norm": 0.19158867001533508, + "learning_rate": 9.630206305955625e-05, + "loss": 1.0527, + "step": 1338 + }, + { + "epoch": 0.5203536383950258, + "grad_norm": 0.1932496577501297, + "learning_rate": 9.622421175554691e-05, + "loss": 1.0039, + "step": 1339 + }, + { + "epoch": 0.5207422520159332, + "grad_norm": 0.21937766671180725, + "learning_rate": 9.614636045153757e-05, + "loss": 1.0373, + "step": 1340 + }, + { + "epoch": 0.5211308656368405, + "grad_norm": 0.2268432229757309, + "learning_rate": 9.606850914752823e-05, + "loss": 1.0815, + "step": 1341 + }, + { + "epoch": 0.521519479257748, + "grad_norm": 0.2147454470396042, + "learning_rate": 9.599065784351888e-05, + "loss": 1.0331, + "step": 1342 + }, + { + "epoch": 0.5219080928786554, + "grad_norm": 0.19899709522724152, + "learning_rate": 9.591280653950954e-05, + "loss": 1.032, + "step": 1343 + }, + { + 
"epoch": 0.5222967064995628, + "grad_norm": 0.19646069407463074, + "learning_rate": 9.58349552355002e-05, + "loss": 0.9788, + "step": 1344 + }, + { + "epoch": 0.5226853201204702, + "grad_norm": 0.2146075963973999, + "learning_rate": 9.575710393149085e-05, + "loss": 1.0201, + "step": 1345 + }, + { + "epoch": 0.5230739337413777, + "grad_norm": 0.1968650370836258, + "learning_rate": 9.567925262748152e-05, + "loss": 0.9894, + "step": 1346 + }, + { + "epoch": 0.523462547362285, + "grad_norm": 0.21111296117305756, + "learning_rate": 9.560140132347217e-05, + "loss": 1.0961, + "step": 1347 + }, + { + "epoch": 0.5238511609831925, + "grad_norm": 0.20917272567749023, + "learning_rate": 9.552355001946282e-05, + "loss": 1.0435, + "step": 1348 + }, + { + "epoch": 0.5242397746040999, + "grad_norm": 0.2029752880334854, + "learning_rate": 9.544569871545349e-05, + "loss": 1.0328, + "step": 1349 + }, + { + "epoch": 0.5246283882250072, + "grad_norm": 0.20726613700389862, + "learning_rate": 9.536784741144414e-05, + "loss": 1.0465, + "step": 1350 + }, + { + "epoch": 0.5250170018459147, + "grad_norm": 0.19778740406036377, + "learning_rate": 9.52899961074348e-05, + "loss": 1.0058, + "step": 1351 + }, + { + "epoch": 0.5254056154668221, + "grad_norm": 0.19958540797233582, + "learning_rate": 9.521214480342546e-05, + "loss": 1.0164, + "step": 1352 + }, + { + "epoch": 0.5257942290877295, + "grad_norm": 0.2151395082473755, + "learning_rate": 9.513429349941611e-05, + "loss": 1.0703, + "step": 1353 + }, + { + "epoch": 0.5261828427086369, + "grad_norm": 0.2366979569196701, + "learning_rate": 9.505644219540678e-05, + "loss": 0.9832, + "step": 1354 + }, + { + "epoch": 0.5265714563295444, + "grad_norm": 0.22064165771007538, + "learning_rate": 9.497859089139743e-05, + "loss": 1.0181, + "step": 1355 + }, + { + "epoch": 0.5269600699504517, + "grad_norm": 0.20221936702728271, + "learning_rate": 9.49007395873881e-05, + "loss": 1.0424, + "step": 1356 + }, + { + "epoch": 0.5273486835713592, + "grad_norm": 
0.19608759880065918, + "learning_rate": 9.482288828337876e-05, + "loss": 1.0074, + "step": 1357 + }, + { + "epoch": 0.5277372971922666, + "grad_norm": 0.20686689019203186, + "learning_rate": 9.474503697936941e-05, + "loss": 1.0213, + "step": 1358 + }, + { + "epoch": 0.528125910813174, + "grad_norm": 0.223610520362854, + "learning_rate": 9.466718567536008e-05, + "loss": 1.05, + "step": 1359 + }, + { + "epoch": 0.5285145244340814, + "grad_norm": 0.2135966569185257, + "learning_rate": 9.458933437135073e-05, + "loss": 1.034, + "step": 1360 + }, + { + "epoch": 0.5289031380549888, + "grad_norm": 0.1933239996433258, + "learning_rate": 9.451148306734138e-05, + "loss": 0.9883, + "step": 1361 + }, + { + "epoch": 0.5292917516758963, + "grad_norm": 0.20794694125652313, + "learning_rate": 9.443363176333205e-05, + "loss": 1.0103, + "step": 1362 + }, + { + "epoch": 0.5296803652968036, + "grad_norm": 0.20128493010997772, + "learning_rate": 9.43557804593227e-05, + "loss": 1.015, + "step": 1363 + }, + { + "epoch": 0.5300689789177111, + "grad_norm": 0.2128933072090149, + "learning_rate": 9.427792915531336e-05, + "loss": 1.0038, + "step": 1364 + }, + { + "epoch": 0.5304575925386185, + "grad_norm": 0.2046983689069748, + "learning_rate": 9.420007785130402e-05, + "loss": 0.9948, + "step": 1365 + }, + { + "epoch": 0.5308462061595259, + "grad_norm": 0.20909680426120758, + "learning_rate": 9.412222654729467e-05, + "loss": 1.0308, + "step": 1366 + }, + { + "epoch": 0.5312348197804333, + "grad_norm": 0.2182164192199707, + "learning_rate": 9.404437524328533e-05, + "loss": 1.0018, + "step": 1367 + }, + { + "epoch": 0.5316234334013407, + "grad_norm": 0.2107028216123581, + "learning_rate": 9.396652393927599e-05, + "loss": 1.0419, + "step": 1368 + }, + { + "epoch": 0.5320120470222481, + "grad_norm": 0.24631445109844208, + "learning_rate": 9.388867263526665e-05, + "loss": 1.0171, + "step": 1369 + }, + { + "epoch": 0.5324006606431555, + "grad_norm": 0.20331013202667236, + "learning_rate": 
9.38108213312573e-05, + "loss": 1.0592, + "step": 1370 + }, + { + "epoch": 0.532789274264063, + "grad_norm": 0.19266058504581451, + "learning_rate": 9.373297002724796e-05, + "loss": 0.9912, + "step": 1371 + }, + { + "epoch": 0.5331778878849703, + "grad_norm": 0.22874227166175842, + "learning_rate": 9.365511872323862e-05, + "loss": 1.0533, + "step": 1372 + }, + { + "epoch": 0.5335665015058778, + "grad_norm": 0.2088235765695572, + "learning_rate": 9.357726741922927e-05, + "loss": 1.0464, + "step": 1373 + }, + { + "epoch": 0.5339551151267852, + "grad_norm": 0.2112397700548172, + "learning_rate": 9.349941611521994e-05, + "loss": 1.0503, + "step": 1374 + }, + { + "epoch": 0.5343437287476926, + "grad_norm": 0.20712170004844666, + "learning_rate": 9.342156481121059e-05, + "loss": 1.0237, + "step": 1375 + }, + { + "epoch": 0.5347323423686, + "grad_norm": 0.20077116787433624, + "learning_rate": 9.334371350720124e-05, + "loss": 1.0467, + "step": 1376 + }, + { + "epoch": 0.5351209559895075, + "grad_norm": 0.20394501090049744, + "learning_rate": 9.326586220319191e-05, + "loss": 1.0054, + "step": 1377 + }, + { + "epoch": 0.5355095696104148, + "grad_norm": 0.19459395110607147, + "learning_rate": 9.318801089918256e-05, + "loss": 0.9792, + "step": 1378 + }, + { + "epoch": 0.5358981832313222, + "grad_norm": 0.2116049826145172, + "learning_rate": 9.311015959517321e-05, + "loss": 1.0345, + "step": 1379 + }, + { + "epoch": 0.5362867968522297, + "grad_norm": 0.21672269701957703, + "learning_rate": 9.303230829116388e-05, + "loss": 1.0709, + "step": 1380 + }, + { + "epoch": 0.536675410473137, + "grad_norm": 0.20358407497406006, + "learning_rate": 9.295445698715453e-05, + "loss": 1.0534, + "step": 1381 + }, + { + "epoch": 0.5370640240940445, + "grad_norm": 0.19512853026390076, + "learning_rate": 9.28766056831452e-05, + "loss": 0.9397, + "step": 1382 + }, + { + "epoch": 0.5374526377149519, + "grad_norm": 0.2140122503042221, + "learning_rate": 9.279875437913586e-05, + "loss": 1.0164, + 
"step": 1383 + }, + { + "epoch": 0.5378412513358594, + "grad_norm": 0.20486049354076385, + "learning_rate": 9.272090307512651e-05, + "loss": 0.9892, + "step": 1384 + }, + { + "epoch": 0.5382298649567667, + "grad_norm": 0.20023222267627716, + "learning_rate": 9.264305177111718e-05, + "loss": 1.0019, + "step": 1385 + }, + { + "epoch": 0.5386184785776742, + "grad_norm": 0.20024439692497253, + "learning_rate": 9.256520046710783e-05, + "loss": 0.9717, + "step": 1386 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 0.21021386981010437, + "learning_rate": 9.24873491630985e-05, + "loss": 1.028, + "step": 1387 + }, + { + "epoch": 0.5393957058194889, + "grad_norm": 0.18508704006671906, + "learning_rate": 9.240949785908915e-05, + "loss": 1.0008, + "step": 1388 + }, + { + "epoch": 0.5397843194403964, + "grad_norm": 0.19351208209991455, + "learning_rate": 9.23316465550798e-05, + "loss": 0.9898, + "step": 1389 + }, + { + "epoch": 0.5401729330613038, + "grad_norm": 0.20341919362545013, + "learning_rate": 9.225379525107047e-05, + "loss": 1.0203, + "step": 1390 + }, + { + "epoch": 0.5405615466822112, + "grad_norm": 0.1942797303199768, + "learning_rate": 9.217594394706112e-05, + "loss": 1.003, + "step": 1391 + }, + { + "epoch": 0.5409501603031186, + "grad_norm": 0.2056138813495636, + "learning_rate": 9.209809264305178e-05, + "loss": 1.0149, + "step": 1392 + }, + { + "epoch": 0.5413387739240261, + "grad_norm": 0.21572062373161316, + "learning_rate": 9.202024133904244e-05, + "loss": 0.9808, + "step": 1393 + }, + { + "epoch": 0.5417273875449334, + "grad_norm": 0.19841499626636505, + "learning_rate": 9.194239003503309e-05, + "loss": 1.0467, + "step": 1394 + }, + { + "epoch": 0.5421160011658409, + "grad_norm": 0.20452147722244263, + "learning_rate": 9.186453873102375e-05, + "loss": 1.0378, + "step": 1395 + }, + { + "epoch": 0.5425046147867483, + "grad_norm": 0.2090451419353485, + "learning_rate": 9.17866874270144e-05, + "loss": 1.0823, + "step": 1396 + }, + { + "epoch": 
0.5428932284076556, + "grad_norm": 0.215814009308815, + "learning_rate": 9.170883612300506e-05, + "loss": 1.0994, + "step": 1397 + }, + { + "epoch": 0.5432818420285631, + "grad_norm": 0.19924724102020264, + "learning_rate": 9.163098481899572e-05, + "loss": 1.0099, + "step": 1398 + }, + { + "epoch": 0.5436704556494705, + "grad_norm": 0.20074865221977234, + "learning_rate": 9.155313351498638e-05, + "loss": 1.0163, + "step": 1399 + }, + { + "epoch": 0.544059069270378, + "grad_norm": 0.21737203001976013, + "learning_rate": 9.147528221097704e-05, + "loss": 1.0527, + "step": 1400 + }, + { + "epoch": 0.5444476828912853, + "grad_norm": 0.2036885768175125, + "learning_rate": 9.139743090696769e-05, + "loss": 1.0208, + "step": 1401 + }, + { + "epoch": 0.5448362965121928, + "grad_norm": 0.20861585438251495, + "learning_rate": 9.131957960295835e-05, + "loss": 1.0175, + "step": 1402 + }, + { + "epoch": 0.5452249101331001, + "grad_norm": 0.23425570130348206, + "learning_rate": 9.124172829894901e-05, + "loss": 1.053, + "step": 1403 + }, + { + "epoch": 0.5456135237540076, + "grad_norm": 0.20389291644096375, + "learning_rate": 9.116387699493966e-05, + "loss": 1.0479, + "step": 1404 + }, + { + "epoch": 0.546002137374915, + "grad_norm": 0.20166678726673126, + "learning_rate": 9.108602569093033e-05, + "loss": 1.0064, + "step": 1405 + }, + { + "epoch": 0.5463907509958223, + "grad_norm": 0.21419203281402588, + "learning_rate": 9.100817438692098e-05, + "loss": 1.0122, + "step": 1406 + }, + { + "epoch": 0.5467793646167298, + "grad_norm": 0.20541758835315704, + "learning_rate": 9.093032308291165e-05, + "loss": 1.0355, + "step": 1407 + }, + { + "epoch": 0.5471679782376372, + "grad_norm": 0.21865367889404297, + "learning_rate": 9.08524717789023e-05, + "loss": 1.0201, + "step": 1408 + }, + { + "epoch": 0.5475565918585447, + "grad_norm": 0.21181468665599823, + "learning_rate": 9.077462047489296e-05, + "loss": 1.0501, + "step": 1409 + }, + { + "epoch": 0.547945205479452, + "grad_norm": 
0.21016767621040344, + "learning_rate": 9.069676917088362e-05, + "loss": 1.0452, + "step": 1410 + }, + { + "epoch": 0.5483338191003595, + "grad_norm": 0.21119755506515503, + "learning_rate": 9.061891786687428e-05, + "loss": 1.0935, + "step": 1411 + }, + { + "epoch": 0.5487224327212669, + "grad_norm": 0.20688095688819885, + "learning_rate": 9.054106656286493e-05, + "loss": 1.0526, + "step": 1412 + }, + { + "epoch": 0.5491110463421743, + "grad_norm": 0.21857528388500214, + "learning_rate": 9.04632152588556e-05, + "loss": 1.0067, + "step": 1413 + }, + { + "epoch": 0.5494996599630817, + "grad_norm": 0.2196548581123352, + "learning_rate": 9.038536395484625e-05, + "loss": 1.0263, + "step": 1414 + }, + { + "epoch": 0.5498882735839892, + "grad_norm": 0.21952040493488312, + "learning_rate": 9.03075126508369e-05, + "loss": 1.0009, + "step": 1415 + }, + { + "epoch": 0.5502768872048965, + "grad_norm": 0.20059294998645782, + "learning_rate": 9.022966134682757e-05, + "loss": 1.0481, + "step": 1416 + }, + { + "epoch": 0.5506655008258039, + "grad_norm": 0.1960824728012085, + "learning_rate": 9.015181004281822e-05, + "loss": 1.0003, + "step": 1417 + }, + { + "epoch": 0.5510541144467114, + "grad_norm": 0.19051724672317505, + "learning_rate": 9.007395873880889e-05, + "loss": 0.9556, + "step": 1418 + }, + { + "epoch": 0.5514427280676187, + "grad_norm": 0.21008028090000153, + "learning_rate": 8.999610743479954e-05, + "loss": 1.0457, + "step": 1419 + }, + { + "epoch": 0.5518313416885262, + "grad_norm": 0.21465444564819336, + "learning_rate": 8.991825613079019e-05, + "loss": 1.0196, + "step": 1420 + }, + { + "epoch": 0.5522199553094336, + "grad_norm": 0.2062770277261734, + "learning_rate": 8.984040482678086e-05, + "loss": 1.0501, + "step": 1421 + }, + { + "epoch": 0.552608568930341, + "grad_norm": 0.21400012075901031, + "learning_rate": 8.976255352277151e-05, + "loss": 1.0711, + "step": 1422 + }, + { + "epoch": 0.5529971825512484, + "grad_norm": 0.19617624580860138, + "learning_rate": 
8.968470221876217e-05, + "loss": 0.9858, + "step": 1423 + }, + { + "epoch": 0.5533857961721559, + "grad_norm": 0.20835624635219574, + "learning_rate": 8.960685091475283e-05, + "loss": 1.0122, + "step": 1424 + }, + { + "epoch": 0.5537744097930632, + "grad_norm": 0.21708111464977264, + "learning_rate": 8.952899961074348e-05, + "loss": 1.0108, + "step": 1425 + }, + { + "epoch": 0.5541630234139706, + "grad_norm": 0.20877864956855774, + "learning_rate": 8.945114830673414e-05, + "loss": 1.0389, + "step": 1426 + }, + { + "epoch": 0.5545516370348781, + "grad_norm": 0.1924441158771515, + "learning_rate": 8.93732970027248e-05, + "loss": 1.0088, + "step": 1427 + }, + { + "epoch": 0.5549402506557854, + "grad_norm": 0.20288826525211334, + "learning_rate": 8.929544569871546e-05, + "loss": 1.0296, + "step": 1428 + }, + { + "epoch": 0.5553288642766929, + "grad_norm": 0.2008143663406372, + "learning_rate": 8.921759439470611e-05, + "loss": 1.0521, + "step": 1429 + }, + { + "epoch": 0.5557174778976003, + "grad_norm": 0.24407047033309937, + "learning_rate": 8.913974309069677e-05, + "loss": 1.1038, + "step": 1430 + }, + { + "epoch": 0.5561060915185078, + "grad_norm": 0.2172536998987198, + "learning_rate": 8.906189178668743e-05, + "loss": 1.0811, + "step": 1431 + }, + { + "epoch": 0.5564947051394151, + "grad_norm": 0.21712054312229156, + "learning_rate": 8.898404048267808e-05, + "loss": 1.0642, + "step": 1432 + }, + { + "epoch": 0.5568833187603226, + "grad_norm": 0.22482797503471375, + "learning_rate": 8.890618917866875e-05, + "loss": 1.0742, + "step": 1433 + }, + { + "epoch": 0.55727193238123, + "grad_norm": 0.1974876970052719, + "learning_rate": 8.88283378746594e-05, + "loss": 0.9954, + "step": 1434 + }, + { + "epoch": 0.5576605460021373, + "grad_norm": 0.19162166118621826, + "learning_rate": 8.875048657065007e-05, + "loss": 1.0074, + "step": 1435 + }, + { + "epoch": 0.5580491596230448, + "grad_norm": 0.20439045131206512, + "learning_rate": 8.867263526664072e-05, + "loss": 1.026, + 
"step": 1436 + }, + { + "epoch": 0.5584377732439522, + "grad_norm": 0.1947651207447052, + "learning_rate": 8.859478396263138e-05, + "loss": 0.9848, + "step": 1437 + }, + { + "epoch": 0.5588263868648596, + "grad_norm": 0.21434316039085388, + "learning_rate": 8.851693265862204e-05, + "loss": 1.0843, + "step": 1438 + }, + { + "epoch": 0.559215000485767, + "grad_norm": 1.3314417600631714, + "learning_rate": 8.84390813546127e-05, + "loss": 1.0356, + "step": 1439 + }, + { + "epoch": 0.5596036141066745, + "grad_norm": 0.20131289958953857, + "learning_rate": 8.836123005060335e-05, + "loss": 1.0214, + "step": 1440 + }, + { + "epoch": 0.5599922277275818, + "grad_norm": 0.21596461534500122, + "learning_rate": 8.828337874659402e-05, + "loss": 1.0962, + "step": 1441 + }, + { + "epoch": 0.5603808413484893, + "grad_norm": 0.20477193593978882, + "learning_rate": 8.820552744258467e-05, + "loss": 1.0643, + "step": 1442 + }, + { + "epoch": 0.5607694549693967, + "grad_norm": 0.1978107988834381, + "learning_rate": 8.812767613857532e-05, + "loss": 1.0054, + "step": 1443 + }, + { + "epoch": 0.561158068590304, + "grad_norm": 0.219422847032547, + "learning_rate": 8.804982483456599e-05, + "loss": 1.0009, + "step": 1444 + }, + { + "epoch": 0.5615466822112115, + "grad_norm": 0.21489015221595764, + "learning_rate": 8.797197353055664e-05, + "loss": 1.052, + "step": 1445 + }, + { + "epoch": 0.5619352958321189, + "grad_norm": 0.2235930860042572, + "learning_rate": 8.78941222265473e-05, + "loss": 1.037, + "step": 1446 + }, + { + "epoch": 0.5623239094530263, + "grad_norm": 0.19922038912773132, + "learning_rate": 8.781627092253796e-05, + "loss": 1.0006, + "step": 1447 + }, + { + "epoch": 0.5627125230739337, + "grad_norm": 0.24740247428417206, + "learning_rate": 8.773841961852861e-05, + "loss": 1.0753, + "step": 1448 + }, + { + "epoch": 0.5631011366948412, + "grad_norm": 0.2148803174495697, + "learning_rate": 8.766056831451928e-05, + "loss": 1.0712, + "step": 1449 + }, + { + "epoch": 
0.5634897503157485, + "grad_norm": 0.19838745892047882, + "learning_rate": 8.758271701050993e-05, + "loss": 1.027, + "step": 1450 + }, + { + "epoch": 0.563878363936656, + "grad_norm": 0.20328201353549957, + "learning_rate": 8.750486570650058e-05, + "loss": 1.0117, + "step": 1451 + }, + { + "epoch": 0.5642669775575634, + "grad_norm": 0.21230114996433258, + "learning_rate": 8.742701440249125e-05, + "loss": 1.0658, + "step": 1452 + }, + { + "epoch": 0.5646555911784708, + "grad_norm": 0.2030259519815445, + "learning_rate": 8.73491630984819e-05, + "loss": 1.0002, + "step": 1453 + }, + { + "epoch": 0.5650442047993782, + "grad_norm": 0.21404659748077393, + "learning_rate": 8.727131179447256e-05, + "loss": 1.0572, + "step": 1454 + }, + { + "epoch": 0.5654328184202856, + "grad_norm": 0.2148464322090149, + "learning_rate": 8.719346049046322e-05, + "loss": 1.0164, + "step": 1455 + }, + { + "epoch": 0.5658214320411931, + "grad_norm": 0.22083118557929993, + "learning_rate": 8.711560918645387e-05, + "loss": 0.9704, + "step": 1456 + }, + { + "epoch": 0.5662100456621004, + "grad_norm": 0.19305935502052307, + "learning_rate": 8.703775788244453e-05, + "loss": 1.0034, + "step": 1457 + }, + { + "epoch": 0.5665986592830079, + "grad_norm": 0.2100098729133606, + "learning_rate": 8.695990657843518e-05, + "loss": 1.0907, + "step": 1458 + }, + { + "epoch": 0.5669872729039153, + "grad_norm": 0.18947799503803253, + "learning_rate": 8.688205527442585e-05, + "loss": 0.9664, + "step": 1459 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 0.22341710329055786, + "learning_rate": 8.68042039704165e-05, + "loss": 1.0551, + "step": 1460 + }, + { + "epoch": 0.5677645001457301, + "grad_norm": 0.219679057598114, + "learning_rate": 8.672635266640717e-05, + "loss": 1.0398, + "step": 1461 + }, + { + "epoch": 0.5681531137666376, + "grad_norm": 0.22389841079711914, + "learning_rate": 8.664850136239782e-05, + "loss": 1.0472, + "step": 1462 + }, + { + "epoch": 0.5685417273875449, + "grad_norm": 
0.21402975916862488, + "learning_rate": 8.657065005838849e-05, + "loss": 1.0224, + "step": 1463 + }, + { + "epoch": 0.5689303410084523, + "grad_norm": 0.20917154848575592, + "learning_rate": 8.649279875437915e-05, + "loss": 1.0526, + "step": 1464 + }, + { + "epoch": 0.5693189546293598, + "grad_norm": 0.2252056896686554, + "learning_rate": 8.64149474503698e-05, + "loss": 1.1064, + "step": 1465 + }, + { + "epoch": 0.5697075682502671, + "grad_norm": 0.21834802627563477, + "learning_rate": 8.633709614636046e-05, + "loss": 1.0318, + "step": 1466 + }, + { + "epoch": 0.5700961818711746, + "grad_norm": 0.21882353723049164, + "learning_rate": 8.625924484235112e-05, + "loss": 1.0285, + "step": 1467 + }, + { + "epoch": 0.570484795492082, + "grad_norm": 0.2028426229953766, + "learning_rate": 8.618139353834177e-05, + "loss": 1.0356, + "step": 1468 + }, + { + "epoch": 0.5708734091129894, + "grad_norm": 0.22297166287899017, + "learning_rate": 8.610354223433243e-05, + "loss": 1.0804, + "step": 1469 + }, + { + "epoch": 0.5712620227338968, + "grad_norm": 0.21775268018245697, + "learning_rate": 8.602569093032309e-05, + "loss": 0.9978, + "step": 1470 + }, + { + "epoch": 0.5716506363548043, + "grad_norm": 0.20362353324890137, + "learning_rate": 8.594783962631374e-05, + "loss": 0.9982, + "step": 1471 + }, + { + "epoch": 0.5720392499757117, + "grad_norm": 0.21854591369628906, + "learning_rate": 8.586998832230441e-05, + "loss": 1.0465, + "step": 1472 + }, + { + "epoch": 0.572427863596619, + "grad_norm": 0.20501428842544556, + "learning_rate": 8.579213701829506e-05, + "loss": 1.0468, + "step": 1473 + }, + { + "epoch": 0.5728164772175265, + "grad_norm": 0.21606214344501495, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0477, + "step": 1474 + }, + { + "epoch": 0.5732050908384339, + "grad_norm": 0.2100660502910614, + "learning_rate": 8.563643441027638e-05, + "loss": 1.0071, + "step": 1475 + }, + { + "epoch": 0.5735937044593413, + "grad_norm": 0.21008896827697754, + "learning_rate": 
8.555858310626703e-05, + "loss": 0.9914, + "step": 1476 + }, + { + "epoch": 0.5739823180802487, + "grad_norm": 0.22192159295082092, + "learning_rate": 8.54807318022577e-05, + "loss": 1.0385, + "step": 1477 + }, + { + "epoch": 0.5743709317011562, + "grad_norm": 0.20123356580734253, + "learning_rate": 8.540288049824835e-05, + "loss": 1.0062, + "step": 1478 + }, + { + "epoch": 0.5747595453220635, + "grad_norm": 0.201947420835495, + "learning_rate": 8.5325029194239e-05, + "loss": 1.0218, + "step": 1479 + }, + { + "epoch": 0.575148158942971, + "grad_norm": 0.22804415225982666, + "learning_rate": 8.524717789022967e-05, + "loss": 1.0445, + "step": 1480 + }, + { + "epoch": 0.5755367725638784, + "grad_norm": 0.20527036488056183, + "learning_rate": 8.516932658622032e-05, + "loss": 0.9972, + "step": 1481 + }, + { + "epoch": 0.5759253861847857, + "grad_norm": 0.20298773050308228, + "learning_rate": 8.509147528221098e-05, + "loss": 1.0272, + "step": 1482 + }, + { + "epoch": 0.5763139998056932, + "grad_norm": 0.22500957548618317, + "learning_rate": 8.501362397820164e-05, + "loss": 1.0982, + "step": 1483 + }, + { + "epoch": 0.5767026134266006, + "grad_norm": 0.1950521320104599, + "learning_rate": 8.493577267419229e-05, + "loss": 0.9848, + "step": 1484 + }, + { + "epoch": 0.577091227047508, + "grad_norm": 0.21087585389614105, + "learning_rate": 8.485792137018295e-05, + "loss": 1.0125, + "step": 1485 + }, + { + "epoch": 0.5774798406684154, + "grad_norm": 0.20122238993644714, + "learning_rate": 8.47800700661736e-05, + "loss": 1.0533, + "step": 1486 + }, + { + "epoch": 0.5778684542893229, + "grad_norm": 0.20149008929729462, + "learning_rate": 8.470221876216427e-05, + "loss": 1.0719, + "step": 1487 + }, + { + "epoch": 0.5782570679102302, + "grad_norm": 0.21307213604450226, + "learning_rate": 8.462436745815494e-05, + "loss": 1.0522, + "step": 1488 + }, + { + "epoch": 0.5786456815311377, + "grad_norm": 0.21828554570674896, + "learning_rate": 8.454651615414559e-05, + "loss": 1.0184, + 
"step": 1489 + }, + { + "epoch": 0.5790342951520451, + "grad_norm": 0.22002705931663513, + "learning_rate": 8.446866485013625e-05, + "loss": 1.0101, + "step": 1490 + }, + { + "epoch": 0.5794229087729524, + "grad_norm": 0.19479142129421234, + "learning_rate": 8.43908135461269e-05, + "loss": 0.9889, + "step": 1491 + }, + { + "epoch": 0.5798115223938599, + "grad_norm": 0.21346086263656616, + "learning_rate": 8.431296224211756e-05, + "loss": 1.0373, + "step": 1492 + }, + { + "epoch": 0.5802001360147673, + "grad_norm": 0.20177558064460754, + "learning_rate": 8.423511093810822e-05, + "loss": 1.0215, + "step": 1493 + }, + { + "epoch": 0.5805887496356748, + "grad_norm": 0.2117915153503418, + "learning_rate": 8.415725963409888e-05, + "loss": 1.0321, + "step": 1494 + }, + { + "epoch": 0.5809773632565821, + "grad_norm": 0.21304374933242798, + "learning_rate": 8.407940833008954e-05, + "loss": 1.0123, + "step": 1495 + }, + { + "epoch": 0.5813659768774896, + "grad_norm": 0.21173715591430664, + "learning_rate": 8.400155702608019e-05, + "loss": 1.0696, + "step": 1496 + }, + { + "epoch": 0.581754590498397, + "grad_norm": 0.20407019555568695, + "learning_rate": 8.392370572207085e-05, + "loss": 1.0086, + "step": 1497 + }, + { + "epoch": 0.5821432041193044, + "grad_norm": 0.209481880068779, + "learning_rate": 8.384585441806151e-05, + "loss": 0.9975, + "step": 1498 + }, + { + "epoch": 0.5825318177402118, + "grad_norm": 0.22184531390666962, + "learning_rate": 8.376800311405216e-05, + "loss": 1.0956, + "step": 1499 + }, + { + "epoch": 0.5829204313611193, + "grad_norm": 0.21344684064388275, + "learning_rate": 8.369015181004283e-05, + "loss": 1.0685, + "step": 1500 + }, + { + "epoch": 0.5833090449820266, + "grad_norm": 0.19837221503257751, + "learning_rate": 8.361230050603348e-05, + "loss": 1.0149, + "step": 1501 + }, + { + "epoch": 0.583697658602934, + "grad_norm": 0.2133672833442688, + "learning_rate": 8.353444920202413e-05, + "loss": 1.0453, + "step": 1502 + }, + { + "epoch": 
0.5840862722238415, + "grad_norm": 0.21944090723991394, + "learning_rate": 8.34565978980148e-05, + "loss": 1.04, + "step": 1503 + }, + { + "epoch": 0.5844748858447488, + "grad_norm": 0.1983667016029358, + "learning_rate": 8.337874659400545e-05, + "loss": 0.9919, + "step": 1504 + }, + { + "epoch": 0.5848634994656563, + "grad_norm": 0.2025303989648819, + "learning_rate": 8.33008952899961e-05, + "loss": 1.0021, + "step": 1505 + }, + { + "epoch": 0.5852521130865637, + "grad_norm": 0.2015170007944107, + "learning_rate": 8.322304398598677e-05, + "loss": 0.9945, + "step": 1506 + }, + { + "epoch": 0.5856407267074711, + "grad_norm": 0.20768272876739502, + "learning_rate": 8.314519268197742e-05, + "loss": 1.0465, + "step": 1507 + }, + { + "epoch": 0.5860293403283785, + "grad_norm": 0.20513412356376648, + "learning_rate": 8.306734137796809e-05, + "loss": 1.0124, + "step": 1508 + }, + { + "epoch": 0.586417953949286, + "grad_norm": 0.20268471539020538, + "learning_rate": 8.298949007395874e-05, + "loss": 1.0586, + "step": 1509 + }, + { + "epoch": 0.5868065675701933, + "grad_norm": 0.20915938913822174, + "learning_rate": 8.291163876994939e-05, + "loss": 1.0047, + "step": 1510 + }, + { + "epoch": 0.5871951811911007, + "grad_norm": 0.2161451131105423, + "learning_rate": 8.283378746594006e-05, + "loss": 1.0184, + "step": 1511 + }, + { + "epoch": 0.5875837948120082, + "grad_norm": 0.1915571093559265, + "learning_rate": 8.275593616193071e-05, + "loss": 1.0187, + "step": 1512 + }, + { + "epoch": 0.5879724084329155, + "grad_norm": 0.20907992124557495, + "learning_rate": 8.267808485792137e-05, + "loss": 1.0212, + "step": 1513 + }, + { + "epoch": 0.588361022053823, + "grad_norm": 0.20140786468982697, + "learning_rate": 8.260023355391204e-05, + "loss": 1.014, + "step": 1514 + }, + { + "epoch": 0.5887496356747304, + "grad_norm": 0.208252415060997, + "learning_rate": 8.252238224990269e-05, + "loss": 1.0806, + "step": 1515 + }, + { + "epoch": 0.5891382492956379, + "grad_norm": 
0.20596125721931458, + "learning_rate": 8.244453094589336e-05, + "loss": 0.9823, + "step": 1516 + }, + { + "epoch": 0.5895268629165452, + "grad_norm": 0.18832452595233917, + "learning_rate": 8.236667964188401e-05, + "loss": 0.9925, + "step": 1517 + }, + { + "epoch": 0.5899154765374527, + "grad_norm": 0.2078334391117096, + "learning_rate": 8.228882833787467e-05, + "loss": 1.0587, + "step": 1518 + }, + { + "epoch": 0.59030409015836, + "grad_norm": 0.20121365785598755, + "learning_rate": 8.221097703386533e-05, + "loss": 1.0607, + "step": 1519 + }, + { + "epoch": 0.5906927037792674, + "grad_norm": 0.19666099548339844, + "learning_rate": 8.213312572985598e-05, + "loss": 1.0124, + "step": 1520 + }, + { + "epoch": 0.5910813174001749, + "grad_norm": 0.20176006853580475, + "learning_rate": 8.205527442584664e-05, + "loss": 1.0297, + "step": 1521 + }, + { + "epoch": 0.5914699310210823, + "grad_norm": 0.2038574516773224, + "learning_rate": 8.19774231218373e-05, + "loss": 1.0311, + "step": 1522 + }, + { + "epoch": 0.5918585446419897, + "grad_norm": 0.19517424702644348, + "learning_rate": 8.189957181782796e-05, + "loss": 0.9945, + "step": 1523 + }, + { + "epoch": 0.5922471582628971, + "grad_norm": 0.19599094986915588, + "learning_rate": 8.182172051381861e-05, + "loss": 1.0255, + "step": 1524 + }, + { + "epoch": 0.5926357718838046, + "grad_norm": 0.21409402787685394, + "learning_rate": 8.174386920980927e-05, + "loss": 1.0868, + "step": 1525 + }, + { + "epoch": 0.5930243855047119, + "grad_norm": 0.19567830860614777, + "learning_rate": 8.166601790579993e-05, + "loss": 0.9654, + "step": 1526 + }, + { + "epoch": 0.5934129991256194, + "grad_norm": 0.2275007963180542, + "learning_rate": 8.158816660179058e-05, + "loss": 1.0867, + "step": 1527 + }, + { + "epoch": 0.5938016127465268, + "grad_norm": 0.19826427102088928, + "learning_rate": 8.151031529778123e-05, + "loss": 1.0301, + "step": 1528 + }, + { + "epoch": 0.5941902263674341, + "grad_norm": 0.2051352709531784, + "learning_rate": 
8.14324639937719e-05, + "loss": 1.023, + "step": 1529 + }, + { + "epoch": 0.5945788399883416, + "grad_norm": 0.19492043554782867, + "learning_rate": 8.135461268976255e-05, + "loss": 0.9608, + "step": 1530 + }, + { + "epoch": 0.594967453609249, + "grad_norm": 0.21521608531475067, + "learning_rate": 8.127676138575322e-05, + "loss": 1.0612, + "step": 1531 + }, + { + "epoch": 0.5953560672301564, + "grad_norm": 0.22739367187023163, + "learning_rate": 8.119891008174387e-05, + "loss": 1.0603, + "step": 1532 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.20334595441818237, + "learning_rate": 8.112105877773452e-05, + "loss": 1.0191, + "step": 1533 + }, + { + "epoch": 0.5961332944719713, + "grad_norm": 0.20985397696495056, + "learning_rate": 8.104320747372519e-05, + "loss": 1.0721, + "step": 1534 + }, + { + "epoch": 0.5965219080928786, + "grad_norm": 0.20472954213619232, + "learning_rate": 8.096535616971584e-05, + "loss": 1.0556, + "step": 1535 + }, + { + "epoch": 0.5969105217137861, + "grad_norm": 0.2112964689731598, + "learning_rate": 8.08875048657065e-05, + "loss": 1.0016, + "step": 1536 + }, + { + "epoch": 0.5972991353346935, + "grad_norm": 0.21330617368221283, + "learning_rate": 8.080965356169716e-05, + "loss": 1.0783, + "step": 1537 + }, + { + "epoch": 0.5976877489556008, + "grad_norm": 0.20907814800739288, + "learning_rate": 8.073180225768782e-05, + "loss": 1.071, + "step": 1538 + }, + { + "epoch": 0.5980763625765083, + "grad_norm": 0.2038964033126831, + "learning_rate": 8.065395095367848e-05, + "loss": 1.0039, + "step": 1539 + }, + { + "epoch": 0.5984649761974157, + "grad_norm": 0.2175542712211609, + "learning_rate": 8.057609964966914e-05, + "loss": 1.0015, + "step": 1540 + }, + { + "epoch": 0.5988535898183232, + "grad_norm": 0.21474529802799225, + "learning_rate": 8.049824834565979e-05, + "loss": 1.0273, + "step": 1541 + }, + { + "epoch": 0.5992422034392305, + "grad_norm": 0.21428482234477997, + "learning_rate": 8.042039704165046e-05, + "loss": 1.0767, + 
"step": 1542 + }, + { + "epoch": 0.599630817060138, + "grad_norm": 0.20287524163722992, + "learning_rate": 8.034254573764111e-05, + "loss": 1.064, + "step": 1543 + }, + { + "epoch": 0.6000194306810454, + "grad_norm": 0.20689848065376282, + "learning_rate": 8.026469443363178e-05, + "loss": 1.0084, + "step": 1544 + }, + { + "epoch": 0.6004080443019528, + "grad_norm": 0.22451332211494446, + "learning_rate": 8.018684312962243e-05, + "loss": 1.1039, + "step": 1545 + }, + { + "epoch": 0.6007966579228602, + "grad_norm": 0.21381956338882446, + "learning_rate": 8.010899182561308e-05, + "loss": 1.0551, + "step": 1546 + }, + { + "epoch": 0.6011852715437677, + "grad_norm": 0.20108483731746674, + "learning_rate": 8.003114052160375e-05, + "loss": 1.0326, + "step": 1547 + }, + { + "epoch": 0.601573885164675, + "grad_norm": 0.19739678502082825, + "learning_rate": 7.99532892175944e-05, + "loss": 1.0319, + "step": 1548 + }, + { + "epoch": 0.6019624987855824, + "grad_norm": 0.21635359525680542, + "learning_rate": 7.987543791358506e-05, + "loss": 1.0465, + "step": 1549 + }, + { + "epoch": 0.6023511124064899, + "grad_norm": 0.1949319988489151, + "learning_rate": 7.979758660957572e-05, + "loss": 1.0026, + "step": 1550 + }, + { + "epoch": 0.6027397260273972, + "grad_norm": 0.1989699900150299, + "learning_rate": 7.971973530556637e-05, + "loss": 1.021, + "step": 1551 + }, + { + "epoch": 0.6031283396483047, + "grad_norm": 0.24031391739845276, + "learning_rate": 7.964188400155703e-05, + "loss": 1.0293, + "step": 1552 + }, + { + "epoch": 0.6035169532692121, + "grad_norm": 0.21247251331806183, + "learning_rate": 7.956403269754769e-05, + "loss": 1.023, + "step": 1553 + }, + { + "epoch": 0.6039055668901195, + "grad_norm": 0.21565628051757812, + "learning_rate": 7.948618139353835e-05, + "loss": 1.1027, + "step": 1554 + }, + { + "epoch": 0.6042941805110269, + "grad_norm": 0.21207931637763977, + "learning_rate": 7.9408330089529e-05, + "loss": 1.0634, + "step": 1555 + }, + { + "epoch": 
0.6046827941319344, + "grad_norm": 0.21354155242443085, + "learning_rate": 7.933047878551965e-05, + "loss": 1.0433, + "step": 1556 + }, + { + "epoch": 0.6050714077528417, + "grad_norm": 0.21708370745182037, + "learning_rate": 7.925262748151032e-05, + "loss": 1.0499, + "step": 1557 + }, + { + "epoch": 0.6054600213737491, + "grad_norm": 0.2051447182893753, + "learning_rate": 7.917477617750097e-05, + "loss": 1.0042, + "step": 1558 + }, + { + "epoch": 0.6058486349946566, + "grad_norm": 0.18768000602722168, + "learning_rate": 7.909692487349164e-05, + "loss": 1.009, + "step": 1559 + }, + { + "epoch": 0.6062372486155639, + "grad_norm": 0.2142931967973709, + "learning_rate": 7.901907356948229e-05, + "loss": 1.0458, + "step": 1560 + }, + { + "epoch": 0.6066258622364714, + "grad_norm": 0.21006444096565247, + "learning_rate": 7.894122226547294e-05, + "loss": 1.0286, + "step": 1561 + }, + { + "epoch": 0.6070144758573788, + "grad_norm": 0.2187039703130722, + "learning_rate": 7.886337096146361e-05, + "loss": 1.0103, + "step": 1562 + }, + { + "epoch": 0.6074030894782863, + "grad_norm": 0.19863669574260712, + "learning_rate": 7.878551965745426e-05, + "loss": 0.9925, + "step": 1563 + }, + { + "epoch": 0.6077917030991936, + "grad_norm": 0.21771976351737976, + "learning_rate": 7.870766835344493e-05, + "loss": 0.9853, + "step": 1564 + }, + { + "epoch": 0.6081803167201011, + "grad_norm": 0.21714983880519867, + "learning_rate": 7.862981704943558e-05, + "loss": 1.0123, + "step": 1565 + }, + { + "epoch": 0.6085689303410085, + "grad_norm": 0.2251398265361786, + "learning_rate": 7.855196574542624e-05, + "loss": 1.0265, + "step": 1566 + }, + { + "epoch": 0.6089575439619158, + "grad_norm": 0.22089716792106628, + "learning_rate": 7.84741144414169e-05, + "loss": 1.0689, + "step": 1567 + }, + { + "epoch": 0.6093461575828233, + "grad_norm": 0.2453841269016266, + "learning_rate": 7.839626313740756e-05, + "loss": 1.0185, + "step": 1568 + }, + { + "epoch": 0.6097347712037307, + "grad_norm": 
0.21866528689861298, + "learning_rate": 7.831841183339821e-05, + "loss": 1.0361, + "step": 1569 + }, + { + "epoch": 0.6101233848246381, + "grad_norm": 0.22421486675739288, + "learning_rate": 7.824056052938888e-05, + "loss": 1.024, + "step": 1570 + }, + { + "epoch": 0.6105119984455455, + "grad_norm": 0.21107137203216553, + "learning_rate": 7.816270922537953e-05, + "loss": 1.0335, + "step": 1571 + }, + { + "epoch": 0.610900612066453, + "grad_norm": 0.20731772482395172, + "learning_rate": 7.80848579213702e-05, + "loss": 1.0563, + "step": 1572 + }, + { + "epoch": 0.6112892256873603, + "grad_norm": 0.19535884261131287, + "learning_rate": 7.800700661736085e-05, + "loss": 0.9698, + "step": 1573 + }, + { + "epoch": 0.6116778393082678, + "grad_norm": 0.20449021458625793, + "learning_rate": 7.79291553133515e-05, + "loss": 1.0125, + "step": 1574 + }, + { + "epoch": 0.6120664529291752, + "grad_norm": 0.19576509296894073, + "learning_rate": 7.785130400934217e-05, + "loss": 0.9326, + "step": 1575 + }, + { + "epoch": 0.6124550665500825, + "grad_norm": 0.18914124369621277, + "learning_rate": 7.777345270533282e-05, + "loss": 0.9939, + "step": 1576 + }, + { + "epoch": 0.61284368017099, + "grad_norm": 0.21239091455936432, + "learning_rate": 7.769560140132348e-05, + "loss": 1.0271, + "step": 1577 + }, + { + "epoch": 0.6132322937918974, + "grad_norm": 0.22204811871051788, + "learning_rate": 7.761775009731414e-05, + "loss": 1.0524, + "step": 1578 + }, + { + "epoch": 0.6136209074128048, + "grad_norm": 0.20047850906848907, + "learning_rate": 7.753989879330479e-05, + "loss": 1.0076, + "step": 1579 + }, + { + "epoch": 0.6140095210337122, + "grad_norm": 0.22619746625423431, + "learning_rate": 7.746204748929545e-05, + "loss": 1.0611, + "step": 1580 + }, + { + "epoch": 0.6143981346546197, + "grad_norm": 0.2500879466533661, + "learning_rate": 7.73841961852861e-05, + "loss": 1.0364, + "step": 1581 + }, + { + "epoch": 0.614786748275527, + "grad_norm": 0.23486928641796112, + "learning_rate": 
7.730634488127676e-05, + "loss": 1.0472, + "step": 1582 + }, + { + "epoch": 0.6151753618964345, + "grad_norm": 0.19849038124084473, + "learning_rate": 7.722849357726742e-05, + "loss": 0.9847, + "step": 1583 + }, + { + "epoch": 0.6155639755173419, + "grad_norm": 0.21516263484954834, + "learning_rate": 7.715064227325807e-05, + "loss": 1.0351, + "step": 1584 + }, + { + "epoch": 0.6159525891382492, + "grad_norm": 0.20137760043144226, + "learning_rate": 7.707279096924874e-05, + "loss": 0.9879, + "step": 1585 + }, + { + "epoch": 0.6163412027591567, + "grad_norm": 0.2146228402853012, + "learning_rate": 7.699493966523939e-05, + "loss": 1.0792, + "step": 1586 + }, + { + "epoch": 0.6167298163800641, + "grad_norm": 0.19929760694503784, + "learning_rate": 7.691708836123004e-05, + "loss": 1.0313, + "step": 1587 + }, + { + "epoch": 0.6171184300009716, + "grad_norm": 0.201123908162117, + "learning_rate": 7.683923705722071e-05, + "loss": 1.0279, + "step": 1588 + }, + { + "epoch": 0.6175070436218789, + "grad_norm": 0.2154105007648468, + "learning_rate": 7.676138575321136e-05, + "loss": 1.075, + "step": 1589 + }, + { + "epoch": 0.6178956572427864, + "grad_norm": 0.2028442770242691, + "learning_rate": 7.668353444920203e-05, + "loss": 0.9771, + "step": 1590 + }, + { + "epoch": 0.6182842708636938, + "grad_norm": 0.18003074824810028, + "learning_rate": 7.660568314519268e-05, + "loss": 0.9677, + "step": 1591 + }, + { + "epoch": 0.6186728844846012, + "grad_norm": 0.23250891268253326, + "learning_rate": 7.652783184118335e-05, + "loss": 1.015, + "step": 1592 + }, + { + "epoch": 0.6190614981055086, + "grad_norm": 0.2047244906425476, + "learning_rate": 7.6449980537174e-05, + "loss": 1.0044, + "step": 1593 + }, + { + "epoch": 0.6194501117264161, + "grad_norm": 0.20011259615421295, + "learning_rate": 7.637212923316466e-05, + "loss": 1.0089, + "step": 1594 + }, + { + "epoch": 0.6198387253473234, + "grad_norm": 0.2212608903646469, + "learning_rate": 7.629427792915533e-05, + "loss": 1.0457, + 
"step": 1595 + }, + { + "epoch": 0.6202273389682308, + "grad_norm": 0.22725115716457367, + "learning_rate": 7.621642662514598e-05, + "loss": 1.1198, + "step": 1596 + }, + { + "epoch": 0.6206159525891383, + "grad_norm": 0.2065306007862091, + "learning_rate": 7.613857532113663e-05, + "loss": 1.0572, + "step": 1597 + }, + { + "epoch": 0.6210045662100456, + "grad_norm": 0.2132783830165863, + "learning_rate": 7.60607240171273e-05, + "loss": 1.0332, + "step": 1598 + }, + { + "epoch": 0.6213931798309531, + "grad_norm": 0.20527103543281555, + "learning_rate": 7.598287271311795e-05, + "loss": 1.0156, + "step": 1599 + }, + { + "epoch": 0.6217817934518605, + "grad_norm": 0.23608024418354034, + "learning_rate": 7.59050214091086e-05, + "loss": 1.0379, + "step": 1600 + }, + { + "epoch": 0.6221704070727679, + "grad_norm": 0.22227297723293304, + "learning_rate": 7.582717010509927e-05, + "loss": 1.0507, + "step": 1601 + }, + { + "epoch": 0.6225590206936753, + "grad_norm": 0.22359615564346313, + "learning_rate": 7.574931880108992e-05, + "loss": 1.0705, + "step": 1602 + }, + { + "epoch": 0.6229476343145828, + "grad_norm": 0.20478755235671997, + "learning_rate": 7.567146749708059e-05, + "loss": 1.0309, + "step": 1603 + }, + { + "epoch": 0.6233362479354901, + "grad_norm": 0.2223423272371292, + "learning_rate": 7.559361619307124e-05, + "loss": 1.0386, + "step": 1604 + }, + { + "epoch": 0.6237248615563975, + "grad_norm": 0.21232105791568756, + "learning_rate": 7.551576488906189e-05, + "loss": 1.0353, + "step": 1605 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 0.22431129217147827, + "learning_rate": 7.543791358505256e-05, + "loss": 1.1017, + "step": 1606 + }, + { + "epoch": 0.6245020887982123, + "grad_norm": 0.20826031267642975, + "learning_rate": 7.536006228104321e-05, + "loss": 1.0172, + "step": 1607 + }, + { + "epoch": 0.6248907024191198, + "grad_norm": 0.2803161144256592, + "learning_rate": 7.528221097703387e-05, + "loss": 1.0554, + "step": 1608 + }, + { + "epoch": 
0.6252793160400272, + "grad_norm": 0.2185174971818924, + "learning_rate": 7.520435967302453e-05, + "loss": 0.9842, + "step": 1609 + }, + { + "epoch": 0.6256679296609347, + "grad_norm": 0.2091478854417801, + "learning_rate": 7.512650836901518e-05, + "loss": 0.9783, + "step": 1610 + }, + { + "epoch": 0.626056543281842, + "grad_norm": 0.22342967987060547, + "learning_rate": 7.504865706500584e-05, + "loss": 0.9891, + "step": 1611 + }, + { + "epoch": 0.6264451569027495, + "grad_norm": 0.195283442735672, + "learning_rate": 7.49708057609965e-05, + "loss": 0.9654, + "step": 1612 + }, + { + "epoch": 0.6268337705236569, + "grad_norm": 0.21048255264759064, + "learning_rate": 7.489295445698716e-05, + "loss": 1.0112, + "step": 1613 + }, + { + "epoch": 0.6272223841445642, + "grad_norm": 0.21405541896820068, + "learning_rate": 7.481510315297781e-05, + "loss": 1.0498, + "step": 1614 + }, + { + "epoch": 0.6276109977654717, + "grad_norm": 0.2144453227519989, + "learning_rate": 7.473725184896846e-05, + "loss": 1.0487, + "step": 1615 + }, + { + "epoch": 0.627999611386379, + "grad_norm": 0.21963326632976532, + "learning_rate": 7.465940054495913e-05, + "loss": 1.0634, + "step": 1616 + }, + { + "epoch": 0.6283882250072865, + "grad_norm": 0.20100601017475128, + "learning_rate": 7.458154924094978e-05, + "loss": 1.0407, + "step": 1617 + }, + { + "epoch": 0.6287768386281939, + "grad_norm": 0.19469478726387024, + "learning_rate": 7.450369793694045e-05, + "loss": 0.9923, + "step": 1618 + }, + { + "epoch": 0.6291654522491014, + "grad_norm": 0.2114047408103943, + "learning_rate": 7.442584663293111e-05, + "loss": 1.0263, + "step": 1619 + }, + { + "epoch": 0.6295540658700087, + "grad_norm": 0.21080389618873596, + "learning_rate": 7.434799532892177e-05, + "loss": 1.0012, + "step": 1620 + }, + { + "epoch": 0.6299426794909162, + "grad_norm": 0.20366831123828888, + "learning_rate": 7.427014402491243e-05, + "loss": 1.0254, + "step": 1621 + }, + { + "epoch": 0.6303312931118236, + "grad_norm": 
0.209821879863739, + "learning_rate": 7.419229272090308e-05, + "loss": 0.9416, + "step": 1622 + }, + { + "epoch": 0.6307199067327309, + "grad_norm": 0.2228868007659912, + "learning_rate": 7.411444141689374e-05, + "loss": 1.0128, + "step": 1623 + }, + { + "epoch": 0.6311085203536384, + "grad_norm": 0.19673995673656464, + "learning_rate": 7.40365901128844e-05, + "loss": 0.9709, + "step": 1624 + }, + { + "epoch": 0.6314971339745458, + "grad_norm": 0.21590839326381683, + "learning_rate": 7.395873880887505e-05, + "loss": 1.0251, + "step": 1625 + }, + { + "epoch": 0.6318857475954532, + "grad_norm": 0.20200593769550323, + "learning_rate": 7.388088750486572e-05, + "loss": 1.0307, + "step": 1626 + }, + { + "epoch": 0.6322743612163606, + "grad_norm": 0.19623909890651703, + "learning_rate": 7.380303620085637e-05, + "loss": 1.0375, + "step": 1627 + }, + { + "epoch": 0.6326629748372681, + "grad_norm": 0.19878128170967102, + "learning_rate": 7.372518489684702e-05, + "loss": 0.9844, + "step": 1628 + }, + { + "epoch": 0.6330515884581754, + "grad_norm": 0.21292422711849213, + "learning_rate": 7.364733359283769e-05, + "loss": 1.0228, + "step": 1629 + }, + { + "epoch": 0.6334402020790829, + "grad_norm": 0.1915559619665146, + "learning_rate": 7.356948228882834e-05, + "loss": 0.9818, + "step": 1630 + }, + { + "epoch": 0.6338288156999903, + "grad_norm": 0.2264430969953537, + "learning_rate": 7.3491630984819e-05, + "loss": 1.146, + "step": 1631 + }, + { + "epoch": 0.6342174293208978, + "grad_norm": 0.19332270324230194, + "learning_rate": 7.341377968080966e-05, + "loss": 1.0007, + "step": 1632 + }, + { + "epoch": 0.6346060429418051, + "grad_norm": 0.217147096991539, + "learning_rate": 7.333592837680031e-05, + "loss": 1.0498, + "step": 1633 + }, + { + "epoch": 0.6349946565627125, + "grad_norm": 0.22200679779052734, + "learning_rate": 7.325807707279098e-05, + "loss": 1.0358, + "step": 1634 + }, + { + "epoch": 0.63538327018362, + "grad_norm": 0.19485117495059967, + "learning_rate": 
7.318022576878163e-05, + "loss": 0.9717, + "step": 1635 + }, + { + "epoch": 0.6357718838045273, + "grad_norm": 0.20595680177211761, + "learning_rate": 7.310237446477228e-05, + "loss": 1.0195, + "step": 1636 + }, + { + "epoch": 0.6361604974254348, + "grad_norm": 0.21184709668159485, + "learning_rate": 7.302452316076294e-05, + "loss": 1.0354, + "step": 1637 + }, + { + "epoch": 0.6365491110463422, + "grad_norm": 0.22607794404029846, + "learning_rate": 7.29466718567536e-05, + "loss": 1.0217, + "step": 1638 + }, + { + "epoch": 0.6369377246672496, + "grad_norm": 0.20236065983772278, + "learning_rate": 7.286882055274426e-05, + "loss": 1.0441, + "step": 1639 + }, + { + "epoch": 0.637326338288157, + "grad_norm": 0.19979622960090637, + "learning_rate": 7.279096924873491e-05, + "loss": 1.0105, + "step": 1640 + }, + { + "epoch": 0.6377149519090645, + "grad_norm": 0.2655459940433502, + "learning_rate": 7.271311794472557e-05, + "loss": 1.0726, + "step": 1641 + }, + { + "epoch": 0.6381035655299718, + "grad_norm": 0.25107496976852417, + "learning_rate": 7.263526664071623e-05, + "loss": 1.037, + "step": 1642 + }, + { + "epoch": 0.6384921791508792, + "grad_norm": 0.19250229001045227, + "learning_rate": 7.255741533670688e-05, + "loss": 0.9741, + "step": 1643 + }, + { + "epoch": 0.6388807927717867, + "grad_norm": 0.19324181973934174, + "learning_rate": 7.247956403269755e-05, + "loss": 1.0333, + "step": 1644 + }, + { + "epoch": 0.639269406392694, + "grad_norm": 0.22267483174800873, + "learning_rate": 7.240171272868822e-05, + "loss": 1.0313, + "step": 1645 + }, + { + "epoch": 0.6396580200136015, + "grad_norm": 0.2775348722934723, + "learning_rate": 7.232386142467887e-05, + "loss": 1.0686, + "step": 1646 + }, + { + "epoch": 0.6400466336345089, + "grad_norm": 0.1886623501777649, + "learning_rate": 7.224601012066953e-05, + "loss": 1.0029, + "step": 1647 + }, + { + "epoch": 0.6404352472554163, + "grad_norm": 0.20303374528884888, + "learning_rate": 7.216815881666019e-05, + "loss": 1.0346, + 
"step": 1648 + }, + { + "epoch": 0.6408238608763237, + "grad_norm": 0.20815756916999817, + "learning_rate": 7.209030751265085e-05, + "loss": 1.0258, + "step": 1649 + }, + { + "epoch": 0.6412124744972312, + "grad_norm": 0.22055703401565552, + "learning_rate": 7.20124562086415e-05, + "loss": 1.0215, + "step": 1650 + }, + { + "epoch": 0.6416010881181385, + "grad_norm": 0.20248562097549438, + "learning_rate": 7.193460490463215e-05, + "loss": 0.9979, + "step": 1651 + }, + { + "epoch": 0.6419897017390459, + "grad_norm": 0.2093247026205063, + "learning_rate": 7.185675360062282e-05, + "loss": 1.0605, + "step": 1652 + }, + { + "epoch": 0.6423783153599534, + "grad_norm": 0.22276204824447632, + "learning_rate": 7.177890229661347e-05, + "loss": 1.0788, + "step": 1653 + }, + { + "epoch": 0.6427669289808607, + "grad_norm": 0.19959624111652374, + "learning_rate": 7.170105099260412e-05, + "loss": 0.9954, + "step": 1654 + }, + { + "epoch": 0.6431555426017682, + "grad_norm": 0.20173248648643494, + "learning_rate": 7.162319968859479e-05, + "loss": 1.003, + "step": 1655 + }, + { + "epoch": 0.6435441562226756, + "grad_norm": 0.207533061504364, + "learning_rate": 7.154534838458544e-05, + "loss": 1.043, + "step": 1656 + }, + { + "epoch": 0.643932769843583, + "grad_norm": 0.21928350627422333, + "learning_rate": 7.146749708057611e-05, + "loss": 1.0472, + "step": 1657 + }, + { + "epoch": 0.6443213834644904, + "grad_norm": 0.2567078173160553, + "learning_rate": 7.138964577656676e-05, + "loss": 1.0946, + "step": 1658 + }, + { + "epoch": 0.6447099970853979, + "grad_norm": 0.19454176723957062, + "learning_rate": 7.131179447255741e-05, + "loss": 0.9437, + "step": 1659 + }, + { + "epoch": 0.6450986107063053, + "grad_norm": 0.19198423624038696, + "learning_rate": 7.123394316854808e-05, + "loss": 0.9976, + "step": 1660 + }, + { + "epoch": 0.6454872243272126, + "grad_norm": 0.1929445117712021, + "learning_rate": 7.115609186453873e-05, + "loss": 1.0279, + "step": 1661 + }, + { + "epoch": 
0.6458758379481201, + "grad_norm": 0.2041027694940567, + "learning_rate": 7.10782405605294e-05, + "loss": 1.0458, + "step": 1662 + }, + { + "epoch": 0.6462644515690275, + "grad_norm": 0.23750995099544525, + "learning_rate": 7.100038925652005e-05, + "loss": 1.0916, + "step": 1663 + }, + { + "epoch": 0.6466530651899349, + "grad_norm": 0.1971994787454605, + "learning_rate": 7.09225379525107e-05, + "loss": 0.951, + "step": 1664 + }, + { + "epoch": 0.6470416788108423, + "grad_norm": 0.20459246635437012, + "learning_rate": 7.084468664850136e-05, + "loss": 0.9653, + "step": 1665 + }, + { + "epoch": 0.6474302924317498, + "grad_norm": 0.2137187272310257, + "learning_rate": 7.076683534449202e-05, + "loss": 1.0291, + "step": 1666 + }, + { + "epoch": 0.6478189060526571, + "grad_norm": 0.21235258877277374, + "learning_rate": 7.068898404048268e-05, + "loss": 1.0104, + "step": 1667 + }, + { + "epoch": 0.6482075196735646, + "grad_norm": 0.23120944201946259, + "learning_rate": 7.061113273647333e-05, + "loss": 1.0693, + "step": 1668 + }, + { + "epoch": 0.648596133294472, + "grad_norm": 1.38257896900177, + "learning_rate": 7.053328143246399e-05, + "loss": 1.0339, + "step": 1669 + }, + { + "epoch": 0.6489847469153793, + "grad_norm": 0.20898790657520294, + "learning_rate": 7.045543012845465e-05, + "loss": 1.004, + "step": 1670 + }, + { + "epoch": 0.6493733605362868, + "grad_norm": 0.20251236855983734, + "learning_rate": 7.037757882444532e-05, + "loss": 0.9992, + "step": 1671 + }, + { + "epoch": 0.6497619741571942, + "grad_norm": 0.2358030527830124, + "learning_rate": 7.029972752043597e-05, + "loss": 0.9854, + "step": 1672 + }, + { + "epoch": 0.6501505877781016, + "grad_norm": 0.18945704400539398, + "learning_rate": 7.022187621642664e-05, + "loss": 0.9677, + "step": 1673 + }, + { + "epoch": 0.650539201399009, + "grad_norm": 0.1965213567018509, + "learning_rate": 7.014402491241729e-05, + "loss": 1.0118, + "step": 1674 + }, + { + "epoch": 0.6509278150199165, + "grad_norm": 
0.2340148687362671, + "learning_rate": 7.006617360840795e-05, + "loss": 1.0312, + "step": 1675 + }, + { + "epoch": 0.6513164286408238, + "grad_norm": 0.1992296278476715, + "learning_rate": 6.99883223043986e-05, + "loss": 1.0155, + "step": 1676 + }, + { + "epoch": 0.6517050422617313, + "grad_norm": 0.20410223305225372, + "learning_rate": 6.991047100038926e-05, + "loss": 1.0646, + "step": 1677 + }, + { + "epoch": 0.6520936558826387, + "grad_norm": 0.19254536926746368, + "learning_rate": 6.983261969637992e-05, + "loss": 0.9538, + "step": 1678 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 0.19980847835540771, + "learning_rate": 6.975476839237057e-05, + "loss": 0.9912, + "step": 1679 + }, + { + "epoch": 0.6528708831244535, + "grad_norm": 0.19503261148929596, + "learning_rate": 6.967691708836124e-05, + "loss": 0.9844, + "step": 1680 + }, + { + "epoch": 0.6532594967453609, + "grad_norm": 0.22375883162021637, + "learning_rate": 6.959906578435189e-05, + "loss": 1.1266, + "step": 1681 + }, + { + "epoch": 0.6536481103662684, + "grad_norm": 0.21456514298915863, + "learning_rate": 6.952121448034254e-05, + "loss": 1.0902, + "step": 1682 + }, + { + "epoch": 0.6540367239871757, + "grad_norm": 0.20348122715950012, + "learning_rate": 6.944336317633321e-05, + "loss": 1.0228, + "step": 1683 + }, + { + "epoch": 0.6544253376080832, + "grad_norm": 0.21647393703460693, + "learning_rate": 6.936551187232386e-05, + "loss": 1.0653, + "step": 1684 + }, + { + "epoch": 0.6548139512289906, + "grad_norm": 0.20160923898220062, + "learning_rate": 6.928766056831453e-05, + "loss": 1.0249, + "step": 1685 + }, + { + "epoch": 0.655202564849898, + "grad_norm": 0.20070499181747437, + "learning_rate": 6.920980926430518e-05, + "loss": 1.0585, + "step": 1686 + }, + { + "epoch": 0.6555911784708054, + "grad_norm": 0.2656902074813843, + "learning_rate": 6.913195796029583e-05, + "loss": 1.0042, + "step": 1687 + }, + { + "epoch": 0.6559797920917129, + "grad_norm": 0.1934545785188675, + "learning_rate": 
6.90541066562865e-05, + "loss": 0.9831, + "step": 1688 + }, + { + "epoch": 0.6563684057126202, + "grad_norm": 0.21719245612621307, + "learning_rate": 6.897625535227715e-05, + "loss": 0.9934, + "step": 1689 + }, + { + "epoch": 0.6567570193335276, + "grad_norm": 0.20906969904899597, + "learning_rate": 6.889840404826782e-05, + "loss": 1.023, + "step": 1690 + }, + { + "epoch": 0.6571456329544351, + "grad_norm": 0.225227490067482, + "learning_rate": 6.882055274425847e-05, + "loss": 1.0265, + "step": 1691 + }, + { + "epoch": 0.6575342465753424, + "grad_norm": 0.22766710817813873, + "learning_rate": 6.874270144024912e-05, + "loss": 1.0306, + "step": 1692 + }, + { + "epoch": 0.6579228601962499, + "grad_norm": 0.20964065194129944, + "learning_rate": 6.866485013623978e-05, + "loss": 0.9431, + "step": 1693 + }, + { + "epoch": 0.6583114738171573, + "grad_norm": 0.19821231067180634, + "learning_rate": 6.858699883223044e-05, + "loss": 0.9959, + "step": 1694 + }, + { + "epoch": 0.6587000874380647, + "grad_norm": 0.2071307748556137, + "learning_rate": 6.85091475282211e-05, + "loss": 1.0332, + "step": 1695 + }, + { + "epoch": 0.6590887010589721, + "grad_norm": 0.27962490916252136, + "learning_rate": 6.843129622421175e-05, + "loss": 0.9755, + "step": 1696 + }, + { + "epoch": 0.6594773146798796, + "grad_norm": 0.21582698822021484, + "learning_rate": 6.835344492020242e-05, + "loss": 1.0305, + "step": 1697 + }, + { + "epoch": 0.6598659283007869, + "grad_norm": 0.1872921586036682, + "learning_rate": 6.827559361619307e-05, + "loss": 0.9693, + "step": 1698 + }, + { + "epoch": 0.6602545419216943, + "grad_norm": 0.27033379673957825, + "learning_rate": 6.819774231218374e-05, + "loss": 1.0756, + "step": 1699 + }, + { + "epoch": 0.6606431555426018, + "grad_norm": 0.2010008543729782, + "learning_rate": 6.811989100817439e-05, + "loss": 1.0077, + "step": 1700 + }, + { + "epoch": 0.6610317691635091, + "grad_norm": 0.20637495815753937, + "learning_rate": 6.804203970416506e-05, + "loss": 1.0208, + 
"step": 1701 + }, + { + "epoch": 0.6614203827844166, + "grad_norm": 0.21331818401813507, + "learning_rate": 6.796418840015571e-05, + "loss": 1.0242, + "step": 1702 + }, + { + "epoch": 0.661808996405324, + "grad_norm": 0.2092941552400589, + "learning_rate": 6.788633709614637e-05, + "loss": 1.0949, + "step": 1703 + }, + { + "epoch": 0.6621976100262315, + "grad_norm": 0.22332265973091125, + "learning_rate": 6.780848579213703e-05, + "loss": 1.1068, + "step": 1704 + }, + { + "epoch": 0.6625862236471388, + "grad_norm": 0.20077067613601685, + "learning_rate": 6.773063448812768e-05, + "loss": 0.9801, + "step": 1705 + }, + { + "epoch": 0.6629748372680463, + "grad_norm": 0.2057008296251297, + "learning_rate": 6.765278318411834e-05, + "loss": 1.0058, + "step": 1706 + }, + { + "epoch": 0.6633634508889537, + "grad_norm": 0.20337353646755219, + "learning_rate": 6.7574931880109e-05, + "loss": 1.0141, + "step": 1707 + }, + { + "epoch": 0.663752064509861, + "grad_norm": 0.22756130993366241, + "learning_rate": 6.749708057609966e-05, + "loss": 1.0287, + "step": 1708 + }, + { + "epoch": 0.6641406781307685, + "grad_norm": 0.2052423506975174, + "learning_rate": 6.741922927209031e-05, + "loss": 1.0069, + "step": 1709 + }, + { + "epoch": 0.6645292917516759, + "grad_norm": 0.1988023817539215, + "learning_rate": 6.734137796808096e-05, + "loss": 0.9761, + "step": 1710 + }, + { + "epoch": 0.6649179053725833, + "grad_norm": 0.20491188764572144, + "learning_rate": 6.726352666407163e-05, + "loss": 0.9767, + "step": 1711 + }, + { + "epoch": 0.6653065189934907, + "grad_norm": 0.18790274858474731, + "learning_rate": 6.718567536006228e-05, + "loss": 0.9944, + "step": 1712 + }, + { + "epoch": 0.6656951326143982, + "grad_norm": 0.19979891180992126, + "learning_rate": 6.710782405605293e-05, + "loss": 1.0842, + "step": 1713 + }, + { + "epoch": 0.6660837462353055, + "grad_norm": 0.22204813361167908, + "learning_rate": 6.70299727520436e-05, + "loss": 1.0561, + "step": 1714 + }, + { + "epoch": 
0.666472359856213, + "grad_norm": 0.20182965695858002, + "learning_rate": 6.695212144803425e-05, + "loss": 1.0015, + "step": 1715 + }, + { + "epoch": 0.6668609734771204, + "grad_norm": 0.20719997584819794, + "learning_rate": 6.687427014402492e-05, + "loss": 1.0144, + "step": 1716 + }, + { + "epoch": 0.6672495870980278, + "grad_norm": 0.1944626122713089, + "learning_rate": 6.679641884001557e-05, + "loss": 1.0083, + "step": 1717 + }, + { + "epoch": 0.6676382007189352, + "grad_norm": 0.2072264701128006, + "learning_rate": 6.671856753600622e-05, + "loss": 1.0246, + "step": 1718 + }, + { + "epoch": 0.6680268143398426, + "grad_norm": 0.2134973257780075, + "learning_rate": 6.664071623199689e-05, + "loss": 1.0926, + "step": 1719 + }, + { + "epoch": 0.66841542796075, + "grad_norm": 0.2119186669588089, + "learning_rate": 6.656286492798754e-05, + "loss": 1.0129, + "step": 1720 + }, + { + "epoch": 0.6688040415816574, + "grad_norm": 0.21205540001392365, + "learning_rate": 6.64850136239782e-05, + "loss": 1.0611, + "step": 1721 + }, + { + "epoch": 0.6691926552025649, + "grad_norm": 0.21632088720798492, + "learning_rate": 6.640716231996886e-05, + "loss": 1.0821, + "step": 1722 + }, + { + "epoch": 0.6695812688234722, + "grad_norm": 0.21734434366226196, + "learning_rate": 6.632931101595952e-05, + "loss": 1.0821, + "step": 1723 + }, + { + "epoch": 0.6699698824443797, + "grad_norm": 0.2030603289604187, + "learning_rate": 6.625145971195017e-05, + "loss": 0.9976, + "step": 1724 + }, + { + "epoch": 0.6703584960652871, + "grad_norm": 0.19921456277370453, + "learning_rate": 6.617360840794084e-05, + "loss": 0.9187, + "step": 1725 + }, + { + "epoch": 0.6707471096861946, + "grad_norm": 0.20548826456069946, + "learning_rate": 6.60957571039315e-05, + "loss": 1.0486, + "step": 1726 + }, + { + "epoch": 0.6711357233071019, + "grad_norm": 0.21784676611423492, + "learning_rate": 6.601790579992216e-05, + "loss": 1.1089, + "step": 1727 + }, + { + "epoch": 0.6715243369280093, + "grad_norm": 
0.2137753963470459, + "learning_rate": 6.594005449591281e-05, + "loss": 1.0075, + "step": 1728 + }, + { + "epoch": 0.6719129505489168, + "grad_norm": 0.20200639963150024, + "learning_rate": 6.586220319190348e-05, + "loss": 0.9915, + "step": 1729 + }, + { + "epoch": 0.6723015641698241, + "grad_norm": 0.20898796617984772, + "learning_rate": 6.578435188789413e-05, + "loss": 1.0292, + "step": 1730 + }, + { + "epoch": 0.6726901777907316, + "grad_norm": 0.22515977919101715, + "learning_rate": 6.570650058388478e-05, + "loss": 1.0118, + "step": 1731 + }, + { + "epoch": 0.673078791411639, + "grad_norm": 0.2132793813943863, + "learning_rate": 6.562864927987545e-05, + "loss": 1.1097, + "step": 1732 + }, + { + "epoch": 0.6734674050325464, + "grad_norm": 0.20358797907829285, + "learning_rate": 6.55507979758661e-05, + "loss": 1.0241, + "step": 1733 + }, + { + "epoch": 0.6738560186534538, + "grad_norm": 0.21155016124248505, + "learning_rate": 6.547294667185676e-05, + "loss": 1.0235, + "step": 1734 + }, + { + "epoch": 0.6742446322743613, + "grad_norm": 0.198009192943573, + "learning_rate": 6.539509536784741e-05, + "loss": 0.9542, + "step": 1735 + }, + { + "epoch": 0.6746332458952686, + "grad_norm": 0.20318005979061127, + "learning_rate": 6.531724406383807e-05, + "loss": 0.9993, + "step": 1736 + }, + { + "epoch": 0.675021859516176, + "grad_norm": 0.21384860575199127, + "learning_rate": 6.523939275982873e-05, + "loss": 1.1188, + "step": 1737 + }, + { + "epoch": 0.6754104731370835, + "grad_norm": 0.18736955523490906, + "learning_rate": 6.516154145581938e-05, + "loss": 0.9832, + "step": 1738 + }, + { + "epoch": 0.6757990867579908, + "grad_norm": 0.2002391368150711, + "learning_rate": 6.508369015181005e-05, + "loss": 1.0288, + "step": 1739 + }, + { + "epoch": 0.6761877003788983, + "grad_norm": 0.20011006295681, + "learning_rate": 6.50058388478007e-05, + "loss": 0.9588, + "step": 1740 + }, + { + "epoch": 0.6765763139998057, + "grad_norm": 0.20782291889190674, + "learning_rate": 
6.492798754379135e-05, + "loss": 1.0033, + "step": 1741 + }, + { + "epoch": 0.6769649276207131, + "grad_norm": 0.2056814581155777, + "learning_rate": 6.485013623978202e-05, + "loss": 1.0648, + "step": 1742 + }, + { + "epoch": 0.6773535412416205, + "grad_norm": 0.2207457572221756, + "learning_rate": 6.477228493577267e-05, + "loss": 1.0758, + "step": 1743 + }, + { + "epoch": 0.677742154862528, + "grad_norm": 0.20437198877334595, + "learning_rate": 6.469443363176334e-05, + "loss": 1.0253, + "step": 1744 + }, + { + "epoch": 0.6781307684834353, + "grad_norm": 0.198721781373024, + "learning_rate": 6.461658232775399e-05, + "loss": 1.0087, + "step": 1745 + }, + { + "epoch": 0.6785193821043427, + "grad_norm": 0.22781015932559967, + "learning_rate": 6.453873102374464e-05, + "loss": 1.0692, + "step": 1746 + }, + { + "epoch": 0.6789079957252502, + "grad_norm": 0.21826857328414917, + "learning_rate": 6.446087971973531e-05, + "loss": 1.0232, + "step": 1747 + }, + { + "epoch": 0.6792966093461575, + "grad_norm": 0.2156928926706314, + "learning_rate": 6.438302841572596e-05, + "loss": 1.0686, + "step": 1748 + }, + { + "epoch": 0.679685222967065, + "grad_norm": 0.2161693125963211, + "learning_rate": 6.430517711171662e-05, + "loss": 1.0298, + "step": 1749 + }, + { + "epoch": 0.6800738365879724, + "grad_norm": 0.19139425456523895, + "learning_rate": 6.422732580770729e-05, + "loss": 0.9545, + "step": 1750 + }, + { + "epoch": 0.6804624502088799, + "grad_norm": 0.22626161575317383, + "learning_rate": 6.414947450369794e-05, + "loss": 1.0669, + "step": 1751 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.2135801464319229, + "learning_rate": 6.407162319968861e-05, + "loss": 1.0187, + "step": 1752 + }, + { + "epoch": 0.6812396774506947, + "grad_norm": 0.20803681015968323, + "learning_rate": 6.399377189567926e-05, + "loss": 1.0856, + "step": 1753 + }, + { + "epoch": 0.681628291071602, + "grad_norm": 0.21317154169082642, + "learning_rate": 6.391592059166991e-05, + "loss": 1.1018, + 
"step": 1754 + }, + { + "epoch": 0.6820169046925094, + "grad_norm": 0.20877891778945923, + "learning_rate": 6.383806928766058e-05, + "loss": 1.0383, + "step": 1755 + }, + { + "epoch": 0.6824055183134169, + "grad_norm": 0.20769146084785461, + "learning_rate": 6.376021798365123e-05, + "loss": 1.0852, + "step": 1756 + }, + { + "epoch": 0.6827941319343243, + "grad_norm": 0.2252657413482666, + "learning_rate": 6.36823666796419e-05, + "loss": 1.0749, + "step": 1757 + }, + { + "epoch": 0.6831827455552317, + "grad_norm": 0.24453257024288177, + "learning_rate": 6.360451537563255e-05, + "loss": 1.1042, + "step": 1758 + }, + { + "epoch": 0.6835713591761391, + "grad_norm": 0.2082965075969696, + "learning_rate": 6.35266640716232e-05, + "loss": 1.0729, + "step": 1759 + }, + { + "epoch": 0.6839599727970466, + "grad_norm": 0.20121856033802032, + "learning_rate": 6.344881276761387e-05, + "loss": 1.038, + "step": 1760 + }, + { + "epoch": 0.6843485864179539, + "grad_norm": 0.20096386969089508, + "learning_rate": 6.337096146360452e-05, + "loss": 0.9655, + "step": 1761 + }, + { + "epoch": 0.6847372000388614, + "grad_norm": 0.20015959441661835, + "learning_rate": 6.329311015959518e-05, + "loss": 1.0187, + "step": 1762 + }, + { + "epoch": 0.6851258136597688, + "grad_norm": 0.21056395769119263, + "learning_rate": 6.321525885558583e-05, + "loss": 1.0567, + "step": 1763 + }, + { + "epoch": 0.6855144272806762, + "grad_norm": 0.2211030125617981, + "learning_rate": 6.313740755157649e-05, + "loss": 1.0588, + "step": 1764 + }, + { + "epoch": 0.6859030409015836, + "grad_norm": 0.20809797942638397, + "learning_rate": 6.305955624756715e-05, + "loss": 0.9488, + "step": 1765 + }, + { + "epoch": 0.686291654522491, + "grad_norm": 0.2331530600786209, + "learning_rate": 6.29817049435578e-05, + "loss": 1.0789, + "step": 1766 + }, + { + "epoch": 0.6866802681433984, + "grad_norm": 0.21708674728870392, + "learning_rate": 6.290385363954846e-05, + "loss": 1.0518, + "step": 1767 + }, + { + "epoch": 
0.6870688817643058, + "grad_norm": 0.2088184356689453, + "learning_rate": 6.282600233553912e-05, + "loss": 1.0178, + "step": 1768 + }, + { + "epoch": 0.6874574953852133, + "grad_norm": 0.20285943150520325, + "learning_rate": 6.274815103152977e-05, + "loss": 1.018, + "step": 1769 + }, + { + "epoch": 0.6878461090061206, + "grad_norm": 0.211436927318573, + "learning_rate": 6.267029972752044e-05, + "loss": 1.0572, + "step": 1770 + }, + { + "epoch": 0.6882347226270281, + "grad_norm": 0.21108384430408478, + "learning_rate": 6.259244842351109e-05, + "loss": 1.0227, + "step": 1771 + }, + { + "epoch": 0.6886233362479355, + "grad_norm": 0.2060437649488449, + "learning_rate": 6.251459711950174e-05, + "loss": 1.0251, + "step": 1772 + }, + { + "epoch": 0.689011949868843, + "grad_norm": 0.20819245278835297, + "learning_rate": 6.243674581549241e-05, + "loss": 1.0643, + "step": 1773 + }, + { + "epoch": 0.6894005634897503, + "grad_norm": 0.2172113060951233, + "learning_rate": 6.235889451148306e-05, + "loss": 1.0869, + "step": 1774 + }, + { + "epoch": 0.6897891771106577, + "grad_norm": 0.2087356299161911, + "learning_rate": 6.228104320747373e-05, + "loss": 1.0622, + "step": 1775 + }, + { + "epoch": 0.6901777907315652, + "grad_norm": 0.1958473175764084, + "learning_rate": 6.220319190346439e-05, + "loss": 0.9542, + "step": 1776 + }, + { + "epoch": 0.6905664043524725, + "grad_norm": 0.23630915582180023, + "learning_rate": 6.212534059945504e-05, + "loss": 1.0535, + "step": 1777 + }, + { + "epoch": 0.69095501797338, + "grad_norm": 0.2127649188041687, + "learning_rate": 6.204748929544571e-05, + "loss": 0.972, + "step": 1778 + }, + { + "epoch": 0.6913436315942874, + "grad_norm": 0.19873055815696716, + "learning_rate": 6.196963799143636e-05, + "loss": 0.9969, + "step": 1779 + }, + { + "epoch": 0.6917322452151948, + "grad_norm": 0.2013067901134491, + "learning_rate": 6.189178668742703e-05, + "loss": 1.0399, + "step": 1780 + }, + { + "epoch": 0.6921208588361022, + "grad_norm": 
0.21300987899303436, + "learning_rate": 6.181393538341768e-05, + "loss": 1.0377, + "step": 1781 + }, + { + "epoch": 0.6925094724570097, + "grad_norm": 0.21665994822978973, + "learning_rate": 6.173608407940833e-05, + "loss": 1.008, + "step": 1782 + }, + { + "epoch": 0.692898086077917, + "grad_norm": 0.21622590720653534, + "learning_rate": 6.1658232775399e-05, + "loss": 1.1128, + "step": 1783 + }, + { + "epoch": 0.6932866996988244, + "grad_norm": 0.2000272423028946, + "learning_rate": 6.158038147138965e-05, + "loss": 1.0115, + "step": 1784 + }, + { + "epoch": 0.6936753133197319, + "grad_norm": 0.20774856209754944, + "learning_rate": 6.15025301673803e-05, + "loss": 1.066, + "step": 1785 + }, + { + "epoch": 0.6940639269406392, + "grad_norm": 0.18497461080551147, + "learning_rate": 6.142467886337097e-05, + "loss": 0.9608, + "step": 1786 + }, + { + "epoch": 0.6944525405615467, + "grad_norm": 0.19819007813930511, + "learning_rate": 6.134682755936162e-05, + "loss": 1.0114, + "step": 1787 + }, + { + "epoch": 0.6948411541824541, + "grad_norm": 0.22013314068317413, + "learning_rate": 6.126897625535229e-05, + "loss": 0.976, + "step": 1788 + }, + { + "epoch": 0.6952297678033615, + "grad_norm": 0.2066160887479782, + "learning_rate": 6.119112495134294e-05, + "loss": 1.0585, + "step": 1789 + }, + { + "epoch": 0.6956183814242689, + "grad_norm": 0.21364475786685944, + "learning_rate": 6.111327364733359e-05, + "loss": 1.0842, + "step": 1790 + }, + { + "epoch": 0.6960069950451764, + "grad_norm": 0.19731444120407104, + "learning_rate": 6.103542234332425e-05, + "loss": 0.9936, + "step": 1791 + }, + { + "epoch": 0.6963956086660837, + "grad_norm": 0.2162671983242035, + "learning_rate": 6.095757103931491e-05, + "loss": 1.0446, + "step": 1792 + }, + { + "epoch": 0.6967842222869911, + "grad_norm": 0.21486608684062958, + "learning_rate": 6.087971973530557e-05, + "loss": 1.0441, + "step": 1793 + }, + { + "epoch": 0.6971728359078986, + "grad_norm": 0.20850563049316406, + "learning_rate": 
6.0801868431296224e-05, + "loss": 1.0431, + "step": 1794 + }, + { + "epoch": 0.6975614495288059, + "grad_norm": 0.20492027699947357, + "learning_rate": 6.072401712728688e-05, + "loss": 0.9845, + "step": 1795 + }, + { + "epoch": 0.6979500631497134, + "grad_norm": 0.1986648142337799, + "learning_rate": 6.064616582327754e-05, + "loss": 0.9855, + "step": 1796 + }, + { + "epoch": 0.6983386767706208, + "grad_norm": 0.20606310665607452, + "learning_rate": 6.05683145192682e-05, + "loss": 1.0608, + "step": 1797 + }, + { + "epoch": 0.6987272903915283, + "grad_norm": 0.20496073365211487, + "learning_rate": 6.0490463215258867e-05, + "loss": 1.0311, + "step": 1798 + }, + { + "epoch": 0.6991159040124356, + "grad_norm": 0.2153409719467163, + "learning_rate": 6.041261191124952e-05, + "loss": 1.0394, + "step": 1799 + }, + { + "epoch": 0.6995045176333431, + "grad_norm": 0.21410655975341797, + "learning_rate": 6.033476060724017e-05, + "loss": 1.0229, + "step": 1800 + }, + { + "epoch": 0.6998931312542505, + "grad_norm": 0.20418782532215118, + "learning_rate": 6.0256909303230836e-05, + "loss": 1.0382, + "step": 1801 + }, + { + "epoch": 0.7002817448751578, + "grad_norm": 0.19154146313667297, + "learning_rate": 6.017905799922149e-05, + "loss": 0.9891, + "step": 1802 + }, + { + "epoch": 0.7006703584960653, + "grad_norm": 0.19138328731060028, + "learning_rate": 6.010120669521214e-05, + "loss": 0.9638, + "step": 1803 + }, + { + "epoch": 0.7010589721169727, + "grad_norm": 0.19704872369766235, + "learning_rate": 6.0023355391202806e-05, + "loss": 0.9835, + "step": 1804 + }, + { + "epoch": 0.7014475857378801, + "grad_norm": 0.2175600379705429, + "learning_rate": 5.994550408719346e-05, + "loss": 1.1192, + "step": 1805 + }, + { + "epoch": 0.7018361993587875, + "grad_norm": 0.21614274382591248, + "learning_rate": 5.9867652783184124e-05, + "loss": 1.0877, + "step": 1806 + }, + { + "epoch": 0.702224812979695, + "grad_norm": 0.20461414754390717, + "learning_rate": 5.9789801479174776e-05, + "loss": 
0.9706, + "step": 1807 + }, + { + "epoch": 0.7026134266006023, + "grad_norm": 0.1989748477935791, + "learning_rate": 5.9711950175165434e-05, + "loss": 1.0004, + "step": 1808 + }, + { + "epoch": 0.7030020402215098, + "grad_norm": 0.21304792165756226, + "learning_rate": 5.963409887115609e-05, + "loss": 1.0177, + "step": 1809 + }, + { + "epoch": 0.7033906538424172, + "grad_norm": 0.19023855030536652, + "learning_rate": 5.955624756714675e-05, + "loss": 0.9759, + "step": 1810 + }, + { + "epoch": 0.7037792674633246, + "grad_norm": 0.21915188431739807, + "learning_rate": 5.947839626313742e-05, + "loss": 1.0621, + "step": 1811 + }, + { + "epoch": 0.704167881084232, + "grad_norm": 0.21626822650432587, + "learning_rate": 5.940054495912807e-05, + "loss": 1.0144, + "step": 1812 + }, + { + "epoch": 0.7045564947051394, + "grad_norm": 0.20742040872573853, + "learning_rate": 5.932269365511872e-05, + "loss": 0.9778, + "step": 1813 + }, + { + "epoch": 0.7049451083260468, + "grad_norm": 0.2172158658504486, + "learning_rate": 5.924484235110939e-05, + "loss": 1.0416, + "step": 1814 + }, + { + "epoch": 0.7053337219469542, + "grad_norm": 0.209465891122818, + "learning_rate": 5.916699104710004e-05, + "loss": 1.0378, + "step": 1815 + }, + { + "epoch": 0.7057223355678617, + "grad_norm": 0.2097882628440857, + "learning_rate": 5.9089139743090705e-05, + "loss": 1.0166, + "step": 1816 + }, + { + "epoch": 0.706110949188769, + "grad_norm": 0.2251904308795929, + "learning_rate": 5.901128843908136e-05, + "loss": 1.0783, + "step": 1817 + }, + { + "epoch": 0.7064995628096765, + "grad_norm": 0.1952916979789734, + "learning_rate": 5.893343713507201e-05, + "loss": 0.993, + "step": 1818 + }, + { + "epoch": 0.7068881764305839, + "grad_norm": 0.20997455716133118, + "learning_rate": 5.8855585831062675e-05, + "loss": 1.0448, + "step": 1819 + }, + { + "epoch": 0.7072767900514914, + "grad_norm": 0.20070020854473114, + "learning_rate": 5.877773452705333e-05, + "loss": 0.9603, + "step": 1820 + }, + { + "epoch": 
0.7076654036723987, + "grad_norm": 0.25765034556388855, + "learning_rate": 5.869988322304399e-05, + "loss": 1.0361, + "step": 1821 + }, + { + "epoch": 0.7080540172933061, + "grad_norm": 0.21948982775211334, + "learning_rate": 5.862203191903465e-05, + "loss": 1.0668, + "step": 1822 + }, + { + "epoch": 0.7084426309142136, + "grad_norm": 0.1867108792066574, + "learning_rate": 5.85441806150253e-05, + "loss": 0.9372, + "step": 1823 + }, + { + "epoch": 0.7088312445351209, + "grad_norm": 0.2037520408630371, + "learning_rate": 5.846632931101597e-05, + "loss": 0.9905, + "step": 1824 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 0.21352072060108185, + "learning_rate": 5.838847800700662e-05, + "loss": 1.0514, + "step": 1825 + }, + { + "epoch": 0.7096084717769358, + "grad_norm": 0.1949845850467682, + "learning_rate": 5.831062670299727e-05, + "loss": 0.9636, + "step": 1826 + }, + { + "epoch": 0.7099970853978432, + "grad_norm": 0.2092294692993164, + "learning_rate": 5.823277539898794e-05, + "loss": 1.0361, + "step": 1827 + }, + { + "epoch": 0.7103856990187506, + "grad_norm": 0.20054267346858978, + "learning_rate": 5.815492409497859e-05, + "loss": 1.0195, + "step": 1828 + }, + { + "epoch": 0.7107743126396581, + "grad_norm": 0.2202107012271881, + "learning_rate": 5.8077072790969256e-05, + "loss": 1.0918, + "step": 1829 + }, + { + "epoch": 0.7111629262605654, + "grad_norm": 0.2001042366027832, + "learning_rate": 5.799922148695991e-05, + "loss": 1.0142, + "step": 1830 + }, + { + "epoch": 0.7115515398814728, + "grad_norm": 0.2102631777524948, + "learning_rate": 5.792137018295056e-05, + "loss": 1.0231, + "step": 1831 + }, + { + "epoch": 0.7119401535023803, + "grad_norm": 0.21717461943626404, + "learning_rate": 5.7843518878941226e-05, + "loss": 1.0295, + "step": 1832 + }, + { + "epoch": 0.7123287671232876, + "grad_norm": 0.2001933753490448, + "learning_rate": 5.776566757493188e-05, + "loss": 1.022, + "step": 1833 + }, + { + "epoch": 0.7127173807441951, + "grad_norm": 
0.2218201756477356, + "learning_rate": 5.7687816270922544e-05, + "loss": 1.0762, + "step": 1834 + }, + { + "epoch": 0.7131059943651025, + "grad_norm": 0.20680001378059387, + "learning_rate": 5.76099649669132e-05, + "loss": 1.0017, + "step": 1835 + }, + { + "epoch": 0.7134946079860099, + "grad_norm": 0.21511508524417877, + "learning_rate": 5.7532113662903854e-05, + "loss": 1.048, + "step": 1836 + }, + { + "epoch": 0.7138832216069173, + "grad_norm": 0.19720061123371124, + "learning_rate": 5.745426235889452e-05, + "loss": 0.9983, + "step": 1837 + }, + { + "epoch": 0.7142718352278248, + "grad_norm": 0.2005409449338913, + "learning_rate": 5.737641105488517e-05, + "loss": 0.9941, + "step": 1838 + }, + { + "epoch": 0.7146604488487321, + "grad_norm": 0.2222924679517746, + "learning_rate": 5.729855975087584e-05, + "loss": 1.0476, + "step": 1839 + }, + { + "epoch": 0.7150490624696395, + "grad_norm": 0.21131208539009094, + "learning_rate": 5.722070844686649e-05, + "loss": 1.0208, + "step": 1840 + }, + { + "epoch": 0.715437676090547, + "grad_norm": 0.2307305932044983, + "learning_rate": 5.714285714285714e-05, + "loss": 0.9867, + "step": 1841 + }, + { + "epoch": 0.7158262897114543, + "grad_norm": 0.1974973827600479, + "learning_rate": 5.706500583884781e-05, + "loss": 1.0285, + "step": 1842 + }, + { + "epoch": 0.7162149033323618, + "grad_norm": 0.2006559520959854, + "learning_rate": 5.698715453483846e-05, + "loss": 1.024, + "step": 1843 + }, + { + "epoch": 0.7166035169532692, + "grad_norm": 0.21160584688186646, + "learning_rate": 5.690930323082911e-05, + "loss": 1.0256, + "step": 1844 + }, + { + "epoch": 0.7169921305741767, + "grad_norm": 0.28184664249420166, + "learning_rate": 5.683145192681978e-05, + "loss": 1.0443, + "step": 1845 + }, + { + "epoch": 0.717380744195084, + "grad_norm": 0.2206653356552124, + "learning_rate": 5.675360062281043e-05, + "loss": 1.0458, + "step": 1846 + }, + { + "epoch": 0.7177693578159915, + "grad_norm": 0.21346066892147064, + "learning_rate": 
5.6675749318801095e-05, + "loss": 1.0106, + "step": 1847 + }, + { + "epoch": 0.7181579714368989, + "grad_norm": 0.20931747555732727, + "learning_rate": 5.6597898014791753e-05, + "loss": 0.9831, + "step": 1848 + }, + { + "epoch": 0.7185465850578063, + "grad_norm": 0.2026771456003189, + "learning_rate": 5.6520046710782406e-05, + "loss": 1.0162, + "step": 1849 + }, + { + "epoch": 0.7189351986787137, + "grad_norm": 0.21388716995716095, + "learning_rate": 5.644219540677307e-05, + "loss": 1.0867, + "step": 1850 + }, + { + "epoch": 0.7193238122996211, + "grad_norm": 0.2039308398962021, + "learning_rate": 5.636434410276372e-05, + "loss": 1.0325, + "step": 1851 + }, + { + "epoch": 0.7197124259205285, + "grad_norm": 0.21741114556789398, + "learning_rate": 5.628649279875439e-05, + "loss": 1.0251, + "step": 1852 + }, + { + "epoch": 0.7201010395414359, + "grad_norm": 0.21343208849430084, + "learning_rate": 5.620864149474504e-05, + "loss": 1.0766, + "step": 1853 + }, + { + "epoch": 0.7204896531623434, + "grad_norm": 0.21712560951709747, + "learning_rate": 5.613079019073569e-05, + "loss": 1.0643, + "step": 1854 + }, + { + "epoch": 0.7208782667832507, + "grad_norm": 0.2176978886127472, + "learning_rate": 5.605293888672636e-05, + "loss": 1.0375, + "step": 1855 + }, + { + "epoch": 0.7212668804041582, + "grad_norm": 0.2065533846616745, + "learning_rate": 5.597508758271701e-05, + "loss": 1.0385, + "step": 1856 + }, + { + "epoch": 0.7216554940250656, + "grad_norm": 0.2169170081615448, + "learning_rate": 5.5897236278707676e-05, + "loss": 1.0197, + "step": 1857 + }, + { + "epoch": 0.722044107645973, + "grad_norm": 0.2047201544046402, + "learning_rate": 5.581938497469833e-05, + "loss": 0.9794, + "step": 1858 + }, + { + "epoch": 0.7224327212668804, + "grad_norm": 0.20898981392383575, + "learning_rate": 5.574153367068898e-05, + "loss": 1.032, + "step": 1859 + }, + { + "epoch": 0.7228213348877878, + "grad_norm": 0.2090533971786499, + "learning_rate": 5.5663682366679646e-05, + "loss": 1.0694, 
+ "step": 1860 + }, + { + "epoch": 0.7232099485086952, + "grad_norm": 0.21963149309158325, + "learning_rate": 5.5585831062670305e-05, + "loss": 1.0367, + "step": 1861 + }, + { + "epoch": 0.7235985621296026, + "grad_norm": 0.1974373459815979, + "learning_rate": 5.550797975866096e-05, + "loss": 1.0402, + "step": 1862 + }, + { + "epoch": 0.7239871757505101, + "grad_norm": 0.1924194097518921, + "learning_rate": 5.543012845465162e-05, + "loss": 0.9647, + "step": 1863 + }, + { + "epoch": 0.7243757893714174, + "grad_norm": 0.21366077661514282, + "learning_rate": 5.5352277150642274e-05, + "loss": 1.0139, + "step": 1864 + }, + { + "epoch": 0.7247644029923249, + "grad_norm": 0.21722929179668427, + "learning_rate": 5.527442584663294e-05, + "loss": 1.0366, + "step": 1865 + }, + { + "epoch": 0.7251530166132323, + "grad_norm": 0.20646587014198303, + "learning_rate": 5.519657454262359e-05, + "loss": 1.0465, + "step": 1866 + }, + { + "epoch": 0.7255416302341398, + "grad_norm": 0.19144394993782043, + "learning_rate": 5.5118723238614244e-05, + "loss": 0.9645, + "step": 1867 + }, + { + "epoch": 0.7259302438550471, + "grad_norm": 0.19553838670253754, + "learning_rate": 5.504087193460491e-05, + "loss": 0.98, + "step": 1868 + }, + { + "epoch": 0.7263188574759545, + "grad_norm": 0.21739792823791504, + "learning_rate": 5.496302063059556e-05, + "loss": 1.002, + "step": 1869 + }, + { + "epoch": 0.726707471096862, + "grad_norm": 0.1910562962293625, + "learning_rate": 5.488516932658623e-05, + "loss": 0.985, + "step": 1870 + }, + { + "epoch": 0.7270960847177693, + "grad_norm": 0.2133384346961975, + "learning_rate": 5.480731802257688e-05, + "loss": 1.0325, + "step": 1871 + }, + { + "epoch": 0.7274846983386768, + "grad_norm": 0.21884119510650635, + "learning_rate": 5.472946671856753e-05, + "loss": 1.0412, + "step": 1872 + }, + { + "epoch": 0.7278733119595842, + "grad_norm": 0.21069306135177612, + "learning_rate": 5.46516154145582e-05, + "loss": 1.0474, + "step": 1873 + }, + { + "epoch": 
0.7282619255804916, + "grad_norm": 0.19266243278980255, + "learning_rate": 5.4573764110548856e-05, + "loss": 0.9941, + "step": 1874 + }, + { + "epoch": 0.728650539201399, + "grad_norm": 0.21255099773406982, + "learning_rate": 5.4495912806539515e-05, + "loss": 1.0211, + "step": 1875 + }, + { + "epoch": 0.7290391528223065, + "grad_norm": 0.1924402117729187, + "learning_rate": 5.4418061502530173e-05, + "loss": 1.0117, + "step": 1876 + }, + { + "epoch": 0.7294277664432138, + "grad_norm": 0.2019895315170288, + "learning_rate": 5.4340210198520825e-05, + "loss": 0.9921, + "step": 1877 + }, + { + "epoch": 0.7298163800641212, + "grad_norm": 0.20398026704788208, + "learning_rate": 5.426235889451149e-05, + "loss": 1.0423, + "step": 1878 + }, + { + "epoch": 0.7302049936850287, + "grad_norm": 0.20153217017650604, + "learning_rate": 5.418450759050214e-05, + "loss": 1.0333, + "step": 1879 + }, + { + "epoch": 0.730593607305936, + "grad_norm": 0.21259640157222748, + "learning_rate": 5.4106656286492795e-05, + "loss": 1.0689, + "step": 1880 + }, + { + "epoch": 0.7309822209268435, + "grad_norm": 0.2037276029586792, + "learning_rate": 5.402880498248346e-05, + "loss": 1.0203, + "step": 1881 + }, + { + "epoch": 0.7313708345477509, + "grad_norm": 0.19976729154586792, + "learning_rate": 5.395095367847411e-05, + "loss": 1.0173, + "step": 1882 + }, + { + "epoch": 0.7317594481686583, + "grad_norm": 0.20481806993484497, + "learning_rate": 5.387310237446478e-05, + "loss": 0.9864, + "step": 1883 + }, + { + "epoch": 0.7321480617895657, + "grad_norm": 0.21900932490825653, + "learning_rate": 5.379525107045543e-05, + "loss": 1.0519, + "step": 1884 + }, + { + "epoch": 0.7325366754104732, + "grad_norm": 0.200319305062294, + "learning_rate": 5.371739976644609e-05, + "loss": 1.0834, + "step": 1885 + }, + { + "epoch": 0.7329252890313805, + "grad_norm": 0.19662296772003174, + "learning_rate": 5.363954846243675e-05, + "loss": 0.9794, + "step": 1886 + }, + { + "epoch": 0.7333139026522879, + "grad_norm": 
0.2113952785730362, + "learning_rate": 5.356169715842741e-05, + "loss": 1.0763, + "step": 1887 + }, + { + "epoch": 0.7337025162731954, + "grad_norm": 0.21348755061626434, + "learning_rate": 5.3483845854418066e-05, + "loss": 1.0781, + "step": 1888 + }, + { + "epoch": 0.7340911298941027, + "grad_norm": 0.20673702657222748, + "learning_rate": 5.3405994550408725e-05, + "loss": 1.0513, + "step": 1889 + }, + { + "epoch": 0.7344797435150102, + "grad_norm": 0.210855171084404, + "learning_rate": 5.332814324639938e-05, + "loss": 0.9972, + "step": 1890 + }, + { + "epoch": 0.7348683571359176, + "grad_norm": 0.2136204093694687, + "learning_rate": 5.325029194239004e-05, + "loss": 1.03, + "step": 1891 + }, + { + "epoch": 0.7352569707568251, + "grad_norm": 0.20035260915756226, + "learning_rate": 5.3172440638380694e-05, + "loss": 0.9739, + "step": 1892 + }, + { + "epoch": 0.7356455843777324, + "grad_norm": 0.1943352371454239, + "learning_rate": 5.309458933437136e-05, + "loss": 0.9411, + "step": 1893 + }, + { + "epoch": 0.7360341979986399, + "grad_norm": 0.3994326889514923, + "learning_rate": 5.301673803036201e-05, + "loss": 1.0714, + "step": 1894 + }, + { + "epoch": 0.7364228116195473, + "grad_norm": 0.21691356599330902, + "learning_rate": 5.2938886726352664e-05, + "loss": 1.0648, + "step": 1895 + }, + { + "epoch": 0.7368114252404547, + "grad_norm": 0.19853095710277557, + "learning_rate": 5.286103542234333e-05, + "loss": 0.983, + "step": 1896 + }, + { + "epoch": 0.7372000388613621, + "grad_norm": 0.21836897730827332, + "learning_rate": 5.278318411833398e-05, + "loss": 1.0396, + "step": 1897 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 0.19596605002880096, + "learning_rate": 5.270533281432464e-05, + "loss": 0.9593, + "step": 1898 + }, + { + "epoch": 0.7379772661031769, + "grad_norm": 0.2141752541065216, + "learning_rate": 5.26274815103153e-05, + "loss": 1.0373, + "step": 1899 + }, + { + "epoch": 0.7383658797240843, + "grad_norm": 0.20552939176559448, + "learning_rate": 
5.254963020630596e-05, + "loss": 1.0352, + "step": 1900 + }, + { + "epoch": 0.7387544933449918, + "grad_norm": 0.2095794975757599, + "learning_rate": 5.247177890229662e-05, + "loss": 1.0632, + "step": 1901 + }, + { + "epoch": 0.7391431069658991, + "grad_norm": 0.19894710183143616, + "learning_rate": 5.2393927598287276e-05, + "loss": 0.9886, + "step": 1902 + }, + { + "epoch": 0.7395317205868066, + "grad_norm": 0.22996319830417633, + "learning_rate": 5.231607629427793e-05, + "loss": 1.0826, + "step": 1903 + }, + { + "epoch": 0.739920334207714, + "grad_norm": 0.21416957676410675, + "learning_rate": 5.2238224990268593e-05, + "loss": 1.0161, + "step": 1904 + }, + { + "epoch": 0.7403089478286214, + "grad_norm": 0.21819345653057098, + "learning_rate": 5.2160373686259245e-05, + "loss": 1.0458, + "step": 1905 + }, + { + "epoch": 0.7406975614495288, + "grad_norm": 0.21327044069766998, + "learning_rate": 5.208252238224991e-05, + "loss": 1.0721, + "step": 1906 + }, + { + "epoch": 0.7410861750704362, + "grad_norm": 0.21436645090579987, + "learning_rate": 5.200467107824056e-05, + "loss": 1.0743, + "step": 1907 + }, + { + "epoch": 0.7414747886913436, + "grad_norm": 0.215640127658844, + "learning_rate": 5.1926819774231215e-05, + "loss": 1.0274, + "step": 1908 + }, + { + "epoch": 0.741863402312251, + "grad_norm": 0.2043589949607849, + "learning_rate": 5.184896847022188e-05, + "loss": 1.0618, + "step": 1909 + }, + { + "epoch": 0.7422520159331585, + "grad_norm": 0.2014230340719223, + "learning_rate": 5.177111716621253e-05, + "loss": 0.9892, + "step": 1910 + }, + { + "epoch": 0.7426406295540658, + "grad_norm": 0.19954468309879303, + "learning_rate": 5.16932658622032e-05, + "loss": 0.9815, + "step": 1911 + }, + { + "epoch": 0.7430292431749733, + "grad_norm": 0.23119708895683289, + "learning_rate": 5.161541455819385e-05, + "loss": 1.0783, + "step": 1912 + }, + { + "epoch": 0.7434178567958807, + "grad_norm": 0.20650482177734375, + "learning_rate": 5.153756325418451e-05, + "loss": 1.0162, 
+ "step": 1913 + }, + { + "epoch": 0.7438064704167882, + "grad_norm": 0.20021970570087433, + "learning_rate": 5.145971195017517e-05, + "loss": 1.0062, + "step": 1914 + }, + { + "epoch": 0.7441950840376955, + "grad_norm": 0.23300811648368835, + "learning_rate": 5.138186064616583e-05, + "loss": 1.0049, + "step": 1915 + }, + { + "epoch": 0.7445836976586029, + "grad_norm": 0.23268327116966248, + "learning_rate": 5.130400934215648e-05, + "loss": 1.0138, + "step": 1916 + }, + { + "epoch": 0.7449723112795104, + "grad_norm": 0.20413407683372498, + "learning_rate": 5.1226158038147145e-05, + "loss": 0.9903, + "step": 1917 + }, + { + "epoch": 0.7453609249004177, + "grad_norm": 0.20714978873729706, + "learning_rate": 5.1148306734137797e-05, + "loss": 1.0374, + "step": 1918 + }, + { + "epoch": 0.7457495385213252, + "grad_norm": 0.2000850886106491, + "learning_rate": 5.107045543012846e-05, + "loss": 0.9885, + "step": 1919 + }, + { + "epoch": 0.7461381521422326, + "grad_norm": 0.2054719179868698, + "learning_rate": 5.0992604126119114e-05, + "loss": 1.0551, + "step": 1920 + }, + { + "epoch": 0.74652676576314, + "grad_norm": 0.2351357489824295, + "learning_rate": 5.0914752822109766e-05, + "loss": 1.0693, + "step": 1921 + }, + { + "epoch": 0.7469153793840474, + "grad_norm": 0.22370338439941406, + "learning_rate": 5.083690151810043e-05, + "loss": 0.9781, + "step": 1922 + }, + { + "epoch": 0.7473039930049549, + "grad_norm": 0.18734332919120789, + "learning_rate": 5.0759050214091084e-05, + "loss": 0.9329, + "step": 1923 + }, + { + "epoch": 0.7476926066258622, + "grad_norm": 0.22099906206130981, + "learning_rate": 5.068119891008175e-05, + "loss": 1.0498, + "step": 1924 + }, + { + "epoch": 0.7480812202467696, + "grad_norm": 0.20144490897655487, + "learning_rate": 5.06033476060724e-05, + "loss": 0.9865, + "step": 1925 + }, + { + "epoch": 0.7484698338676771, + "grad_norm": 0.21770039200782776, + "learning_rate": 5.052549630206306e-05, + "loss": 1.0867, + "step": 1926 + }, + { + "epoch": 
0.7488584474885844, + "grad_norm": 0.19649921357631683, + "learning_rate": 5.044764499805372e-05, + "loss": 0.9887, + "step": 1927 + }, + { + "epoch": 0.7492470611094919, + "grad_norm": 0.1940620392560959, + "learning_rate": 5.036979369404438e-05, + "loss": 1.0073, + "step": 1928 + }, + { + "epoch": 0.7496356747303993, + "grad_norm": 0.20987650752067566, + "learning_rate": 5.0291942390035044e-05, + "loss": 1.046, + "step": 1929 + }, + { + "epoch": 0.7500242883513067, + "grad_norm": 0.2116398960351944, + "learning_rate": 5.0214091086025696e-05, + "loss": 1.0423, + "step": 1930 + }, + { + "epoch": 0.7504129019722141, + "grad_norm": 0.18996965885162354, + "learning_rate": 5.013623978201635e-05, + "loss": 0.9822, + "step": 1931 + }, + { + "epoch": 0.7508015155931216, + "grad_norm": 0.20942547917366028, + "learning_rate": 5.005838847800701e-05, + "loss": 1.0472, + "step": 1932 + }, + { + "epoch": 0.751190129214029, + "grad_norm": 0.19006839394569397, + "learning_rate": 4.9980537173997665e-05, + "loss": 0.993, + "step": 1933 + }, + { + "epoch": 0.7515787428349364, + "grad_norm": 0.21508941054344177, + "learning_rate": 4.9902685869988324e-05, + "loss": 1.0406, + "step": 1934 + }, + { + "epoch": 0.7519673564558438, + "grad_norm": 0.1989334225654602, + "learning_rate": 4.982483456597898e-05, + "loss": 0.9997, + "step": 1935 + }, + { + "epoch": 0.7523559700767511, + "grad_norm": 0.19993600249290466, + "learning_rate": 4.974698326196964e-05, + "loss": 1.0139, + "step": 1936 + }, + { + "epoch": 0.7527445836976586, + "grad_norm": 0.20927831530570984, + "learning_rate": 4.9669131957960294e-05, + "loss": 0.995, + "step": 1937 + }, + { + "epoch": 0.753133197318566, + "grad_norm": 0.20963850617408752, + "learning_rate": 4.959128065395095e-05, + "loss": 1.0678, + "step": 1938 + }, + { + "epoch": 0.7535218109394735, + "grad_norm": 0.19523034989833832, + "learning_rate": 4.951342934994161e-05, + "loss": 0.9883, + "step": 1939 + }, + { + "epoch": 0.7539104245603808, + "grad_norm": 
0.21588142216205597, + "learning_rate": 4.943557804593227e-05, + "loss": 1.0398, + "step": 1940 + }, + { + "epoch": 0.7542990381812883, + "grad_norm": 0.19894704222679138, + "learning_rate": 4.935772674192293e-05, + "loss": 1.0125, + "step": 1941 + }, + { + "epoch": 0.7546876518021957, + "grad_norm": 0.2155168056488037, + "learning_rate": 4.927987543791359e-05, + "loss": 1.0447, + "step": 1942 + }, + { + "epoch": 0.7550762654231031, + "grad_norm": 0.212605819106102, + "learning_rate": 4.920202413390425e-05, + "loss": 1.077, + "step": 1943 + }, + { + "epoch": 0.7554648790440105, + "grad_norm": 0.2168148010969162, + "learning_rate": 4.9124172829894906e-05, + "loss": 1.0029, + "step": 1944 + }, + { + "epoch": 0.7558534926649179, + "grad_norm": 0.2020149528980255, + "learning_rate": 4.9046321525885565e-05, + "loss": 1.0684, + "step": 1945 + }, + { + "epoch": 0.7562421062858253, + "grad_norm": 0.21063408255577087, + "learning_rate": 4.8968470221876217e-05, + "loss": 1.0147, + "step": 1946 + }, + { + "epoch": 0.7566307199067327, + "grad_norm": 0.19599388539791107, + "learning_rate": 4.8890618917866875e-05, + "loss": 0.9719, + "step": 1947 + }, + { + "epoch": 0.7570193335276402, + "grad_norm": 0.2158602923154831, + "learning_rate": 4.8812767613857534e-05, + "loss": 1.0439, + "step": 1948 + }, + { + "epoch": 0.7574079471485475, + "grad_norm": 0.21013815701007843, + "learning_rate": 4.873491630984819e-05, + "loss": 1.0319, + "step": 1949 + }, + { + "epoch": 0.757796560769455, + "grad_norm": 0.2020798772573471, + "learning_rate": 4.8657065005838845e-05, + "loss": 1.0037, + "step": 1950 + }, + { + "epoch": 0.7581851743903624, + "grad_norm": 0.21202047169208527, + "learning_rate": 4.8579213701829504e-05, + "loss": 0.9823, + "step": 1951 + }, + { + "epoch": 0.7585737880112698, + "grad_norm": 0.20750083029270172, + "learning_rate": 4.850136239782016e-05, + "loss": 1.0073, + "step": 1952 + }, + { + "epoch": 0.7589624016321772, + "grad_norm": 0.20938372611999512, + 
"learning_rate": 4.842351109381083e-05, + "loss": 1.0326, + "step": 1953 + }, + { + "epoch": 0.7593510152530846, + "grad_norm": 0.21984544396400452, + "learning_rate": 4.834565978980149e-05, + "loss": 1.0363, + "step": 1954 + }, + { + "epoch": 0.759739628873992, + "grad_norm": 0.20306189358234406, + "learning_rate": 4.826780848579214e-05, + "loss": 1.0374, + "step": 1955 + }, + { + "epoch": 0.7601282424948994, + "grad_norm": 0.20631705224514008, + "learning_rate": 4.81899571817828e-05, + "loss": 1.0985, + "step": 1956 + }, + { + "epoch": 0.7605168561158069, + "grad_norm": 0.22092190384864807, + "learning_rate": 4.811210587777346e-05, + "loss": 1.0216, + "step": 1957 + }, + { + "epoch": 0.7609054697367142, + "grad_norm": 0.21419481933116913, + "learning_rate": 4.8034254573764116e-05, + "loss": 1.0327, + "step": 1958 + }, + { + "epoch": 0.7612940833576217, + "grad_norm": 0.1954476237297058, + "learning_rate": 4.795640326975477e-05, + "loss": 1.0139, + "step": 1959 + }, + { + "epoch": 0.7616826969785291, + "grad_norm": 0.21092113852500916, + "learning_rate": 4.7878551965745427e-05, + "loss": 1.0934, + "step": 1960 + }, + { + "epoch": 0.7620713105994366, + "grad_norm": 0.1998988837003708, + "learning_rate": 4.7800700661736085e-05, + "loss": 0.9782, + "step": 1961 + }, + { + "epoch": 0.7624599242203439, + "grad_norm": 0.20410674810409546, + "learning_rate": 4.7722849357726744e-05, + "loss": 1.0186, + "step": 1962 + }, + { + "epoch": 0.7628485378412513, + "grad_norm": 0.25312289595603943, + "learning_rate": 4.76449980537174e-05, + "loss": 1.0103, + "step": 1963 + }, + { + "epoch": 0.7632371514621588, + "grad_norm": 0.20648318529129028, + "learning_rate": 4.7567146749708055e-05, + "loss": 1.0314, + "step": 1964 + }, + { + "epoch": 0.7636257650830661, + "grad_norm": 0.20513702929019928, + "learning_rate": 4.7489295445698714e-05, + "loss": 0.981, + "step": 1965 + }, + { + "epoch": 0.7640143787039736, + "grad_norm": 0.20063039660453796, + "learning_rate": 
4.741144414168938e-05, + "loss": 1.0218, + "step": 1966 + }, + { + "epoch": 0.764402992324881, + "grad_norm": 0.20328521728515625, + "learning_rate": 4.733359283768004e-05, + "loss": 1.0614, + "step": 1967 + }, + { + "epoch": 0.7647916059457884, + "grad_norm": 0.2209623008966446, + "learning_rate": 4.725574153367069e-05, + "loss": 1.0478, + "step": 1968 + }, + { + "epoch": 0.7651802195666958, + "grad_norm": 0.2023559957742691, + "learning_rate": 4.717789022966135e-05, + "loss": 1.0455, + "step": 1969 + }, + { + "epoch": 0.7655688331876033, + "grad_norm": 0.20461297035217285, + "learning_rate": 4.710003892565201e-05, + "loss": 0.9427, + "step": 1970 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.2108335793018341, + "learning_rate": 4.702218762164267e-05, + "loss": 1.0344, + "step": 1971 + }, + { + "epoch": 0.766346060429418, + "grad_norm": 0.20883473753929138, + "learning_rate": 4.6944336317633326e-05, + "loss": 1.0336, + "step": 1972 + }, + { + "epoch": 0.7667346740503255, + "grad_norm": 0.20144741237163544, + "learning_rate": 4.686648501362398e-05, + "loss": 1.0101, + "step": 1973 + }, + { + "epoch": 0.7671232876712328, + "grad_norm": 0.21269328892230988, + "learning_rate": 4.6788633709614637e-05, + "loss": 0.9989, + "step": 1974 + }, + { + "epoch": 0.7675119012921403, + "grad_norm": 0.20673738420009613, + "learning_rate": 4.6710782405605295e-05, + "loss": 1.0235, + "step": 1975 + }, + { + "epoch": 0.7679005149130477, + "grad_norm": 0.1966594159603119, + "learning_rate": 4.6632931101595954e-05, + "loss": 1.0081, + "step": 1976 + }, + { + "epoch": 0.7682891285339551, + "grad_norm": 0.22186829149723053, + "learning_rate": 4.6555079797586606e-05, + "loss": 1.0081, + "step": 1977 + }, + { + "epoch": 0.7686777421548625, + "grad_norm": 0.20602557063102722, + "learning_rate": 4.6477228493577265e-05, + "loss": 1.0381, + "step": 1978 + }, + { + "epoch": 0.76906635577577, + "grad_norm": 0.19581305980682373, + "learning_rate": 4.639937718956793e-05, + "loss": 
1.0196, + "step": 1979 + }, + { + "epoch": 0.7694549693966773, + "grad_norm": 0.20162086188793182, + "learning_rate": 4.632152588555859e-05, + "loss": 1.0168, + "step": 1980 + }, + { + "epoch": 0.7698435830175848, + "grad_norm": 0.21967145800590515, + "learning_rate": 4.624367458154925e-05, + "loss": 1.0339, + "step": 1981 + }, + { + "epoch": 0.7702321966384922, + "grad_norm": 0.20245851576328278, + "learning_rate": 4.61658232775399e-05, + "loss": 1.0349, + "step": 1982 + }, + { + "epoch": 0.7706208102593995, + "grad_norm": 0.20409934222698212, + "learning_rate": 4.608797197353056e-05, + "loss": 1.0296, + "step": 1983 + }, + { + "epoch": 0.771009423880307, + "grad_norm": 0.19757163524627686, + "learning_rate": 4.601012066952122e-05, + "loss": 1.0443, + "step": 1984 + }, + { + "epoch": 0.7713980375012144, + "grad_norm": 0.20038221776485443, + "learning_rate": 4.593226936551188e-05, + "loss": 1.0431, + "step": 1985 + }, + { + "epoch": 0.7717866511221219, + "grad_norm": 0.2112458199262619, + "learning_rate": 4.585441806150253e-05, + "loss": 1.0553, + "step": 1986 + }, + { + "epoch": 0.7721752647430292, + "grad_norm": 0.21868042647838593, + "learning_rate": 4.577656675749319e-05, + "loss": 1.0061, + "step": 1987 + }, + { + "epoch": 0.7725638783639367, + "grad_norm": 0.22484582662582397, + "learning_rate": 4.5698715453483846e-05, + "loss": 1.0831, + "step": 1988 + }, + { + "epoch": 0.7729524919848441, + "grad_norm": 0.20265011489391327, + "learning_rate": 4.5620864149474505e-05, + "loss": 1.0206, + "step": 1989 + }, + { + "epoch": 0.7733411056057515, + "grad_norm": 0.2052810937166214, + "learning_rate": 4.5543012845465164e-05, + "loss": 1.0366, + "step": 1990 + }, + { + "epoch": 0.7737297192266589, + "grad_norm": 0.21016088128089905, + "learning_rate": 4.546516154145582e-05, + "loss": 0.9963, + "step": 1991 + }, + { + "epoch": 0.7741183328475663, + "grad_norm": 0.19719412922859192, + "learning_rate": 4.538731023744648e-05, + "loss": 0.9853, + "step": 1992 + }, + { + 
"epoch": 0.7745069464684737, + "grad_norm": 0.20447245240211487, + "learning_rate": 4.530945893343714e-05, + "loss": 0.9977, + "step": 1993 + }, + { + "epoch": 0.7748955600893811, + "grad_norm": 0.21796588599681854, + "learning_rate": 4.52316076294278e-05, + "loss": 1.0949, + "step": 1994 + }, + { + "epoch": 0.7752841737102886, + "grad_norm": 0.2041284590959549, + "learning_rate": 4.515375632541845e-05, + "loss": 1.0034, + "step": 1995 + }, + { + "epoch": 0.7756727873311959, + "grad_norm": 0.21134726703166962, + "learning_rate": 4.507590502140911e-05, + "loss": 1.0076, + "step": 1996 + }, + { + "epoch": 0.7760614009521034, + "grad_norm": 0.20730996131896973, + "learning_rate": 4.499805371739977e-05, + "loss": 1.0456, + "step": 1997 + }, + { + "epoch": 0.7764500145730108, + "grad_norm": 0.22316931188106537, + "learning_rate": 4.492020241339043e-05, + "loss": 0.9418, + "step": 1998 + }, + { + "epoch": 0.7768386281939182, + "grad_norm": 0.21494819223880768, + "learning_rate": 4.484235110938109e-05, + "loss": 1.0597, + "step": 1999 + }, + { + "epoch": 0.7772272418148256, + "grad_norm": 0.20344491302967072, + "learning_rate": 4.476449980537174e-05, + "loss": 0.9749, + "step": 2000 + }, + { + "epoch": 0.777615855435733, + "grad_norm": 0.20816263556480408, + "learning_rate": 4.46866485013624e-05, + "loss": 1.0526, + "step": 2001 + }, + { + "epoch": 0.7780044690566404, + "grad_norm": 0.21490095555782318, + "learning_rate": 4.4608797197353056e-05, + "loss": 1.0311, + "step": 2002 + }, + { + "epoch": 0.7783930826775478, + "grad_norm": 0.2043679803609848, + "learning_rate": 4.4530945893343715e-05, + "loss": 1.0176, + "step": 2003 + }, + { + "epoch": 0.7787816962984553, + "grad_norm": 0.2015836238861084, + "learning_rate": 4.4453094589334374e-05, + "loss": 1.015, + "step": 2004 + }, + { + "epoch": 0.7791703099193626, + "grad_norm": 0.21843332052230835, + "learning_rate": 4.437524328532503e-05, + "loss": 1.0577, + "step": 2005 + }, + { + "epoch": 0.7795589235402701, + 
"grad_norm": 0.20447933673858643, + "learning_rate": 4.429739198131569e-05, + "loss": 1.0549, + "step": 2006 + }, + { + "epoch": 0.7799475371611775, + "grad_norm": 0.20317135751247406, + "learning_rate": 4.421954067730635e-05, + "loss": 1.0419, + "step": 2007 + }, + { + "epoch": 0.780336150782085, + "grad_norm": 0.20233985781669617, + "learning_rate": 4.414168937329701e-05, + "loss": 0.9743, + "step": 2008 + }, + { + "epoch": 0.7807247644029923, + "grad_norm": 0.1957770437002182, + "learning_rate": 4.406383806928766e-05, + "loss": 1.0306, + "step": 2009 + }, + { + "epoch": 0.7811133780238997, + "grad_norm": 0.2055465579032898, + "learning_rate": 4.398598676527832e-05, + "loss": 0.9917, + "step": 2010 + }, + { + "epoch": 0.7815019916448072, + "grad_norm": 0.1980140060186386, + "learning_rate": 4.390813546126898e-05, + "loss": 1.0002, + "step": 2011 + }, + { + "epoch": 0.7818906052657145, + "grad_norm": 0.21538390219211578, + "learning_rate": 4.383028415725964e-05, + "loss": 0.9784, + "step": 2012 + }, + { + "epoch": 0.782279218886622, + "grad_norm": 0.20209911465644836, + "learning_rate": 4.375243285325029e-05, + "loss": 1.0403, + "step": 2013 + }, + { + "epoch": 0.7826678325075294, + "grad_norm": 0.22064533829689026, + "learning_rate": 4.367458154924095e-05, + "loss": 1.0816, + "step": 2014 + }, + { + "epoch": 0.7830564461284368, + "grad_norm": 0.21721522510051727, + "learning_rate": 4.359673024523161e-05, + "loss": 1.0215, + "step": 2015 + }, + { + "epoch": 0.7834450597493442, + "grad_norm": 0.21042165160179138, + "learning_rate": 4.3518878941222266e-05, + "loss": 0.9993, + "step": 2016 + }, + { + "epoch": 0.7838336733702517, + "grad_norm": 0.2821733355522156, + "learning_rate": 4.3441027637212925e-05, + "loss": 1.0337, + "step": 2017 + }, + { + "epoch": 0.784222286991159, + "grad_norm": 0.1997404247522354, + "learning_rate": 4.3363176333203584e-05, + "loss": 0.9635, + "step": 2018 + }, + { + "epoch": 0.7846109006120664, + "grad_norm": 0.21088410913944244, + 
"learning_rate": 4.328532502919424e-05, + "loss": 1.0809, + "step": 2019 + }, + { + "epoch": 0.7849995142329739, + "grad_norm": 0.22041834890842438, + "learning_rate": 4.32074737251849e-05, + "loss": 1.0553, + "step": 2020 + }, + { + "epoch": 0.7853881278538812, + "grad_norm": 0.21541887521743774, + "learning_rate": 4.312962242117556e-05, + "loss": 1.0348, + "step": 2021 + }, + { + "epoch": 0.7857767414747887, + "grad_norm": 0.19423037767410278, + "learning_rate": 4.305177111716621e-05, + "loss": 0.9566, + "step": 2022 + }, + { + "epoch": 0.7861653550956961, + "grad_norm": 0.20975807309150696, + "learning_rate": 4.297391981315687e-05, + "loss": 0.9946, + "step": 2023 + }, + { + "epoch": 0.7865539687166035, + "grad_norm": 0.1911199390888214, + "learning_rate": 4.289606850914753e-05, + "loss": 0.9582, + "step": 2024 + }, + { + "epoch": 0.7869425823375109, + "grad_norm": 0.20895734429359436, + "learning_rate": 4.281821720513819e-05, + "loss": 1.02, + "step": 2025 + }, + { + "epoch": 0.7873311959584184, + "grad_norm": 0.19652803242206573, + "learning_rate": 4.274036590112885e-05, + "loss": 0.9919, + "step": 2026 + }, + { + "epoch": 0.7877198095793257, + "grad_norm": 0.21050991117954254, + "learning_rate": 4.26625145971195e-05, + "loss": 1.0363, + "step": 2027 + }, + { + "epoch": 0.7881084232002332, + "grad_norm": 0.18776053190231323, + "learning_rate": 4.258466329311016e-05, + "loss": 0.9747, + "step": 2028 + }, + { + "epoch": 0.7884970368211406, + "grad_norm": 0.20973272621631622, + "learning_rate": 4.250681198910082e-05, + "loss": 1.0457, + "step": 2029 + }, + { + "epoch": 0.788885650442048, + "grad_norm": 0.22028960287570953, + "learning_rate": 4.2428960685091476e-05, + "loss": 1.0769, + "step": 2030 + }, + { + "epoch": 0.7892742640629554, + "grad_norm": 0.20541588962078094, + "learning_rate": 4.2351109381082135e-05, + "loss": 1.0456, + "step": 2031 + }, + { + "epoch": 0.7896628776838628, + "grad_norm": 0.19365350902080536, + "learning_rate": 4.2273258077072794e-05, 
+ "loss": 0.9708, + "step": 2032 + }, + { + "epoch": 0.7900514913047703, + "grad_norm": 0.21286098659038544, + "learning_rate": 4.219540677306345e-05, + "loss": 1.0443, + "step": 2033 + }, + { + "epoch": 0.7904401049256776, + "grad_norm": 0.20527319610118866, + "learning_rate": 4.211755546905411e-05, + "loss": 1.0165, + "step": 2034 + }, + { + "epoch": 0.7908287185465851, + "grad_norm": 0.20962440967559814, + "learning_rate": 4.203970416504477e-05, + "loss": 1.0723, + "step": 2035 + }, + { + "epoch": 0.7912173321674925, + "grad_norm": 0.21032460033893585, + "learning_rate": 4.196185286103542e-05, + "loss": 1.0384, + "step": 2036 + }, + { + "epoch": 0.7916059457883999, + "grad_norm": 0.22122742235660553, + "learning_rate": 4.188400155702608e-05, + "loss": 1.0239, + "step": 2037 + }, + { + "epoch": 0.7919945594093073, + "grad_norm": 0.21430088579654694, + "learning_rate": 4.180615025301674e-05, + "loss": 1.0421, + "step": 2038 + }, + { + "epoch": 0.7923831730302147, + "grad_norm": 0.200826957821846, + "learning_rate": 4.17282989490074e-05, + "loss": 1.0403, + "step": 2039 + }, + { + "epoch": 0.7927717866511221, + "grad_norm": 0.1936146765947342, + "learning_rate": 4.165044764499805e-05, + "loss": 0.9901, + "step": 2040 + }, + { + "epoch": 0.7931604002720295, + "grad_norm": 0.21162614226341248, + "learning_rate": 4.157259634098871e-05, + "loss": 1.0809, + "step": 2041 + }, + { + "epoch": 0.793549013892937, + "grad_norm": 0.1934708207845688, + "learning_rate": 4.149474503697937e-05, + "loss": 0.996, + "step": 2042 + }, + { + "epoch": 0.7939376275138443, + "grad_norm": 0.19730836153030396, + "learning_rate": 4.141689373297003e-05, + "loss": 1.0116, + "step": 2043 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 0.19641950726509094, + "learning_rate": 4.1339042428960686e-05, + "loss": 1.0554, + "step": 2044 + }, + { + "epoch": 0.7947148547556592, + "grad_norm": 0.1926102489233017, + "learning_rate": 4.1261191124951345e-05, + "loss": 0.9244, + "step": 2045 + }, + { 
+ "epoch": 0.7951034683765666, + "grad_norm": 0.20683708786964417, + "learning_rate": 4.1183339820942004e-05, + "loss": 1.0247, + "step": 2046 + }, + { + "epoch": 0.795492081997474, + "grad_norm": 0.21519975364208221, + "learning_rate": 4.110548851693266e-05, + "loss": 1.0364, + "step": 2047 + }, + { + "epoch": 0.7958806956183814, + "grad_norm": 0.19510744512081146, + "learning_rate": 4.102763721292332e-05, + "loss": 0.9807, + "step": 2048 + }, + { + "epoch": 0.7962693092392888, + "grad_norm": 0.21060147881507874, + "learning_rate": 4.094978590891398e-05, + "loss": 1.0007, + "step": 2049 + }, + { + "epoch": 0.7966579228601962, + "grad_norm": 0.19922667741775513, + "learning_rate": 4.087193460490463e-05, + "loss": 0.9953, + "step": 2050 + }, + { + "epoch": 0.7970465364811037, + "grad_norm": 0.2217833250761032, + "learning_rate": 4.079408330089529e-05, + "loss": 1.0359, + "step": 2051 + }, + { + "epoch": 0.797435150102011, + "grad_norm": 0.2138615995645523, + "learning_rate": 4.071623199688595e-05, + "loss": 1.0473, + "step": 2052 + }, + { + "epoch": 0.7978237637229185, + "grad_norm": 0.20814841985702515, + "learning_rate": 4.063838069287661e-05, + "loss": 1.042, + "step": 2053 + }, + { + "epoch": 0.7982123773438259, + "grad_norm": 0.21378004550933838, + "learning_rate": 4.056052938886726e-05, + "loss": 1.0616, + "step": 2054 + }, + { + "epoch": 0.7986009909647334, + "grad_norm": 0.22064481675624847, + "learning_rate": 4.048267808485792e-05, + "loss": 1.063, + "step": 2055 + }, + { + "epoch": 0.7989896045856407, + "grad_norm": 0.21143454313278198, + "learning_rate": 4.040482678084858e-05, + "loss": 0.9999, + "step": 2056 + }, + { + "epoch": 0.7993782182065481, + "grad_norm": 0.2092997431755066, + "learning_rate": 4.032697547683924e-05, + "loss": 0.9958, + "step": 2057 + }, + { + "epoch": 0.7997668318274556, + "grad_norm": 0.2715415954589844, + "learning_rate": 4.0249124172829896e-05, + "loss": 0.9981, + "step": 2058 + }, + { + "epoch": 0.8001554454483629, + 
"grad_norm": 0.20481626689434052, + "learning_rate": 4.0171272868820555e-05, + "loss": 1.0187, + "step": 2059 + }, + { + "epoch": 0.8005440590692704, + "grad_norm": 0.2076139748096466, + "learning_rate": 4.0093421564811214e-05, + "loss": 1.0147, + "step": 2060 + }, + { + "epoch": 0.8009326726901778, + "grad_norm": 0.21985560655593872, + "learning_rate": 4.001557026080187e-05, + "loss": 1.0436, + "step": 2061 + }, + { + "epoch": 0.8013212863110852, + "grad_norm": 0.2088089883327484, + "learning_rate": 3.993771895679253e-05, + "loss": 1.067, + "step": 2062 + }, + { + "epoch": 0.8017098999319926, + "grad_norm": 0.23079900443553925, + "learning_rate": 3.9859867652783184e-05, + "loss": 1.0208, + "step": 2063 + }, + { + "epoch": 0.8020985135529001, + "grad_norm": 0.20904935896396637, + "learning_rate": 3.978201634877384e-05, + "loss": 1.0417, + "step": 2064 + }, + { + "epoch": 0.8024871271738074, + "grad_norm": 0.2027217298746109, + "learning_rate": 3.97041650447645e-05, + "loss": 1.0466, + "step": 2065 + }, + { + "epoch": 0.8028757407947149, + "grad_norm": 0.2080574333667755, + "learning_rate": 3.962631374075516e-05, + "loss": 1.0004, + "step": 2066 + }, + { + "epoch": 0.8032643544156223, + "grad_norm": 0.2076699584722519, + "learning_rate": 3.954846243674582e-05, + "loss": 1.0288, + "step": 2067 + }, + { + "epoch": 0.8036529680365296, + "grad_norm": 0.20526565611362457, + "learning_rate": 3.947061113273647e-05, + "loss": 0.9627, + "step": 2068 + }, + { + "epoch": 0.8040415816574371, + "grad_norm": 0.2086559236049652, + "learning_rate": 3.939275982872713e-05, + "loss": 1.057, + "step": 2069 + }, + { + "epoch": 0.8044301952783445, + "grad_norm": 0.21741564571857452, + "learning_rate": 3.931490852471779e-05, + "loss": 1.0575, + "step": 2070 + }, + { + "epoch": 0.804818808899252, + "grad_norm": 0.19239796698093414, + "learning_rate": 3.923705722070845e-05, + "loss": 1.0028, + "step": 2071 + }, + { + "epoch": 0.8052074225201593, + "grad_norm": 0.20606793463230133, + 
"learning_rate": 3.9159205916699106e-05, + "loss": 1.0305, + "step": 2072 + }, + { + "epoch": 0.8055960361410668, + "grad_norm": 0.2197132408618927, + "learning_rate": 3.9081354612689765e-05, + "loss": 1.0669, + "step": 2073 + }, + { + "epoch": 0.8059846497619741, + "grad_norm": 0.19510973989963531, + "learning_rate": 3.9003503308680424e-05, + "loss": 0.984, + "step": 2074 + }, + { + "epoch": 0.8063732633828816, + "grad_norm": 0.20135273039340973, + "learning_rate": 3.892565200467108e-05, + "loss": 1.0528, + "step": 2075 + }, + { + "epoch": 0.806761877003789, + "grad_norm": 0.20280520617961884, + "learning_rate": 3.884780070066174e-05, + "loss": 1.0185, + "step": 2076 + }, + { + "epoch": 0.8071504906246963, + "grad_norm": 0.21787187457084656, + "learning_rate": 3.8769949396652394e-05, + "loss": 1.0875, + "step": 2077 + }, + { + "epoch": 0.8075391042456038, + "grad_norm": 0.21521267294883728, + "learning_rate": 3.869209809264305e-05, + "loss": 1.0352, + "step": 2078 + }, + { + "epoch": 0.8079277178665112, + "grad_norm": 0.21675272285938263, + "learning_rate": 3.861424678863371e-05, + "loss": 1.0178, + "step": 2079 + }, + { + "epoch": 0.8083163314874187, + "grad_norm": 0.20301300287246704, + "learning_rate": 3.853639548462437e-05, + "loss": 1.042, + "step": 2080 + }, + { + "epoch": 0.808704945108326, + "grad_norm": 0.2025609016418457, + "learning_rate": 3.845854418061502e-05, + "loss": 1.0224, + "step": 2081 + }, + { + "epoch": 0.8090935587292335, + "grad_norm": 0.23724251985549927, + "learning_rate": 3.838069287660568e-05, + "loss": 1.0051, + "step": 2082 + }, + { + "epoch": 0.8094821723501409, + "grad_norm": 0.17473214864730835, + "learning_rate": 3.830284157259634e-05, + "loss": 0.9183, + "step": 2083 + }, + { + "epoch": 0.8098707859710483, + "grad_norm": 0.20575867593288422, + "learning_rate": 3.8224990268587e-05, + "loss": 1.0018, + "step": 2084 + }, + { + "epoch": 0.8102593995919557, + "grad_norm": 0.2054753601551056, + "learning_rate": 3.8147138964577664e-05, 
+ "loss": 1.0326, + "step": 2085 + }, + { + "epoch": 0.8106480132128631, + "grad_norm": 0.22283188998699188, + "learning_rate": 3.8069287660568316e-05, + "loss": 1.0878, + "step": 2086 + }, + { + "epoch": 0.8110366268337705, + "grad_norm": 0.20678454637527466, + "learning_rate": 3.7991436356558975e-05, + "loss": 1.0382, + "step": 2087 + }, + { + "epoch": 0.8114252404546779, + "grad_norm": 0.22482691705226898, + "learning_rate": 3.7913585052549634e-05, + "loss": 1.0441, + "step": 2088 + }, + { + "epoch": 0.8118138540755854, + "grad_norm": 0.19913192093372345, + "learning_rate": 3.783573374854029e-05, + "loss": 0.9093, + "step": 2089 + }, + { + "epoch": 0.8122024676964927, + "grad_norm": 0.21512696146965027, + "learning_rate": 3.7757882444530945e-05, + "loss": 1.0589, + "step": 2090 + }, + { + "epoch": 0.8125910813174002, + "grad_norm": 0.20883330702781677, + "learning_rate": 3.7680031140521604e-05, + "loss": 0.9773, + "step": 2091 + }, + { + "epoch": 0.8129796949383076, + "grad_norm": 0.20254108309745789, + "learning_rate": 3.760217983651226e-05, + "loss": 1.0111, + "step": 2092 + }, + { + "epoch": 0.813368308559215, + "grad_norm": 0.22513622045516968, + "learning_rate": 3.752432853250292e-05, + "loss": 1.0471, + "step": 2093 + }, + { + "epoch": 0.8137569221801224, + "grad_norm": 0.20943938195705414, + "learning_rate": 3.744647722849358e-05, + "loss": 1.0261, + "step": 2094 + }, + { + "epoch": 0.8141455358010298, + "grad_norm": 0.19357722997665405, + "learning_rate": 3.736862592448423e-05, + "loss": 0.9891, + "step": 2095 + }, + { + "epoch": 0.8145341494219372, + "grad_norm": 0.20199090242385864, + "learning_rate": 3.729077462047489e-05, + "loss": 1.0017, + "step": 2096 + }, + { + "epoch": 0.8149227630428446, + "grad_norm": 0.22087882459163666, + "learning_rate": 3.721292331646556e-05, + "loss": 1.0176, + "step": 2097 + }, + { + "epoch": 0.8153113766637521, + "grad_norm": 0.19757211208343506, + "learning_rate": 3.7135072012456215e-05, + "loss": 0.9993, + "step": 
2098 + }, + { + "epoch": 0.8156999902846594, + "grad_norm": 0.21485236287117004, + "learning_rate": 3.705722070844687e-05, + "loss": 1.0129, + "step": 2099 + }, + { + "epoch": 0.8160886039055669, + "grad_norm": 0.2095671445131302, + "learning_rate": 3.6979369404437526e-05, + "loss": 1.0576, + "step": 2100 + }, + { + "epoch": 0.8164772175264743, + "grad_norm": 0.21392807364463806, + "learning_rate": 3.6901518100428185e-05, + "loss": 1.0666, + "step": 2101 + }, + { + "epoch": 0.8168658311473818, + "grad_norm": 0.23267820477485657, + "learning_rate": 3.6823666796418844e-05, + "loss": 1.0691, + "step": 2102 + }, + { + "epoch": 0.8172544447682891, + "grad_norm": 0.3778455853462219, + "learning_rate": 3.67458154924095e-05, + "loss": 1.057, + "step": 2103 + }, + { + "epoch": 0.8176430583891965, + "grad_norm": 0.21719984710216522, + "learning_rate": 3.6667964188400155e-05, + "loss": 1.0564, + "step": 2104 + }, + { + "epoch": 0.818031672010104, + "grad_norm": 0.19418101012706757, + "learning_rate": 3.6590112884390814e-05, + "loss": 1.0156, + "step": 2105 + }, + { + "epoch": 0.8184202856310113, + "grad_norm": 0.20592990517616272, + "learning_rate": 3.651226158038147e-05, + "loss": 1.026, + "step": 2106 + }, + { + "epoch": 0.8188088992519188, + "grad_norm": 0.21999908983707428, + "learning_rate": 3.643441027637213e-05, + "loss": 1.0575, + "step": 2107 + }, + { + "epoch": 0.8191975128728262, + "grad_norm": 0.2080504447221756, + "learning_rate": 3.635655897236278e-05, + "loss": 1.0236, + "step": 2108 + }, + { + "epoch": 0.8195861264937336, + "grad_norm": 0.20104867219924927, + "learning_rate": 3.627870766835344e-05, + "loss": 0.9626, + "step": 2109 + }, + { + "epoch": 0.819974740114641, + "grad_norm": 0.18993836641311646, + "learning_rate": 3.620085636434411e-05, + "loss": 0.983, + "step": 2110 + }, + { + "epoch": 0.8203633537355485, + "grad_norm": 0.18710492551326752, + "learning_rate": 3.6123005060334767e-05, + "loss": 0.9674, + "step": 2111 + }, + { + "epoch": 
0.8207519673564558, + "grad_norm": 0.2117459774017334, + "learning_rate": 3.6045153756325425e-05, + "loss": 1.0263, + "step": 2112 + }, + { + "epoch": 0.8211405809773633, + "grad_norm": 0.2005959451198578, + "learning_rate": 3.596730245231608e-05, + "loss": 1.0405, + "step": 2113 + }, + { + "epoch": 0.8215291945982707, + "grad_norm": 0.21586982905864716, + "learning_rate": 3.5889451148306736e-05, + "loss": 0.9715, + "step": 2114 + }, + { + "epoch": 0.821917808219178, + "grad_norm": 0.2229696810245514, + "learning_rate": 3.5811599844297395e-05, + "loss": 1.0427, + "step": 2115 + }, + { + "epoch": 0.8223064218400855, + "grad_norm": 0.22296395897865295, + "learning_rate": 3.5733748540288054e-05, + "loss": 1.093, + "step": 2116 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 0.22912591695785522, + "learning_rate": 3.5655897236278706e-05, + "loss": 1.0821, + "step": 2117 + }, + { + "epoch": 0.8230836490819003, + "grad_norm": 0.19285057485103607, + "learning_rate": 3.5578045932269365e-05, + "loss": 0.9694, + "step": 2118 + }, + { + "epoch": 0.8234722627028077, + "grad_norm": 0.2150295525789261, + "learning_rate": 3.5500194628260024e-05, + "loss": 1.0277, + "step": 2119 + }, + { + "epoch": 0.8238608763237152, + "grad_norm": 0.20686036348342896, + "learning_rate": 3.542234332425068e-05, + "loss": 0.9946, + "step": 2120 + }, + { + "epoch": 0.8242494899446225, + "grad_norm": 0.21742792427539825, + "learning_rate": 3.534449202024134e-05, + "loss": 1.0233, + "step": 2121 + }, + { + "epoch": 0.82463810356553, + "grad_norm": 0.2077355682849884, + "learning_rate": 3.526664071623199e-05, + "loss": 0.9918, + "step": 2122 + }, + { + "epoch": 0.8250267171864374, + "grad_norm": 0.2552899122238159, + "learning_rate": 3.518878941222266e-05, + "loss": 0.9648, + "step": 2123 + }, + { + "epoch": 0.8254153308073447, + "grad_norm": 0.21043844521045685, + "learning_rate": 3.511093810821332e-05, + "loss": 1.023, + "step": 2124 + }, + { + "epoch": 0.8258039444282522, + "grad_norm": 
0.22360606491565704, + "learning_rate": 3.5033086804203977e-05, + "loss": 1.0862, + "step": 2125 + }, + { + "epoch": 0.8261925580491596, + "grad_norm": 0.20735731720924377, + "learning_rate": 3.495523550019463e-05, + "loss": 1.017, + "step": 2126 + }, + { + "epoch": 0.8265811716700671, + "grad_norm": 0.21998152136802673, + "learning_rate": 3.487738419618529e-05, + "loss": 1.0273, + "step": 2127 + }, + { + "epoch": 0.8269697852909744, + "grad_norm": 0.23547297716140747, + "learning_rate": 3.4799532892175946e-05, + "loss": 1.0353, + "step": 2128 + }, + { + "epoch": 0.8273583989118819, + "grad_norm": 0.20162945985794067, + "learning_rate": 3.4721681588166605e-05, + "loss": 1.0289, + "step": 2129 + }, + { + "epoch": 0.8277470125327893, + "grad_norm": 0.1959386169910431, + "learning_rate": 3.4643830284157264e-05, + "loss": 1.012, + "step": 2130 + }, + { + "epoch": 0.8281356261536967, + "grad_norm": 0.21625256538391113, + "learning_rate": 3.4565978980147916e-05, + "loss": 1.0718, + "step": 2131 + }, + { + "epoch": 0.8285242397746041, + "grad_norm": 0.2094646692276001, + "learning_rate": 3.4488127676138575e-05, + "loss": 1.0157, + "step": 2132 + }, + { + "epoch": 0.8289128533955115, + "grad_norm": 0.19329530000686646, + "learning_rate": 3.4410276372129234e-05, + "loss": 0.9652, + "step": 2133 + }, + { + "epoch": 0.8293014670164189, + "grad_norm": 0.19125741720199585, + "learning_rate": 3.433242506811989e-05, + "loss": 0.9964, + "step": 2134 + }, + { + "epoch": 0.8296900806373263, + "grad_norm": 0.1942203938961029, + "learning_rate": 3.425457376411055e-05, + "loss": 0.9795, + "step": 2135 + }, + { + "epoch": 0.8300786942582338, + "grad_norm": 0.2229314148426056, + "learning_rate": 3.417672246010121e-05, + "loss": 1.1052, + "step": 2136 + }, + { + "epoch": 0.8304673078791411, + "grad_norm": 0.2160118967294693, + "learning_rate": 3.409887115609187e-05, + "loss": 1.0263, + "step": 2137 + }, + { + "epoch": 0.8308559215000486, + "grad_norm": 0.2106090933084488, + 
"learning_rate": 3.402101985208253e-05, + "loss": 1.0151, + "step": 2138 + }, + { + "epoch": 0.831244535120956, + "grad_norm": 0.31897667050361633, + "learning_rate": 3.3943168548073187e-05, + "loss": 1.0122, + "step": 2139 + }, + { + "epoch": 0.8316331487418634, + "grad_norm": 0.20475897192955017, + "learning_rate": 3.386531724406384e-05, + "loss": 1.0239, + "step": 2140 + }, + { + "epoch": 0.8320217623627708, + "grad_norm": 0.21326549351215363, + "learning_rate": 3.37874659400545e-05, + "loss": 1.05, + "step": 2141 + }, + { + "epoch": 0.8324103759836782, + "grad_norm": 0.2130986452102661, + "learning_rate": 3.3709614636045156e-05, + "loss": 0.9979, + "step": 2142 + }, + { + "epoch": 0.8327989896045856, + "grad_norm": 0.20519514381885529, + "learning_rate": 3.3631763332035815e-05, + "loss": 1.035, + "step": 2143 + }, + { + "epoch": 0.833187603225493, + "grad_norm": 0.21058332920074463, + "learning_rate": 3.355391202802647e-05, + "loss": 1.0509, + "step": 2144 + }, + { + "epoch": 0.8335762168464005, + "grad_norm": 0.20692919194698334, + "learning_rate": 3.3476060724017126e-05, + "loss": 1.0262, + "step": 2145 + }, + { + "epoch": 0.8339648304673078, + "grad_norm": 0.20325800776481628, + "learning_rate": 3.3398209420007785e-05, + "loss": 1.0352, + "step": 2146 + }, + { + "epoch": 0.8343534440882153, + "grad_norm": 0.18956026434898376, + "learning_rate": 3.3320358115998444e-05, + "loss": 0.9618, + "step": 2147 + }, + { + "epoch": 0.8347420577091227, + "grad_norm": 0.24605980515480042, + "learning_rate": 3.32425068119891e-05, + "loss": 0.9785, + "step": 2148 + }, + { + "epoch": 0.8351306713300302, + "grad_norm": 0.20649299025535583, + "learning_rate": 3.316465550797976e-05, + "loss": 1.0051, + "step": 2149 + }, + { + "epoch": 0.8355192849509375, + "grad_norm": 0.21091307699680328, + "learning_rate": 3.308680420397042e-05, + "loss": 1.0321, + "step": 2150 + }, + { + "epoch": 0.835907898571845, + "grad_norm": 0.20463331043720245, + "learning_rate": 3.300895289996108e-05, 
+ "loss": 1.0103, + "step": 2151 + }, + { + "epoch": 0.8362965121927524, + "grad_norm": 0.1851118803024292, + "learning_rate": 3.293110159595174e-05, + "loss": 0.9193, + "step": 2152 + }, + { + "epoch": 0.8366851258136597, + "grad_norm": 0.22127285599708557, + "learning_rate": 3.285325029194239e-05, + "loss": 1.0593, + "step": 2153 + }, + { + "epoch": 0.8370737394345672, + "grad_norm": 0.2060239166021347, + "learning_rate": 3.277539898793305e-05, + "loss": 1.1002, + "step": 2154 + }, + { + "epoch": 0.8374623530554746, + "grad_norm": 0.20628675818443298, + "learning_rate": 3.269754768392371e-05, + "loss": 1.0449, + "step": 2155 + }, + { + "epoch": 0.837850966676382, + "grad_norm": 0.2015877068042755, + "learning_rate": 3.2619696379914366e-05, + "loss": 1.0007, + "step": 2156 + }, + { + "epoch": 0.8382395802972894, + "grad_norm": 0.26001277565956116, + "learning_rate": 3.2541845075905025e-05, + "loss": 1.0593, + "step": 2157 + }, + { + "epoch": 0.8386281939181969, + "grad_norm": 0.21557845175266266, + "learning_rate": 3.246399377189568e-05, + "loss": 1.0206, + "step": 2158 + }, + { + "epoch": 0.8390168075391042, + "grad_norm": 0.21529968082904816, + "learning_rate": 3.2386142467886336e-05, + "loss": 1.0648, + "step": 2159 + }, + { + "epoch": 0.8394054211600117, + "grad_norm": 0.22108668088912964, + "learning_rate": 3.2308291163876995e-05, + "loss": 1.0192, + "step": 2160 + }, + { + "epoch": 0.8397940347809191, + "grad_norm": 0.20087426900863647, + "learning_rate": 3.2230439859867654e-05, + "loss": 0.9972, + "step": 2161 + }, + { + "epoch": 0.8401826484018264, + "grad_norm": 0.2194579839706421, + "learning_rate": 3.215258855585831e-05, + "loss": 1.0222, + "step": 2162 + }, + { + "epoch": 0.8405712620227339, + "grad_norm": 0.2581467926502228, + "learning_rate": 3.207473725184897e-05, + "loss": 1.0369, + "step": 2163 + }, + { + "epoch": 0.8409598756436413, + "grad_norm": 0.20566490292549133, + "learning_rate": 3.199688594783963e-05, + "loss": 1.0453, + "step": 2164 + }, 
+ { + "epoch": 0.8413484892645487, + "grad_norm": 0.20137596130371094, + "learning_rate": 3.191903464383029e-05, + "loss": 1.0404, + "step": 2165 + }, + { + "epoch": 0.8417371028854561, + "grad_norm": 0.2136070281267166, + "learning_rate": 3.184118333982095e-05, + "loss": 0.998, + "step": 2166 + }, + { + "epoch": 0.8421257165063636, + "grad_norm": 0.2082609087228775, + "learning_rate": 3.17633320358116e-05, + "loss": 1.0617, + "step": 2167 + }, + { + "epoch": 0.842514330127271, + "grad_norm": 0.20818866789340973, + "learning_rate": 3.168548073180226e-05, + "loss": 0.9739, + "step": 2168 + }, + { + "epoch": 0.8429029437481784, + "grad_norm": 0.1998904049396515, + "learning_rate": 3.160762942779292e-05, + "loss": 0.9984, + "step": 2169 + }, + { + "epoch": 0.8432915573690858, + "grad_norm": 0.2000143975019455, + "learning_rate": 3.1529778123783576e-05, + "loss": 0.9975, + "step": 2170 + }, + { + "epoch": 0.8436801709899932, + "grad_norm": 0.20654286444187164, + "learning_rate": 3.145192681977423e-05, + "loss": 1.0403, + "step": 2171 + }, + { + "epoch": 0.8440687846109006, + "grad_norm": 0.20888234674930573, + "learning_rate": 3.137407551576489e-05, + "loss": 1.0072, + "step": 2172 + }, + { + "epoch": 0.844457398231808, + "grad_norm": 0.20207738876342773, + "learning_rate": 3.1296224211755546e-05, + "loss": 1.0361, + "step": 2173 + }, + { + "epoch": 0.8448460118527155, + "grad_norm": 0.2032788097858429, + "learning_rate": 3.1218372907746205e-05, + "loss": 1.0179, + "step": 2174 + }, + { + "epoch": 0.8452346254736228, + "grad_norm": 0.22794555127620697, + "learning_rate": 3.1140521603736864e-05, + "loss": 1.0337, + "step": 2175 + }, + { + "epoch": 0.8456232390945303, + "grad_norm": 0.20593926310539246, + "learning_rate": 3.106267029972752e-05, + "loss": 1.0336, + "step": 2176 + }, + { + "epoch": 0.8460118527154377, + "grad_norm": 0.20535798370838165, + "learning_rate": 3.098481899571818e-05, + "loss": 1.0465, + "step": 2177 + }, + { + "epoch": 0.8464004663363451, + 
"grad_norm": 0.2055482417345047, + "learning_rate": 3.090696769170884e-05, + "loss": 1.0073, + "step": 2178 + }, + { + "epoch": 0.8467890799572525, + "grad_norm": 0.20908206701278687, + "learning_rate": 3.08291163876995e-05, + "loss": 1.0478, + "step": 2179 + }, + { + "epoch": 0.8471776935781599, + "grad_norm": 0.20747126638889313, + "learning_rate": 3.075126508369015e-05, + "loss": 1.0621, + "step": 2180 + }, + { + "epoch": 0.8475663071990673, + "grad_norm": 0.28445661067962646, + "learning_rate": 3.067341377968081e-05, + "loss": 1.0546, + "step": 2181 + }, + { + "epoch": 0.8479549208199747, + "grad_norm": 0.1851411610841751, + "learning_rate": 3.059556247567147e-05, + "loss": 0.9759, + "step": 2182 + }, + { + "epoch": 0.8483435344408822, + "grad_norm": 0.1998148262500763, + "learning_rate": 3.051771117166213e-05, + "loss": 1.0138, + "step": 2183 + }, + { + "epoch": 0.8487321480617895, + "grad_norm": 0.20033158361911774, + "learning_rate": 3.0439859867652786e-05, + "loss": 1.0346, + "step": 2184 + }, + { + "epoch": 0.849120761682697, + "grad_norm": 0.1972794383764267, + "learning_rate": 3.036200856364344e-05, + "loss": 1.0476, + "step": 2185 + }, + { + "epoch": 0.8495093753036044, + "grad_norm": 0.23393818736076355, + "learning_rate": 3.02841572596341e-05, + "loss": 0.9738, + "step": 2186 + }, + { + "epoch": 0.8498979889245118, + "grad_norm": 0.1907467097043991, + "learning_rate": 3.020630595562476e-05, + "loss": 0.966, + "step": 2187 + }, + { + "epoch": 0.8502866025454192, + "grad_norm": 0.19281136989593506, + "learning_rate": 3.0128454651615418e-05, + "loss": 1.0016, + "step": 2188 + }, + { + "epoch": 0.8506752161663266, + "grad_norm": 0.2053443342447281, + "learning_rate": 3.005060334760607e-05, + "loss": 1.0659, + "step": 2189 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.2173933982849121, + "learning_rate": 2.997275204359673e-05, + "loss": 1.0137, + "step": 2190 + }, + { + "epoch": 0.8514524434081414, + "grad_norm": 0.22902634739875793, + 
"learning_rate": 2.9894900739587388e-05, + "loss": 1.0408, + "step": 2191 + }, + { + "epoch": 0.8518410570290489, + "grad_norm": 0.2113914042711258, + "learning_rate": 2.9817049435578047e-05, + "loss": 1.0769, + "step": 2192 + }, + { + "epoch": 0.8522296706499563, + "grad_norm": 0.20389114320278168, + "learning_rate": 2.973919813156871e-05, + "loss": 0.9835, + "step": 2193 + }, + { + "epoch": 0.8526182842708637, + "grad_norm": 0.2062385231256485, + "learning_rate": 2.966134682755936e-05, + "loss": 1.0397, + "step": 2194 + }, + { + "epoch": 0.8530068978917711, + "grad_norm": 0.20552967488765717, + "learning_rate": 2.958349552355002e-05, + "loss": 0.9949, + "step": 2195 + }, + { + "epoch": 0.8533955115126786, + "grad_norm": 0.1985877901315689, + "learning_rate": 2.950564421954068e-05, + "loss": 0.9909, + "step": 2196 + }, + { + "epoch": 0.8537841251335859, + "grad_norm": 0.20005984604358673, + "learning_rate": 2.9427792915531337e-05, + "loss": 0.9603, + "step": 2197 + }, + { + "epoch": 0.8541727387544934, + "grad_norm": 0.20039033889770508, + "learning_rate": 2.9349941611521996e-05, + "loss": 0.9832, + "step": 2198 + }, + { + "epoch": 0.8545613523754008, + "grad_norm": 0.19540533423423767, + "learning_rate": 2.927209030751265e-05, + "loss": 0.9563, + "step": 2199 + }, + { + "epoch": 0.8549499659963081, + "grad_norm": 0.21219204366207123, + "learning_rate": 2.919423900350331e-05, + "loss": 1.0914, + "step": 2200 + }, + { + "epoch": 0.8553385796172156, + "grad_norm": 0.1871120035648346, + "learning_rate": 2.911638769949397e-05, + "loss": 0.9683, + "step": 2201 + }, + { + "epoch": 0.855727193238123, + "grad_norm": 0.2022469937801361, + "learning_rate": 2.9038536395484628e-05, + "loss": 1.0552, + "step": 2202 + }, + { + "epoch": 0.8561158068590304, + "grad_norm": 0.21184539794921875, + "learning_rate": 2.896068509147528e-05, + "loss": 1.0544, + "step": 2203 + }, + { + "epoch": 0.8565044204799378, + "grad_norm": 0.21650457382202148, + "learning_rate": 
2.888283378746594e-05, + "loss": 1.0683, + "step": 2204 + }, + { + "epoch": 0.8568930341008453, + "grad_norm": 0.19166558980941772, + "learning_rate": 2.88049824834566e-05, + "loss": 0.9317, + "step": 2205 + }, + { + "epoch": 0.8572816477217526, + "grad_norm": 0.21191413700580597, + "learning_rate": 2.872713117944726e-05, + "loss": 0.9775, + "step": 2206 + }, + { + "epoch": 0.8576702613426601, + "grad_norm": 0.1949252486228943, + "learning_rate": 2.864927987543792e-05, + "loss": 0.9771, + "step": 2207 + }, + { + "epoch": 0.8580588749635675, + "grad_norm": 0.18980230391025543, + "learning_rate": 2.857142857142857e-05, + "loss": 0.9816, + "step": 2208 + }, + { + "epoch": 0.8584474885844748, + "grad_norm": 0.20371113717556, + "learning_rate": 2.849357726741923e-05, + "loss": 1.0269, + "step": 2209 + }, + { + "epoch": 0.8588361022053823, + "grad_norm": 0.2025761753320694, + "learning_rate": 2.841572596340989e-05, + "loss": 0.9169, + "step": 2210 + }, + { + "epoch": 0.8592247158262897, + "grad_norm": 0.20668815076351166, + "learning_rate": 2.8337874659400547e-05, + "loss": 1.0409, + "step": 2211 + }, + { + "epoch": 0.8596133294471971, + "grad_norm": 0.19602157175540924, + "learning_rate": 2.8260023355391203e-05, + "loss": 0.9752, + "step": 2212 + }, + { + "epoch": 0.8600019430681045, + "grad_norm": 0.19047275185585022, + "learning_rate": 2.818217205138186e-05, + "loss": 0.9862, + "step": 2213 + }, + { + "epoch": 0.860390556689012, + "grad_norm": 0.20148906111717224, + "learning_rate": 2.810432074737252e-05, + "loss": 1.0339, + "step": 2214 + }, + { + "epoch": 0.8607791703099194, + "grad_norm": 0.19507504999637604, + "learning_rate": 2.802646944336318e-05, + "loss": 1.0452, + "step": 2215 + }, + { + "epoch": 0.8611677839308268, + "grad_norm": 0.22428153455257416, + "learning_rate": 2.7948618139353838e-05, + "loss": 1.0652, + "step": 2216 + }, + { + "epoch": 0.8615563975517342, + "grad_norm": 0.19588248431682587, + "learning_rate": 2.787076683534449e-05, + "loss": 0.9816, 
+ "step": 2217 + }, + { + "epoch": 0.8619450111726416, + "grad_norm": 0.20823241770267487, + "learning_rate": 2.7792915531335152e-05, + "loss": 1.0239, + "step": 2218 + }, + { + "epoch": 0.862333624793549, + "grad_norm": 0.20268678665161133, + "learning_rate": 2.771506422732581e-05, + "loss": 1.0057, + "step": 2219 + }, + { + "epoch": 0.8627222384144564, + "grad_norm": 0.22147025167942047, + "learning_rate": 2.763721292331647e-05, + "loss": 1.0296, + "step": 2220 + }, + { + "epoch": 0.8631108520353639, + "grad_norm": 0.2015751451253891, + "learning_rate": 2.7559361619307122e-05, + "loss": 0.9884, + "step": 2221 + }, + { + "epoch": 0.8634994656562712, + "grad_norm": 0.20846128463745117, + "learning_rate": 2.748151031529778e-05, + "loss": 1.032, + "step": 2222 + }, + { + "epoch": 0.8638880792771787, + "grad_norm": 0.212540403008461, + "learning_rate": 2.740365901128844e-05, + "loss": 1.0432, + "step": 2223 + }, + { + "epoch": 0.8642766928980861, + "grad_norm": 0.19588392972946167, + "learning_rate": 2.73258077072791e-05, + "loss": 1.0203, + "step": 2224 + }, + { + "epoch": 0.8646653065189935, + "grad_norm": 0.2195088416337967, + "learning_rate": 2.7247956403269757e-05, + "loss": 1.0415, + "step": 2225 + }, + { + "epoch": 0.8650539201399009, + "grad_norm": 0.20950359106063843, + "learning_rate": 2.7170105099260413e-05, + "loss": 1.0114, + "step": 2226 + }, + { + "epoch": 0.8654425337608083, + "grad_norm": 0.23009665310382843, + "learning_rate": 2.709225379525107e-05, + "loss": 1.0018, + "step": 2227 + }, + { + "epoch": 0.8658311473817157, + "grad_norm": 0.19696195423603058, + "learning_rate": 2.701440249124173e-05, + "loss": 1.0135, + "step": 2228 + }, + { + "epoch": 0.8662197610026231, + "grad_norm": 0.2212006151676178, + "learning_rate": 2.693655118723239e-05, + "loss": 1.0557, + "step": 2229 + }, + { + "epoch": 0.8666083746235306, + "grad_norm": 0.21312370896339417, + "learning_rate": 2.6858699883223045e-05, + "loss": 1.0758, + "step": 2230 + }, + { + "epoch": 
0.8669969882444379, + "grad_norm": 0.21425843238830566, + "learning_rate": 2.6780848579213703e-05, + "loss": 1.0234, + "step": 2231 + }, + { + "epoch": 0.8673856018653454, + "grad_norm": 0.2145942598581314, + "learning_rate": 2.6702997275204362e-05, + "loss": 0.9831, + "step": 2232 + }, + { + "epoch": 0.8677742154862528, + "grad_norm": 0.20881056785583496, + "learning_rate": 2.662514597119502e-05, + "loss": 1.0255, + "step": 2233 + }, + { + "epoch": 0.8681628291071602, + "grad_norm": 0.19835254549980164, + "learning_rate": 2.654729466718568e-05, + "loss": 0.9868, + "step": 2234 + }, + { + "epoch": 0.8685514427280676, + "grad_norm": 0.21160255372524261, + "learning_rate": 2.6469443363176332e-05, + "loss": 1.0024, + "step": 2235 + }, + { + "epoch": 0.868940056348975, + "grad_norm": 0.2119852900505066, + "learning_rate": 2.639159205916699e-05, + "loss": 0.9886, + "step": 2236 + }, + { + "epoch": 0.8693286699698825, + "grad_norm": 0.2107681930065155, + "learning_rate": 2.631374075515765e-05, + "loss": 1.0311, + "step": 2237 + }, + { + "epoch": 0.8697172835907898, + "grad_norm": 0.2076905369758606, + "learning_rate": 2.623588945114831e-05, + "loss": 1.0217, + "step": 2238 + }, + { + "epoch": 0.8701058972116973, + "grad_norm": 0.20869198441505432, + "learning_rate": 2.6158038147138964e-05, + "loss": 0.9488, + "step": 2239 + }, + { + "epoch": 0.8704945108326047, + "grad_norm": 0.1986512839794159, + "learning_rate": 2.6080186843129623e-05, + "loss": 1.0216, + "step": 2240 + }, + { + "epoch": 0.8708831244535121, + "grad_norm": 0.19954320788383484, + "learning_rate": 2.600233553912028e-05, + "loss": 0.988, + "step": 2241 + }, + { + "epoch": 0.8712717380744195, + "grad_norm": 0.22843138873577118, + "learning_rate": 2.592448423511094e-05, + "loss": 1.0979, + "step": 2242 + }, + { + "epoch": 0.871660351695327, + "grad_norm": 0.21942777931690216, + "learning_rate": 2.58466329311016e-05, + "loss": 1.0378, + "step": 2243 + }, + { + "epoch": 0.8720489653162343, + "grad_norm": 
0.21504725515842438, + "learning_rate": 2.5768781627092255e-05, + "loss": 1.0628, + "step": 2244 + }, + { + "epoch": 0.8724375789371418, + "grad_norm": 0.21556456387043, + "learning_rate": 2.5690930323082913e-05, + "loss": 0.9943, + "step": 2245 + }, + { + "epoch": 0.8728261925580492, + "grad_norm": 0.2099362164735794, + "learning_rate": 2.5613079019073572e-05, + "loss": 1.0603, + "step": 2246 + }, + { + "epoch": 0.8732148061789565, + "grad_norm": 0.2027025669813156, + "learning_rate": 2.553522771506423e-05, + "loss": 1.0028, + "step": 2247 + }, + { + "epoch": 0.873603419799864, + "grad_norm": 0.2144668847322464, + "learning_rate": 2.5457376411054883e-05, + "loss": 1.0462, + "step": 2248 + }, + { + "epoch": 0.8739920334207714, + "grad_norm": 0.20712412893772125, + "learning_rate": 2.5379525107045542e-05, + "loss": 0.9842, + "step": 2249 + }, + { + "epoch": 0.8743806470416788, + "grad_norm": 0.19471199810504913, + "learning_rate": 2.53016738030362e-05, + "loss": 1.0171, + "step": 2250 + }, + { + "epoch": 0.8747692606625862, + "grad_norm": 0.19841787219047546, + "learning_rate": 2.522382249902686e-05, + "loss": 0.9034, + "step": 2251 + }, + { + "epoch": 0.8751578742834937, + "grad_norm": 0.20370744168758392, + "learning_rate": 2.5145971195017522e-05, + "loss": 1.0249, + "step": 2252 + }, + { + "epoch": 0.875546487904401, + "grad_norm": 0.22168315947055817, + "learning_rate": 2.5068119891008174e-05, + "loss": 1.0624, + "step": 2253 + }, + { + "epoch": 0.8759351015253085, + "grad_norm": 0.200806125998497, + "learning_rate": 2.4990268586998833e-05, + "loss": 1.0452, + "step": 2254 + }, + { + "epoch": 0.8763237151462159, + "grad_norm": 0.19972844421863556, + "learning_rate": 2.491241728298949e-05, + "loss": 1.0563, + "step": 2255 + }, + { + "epoch": 0.8767123287671232, + "grad_norm": 0.19919687509536743, + "learning_rate": 2.4834565978980147e-05, + "loss": 1.0249, + "step": 2256 + }, + { + "epoch": 0.8771009423880307, + "grad_norm": 0.19924059510231018, + 
"learning_rate": 2.4756714674970806e-05, + "loss": 1.016, + "step": 2257 + }, + { + "epoch": 0.8774895560089381, + "grad_norm": 0.2038920521736145, + "learning_rate": 2.4678863370961465e-05, + "loss": 1.0116, + "step": 2258 + }, + { + "epoch": 0.8778781696298456, + "grad_norm": 0.20609620213508606, + "learning_rate": 2.4601012066952123e-05, + "loss": 1.0153, + "step": 2259 + }, + { + "epoch": 0.8782667832507529, + "grad_norm": 0.20705272257328033, + "learning_rate": 2.4523160762942782e-05, + "loss": 1.013, + "step": 2260 + }, + { + "epoch": 0.8786553968716604, + "grad_norm": 0.19973833858966827, + "learning_rate": 2.4445309458933438e-05, + "loss": 0.9932, + "step": 2261 + }, + { + "epoch": 0.8790440104925678, + "grad_norm": 0.20942817628383636, + "learning_rate": 2.4367458154924097e-05, + "loss": 1.0091, + "step": 2262 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 0.3686840236186981, + "learning_rate": 2.4289606850914752e-05, + "loss": 1.0157, + "step": 2263 + }, + { + "epoch": 0.8798212377343826, + "grad_norm": 0.20390458405017853, + "learning_rate": 2.4211755546905414e-05, + "loss": 1.0431, + "step": 2264 + }, + { + "epoch": 0.88020985135529, + "grad_norm": 0.2211003601551056, + "learning_rate": 2.413390424289607e-05, + "loss": 1.089, + "step": 2265 + }, + { + "epoch": 0.8805984649761974, + "grad_norm": 0.20558148622512817, + "learning_rate": 2.405605293888673e-05, + "loss": 0.9798, + "step": 2266 + }, + { + "epoch": 0.8809870785971048, + "grad_norm": 0.19347704946994781, + "learning_rate": 2.3978201634877384e-05, + "loss": 0.97, + "step": 2267 + }, + { + "epoch": 0.8813756922180123, + "grad_norm": 0.19454139471054077, + "learning_rate": 2.3900350330868043e-05, + "loss": 1.0265, + "step": 2268 + }, + { + "epoch": 0.8817643058389196, + "grad_norm": 0.19511118531227112, + "learning_rate": 2.38224990268587e-05, + "loss": 0.994, + "step": 2269 + }, + { + "epoch": 0.8821529194598271, + "grad_norm": 0.19948701560497284, + "learning_rate": 
2.3744647722849357e-05, + "loss": 0.9911, + "step": 2270 + }, + { + "epoch": 0.8825415330807345, + "grad_norm": 0.21110126376152039, + "learning_rate": 2.366679641884002e-05, + "loss": 1.0484, + "step": 2271 + }, + { + "epoch": 0.8829301467016419, + "grad_norm": 0.20160740613937378, + "learning_rate": 2.3588945114830675e-05, + "loss": 0.9934, + "step": 2272 + }, + { + "epoch": 0.8833187603225493, + "grad_norm": 0.20967216789722443, + "learning_rate": 2.3511093810821333e-05, + "loss": 1.0081, + "step": 2273 + }, + { + "epoch": 0.8837073739434567, + "grad_norm": 0.1981070339679718, + "learning_rate": 2.343324250681199e-05, + "loss": 1.0093, + "step": 2274 + }, + { + "epoch": 0.8840959875643641, + "grad_norm": 0.21609579026699066, + "learning_rate": 2.3355391202802648e-05, + "loss": 1.0954, + "step": 2275 + }, + { + "epoch": 0.8844846011852715, + "grad_norm": 0.18667754530906677, + "learning_rate": 2.3277539898793303e-05, + "loss": 0.9833, + "step": 2276 + }, + { + "epoch": 0.884873214806179, + "grad_norm": 0.2127734273672104, + "learning_rate": 2.3199688594783965e-05, + "loss": 1.0508, + "step": 2277 + }, + { + "epoch": 0.8852618284270863, + "grad_norm": 0.2117089331150055, + "learning_rate": 2.3121837290774624e-05, + "loss": 1.0557, + "step": 2278 + }, + { + "epoch": 0.8856504420479938, + "grad_norm": 0.21022644639015198, + "learning_rate": 2.304398598676528e-05, + "loss": 1.0297, + "step": 2279 + }, + { + "epoch": 0.8860390556689012, + "grad_norm": 0.19904713332653046, + "learning_rate": 2.296613468275594e-05, + "loss": 0.9693, + "step": 2280 + }, + { + "epoch": 0.8864276692898087, + "grad_norm": 0.23006491363048553, + "learning_rate": 2.2888283378746594e-05, + "loss": 1.0409, + "step": 2281 + }, + { + "epoch": 0.886816282910716, + "grad_norm": 0.2179296761751175, + "learning_rate": 2.2810432074737253e-05, + "loss": 1.0433, + "step": 2282 + }, + { + "epoch": 0.8872048965316235, + "grad_norm": 0.19764657318592072, + "learning_rate": 2.273258077072791e-05, + "loss": 
0.9807, + "step": 2283 + }, + { + "epoch": 0.8875935101525309, + "grad_norm": 0.23379875719547272, + "learning_rate": 2.265472946671857e-05, + "loss": 1.1025, + "step": 2284 + }, + { + "epoch": 0.8879821237734382, + "grad_norm": 0.2069517821073532, + "learning_rate": 2.2576878162709226e-05, + "loss": 1.0466, + "step": 2285 + }, + { + "epoch": 0.8883707373943457, + "grad_norm": 0.22321875393390656, + "learning_rate": 2.2499026858699885e-05, + "loss": 1.0548, + "step": 2286 + }, + { + "epoch": 0.888759351015253, + "grad_norm": 0.2070666253566742, + "learning_rate": 2.2421175554690543e-05, + "loss": 1.0168, + "step": 2287 + }, + { + "epoch": 0.8891479646361605, + "grad_norm": 0.1939924657344818, + "learning_rate": 2.23433242506812e-05, + "loss": 1.0008, + "step": 2288 + }, + { + "epoch": 0.8895365782570679, + "grad_norm": 0.22350658476352692, + "learning_rate": 2.2265472946671858e-05, + "loss": 1.0469, + "step": 2289 + }, + { + "epoch": 0.8899251918779754, + "grad_norm": 0.19934551417827606, + "learning_rate": 2.2187621642662516e-05, + "loss": 0.977, + "step": 2290 + }, + { + "epoch": 0.8903138054988827, + "grad_norm": 0.22848142683506012, + "learning_rate": 2.2109770338653175e-05, + "loss": 1.0642, + "step": 2291 + }, + { + "epoch": 0.8907024191197902, + "grad_norm": 0.20296107232570648, + "learning_rate": 2.203191903464383e-05, + "loss": 1.0332, + "step": 2292 + }, + { + "epoch": 0.8910910327406976, + "grad_norm": 0.19952169060707092, + "learning_rate": 2.195406773063449e-05, + "loss": 1.0249, + "step": 2293 + }, + { + "epoch": 0.8914796463616049, + "grad_norm": 0.22449292242527008, + "learning_rate": 2.1876216426625145e-05, + "loss": 1.0572, + "step": 2294 + }, + { + "epoch": 0.8918682599825124, + "grad_norm": 0.20287659764289856, + "learning_rate": 2.1798365122615804e-05, + "loss": 1.0331, + "step": 2295 + }, + { + "epoch": 0.8922568736034198, + "grad_norm": 0.2029801905155182, + "learning_rate": 2.1720513818606463e-05, + "loss": 1.0326, + "step": 2296 + }, + { + 
"epoch": 0.8926454872243272, + "grad_norm": 0.21909672021865845, + "learning_rate": 2.164266251459712e-05, + "loss": 1.0903, + "step": 2297 + }, + { + "epoch": 0.8930341008452346, + "grad_norm": 0.21067824959754944, + "learning_rate": 2.156481121058778e-05, + "loss": 1.0425, + "step": 2298 + }, + { + "epoch": 0.8934227144661421, + "grad_norm": 0.20612956583499908, + "learning_rate": 2.1486959906578436e-05, + "loss": 1.0269, + "step": 2299 + }, + { + "epoch": 0.8938113280870494, + "grad_norm": 0.22750885784626007, + "learning_rate": 2.1409108602569095e-05, + "loss": 1.081, + "step": 2300 + }, + { + "epoch": 0.8941999417079569, + "grad_norm": 0.2192569077014923, + "learning_rate": 2.133125729855975e-05, + "loss": 1.0305, + "step": 2301 + }, + { + "epoch": 0.8945885553288643, + "grad_norm": 0.2150728702545166, + "learning_rate": 2.125340599455041e-05, + "loss": 1.0369, + "step": 2302 + }, + { + "epoch": 0.8949771689497716, + "grad_norm": 0.2095833718776703, + "learning_rate": 2.1175554690541068e-05, + "loss": 1.0392, + "step": 2303 + }, + { + "epoch": 0.8953657825706791, + "grad_norm": 0.2074289619922638, + "learning_rate": 2.1097703386531726e-05, + "loss": 0.9893, + "step": 2304 + }, + { + "epoch": 0.8957543961915865, + "grad_norm": 0.20826508104801178, + "learning_rate": 2.1019852082522385e-05, + "loss": 1.0737, + "step": 2305 + }, + { + "epoch": 0.896143009812494, + "grad_norm": 0.20254862308502197, + "learning_rate": 2.094200077851304e-05, + "loss": 1.0251, + "step": 2306 + }, + { + "epoch": 0.8965316234334013, + "grad_norm": 0.20950356125831604, + "learning_rate": 2.08641494745037e-05, + "loss": 1.026, + "step": 2307 + }, + { + "epoch": 0.8969202370543088, + "grad_norm": 0.20761284232139587, + "learning_rate": 2.0786298170494355e-05, + "loss": 1.0556, + "step": 2308 + }, + { + "epoch": 0.8973088506752162, + "grad_norm": 0.1943255513906479, + "learning_rate": 2.0708446866485014e-05, + "loss": 0.9745, + "step": 2309 + }, + { + "epoch": 0.8976974642961236, + 
"grad_norm": 0.19723530113697052, + "learning_rate": 2.0630595562475673e-05, + "loss": 0.9764, + "step": 2310 + }, + { + "epoch": 0.898086077917031, + "grad_norm": 0.21135687828063965, + "learning_rate": 2.055274425846633e-05, + "loss": 1.0289, + "step": 2311 + }, + { + "epoch": 0.8984746915379384, + "grad_norm": 0.20867012441158295, + "learning_rate": 2.047489295445699e-05, + "loss": 1.0659, + "step": 2312 + }, + { + "epoch": 0.8988633051588458, + "grad_norm": 0.1999632567167282, + "learning_rate": 2.0397041650447646e-05, + "loss": 0.9699, + "step": 2313 + }, + { + "epoch": 0.8992519187797532, + "grad_norm": 0.2080952674150467, + "learning_rate": 2.0319190346438305e-05, + "loss": 1.0097, + "step": 2314 + }, + { + "epoch": 0.8996405324006607, + "grad_norm": 0.20419847965240479, + "learning_rate": 2.024133904242896e-05, + "loss": 1.0272, + "step": 2315 + }, + { + "epoch": 0.900029146021568, + "grad_norm": 0.19433575868606567, + "learning_rate": 2.016348773841962e-05, + "loss": 0.9892, + "step": 2316 + }, + { + "epoch": 0.9004177596424755, + "grad_norm": 0.20644325017929077, + "learning_rate": 2.0085636434410278e-05, + "loss": 0.9978, + "step": 2317 + }, + { + "epoch": 0.9008063732633829, + "grad_norm": 0.2145605981349945, + "learning_rate": 2.0007785130400936e-05, + "loss": 1.0569, + "step": 2318 + }, + { + "epoch": 0.9011949868842903, + "grad_norm": 0.2073410153388977, + "learning_rate": 1.9929933826391592e-05, + "loss": 1.0937, + "step": 2319 + }, + { + "epoch": 0.9015836005051977, + "grad_norm": 0.2169773280620575, + "learning_rate": 1.985208252238225e-05, + "loss": 1.0559, + "step": 2320 + }, + { + "epoch": 0.9019722141261051, + "grad_norm": 0.2153279334306717, + "learning_rate": 1.977423121837291e-05, + "loss": 1.074, + "step": 2321 + }, + { + "epoch": 0.9023608277470125, + "grad_norm": 0.2089853584766388, + "learning_rate": 1.9696379914363565e-05, + "loss": 0.9971, + "step": 2322 + }, + { + "epoch": 0.9027494413679199, + "grad_norm": 0.21813471615314484, + 
"learning_rate": 1.9618528610354224e-05, + "loss": 1.0408, + "step": 2323 + }, + { + "epoch": 0.9031380549888274, + "grad_norm": 0.19753578305244446, + "learning_rate": 1.9540677306344883e-05, + "loss": 0.9429, + "step": 2324 + }, + { + "epoch": 0.9035266686097347, + "grad_norm": 0.19760333001613617, + "learning_rate": 1.946282600233554e-05, + "loss": 1.0127, + "step": 2325 + }, + { + "epoch": 0.9039152822306422, + "grad_norm": 0.21375150978565216, + "learning_rate": 1.9384974698326197e-05, + "loss": 1.0166, + "step": 2326 + }, + { + "epoch": 0.9043038958515496, + "grad_norm": 0.21019572019577026, + "learning_rate": 1.9307123394316856e-05, + "loss": 0.9897, + "step": 2327 + }, + { + "epoch": 0.904692509472457, + "grad_norm": 0.20336006581783295, + "learning_rate": 1.922927209030751e-05, + "loss": 0.9788, + "step": 2328 + }, + { + "epoch": 0.9050811230933644, + "grad_norm": 0.20877422392368317, + "learning_rate": 1.915142078629817e-05, + "loss": 1.0257, + "step": 2329 + }, + { + "epoch": 0.9054697367142719, + "grad_norm": 0.21499283611774445, + "learning_rate": 1.9073569482288832e-05, + "loss": 1.0628, + "step": 2330 + }, + { + "epoch": 0.9058583503351793, + "grad_norm": 0.2943152189254761, + "learning_rate": 1.8995718178279488e-05, + "loss": 1.0859, + "step": 2331 + }, + { + "epoch": 0.9062469639560866, + "grad_norm": 0.20630142092704773, + "learning_rate": 1.8917866874270146e-05, + "loss": 1.0625, + "step": 2332 + }, + { + "epoch": 0.9066355775769941, + "grad_norm": 0.19609740376472473, + "learning_rate": 1.8840015570260802e-05, + "loss": 1.0043, + "step": 2333 + }, + { + "epoch": 0.9070241911979015, + "grad_norm": 0.21231451630592346, + "learning_rate": 1.876216426625146e-05, + "loss": 1.0534, + "step": 2334 + }, + { + "epoch": 0.9074128048188089, + "grad_norm": 0.2212425172328949, + "learning_rate": 1.8684312962242116e-05, + "loss": 0.99, + "step": 2335 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 0.21141575276851654, + "learning_rate": 
1.860646165823278e-05, + "loss": 1.0442, + "step": 2336 + }, + { + "epoch": 0.9081900320606238, + "grad_norm": 0.20657780766487122, + "learning_rate": 1.8528610354223434e-05, + "loss": 1.0363, + "step": 2337 + }, + { + "epoch": 0.9085786456815311, + "grad_norm": 0.1973218023777008, + "learning_rate": 1.8450759050214093e-05, + "loss": 0.9868, + "step": 2338 + }, + { + "epoch": 0.9089672593024386, + "grad_norm": 0.19639235734939575, + "learning_rate": 1.837290774620475e-05, + "loss": 0.9865, + "step": 2339 + }, + { + "epoch": 0.909355872923346, + "grad_norm": 0.194901704788208, + "learning_rate": 1.8295056442195407e-05, + "loss": 0.9776, + "step": 2340 + }, + { + "epoch": 0.9097444865442533, + "grad_norm": 0.1907500922679901, + "learning_rate": 1.8217205138186066e-05, + "loss": 1.0048, + "step": 2341 + }, + { + "epoch": 0.9101331001651608, + "grad_norm": 0.20842313766479492, + "learning_rate": 1.813935383417672e-05, + "loss": 0.9773, + "step": 2342 + }, + { + "epoch": 0.9105217137860682, + "grad_norm": 0.2537369132041931, + "learning_rate": 1.8061502530167383e-05, + "loss": 0.9932, + "step": 2343 + }, + { + "epoch": 0.9109103274069756, + "grad_norm": 0.22774042189121246, + "learning_rate": 1.798365122615804e-05, + "loss": 1.1521, + "step": 2344 + }, + { + "epoch": 0.911298941027883, + "grad_norm": 0.192257359623909, + "learning_rate": 1.7905799922148698e-05, + "loss": 0.9707, + "step": 2345 + }, + { + "epoch": 0.9116875546487905, + "grad_norm": 0.21573100984096527, + "learning_rate": 1.7827948618139353e-05, + "loss": 1.0355, + "step": 2346 + }, + { + "epoch": 0.9120761682696978, + "grad_norm": 0.215474933385849, + "learning_rate": 1.7750097314130012e-05, + "loss": 1.0408, + "step": 2347 + }, + { + "epoch": 0.9124647818906053, + "grad_norm": 0.2031407654285431, + "learning_rate": 1.767224601012067e-05, + "loss": 1.0429, + "step": 2348 + }, + { + "epoch": 0.9128533955115127, + "grad_norm": 0.20461305975914001, + "learning_rate": 1.759439470611133e-05, + "loss": 1.0033, 
+ "step": 2349 + }, + { + "epoch": 0.91324200913242, + "grad_norm": 0.20995965600013733, + "learning_rate": 1.7516543402101988e-05, + "loss": 1.089, + "step": 2350 + }, + { + "epoch": 0.9136306227533275, + "grad_norm": 0.20464631915092468, + "learning_rate": 1.7438692098092644e-05, + "loss": 1.0438, + "step": 2351 + }, + { + "epoch": 0.9140192363742349, + "grad_norm": 0.20657162368297577, + "learning_rate": 1.7360840794083303e-05, + "loss": 1.0687, + "step": 2352 + }, + { + "epoch": 0.9144078499951424, + "grad_norm": 0.20419646799564362, + "learning_rate": 1.7282989490073958e-05, + "loss": 1.0412, + "step": 2353 + }, + { + "epoch": 0.9147964636160497, + "grad_norm": 0.20655421912670135, + "learning_rate": 1.7205138186064617e-05, + "loss": 1.0343, + "step": 2354 + }, + { + "epoch": 0.9151850772369572, + "grad_norm": 0.20393185317516327, + "learning_rate": 1.7127286882055276e-05, + "loss": 1.0379, + "step": 2355 + }, + { + "epoch": 0.9155736908578646, + "grad_norm": 0.20768289268016815, + "learning_rate": 1.7049435578045934e-05, + "loss": 1.022, + "step": 2356 + }, + { + "epoch": 0.915962304478772, + "grad_norm": 0.2257547676563263, + "learning_rate": 1.6971584274036593e-05, + "loss": 1.1081, + "step": 2357 + }, + { + "epoch": 0.9163509180996794, + "grad_norm": 0.1980145126581192, + "learning_rate": 1.689373297002725e-05, + "loss": 1.0439, + "step": 2358 + }, + { + "epoch": 0.9167395317205868, + "grad_norm": 0.20351259410381317, + "learning_rate": 1.6815881666017908e-05, + "loss": 1.0363, + "step": 2359 + }, + { + "epoch": 0.9171281453414942, + "grad_norm": 0.20830631256103516, + "learning_rate": 1.6738030362008563e-05, + "loss": 1.0467, + "step": 2360 + }, + { + "epoch": 0.9175167589624016, + "grad_norm": 0.21225905418395996, + "learning_rate": 1.6660179057999222e-05, + "loss": 1.0611, + "step": 2361 + }, + { + "epoch": 0.9179053725833091, + "grad_norm": 0.20069880783557892, + "learning_rate": 1.658232775398988e-05, + "loss": 0.9989, + "step": 2362 + }, + { + 
"epoch": 0.9182939862042164, + "grad_norm": 0.21674825251102448, + "learning_rate": 1.650447644998054e-05, + "loss": 1.0578, + "step": 2363 + }, + { + "epoch": 0.9186825998251239, + "grad_norm": 0.20438091456890106, + "learning_rate": 1.6426625145971195e-05, + "loss": 1.0593, + "step": 2364 + }, + { + "epoch": 0.9190712134460313, + "grad_norm": 0.2195381075143814, + "learning_rate": 1.6348773841961854e-05, + "loss": 1.0354, + "step": 2365 + }, + { + "epoch": 0.9194598270669387, + "grad_norm": 0.21371111273765564, + "learning_rate": 1.6270922537952513e-05, + "loss": 0.9911, + "step": 2366 + }, + { + "epoch": 0.9198484406878461, + "grad_norm": 0.22097980976104736, + "learning_rate": 1.6193071233943168e-05, + "loss": 1.0064, + "step": 2367 + }, + { + "epoch": 0.9202370543087536, + "grad_norm": 0.20589159429073334, + "learning_rate": 1.6115219929933827e-05, + "loss": 1.0173, + "step": 2368 + }, + { + "epoch": 0.9206256679296609, + "grad_norm": 0.19218075275421143, + "learning_rate": 1.6037368625924486e-05, + "loss": 1.0215, + "step": 2369 + }, + { + "epoch": 0.9210142815505683, + "grad_norm": 0.2132728099822998, + "learning_rate": 1.5959517321915144e-05, + "loss": 1.0493, + "step": 2370 + }, + { + "epoch": 0.9214028951714758, + "grad_norm": 0.20006981492042542, + "learning_rate": 1.58816660179058e-05, + "loss": 0.9814, + "step": 2371 + }, + { + "epoch": 0.9217915087923831, + "grad_norm": 0.21600167453289032, + "learning_rate": 1.580381471389646e-05, + "loss": 1.0759, + "step": 2372 + }, + { + "epoch": 0.9221801224132906, + "grad_norm": 0.21474605798721313, + "learning_rate": 1.5725963409887114e-05, + "loss": 1.0411, + "step": 2373 + }, + { + "epoch": 0.922568736034198, + "grad_norm": 0.2044600546360016, + "learning_rate": 1.5648112105877773e-05, + "loss": 1.0236, + "step": 2374 + }, + { + "epoch": 0.9229573496551055, + "grad_norm": 0.20302869379520416, + "learning_rate": 1.5570260801868432e-05, + "loss": 0.9982, + "step": 2375 + }, + { + "epoch": 0.9233459632760128, + 
"grad_norm": 0.21155263483524323, + "learning_rate": 1.549240949785909e-05, + "loss": 1.0249, + "step": 2376 + }, + { + "epoch": 0.9237345768969203, + "grad_norm": 0.20336754620075226, + "learning_rate": 1.541455819384975e-05, + "loss": 1.0223, + "step": 2377 + }, + { + "epoch": 0.9241231905178277, + "grad_norm": 0.20189301669597626, + "learning_rate": 1.5336706889840405e-05, + "loss": 1.0228, + "step": 2378 + }, + { + "epoch": 0.924511804138735, + "grad_norm": 0.1962178647518158, + "learning_rate": 1.5258855585831064e-05, + "loss": 1.0137, + "step": 2379 + }, + { + "epoch": 0.9249004177596425, + "grad_norm": 0.21523639559745789, + "learning_rate": 1.518100428182172e-05, + "loss": 1.0498, + "step": 2380 + }, + { + "epoch": 0.9252890313805499, + "grad_norm": 0.20537924766540527, + "learning_rate": 1.510315297781238e-05, + "loss": 0.9995, + "step": 2381 + }, + { + "epoch": 0.9256776450014573, + "grad_norm": 0.21170039474964142, + "learning_rate": 1.5025301673803035e-05, + "loss": 1.0953, + "step": 2382 + }, + { + "epoch": 0.9260662586223647, + "grad_norm": 0.20737627148628235, + "learning_rate": 1.4947450369793694e-05, + "loss": 0.9892, + "step": 2383 + }, + { + "epoch": 0.9264548722432722, + "grad_norm": 0.20684003829956055, + "learning_rate": 1.4869599065784354e-05, + "loss": 1.0468, + "step": 2384 + }, + { + "epoch": 0.9268434858641795, + "grad_norm": 0.20738738775253296, + "learning_rate": 1.479174776177501e-05, + "loss": 1.0436, + "step": 2385 + }, + { + "epoch": 0.927232099485087, + "grad_norm": 0.19740383327007294, + "learning_rate": 1.4713896457765669e-05, + "loss": 0.9528, + "step": 2386 + }, + { + "epoch": 0.9276207131059944, + "grad_norm": 0.20328152179718018, + "learning_rate": 1.4636045153756326e-05, + "loss": 1.0272, + "step": 2387 + }, + { + "epoch": 0.9280093267269017, + "grad_norm": 0.2008744776248932, + "learning_rate": 1.4558193849746985e-05, + "loss": 1.0441, + "step": 2388 + }, + { + "epoch": 0.9283979403478092, + "grad_norm": 
0.19907627999782562, + "learning_rate": 1.448034254573764e-05, + "loss": 0.9929, + "step": 2389 + }, + { + "epoch": 0.9287865539687166, + "grad_norm": 0.20299683511257172, + "learning_rate": 1.44024912417283e-05, + "loss": 0.9749, + "step": 2390 + }, + { + "epoch": 0.929175167589624, + "grad_norm": 0.21035155653953552, + "learning_rate": 1.432463993771896e-05, + "loss": 1.0356, + "step": 2391 + }, + { + "epoch": 0.9295637812105314, + "grad_norm": 0.20862546563148499, + "learning_rate": 1.4246788633709615e-05, + "loss": 1.0594, + "step": 2392 + }, + { + "epoch": 0.9299523948314389, + "grad_norm": 0.20775675773620605, + "learning_rate": 1.4168937329700274e-05, + "loss": 0.9959, + "step": 2393 + }, + { + "epoch": 0.9303410084523462, + "grad_norm": 0.1970052868127823, + "learning_rate": 1.409108602569093e-05, + "loss": 0.9956, + "step": 2394 + }, + { + "epoch": 0.9307296220732537, + "grad_norm": 0.2167968600988388, + "learning_rate": 1.401323472168159e-05, + "loss": 1.0202, + "step": 2395 + }, + { + "epoch": 0.9311182356941611, + "grad_norm": 0.20822198688983917, + "learning_rate": 1.3935383417672245e-05, + "loss": 1.0067, + "step": 2396 + }, + { + "epoch": 0.9315068493150684, + "grad_norm": 0.2004898339509964, + "learning_rate": 1.3857532113662906e-05, + "loss": 1.0069, + "step": 2397 + }, + { + "epoch": 0.9318954629359759, + "grad_norm": 0.22808429598808289, + "learning_rate": 1.3779680809653561e-05, + "loss": 1.1032, + "step": 2398 + }, + { + "epoch": 0.9322840765568833, + "grad_norm": 0.19940750300884247, + "learning_rate": 1.370182950564422e-05, + "loss": 0.9965, + "step": 2399 + }, + { + "epoch": 0.9326726901777908, + "grad_norm": 0.21138110756874084, + "learning_rate": 1.3623978201634879e-05, + "loss": 0.986, + "step": 2400 + }, + { + "epoch": 0.9330613037986981, + "grad_norm": 0.2118709534406662, + "learning_rate": 1.3546126897625536e-05, + "loss": 1.0672, + "step": 2401 + }, + { + "epoch": 0.9334499174196056, + "grad_norm": 0.22121763229370117, + 
"learning_rate": 1.3468275593616195e-05, + "loss": 1.0204, + "step": 2402 + }, + { + "epoch": 0.933838531040513, + "grad_norm": 0.20541204512119293, + "learning_rate": 1.3390424289606852e-05, + "loss": 1.0749, + "step": 2403 + }, + { + "epoch": 0.9342271446614204, + "grad_norm": 0.19598713517189026, + "learning_rate": 1.331257298559751e-05, + "loss": 0.9638, + "step": 2404 + }, + { + "epoch": 0.9346157582823278, + "grad_norm": 0.2157907783985138, + "learning_rate": 1.3234721681588166e-05, + "loss": 1.0312, + "step": 2405 + }, + { + "epoch": 0.9350043719032352, + "grad_norm": 0.19694723188877106, + "learning_rate": 1.3156870377578825e-05, + "loss": 1.0001, + "step": 2406 + }, + { + "epoch": 0.9353929855241426, + "grad_norm": 0.209597647190094, + "learning_rate": 1.3079019073569482e-05, + "loss": 0.9808, + "step": 2407 + }, + { + "epoch": 0.93578159914505, + "grad_norm": 0.2026679664850235, + "learning_rate": 1.300116776956014e-05, + "loss": 0.9938, + "step": 2408 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.20847374200820923, + "learning_rate": 1.29233164655508e-05, + "loss": 0.9948, + "step": 2409 + }, + { + "epoch": 0.9365588263868648, + "grad_norm": 0.23478667438030243, + "learning_rate": 1.2845465161541457e-05, + "loss": 1.0549, + "step": 2410 + }, + { + "epoch": 0.9369474400077723, + "grad_norm": 0.20954233407974243, + "learning_rate": 1.2767613857532116e-05, + "loss": 1.0336, + "step": 2411 + }, + { + "epoch": 0.9373360536286797, + "grad_norm": 0.2130623608827591, + "learning_rate": 1.2689762553522771e-05, + "loss": 1.0398, + "step": 2412 + }, + { + "epoch": 0.9377246672495871, + "grad_norm": 0.20076791942119598, + "learning_rate": 1.261191124951343e-05, + "loss": 0.9945, + "step": 2413 + }, + { + "epoch": 0.9381132808704945, + "grad_norm": 0.21280889213085175, + "learning_rate": 1.2534059945504087e-05, + "loss": 1.0538, + "step": 2414 + }, + { + "epoch": 0.938501894491402, + "grad_norm": 0.19909800589084625, + "learning_rate": 
1.2456208641494746e-05, + "loss": 0.9783, + "step": 2415 + }, + { + "epoch": 0.9388905081123093, + "grad_norm": 0.21449251472949982, + "learning_rate": 1.2378357337485403e-05, + "loss": 1.0547, + "step": 2416 + }, + { + "epoch": 0.9392791217332167, + "grad_norm": 0.20742881298065186, + "learning_rate": 1.2300506033476062e-05, + "loss": 1.0471, + "step": 2417 + }, + { + "epoch": 0.9396677353541242, + "grad_norm": 0.21160250902175903, + "learning_rate": 1.2222654729466719e-05, + "loss": 1.0089, + "step": 2418 + }, + { + "epoch": 0.9400563489750315, + "grad_norm": 0.22055311501026154, + "learning_rate": 1.2144803425457376e-05, + "loss": 1.0201, + "step": 2419 + }, + { + "epoch": 0.940444962595939, + "grad_norm": 0.21073050796985626, + "learning_rate": 1.2066952121448035e-05, + "loss": 1.0025, + "step": 2420 + }, + { + "epoch": 0.9408335762168464, + "grad_norm": 0.19758272171020508, + "learning_rate": 1.1989100817438692e-05, + "loss": 0.9643, + "step": 2421 + }, + { + "epoch": 0.9412221898377539, + "grad_norm": 0.20312103629112244, + "learning_rate": 1.191124951342935e-05, + "loss": 1.023, + "step": 2422 + }, + { + "epoch": 0.9416108034586612, + "grad_norm": 0.19969260692596436, + "learning_rate": 1.183339820942001e-05, + "loss": 0.9623, + "step": 2423 + }, + { + "epoch": 0.9419994170795687, + "grad_norm": 0.21867750585079193, + "learning_rate": 1.1755546905410667e-05, + "loss": 1.0895, + "step": 2424 + }, + { + "epoch": 0.942388030700476, + "grad_norm": 0.19672009348869324, + "learning_rate": 1.1677695601401324e-05, + "loss": 1.0253, + "step": 2425 + }, + { + "epoch": 0.9427766443213834, + "grad_norm": 0.20442704856395721, + "learning_rate": 1.1599844297391983e-05, + "loss": 1.0515, + "step": 2426 + }, + { + "epoch": 0.9431652579422909, + "grad_norm": 0.2008974254131317, + "learning_rate": 1.152199299338264e-05, + "loss": 0.9934, + "step": 2427 + }, + { + "epoch": 0.9435538715631983, + "grad_norm": 0.20074884593486786, + "learning_rate": 1.1444141689373297e-05, + 
"loss": 0.9792, + "step": 2428 + }, + { + "epoch": 0.9439424851841057, + "grad_norm": 0.1945987194776535, + "learning_rate": 1.1366290385363956e-05, + "loss": 0.991, + "step": 2429 + }, + { + "epoch": 0.9443310988050131, + "grad_norm": 0.2123355269432068, + "learning_rate": 1.1288439081354613e-05, + "loss": 0.9768, + "step": 2430 + }, + { + "epoch": 0.9447197124259206, + "grad_norm": 0.19462116062641144, + "learning_rate": 1.1210587777345272e-05, + "loss": 1.0221, + "step": 2431 + }, + { + "epoch": 0.9451083260468279, + "grad_norm": 0.21487726271152496, + "learning_rate": 1.1132736473335929e-05, + "loss": 1.0273, + "step": 2432 + }, + { + "epoch": 0.9454969396677354, + "grad_norm": 0.2011580765247345, + "learning_rate": 1.1054885169326588e-05, + "loss": 1.0065, + "step": 2433 + }, + { + "epoch": 0.9458855532886428, + "grad_norm": 0.2009819597005844, + "learning_rate": 1.0977033865317245e-05, + "loss": 1.04, + "step": 2434 + }, + { + "epoch": 0.9462741669095501, + "grad_norm": 0.20142634212970734, + "learning_rate": 1.0899182561307902e-05, + "loss": 1.0101, + "step": 2435 + }, + { + "epoch": 0.9466627805304576, + "grad_norm": 0.20323152840137482, + "learning_rate": 1.082133125729856e-05, + "loss": 1.0039, + "step": 2436 + }, + { + "epoch": 0.947051394151365, + "grad_norm": 0.18746018409729004, + "learning_rate": 1.0743479953289218e-05, + "loss": 0.9876, + "step": 2437 + }, + { + "epoch": 0.9474400077722724, + "grad_norm": 0.20016197860240936, + "learning_rate": 1.0665628649279875e-05, + "loss": 1.0067, + "step": 2438 + }, + { + "epoch": 0.9478286213931798, + "grad_norm": 0.19872961938381195, + "learning_rate": 1.0587777345270534e-05, + "loss": 0.9884, + "step": 2439 + }, + { + "epoch": 0.9482172350140873, + "grad_norm": 0.20647788047790527, + "learning_rate": 1.0509926041261193e-05, + "loss": 1.0088, + "step": 2440 + }, + { + "epoch": 0.9486058486349946, + "grad_norm": 0.20790119469165802, + "learning_rate": 1.043207473725185e-05, + "loss": 1.0201, + "step": 2441 + 
}, + { + "epoch": 0.9489944622559021, + "grad_norm": 0.20318609476089478, + "learning_rate": 1.0354223433242507e-05, + "loss": 1.0199, + "step": 2442 + }, + { + "epoch": 0.9493830758768095, + "grad_norm": 0.21426942944526672, + "learning_rate": 1.0276372129233166e-05, + "loss": 1.0047, + "step": 2443 + }, + { + "epoch": 0.9497716894977168, + "grad_norm": 0.3223714828491211, + "learning_rate": 1.0198520825223823e-05, + "loss": 1.0532, + "step": 2444 + }, + { + "epoch": 0.9501603031186243, + "grad_norm": 0.2070651799440384, + "learning_rate": 1.012066952121448e-05, + "loss": 1.0576, + "step": 2445 + }, + { + "epoch": 0.9505489167395317, + "grad_norm": 0.20618025958538055, + "learning_rate": 1.0042818217205139e-05, + "loss": 1.061, + "step": 2446 + }, + { + "epoch": 0.9509375303604392, + "grad_norm": 0.20535731315612793, + "learning_rate": 9.964966913195796e-06, + "loss": 0.9923, + "step": 2447 + }, + { + "epoch": 0.9513261439813465, + "grad_norm": 0.21038392186164856, + "learning_rate": 9.887115609186455e-06, + "loss": 1.0257, + "step": 2448 + }, + { + "epoch": 0.951714757602254, + "grad_norm": 0.20872676372528076, + "learning_rate": 9.809264305177112e-06, + "loss": 1.0147, + "step": 2449 + }, + { + "epoch": 0.9521033712231614, + "grad_norm": 0.40158966183662415, + "learning_rate": 9.73141300116777e-06, + "loss": 1.0071, + "step": 2450 + }, + { + "epoch": 0.9524919848440688, + "grad_norm": 0.1991165280342102, + "learning_rate": 9.653561697158428e-06, + "loss": 0.9829, + "step": 2451 + }, + { + "epoch": 0.9528805984649762, + "grad_norm": 0.1965460628271103, + "learning_rate": 9.575710393149085e-06, + "loss": 1.0286, + "step": 2452 + }, + { + "epoch": 0.9532692120858836, + "grad_norm": 0.20879510045051575, + "learning_rate": 9.497859089139744e-06, + "loss": 1.0707, + "step": 2453 + }, + { + "epoch": 0.953657825706791, + "grad_norm": 0.19594980776309967, + "learning_rate": 9.420007785130401e-06, + "loss": 0.9946, + "step": 2454 + }, + { + "epoch": 0.9540464393276984, + 
"grad_norm": 0.19754594564437866, + "learning_rate": 9.342156481121058e-06, + "loss": 0.9737, + "step": 2455 + }, + { + "epoch": 0.9544350529486059, + "grad_norm": 0.21339558064937592, + "learning_rate": 9.264305177111717e-06, + "loss": 1.0505, + "step": 2456 + }, + { + "epoch": 0.9548236665695132, + "grad_norm": 0.20371811091899872, + "learning_rate": 9.186453873102376e-06, + "loss": 1.0594, + "step": 2457 + }, + { + "epoch": 0.9552122801904207, + "grad_norm": 0.20965653657913208, + "learning_rate": 9.108602569093033e-06, + "loss": 1.0639, + "step": 2458 + }, + { + "epoch": 0.9556008938113281, + "grad_norm": 0.20316167175769806, + "learning_rate": 9.030751265083692e-06, + "loss": 1.0219, + "step": 2459 + }, + { + "epoch": 0.9559895074322355, + "grad_norm": 0.19921238720417023, + "learning_rate": 8.952899961074349e-06, + "loss": 1.0399, + "step": 2460 + }, + { + "epoch": 0.9563781210531429, + "grad_norm": 0.196847602725029, + "learning_rate": 8.875048657065006e-06, + "loss": 0.9678, + "step": 2461 + }, + { + "epoch": 0.9567667346740504, + "grad_norm": 0.20746973156929016, + "learning_rate": 8.797197353055665e-06, + "loss": 1.0365, + "step": 2462 + }, + { + "epoch": 0.9571553482949577, + "grad_norm": 0.3297490179538727, + "learning_rate": 8.719346049046322e-06, + "loss": 1.0028, + "step": 2463 + }, + { + "epoch": 0.9575439619158651, + "grad_norm": 0.2101137936115265, + "learning_rate": 8.641494745036979e-06, + "loss": 1.0627, + "step": 2464 + }, + { + "epoch": 0.9579325755367726, + "grad_norm": 0.2444445937871933, + "learning_rate": 8.563643441027638e-06, + "loss": 0.9866, + "step": 2465 + }, + { + "epoch": 0.9583211891576799, + "grad_norm": 0.20323987305164337, + "learning_rate": 8.485792137018297e-06, + "loss": 1.0123, + "step": 2466 + }, + { + "epoch": 0.9587098027785874, + "grad_norm": 0.21334567666053772, + "learning_rate": 8.407940833008954e-06, + "loss": 1.0492, + "step": 2467 + }, + { + "epoch": 0.9590984163994948, + "grad_norm": 0.19852736592292786, + 
"learning_rate": 8.330089528999611e-06, + "loss": 1.0303, + "step": 2468 + }, + { + "epoch": 0.9594870300204023, + "grad_norm": 0.1995389610528946, + "learning_rate": 8.25223822499027e-06, + "loss": 0.9758, + "step": 2469 + }, + { + "epoch": 0.9598756436413096, + "grad_norm": 0.19799165427684784, + "learning_rate": 8.174386920980927e-06, + "loss": 0.9541, + "step": 2470 + }, + { + "epoch": 0.9602642572622171, + "grad_norm": 0.21066170930862427, + "learning_rate": 8.096535616971584e-06, + "loss": 1.0389, + "step": 2471 + }, + { + "epoch": 0.9606528708831245, + "grad_norm": 0.19671034812927246, + "learning_rate": 8.018684312962243e-06, + "loss": 0.9791, + "step": 2472 + }, + { + "epoch": 0.9610414845040318, + "grad_norm": 0.2106933444738388, + "learning_rate": 7.9408330089529e-06, + "loss": 0.9479, + "step": 2473 + }, + { + "epoch": 0.9614300981249393, + "grad_norm": 0.20396657288074493, + "learning_rate": 7.862981704943557e-06, + "loss": 1.0068, + "step": 2474 + }, + { + "epoch": 0.9618187117458467, + "grad_norm": 0.19684381783008575, + "learning_rate": 7.785130400934216e-06, + "loss": 1.0347, + "step": 2475 + }, + { + "epoch": 0.9622073253667541, + "grad_norm": 0.19494709372520447, + "learning_rate": 7.707279096924875e-06, + "loss": 0.9997, + "step": 2476 + }, + { + "epoch": 0.9625959389876615, + "grad_norm": 0.21996809542179108, + "learning_rate": 7.629427792915532e-06, + "loss": 1.0517, + "step": 2477 + }, + { + "epoch": 0.962984552608569, + "grad_norm": 0.2083420753479004, + "learning_rate": 7.55157648890619e-06, + "loss": 1.0483, + "step": 2478 + }, + { + "epoch": 0.9633731662294763, + "grad_norm": 0.2018081396818161, + "learning_rate": 7.473725184896847e-06, + "loss": 1.0167, + "step": 2479 + }, + { + "epoch": 0.9637617798503838, + "grad_norm": 0.22427868843078613, + "learning_rate": 7.395873880887505e-06, + "loss": 0.9759, + "step": 2480 + }, + { + "epoch": 0.9641503934712912, + "grad_norm": 0.2190699577331543, + "learning_rate": 7.318022576878163e-06, + 
"loss": 1.049, + "step": 2481 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 0.2035333812236786, + "learning_rate": 7.24017127286882e-06, + "loss": 1.0556, + "step": 2482 + }, + { + "epoch": 0.964927620713106, + "grad_norm": 0.20165729522705078, + "learning_rate": 7.16231996885948e-06, + "loss": 0.9958, + "step": 2483 + }, + { + "epoch": 0.9653162343340134, + "grad_norm": 0.20284077525138855, + "learning_rate": 7.084468664850137e-06, + "loss": 1.0146, + "step": 2484 + }, + { + "epoch": 0.9657048479549208, + "grad_norm": 0.1984403133392334, + "learning_rate": 7.006617360840795e-06, + "loss": 0.9797, + "step": 2485 + }, + { + "epoch": 0.9660934615758282, + "grad_norm": 0.22276800870895386, + "learning_rate": 6.928766056831453e-06, + "loss": 1.042, + "step": 2486 + }, + { + "epoch": 0.9664820751967357, + "grad_norm": 0.18282116949558258, + "learning_rate": 6.85091475282211e-06, + "loss": 0.9681, + "step": 2487 + }, + { + "epoch": 0.966870688817643, + "grad_norm": 0.19382023811340332, + "learning_rate": 6.773063448812768e-06, + "loss": 0.9991, + "step": 2488 + }, + { + "epoch": 0.9672593024385505, + "grad_norm": 0.2009381204843521, + "learning_rate": 6.695212144803426e-06, + "loss": 1.0061, + "step": 2489 + }, + { + "epoch": 0.9676479160594579, + "grad_norm": 0.2232959270477295, + "learning_rate": 6.617360840794083e-06, + "loss": 1.0776, + "step": 2490 + }, + { + "epoch": 0.9680365296803652, + "grad_norm": 0.2164563238620758, + "learning_rate": 6.539509536784741e-06, + "loss": 1.0834, + "step": 2491 + }, + { + "epoch": 0.9684251433012727, + "grad_norm": 0.2053539901971817, + "learning_rate": 6.4616582327754e-06, + "loss": 1.0449, + "step": 2492 + }, + { + "epoch": 0.9688137569221801, + "grad_norm": 0.23249384760856628, + "learning_rate": 6.383806928766058e-06, + "loss": 1.0418, + "step": 2493 + }, + { + "epoch": 0.9692023705430876, + "grad_norm": 0.18624578416347504, + "learning_rate": 6.305955624756715e-06, + "loss": 0.9152, + "step": 2494 + }, + { + "epoch": 
0.9695909841639949, + "grad_norm": 0.2001798450946808, + "learning_rate": 6.228104320747373e-06, + "loss": 1.0084, + "step": 2495 + }, + { + "epoch": 0.9699795977849024, + "grad_norm": 0.2341216653585434, + "learning_rate": 6.150253016738031e-06, + "loss": 0.9935, + "step": 2496 + }, + { + "epoch": 0.9703682114058098, + "grad_norm": 0.21359120309352875, + "learning_rate": 6.072401712728688e-06, + "loss": 1.0498, + "step": 2497 + }, + { + "epoch": 0.9707568250267172, + "grad_norm": 0.21405139565467834, + "learning_rate": 5.994550408719346e-06, + "loss": 1.096, + "step": 2498 + }, + { + "epoch": 0.9711454386476246, + "grad_norm": 0.2035064846277237, + "learning_rate": 5.916699104710005e-06, + "loss": 1.0351, + "step": 2499 + }, + { + "epoch": 0.9715340522685321, + "grad_norm": 0.19452853500843048, + "learning_rate": 5.838847800700662e-06, + "loss": 0.9994, + "step": 2500 + } + ], + "logging_steps": 1, + "max_steps": 2574, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1321276344029348e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/outputs/checkpoint-313/README.md b/outputs/checkpoint-313/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7 --- /dev/null +++ b/outputs/checkpoint-313/README.md @@ -0,0 +1,209 @@ +--- +base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit +library_name: peft +tags: +- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information 
Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/outputs/checkpoint-313/adapter_config.json b/outputs/checkpoint-313/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8d4d3c69b9ee90115d6da73d3bfb98e6ac3721d1 --- /dev/null +++ b/outputs/checkpoint-313/adapter_config.json @@ -0,0 +1,45 @@ +{ + "alpha_pattern": {}, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + 
"target_modules": [ + "up_proj", + "down_proj", + "o_proj", + "v_proj", + "q_proj", + "k_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/outputs/checkpoint-313/chat_template.jinja b/outputs/checkpoint-313/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316 --- /dev/null +++ b/outputs/checkpoint-313/chat_template.jinja @@ -0,0 +1,315 @@ +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". 
+ #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if 
variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties -%} + {{- "(_: " }} + {{- "{\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {{- "// " + param_spec.description + "\n" }} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" 
}} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- "\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the 
most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. 
Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}} + {%- else %} + {{- model_identity }} + {%- endif %} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools is defined %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools is defined -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." 
}} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {%- endif %} + {%- if tools -%} + {{- "\n\n" }} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {%- if "tool_calls" in message %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." 
+ tool_call.name + "<|channel|>commentary json<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif "thinking" in message and loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- set last_tool_call.name = none %} + {%- elif "thinking" in message %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- elif loop.last and not add_generation_prompt %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }} + {%- else %} + {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." 
+ last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- else -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} \ No newline at end of file diff --git a/outputs/checkpoint-313/special_tokens_map.json b/outputs/checkpoint-313/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639 --- /dev/null +++ b/outputs/checkpoint-313/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/outputs/checkpoint-313/tokenizer.json b/outputs/checkpoint-313/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/outputs/checkpoint-313/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/outputs/checkpoint-313/tokenizer_config.json b/outputs/checkpoint-313/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45 --- /dev/null +++ b/outputs/checkpoint-313/tokenizer_config.json @@ -0,0 +1,185 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + "content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|reserved_200017|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/outputs/checkpoint-313/trainer_state.json b/outputs/checkpoint-313/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0064f4acb6c8622ddcd506f380d076ef7b6f3b67 --- /dev/null +++ b/outputs/checkpoint-313/trainer_state.json @@ -0,0 +1,2225 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 313, + 
"is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 13.684800148010254, + "learning_rate": 0.0, + "loss": 2.3276, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 13.660787582397461, + "learning_rate": 4e-05, + "loss": 2.2792, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 13.35280704498291, + "learning_rate": 8e-05, + "loss": 2.4151, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 6.15027379989624, + "learning_rate": 0.00012, + "loss": 1.7812, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 1.3168226480484009, + "learning_rate": 0.00016, + "loss": 1.4536, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.9872580170631409, + "learning_rate": 0.0002, + "loss": 1.4171, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7496100664138794, + "learning_rate": 0.00019935064935064936, + "loss": 1.4168, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7376005053520203, + "learning_rate": 0.00019870129870129872, + "loss": 1.3659, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5281137824058533, + "learning_rate": 0.00019805194805194807, + "loss": 1.2566, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5485746264457703, + "learning_rate": 0.00019740259740259742, + "loss": 1.3761, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5506592392921448, + "learning_rate": 0.00019675324675324675, + "loss": 1.3327, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.49382686614990234, + "learning_rate": 0.00019610389610389613, + "loss": 1.3727, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.36203011870384216, + "learning_rate": 0.00019545454545454548, + "loss": 1.1515, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3528599739074707, + "learning_rate": 0.0001948051948051948, + "loss": 1.2636, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.31244418025016785, + "learning_rate": 
0.00019415584415584416, + "loss": 1.1873, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.3379523754119873, + "learning_rate": 0.00019350649350649354, + "loss": 1.2657, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3025083839893341, + "learning_rate": 0.00019285714285714286, + "loss": 1.2846, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.2560190260410309, + "learning_rate": 0.00019220779220779222, + "loss": 1.1587, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.2554129958152771, + "learning_rate": 0.00019155844155844157, + "loss": 1.2812, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.22662702202796936, + "learning_rate": 0.00019090909090909092, + "loss": 1.1664, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.2515714168548584, + "learning_rate": 0.00019025974025974027, + "loss": 1.2177, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.24396637082099915, + "learning_rate": 0.00018961038961038963, + "loss": 1.2053, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.24488303065299988, + "learning_rate": 0.00018896103896103895, + "loss": 1.2074, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.2168620079755783, + "learning_rate": 0.00018831168831168833, + "loss": 1.1284, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.24021224677562714, + "learning_rate": 0.00018766233766233769, + "loss": 1.2169, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.20057056844234467, + "learning_rate": 0.000187012987012987, + "loss": 1.1031, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.19900795817375183, + "learning_rate": 0.00018636363636363636, + "loss": 1.1004, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.2019268423318863, + "learning_rate": 0.00018571428571428572, + "loss": 1.1476, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.1996479034423828, + "learning_rate": 0.00018506493506493507, + "loss": 1.1455, + "step": 29 + }, + { + "epoch": 
0.096, + "grad_norm": 0.25262022018432617, + "learning_rate": 0.00018441558441558442, + "loss": 1.1025, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.225438192486763, + "learning_rate": 0.00018376623376623378, + "loss": 1.1954, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.17834505438804626, + "learning_rate": 0.00018311688311688313, + "loss": 1.0934, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.20071206986904144, + "learning_rate": 0.00018246753246753248, + "loss": 1.0488, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.1920139640569687, + "learning_rate": 0.00018181818181818183, + "loss": 1.123, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.18714852631092072, + "learning_rate": 0.0001811688311688312, + "loss": 1.0798, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.18315713107585907, + "learning_rate": 0.00018051948051948054, + "loss": 1.1107, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.19156870245933533, + "learning_rate": 0.00017987012987012987, + "loss": 1.1125, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.21527768671512604, + "learning_rate": 0.00017922077922077922, + "loss": 1.1346, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.1871163249015808, + "learning_rate": 0.0001785714285714286, + "loss": 1.0742, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.17750784754753113, + "learning_rate": 0.00017792207792207792, + "loss": 1.1323, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.177419051527977, + "learning_rate": 0.00017727272727272728, + "loss": 1.1405, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.16714292764663696, + "learning_rate": 0.00017662337662337663, + "loss": 1.1084, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.1610356718301773, + "learning_rate": 0.00017597402597402598, + "loss": 1.1125, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.2548656761646271, + "learning_rate": 
0.00017532467532467534, + "loss": 1.1114, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.1731044203042984, + "learning_rate": 0.0001746753246753247, + "loss": 1.1197, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.1739533394575119, + "learning_rate": 0.00017402597402597401, + "loss": 1.1777, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.2178352177143097, + "learning_rate": 0.0001733766233766234, + "loss": 1.1111, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.17247150838375092, + "learning_rate": 0.00017272727272727275, + "loss": 1.1253, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.18075324594974518, + "learning_rate": 0.00017207792207792207, + "loss": 1.1358, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.15898071229457855, + "learning_rate": 0.00017142857142857143, + "loss": 1.0606, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.16518613696098328, + "learning_rate": 0.0001707792207792208, + "loss": 1.0944, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.16035063564777374, + "learning_rate": 0.00017012987012987013, + "loss": 1.0554, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.1686483472585678, + "learning_rate": 0.00016948051948051948, + "loss": 1.0384, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.16575631499290466, + "learning_rate": 0.00016883116883116884, + "loss": 1.0243, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.16840039193630219, + "learning_rate": 0.0001681818181818182, + "loss": 1.117, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.17616064846515656, + "learning_rate": 0.00016753246753246754, + "loss": 1.0743, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.168218195438385, + "learning_rate": 0.0001668831168831169, + "loss": 1.0627, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.17026656866073608, + "learning_rate": 0.00016623376623376625, + "loss": 1.0059, + "step": 58 + }, + { + "epoch": 0.1888, 
+ "grad_norm": 0.16454458236694336, + "learning_rate": 0.0001655844155844156, + "loss": 0.9943, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.17185136675834656, + "learning_rate": 0.00016493506493506495, + "loss": 1.1545, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.17822986841201782, + "learning_rate": 0.00016428571428571428, + "loss": 1.073, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.1676608771085739, + "learning_rate": 0.00016363636363636366, + "loss": 1.0886, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.1727771908044815, + "learning_rate": 0.000162987012987013, + "loss": 1.0432, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.17827573418617249, + "learning_rate": 0.00016233766233766234, + "loss": 1.083, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.19807517528533936, + "learning_rate": 0.0001616883116883117, + "loss": 1.1208, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.17693684995174408, + "learning_rate": 0.00016103896103896104, + "loss": 1.089, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.15489234030246735, + "learning_rate": 0.0001603896103896104, + "loss": 0.9707, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.16443990170955658, + "learning_rate": 0.00015974025974025975, + "loss": 1.0643, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.2051103413105011, + "learning_rate": 0.0001590909090909091, + "loss": 1.1246, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.18824075162410736, + "learning_rate": 0.00015844155844155845, + "loss": 1.0855, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.18659448623657227, + "learning_rate": 0.0001577922077922078, + "loss": 1.1412, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.1854114979505539, + "learning_rate": 0.00015714285714285716, + "loss": 1.0249, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.1876193732023239, + "learning_rate": 0.00015649350649350649, + 
"loss": 1.1029, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.1888684630393982, + "learning_rate": 0.00015584415584415587, + "loss": 1.0789, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.20240606367588043, + "learning_rate": 0.0001551948051948052, + "loss": 1.0495, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.232120081782341, + "learning_rate": 0.00015454545454545454, + "loss": 1.0735, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.16897843778133392, + "learning_rate": 0.0001538961038961039, + "loss": 1.0164, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.18796634674072266, + "learning_rate": 0.00015324675324675325, + "loss": 1.0676, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.19574032723903656, + "learning_rate": 0.0001525974025974026, + "loss": 1.0456, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.18007811903953552, + "learning_rate": 0.00015194805194805196, + "loss": 1.0894, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.18932929635047913, + "learning_rate": 0.0001512987012987013, + "loss": 1.0729, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.20614288747310638, + "learning_rate": 0.00015064935064935066, + "loss": 1.0854, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.19291089475154877, + "learning_rate": 0.00015000000000000001, + "loss": 1.1217, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.18916529417037964, + "learning_rate": 0.00014935064935064934, + "loss": 1.0963, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.20306220650672913, + "learning_rate": 0.00014870129870129872, + "loss": 1.0898, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.17870067059993744, + "learning_rate": 0.00014805194805194807, + "loss": 1.0213, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.18411923944950104, + "learning_rate": 0.0001474025974025974, + "loss": 1.0844, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 
0.18788227438926697, + "learning_rate": 0.00014675324675324675, + "loss": 1.0338, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.23874884843826294, + "learning_rate": 0.00014610389610389613, + "loss": 1.1118, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.19380499422550201, + "learning_rate": 0.00014545454545454546, + "loss": 1.0464, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.18968750536441803, + "learning_rate": 0.0001448051948051948, + "loss": 1.0569, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.19545753300189972, + "learning_rate": 0.00014415584415584416, + "loss": 1.1225, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.19170494377613068, + "learning_rate": 0.00014350649350649352, + "loss": 1.0602, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.17953918874263763, + "learning_rate": 0.00014285714285714287, + "loss": 1.032, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.1822536289691925, + "learning_rate": 0.00014220779220779222, + "loss": 1.0559, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.18591298162937164, + "learning_rate": 0.00014155844155844155, + "loss": 1.031, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.2129002958536148, + "learning_rate": 0.00014090909090909093, + "loss": 1.1391, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.18386681377887726, + "learning_rate": 0.00014025974025974028, + "loss": 0.9919, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.18314239382743835, + "learning_rate": 0.0001396103896103896, + "loss": 1.0445, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.1999066174030304, + "learning_rate": 0.00013896103896103896, + "loss": 1.0538, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.18741188943386078, + "learning_rate": 0.00013831168831168834, + "loss": 1.0722, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.19351010024547577, + "learning_rate": 0.00013766233766233766, + "loss": 
1.0491, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.18859203159809113, + "learning_rate": 0.00013701298701298702, + "loss": 1.0593, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.1962767392396927, + "learning_rate": 0.00013636363636363637, + "loss": 1.1344, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.20819440484046936, + "learning_rate": 0.00013571428571428572, + "loss": 1.1137, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.19590184092521667, + "learning_rate": 0.00013506493506493507, + "loss": 1.0624, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.18631424009799957, + "learning_rate": 0.00013441558441558443, + "loss": 1.0587, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.19572143256664276, + "learning_rate": 0.00013376623376623375, + "loss": 1.0494, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.1910988837480545, + "learning_rate": 0.00013311688311688313, + "loss": 1.0481, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.19455869495868683, + "learning_rate": 0.00013246753246753249, + "loss": 1.029, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.18669827282428741, + "learning_rate": 0.0001318181818181818, + "loss": 1.0513, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.17523664236068726, + "learning_rate": 0.0001311688311688312, + "loss": 1.0126, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.17929129302501678, + "learning_rate": 0.00013051948051948052, + "loss": 1.0717, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.19380168616771698, + "learning_rate": 0.00012987012987012987, + "loss": 1.0324, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.18090228736400604, + "learning_rate": 0.00012922077922077922, + "loss": 1.0515, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.2067340910434723, + "learning_rate": 0.00012857142857142858, + "loss": 1.0939, + "step": 116 + }, + { + "epoch": 0.3744, + 
"grad_norm": 0.1880485862493515, + "learning_rate": 0.00012792207792207793, + "loss": 1.0986, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.182168647646904, + "learning_rate": 0.00012727272727272728, + "loss": 1.0109, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.20187129080295563, + "learning_rate": 0.00012662337662337663, + "loss": 1.0668, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.2082669734954834, + "learning_rate": 0.000125974025974026, + "loss": 1.054, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.18294434249401093, + "learning_rate": 0.00012532467532467534, + "loss": 1.0397, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.20515067875385284, + "learning_rate": 0.00012467532467532467, + "loss": 1.1092, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.1758790761232376, + "learning_rate": 0.00012402597402597402, + "loss": 0.9755, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.2170792669057846, + "learning_rate": 0.0001233766233766234, + "loss": 1.0434, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.202157124876976, + "learning_rate": 0.00012272727272727272, + "loss": 1.1129, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.18556398153305054, + "learning_rate": 0.00012207792207792208, + "loss": 1.0665, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.20196087658405304, + "learning_rate": 0.00012142857142857143, + "loss": 1.1, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.1921566128730774, + "learning_rate": 0.0001207792207792208, + "loss": 1.0918, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.18866224586963654, + "learning_rate": 0.00012012987012987014, + "loss": 1.0014, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.207601398229599, + "learning_rate": 0.00011948051948051949, + "loss": 1.0726, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.21592366695404053, + "learning_rate": 0.00011883116883116883, 
+ "loss": 1.1379, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2016124576330185, + "learning_rate": 0.0001181818181818182, + "loss": 1.1428, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.20478437840938568, + "learning_rate": 0.00011753246753246753, + "loss": 1.121, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.22730594873428345, + "learning_rate": 0.00011688311688311689, + "loss": 1.0319, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.22592711448669434, + "learning_rate": 0.00011623376623376625, + "loss": 1.1264, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.20035041868686676, + "learning_rate": 0.00011558441558441559, + "loss": 1.0686, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.20648567378520966, + "learning_rate": 0.00011493506493506494, + "loss": 1.0817, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.21222743391990662, + "learning_rate": 0.00011428571428571428, + "loss": 1.0678, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.2075391560792923, + "learning_rate": 0.00011363636363636365, + "loss": 1.0897, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.1964101791381836, + "learning_rate": 0.000112987012987013, + "loss": 1.0906, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.22406511008739471, + "learning_rate": 0.00011233766233766234, + "loss": 1.0594, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.23787978291511536, + "learning_rate": 0.00011168831168831168, + "loss": 1.1053, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.21196185052394867, + "learning_rate": 0.00011103896103896105, + "loss": 1.0923, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.21042804419994354, + "learning_rate": 0.0001103896103896104, + "loss": 1.0381, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.2267436534166336, + "learning_rate": 0.00010974025974025974, + "loss": 1.0818, + "step": 145 + }, + { + "epoch": 0.4672, + 
"grad_norm": 0.23742735385894775, + "learning_rate": 0.00010909090909090909, + "loss": 1.0872, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.17787213623523712, + "learning_rate": 0.00010844155844155846, + "loss": 1.03, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.22422832250595093, + "learning_rate": 0.0001077922077922078, + "loss": 1.0738, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.22946301102638245, + "learning_rate": 0.00010714285714285715, + "loss": 1.0274, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.2137996405363083, + "learning_rate": 0.00010649350649350649, + "loss": 1.0539, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.1748756766319275, + "learning_rate": 0.00010584415584415586, + "loss": 1.0355, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.22275175154209137, + "learning_rate": 0.0001051948051948052, + "loss": 1.1696, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.20996077358722687, + "learning_rate": 0.00010454545454545455, + "loss": 1.0303, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.1945938766002655, + "learning_rate": 0.00010389610389610389, + "loss": 0.9747, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.1970377266407013, + "learning_rate": 0.00010324675324675325, + "loss": 1.0358, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.18814732134342194, + "learning_rate": 0.00010259740259740261, + "loss": 0.9612, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.2153233289718628, + "learning_rate": 0.00010194805194805195, + "loss": 1.0749, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.21788008511066437, + "learning_rate": 0.0001012987012987013, + "loss": 1.0883, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.214650496840477, + "learning_rate": 0.00010064935064935067, + "loss": 1.0539, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.19312834739685059, + "learning_rate": 0.0001, + 
"loss": 1.0657, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.19916598498821259, + "learning_rate": 9.935064935064936e-05, + "loss": 1.0478, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2057606726884842, + "learning_rate": 9.870129870129871e-05, + "loss": 1.0094, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.22159607708454132, + "learning_rate": 9.805194805194806e-05, + "loss": 1.0952, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.18274275958538055, + "learning_rate": 9.74025974025974e-05, + "loss": 1.0065, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.19835162162780762, + "learning_rate": 9.675324675324677e-05, + "loss": 1.0742, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2114904820919037, + "learning_rate": 9.610389610389611e-05, + "loss": 1.1109, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.21488523483276367, + "learning_rate": 9.545454545454546e-05, + "loss": 1.0465, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.19870303571224213, + "learning_rate": 9.480519480519481e-05, + "loss": 1.0318, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.20413029193878174, + "learning_rate": 9.415584415584417e-05, + "loss": 1.0817, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.1847231239080429, + "learning_rate": 9.35064935064935e-05, + "loss": 1.0144, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.2715964913368225, + "learning_rate": 9.285714285714286e-05, + "loss": 0.9832, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.2225002497434616, + "learning_rate": 9.220779220779221e-05, + "loss": 1.1051, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.22931510210037231, + "learning_rate": 9.155844155844156e-05, + "loss": 1.1042, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.21848627924919128, + "learning_rate": 9.090909090909092e-05, + "loss": 1.1151, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 
0.19852259755134583, + "learning_rate": 9.025974025974027e-05, + "loss": 1.0889, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2080363780260086, + "learning_rate": 8.961038961038961e-05, + "loss": 1.0777, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.22391024231910706, + "learning_rate": 8.896103896103896e-05, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.21793846786022186, + "learning_rate": 8.831168831168831e-05, + "loss": 1.044, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.2009749859571457, + "learning_rate": 8.766233766233767e-05, + "loss": 1.0198, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.19432318210601807, + "learning_rate": 8.701298701298701e-05, + "loss": 1.075, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.18634547293186188, + "learning_rate": 8.636363636363637e-05, + "loss": 0.9964, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.1947103589773178, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0025, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.23098671436309814, + "learning_rate": 8.506493506493507e-05, + "loss": 1.0562, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.19686414301395416, + "learning_rate": 8.441558441558442e-05, + "loss": 1.0285, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.19852428138256073, + "learning_rate": 8.376623376623377e-05, + "loss": 1.0054, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.21483510732650757, + "learning_rate": 8.311688311688312e-05, + "loss": 1.108, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.23313644528388977, + "learning_rate": 8.246753246753248e-05, + "loss": 1.1383, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.21453145146369934, + "learning_rate": 8.181818181818183e-05, + "loss": 1.0911, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.20268195867538452, + "learning_rate": 8.116883116883117e-05, + "loss": 
1.0145, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.20576398074626923, + "learning_rate": 8.051948051948052e-05, + "loss": 1.0829, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.21732626855373383, + "learning_rate": 7.987012987012987e-05, + "loss": 1.0152, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.22046895325183868, + "learning_rate": 7.922077922077923e-05, + "loss": 1.1311, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.19727715849876404, + "learning_rate": 7.857142857142858e-05, + "loss": 1.0364, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.20861488580703735, + "learning_rate": 7.792207792207793e-05, + "loss": 1.0435, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.18545083701610565, + "learning_rate": 7.727272727272727e-05, + "loss": 1.0299, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.19965052604675293, + "learning_rate": 7.662337662337662e-05, + "loss": 1.0511, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.23673909902572632, + "learning_rate": 7.597402597402598e-05, + "loss": 1.081, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.17583179473876953, + "learning_rate": 7.532467532467533e-05, + "loss": 0.9808, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2129366099834442, + "learning_rate": 7.467532467532467e-05, + "loss": 1.0522, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.21679140627384186, + "learning_rate": 7.402597402597404e-05, + "loss": 1.0567, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.2032000720500946, + "learning_rate": 7.337662337662338e-05, + "loss": 1.0466, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.1887970268726349, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0329, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.21060192584991455, + "learning_rate": 7.207792207792208e-05, + "loss": 1.1021, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 
0.21191425621509552, + "learning_rate": 7.142857142857143e-05, + "loss": 0.99, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.1995989829301834, + "learning_rate": 7.077922077922077e-05, + "loss": 1.0526, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.1849513053894043, + "learning_rate": 7.012987012987014e-05, + "loss": 0.9998, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.1948779672384262, + "learning_rate": 6.948051948051948e-05, + "loss": 1.075, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.20374052226543427, + "learning_rate": 6.883116883116883e-05, + "loss": 1.0933, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.2102465033531189, + "learning_rate": 6.818181818181818e-05, + "loss": 1.1123, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.21376173198223114, + "learning_rate": 6.753246753246754e-05, + "loss": 1.1233, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.20934203267097473, + "learning_rate": 6.688311688311688e-05, + "loss": 1.1374, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.18604128062725067, + "learning_rate": 6.623376623376624e-05, + "loss": 1.0213, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.19644233584403992, + "learning_rate": 6.55844155844156e-05, + "loss": 1.0046, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.18479463458061218, + "learning_rate": 6.493506493506494e-05, + "loss": 0.9792, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.1945149153470993, + "learning_rate": 6.428571428571429e-05, + "loss": 1.0584, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2070147544145584, + "learning_rate": 6.363636363636364e-05, + "loss": 1.071, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.19645985960960388, + "learning_rate": 6.2987012987013e-05, + "loss": 1.0721, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.1960117667913437, + "learning_rate": 6.233766233766233e-05, + "loss": 1.071, + 
"step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.20168261229991913, + "learning_rate": 6.16883116883117e-05, + "loss": 1.0808, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.21254412829875946, + "learning_rate": 6.103896103896104e-05, + "loss": 1.0287, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.21271063387393951, + "learning_rate": 6.03896103896104e-05, + "loss": 1.0605, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.2081408053636551, + "learning_rate": 5.9740259740259744e-05, + "loss": 1.091, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.21113798022270203, + "learning_rate": 5.90909090909091e-05, + "loss": 1.1323, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.20670844614505768, + "learning_rate": 5.844155844155844e-05, + "loss": 1.0955, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.2010120451450348, + "learning_rate": 5.7792207792207796e-05, + "loss": 1.1068, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.20379121601581573, + "learning_rate": 5.714285714285714e-05, + "loss": 1.0419, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.22799807786941528, + "learning_rate": 5.64935064935065e-05, + "loss": 1.0904, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.2005995213985443, + "learning_rate": 5.584415584415584e-05, + "loss": 1.078, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.20329605042934418, + "learning_rate": 5.51948051948052e-05, + "loss": 1.0245, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.19283504784107208, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.0367, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.20624355971813202, + "learning_rate": 5.38961038961039e-05, + "loss": 1.1046, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.21362991631031036, + "learning_rate": 5.3246753246753245e-05, + "loss": 1.1104, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.20447863638401031, + 
"learning_rate": 5.25974025974026e-05, + "loss": 1.0514, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.1974381059408188, + "learning_rate": 5.1948051948051944e-05, + "loss": 1.0048, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.21237170696258545, + "learning_rate": 5.1298701298701304e-05, + "loss": 1.1299, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.21224971115589142, + "learning_rate": 5.064935064935065e-05, + "loss": 1.05, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.19865018129348755, + "learning_rate": 5e-05, + "loss": 1.0665, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.19199275970458984, + "learning_rate": 4.9350649350649355e-05, + "loss": 0.9531, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.19573214650154114, + "learning_rate": 4.87012987012987e-05, + "loss": 1.0318, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.21338805556297302, + "learning_rate": 4.8051948051948054e-05, + "loss": 1.0343, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.2254691869020462, + "learning_rate": 4.740259740259741e-05, + "loss": 1.0472, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.18101665377616882, + "learning_rate": 4.675324675324675e-05, + "loss": 1.017, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.22090592980384827, + "learning_rate": 4.6103896103896106e-05, + "loss": 1.0389, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.20865507423877716, + "learning_rate": 4.545454545454546e-05, + "loss": 1.0369, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.21619610488414764, + "learning_rate": 4.4805194805194805e-05, + "loss": 1.109, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.21694771945476532, + "learning_rate": 4.415584415584416e-05, + "loss": 1.0525, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.2182662934064865, + "learning_rate": 4.3506493506493503e-05, + "loss": 1.0331, + "step": 247 + }, + { + 
"epoch": 0.7936, + "grad_norm": 0.2026486098766327, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.027, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.19606547057628632, + "learning_rate": 4.220779220779221e-05, + "loss": 1.0242, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.22107470035552979, + "learning_rate": 4.155844155844156e-05, + "loss": 1.0924, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.19960008561611176, + "learning_rate": 4.0909090909090915e-05, + "loss": 1.0384, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.1945488154888153, + "learning_rate": 4.025974025974026e-05, + "loss": 1.0673, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.22067414224147797, + "learning_rate": 3.9610389610389614e-05, + "loss": 1.0426, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.19010980427265167, + "learning_rate": 3.8961038961038966e-05, + "loss": 1.0617, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.18781176209449768, + "learning_rate": 3.831168831168831e-05, + "loss": 1.0243, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.20388829708099365, + "learning_rate": 3.7662337662337665e-05, + "loss": 1.0476, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.19911155104637146, + "learning_rate": 3.701298701298702e-05, + "loss": 1.0324, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.19884039461612701, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.0242, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.19036105275154114, + "learning_rate": 3.571428571428572e-05, + "loss": 1.0323, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.20039844512939453, + "learning_rate": 3.506493506493507e-05, + "loss": 1.0749, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.1899934560060501, + "learning_rate": 3.4415584415584416e-05, + "loss": 1.0115, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.20019090175628662, + 
"learning_rate": 3.376623376623377e-05, + "loss": 1.0782, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.2020583152770996, + "learning_rate": 3.311688311688312e-05, + "loss": 1.0687, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.21407337486743927, + "learning_rate": 3.246753246753247e-05, + "loss": 1.1015, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.1871640682220459, + "learning_rate": 3.181818181818182e-05, + "loss": 0.9637, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.21622811257839203, + "learning_rate": 3.1168831168831166e-05, + "loss": 1.1222, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.22504661977291107, + "learning_rate": 3.051948051948052e-05, + "loss": 1.132, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.19177629053592682, + "learning_rate": 2.9870129870129872e-05, + "loss": 1.0281, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.1970544159412384, + "learning_rate": 2.922077922077922e-05, + "loss": 1.0393, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.21554522216320038, + "learning_rate": 2.857142857142857e-05, + "loss": 1.074, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.21131229400634766, + "learning_rate": 2.792207792207792e-05, + "loss": 1.054, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.19816523790359497, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.0456, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.21075209975242615, + "learning_rate": 2.6623376623376623e-05, + "loss": 1.0758, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2296527624130249, + "learning_rate": 2.5974025974025972e-05, + "loss": 1.0917, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.19722610712051392, + "learning_rate": 2.5324675324675325e-05, + "loss": 1.0704, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.18721099197864532, + "learning_rate": 2.4675324675324678e-05, + "loss": 0.9919, + "step": 276 
+ }, + { + "epoch": 0.8864, + "grad_norm": 0.20244193077087402, + "learning_rate": 2.4025974025974027e-05, + "loss": 1.0368, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.19518914818763733, + "learning_rate": 2.3376623376623376e-05, + "loss": 1.0436, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.19650357961654663, + "learning_rate": 2.272727272727273e-05, + "loss": 1.0306, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.20320096611976624, + "learning_rate": 2.207792207792208e-05, + "loss": 1.0941, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.18296951055526733, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.9802, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.21357610821723938, + "learning_rate": 2.077922077922078e-05, + "loss": 1.0449, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.193921759724617, + "learning_rate": 2.012987012987013e-05, + "loss": 1.0116, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.1953902244567871, + "learning_rate": 1.9480519480519483e-05, + "loss": 1.0105, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.19440975785255432, + "learning_rate": 1.8831168831168833e-05, + "loss": 0.9952, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.21054105460643768, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0701, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.18844804167747498, + "learning_rate": 1.7532467532467535e-05, + "loss": 1.0146, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.2067311704158783, + "learning_rate": 1.6883116883116884e-05, + "loss": 1.0781, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.1941213756799698, + "learning_rate": 1.6233766233766234e-05, + "loss": 0.9814, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.22726193070411682, + "learning_rate": 1.5584415584415583e-05, + "loss": 1.1431, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.18025581538677216, + 
"learning_rate": 1.4935064935064936e-05, + "loss": 0.9649, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.21535000205039978, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.0441, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.20014546811580658, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.0166, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.22738787531852722, + "learning_rate": 1.2987012987012986e-05, + "loss": 1.0564, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.2020861804485321, + "learning_rate": 1.2337662337662339e-05, + "loss": 1.1241, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.19888809323310852, + "learning_rate": 1.1688311688311688e-05, + "loss": 1.1114, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.20912377536296844, + "learning_rate": 1.103896103896104e-05, + "loss": 1.0971, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.21206621825695038, + "learning_rate": 1.038961038961039e-05, + "loss": 1.0601, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.18667680025100708, + "learning_rate": 9.740259740259742e-06, + "loss": 1.0291, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.21125559508800507, + "learning_rate": 9.090909090909091e-06, + "loss": 1.0483, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.21776145696640015, + "learning_rate": 8.441558441558442e-06, + "loss": 0.9912, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.20144303143024445, + "learning_rate": 7.792207792207792e-06, + "loss": 1.0357, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.1984029859304428, + "learning_rate": 7.142857142857143e-06, + "loss": 1.0648, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.17972829937934875, + "learning_rate": 6.493506493506493e-06, + "loss": 1.0033, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.1818286031484604, + "learning_rate": 5.844155844155844e-06, + "loss": 0.997, + "step": 
305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.19670912623405457, + "learning_rate": 5.194805194805195e-06, + "loss": 1.0256, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.20527283847332, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0348, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.19025909900665283, + "learning_rate": 3.896103896103896e-06, + "loss": 1.0682, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.19544818997383118, + "learning_rate": 3.2467532467532465e-06, + "loss": 0.9872, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.22112183272838593, + "learning_rate": 2.5974025974025976e-06, + "loss": 1.0661, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.23328153789043427, + "learning_rate": 1.948051948051948e-06, + "loss": 1.0691, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.20181375741958618, + "learning_rate": 1.2987012987012988e-06, + "loss": 0.9416, + "step": 312 + }, + { + "epoch": 1.0, + "grad_norm": 0.29312625527381897, + "learning_rate": 6.493506493506494e-07, + "loss": 1.1216, + "step": 313 + } + ], + "logging_steps": 1, + "max_steps": 313, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.768425540391928e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/outputs/checkpoint-500/README.md b/outputs/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3abf956c074d00f34a12693c8d6da9738211d7c7 --- /dev/null +++ b/outputs/checkpoint-500/README.md @@ -0,0 +1,209 @@ +--- +base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit +library_name: peft +tags: +- base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit +- lora +- sft +- 
transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/outputs/checkpoint-500/adapter_config.json b/outputs/checkpoint-500/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..e285b9b6e018b5b9f23736d6699eb1a4267764e7 --- /dev/null +++ b/outputs/checkpoint-500/adapter_config.json @@ -0,0 +1,45 @@ +{ + "alpha_pattern": {}, + "auto_mapping": { + "base_model_class": "GptOssForCausalLM", + "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss" + }, + "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "up_proj", + "down_proj", + "gate_proj", + "k_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/outputs/checkpoint-500/chat_template.jinja b/outputs/checkpoint-500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..a3650f886e98b2834c25727759c8e0ab8495f316 --- /dev/null +++ b/outputs/checkpoint-500/chat_template.jinja @@ -0,0 +1,315 @@ +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". 
+ #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if 
variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{\n" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + "\n\n" }} + {{- "namespace " + namespace_name + " {\n\n" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + "\n" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties -%} + {{- "(_: " }} + {{- "{\n" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {{- "// " + param_spec.description + "\n" }} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" 
}} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ",\n" }} + {%- else %} + {{- "\n" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any;\n\n" }} + {%- else -%} + {{- "() => any;\n\n" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser\n\n" }} + {{- "// Tool for browsing.\n" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }} + {{- "// Cite information from the tool using the following format:\n" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }} + {{- "// Do not quote more than 10 words directly from the tool output.\n" }} + {{- "// sources=web (default: web)\n" }} + {{- "namespace browser {\n\n" }} + {{- "// Searches for information related to `query` and displays `topn` results.\n" }} + {{- "type search = (_: {\n" }} + {{- "query: string,\n" }} + {{- "topn?: number, // default: 10\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }} + {{- "// If `cursor` is not provided, the most recent page is implied.\n" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the 
most relevant passage, if available.\n" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }} + {{- "type open = (_: {\n" }} + {{- "id?: number | string, // default: -1\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "loc?: number, // default: -1\n" }} + {{- "num_lines?: number, // default: -1\n" }} + {{- "view_source?: boolean, // default: false\n" }} + {{- "source?: string,\n" }} + {{- "}) => any;\n\n" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }} + {{- "type find = (_: {\n" }} + {{- "pattern: string,\n" }} + {{- "cursor?: number, // default: -1\n" }} + {{- "}) => any;\n\n" }} + {{- "} // namespace browser\n\n" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python\n\n" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. 
Depends on the cluster.\n\n" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}} + {%- else %} + {{- model_identity }} + {%- endif %} + {{- "Knowledge cutoff: 2024-06\n" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + "\n\n" }} + {%- if builtin_tools is defined %} + {{- "# Tools\n\n" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools is defined -%} + {{- "\nCalls to these tools must go to the commentary channel: 'functions'." 
}} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions\n\n" }} + {{- developer_message }} + {%- endif %} + {%- if tools -%} + {{- "\n\n" }} + {{- "# Tools\n\n" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {%- if "tool_calls" in message %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." 
+ tool_call.name + "<|channel|>commentary json<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif "thinking" in message and loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- set last_tool_call.name = none %} + {%- elif "thinking" in message %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- elif loop.last and not add_generation_prompt %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }} + {%- else %} + {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." 
+ last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- else -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} +{# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #} \ No newline at end of file diff --git a/outputs/checkpoint-500/optimizer.pt b/outputs/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..96d24d2f8ffa32886b30c4b2ceacf177593b485f --- /dev/null +++ b/outputs/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e8e01b878a15f489ed7e9f584370716b15783abbbdabd551f242a6101e2133c +size 16894883 diff --git a/outputs/checkpoint-500/rng_state.pth b/outputs/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ef66339b9befa098183fd5d69faed6838e526b0 --- /dev/null +++ b/outputs/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33 +size 14645 diff --git a/outputs/checkpoint-500/special_tokens_map.json b/outputs/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..6fba18753f4d09dbb8fcdf1482daff36b963d639 --- /dev/null +++ b/outputs/checkpoint-500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false 
+ } +} diff --git a/outputs/checkpoint-500/tokenizer.json b/outputs/checkpoint-500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec3ef1795cbbda6b7cb7d1f114919cbe3fdd647 --- /dev/null +++ b/outputs/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3 +size 27868174 diff --git a/outputs/checkpoint-500/tokenizer_config.json b/outputs/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..482ae30d27a74c38d2228e69dd37c529fc485a45 --- /dev/null +++ b/outputs/checkpoint-500/tokenizer_config.json @@ -0,0 +1,185 @@ +{ + "added_tokens_decoder": { + "199998": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "199999": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200000": { + "content": "<|reserved_200000|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200001": { + "content": "<|reserved_200001|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200002": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200003": { + "content": "<|constrain|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200004": { + "content": "<|reserved_200004|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200005": { + "content": "<|channel|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200006": { + 
"content": "<|start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200008": { + "content": "<|message|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200009": { + "content": "<|reserved_200009|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200010": { + "content": "<|reserved_200010|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200011": { + "content": "<|reserved_200011|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200012": { + "content": "<|call|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200013": { + "content": "<|reserved_200013|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200014": { + "content": "<|reserved_200014|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200015": { + "content": "<|reserved_200015|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200016": { + "content": "<|reserved_200016|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200017": { + "content": "<|reserved_200017|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "200018": { + "content": "<|endofprompt|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + 
"bos_token": "<|startoftext|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|return|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|reserved_200017|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/outputs/checkpoint-500/trainer_state.json b/outputs/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d32e9608f768bc00df1139575b6bc4e7d475f2d1 --- /dev/null +++ b/outputs/checkpoint-500/trainer_state.json @@ -0,0 +1,3534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1943068104537064, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 13.684800148010254, + "learning_rate": 0.0, + "loss": 2.3276, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 13.660787582397461, + "learning_rate": 4e-05, + "loss": 2.2792, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 13.35280704498291, + "learning_rate": 8e-05, + "loss": 2.4151, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 6.15027379989624, + "learning_rate": 0.00012, + "loss": 1.7812, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 1.3168226480484009, + "learning_rate": 0.00016, + "loss": 1.4536, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.9872580170631409, + "learning_rate": 0.0002, + "loss": 1.4171, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7496100664138794, + "learning_rate": 0.00019935064935064936, + "loss": 1.4168, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7376005053520203, + "learning_rate": 0.00019870129870129872, + "loss": 1.3659, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5281137824058533, + "learning_rate": 
0.00019805194805194807, + "loss": 1.2566, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5485746264457703, + "learning_rate": 0.00019740259740259742, + "loss": 1.3761, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5506592392921448, + "learning_rate": 0.00019675324675324675, + "loss": 1.3327, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.49382686614990234, + "learning_rate": 0.00019610389610389613, + "loss": 1.3727, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.36203011870384216, + "learning_rate": 0.00019545454545454548, + "loss": 1.1515, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3528599739074707, + "learning_rate": 0.0001948051948051948, + "loss": 1.2636, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.31244418025016785, + "learning_rate": 0.00019415584415584416, + "loss": 1.1873, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.3379523754119873, + "learning_rate": 0.00019350649350649354, + "loss": 1.2657, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3025083839893341, + "learning_rate": 0.00019285714285714286, + "loss": 1.2846, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.2560190260410309, + "learning_rate": 0.00019220779220779222, + "loss": 1.1587, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.2554129958152771, + "learning_rate": 0.00019155844155844157, + "loss": 1.2812, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.22662702202796936, + "learning_rate": 0.00019090909090909092, + "loss": 1.1664, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.2515714168548584, + "learning_rate": 0.00019025974025974027, + "loss": 1.2177, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.24396637082099915, + "learning_rate": 0.00018961038961038963, + "loss": 1.2053, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.24488303065299988, + "learning_rate": 0.00018896103896103895, + "loss": 1.2074, + "step": 23 + }, + { + "epoch": 
0.0768, + "grad_norm": 0.2168620079755783, + "learning_rate": 0.00018831168831168833, + "loss": 1.1284, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.24021224677562714, + "learning_rate": 0.00018766233766233769, + "loss": 1.2169, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.20057056844234467, + "learning_rate": 0.000187012987012987, + "loss": 1.1031, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.19900795817375183, + "learning_rate": 0.00018636363636363636, + "loss": 1.1004, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.2019268423318863, + "learning_rate": 0.00018571428571428572, + "loss": 1.1476, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.1996479034423828, + "learning_rate": 0.00018506493506493507, + "loss": 1.1455, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.25262022018432617, + "learning_rate": 0.00018441558441558442, + "loss": 1.1025, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.225438192486763, + "learning_rate": 0.00018376623376623378, + "loss": 1.1954, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.17834505438804626, + "learning_rate": 0.00018311688311688313, + "loss": 1.0934, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.20071206986904144, + "learning_rate": 0.00018246753246753248, + "loss": 1.0488, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.1920139640569687, + "learning_rate": 0.00018181818181818183, + "loss": 1.123, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.18714852631092072, + "learning_rate": 0.0001811688311688312, + "loss": 1.0798, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.18315713107585907, + "learning_rate": 0.00018051948051948054, + "loss": 1.1107, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.19156870245933533, + "learning_rate": 0.00017987012987012987, + "loss": 1.1125, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.21527768671512604, + "learning_rate": 
0.00017922077922077922, + "loss": 1.1346, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.1871163249015808, + "learning_rate": 0.0001785714285714286, + "loss": 1.0742, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.17750784754753113, + "learning_rate": 0.00017792207792207792, + "loss": 1.1323, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.177419051527977, + "learning_rate": 0.00017727272727272728, + "loss": 1.1405, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.16714292764663696, + "learning_rate": 0.00017662337662337663, + "loss": 1.1084, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.1610356718301773, + "learning_rate": 0.00017597402597402598, + "loss": 1.1125, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.2548656761646271, + "learning_rate": 0.00017532467532467534, + "loss": 1.1114, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.1731044203042984, + "learning_rate": 0.0001746753246753247, + "loss": 1.1197, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.1739533394575119, + "learning_rate": 0.00017402597402597401, + "loss": 1.1777, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.2178352177143097, + "learning_rate": 0.0001733766233766234, + "loss": 1.1111, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.17247150838375092, + "learning_rate": 0.00017272727272727275, + "loss": 1.1253, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.18075324594974518, + "learning_rate": 0.00017207792207792207, + "loss": 1.1358, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.15898071229457855, + "learning_rate": 0.00017142857142857143, + "loss": 1.0606, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.16518613696098328, + "learning_rate": 0.0001707792207792208, + "loss": 1.0944, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.16035063564777374, + "learning_rate": 0.00017012987012987013, + "loss": 1.0554, + "step": 52 + }, + { + "epoch": 0.1696, 
+ "grad_norm": 0.1686483472585678, + "learning_rate": 0.00016948051948051948, + "loss": 1.0384, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.16575631499290466, + "learning_rate": 0.00016883116883116884, + "loss": 1.0243, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.16840039193630219, + "learning_rate": 0.0001681818181818182, + "loss": 1.117, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.17616064846515656, + "learning_rate": 0.00016753246753246754, + "loss": 1.0743, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.168218195438385, + "learning_rate": 0.0001668831168831169, + "loss": 1.0627, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.17026656866073608, + "learning_rate": 0.00016623376623376625, + "loss": 1.0059, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.16454458236694336, + "learning_rate": 0.0001655844155844156, + "loss": 0.9943, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.17185136675834656, + "learning_rate": 0.00016493506493506495, + "loss": 1.1545, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.17822986841201782, + "learning_rate": 0.00016428571428571428, + "loss": 1.073, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.1676608771085739, + "learning_rate": 0.00016363636363636366, + "loss": 1.0886, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.1727771908044815, + "learning_rate": 0.000162987012987013, + "loss": 1.0432, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.17827573418617249, + "learning_rate": 0.00016233766233766234, + "loss": 1.083, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.19807517528533936, + "learning_rate": 0.0001616883116883117, + "loss": 1.1208, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.17693684995174408, + "learning_rate": 0.00016103896103896104, + "loss": 1.089, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.15489234030246735, + "learning_rate": 0.0001603896103896104, + 
"loss": 0.9707, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.16443990170955658, + "learning_rate": 0.00015974025974025975, + "loss": 1.0643, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.2051103413105011, + "learning_rate": 0.0001590909090909091, + "loss": 1.1246, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.18824075162410736, + "learning_rate": 0.00015844155844155845, + "loss": 1.0855, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.18659448623657227, + "learning_rate": 0.0001577922077922078, + "loss": 1.1412, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.1854114979505539, + "learning_rate": 0.00015714285714285716, + "loss": 1.0249, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.1876193732023239, + "learning_rate": 0.00015649350649350649, + "loss": 1.1029, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.1888684630393982, + "learning_rate": 0.00015584415584415587, + "loss": 1.0789, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.20240606367588043, + "learning_rate": 0.0001551948051948052, + "loss": 1.0495, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.232120081782341, + "learning_rate": 0.00015454545454545454, + "loss": 1.0735, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.16897843778133392, + "learning_rate": 0.0001538961038961039, + "loss": 1.0164, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.18796634674072266, + "learning_rate": 0.00015324675324675325, + "loss": 1.0676, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.19574032723903656, + "learning_rate": 0.0001525974025974026, + "loss": 1.0456, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.18007811903953552, + "learning_rate": 0.00015194805194805196, + "loss": 1.0894, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.18932929635047913, + "learning_rate": 0.0001512987012987013, + "loss": 1.0729, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 
0.20614288747310638, + "learning_rate": 0.00015064935064935066, + "loss": 1.0854, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.19291089475154877, + "learning_rate": 0.00015000000000000001, + "loss": 1.1217, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.18916529417037964, + "learning_rate": 0.00014935064935064934, + "loss": 1.0963, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.20306220650672913, + "learning_rate": 0.00014870129870129872, + "loss": 1.0898, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.17870067059993744, + "learning_rate": 0.00014805194805194807, + "loss": 1.0213, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.18411923944950104, + "learning_rate": 0.0001474025974025974, + "loss": 1.0844, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.18788227438926697, + "learning_rate": 0.00014675324675324675, + "loss": 1.0338, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.23874884843826294, + "learning_rate": 0.00014610389610389613, + "loss": 1.1118, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.19380499422550201, + "learning_rate": 0.00014545454545454546, + "loss": 1.0464, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.18968750536441803, + "learning_rate": 0.0001448051948051948, + "loss": 1.0569, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.19545753300189972, + "learning_rate": 0.00014415584415584416, + "loss": 1.1225, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.19170494377613068, + "learning_rate": 0.00014350649350649352, + "loss": 1.0602, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.17953918874263763, + "learning_rate": 0.00014285714285714287, + "loss": 1.032, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.1822536289691925, + "learning_rate": 0.00014220779220779222, + "loss": 1.0559, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.18591298162937164, + "learning_rate": 0.00014155844155844155, + "loss": 
1.031, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.2129002958536148, + "learning_rate": 0.00014090909090909093, + "loss": 1.1391, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.18386681377887726, + "learning_rate": 0.00014025974025974028, + "loss": 0.9919, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.18314239382743835, + "learning_rate": 0.0001396103896103896, + "loss": 1.0445, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.1999066174030304, + "learning_rate": 0.00013896103896103896, + "loss": 1.0538, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.18741188943386078, + "learning_rate": 0.00013831168831168834, + "loss": 1.0722, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.19351010024547577, + "learning_rate": 0.00013766233766233766, + "loss": 1.0491, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.18859203159809113, + "learning_rate": 0.00013701298701298702, + "loss": 1.0593, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.1962767392396927, + "learning_rate": 0.00013636363636363637, + "loss": 1.1344, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.20819440484046936, + "learning_rate": 0.00013571428571428572, + "loss": 1.1137, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.19590184092521667, + "learning_rate": 0.00013506493506493507, + "loss": 1.0624, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.18631424009799957, + "learning_rate": 0.00013441558441558443, + "loss": 1.0587, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.19572143256664276, + "learning_rate": 0.00013376623376623375, + "loss": 1.0494, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.1910988837480545, + "learning_rate": 0.00013311688311688313, + "loss": 1.0481, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.19455869495868683, + "learning_rate": 0.00013246753246753249, + "loss": 1.029, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 
0.18669827282428741, + "learning_rate": 0.0001318181818181818, + "loss": 1.0513, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.17523664236068726, + "learning_rate": 0.0001311688311688312, + "loss": 1.0126, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.17929129302501678, + "learning_rate": 0.00013051948051948052, + "loss": 1.0717, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.19380168616771698, + "learning_rate": 0.00012987012987012987, + "loss": 1.0324, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.18090228736400604, + "learning_rate": 0.00012922077922077922, + "loss": 1.0515, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.2067340910434723, + "learning_rate": 0.00012857142857142858, + "loss": 1.0939, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.1880485862493515, + "learning_rate": 0.00012792207792207793, + "loss": 1.0986, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.182168647646904, + "learning_rate": 0.00012727272727272728, + "loss": 1.0109, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.20187129080295563, + "learning_rate": 0.00012662337662337663, + "loss": 1.0668, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.2082669734954834, + "learning_rate": 0.000125974025974026, + "loss": 1.054, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.18294434249401093, + "learning_rate": 0.00012532467532467534, + "loss": 1.0397, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.20515067875385284, + "learning_rate": 0.00012467532467532467, + "loss": 1.1092, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.1758790761232376, + "learning_rate": 0.00012402597402597402, + "loss": 0.9755, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.2170792669057846, + "learning_rate": 0.0001233766233766234, + "loss": 1.0434, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.202157124876976, + "learning_rate": 0.00012272727272727272, + "loss": 
1.1129, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.18556398153305054, + "learning_rate": 0.00012207792207792208, + "loss": 1.0665, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.20196087658405304, + "learning_rate": 0.00012142857142857143, + "loss": 1.1, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.1921566128730774, + "learning_rate": 0.0001207792207792208, + "loss": 1.0918, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.18866224586963654, + "learning_rate": 0.00012012987012987014, + "loss": 1.0014, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.207601398229599, + "learning_rate": 0.00011948051948051949, + "loss": 1.0726, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.21592366695404053, + "learning_rate": 0.00011883116883116883, + "loss": 1.1379, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2016124576330185, + "learning_rate": 0.0001181818181818182, + "loss": 1.1428, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.20478437840938568, + "learning_rate": 0.00011753246753246753, + "loss": 1.121, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.22730594873428345, + "learning_rate": 0.00011688311688311689, + "loss": 1.0319, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.22592711448669434, + "learning_rate": 0.00011623376623376625, + "loss": 1.1264, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.20035041868686676, + "learning_rate": 0.00011558441558441559, + "loss": 1.0686, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.20648567378520966, + "learning_rate": 0.00011493506493506494, + "loss": 1.0817, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.21222743391990662, + "learning_rate": 0.00011428571428571428, + "loss": 1.0678, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.2075391560792923, + "learning_rate": 0.00011363636363636365, + "loss": 1.0897, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 
0.1964101791381836, + "learning_rate": 0.000112987012987013, + "loss": 1.0906, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.22406511008739471, + "learning_rate": 0.00011233766233766234, + "loss": 1.0594, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.23787978291511536, + "learning_rate": 0.00011168831168831168, + "loss": 1.1053, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.21196185052394867, + "learning_rate": 0.00011103896103896105, + "loss": 1.0923, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.21042804419994354, + "learning_rate": 0.0001103896103896104, + "loss": 1.0381, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.2267436534166336, + "learning_rate": 0.00010974025974025974, + "loss": 1.0818, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.23742735385894775, + "learning_rate": 0.00010909090909090909, + "loss": 1.0872, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.17787213623523712, + "learning_rate": 0.00010844155844155846, + "loss": 1.03, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.22422832250595093, + "learning_rate": 0.0001077922077922078, + "loss": 1.0738, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.22946301102638245, + "learning_rate": 0.00010714285714285715, + "loss": 1.0274, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.2137996405363083, + "learning_rate": 0.00010649350649350649, + "loss": 1.0539, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.1748756766319275, + "learning_rate": 0.00010584415584415586, + "loss": 1.0355, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.22275175154209137, + "learning_rate": 0.0001051948051948052, + "loss": 1.1696, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.20996077358722687, + "learning_rate": 0.00010454545454545455, + "loss": 1.0303, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.1945938766002655, + "learning_rate": 0.00010389610389610389, + 
"loss": 0.9747, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.1970377266407013, + "learning_rate": 0.00010324675324675325, + "loss": 1.0358, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.18814732134342194, + "learning_rate": 0.00010259740259740261, + "loss": 0.9612, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.2153233289718628, + "learning_rate": 0.00010194805194805195, + "loss": 1.0749, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.21788008511066437, + "learning_rate": 0.0001012987012987013, + "loss": 1.0883, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.214650496840477, + "learning_rate": 0.00010064935064935067, + "loss": 1.0539, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.19312834739685059, + "learning_rate": 0.0001, + "loss": 1.0657, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.19916598498821259, + "learning_rate": 9.935064935064936e-05, + "loss": 1.0478, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.2057606726884842, + "learning_rate": 9.870129870129871e-05, + "loss": 1.0094, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.22159607708454132, + "learning_rate": 9.805194805194806e-05, + "loss": 1.0952, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.18274275958538055, + "learning_rate": 9.74025974025974e-05, + "loss": 1.0065, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.19835162162780762, + "learning_rate": 9.675324675324677e-05, + "loss": 1.0742, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2114904820919037, + "learning_rate": 9.610389610389611e-05, + "loss": 1.1109, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.21488523483276367, + "learning_rate": 9.545454545454546e-05, + "loss": 1.0465, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.19870303571224213, + "learning_rate": 9.480519480519481e-05, + "loss": 1.0318, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 
0.20413029193878174, + "learning_rate": 9.415584415584417e-05, + "loss": 1.0817, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.1847231239080429, + "learning_rate": 9.35064935064935e-05, + "loss": 1.0144, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.2715964913368225, + "learning_rate": 9.285714285714286e-05, + "loss": 0.9832, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.2225002497434616, + "learning_rate": 9.220779220779221e-05, + "loss": 1.1051, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.22931510210037231, + "learning_rate": 9.155844155844156e-05, + "loss": 1.1042, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.21848627924919128, + "learning_rate": 9.090909090909092e-05, + "loss": 1.1151, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.19852259755134583, + "learning_rate": 9.025974025974027e-05, + "loss": 1.0889, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2080363780260086, + "learning_rate": 8.961038961038961e-05, + "loss": 1.0777, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.22391024231910706, + "learning_rate": 8.896103896103896e-05, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.21793846786022186, + "learning_rate": 8.831168831168831e-05, + "loss": 1.044, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.2009749859571457, + "learning_rate": 8.766233766233767e-05, + "loss": 1.0198, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.19432318210601807, + "learning_rate": 8.701298701298701e-05, + "loss": 1.075, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.18634547293186188, + "learning_rate": 8.636363636363637e-05, + "loss": 0.9964, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.1947103589773178, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0025, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.23098671436309814, + "learning_rate": 8.506493506493507e-05, + "loss": 1.0562, 
+ "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.19686414301395416, + "learning_rate": 8.441558441558442e-05, + "loss": 1.0285, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.19852428138256073, + "learning_rate": 8.376623376623377e-05, + "loss": 1.0054, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.21483510732650757, + "learning_rate": 8.311688311688312e-05, + "loss": 1.108, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.23313644528388977, + "learning_rate": 8.246753246753248e-05, + "loss": 1.1383, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.21453145146369934, + "learning_rate": 8.181818181818183e-05, + "loss": 1.0911, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.20268195867538452, + "learning_rate": 8.116883116883117e-05, + "loss": 1.0145, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.20576398074626923, + "learning_rate": 8.051948051948052e-05, + "loss": 1.0829, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.21732626855373383, + "learning_rate": 7.987012987012987e-05, + "loss": 1.0152, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.22046895325183868, + "learning_rate": 7.922077922077923e-05, + "loss": 1.1311, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.19727715849876404, + "learning_rate": 7.857142857142858e-05, + "loss": 1.0364, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.20861488580703735, + "learning_rate": 7.792207792207793e-05, + "loss": 1.0435, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.18545083701610565, + "learning_rate": 7.727272727272727e-05, + "loss": 1.0299, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.19965052604675293, + "learning_rate": 7.662337662337662e-05, + "loss": 1.0511, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.23673909902572632, + "learning_rate": 7.597402597402598e-05, + "loss": 1.081, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 
0.17583179473876953, + "learning_rate": 7.532467532467533e-05, + "loss": 0.9808, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2129366099834442, + "learning_rate": 7.467532467532467e-05, + "loss": 1.0522, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.21679140627384186, + "learning_rate": 7.402597402597404e-05, + "loss": 1.0567, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.2032000720500946, + "learning_rate": 7.337662337662338e-05, + "loss": 1.0466, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.1887970268726349, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0329, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.21060192584991455, + "learning_rate": 7.207792207792208e-05, + "loss": 1.1021, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.21191425621509552, + "learning_rate": 7.142857142857143e-05, + "loss": 0.99, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.1995989829301834, + "learning_rate": 7.077922077922077e-05, + "loss": 1.0526, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.1849513053894043, + "learning_rate": 7.012987012987014e-05, + "loss": 0.9998, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.1948779672384262, + "learning_rate": 6.948051948051948e-05, + "loss": 1.075, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.20374052226543427, + "learning_rate": 6.883116883116883e-05, + "loss": 1.0933, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.2102465033531189, + "learning_rate": 6.818181818181818e-05, + "loss": 1.1123, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.21376173198223114, + "learning_rate": 6.753246753246754e-05, + "loss": 1.1233, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.20934203267097473, + "learning_rate": 6.688311688311688e-05, + "loss": 1.1374, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.18604128062725067, + "learning_rate": 6.623376623376624e-05, + "loss": 1.0213, 
+ "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.19644233584403992, + "learning_rate": 6.55844155844156e-05, + "loss": 1.0046, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.18479463458061218, + "learning_rate": 6.493506493506494e-05, + "loss": 0.9792, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.1945149153470993, + "learning_rate": 6.428571428571429e-05, + "loss": 1.0584, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2070147544145584, + "learning_rate": 6.363636363636364e-05, + "loss": 1.071, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.19645985960960388, + "learning_rate": 6.2987012987013e-05, + "loss": 1.0721, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.1960117667913437, + "learning_rate": 6.233766233766233e-05, + "loss": 1.071, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.20168261229991913, + "learning_rate": 6.16883116883117e-05, + "loss": 1.0808, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.21254412829875946, + "learning_rate": 6.103896103896104e-05, + "loss": 1.0287, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.21271063387393951, + "learning_rate": 6.03896103896104e-05, + "loss": 1.0605, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.2081408053636551, + "learning_rate": 5.9740259740259744e-05, + "loss": 1.091, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.21113798022270203, + "learning_rate": 5.90909090909091e-05, + "loss": 1.1323, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.20670844614505768, + "learning_rate": 5.844155844155844e-05, + "loss": 1.0955, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.2010120451450348, + "learning_rate": 5.7792207792207796e-05, + "loss": 1.1068, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.20379121601581573, + "learning_rate": 5.714285714285714e-05, + "loss": 1.0419, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.22799807786941528, + 
"learning_rate": 5.64935064935065e-05, + "loss": 1.0904, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.2005995213985443, + "learning_rate": 5.584415584415584e-05, + "loss": 1.078, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.20329605042934418, + "learning_rate": 5.51948051948052e-05, + "loss": 1.0245, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.19283504784107208, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.0367, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.20624355971813202, + "learning_rate": 5.38961038961039e-05, + "loss": 1.1046, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.21362991631031036, + "learning_rate": 5.3246753246753245e-05, + "loss": 1.1104, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.20447863638401031, + "learning_rate": 5.25974025974026e-05, + "loss": 1.0514, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.1974381059408188, + "learning_rate": 5.1948051948051944e-05, + "loss": 1.0048, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.21237170696258545, + "learning_rate": 5.1298701298701304e-05, + "loss": 1.1299, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.21224971115589142, + "learning_rate": 5.064935064935065e-05, + "loss": 1.05, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.19865018129348755, + "learning_rate": 5e-05, + "loss": 1.0665, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.19199275970458984, + "learning_rate": 4.9350649350649355e-05, + "loss": 0.9531, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.19573214650154114, + "learning_rate": 4.87012987012987e-05, + "loss": 1.0318, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.21338805556297302, + "learning_rate": 4.8051948051948054e-05, + "loss": 1.0343, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.2254691869020462, + "learning_rate": 4.740259740259741e-05, + "loss": 1.0472, + "step": 241 + }, + { + "epoch": 
0.7744, + "grad_norm": 0.18101665377616882, + "learning_rate": 4.675324675324675e-05, + "loss": 1.017, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.22090592980384827, + "learning_rate": 4.6103896103896106e-05, + "loss": 1.0389, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.20865507423877716, + "learning_rate": 4.545454545454546e-05, + "loss": 1.0369, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.21619610488414764, + "learning_rate": 4.4805194805194805e-05, + "loss": 1.109, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.21694771945476532, + "learning_rate": 4.415584415584416e-05, + "loss": 1.0525, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.2182662934064865, + "learning_rate": 4.3506493506493503e-05, + "loss": 1.0331, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.2026486098766327, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.027, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.19606547057628632, + "learning_rate": 4.220779220779221e-05, + "loss": 1.0242, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.22107470035552979, + "learning_rate": 4.155844155844156e-05, + "loss": 1.0924, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.19960008561611176, + "learning_rate": 4.0909090909090915e-05, + "loss": 1.0384, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.1945488154888153, + "learning_rate": 4.025974025974026e-05, + "loss": 1.0673, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.22067414224147797, + "learning_rate": 3.9610389610389614e-05, + "loss": 1.0426, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.19010980427265167, + "learning_rate": 3.8961038961038966e-05, + "loss": 1.0617, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.18781176209449768, + "learning_rate": 3.831168831168831e-05, + "loss": 1.0243, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.20388829708099365, + "learning_rate": 
3.7662337662337665e-05, + "loss": 1.0476, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.19911155104637146, + "learning_rate": 3.701298701298702e-05, + "loss": 1.0324, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.19884039461612701, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.0242, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.19036105275154114, + "learning_rate": 3.571428571428572e-05, + "loss": 1.0323, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.20039844512939453, + "learning_rate": 3.506493506493507e-05, + "loss": 1.0749, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.1899934560060501, + "learning_rate": 3.4415584415584416e-05, + "loss": 1.0115, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.20019090175628662, + "learning_rate": 3.376623376623377e-05, + "loss": 1.0782, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.2020583152770996, + "learning_rate": 3.311688311688312e-05, + "loss": 1.0687, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.21407337486743927, + "learning_rate": 3.246753246753247e-05, + "loss": 1.1015, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.1871640682220459, + "learning_rate": 3.181818181818182e-05, + "loss": 0.9637, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.21622811257839203, + "learning_rate": 3.1168831168831166e-05, + "loss": 1.1222, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.22504661977291107, + "learning_rate": 3.051948051948052e-05, + "loss": 1.132, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.19177629053592682, + "learning_rate": 2.9870129870129872e-05, + "loss": 1.0281, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.1970544159412384, + "learning_rate": 2.922077922077922e-05, + "loss": 1.0393, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.21554522216320038, + "learning_rate": 2.857142857142857e-05, + "loss": 1.074, + "step": 270 + }, + { + 
"epoch": 0.8672, + "grad_norm": 0.21131229400634766, + "learning_rate": 2.792207792207792e-05, + "loss": 1.054, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.19816523790359497, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.0456, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.21075209975242615, + "learning_rate": 2.6623376623376623e-05, + "loss": 1.0758, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2296527624130249, + "learning_rate": 2.5974025974025972e-05, + "loss": 1.0917, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.19722610712051392, + "learning_rate": 2.5324675324675325e-05, + "loss": 1.0704, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.18721099197864532, + "learning_rate": 2.4675324675324678e-05, + "loss": 0.9919, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.20244193077087402, + "learning_rate": 2.4025974025974027e-05, + "loss": 1.0368, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.19518914818763733, + "learning_rate": 2.3376623376623376e-05, + "loss": 1.0436, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.19650357961654663, + "learning_rate": 2.272727272727273e-05, + "loss": 1.0306, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.20320096611976624, + "learning_rate": 2.207792207792208e-05, + "loss": 1.0941, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.18296951055526733, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.9802, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.21357610821723938, + "learning_rate": 2.077922077922078e-05, + "loss": 1.0449, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.193921759724617, + "learning_rate": 2.012987012987013e-05, + "loss": 1.0116, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.1953902244567871, + "learning_rate": 1.9480519480519483e-05, + "loss": 1.0105, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.19440975785255432, + 
"learning_rate": 1.8831168831168833e-05, + "loss": 0.9952, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.21054105460643768, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0701, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.18844804167747498, + "learning_rate": 1.7532467532467535e-05, + "loss": 1.0146, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.2067311704158783, + "learning_rate": 1.6883116883116884e-05, + "loss": 1.0781, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.1941213756799698, + "learning_rate": 1.6233766233766234e-05, + "loss": 0.9814, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.22726193070411682, + "learning_rate": 1.5584415584415583e-05, + "loss": 1.1431, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.18025581538677216, + "learning_rate": 1.4935064935064936e-05, + "loss": 0.9649, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.21535000205039978, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.0441, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.20014546811580658, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.0166, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.22738787531852722, + "learning_rate": 1.2987012987012986e-05, + "loss": 1.0564, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.2020861804485321, + "learning_rate": 1.2337662337662339e-05, + "loss": 1.1241, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.19888809323310852, + "learning_rate": 1.1688311688311688e-05, + "loss": 1.1114, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.20912377536296844, + "learning_rate": 1.103896103896104e-05, + "loss": 1.0971, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.21206621825695038, + "learning_rate": 1.038961038961039e-05, + "loss": 1.0601, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.18667680025100708, + "learning_rate": 9.740259740259742e-06, + "loss": 1.0291, + 
"step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.21125559508800507, + "learning_rate": 9.090909090909091e-06, + "loss": 1.0483, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.21776145696640015, + "learning_rate": 8.441558441558442e-06, + "loss": 0.9912, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.20144303143024445, + "learning_rate": 7.792207792207792e-06, + "loss": 1.0357, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.1984029859304428, + "learning_rate": 7.142857142857143e-06, + "loss": 1.0648, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.17972829937934875, + "learning_rate": 6.493506493506493e-06, + "loss": 1.0033, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.1818286031484604, + "learning_rate": 5.844155844155844e-06, + "loss": 0.997, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.19670912623405457, + "learning_rate": 5.194805194805195e-06, + "loss": 1.0256, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.20527283847332, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0348, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.19025909900665283, + "learning_rate": 3.896103896103896e-06, + "loss": 1.0682, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.19544818997383118, + "learning_rate": 3.2467532467532465e-06, + "loss": 0.9872, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.22112183272838593, + "learning_rate": 2.5974025974025976e-06, + "loss": 1.0661, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.23328153789043427, + "learning_rate": 1.948051948051948e-06, + "loss": 1.0691, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.20181375741958618, + "learning_rate": 1.2987012987012988e-06, + "loss": 0.9416, + "step": 312 + }, + { + "epoch": 1.0, + "grad_norm": 0.29312625527381897, + "learning_rate": 6.493506493506494e-07, + "loss": 1.1216, + "step": 313 + }, + { + "epoch": 0.12202467696492762, + "grad_norm": 
0.2231415957212448, + "learning_rate": 0.0, + "loss": 1.0468, + "step": 314 + }, + { + "epoch": 0.12241329058583503, + "grad_norm": 0.22263288497924805, + "learning_rate": 0.00017594394706111328, + "loss": 1.0399, + "step": 315 + }, + { + "epoch": 0.12280190420674245, + "grad_norm": 0.22909891605377197, + "learning_rate": 0.00017586609575710393, + "loss": 1.1069, + "step": 316 + }, + { + "epoch": 0.12319051782764986, + "grad_norm": 0.23951445519924164, + "learning_rate": 0.0001757882444530946, + "loss": 1.1036, + "step": 317 + }, + { + "epoch": 0.12357913144855727, + "grad_norm": 0.2409268021583557, + "learning_rate": 0.00017571039314908526, + "loss": 1.1114, + "step": 318 + }, + { + "epoch": 0.12396774506946469, + "grad_norm": 0.23753899335861206, + "learning_rate": 0.00017563254184507592, + "loss": 1.1297, + "step": 319 + }, + { + "epoch": 0.12435635869037209, + "grad_norm": 0.2823902666568756, + "learning_rate": 0.00017555469054106657, + "loss": 1.1293, + "step": 320 + }, + { + "epoch": 0.12474497231127951, + "grad_norm": 0.24093545973300934, + "learning_rate": 0.00017547683923705722, + "loss": 1.0678, + "step": 321 + }, + { + "epoch": 0.12513358593218693, + "grad_norm": 0.22565563023090363, + "learning_rate": 0.0001753989879330479, + "loss": 1.1408, + "step": 322 + }, + { + "epoch": 0.12552219955309435, + "grad_norm": 0.22569572925567627, + "learning_rate": 0.00017532113662903855, + "loss": 1.0543, + "step": 323 + }, + { + "epoch": 0.12591081317400174, + "grad_norm": 0.24962866306304932, + "learning_rate": 0.0001752432853250292, + "loss": 1.0818, + "step": 324 + }, + { + "epoch": 0.12629942679490916, + "grad_norm": 0.22184576094150543, + "learning_rate": 0.00017516543402101986, + "loss": 1.0835, + "step": 325 + }, + { + "epoch": 0.12668804041581658, + "grad_norm": 0.2572194039821625, + "learning_rate": 0.0001750875827170105, + "loss": 1.0767, + "step": 326 + }, + { + "epoch": 0.127076654036724, + "grad_norm": 0.24131342768669128, + "learning_rate": 
0.00017500973141300116, + "loss": 1.0981, + "step": 327 + }, + { + "epoch": 0.1274652676576314, + "grad_norm": 0.2386389970779419, + "learning_rate": 0.00017493188010899184, + "loss": 1.0828, + "step": 328 + }, + { + "epoch": 0.1278538812785388, + "grad_norm": 0.2654125690460205, + "learning_rate": 0.0001748540288049825, + "loss": 1.1266, + "step": 329 + }, + { + "epoch": 0.12824249489944622, + "grad_norm": 0.2925739884376526, + "learning_rate": 0.00017477617750097314, + "loss": 1.0983, + "step": 330 + }, + { + "epoch": 0.12863110852035364, + "grad_norm": 0.26589342951774597, + "learning_rate": 0.0001746983261969638, + "loss": 1.1029, + "step": 331 + }, + { + "epoch": 0.12901972214126106, + "grad_norm": 0.24565957486629486, + "learning_rate": 0.00017462047489295445, + "loss": 1.0975, + "step": 332 + }, + { + "epoch": 0.12940833576216845, + "grad_norm": 0.2459682673215866, + "learning_rate": 0.00017454262358894513, + "loss": 1.0566, + "step": 333 + }, + { + "epoch": 0.12979694938307587, + "grad_norm": 0.23349183797836304, + "learning_rate": 0.00017446477228493578, + "loss": 1.0833, + "step": 334 + }, + { + "epoch": 0.1301855630039833, + "grad_norm": 0.26166337728500366, + "learning_rate": 0.00017438692098092643, + "loss": 1.1598, + "step": 335 + }, + { + "epoch": 0.1305741766248907, + "grad_norm": 0.24188168346881866, + "learning_rate": 0.00017430906967691708, + "loss": 1.0728, + "step": 336 + }, + { + "epoch": 0.13096279024579813, + "grad_norm": 0.22922398149967194, + "learning_rate": 0.00017423121837290773, + "loss": 1.0311, + "step": 337 + }, + { + "epoch": 0.13135140386670552, + "grad_norm": 0.2652754485607147, + "learning_rate": 0.00017415336706889841, + "loss": 1.1096, + "step": 338 + }, + { + "epoch": 0.13174001748761294, + "grad_norm": 0.2355881780385971, + "learning_rate": 0.00017407551576488907, + "loss": 1.0964, + "step": 339 + }, + { + "epoch": 0.13212863110852036, + "grad_norm": 0.244523823261261, + "learning_rate": 0.00017399766446087972, + "loss": 
1.142, + "step": 340 + }, + { + "epoch": 0.13251724472942777, + "grad_norm": 0.24705976247787476, + "learning_rate": 0.00017391981315687037, + "loss": 1.0943, + "step": 341 + }, + { + "epoch": 0.13290585835033517, + "grad_norm": 0.22817552089691162, + "learning_rate": 0.00017384196185286102, + "loss": 1.0621, + "step": 342 + }, + { + "epoch": 0.13329447197124258, + "grad_norm": 0.22605225443840027, + "learning_rate": 0.0001737641105488517, + "loss": 1.0714, + "step": 343 + }, + { + "epoch": 0.13368308559215, + "grad_norm": 0.2584545314311981, + "learning_rate": 0.00017368625924484235, + "loss": 1.1367, + "step": 344 + }, + { + "epoch": 0.13407169921305742, + "grad_norm": 0.2248220443725586, + "learning_rate": 0.000173608407940833, + "loss": 1.0872, + "step": 345 + }, + { + "epoch": 0.13446031283396484, + "grad_norm": 0.2141868770122528, + "learning_rate": 0.00017353055663682368, + "loss": 1.0572, + "step": 346 + }, + { + "epoch": 0.13484892645487223, + "grad_norm": 0.2615523934364319, + "learning_rate": 0.00017345270533281434, + "loss": 1.1048, + "step": 347 + }, + { + "epoch": 0.13523754007577965, + "grad_norm": 0.22990448772907257, + "learning_rate": 0.000173374854028805, + "loss": 1.0528, + "step": 348 + }, + { + "epoch": 0.13562615369668707, + "grad_norm": 0.2132262885570526, + "learning_rate": 0.00017329700272479564, + "loss": 1.0476, + "step": 349 + }, + { + "epoch": 0.1360147673175945, + "grad_norm": 0.2578272819519043, + "learning_rate": 0.00017321915142078632, + "loss": 1.0852, + "step": 350 + }, + { + "epoch": 0.1364033809385019, + "grad_norm": 0.22881457209587097, + "learning_rate": 0.00017314130011677697, + "loss": 1.1017, + "step": 351 + }, + { + "epoch": 0.1367919945594093, + "grad_norm": 0.21067696809768677, + "learning_rate": 0.00017306344881276762, + "loss": 1.0444, + "step": 352 + }, + { + "epoch": 0.13718060818031672, + "grad_norm": 0.2304215282201767, + "learning_rate": 0.0001729855975087583, + "loss": 1.0737, + "step": 353 + }, + { + "epoch": 
0.13756922180122413, + "grad_norm": 0.2031925916671753, + "learning_rate": 0.00017290774620474895, + "loss": 1.0036, + "step": 354 + }, + { + "epoch": 0.13795783542213155, + "grad_norm": 0.27281051874160767, + "learning_rate": 0.0001728298949007396, + "loss": 1.148, + "step": 355 + }, + { + "epoch": 0.13834644904303897, + "grad_norm": 0.204191654920578, + "learning_rate": 0.00017275204359673026, + "loss": 0.9607, + "step": 356 + }, + { + "epoch": 0.13873506266394636, + "grad_norm": 0.221976637840271, + "learning_rate": 0.0001726741922927209, + "loss": 1.1068, + "step": 357 + }, + { + "epoch": 0.13912367628485378, + "grad_norm": 0.20831729471683502, + "learning_rate": 0.0001725963409887116, + "loss": 1.034, + "step": 358 + }, + { + "epoch": 0.1395122899057612, + "grad_norm": 0.21639779210090637, + "learning_rate": 0.00017251848968470224, + "loss": 1.0613, + "step": 359 + }, + { + "epoch": 0.13990090352666862, + "grad_norm": 0.1959424465894699, + "learning_rate": 0.0001724406383806929, + "loss": 1.0506, + "step": 360 + }, + { + "epoch": 0.140289517147576, + "grad_norm": 0.2044398933649063, + "learning_rate": 0.00017236278707668355, + "loss": 1.0316, + "step": 361 + }, + { + "epoch": 0.14067813076848343, + "grad_norm": 0.21483004093170166, + "learning_rate": 0.0001722849357726742, + "loss": 1.0361, + "step": 362 + }, + { + "epoch": 0.14106674438939085, + "grad_norm": 0.237701416015625, + "learning_rate": 0.00017220708446866485, + "loss": 1.1264, + "step": 363 + }, + { + "epoch": 0.14145535801029827, + "grad_norm": 0.20750795304775238, + "learning_rate": 0.00017212923316465553, + "loss": 1.0523, + "step": 364 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 0.2252965271472931, + "learning_rate": 0.00017205138186064618, + "loss": 1.0764, + "step": 365 + }, + { + "epoch": 0.14223258525211308, + "grad_norm": 0.2033565789461136, + "learning_rate": 0.00017197353055663683, + "loss": 1.064, + "step": 366 + }, + { + "epoch": 0.1426211988730205, + "grad_norm": 
0.21123190224170685, + "learning_rate": 0.00017189567925262749, + "loss": 1.0515, + "step": 367 + }, + { + "epoch": 0.1430098124939279, + "grad_norm": 0.20646221935749054, + "learning_rate": 0.00017181782794861814, + "loss": 1.0617, + "step": 368 + }, + { + "epoch": 0.14339842611483533, + "grad_norm": 0.2079589068889618, + "learning_rate": 0.00017173997664460882, + "loss": 1.0569, + "step": 369 + }, + { + "epoch": 0.14378703973574275, + "grad_norm": 0.216246098279953, + "learning_rate": 0.00017166212534059947, + "loss": 1.0986, + "step": 370 + }, + { + "epoch": 0.14417565335665014, + "grad_norm": 0.20711806416511536, + "learning_rate": 0.00017158427403659012, + "loss": 1.1342, + "step": 371 + }, + { + "epoch": 0.14456426697755756, + "grad_norm": 0.235435351729393, + "learning_rate": 0.00017150642273258077, + "loss": 1.1082, + "step": 372 + }, + { + "epoch": 0.14495288059846498, + "grad_norm": 0.2273191511631012, + "learning_rate": 0.00017142857142857143, + "loss": 1.1064, + "step": 373 + }, + { + "epoch": 0.1453414942193724, + "grad_norm": 0.2075672745704651, + "learning_rate": 0.0001713507201245621, + "loss": 1.0536, + "step": 374 + }, + { + "epoch": 0.14573010784027982, + "grad_norm": 0.20764274895191193, + "learning_rate": 0.00017127286882055276, + "loss": 1.0673, + "step": 375 + }, + { + "epoch": 0.1461187214611872, + "grad_norm": 0.2441243678331375, + "learning_rate": 0.0001711950175165434, + "loss": 1.1271, + "step": 376 + }, + { + "epoch": 0.14650733508209463, + "grad_norm": 0.2383374124765396, + "learning_rate": 0.00017111716621253406, + "loss": 1.083, + "step": 377 + }, + { + "epoch": 0.14689594870300204, + "grad_norm": 0.2172410786151886, + "learning_rate": 0.0001710393149085247, + "loss": 1.0605, + "step": 378 + }, + { + "epoch": 0.14728456232390946, + "grad_norm": 0.22591541707515717, + "learning_rate": 0.0001709614636045154, + "loss": 1.0931, + "step": 379 + }, + { + "epoch": 0.14767317594481685, + "grad_norm": 0.23099495470523834, + "learning_rate": 
0.00017088361230050604, + "loss": 1.1021, + "step": 380 + }, + { + "epoch": 0.14806178956572427, + "grad_norm": 0.21461094915866852, + "learning_rate": 0.0001708057609964967, + "loss": 1.0959, + "step": 381 + }, + { + "epoch": 0.1484504031866317, + "grad_norm": 0.21557241678237915, + "learning_rate": 0.00017072790969248735, + "loss": 1.0155, + "step": 382 + }, + { + "epoch": 0.1488390168075391, + "grad_norm": 0.234396293759346, + "learning_rate": 0.000170650058388478, + "loss": 1.1289, + "step": 383 + }, + { + "epoch": 0.14922763042844653, + "grad_norm": 0.22895503044128418, + "learning_rate": 0.00017057220708446868, + "loss": 0.9919, + "step": 384 + }, + { + "epoch": 0.14961624404935392, + "grad_norm": 0.2054683268070221, + "learning_rate": 0.00017049435578045933, + "loss": 1.0607, + "step": 385 + }, + { + "epoch": 0.15000485767026134, + "grad_norm": 0.25569215416908264, + "learning_rate": 0.00017041650447644998, + "loss": 1.0517, + "step": 386 + }, + { + "epoch": 0.15039347129116876, + "grad_norm": 0.2222641259431839, + "learning_rate": 0.00017033865317244064, + "loss": 1.0404, + "step": 387 + }, + { + "epoch": 0.15078208491207618, + "grad_norm": 0.20501169562339783, + "learning_rate": 0.0001702608018684313, + "loss": 0.9897, + "step": 388 + }, + { + "epoch": 0.1511706985329836, + "grad_norm": 0.22080403566360474, + "learning_rate": 0.00017018295056442197, + "loss": 1.1013, + "step": 389 + }, + { + "epoch": 0.15155931215389098, + "grad_norm": 0.21218529343605042, + "learning_rate": 0.00017010509926041262, + "loss": 1.0541, + "step": 390 + }, + { + "epoch": 0.1519479257747984, + "grad_norm": 0.23064807057380676, + "learning_rate": 0.00017002724795640327, + "loss": 1.037, + "step": 391 + }, + { + "epoch": 0.15233653939570582, + "grad_norm": 0.21164493262767792, + "learning_rate": 0.00016994939665239392, + "loss": 1.0769, + "step": 392 + }, + { + "epoch": 0.15272515301661324, + "grad_norm": 0.22565549612045288, + "learning_rate": 0.00016987154534838457, + "loss": 
1.0638, + "step": 393 + }, + { + "epoch": 0.15311376663752063, + "grad_norm": 0.22492647171020508, + "learning_rate": 0.00016979369404437525, + "loss": 1.063, + "step": 394 + }, + { + "epoch": 0.15350238025842805, + "grad_norm": 0.22335395216941833, + "learning_rate": 0.0001697158427403659, + "loss": 1.1032, + "step": 395 + }, + { + "epoch": 0.15389099387933547, + "grad_norm": 0.2164154201745987, + "learning_rate": 0.00016963799143635656, + "loss": 1.1275, + "step": 396 + }, + { + "epoch": 0.1542796075002429, + "grad_norm": 0.22547736763954163, + "learning_rate": 0.0001695601401323472, + "loss": 1.1324, + "step": 397 + }, + { + "epoch": 0.1546682211211503, + "grad_norm": 0.2028045952320099, + "learning_rate": 0.0001694822888283379, + "loss": 1.0057, + "step": 398 + }, + { + "epoch": 0.1550568347420577, + "grad_norm": 0.20770573616027832, + "learning_rate": 0.00016940443752432854, + "loss": 1.0311, + "step": 399 + }, + { + "epoch": 0.15544544836296512, + "grad_norm": 0.2231476902961731, + "learning_rate": 0.0001693265862203192, + "loss": 1.0535, + "step": 400 + }, + { + "epoch": 0.15583406198387253, + "grad_norm": 0.21618099510669708, + "learning_rate": 0.00016924873491630987, + "loss": 1.0616, + "step": 401 + }, + { + "epoch": 0.15622267560477995, + "grad_norm": 0.24024419486522675, + "learning_rate": 0.00016917088361230052, + "loss": 1.1324, + "step": 402 + }, + { + "epoch": 0.15661128922568737, + "grad_norm": 0.2002171128988266, + "learning_rate": 0.00016909303230829118, + "loss": 1.015, + "step": 403 + }, + { + "epoch": 0.15699990284659476, + "grad_norm": 0.21771477162837982, + "learning_rate": 0.00016901518100428183, + "loss": 1.0817, + "step": 404 + }, + { + "epoch": 0.15738851646750218, + "grad_norm": 0.22052259743213654, + "learning_rate": 0.0001689373297002725, + "loss": 1.0836, + "step": 405 + }, + { + "epoch": 0.1577771300884096, + "grad_norm": 0.1964062750339508, + "learning_rate": 0.00016885947839626316, + "loss": 1.0505, + "step": 406 + }, + { + 
"epoch": 0.15816574370931702, + "grad_norm": 0.22714298963546753, + "learning_rate": 0.0001687816270922538, + "loss": 1.0702, + "step": 407 + }, + { + "epoch": 0.15855435733022444, + "grad_norm": 0.20647728443145752, + "learning_rate": 0.00016870377578824446, + "loss": 1.0349, + "step": 408 + }, + { + "epoch": 0.15894297095113183, + "grad_norm": 0.2355160117149353, + "learning_rate": 0.00016862592448423512, + "loss": 1.0305, + "step": 409 + }, + { + "epoch": 0.15933158457203925, + "grad_norm": 0.22890770435333252, + "learning_rate": 0.0001685480731802258, + "loss": 1.0854, + "step": 410 + }, + { + "epoch": 0.15972019819294667, + "grad_norm": 0.21947838366031647, + "learning_rate": 0.00016847022187621645, + "loss": 1.0948, + "step": 411 + }, + { + "epoch": 0.16010881181385409, + "grad_norm": 0.22334899008274078, + "learning_rate": 0.0001683923705722071, + "loss": 1.006, + "step": 412 + }, + { + "epoch": 0.16049742543476148, + "grad_norm": 0.22324936091899872, + "learning_rate": 0.00016831451926819775, + "loss": 1.0402, + "step": 413 + }, + { + "epoch": 0.1608860390556689, + "grad_norm": 0.21462097764015198, + "learning_rate": 0.0001682366679641884, + "loss": 1.077, + "step": 414 + }, + { + "epoch": 0.1612746526765763, + "grad_norm": 0.24567006528377533, + "learning_rate": 0.00016815881666017908, + "loss": 1.15, + "step": 415 + }, + { + "epoch": 0.16166326629748373, + "grad_norm": 0.26437243819236755, + "learning_rate": 0.00016808096535616973, + "loss": 1.1251, + "step": 416 + }, + { + "epoch": 0.16205187991839115, + "grad_norm": 0.2217959761619568, + "learning_rate": 0.00016800311405216039, + "loss": 1.1103, + "step": 417 + }, + { + "epoch": 0.16244049353929854, + "grad_norm": 0.24402475357055664, + "learning_rate": 0.00016792526274815104, + "loss": 1.0672, + "step": 418 + }, + { + "epoch": 0.16282910716020596, + "grad_norm": 0.21609526872634888, + "learning_rate": 0.0001678474114441417, + "loss": 1.0291, + "step": 419 + }, + { + "epoch": 0.16321772078111338, + 
"grad_norm": 0.20054642856121063, + "learning_rate": 0.00016776956014013237, + "loss": 1.0704, + "step": 420 + }, + { + "epoch": 0.1636063344020208, + "grad_norm": 0.22864869236946106, + "learning_rate": 0.00016769170883612302, + "loss": 1.0612, + "step": 421 + }, + { + "epoch": 0.16399494802292822, + "grad_norm": 0.22651974856853485, + "learning_rate": 0.00016761385753211367, + "loss": 1.0749, + "step": 422 + }, + { + "epoch": 0.1643835616438356, + "grad_norm": 0.21587328612804413, + "learning_rate": 0.00016753600622810433, + "loss": 1.0398, + "step": 423 + }, + { + "epoch": 0.16477217526474303, + "grad_norm": 0.1953774094581604, + "learning_rate": 0.00016745815492409498, + "loss": 1.0275, + "step": 424 + }, + { + "epoch": 0.16516078888565044, + "grad_norm": 0.21803410351276398, + "learning_rate": 0.00016738030362008566, + "loss": 1.1219, + "step": 425 + }, + { + "epoch": 0.16554940250655786, + "grad_norm": 0.2034682035446167, + "learning_rate": 0.0001673024523160763, + "loss": 1.0342, + "step": 426 + }, + { + "epoch": 0.16593801612746525, + "grad_norm": 0.20135951042175293, + "learning_rate": 0.00016722460101206696, + "loss": 0.9802, + "step": 427 + }, + { + "epoch": 0.16632662974837267, + "grad_norm": 0.23310376703739166, + "learning_rate": 0.0001671467497080576, + "loss": 1.0789, + "step": 428 + }, + { + "epoch": 0.1667152433692801, + "grad_norm": 0.21475404500961304, + "learning_rate": 0.00016706889840404827, + "loss": 1.0416, + "step": 429 + }, + { + "epoch": 0.1671038569901875, + "grad_norm": 0.21661072969436646, + "learning_rate": 0.00016699104710003894, + "loss": 1.0568, + "step": 430 + }, + { + "epoch": 0.16749247061109493, + "grad_norm": 0.20310629904270172, + "learning_rate": 0.0001669131957960296, + "loss": 0.9968, + "step": 431 + }, + { + "epoch": 0.16788108423200232, + "grad_norm": 0.2596947252750397, + "learning_rate": 0.00016683534449202025, + "loss": 1.0478, + "step": 432 + }, + { + "epoch": 0.16826969785290974, + "grad_norm": 0.22226987779140472, 
+ "learning_rate": 0.0001667574931880109, + "loss": 1.0898, + "step": 433 + }, + { + "epoch": 0.16865831147381716, + "grad_norm": 0.22499911487102509, + "learning_rate": 0.00016667964188400155, + "loss": 1.07, + "step": 434 + }, + { + "epoch": 0.16904692509472458, + "grad_norm": 0.2717292308807373, + "learning_rate": 0.0001666017905799922, + "loss": 1.0562, + "step": 435 + }, + { + "epoch": 0.169435538715632, + "grad_norm": 0.22052323818206787, + "learning_rate": 0.00016652393927598288, + "loss": 1.0732, + "step": 436 + }, + { + "epoch": 0.16982415233653939, + "grad_norm": 0.21741728484630585, + "learning_rate": 0.00016644608797197354, + "loss": 1.0409, + "step": 437 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.20701193809509277, + "learning_rate": 0.0001663682366679642, + "loss": 1.0731, + "step": 438 + }, + { + "epoch": 0.17060137957835422, + "grad_norm": 0.22071130573749542, + "learning_rate": 0.00016629038536395484, + "loss": 1.0992, + "step": 439 + }, + { + "epoch": 0.17098999319926164, + "grad_norm": 0.20261412858963013, + "learning_rate": 0.0001662125340599455, + "loss": 1.0051, + "step": 440 + }, + { + "epoch": 0.17137860682016906, + "grad_norm": 0.2082947939634323, + "learning_rate": 0.00016613468275593617, + "loss": 1.0477, + "step": 441 + }, + { + "epoch": 0.17176722044107645, + "grad_norm": 0.22534717619419098, + "learning_rate": 0.00016605683145192682, + "loss": 1.041, + "step": 442 + }, + { + "epoch": 0.17215583406198387, + "grad_norm": 0.21547731757164001, + "learning_rate": 0.00016597898014791748, + "loss": 1.0528, + "step": 443 + }, + { + "epoch": 0.1725444476828913, + "grad_norm": 0.24141089618206024, + "learning_rate": 0.00016590112884390813, + "loss": 1.0928, + "step": 444 + }, + { + "epoch": 0.1729330613037987, + "grad_norm": 0.21910884976387024, + "learning_rate": 0.00016582327753989878, + "loss": 1.063, + "step": 445 + }, + { + "epoch": 0.1733216749247061, + "grad_norm": 0.21782316267490387, + "learning_rate": 
0.00016574542623588946, + "loss": 1.0976, + "step": 446 + }, + { + "epoch": 0.17371028854561352, + "grad_norm": 0.21771778166294098, + "learning_rate": 0.0001656675749318801, + "loss": 1.0677, + "step": 447 + }, + { + "epoch": 0.17409890216652094, + "grad_norm": 0.22117659449577332, + "learning_rate": 0.00016558972362787076, + "loss": 1.0669, + "step": 448 + }, + { + "epoch": 0.17448751578742835, + "grad_norm": 0.21918092668056488, + "learning_rate": 0.00016551187232386141, + "loss": 1.0955, + "step": 449 + }, + { + "epoch": 0.17487612940833577, + "grad_norm": 0.22027818858623505, + "learning_rate": 0.0001654340210198521, + "loss": 1.0201, + "step": 450 + }, + { + "epoch": 0.17526474302924316, + "grad_norm": 0.2042885720729828, + "learning_rate": 0.00016535616971584275, + "loss": 1.0881, + "step": 451 + }, + { + "epoch": 0.17565335665015058, + "grad_norm": 0.21788261830806732, + "learning_rate": 0.0001652783184118334, + "loss": 1.0918, + "step": 452 + }, + { + "epoch": 0.176041970271058, + "grad_norm": 0.23332571983337402, + "learning_rate": 0.00016520046710782408, + "loss": 1.091, + "step": 453 + }, + { + "epoch": 0.17643058389196542, + "grad_norm": 0.20204192399978638, + "learning_rate": 0.00016512261580381473, + "loss": 1.0366, + "step": 454 + }, + { + "epoch": 0.17681919751287284, + "grad_norm": 0.21761906147003174, + "learning_rate": 0.00016504476449980538, + "loss": 1.0131, + "step": 455 + }, + { + "epoch": 0.17720781113378023, + "grad_norm": 0.2152051478624344, + "learning_rate": 0.00016496691319579606, + "loss": 1.0868, + "step": 456 + }, + { + "epoch": 0.17759642475468765, + "grad_norm": 0.22776494920253754, + "learning_rate": 0.0001648890618917867, + "loss": 1.0807, + "step": 457 + }, + { + "epoch": 0.17798503837559507, + "grad_norm": 0.2171342968940735, + "learning_rate": 0.00016481121058777736, + "loss": 1.0537, + "step": 458 + }, + { + "epoch": 0.17837365199650249, + "grad_norm": 0.2046273946762085, + "learning_rate": 0.00016473335928376802, + "loss": 
1.0097, + "step": 459 + }, + { + "epoch": 0.17876226561740988, + "grad_norm": 0.2047681361436844, + "learning_rate": 0.00016465550797975867, + "loss": 1.0204, + "step": 460 + }, + { + "epoch": 0.1791508792383173, + "grad_norm": 0.1876862645149231, + "learning_rate": 0.00016457765667574935, + "loss": 0.9383, + "step": 461 + }, + { + "epoch": 0.17953949285922471, + "grad_norm": 0.218430757522583, + "learning_rate": 0.00016449980537174, + "loss": 1.0721, + "step": 462 + }, + { + "epoch": 0.17992810648013213, + "grad_norm": 0.2245480865240097, + "learning_rate": 0.00016442195406773065, + "loss": 1.0859, + "step": 463 + }, + { + "epoch": 0.18031672010103955, + "grad_norm": 0.22577151656150818, + "learning_rate": 0.0001643441027637213, + "loss": 1.0825, + "step": 464 + }, + { + "epoch": 0.18070533372194694, + "grad_norm": 0.20132745802402496, + "learning_rate": 0.00016426625145971196, + "loss": 1.0615, + "step": 465 + }, + { + "epoch": 0.18109394734285436, + "grad_norm": 0.2277505248785019, + "learning_rate": 0.00016418840015570263, + "loss": 1.0426, + "step": 466 + }, + { + "epoch": 0.18148256096376178, + "grad_norm": 0.22540105879306793, + "learning_rate": 0.0001641105488516933, + "loss": 1.0481, + "step": 467 + }, + { + "epoch": 0.1818711745846692, + "grad_norm": 0.20358088612556458, + "learning_rate": 0.00016403269754768394, + "loss": 1.0286, + "step": 468 + }, + { + "epoch": 0.18225978820557662, + "grad_norm": 0.22534145414829254, + "learning_rate": 0.0001639548462436746, + "loss": 1.1183, + "step": 469 + }, + { + "epoch": 0.182648401826484, + "grad_norm": 0.2188873142004013, + "learning_rate": 0.00016387699493966524, + "loss": 1.0439, + "step": 470 + }, + { + "epoch": 0.18303701544739143, + "grad_norm": 0.2128048539161682, + "learning_rate": 0.00016379914363565592, + "loss": 1.027, + "step": 471 + }, + { + "epoch": 0.18342562906829885, + "grad_norm": 0.2518141567707062, + "learning_rate": 0.00016372129233164657, + "loss": 1.0468, + "step": 472 + }, + { + "epoch": 
0.18381424268920626, + "grad_norm": 0.2189142256975174, + "learning_rate": 0.00016364344102763723, + "loss": 1.0581, + "step": 473 + }, + { + "epoch": 0.18420285631011368, + "grad_norm": 0.31266725063323975, + "learning_rate": 0.00016356558972362788, + "loss": 1.0554, + "step": 474 + }, + { + "epoch": 0.18459146993102107, + "grad_norm": 0.21343916654586792, + "learning_rate": 0.00016348773841961853, + "loss": 1.0795, + "step": 475 + }, + { + "epoch": 0.1849800835519285, + "grad_norm": 0.22907280921936035, + "learning_rate": 0.00016340988711560918, + "loss": 1.0304, + "step": 476 + }, + { + "epoch": 0.1853686971728359, + "grad_norm": 0.2105257511138916, + "learning_rate": 0.00016333203581159986, + "loss": 1.0231, + "step": 477 + }, + { + "epoch": 0.18575731079374333, + "grad_norm": 0.19537831842899323, + "learning_rate": 0.00016325418450759051, + "loss": 1.0103, + "step": 478 + }, + { + "epoch": 0.18614592441465072, + "grad_norm": 0.20522372424602509, + "learning_rate": 0.00016317633320358117, + "loss": 1.0196, + "step": 479 + }, + { + "epoch": 0.18653453803555814, + "grad_norm": 0.21646477282047272, + "learning_rate": 0.00016309848189957182, + "loss": 1.0579, + "step": 480 + }, + { + "epoch": 0.18692315165646556, + "grad_norm": 0.21077193319797516, + "learning_rate": 0.00016302063059556247, + "loss": 1.0638, + "step": 481 + }, + { + "epoch": 0.18731176527737298, + "grad_norm": 0.20357473194599152, + "learning_rate": 0.00016294277929155315, + "loss": 1.0635, + "step": 482 + }, + { + "epoch": 0.1877003788982804, + "grad_norm": 0.2188001275062561, + "learning_rate": 0.0001628649279875438, + "loss": 1.0267, + "step": 483 + }, + { + "epoch": 0.1880889925191878, + "grad_norm": 0.2128928154706955, + "learning_rate": 0.00016278707668353445, + "loss": 0.9706, + "step": 484 + }, + { + "epoch": 0.1884776061400952, + "grad_norm": 0.22081372141838074, + "learning_rate": 0.0001627092253795251, + "loss": 1.08, + "step": 485 + }, + { + "epoch": 0.18886621976100262, + "grad_norm": 
0.2250615805387497, + "learning_rate": 0.00016263137407551576, + "loss": 1.1451, + "step": 486 + }, + { + "epoch": 0.18925483338191004, + "grad_norm": 0.1984967589378357, + "learning_rate": 0.00016255352277150644, + "loss": 1.0744, + "step": 487 + }, + { + "epoch": 0.18964344700281746, + "grad_norm": 0.20778900384902954, + "learning_rate": 0.0001624756714674971, + "loss": 1.0623, + "step": 488 + }, + { + "epoch": 0.19003206062372485, + "grad_norm": 0.2026563137769699, + "learning_rate": 0.00016239782016348774, + "loss": 1.0714, + "step": 489 + }, + { + "epoch": 0.19042067424463227, + "grad_norm": 0.21598374843597412, + "learning_rate": 0.0001623199688594784, + "loss": 1.0869, + "step": 490 + }, + { + "epoch": 0.1908092878655397, + "grad_norm": 0.18944978713989258, + "learning_rate": 0.00016224211755546904, + "loss": 1.055, + "step": 491 + }, + { + "epoch": 0.1911979014864471, + "grad_norm": 0.20698946714401245, + "learning_rate": 0.00016216426625145972, + "loss": 1.0392, + "step": 492 + }, + { + "epoch": 0.1915865151073545, + "grad_norm": 0.22395353019237518, + "learning_rate": 0.00016208641494745038, + "loss": 1.0681, + "step": 493 + }, + { + "epoch": 0.19197512872826192, + "grad_norm": 0.22372962534427643, + "learning_rate": 0.00016200856364344103, + "loss": 1.0767, + "step": 494 + }, + { + "epoch": 0.19236374234916934, + "grad_norm": 0.2066701054573059, + "learning_rate": 0.00016193071233943168, + "loss": 1.0061, + "step": 495 + }, + { + "epoch": 0.19275235597007676, + "grad_norm": 0.19716408848762512, + "learning_rate": 0.00016185286103542233, + "loss": 1.039, + "step": 496 + }, + { + "epoch": 0.19314096959098417, + "grad_norm": 0.22159601747989655, + "learning_rate": 0.000161775009731413, + "loss": 1.0832, + "step": 497 + }, + { + "epoch": 0.19352958321189156, + "grad_norm": 0.21509626507759094, + "learning_rate": 0.00016169715842740366, + "loss": 1.0264, + "step": 498 + }, + { + "epoch": 0.19391819683279898, + "grad_norm": 0.21598199009895325, + 
"learning_rate": 0.00016161930712339431, + "loss": 1.049, + "step": 499 + }, + { + "epoch": 0.1943068104537064, + "grad_norm": 0.20279590785503387, + "learning_rate": 0.00016154145581938497, + "loss": 1.0505, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 2574, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.3571235778270986e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/outputs/checkpoint-500/training_args.bin b/outputs/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3421e16245b3c80191bfc6a33bcc4c101618df92 --- /dev/null +++ b/outputs/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7558c709c38131cfedd1780a2945d37b4a3ebf842fdf78718522b6636573099 +size 6161