diff --git a/.gitattributes b/.gitattributes index 6c0e6da2e0f08602e345e9061404f6b6e07fc72f..4d0692647539581025373e89b4c70fa37efb8f56 100644 --- a/.gitattributes +++ b/.gitattributes @@ -43,3 +43,8 @@ cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer.json filter=lfs diff=lfs merge=lfs -text cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/run-g6vlcw0j.wandb filter=lfs diff=lfs merge=lfs -text sft_qwen_14B/wandb/run-20251223_142702-ldjr67u6/run-ldjr67u6.wandb filter=lfs diff=lfs merge=lfs -text +cpt_devstral_24B/best_adapter/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_devstral_24B/checkpoints/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_devstral_24B/checkpoints/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_devstral_24B/checkpoints/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_devstral_24B/checkpoints/checkpoint-686/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/cpt_devstral_24B/best_adapter/README.md b/cpt_devstral_24B/best_adapter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f04c3de935db4cae3da32ab6d1fcbbea11b4e09 --- /dev/null +++ b/cpt_devstral_24B/best_adapter/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Devstral-Small-2-24B-Instruct-2512 +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Devstral-Small-2-24B-Instruct-2512 +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_devstral_24B/best_adapter/adapter_config.json b/cpt_devstral_24B/best_adapter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a10b9f1b7bb62dced9a7c13375c7ebbeb347c15b --- /dev/null +++ b/cpt_devstral_24B/best_adapter/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Devstral-Small-2-24B-Instruct-2512", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_devstral_24B/best_adapter/adapter_model.safetensors b/cpt_devstral_24B/best_adapter/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..55a3c8fad99c3849ac93d36f5e50dbb9ac430b18 --- /dev/null +++ b/cpt_devstral_24B/best_adapter/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6528dd74de4fce9bff6c944acd9bc01868d155b1ea5403fe93fb8c5ced4d4ec +size 364983848 diff --git a/cpt_devstral_24B/best_adapter/chat_template.jinja b/cpt_devstral_24B/best_adapter/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01c8776b5b3496af72e92a53a3bf92e113f66f2c --- /dev/null +++ b/cpt_devstral_24B/best_adapter/chat_template.jinja @@ -0,0 +1,121 @@ +{#- Default system message if no system prompt is passed. #} +{%- set default_system_message = '' %} + +{#- Begin of sequence token. #} +{{- bos_token }} + +{#- Handle system prompt if it exists. #} +{#- System prompt supports text content or text chunks. #} +{%- if messages[0]['role'] == 'system' %} + {{- '[SYSTEM_PROMPT]' -}} + {%- if messages[0]['content'] is string %} + {{- messages[0]['content'] -}} + {%- else %} + {%- for block in messages[0]['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in system message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '[/SYSTEM_PROMPT]' -}} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} + {%- if default_system_message != '' %} + {{- '[SYSTEM_PROMPT]' + default_system_message + '[/SYSTEM_PROMPT]' }} + {%- endif %} +{%- endif %} + + +{#- Tools definition #} +{%- set tools_definition = '' %} +{%- set has_tools = false %} +{%- if tools is defined and tools is not none and tools|length > 0 %} + {%- set has_tools = true %} + {%- set tools_definition = '[AVAILABLE_TOOLS]' + (tools| tojson) + '[/AVAILABLE_TOOLS]' %} + {{- tools_definition }} +{%- endif %} + +{#- Checks for alternating user/assistant messages. #} +{%- set ns = namespace(index=0) %} +{%- for message in loop_messages %} + {%- if message.role == 'user' or (message.role == 'assistant' and (message.tool_calls is not defined or message.tool_calls is none or message.tool_calls | length == 0)) %} + {%- if (message['role'] == 'user') != (ns.index % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user and assistant roles except for tool calls and results.') }} + {%- endif %} + {%- set ns.index = ns.index + 1 %} + {%- endif %} +{%- endfor %} + +{#- Handle conversation messages. #} +{%- for message in loop_messages %} + + {#- User messages supports text content or text and image chunks. #} + {%- if message['role'] == 'user' %} + {%- if message['content'] is string %} + {{- '[INST]' + message['content'] + '[/INST]' }} + {%- elif message['content'] | length > 0 %} + {{- '[INST]' }} + {%- if message['content'] | length == 2 %} + {%- set blocks = message['content'] | sort(attribute='type') %} + {%- else %} + {%- set blocks = message['content'] %} + {%- endif %} + {%- for block in blocks %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text, image and image_url chunks are supported in user message content.') }} + {%- endif %} + {%- endfor %} + {{- '[/INST]' }} + {%- else %} + {{- raise_exception('User message must have a string or a list of chunks in content') }} + {%- endif %} + + {#- Assistant messages supports text content or text and image chunks. #} + {%- elif message['role'] == 'assistant' %} + {%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %} + {{- raise_exception('Assistant message must have a string or a list of chunks in content or a list of tool calls.') }} + {%- endif %} + + {%- if message['content'] is string %} + {{- message['content'] }} + {%- elif message['content'] | length > 0 %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in assistant message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + + {%- if message['tool_calls'] is defined and message['tool_calls'] is not none and message['tool_calls']|length > 0 %} + {%- for tool in message['tool_calls'] %} + {%- set arguments = tool['function']['arguments'] %} + {%- if arguments is not string %} + {%- set arguments = arguments|tojson|safe %} + {%- elif arguments == '' %} + {%- set arguments = '{}' %} + {%- endif %} + {{- '[TOOL_CALLS]' + tool['function']['name'] + '[ARGS]' + arguments }} + {%- endfor %} + {%- endif %} + + {#- End of sequence token for each assistant messages. #} + {{- eos_token }} + + {#- Tool messages only supports text content. #} + {%- elif message['role'] == 'tool' %} + {{- '[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]' }} + + {#- Raise exception for unsupported roles. #} + {%- else %} + {{- raise_exception('Only user, assistant and tool roles are supported, got ' + message['role'] + '.') }} + {%- endif %} +{%- endfor %} \ No newline at end of file diff --git a/cpt_devstral_24B/best_adapter/tokenizer.json b/cpt_devstral_24B/best_adapter/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5b51e255641d3ab81f891f54bd61370fcedf6622 --- /dev/null +++ b/cpt_devstral_24B/best_adapter/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:286acad9b0e27fce778ac429763536accf618ccb6ed72963b6f94685e531c5c7 +size 17077402 diff --git a/cpt_devstral_24B/best_adapter/tokenizer_config.json b/cpt_devstral_24B/best_adapter/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1bb6b32cec8ab9654d2c84faeb9a332373476017 --- /dev/null +++ b/cpt_devstral_24B/best_adapter/tokenizer_config.json @@ -0,0 +1,1013 @@ +{ + "backend": "tokenizers", + "bos_token": "", + "eos_token": "", + "extra_special_tokens": [ + "", + "", + "", + "[INST]", + "[/INST]", + "[AVAILABLE_TOOLS]", + "[/AVAILABLE_TOOLS]", + "[TOOL_RESULTS]", + "[/TOOL_RESULTS]", + "[TOOL_CALLS]", + "[IMG]", + "", + "[IMG_BREAK]", + "[IMG_END]", + "[PREFIX]", + "[MIDDLE]", + "[SUFFIX]", + "[SYSTEM_PROMPT]", + "[/SYSTEM_PROMPT]", + "[TOOL_CONTENT]", + "", + "", + "", + "", + "[AUDIO]", + "[BEGIN_AUDIO]", + "", + "", + "", + "", + "", + "", + "[ARGS]", + "[CALL_ID]", + "[THINK]", + "[/THINK]", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "is_local": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "PixtralProcessor", + "tokenizer_class": "TokenizersBackend", + "unk_token": "" +} diff --git a/cpt_devstral_24B/best_adapter/training_args.bin b/cpt_devstral_24B/best_adapter/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..48a487f18680e3e5b768fe7ec9ec04e8778fc21e --- /dev/null +++ b/cpt_devstral_24B/best_adapter/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f62526ec2433add7ac031c48b1f6ff360f1ade77275765112cbf7cf361d64ca5 +size 5201 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/README.md b/cpt_devstral_24B/checkpoints/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f04c3de935db4cae3da32ab6d1fcbbea11b4e09 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Devstral-Small-2-24B-Instruct-2512 +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Devstral-Small-2-24B-Instruct-2512 +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/adapter_config.json b/cpt_devstral_24B/checkpoints/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a10b9f1b7bb62dced9a7c13375c7ebbeb347c15b --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Devstral-Small-2-24B-Instruct-2512", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/adapter_model.safetensors b/cpt_devstral_24B/checkpoints/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d5ee6a397481e54e89be6da6a164853fa3bdfd0d --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb99e750772d0be3eae91b1278f180702595f9e801cdcfe108166e6afe96e5ca +size 364983848 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/chat_template.jinja b/cpt_devstral_24B/checkpoints/checkpoint-400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01c8776b5b3496af72e92a53a3bf92e113f66f2c --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/chat_template.jinja @@ -0,0 +1,121 @@ +{#- Default system message if no system prompt is passed. #} +{%- set default_system_message = '' %} + +{#- Begin of sequence token. #} +{{- bos_token }} + +{#- Handle system prompt if it exists. #} +{#- System prompt supports text content or text chunks. #} +{%- if messages[0]['role'] == 'system' %} + {{- '[SYSTEM_PROMPT]' -}} + {%- if messages[0]['content'] is string %} + {{- messages[0]['content'] -}} + {%- else %} + {%- for block in messages[0]['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in system message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '[/SYSTEM_PROMPT]' -}} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} + {%- if default_system_message != '' %} + {{- '[SYSTEM_PROMPT]' + default_system_message + '[/SYSTEM_PROMPT]' }} + {%- endif %} +{%- endif %} + + +{#- Tools definition #} +{%- set tools_definition = '' %} +{%- set has_tools = false %} +{%- if tools is defined and tools is not none and tools|length > 0 %} + {%- set has_tools = true %} + {%- set tools_definition = '[AVAILABLE_TOOLS]' + (tools| tojson) + '[/AVAILABLE_TOOLS]' %} + {{- tools_definition }} +{%- endif %} + +{#- Checks for alternating user/assistant messages. #} +{%- set ns = namespace(index=0) %} +{%- for message in loop_messages %} + {%- if message.role == 'user' or (message.role == 'assistant' and (message.tool_calls is not defined or message.tool_calls is none or message.tool_calls | length == 0)) %} + {%- if (message['role'] == 'user') != (ns.index % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user and assistant roles except for tool calls and results.') }} + {%- endif %} + {%- set ns.index = ns.index + 1 %} + {%- endif %} +{%- endfor %} + +{#- Handle conversation messages. #} +{%- for message in loop_messages %} + + {#- User messages supports text content or text and image chunks. #} + {%- if message['role'] == 'user' %} + {%- if message['content'] is string %} + {{- '[INST]' + message['content'] + '[/INST]' }} + {%- elif message['content'] | length > 0 %} + {{- '[INST]' }} + {%- if message['content'] | length == 2 %} + {%- set blocks = message['content'] | sort(attribute='type') %} + {%- else %} + {%- set blocks = message['content'] %} + {%- endif %} + {%- for block in blocks %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text, image and image_url chunks are supported in user message content.') }} + {%- endif %} + {%- endfor %} + {{- '[/INST]' }} + {%- else %} + {{- raise_exception('User message must have a string or a list of chunks in content') }} + {%- endif %} + + {#- Assistant messages supports text content or text and image chunks. #} + {%- elif message['role'] == 'assistant' %} + {%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %} + {{- raise_exception('Assistant message must have a string or a list of chunks in content or a list of tool calls.') }} + {%- endif %} + + {%- if message['content'] is string %} + {{- message['content'] }} + {%- elif message['content'] | length > 0 %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in assistant message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + + {%- if message['tool_calls'] is defined and message['tool_calls'] is not none and message['tool_calls']|length > 0 %} + {%- for tool in message['tool_calls'] %} + {%- set arguments = tool['function']['arguments'] %} + {%- if arguments is not string %} + {%- set arguments = arguments|tojson|safe %} + {%- elif arguments == '' %} + {%- set arguments = '{}' %} + {%- endif %} + {{- '[TOOL_CALLS]' + tool['function']['name'] + '[ARGS]' + arguments }} + {%- endfor %} + {%- endif %} + + {#- End of sequence token for each assistant messages. #} + {{- eos_token }} + + {#- Tool messages only supports text content. #} + {%- elif message['role'] == 'tool' %} + {{- '[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]' }} + + {#- Raise exception for unsupported roles. #} + {%- else %} + {{- raise_exception('Only user, assistant and tool roles are supported, got ' + message['role'] + '.') }} + {%- endif %} +{%- endfor %} \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/optimizer.pt b/cpt_devstral_24B/checkpoints/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec8c10f320da4605f38165adbb326d3712c182cf --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5539bb24f0e90b59b5feaa90ce48faf2a89fef6e84d938cfdb015b096793c9e +size 160131559 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/rng_state.pth b/cpt_devstral_24B/checkpoints/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..30b61c4ec790772a672785d3bbad7036cb34f17b --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c636bd3acde73735b158db1a7551369c5642650cac64756dc42008fea4a8a41c +size 14645 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/scheduler.pt b/cpt_devstral_24B/checkpoints/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a657fc6acf292190cd542d13efab960d887ae19f --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63227fb4710085463616ae42dc93bbd119bc402348d37ec9f6ab60b0d130235e +size 1465 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/tokenizer.json b/cpt_devstral_24B/checkpoints/checkpoint-400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5b51e255641d3ab81f891f54bd61370fcedf6622 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:286acad9b0e27fce778ac429763536accf618ccb6ed72963b6f94685e531c5c7 +size 17077402 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/tokenizer_config.json b/cpt_devstral_24B/checkpoints/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1bb6b32cec8ab9654d2c84faeb9a332373476017 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/tokenizer_config.json @@ -0,0 +1,1013 @@ +{ + "backend": "tokenizers", + "bos_token": "", + "eos_token": "", + "extra_special_tokens": [ + "", + "", + "", + "[INST]", + "[/INST]", + "[AVAILABLE_TOOLS]", + "[/AVAILABLE_TOOLS]", + "[TOOL_RESULTS]", + "[/TOOL_RESULTS]", + "[TOOL_CALLS]", + "[IMG]", + "", + "[IMG_BREAK]", + "[IMG_END]", + "[PREFIX]", + "[MIDDLE]", + "[SUFFIX]", + "[SYSTEM_PROMPT]", + "[/SYSTEM_PROMPT]", + "[TOOL_CONTENT]", + "", + "", + "", + "", + "[AUDIO]", + "[BEGIN_AUDIO]", + "", + "", + "", + "", + "", + "", + "[ARGS]", + "[CALL_ID]", + "[THINK]", + "[/THINK]", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "is_local": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "PixtralProcessor", + "tokenizer_class": "TokenizersBackend", + "unk_token": "" +} diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/trainer_state.json b/cpt_devstral_24B/checkpoints/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..385eced9e1afc171b9127f21a56b31ae77c243d7 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/trainer_state.json @@ -0,0 +1,2898 @@ +{ + "best_global_step": 400, + "best_metric": 0.4318464398384094, + "best_model_checkpoint": "runs/cpt_run_v1/checkpoints/checkpoint-400", + "epoch": 1.1662716499544212, + "eval_steps": 50, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0029170464904284413, + "grad_norm": 1.1577509641647339, + "learning_rate": 0.0, + "loss": 0.9893555045127869, + "step": 1 + }, + { + "epoch": 0.005834092980856883, + "grad_norm": 0.9491796493530273, + "learning_rate": 2.8985507246376816e-07, + "loss": 0.8791205883026123, + "step": 2 + }, + { + "epoch": 0.008751139471285323, + "grad_norm": 1.1600768566131592, + "learning_rate": 5.797101449275363e-07, + "loss": 0.9858248233795166, + "step": 3 + }, + { + "epoch": 0.011668185961713765, + "grad_norm": 1.2298306226730347, + "learning_rate": 8.695652173913044e-07, + "loss": 1.0516364574432373, + "step": 4 + }, + { + "epoch": 0.014585232452142206, + "grad_norm": 0.9520533680915833, + "learning_rate": 1.1594202898550726e-06, + "loss": 0.8392249345779419, + "step": 5 + }, + { + "epoch": 0.017502278942570646, + "grad_norm": 1.2451188564300537, + "learning_rate": 1.4492753623188408e-06, + "loss": 1.0955077409744263, + "step": 6 + }, + { + "epoch": 0.02041932543299909, + "grad_norm": 1.1123991012573242, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.9201866388320923, + "step": 7 + }, + { + "epoch": 0.02333637192342753, + "grad_norm": 0.9283139705657959, + "learning_rate": 2.028985507246377e-06, + "loss": 0.9770950078964233, + "step": 8 + }, + { + "epoch": 0.02625341841385597, + "grad_norm": 0.9589216113090515, + "learning_rate": 2.3188405797101453e-06, + "loss": 0.9442565441131592, + "step": 9 + }, + { + "epoch": 0.02917046490428441, + "grad_norm": 0.8866703510284424, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.9354464411735535, + "step": 10 + }, + { + "epoch": 0.03208751139471285, + "grad_norm": 0.7191241383552551, + "learning_rate": 2.8985507246376816e-06, + "loss": 0.7659736275672913, + "step": 11 + }, + { + "epoch": 0.03500455788514129, + "grad_norm": 0.9110142588615417, + "learning_rate": 3.188405797101449e-06, + "loss": 0.9319326877593994, + "step": 12 + }, + { + "epoch": 0.03792160437556973, + "grad_norm": 0.8754057288169861, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.9819356203079224, + "step": 13 + }, + { + "epoch": 0.04083865086599818, + "grad_norm": 0.896181046962738, + "learning_rate": 3.768115942028986e-06, + "loss": 1.026316523551941, + "step": 14 + }, + { + "epoch": 0.04375569735642662, + "grad_norm": 0.6104832887649536, + "learning_rate": 4.057971014492754e-06, + "loss": 0.8427562713623047, + "step": 15 + }, + { + "epoch": 0.04667274384685506, + "grad_norm": 0.6529208421707153, + "learning_rate": 4.347826086956522e-06, + "loss": 0.8496565222740173, + "step": 16 + }, + { + "epoch": 0.0495897903372835, + "grad_norm": 0.6319335699081421, + "learning_rate": 4.637681159420291e-06, + "loss": 0.9139047861099243, + "step": 17 + }, + { + "epoch": 0.05250683682771194, + "grad_norm": 0.7458649277687073, + "learning_rate": 4.927536231884059e-06, + "loss": 0.8867442011833191, + "step": 18 + }, + { + "epoch": 0.05542388331814038, + "grad_norm": 0.6179773211479187, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.9579408168792725, + "step": 19 + }, + { + "epoch": 0.05834092980856882, + "grad_norm": 0.794481635093689, + "learning_rate": 5.507246376811595e-06, + "loss": 0.8736554980278015, + "step": 20 + }, + { + "epoch": 0.06125797629899726, + "grad_norm": 0.8356145620346069, + "learning_rate": 5.797101449275363e-06, + "loss": 0.9358762502670288, + "step": 21 + }, + { + "epoch": 0.0641750227894257, + "grad_norm": 0.5891932845115662, + "learning_rate": 6.086956521739132e-06, + "loss": 0.8972038626670837, + "step": 22 + }, + { + "epoch": 0.06709206927985414, + "grad_norm": 0.6931268572807312, + "learning_rate": 6.376811594202898e-06, + "loss": 0.9583507776260376, + "step": 23 + }, + { + "epoch": 0.07000911577028258, + "grad_norm": 0.7298229336738586, + "learning_rate": 6.666666666666667e-06, + "loss": 0.8119489550590515, + "step": 24 + }, + { + "epoch": 0.07292616226071102, + "grad_norm": 0.6419956684112549, + "learning_rate": 6.956521739130435e-06, + "loss": 0.9386100769042969, + "step": 25 + }, + { + "epoch": 0.07584320875113947, + "grad_norm": 0.7508338689804077, + "learning_rate": 7.246376811594203e-06, + "loss": 0.9272583723068237, + "step": 26 + }, + { + "epoch": 0.0787602552415679, + "grad_norm": 0.5848079919815063, + "learning_rate": 7.536231884057972e-06, + "loss": 0.8967856168746948, + "step": 27 + }, + { + "epoch": 0.08167730173199636, + "grad_norm": 0.7384837865829468, + "learning_rate": 7.82608695652174e-06, + "loss": 0.8696568012237549, + "step": 28 + }, + { + "epoch": 0.0845943482224248, + "grad_norm": 0.5069604516029358, + "learning_rate": 8.115942028985508e-06, + "loss": 0.9121193885803223, + "step": 29 + }, + { + "epoch": 0.08751139471285324, + "grad_norm": 0.833165168762207, + "learning_rate": 8.405797101449275e-06, + "loss": 0.8180589079856873, + "step": 30 + }, + { + "epoch": 0.09042844120328168, + "grad_norm": 0.6355920433998108, + "learning_rate": 8.695652173913044e-06, + "loss": 0.8640957474708557, + "step": 31 + }, + { + "epoch": 0.09334548769371012, + "grad_norm": 1.0429315567016602, + "learning_rate": 8.985507246376812e-06, + "loss": 0.9517915844917297, + "step": 32 + }, + { + "epoch": 0.09626253418413856, + "grad_norm": 0.5875154733657837, + "learning_rate": 9.275362318840581e-06, + "loss": 0.9443603754043579, + "step": 33 + }, + { + "epoch": 0.099179580674567, + "grad_norm": 1.9913769960403442, + "learning_rate": 9.565217391304349e-06, + "loss": 0.9510866403579712, + "step": 34 + }, + { + "epoch": 0.10209662716499544, + "grad_norm": 0.5310097932815552, + "learning_rate": 9.855072463768118e-06, + "loss": 0.8653419613838196, + "step": 35 + }, + { + "epoch": 0.10501367365542388, + "grad_norm": 0.624421238899231, + "learning_rate": 1.0144927536231885e-05, + "loss": 0.7941208481788635, + "step": 36 + }, + { + "epoch": 0.10793072014585232, + "grad_norm": 0.6314200758934021, + "learning_rate": 1.0434782608695653e-05, + "loss": 0.8931174278259277, + "step": 37 + }, + { + "epoch": 0.11084776663628076, + "grad_norm": 0.6272342205047607, + "learning_rate": 1.0724637681159422e-05, + "loss": 0.8978185057640076, + "step": 38 + }, + { + "epoch": 0.1137648131267092, + "grad_norm": 0.5711184740066528, + "learning_rate": 1.101449275362319e-05, + "loss": 0.808263897895813, + "step": 39 + }, + { + "epoch": 0.11668185961713765, + "grad_norm": 0.7581208944320679, + "learning_rate": 1.1304347826086957e-05, + "loss": 0.7456756830215454, + "step": 40 + }, + { + "epoch": 0.11959890610756609, + "grad_norm": 0.4989977180957794, + "learning_rate": 1.1594202898550726e-05, + "loss": 0.8273333311080933, + "step": 41 + }, + { + "epoch": 0.12251595259799453, + "grad_norm": 0.8602972626686096, + "learning_rate": 1.1884057971014494e-05, + "loss": 0.8514784574508667, + "step": 42 + }, + { + "epoch": 0.12543299908842298, + "grad_norm": 0.6918581128120422, + "learning_rate": 1.2173913043478263e-05, + "loss": 0.8182265162467957, + "step": 43 + }, + { + "epoch": 0.1283500455788514, + "grad_norm": 0.653099536895752, + "learning_rate": 1.2463768115942029e-05, + "loss": 0.8242791891098022, + "step": 44 + }, + { + "epoch": 0.13126709206927986, + "grad_norm": 0.7485584616661072, + "learning_rate": 1.2753623188405797e-05, + "loss": 0.8229591250419617, + "step": 45 + }, + { + "epoch": 0.1341841385597083, + "grad_norm": 0.6724833250045776, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.8146833181381226, + "step": 46 + }, + { + "epoch": 0.13710118505013674, + "grad_norm": 0.857208251953125, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.8154427409172058, + "step": 47 + }, + { + "epoch": 0.14001823154056517, + "grad_norm": 0.5559669137001038, + "learning_rate": 1.3623188405797103e-05, + "loss": 0.879005491733551, + "step": 48 + }, + { + "epoch": 0.14293527803099362, + "grad_norm": 0.5910897850990295, + "learning_rate": 1.391304347826087e-05, + "loss": 0.8148283362388611, + "step": 49 + }, + { + "epoch": 0.14585232452142205, + "grad_norm": 0.6478891372680664, + "learning_rate": 1.420289855072464e-05, + "loss": 0.8293006420135498, + "step": 50 + }, + { + "epoch": 0.14585232452142205, + "eval_loss": 0.7892261147499084, + "eval_runtime": 973.2157, + "eval_samples_per_second": 0.649, + "eval_steps_per_second": 0.649, + "step": 50 + }, + { + "epoch": 0.1487693710118505, + "grad_norm": 0.757882833480835, + "learning_rate": 1.4492753623188407e-05, + "loss": 0.8114852905273438, + "step": 51 + }, + { + "epoch": 0.15168641750227893, + "grad_norm": 0.8496116995811462, + "learning_rate": 1.4782608695652174e-05, + "loss": 0.7886185050010681, + "step": 52 + }, + { + "epoch": 0.15460346399270739, + "grad_norm": 0.6078857183456421, + "learning_rate": 1.5072463768115944e-05, + "loss": 0.7298170924186707, + "step": 53 + }, + { + "epoch": 0.1575205104831358, + "grad_norm": 0.5856835246086121, + "learning_rate": 1.536231884057971e-05, + "loss": 0.7407160997390747, + "step": 54 + }, + { + "epoch": 0.16043755697356427, + "grad_norm": 1.0533701181411743, + "learning_rate": 1.565217391304348e-05, + "loss": 0.7057831287384033, + "step": 55 + }, + { + "epoch": 0.16335460346399272, + "grad_norm": 0.8087610006332397, + "learning_rate": 1.5942028985507246e-05, + "loss": 0.7409019470214844, + "step": 56 + }, + { + "epoch": 0.16627164995442115, + "grad_norm": 0.629945695400238, + "learning_rate": 1.6231884057971015e-05, + "loss": 0.7768293023109436, + "step": 57 + }, + { + "epoch": 0.1691886964448496, + "grad_norm": 0.5187911987304688, + "learning_rate": 1.6521739130434785e-05, + "loss": 0.825718104839325, + "step": 58 + }, + { + "epoch": 0.17210574293527803, + "grad_norm": 0.5866358280181885, + "learning_rate": 1.681159420289855e-05, + "loss": 0.8575979471206665, + "step": 59 + }, + { + "epoch": 0.17502278942570648, + "grad_norm": 1.5098934173583984, + "learning_rate": 1.710144927536232e-05, + "loss": 0.8058848977088928, + "step": 60 + }, + { + "epoch": 0.1779398359161349, + "grad_norm": 0.6981958150863647, + "learning_rate": 1.739130434782609e-05, + "loss": 0.7640778422355652, + "step": 61 + }, + { + "epoch": 0.18085688240656336, + "grad_norm": 0.631349503993988, + "learning_rate": 1.7681159420289858e-05, + "loss": 0.7896331548690796, + "step": 62 + }, + { + "epoch": 0.1837739288969918, + "grad_norm": 0.6930747032165527, + "learning_rate": 1.7971014492753624e-05, + "loss": 0.6762524247169495, + "step": 63 + }, + { + "epoch": 0.18669097538742024, + "grad_norm": 0.599399209022522, + "learning_rate": 1.8260869565217393e-05, + "loss": 0.7285035848617554, + "step": 64 + }, + { + "epoch": 0.18960802187784867, + "grad_norm": 0.6194344758987427, + "learning_rate": 1.8550724637681162e-05, + "loss": 0.7682523131370544, + "step": 65 + }, + { + "epoch": 0.19252506836827712, + "grad_norm": 0.5691342949867249, + "learning_rate": 1.8840579710144928e-05, + "loss": 0.6791993379592896, + "step": 66 + }, + { + "epoch": 0.19544211485870555, + "grad_norm": 0.6257390379905701, + "learning_rate": 1.9130434782608697e-05, + "loss": 0.6744828224182129, + "step": 67 + }, + { + "epoch": 0.198359161349134, + "grad_norm": 0.5871018767356873, + "learning_rate": 1.9420289855072467e-05, + "loss": 0.7317330837249756, + "step": 68 + }, + { + "epoch": 0.20127620783956243, + "grad_norm": 1.0744612216949463, + "learning_rate": 1.9710144927536236e-05, + "loss": 0.6617178916931152, + "step": 69 + }, + { + "epoch": 0.2041932543299909, + "grad_norm": 0.675946831703186, + "learning_rate": 2e-05, + "loss": 0.7615712881088257, + "step": 70 + }, + { + "epoch": 0.2071103008204193, + "grad_norm": 0.7663411498069763, + "learning_rate": 1.9999870372100614e-05, + "loss": 0.7131291627883911, + "step": 71 + }, + { + "epoch": 0.21002734731084777, + "grad_norm": 0.6725395321846008, + "learning_rate": 1.9999481491763123e-05, + "loss": 0.7452989816665649, + "step": 72 + }, + { + "epoch": 0.21294439380127622, + "grad_norm": 0.6505664587020874, + "learning_rate": 1.9998833369069483e-05, + "loss": 0.7477136850357056, + "step": 73 + }, + { + "epoch": 0.21586144029170465, + "grad_norm": 0.7032860517501831, + "learning_rate": 1.9997926020822643e-05, + "loss": 0.6854275465011597, + "step": 74 + }, + { + "epoch": 0.2187784867821331, + "grad_norm": 0.645345151424408, + "learning_rate": 1.999675947054614e-05, + "loss": 0.7552425265312195, + "step": 75 + }, + { + "epoch": 0.22169553327256153, + "grad_norm": 0.6620492935180664, + "learning_rate": 1.9995333748483464e-05, + "loss": 0.7262853384017944, + "step": 76 + }, + { + "epoch": 0.22461257976298998, + "grad_norm": 0.6511455774307251, + "learning_rate": 1.9993648891597284e-05, + "loss": 0.7591732144355774, + "step": 77 + }, + { + "epoch": 0.2275296262534184, + "grad_norm": 0.6775254011154175, + "learning_rate": 1.9991704943568497e-05, + "loss": 0.7498704195022583, + "step": 78 + }, + { + "epoch": 0.23044667274384686, + "grad_norm": 0.8199896216392517, + "learning_rate": 1.9989501954795076e-05, + "loss": 0.7238684296607971, + "step": 79 + }, + { + "epoch": 0.2333637192342753, + "grad_norm": 0.8197569847106934, + "learning_rate": 1.998703998239079e-05, + "loss": 0.7028778195381165, + "step": 80 + }, + { + "epoch": 0.23628076572470375, + "grad_norm": 0.6602625250816345, + "learning_rate": 1.9984319090183692e-05, + "loss": 0.8842703104019165, + "step": 81 + }, + { + "epoch": 0.23919781221513217, + "grad_norm": 0.9587129354476929, + "learning_rate": 1.99813393487145e-05, + "loss": 0.732614278793335, + "step": 82 + }, + { + "epoch": 0.24211485870556063, + "grad_norm": 0.6822189092636108, + "learning_rate": 1.997810083523473e-05, + "loss": 0.7544928193092346, + "step": 83 + }, + { + "epoch": 0.24503190519598905, + "grad_norm": 0.8980082869529724, + "learning_rate": 1.9974603633704726e-05, + "loss": 0.6704054474830627, + "step": 84 + }, + { + "epoch": 0.2479489516864175, + "grad_norm": 0.7413425445556641, + "learning_rate": 1.9970847834791472e-05, + "loss": 0.693661093711853, + "step": 85 + }, + { + "epoch": 0.25086599817684596, + "grad_norm": 0.8314999341964722, + "learning_rate": 1.9966833535866223e-05, + "loss": 0.667654275894165, + "step": 86 + }, + { + "epoch": 0.25378304466727436, + "grad_norm": 0.7972444891929626, + "learning_rate": 1.9962560841002013e-05, + "loss": 0.8403134942054749, + "step": 87 + }, + { + "epoch": 0.2567000911577028, + "grad_norm": 0.8519951701164246, + "learning_rate": 1.995802986097093e-05, + "loss": 0.6897370219230652, + "step": 88 + }, + { + "epoch": 0.25961713764813127, + "grad_norm": 0.8268933892250061, + "learning_rate": 1.995324071324126e-05, + "loss": 0.6690632700920105, + "step": 89 + }, + { + "epoch": 0.2625341841385597, + "grad_norm": 0.7133983969688416, + "learning_rate": 1.9948193521974436e-05, + "loss": 0.6314147114753723, + "step": 90 + }, + { + "epoch": 0.2654512306289881, + "grad_norm": 0.889302134513855, + "learning_rate": 1.9942888418021814e-05, + "loss": 0.7389825582504272, + "step": 91 + }, + { + "epoch": 0.2683682771194166, + "grad_norm": 0.7022432088851929, + "learning_rate": 1.99373255389213e-05, + "loss": 0.6916261911392212, + "step": 92 + }, + { + "epoch": 0.27128532360984503, + "grad_norm": 0.696432888507843, + "learning_rate": 1.9931505028893748e-05, + "loss": 0.6908476948738098, + "step": 93 + }, + { + "epoch": 0.2742023701002735, + "grad_norm": 0.7667419910430908, + "learning_rate": 1.9925427038839267e-05, + "loss": 0.6500837206840515, + "step": 94 + }, + { + "epoch": 0.27711941659070194, + "grad_norm": 0.6974894404411316, + "learning_rate": 1.9919091726333265e-05, + "loss": 0.7059191465377808, + "step": 95 + }, + { + "epoch": 0.28003646308113034, + "grad_norm": 0.7047077417373657, + "learning_rate": 1.9912499255622397e-05, + "loss": 0.6287837624549866, + "step": 96 + }, + { + "epoch": 0.2829535095715588, + "grad_norm": 0.7729557156562805, + "learning_rate": 1.990564979762029e-05, + "loss": 0.6738612055778503, + "step": 97 + }, + { + "epoch": 0.28587055606198725, + "grad_norm": 0.7020529508590698, + "learning_rate": 1.989854352990311e-05, + "loss": 0.662042498588562, + "step": 98 + }, + { + "epoch": 0.2887876025524157, + "grad_norm": 0.7369800209999084, + "learning_rate": 1.9891180636704975e-05, + "loss": 0.6246830821037292, + "step": 99 + }, + { + "epoch": 0.2917046490428441, + "grad_norm": 0.7412623167037964, + "learning_rate": 1.9883561308913154e-05, + "loss": 0.6623879075050354, + "step": 100 + }, + { + "epoch": 0.2917046490428441, + "eval_loss": 0.6552971005439758, + "eval_runtime": 966.7072, + "eval_samples_per_second": 0.654, + "eval_steps_per_second": 0.654, + "step": 100 + }, + { + "epoch": 0.29462169553327255, + "grad_norm": 0.8428792953491211, + "learning_rate": 1.987568574406314e-05, + "loss": 0.6312171816825867, + "step": 101 + }, + { + "epoch": 0.297538742023701, + "grad_norm": 0.6948133707046509, + "learning_rate": 1.9867554146333517e-05, + "loss": 0.6266146898269653, + "step": 102 + }, + { + "epoch": 0.30045578851412946, + "grad_norm": 1.3897597789764404, + "learning_rate": 1.985916672654068e-05, + "loss": 0.6669265031814575, + "step": 103 + }, + { + "epoch": 0.30337283500455786, + "grad_norm": 0.8838400840759277, + "learning_rate": 1.985052370213334e-05, + "loss": 0.6601086854934692, + "step": 104 + }, + { + "epoch": 0.3062898814949863, + "grad_norm": 0.8471395373344421, + "learning_rate": 1.9841625297186925e-05, + "loss": 0.5984431505203247, + "step": 105 + }, + { + "epoch": 0.30920692798541477, + "grad_norm": 0.8940042853355408, + "learning_rate": 1.983247174239774e-05, + "loss": 0.7223822474479675, + "step": 106 + }, + { + "epoch": 0.3121239744758432, + "grad_norm": 0.7833696603775024, + "learning_rate": 1.9823063275076998e-05, + "loss": 0.6868705749511719, + "step": 107 + }, + { + "epoch": 0.3150410209662716, + "grad_norm": 0.8794649243354797, + "learning_rate": 1.9813400139144673e-05, + "loss": 0.6246675848960876, + "step": 108 + }, + { + "epoch": 0.3179580674567001, + "grad_norm": 0.8126057982444763, + "learning_rate": 1.9803482585123165e-05, + "loss": 0.5908697247505188, + "step": 109 + }, + { + "epoch": 0.32087511394712853, + "grad_norm": 0.7947676777839661, + "learning_rate": 1.979331087013082e-05, + "loss": 0.5751246809959412, + "step": 110 + }, + { + "epoch": 0.323792160437557, + "grad_norm": 0.713545560836792, + "learning_rate": 1.978288525787524e-05, + "loss": 0.6081106066703796, + "step": 111 + }, + { + "epoch": 0.32670920692798544, + "grad_norm": 1.011828064918518, + "learning_rate": 1.977220601864647e-05, + "loss": 0.7039169669151306, + "step": 112 + }, + { + "epoch": 0.32962625341841384, + "grad_norm": 0.730570912361145, + "learning_rate": 1.9761273429309982e-05, + "loss": 0.6140255928039551, + "step": 113 + }, + { + "epoch": 0.3325432999088423, + "grad_norm": 1.059688687324524, + "learning_rate": 1.9750087773299492e-05, + "loss": 0.648114025592804, + "step": 114 + }, + { + "epoch": 0.33546034639927075, + "grad_norm": 0.9336895942687988, + "learning_rate": 1.973864934060962e-05, + "loss": 0.622555673122406, + "step": 115 + }, + { + "epoch": 0.3383773928896992, + "grad_norm": 0.7195945978164673, + "learning_rate": 1.9726958427788367e-05, + "loss": 0.70485520362854, + "step": 116 + }, + { + "epoch": 0.3412944393801276, + "grad_norm": 0.8101872801780701, + "learning_rate": 1.971501533792942e-05, + "loss": 0.6958848834037781, + "step": 117 + }, + { + "epoch": 0.34421148587055606, + "grad_norm": 1.6075212955474854, + "learning_rate": 1.970282038066432e-05, + "loss": 0.6021550893783569, + "step": 118 + }, + { + "epoch": 0.3471285323609845, + "grad_norm": 0.7881433963775635, + "learning_rate": 1.9690373872154396e-05, + "loss": 0.6449777483940125, + "step": 119 + }, + { + "epoch": 0.35004557885141296, + "grad_norm": 1.014639973640442, + "learning_rate": 1.9677676135082606e-05, + "loss": 0.5939379930496216, + "step": 120 + }, + { + "epoch": 0.35296262534184136, + "grad_norm": 0.8198449611663818, + "learning_rate": 1.9664727498645144e-05, + "loss": 0.6210286617279053, + "step": 121 + }, + { + "epoch": 0.3558796718322698, + "grad_norm": 1.0194576978683472, + "learning_rate": 1.9651528298542918e-05, + "loss": 0.624247670173645, + "step": 122 + }, + { + "epoch": 0.35879671832269827, + "grad_norm": 0.7963470220565796, + "learning_rate": 1.9638078876972842e-05, + "loss": 0.6479315757751465, + "step": 123 + }, + { + "epoch": 0.3617137648131267, + "grad_norm": 0.9007541537284851, + "learning_rate": 1.9624379582618976e-05, + "loss": 0.6131505370140076, + "step": 124 + }, + { + "epoch": 0.3646308113035551, + "grad_norm": 0.8712120056152344, + "learning_rate": 1.9610430770643464e-05, + "loss": 0.6249448657035828, + "step": 125 + }, + { + "epoch": 0.3675478577939836, + "grad_norm": 1.1482540369033813, + "learning_rate": 1.9596232802677347e-05, + "loss": 0.5844688415527344, + "step": 126 + }, + { + "epoch": 0.37046490428441203, + "grad_norm": 0.8662379384040833, + "learning_rate": 1.9581786046811175e-05, + "loss": 0.6573485732078552, + "step": 127 + }, + { + "epoch": 0.3733819507748405, + "grad_norm": 0.8191388845443726, + "learning_rate": 1.9567090877585477e-05, + "loss": 0.5896862745285034, + "step": 128 + }, + { + "epoch": 0.37629899726526894, + "grad_norm": 1.0187078714370728, + "learning_rate": 1.955214767598103e-05, + "loss": 0.613490879535675, + "step": 129 + }, + { + "epoch": 0.37921604375569734, + "grad_norm": 0.8444119691848755, + "learning_rate": 1.953695682940901e-05, + "loss": 0.727687656879425, + "step": 130 + }, + { + "epoch": 0.3821330902461258, + "grad_norm": 0.74753737449646, + "learning_rate": 1.9521518731700913e-05, + "loss": 0.6102436780929565, + "step": 131 + }, + { + "epoch": 0.38505013673655425, + "grad_norm": 1.0166202783584595, + "learning_rate": 1.9505833783098378e-05, + "loss": 0.6244844198226929, + "step": 132 + }, + { + "epoch": 0.3879671832269827, + "grad_norm": 0.8175772428512573, + "learning_rate": 1.9489902390242793e-05, + "loss": 0.5939282178878784, + "step": 133 + }, + { + "epoch": 0.3908842297174111, + "grad_norm": 1.0177713632583618, + "learning_rate": 1.947372496616476e-05, + "loss": 0.6418229937553406, + "step": 134 + }, + { + "epoch": 0.39380127620783956, + "grad_norm": 0.8652453422546387, + "learning_rate": 1.9457301930273376e-05, + "loss": 0.5870395302772522, + "step": 135 + }, + { + "epoch": 0.396718322698268, + "grad_norm": 0.8378894925117493, + "learning_rate": 1.9440633708345365e-05, + "loss": 0.6480278372764587, + "step": 136 + }, + { + "epoch": 0.39963536918869647, + "grad_norm": 0.8303541541099548, + "learning_rate": 1.9423720732514052e-05, + "loss": 0.6191359758377075, + "step": 137 + }, + { + "epoch": 0.40255241567912486, + "grad_norm": 0.8576734662055969, + "learning_rate": 1.9406563441258145e-05, + "loss": 0.5696198344230652, + "step": 138 + }, + { + "epoch": 0.4054694621695533, + "grad_norm": 0.9558727145195007, + "learning_rate": 1.9389162279390362e-05, + "loss": 0.6177623271942139, + "step": 139 + }, + { + "epoch": 0.4083865086599818, + "grad_norm": 0.7046042084693909, + "learning_rate": 1.9371517698045922e-05, + "loss": 0.5836521983146667, + "step": 140 + }, + { + "epoch": 0.4113035551504102, + "grad_norm": 1.0522717237472534, + "learning_rate": 1.935363015467082e-05, + "loss": 0.5728275775909424, + "step": 141 + }, + { + "epoch": 0.4142206016408386, + "grad_norm": 0.9554787874221802, + "learning_rate": 1.933550011301e-05, + "loss": 0.632586658000946, + "step": 142 + }, + { + "epoch": 0.4171376481312671, + "grad_norm": 0.8874214291572571, + "learning_rate": 1.9317128043095293e-05, + "loss": 0.5850118398666382, + "step": 143 + }, + { + "epoch": 0.42005469462169553, + "grad_norm": 1.0708963871002197, + "learning_rate": 1.9298514421233276e-05, + "loss": 0.6260685324668884, + "step": 144 + }, + { + "epoch": 0.422971741112124, + "grad_norm": 0.8135736584663391, + "learning_rate": 1.9279659729992888e-05, + "loss": 0.6031094193458557, + "step": 145 + }, + { + "epoch": 0.42588878760255244, + "grad_norm": 0.7971774339675903, + "learning_rate": 1.9260564458192926e-05, + "loss": 0.6101322770118713, + "step": 146 + }, + { + "epoch": 0.42880583409298084, + "grad_norm": 0.9374974966049194, + "learning_rate": 1.9241229100889397e-05, + "loss": 0.5836313366889954, + "step": 147 + }, + { + "epoch": 0.4317228805834093, + "grad_norm": 0.8043425679206848, + "learning_rate": 1.9221654159362636e-05, + "loss": 0.6181215047836304, + "step": 148 + }, + { + "epoch": 0.43463992707383775, + "grad_norm": 0.8923380374908447, + "learning_rate": 1.920184014110436e-05, + "loss": 0.6149677634239197, + "step": 149 + }, + { + "epoch": 0.4375569735642662, + "grad_norm": 0.8908132314682007, + "learning_rate": 1.918178755980449e-05, + "loss": 0.5899742841720581, + "step": 150 + }, + { + "epoch": 0.4375569735642662, + "eval_loss": 0.5903874635696411, + "eval_runtime": 1186.9542, + "eval_samples_per_second": 0.532, + "eval_steps_per_second": 0.532, + "step": 150 + }, + { + "epoch": 0.4404740200546946, + "grad_norm": 1.060531497001648, + "learning_rate": 1.9161496935337808e-05, + "loss": 0.5852696895599365, + "step": 151 + }, + { + "epoch": 0.44339106654512306, + "grad_norm": 0.9723032712936401, + "learning_rate": 1.914096879375053e-05, + "loss": 0.5822056531906128, + "step": 152 + }, + { + "epoch": 0.4463081130355515, + "grad_norm": 0.9519931674003601, + "learning_rate": 1.912020366724663e-05, + "loss": 0.6183493137359619, + "step": 153 + }, + { + "epoch": 0.44922515952597997, + "grad_norm": 0.8282918334007263, + "learning_rate": 1.9099202094174055e-05, + "loss": 0.6229860782623291, + "step": 154 + }, + { + "epoch": 0.45214220601640837, + "grad_norm": 0.9251292943954468, + "learning_rate": 1.907796461901076e-05, + "loss": 0.6552959680557251, + "step": 155 + }, + { + "epoch": 0.4550592525068368, + "grad_norm": 1.0349540710449219, + "learning_rate": 1.9056491792350606e-05, + "loss": 0.6170098781585693, + "step": 156 + }, + { + "epoch": 0.4579762989972653, + "grad_norm": 0.8720711469650269, + "learning_rate": 1.9034784170889076e-05, + "loss": 0.5870137810707092, + "step": 157 + }, + { + "epoch": 0.46089334548769373, + "grad_norm": 1.0785977840423584, + "learning_rate": 1.9012842317408843e-05, + "loss": 0.5515124201774597, + "step": 158 + }, + { + "epoch": 0.4638103919781221, + "grad_norm": 1.0634154081344604, + "learning_rate": 1.8990666800765187e-05, + "loss": 0.6073828339576721, + "step": 159 + }, + { + "epoch": 0.4667274384685506, + "grad_norm": 0.8770879507064819, + "learning_rate": 1.896825819587123e-05, + "loss": 0.5960907936096191, + "step": 160 + }, + { + "epoch": 0.46964448495897904, + "grad_norm": 1.1225898265838623, + "learning_rate": 1.894561708368305e-05, + "loss": 0.545990526676178, + "step": 161 + }, + { + "epoch": 0.4725615314494075, + "grad_norm": 0.9373893141746521, + "learning_rate": 1.8922744051184613e-05, + "loss": 0.5566108822822571, + "step": 162 + }, + { + "epoch": 0.4754785779398359, + "grad_norm": 1.5016087293624878, + "learning_rate": 1.8899639691372545e-05, + "loss": 0.558845043182373, + "step": 163 + }, + { + "epoch": 0.47839562443026434, + "grad_norm": 0.903020977973938, + "learning_rate": 1.8876304603240773e-05, + "loss": 0.6824233531951904, + "step": 164 + }, + { + "epoch": 0.4813126709206928, + "grad_norm": 0.8239623308181763, + "learning_rate": 1.8852739391764993e-05, + "loss": 0.5630610585212708, + "step": 165 + }, + { + "epoch": 0.48422971741112125, + "grad_norm": 0.926069438457489, + "learning_rate": 1.882894466788697e-05, + "loss": 0.6211802363395691, + "step": 166 + }, + { + "epoch": 0.4871467639015497, + "grad_norm": 1.0098828077316284, + "learning_rate": 1.8804921048498722e-05, + "loss": 0.5513257384300232, + "step": 167 + }, + { + "epoch": 0.4900638103919781, + "grad_norm": 0.9228141903877258, + "learning_rate": 1.8780669156426517e-05, + "loss": 0.6197121739387512, + "step": 168 + }, + { + "epoch": 0.49298085688240656, + "grad_norm": 1.0551754236221313, + "learning_rate": 1.8756189620414712e-05, + "loss": 0.5221806764602661, + "step": 169 + }, + { + "epoch": 0.495897903372835, + "grad_norm": 0.9017496109008789, + "learning_rate": 1.873148307510948e-05, + "loss": 0.5766995549201965, + "step": 170 + }, + { + "epoch": 0.49881494986326347, + "grad_norm": 0.9704970717430115, + "learning_rate": 1.870655016104233e-05, + "loss": 0.6514763832092285, + "step": 171 + }, + { + "epoch": 0.5017319963536919, + "grad_norm": 0.9972712397575378, + "learning_rate": 1.8681391524613518e-05, + "loss": 0.5273895263671875, + "step": 172 + }, + { + "epoch": 0.5046490428441204, + "grad_norm": 0.9473339319229126, + "learning_rate": 1.8656007818075288e-05, + "loss": 0.5548599362373352, + "step": 173 + }, + { + "epoch": 0.5075660893345487, + "grad_norm": 1.2493574619293213, + "learning_rate": 1.8630399699514944e-05, + "loss": 0.5593586564064026, + "step": 174 + }, + { + "epoch": 0.5104831358249772, + "grad_norm": 1.2766696214675903, + "learning_rate": 1.860456783283781e-05, + "loss": 0.6054630279541016, + "step": 175 + }, + { + "epoch": 0.5134001823154056, + "grad_norm": 0.9555240869522095, + "learning_rate": 1.857851288775002e-05, + "loss": 0.508592963218689, + "step": 176 + }, + { + "epoch": 0.5163172288058341, + "grad_norm": 1.260219931602478, + "learning_rate": 1.8552235539741118e-05, + "loss": 0.5532065629959106, + "step": 177 + }, + { + "epoch": 0.5192342752962625, + "grad_norm": 1.1859954595565796, + "learning_rate": 1.8525736470066595e-05, + "loss": 0.5683344006538391, + "step": 178 + }, + { + "epoch": 0.522151321786691, + "grad_norm": 1.3044344186782837, + "learning_rate": 1.8499016365730203e-05, + "loss": 0.5281959772109985, + "step": 179 + }, + { + "epoch": 0.5250683682771194, + "grad_norm": 1.3049921989440918, + "learning_rate": 1.8472075919466137e-05, + "loss": 0.49621230363845825, + "step": 180 + }, + { + "epoch": 0.5279854147675479, + "grad_norm": 1.0488537549972534, + "learning_rate": 1.844491582972109e-05, + "loss": 0.6194032430648804, + "step": 181 + }, + { + "epoch": 0.5309024612579762, + "grad_norm": 1.5553455352783203, + "learning_rate": 1.8417536800636138e-05, + "loss": 0.5645846724510193, + "step": 182 + }, + { + "epoch": 0.5338195077484047, + "grad_norm": 1.2673912048339844, + "learning_rate": 1.8389939542028484e-05, + "loss": 0.6267315745353699, + "step": 183 + }, + { + "epoch": 0.5367365542388332, + "grad_norm": 1.0273847579956055, + "learning_rate": 1.8362124769373064e-05, + "loss": 0.5256403684616089, + "step": 184 + }, + { + "epoch": 0.5396536007292616, + "grad_norm": 1.006093978881836, + "learning_rate": 1.8334093203783986e-05, + "loss": 0.5916382074356079, + "step": 185 + }, + { + "epoch": 0.5425706472196901, + "grad_norm": 1.2740857601165771, + "learning_rate": 1.8305845571995843e-05, + "loss": 0.581648588180542, + "step": 186 + }, + { + "epoch": 0.5454876937101185, + "grad_norm": 1.494248390197754, + "learning_rate": 1.8277382606344872e-05, + "loss": 0.4824523627758026, + "step": 187 + }, + { + "epoch": 0.548404740200547, + "grad_norm": 1.1862496137619019, + "learning_rate": 1.824870504474996e-05, + "loss": 0.5531858205795288, + "step": 188 + }, + { + "epoch": 0.5513217866909754, + "grad_norm": 3.503049373626709, + "learning_rate": 1.8219813630693523e-05, + "loss": 0.6308296918869019, + "step": 189 + }, + { + "epoch": 0.5542388331814039, + "grad_norm": 1.7544710636138916, + "learning_rate": 1.819070911320222e-05, + "loss": 0.6146273016929626, + "step": 190 + }, + { + "epoch": 0.5571558796718322, + "grad_norm": 1.3367774486541748, + "learning_rate": 1.8161392246827546e-05, + "loss": 0.5848966240882874, + "step": 191 + }, + { + "epoch": 0.5600729261622607, + "grad_norm": 1.696418046951294, + "learning_rate": 1.8131863791626263e-05, + "loss": 0.6621730327606201, + "step": 192 + }, + { + "epoch": 0.5629899726526891, + "grad_norm": 1.360052227973938, + "learning_rate": 1.8102124513140694e-05, + "loss": 0.5972204208374023, + "step": 193 + }, + { + "epoch": 0.5659070191431176, + "grad_norm": 1.5376263856887817, + "learning_rate": 1.807217518237888e-05, + "loss": 0.4938785433769226, + "step": 194 + }, + { + "epoch": 0.568824065633546, + "grad_norm": 1.2249681949615479, + "learning_rate": 1.8042016575794585e-05, + "loss": 0.5366095304489136, + "step": 195 + }, + { + "epoch": 0.5717411121239745, + "grad_norm": 1.7868080139160156, + "learning_rate": 1.8011649475267178e-05, + "loss": 0.5116773843765259, + "step": 196 + }, + { + "epoch": 0.574658158614403, + "grad_norm": 2.369993209838867, + "learning_rate": 1.7981074668081345e-05, + "loss": 0.49072742462158203, + "step": 197 + }, + { + "epoch": 0.5775752051048314, + "grad_norm": 1.0168434381484985, + "learning_rate": 1.7950292946906695e-05, + "loss": 0.5691611170768738, + "step": 198 + }, + { + "epoch": 0.5804922515952597, + "grad_norm": 1.2990851402282715, + "learning_rate": 1.7919305109777195e-05, + "loss": 0.5515039563179016, + "step": 199 + }, + { + "epoch": 0.5834092980856882, + "grad_norm": 1.4859853982925415, + "learning_rate": 1.7888111960070493e-05, + "loss": 0.5017011165618896, + "step": 200 + }, + { + "epoch": 0.5834092980856882, + "eval_loss": 0.5414339303970337, + "eval_runtime": 1180.7894, + "eval_samples_per_second": 0.535, + "eval_steps_per_second": 0.535, + "step": 200 + }, + { + "epoch": 0.5863263445761167, + "grad_norm": 1.0065829753875732, + "learning_rate": 1.7856714306487088e-05, + "loss": 0.5677731037139893, + "step": 201 + }, + { + "epoch": 0.5892433910665451, + "grad_norm": 1.1727538108825684, + "learning_rate": 1.7825112963029352e-05, + "loss": 0.4525509476661682, + "step": 202 + }, + { + "epoch": 0.5921604375569736, + "grad_norm": 1.3376752138137817, + "learning_rate": 1.7793308748980437e-05, + "loss": 0.5208959579467773, + "step": 203 + }, + { + "epoch": 0.595077484047402, + "grad_norm": 0.9196159839630127, + "learning_rate": 1.776130248888304e-05, + "loss": 0.6033903360366821, + "step": 204 + }, + { + "epoch": 0.5979945305378305, + "grad_norm": 1.0750919580459595, + "learning_rate": 1.772909501251801e-05, + "loss": 0.5449609160423279, + "step": 205 + }, + { + "epoch": 0.6009115770282589, + "grad_norm": 1.2459467649459839, + "learning_rate": 1.769668715488285e-05, + "loss": 0.5685338377952576, + "step": 206 + }, + { + "epoch": 0.6038286235186874, + "grad_norm": 1.1690552234649658, + "learning_rate": 1.766407975617006e-05, + "loss": 0.5240382552146912, + "step": 207 + }, + { + "epoch": 0.6067456700091157, + "grad_norm": 1.0816599130630493, + "learning_rate": 1.7631273661745362e-05, + "loss": 0.6802893877029419, + "step": 208 + }, + { + "epoch": 0.6096627164995442, + "grad_norm": 1.3662947416305542, + "learning_rate": 1.7598269722125775e-05, + "loss": 0.48193931579589844, + "step": 209 + }, + { + "epoch": 0.6125797629899726, + "grad_norm": 0.9364766478538513, + "learning_rate": 1.7565068792957576e-05, + "loss": 0.5675849914550781, + "step": 210 + }, + { + "epoch": 0.6154968094804011, + "grad_norm": 1.123828411102295, + "learning_rate": 1.75316717349941e-05, + "loss": 0.5474762916564941, + "step": 211 + }, + { + "epoch": 0.6184138559708295, + "grad_norm": 1.1924363374710083, + "learning_rate": 1.749807941407345e-05, + "loss": 0.4918654263019562, + "step": 212 + }, + { + "epoch": 0.621330902461258, + "grad_norm": 1.101293921470642, + "learning_rate": 1.7464292701096014e-05, + "loss": 0.5742691159248352, + "step": 213 + }, + { + "epoch": 0.6242479489516864, + "grad_norm": 1.7374963760375977, + "learning_rate": 1.7430312472001928e-05, + "loss": 0.5828965902328491, + "step": 214 + }, + { + "epoch": 0.6271649954421149, + "grad_norm": 1.3195666074752808, + "learning_rate": 1.739613960774833e-05, + "loss": 0.5265159010887146, + "step": 215 + }, + { + "epoch": 0.6300820419325432, + "grad_norm": 1.254686713218689, + "learning_rate": 1.7361774994286545e-05, + "loss": 0.4929371476173401, + "step": 216 + }, + { + "epoch": 0.6329990884229717, + "grad_norm": 1.1476380825042725, + "learning_rate": 1.7327219522539102e-05, + "loss": 0.5060417652130127, + "step": 217 + }, + { + "epoch": 0.6359161349134002, + "grad_norm": 1.0914150476455688, + "learning_rate": 1.7292474088376643e-05, + "loss": 0.504043698310852, + "step": 218 + }, + { + "epoch": 0.6388331814038286, + "grad_norm": 1.1339508295059204, + "learning_rate": 1.7257539592594698e-05, + "loss": 0.4797310531139374, + "step": 219 + }, + { + "epoch": 0.6417502278942571, + "grad_norm": 1.0805399417877197, + "learning_rate": 1.722241694089033e-05, + "loss": 0.5878555178642273, + "step": 220 + }, + { + "epoch": 0.6446672743846855, + "grad_norm": 1.8615056276321411, + "learning_rate": 1.718710704383865e-05, + "loss": 0.5005823969841003, + "step": 221 + }, + { + "epoch": 0.647584320875114, + "grad_norm": 1.1445401906967163, + "learning_rate": 1.7151610816869214e-05, + "loss": 0.4949319064617157, + "step": 222 + }, + { + "epoch": 0.6505013673655424, + "grad_norm": 0.9726515412330627, + "learning_rate": 1.711592918024229e-05, + "loss": 0.5073204040527344, + "step": 223 + }, + { + "epoch": 0.6534184138559709, + "grad_norm": 1.4491140842437744, + "learning_rate": 1.7080063059024998e-05, + "loss": 0.47885262966156006, + "step": 224 + }, + { + "epoch": 0.6563354603463992, + "grad_norm": 1.0070592164993286, + "learning_rate": 1.7044013383067327e-05, + "loss": 0.5775837898254395, + "step": 225 + }, + { + "epoch": 0.6592525068368277, + "grad_norm": 0.966221272945404, + "learning_rate": 1.7007781086978037e-05, + "loss": 0.5050399899482727, + "step": 226 + }, + { + "epoch": 0.6621695533272561, + "grad_norm": 0.9808815121650696, + "learning_rate": 1.6971367110100407e-05, + "loss": 0.5737045407295227, + "step": 227 + }, + { + "epoch": 0.6650865998176846, + "grad_norm": 1.0158127546310425, + "learning_rate": 1.6934772396487906e-05, + "loss": 0.48077821731567383, + "step": 228 + }, + { + "epoch": 0.668003646308113, + "grad_norm": 1.32015860080719, + "learning_rate": 1.6897997894879706e-05, + "loss": 0.5614925026893616, + "step": 229 + }, + { + "epoch": 0.6709206927985415, + "grad_norm": 1.1055903434753418, + "learning_rate": 1.686104455867608e-05, + "loss": 0.4970760643482208, + "step": 230 + }, + { + "epoch": 0.67383773928897, + "grad_norm": 1.0804500579833984, + "learning_rate": 1.682391334591371e-05, + "loss": 0.5540452003479004, + "step": 231 + }, + { + "epoch": 0.6767547857793984, + "grad_norm": 1.1906245946884155, + "learning_rate": 1.6786605219240807e-05, + "loss": 0.5778501033782959, + "step": 232 + }, + { + "epoch": 0.6796718322698267, + "grad_norm": 0.9758645296096802, + "learning_rate": 1.6749121145892192e-05, + "loss": 0.49073565006256104, + "step": 233 + }, + { + "epoch": 0.6825888787602552, + "grad_norm": 1.1678364276885986, + "learning_rate": 1.6711462097664207e-05, + "loss": 0.4828741252422333, + "step": 234 + }, + { + "epoch": 0.6855059252506837, + "grad_norm": 1.148301362991333, + "learning_rate": 1.6673629050889507e-05, + "loss": 0.5143818855285645, + "step": 235 + }, + { + "epoch": 0.6884229717411121, + "grad_norm": 1.005898356437683, + "learning_rate": 1.6635622986411776e-05, + "loss": 0.5301160216331482, + "step": 236 + }, + { + "epoch": 0.6913400182315406, + "grad_norm": 1.2227320671081543, + "learning_rate": 1.659744488956027e-05, + "loss": 0.4800386130809784, + "step": 237 + }, + { + "epoch": 0.694257064721969, + "grad_norm": 0.986456573009491, + "learning_rate": 1.6559095750124296e-05, + "loss": 0.5098081827163696, + "step": 238 + }, + { + "epoch": 0.6971741112123975, + "grad_norm": 1.1474376916885376, + "learning_rate": 1.6520576562327518e-05, + "loss": 0.5147273540496826, + "step": 239 + }, + { + "epoch": 0.7000911577028259, + "grad_norm": 1.10917067527771, + "learning_rate": 1.6481888324802223e-05, + "loss": 0.5023190379142761, + "step": 240 + }, + { + "epoch": 0.7030082041932544, + "grad_norm": 1.2339262962341309, + "learning_rate": 1.644303204056341e-05, + "loss": 0.5282092690467834, + "step": 241 + }, + { + "epoch": 0.7059252506836827, + "grad_norm": 0.997941255569458, + "learning_rate": 1.640400871698277e-05, + "loss": 0.5635963082313538, + "step": 242 + }, + { + "epoch": 0.7088422971741112, + "grad_norm": 1.0345823764801025, + "learning_rate": 1.63648193657626e-05, + "loss": 0.5577977895736694, + "step": 243 + }, + { + "epoch": 0.7117593436645396, + "grad_norm": 1.3468303680419922, + "learning_rate": 1.6325465002909554e-05, + "loss": 0.4365362524986267, + "step": 244 + }, + { + "epoch": 0.7146763901549681, + "grad_norm": 1.2817128896713257, + "learning_rate": 1.628594664870831e-05, + "loss": 0.46069926023483276, + "step": 245 + }, + { + "epoch": 0.7175934366453965, + "grad_norm": 1.043311357498169, + "learning_rate": 1.6246265327695117e-05, + "loss": 0.5476971864700317, + "step": 246 + }, + { + "epoch": 0.720510483135825, + "grad_norm": 1.0297389030456543, + "learning_rate": 1.620642206863124e-05, + "loss": 0.48051249980926514, + "step": 247 + }, + { + "epoch": 0.7234275296262535, + "grad_norm": 1.4869836568832397, + "learning_rate": 1.6166417904476257e-05, + "loss": 0.5683314800262451, + "step": 248 + }, + { + "epoch": 0.7263445761166819, + "grad_norm": 1.0628005266189575, + "learning_rate": 1.6126253872361336e-05, + "loss": 0.5277887582778931, + "step": 249 + }, + { + "epoch": 0.7292616226071102, + "grad_norm": 1.2682170867919922, + "learning_rate": 1.608593101356229e-05, + "loss": 0.5048879384994507, + "step": 250 + }, + { + "epoch": 0.7292616226071102, + "eval_loss": 0.5038471221923828, + "eval_runtime": 1175.0375, + "eval_samples_per_second": 0.538, + "eval_steps_per_second": 0.538, + "step": 250 + }, + { + "epoch": 0.7321786690975387, + "grad_norm": 1.7376199960708618, + "learning_rate": 1.6045450373472626e-05, + "loss": 0.5093721151351929, + "step": 251 + }, + { + "epoch": 0.7350957155879672, + "grad_norm": 1.6047718524932861, + "learning_rate": 1.6004813001576405e-05, + "loss": 0.4796055555343628, + "step": 252 + }, + { + "epoch": 0.7380127620783956, + "grad_norm": 1.3582886457443237, + "learning_rate": 1.5964019951421058e-05, + "loss": 0.4733014702796936, + "step": 253 + }, + { + "epoch": 0.7409298085688241, + "grad_norm": 0.9468897581100464, + "learning_rate": 1.5923072280590072e-05, + "loss": 0.5312032103538513, + "step": 254 + }, + { + "epoch": 0.7438468550592525, + "grad_norm": 1.3890198469161987, + "learning_rate": 1.5881971050675547e-05, + "loss": 0.47576645016670227, + "step": 255 + }, + { + "epoch": 0.746763901549681, + "grad_norm": 1.782992959022522, + "learning_rate": 1.584071732725071e-05, + "loss": 0.5555092096328735, + "step": 256 + }, + { + "epoch": 0.7496809480401094, + "grad_norm": 1.1790621280670166, + "learning_rate": 1.5799312179842265e-05, + "loss": 0.5148727893829346, + "step": 257 + }, + { + "epoch": 0.7525979945305379, + "grad_norm": 1.446694254875183, + "learning_rate": 1.5757756681902664e-05, + "loss": 0.49939870834350586, + "step": 258 + }, + { + "epoch": 0.7555150410209662, + "grad_norm": 1.1786166429519653, + "learning_rate": 1.571605191078229e-05, + "loss": 0.562156081199646, + "step": 259 + }, + { + "epoch": 0.7584320875113947, + "grad_norm": 1.16925847530365, + "learning_rate": 1.567419894770151e-05, + "loss": 0.49580734968185425, + "step": 260 + }, + { + "epoch": 0.7613491340018231, + "grad_norm": 1.60944664478302, + "learning_rate": 1.5632198877722676e-05, + "loss": 0.4821680784225464, + "step": 261 + }, + { + "epoch": 0.7642661804922516, + "grad_norm": 1.3957884311676025, + "learning_rate": 1.5590052789721946e-05, + "loss": 0.4392276406288147, + "step": 262 + }, + { + "epoch": 0.76718322698268, + "grad_norm": 1.636195421218872, + "learning_rate": 1.5547761776361096e-05, + "loss": 0.39603114128112793, + "step": 263 + }, + { + "epoch": 0.7701002734731085, + "grad_norm": 1.496766448020935, + "learning_rate": 1.550532693405917e-05, + "loss": 0.4833749234676361, + "step": 264 + }, + { + "epoch": 0.773017319963537, + "grad_norm": 1.3587844371795654, + "learning_rate": 1.5462749362964058e-05, + "loss": 0.43738317489624023, + "step": 265 + }, + { + "epoch": 0.7759343664539654, + "grad_norm": 1.670704960823059, + "learning_rate": 1.5420030166923983e-05, + "loss": 0.4476737380027771, + "step": 266 + }, + { + "epoch": 0.7788514129443938, + "grad_norm": 1.2674932479858398, + "learning_rate": 1.537717045345888e-05, + "loss": 0.42266708612442017, + "step": 267 + }, + { + "epoch": 0.7817684594348222, + "grad_norm": 2.0639536380767822, + "learning_rate": 1.5334171333731666e-05, + "loss": 0.5245381593704224, + "step": 268 + }, + { + "epoch": 0.7846855059252507, + "grad_norm": 1.2091766595840454, + "learning_rate": 1.529103392251946e-05, + "loss": 0.5166443586349487, + "step": 269 + }, + { + "epoch": 0.7876025524156791, + "grad_norm": 1.1021631956100464, + "learning_rate": 1.5247759338184653e-05, + "loss": 0.5674265027046204, + "step": 270 + }, + { + "epoch": 0.7905195989061076, + "grad_norm": 1.3143829107284546, + "learning_rate": 1.520434870264595e-05, + "loss": 0.40855613350868225, + "step": 271 + }, + { + "epoch": 0.793436645396536, + "grad_norm": 1.1784812211990356, + "learning_rate": 1.5160803141349244e-05, + "loss": 0.4308925271034241, + "step": 272 + }, + { + "epoch": 0.7963536918869645, + "grad_norm": 2.1635706424713135, + "learning_rate": 1.5117123783238458e-05, + "loss": 0.45035502314567566, + "step": 273 + }, + { + "epoch": 0.7992707383773929, + "grad_norm": 1.569203495979309, + "learning_rate": 1.5073311760726287e-05, + "loss": 0.5095728635787964, + "step": 274 + }, + { + "epoch": 0.8021877848678214, + "grad_norm": 2.532621383666992, + "learning_rate": 1.5029368209664822e-05, + "loss": 0.496748685836792, + "step": 275 + }, + { + "epoch": 0.8051048313582497, + "grad_norm": 1.6312552690505981, + "learning_rate": 1.4985294269316098e-05, + "loss": 0.4972914159297943, + "step": 276 + }, + { + "epoch": 0.8080218778486782, + "grad_norm": 1.3996756076812744, + "learning_rate": 1.4941091082322579e-05, + "loss": 0.5589750409126282, + "step": 277 + }, + { + "epoch": 0.8109389243391066, + "grad_norm": 1.1288363933563232, + "learning_rate": 1.4896759794677526e-05, + "loss": 0.5349453687667847, + "step": 278 + }, + { + "epoch": 0.8138559708295351, + "grad_norm": 1.6913920640945435, + "learning_rate": 1.4852301555695268e-05, + "loss": 0.46511000394821167, + "step": 279 + }, + { + "epoch": 0.8167730173199635, + "grad_norm": 1.1913212537765503, + "learning_rate": 1.4807717517981439e-05, + "loss": 0.4715422987937927, + "step": 280 + }, + { + "epoch": 0.819690063810392, + "grad_norm": 1.1179691553115845, + "learning_rate": 1.476300883740307e-05, + "loss": 0.53330397605896, + "step": 281 + }, + { + "epoch": 0.8226071103008205, + "grad_norm": 1.7473797798156738, + "learning_rate": 1.4718176673058624e-05, + "loss": 0.47564437985420227, + "step": 282 + }, + { + "epoch": 0.8255241567912489, + "grad_norm": 1.2653177976608276, + "learning_rate": 1.4673222187247963e-05, + "loss": 0.46364277601242065, + "step": 283 + }, + { + "epoch": 0.8284412032816773, + "grad_norm": 1.2567330598831177, + "learning_rate": 1.4628146545442202e-05, + "loss": 0.4778091013431549, + "step": 284 + }, + { + "epoch": 0.8313582497721057, + "grad_norm": 1.5848406553268433, + "learning_rate": 1.4582950916253488e-05, + "loss": 0.4480203688144684, + "step": 285 + }, + { + "epoch": 0.8342752962625342, + "grad_norm": 1.3278183937072754, + "learning_rate": 1.453763647140472e-05, + "loss": 0.37945032119750977, + "step": 286 + }, + { + "epoch": 0.8371923427529626, + "grad_norm": 1.0961651802062988, + "learning_rate": 1.4492204385699155e-05, + "loss": 0.5306747555732727, + "step": 287 + }, + { + "epoch": 0.8401093892433911, + "grad_norm": 1.176276683807373, + "learning_rate": 1.4446655836989961e-05, + "loss": 0.49950045347213745, + "step": 288 + }, + { + "epoch": 0.8430264357338195, + "grad_norm": 1.2228269577026367, + "learning_rate": 1.4400992006149674e-05, + "loss": 0.494475394487381, + "step": 289 + }, + { + "epoch": 0.845943482224248, + "grad_norm": 1.1584209203720093, + "learning_rate": 1.4355214077039592e-05, + "loss": 0.44170859456062317, + "step": 290 + }, + { + "epoch": 0.8488605287146764, + "grad_norm": 1.2041938304901123, + "learning_rate": 1.4309323236479071e-05, + "loss": 0.4359871745109558, + "step": 291 + }, + { + "epoch": 0.8517775752051049, + "grad_norm": 1.279645562171936, + "learning_rate": 1.4263320674214762e-05, + "loss": 0.45031386613845825, + "step": 292 + }, + { + "epoch": 0.8546946216955332, + "grad_norm": 1.3958357572555542, + "learning_rate": 1.4217207582889769e-05, + "loss": 0.4832204580307007, + "step": 293 + }, + { + "epoch": 0.8576116681859617, + "grad_norm": 1.2788586616516113, + "learning_rate": 1.4170985158012725e-05, + "loss": 0.5154346227645874, + "step": 294 + }, + { + "epoch": 0.8605287146763901, + "grad_norm": 1.3634892702102661, + "learning_rate": 1.4124654597926795e-05, + "loss": 0.46777206659317017, + "step": 295 + }, + { + "epoch": 0.8634457611668186, + "grad_norm": 1.2719579935073853, + "learning_rate": 1.4078217103778619e-05, + "loss": 0.4247053265571594, + "step": 296 + }, + { + "epoch": 0.866362807657247, + "grad_norm": 2.890467643737793, + "learning_rate": 1.4031673879487161e-05, + "loss": 0.38349640369415283, + "step": 297 + }, + { + "epoch": 0.8692798541476755, + "grad_norm": 2.4354801177978516, + "learning_rate": 1.3985026131712499e-05, + "loss": 0.4134889543056488, + "step": 298 + }, + { + "epoch": 0.872196900638104, + "grad_norm": 1.0138323307037354, + "learning_rate": 1.3938275069824541e-05, + "loss": 0.5176680684089661, + "step": 299 + }, + { + "epoch": 0.8751139471285324, + "grad_norm": 1.2316186428070068, + "learning_rate": 1.389142190587168e-05, + "loss": 0.4818477928638458, + "step": 300 + }, + { + "epoch": 0.8751139471285324, + "eval_loss": 0.4752846360206604, + "eval_runtime": 1189.1666, + "eval_samples_per_second": 0.531, + "eval_steps_per_second": 0.531, + "step": 300 + }, + { + "epoch": 0.8780309936189608, + "grad_norm": 1.515487551689148, + "learning_rate": 1.384446785454936e-05, + "loss": 0.47766175866127014, + "step": 301 + }, + { + "epoch": 0.8809480401093892, + "grad_norm": 1.4357497692108154, + "learning_rate": 1.3797414133168591e-05, + "loss": 0.49297061562538147, + "step": 302 + }, + { + "epoch": 0.8838650865998177, + "grad_norm": 1.2523037195205688, + "learning_rate": 1.3750261961624383e-05, + "loss": 0.4629015326499939, + "step": 303 + }, + { + "epoch": 0.8867821330902461, + "grad_norm": 3.5790023803710938, + "learning_rate": 1.3703012562364124e-05, + "loss": 0.3773120045661926, + "step": 304 + }, + { + "epoch": 0.8896991795806746, + "grad_norm": 1.9305704832077026, + "learning_rate": 1.3655667160355892e-05, + "loss": 0.496719628572464, + "step": 305 + }, + { + "epoch": 0.892616226071103, + "grad_norm": 1.1506154537200928, + "learning_rate": 1.3608226983056687e-05, + "loss": 0.49487072229385376, + "step": 306 + }, + { + "epoch": 0.8955332725615315, + "grad_norm": 1.8046090602874756, + "learning_rate": 1.3560693260380614e-05, + "loss": 0.4910697937011719, + "step": 307 + }, + { + "epoch": 0.8984503190519599, + "grad_norm": 2.0088653564453125, + "learning_rate": 1.3513067224667e-05, + "loss": 0.508246660232544, + "step": 308 + }, + { + "epoch": 0.9013673655423883, + "grad_norm": 1.2966033220291138, + "learning_rate": 1.3465350110648437e-05, + "loss": 0.5125166177749634, + "step": 309 + }, + { + "epoch": 0.9042844120328167, + "grad_norm": 1.9976309537887573, + "learning_rate": 1.3417543155418775e-05, + "loss": 0.43942537903785706, + "step": 310 + }, + { + "epoch": 0.9072014585232452, + "grad_norm": 1.2663682699203491, + "learning_rate": 1.336964759840105e-05, + "loss": 0.4839101731777191, + "step": 311 + }, + { + "epoch": 0.9101185050136736, + "grad_norm": 1.1223328113555908, + "learning_rate": 1.3321664681315354e-05, + "loss": 0.48008066415786743, + "step": 312 + }, + { + "epoch": 0.9130355515041021, + "grad_norm": 1.5786972045898438, + "learning_rate": 1.3273595648146634e-05, + "loss": 0.47250309586524963, + "step": 313 + }, + { + "epoch": 0.9159525979945305, + "grad_norm": 1.2150241136550903, + "learning_rate": 1.322544174511245e-05, + "loss": 0.5149738788604736, + "step": 314 + }, + { + "epoch": 0.918869644484959, + "grad_norm": 1.3676542043685913, + "learning_rate": 1.3177204220630662e-05, + "loss": 0.4430195093154907, + "step": 315 + }, + { + "epoch": 0.9217866909753875, + "grad_norm": 1.0703285932540894, + "learning_rate": 1.3128884325287064e-05, + "loss": 0.4798983037471771, + "step": 316 + }, + { + "epoch": 0.9247037374658159, + "grad_norm": 1.3131535053253174, + "learning_rate": 1.308048331180296e-05, + "loss": 0.4241073727607727, + "step": 317 + }, + { + "epoch": 0.9276207839562443, + "grad_norm": 1.4485348463058472, + "learning_rate": 1.3032002435002698e-05, + "loss": 0.527199923992157, + "step": 318 + }, + { + "epoch": 0.9305378304466727, + "grad_norm": 1.370936393737793, + "learning_rate": 1.2983442951781114e-05, + "loss": 0.47125962376594543, + "step": 319 + }, + { + "epoch": 0.9334548769371012, + "grad_norm": 1.2369643449783325, + "learning_rate": 1.2934806121070973e-05, + "loss": 0.4814244210720062, + "step": 320 + }, + { + "epoch": 0.9363719234275296, + "grad_norm": 1.2632933855056763, + "learning_rate": 1.2886093203810314e-05, + "loss": 0.4915548264980316, + "step": 321 + }, + { + "epoch": 0.9392889699179581, + "grad_norm": 1.054569959640503, + "learning_rate": 1.2837305462909764e-05, + "loss": 0.5325602293014526, + "step": 322 + }, + { + "epoch": 0.9422060164083865, + "grad_norm": 1.15959632396698, + "learning_rate": 1.27884441632198e-05, + "loss": 0.43607404828071594, + "step": 323 + }, + { + "epoch": 0.945123062898815, + "grad_norm": 1.1667979955673218, + "learning_rate": 1.2739510571497945e-05, + "loss": 0.4631507992744446, + "step": 324 + }, + { + "epoch": 0.9480401093892434, + "grad_norm": 1.6009081602096558, + "learning_rate": 1.2690505956375944e-05, + "loss": 0.4935731887817383, + "step": 325 + }, + { + "epoch": 0.9509571558796718, + "grad_norm": 1.1193996667861938, + "learning_rate": 1.2641431588326858e-05, + "loss": 0.45883435010910034, + "step": 326 + }, + { + "epoch": 0.9538742023701002, + "grad_norm": 1.5365067720413208, + "learning_rate": 1.2592288739632138e-05, + "loss": 0.5206276178359985, + "step": 327 + }, + { + "epoch": 0.9567912488605287, + "grad_norm": 1.0714622735977173, + "learning_rate": 1.2543078684348632e-05, + "loss": 0.5242853760719299, + "step": 328 + }, + { + "epoch": 0.9597082953509571, + "grad_norm": 1.3009248971939087, + "learning_rate": 1.2493802698275557e-05, + "loss": 0.4794357717037201, + "step": 329 + }, + { + "epoch": 0.9626253418413856, + "grad_norm": 1.495771050453186, + "learning_rate": 1.244446205892143e-05, + "loss": 0.5849282145500183, + "step": 330 + }, + { + "epoch": 0.965542388331814, + "grad_norm": 1.2046003341674805, + "learning_rate": 1.2395058045470935e-05, + "loss": 0.47758305072784424, + "step": 331 + }, + { + "epoch": 0.9684594348222425, + "grad_norm": 1.1362569332122803, + "learning_rate": 1.2345591938751772e-05, + "loss": 0.4490663409233093, + "step": 332 + }, + { + "epoch": 0.971376481312671, + "grad_norm": 1.2658129930496216, + "learning_rate": 1.2296065021201438e-05, + "loss": 0.4035309851169586, + "step": 333 + }, + { + "epoch": 0.9742935278030994, + "grad_norm": 4.370306015014648, + "learning_rate": 1.2246478576833993e-05, + "loss": 0.495273619890213, + "step": 334 + }, + { + "epoch": 0.9772105742935278, + "grad_norm": 1.3863654136657715, + "learning_rate": 1.219683389120676e-05, + "loss": 0.46410733461380005, + "step": 335 + }, + { + "epoch": 0.9801276207839562, + "grad_norm": 1.4544321298599243, + "learning_rate": 1.2147132251387004e-05, + "loss": 0.4301709830760956, + "step": 336 + }, + { + "epoch": 0.9830446672743847, + "grad_norm": 1.0852457284927368, + "learning_rate": 1.2097374945918554e-05, + "loss": 0.48892468214035034, + "step": 337 + }, + { + "epoch": 0.9859617137648131, + "grad_norm": 1.5062257051467896, + "learning_rate": 1.2047563264788412e-05, + "loss": 0.4667983055114746, + "step": 338 + }, + { + "epoch": 0.9888787602552416, + "grad_norm": 1.2472951412200928, + "learning_rate": 1.199769849939329e-05, + "loss": 0.4827345013618469, + "step": 339 + }, + { + "epoch": 0.99179580674567, + "grad_norm": 1.2589871883392334, + "learning_rate": 1.1947781942506151e-05, + "loss": 0.405245304107666, + "step": 340 + }, + { + "epoch": 0.9947128532360985, + "grad_norm": 1.25636625289917, + "learning_rate": 1.1897814888242679e-05, + "loss": 0.37956133484840393, + "step": 341 + }, + { + "epoch": 0.9976298997265269, + "grad_norm": 2.7064895629882812, + "learning_rate": 1.1847798632027726e-05, + "loss": 0.489456444978714, + "step": 342 + }, + { + "epoch": 1.0, + "grad_norm": 1.6156240701675415, + "learning_rate": 1.1797734470561744e-05, + "loss": 0.46473199129104614, + "step": 343 + }, + { + "epoch": 1.0029170464904285, + "grad_norm": 1.3046343326568604, + "learning_rate": 1.1747623701787143e-05, + "loss": 0.3504878282546997, + "step": 344 + }, + { + "epoch": 1.005834092980857, + "grad_norm": 1.414828896522522, + "learning_rate": 1.1697467624854666e-05, + "loss": 0.4719260334968567, + "step": 345 + }, + { + "epoch": 1.0087511394712854, + "grad_norm": 1.1873356103897095, + "learning_rate": 1.164726754008969e-05, + "loss": 0.45313555002212524, + "step": 346 + }, + { + "epoch": 1.0116681859617138, + "grad_norm": 1.1382380723953247, + "learning_rate": 1.1597024748958526e-05, + "loss": 0.4365478456020355, + "step": 347 + }, + { + "epoch": 1.0145852324521423, + "grad_norm": 1.8141961097717285, + "learning_rate": 1.1546740554034661e-05, + "loss": 0.3694503605365753, + "step": 348 + }, + { + "epoch": 1.0175022789425707, + "grad_norm": 1.333388328552246, + "learning_rate": 1.1496416258965015e-05, + "loss": 0.4755721688270569, + "step": 349 + }, + { + "epoch": 1.0204193254329992, + "grad_norm": 1.3464443683624268, + "learning_rate": 1.1446053168436117e-05, + "loss": 0.4227846562862396, + "step": 350 + }, + { + "epoch": 1.0204193254329992, + "eval_loss": 0.44924086332321167, + "eval_runtime": 1214.6648, + "eval_samples_per_second": 0.52, + "eval_steps_per_second": 0.52, + "step": 350 + }, + { + "epoch": 1.0233363719234276, + "grad_norm": 1.2682689428329468, + "learning_rate": 1.1395652588140292e-05, + "loss": 0.44300130009651184, + "step": 351 + }, + { + "epoch": 1.0262534184138559, + "grad_norm": 1.7737696170806885, + "learning_rate": 1.1345215824741814e-05, + "loss": 0.5106258988380432, + "step": 352 + }, + { + "epoch": 1.0291704649042843, + "grad_norm": 1.2601238489151, + "learning_rate": 1.1294744185843014e-05, + "loss": 0.45930635929107666, + "step": 353 + }, + { + "epoch": 1.0320875113947128, + "grad_norm": 1.2162678241729736, + "learning_rate": 1.1244238979950406e-05, + "loss": 0.44163084030151367, + "step": 354 + }, + { + "epoch": 1.0350045578851412, + "grad_norm": 1.0905817747116089, + "learning_rate": 1.1193701516440733e-05, + "loss": 0.510662317276001, + "step": 355 + }, + { + "epoch": 1.0379216043755697, + "grad_norm": 0.9624952673912048, + "learning_rate": 1.1143133105527048e-05, + "loss": 0.5297917127609253, + "step": 356 + }, + { + "epoch": 1.0408386508659981, + "grad_norm": 1.2757681608200073, + "learning_rate": 1.1092535058224725e-05, + "loss": 0.4332093596458435, + "step": 357 + }, + { + "epoch": 1.0437556973564266, + "grad_norm": 1.6885719299316406, + "learning_rate": 1.104190868631748e-05, + "loss": 0.4337635040283203, + "step": 358 + }, + { + "epoch": 1.046672743846855, + "grad_norm": 1.175484538078308, + "learning_rate": 1.099125530232336e-05, + "loss": 0.45411020517349243, + "step": 359 + }, + { + "epoch": 1.0495897903372835, + "grad_norm": 1.0964939594268799, + "learning_rate": 1.0940576219460723e-05, + "loss": 0.5333439707756042, + "step": 360 + }, + { + "epoch": 1.052506836827712, + "grad_norm": 1.5493136644363403, + "learning_rate": 1.0889872751614176e-05, + "loss": 0.4400906264781952, + "step": 361 + }, + { + "epoch": 1.0554238833181404, + "grad_norm": 1.2491416931152344, + "learning_rate": 1.0839146213300526e-05, + "loss": 0.31049978733062744, + "step": 362 + }, + { + "epoch": 1.0583409298085689, + "grad_norm": 1.7213693857192993, + "learning_rate": 1.0788397919634694e-05, + "loss": 0.389009028673172, + "step": 363 + }, + { + "epoch": 1.0612579762989973, + "grad_norm": 1.5405336618423462, + "learning_rate": 1.0737629186295621e-05, + "loss": 0.4068562984466553, + "step": 364 + }, + { + "epoch": 1.0641750227894258, + "grad_norm": 1.225455641746521, + "learning_rate": 1.0686841329492159e-05, + "loss": 0.47358617186546326, + "step": 365 + }, + { + "epoch": 1.0670920692798542, + "grad_norm": 1.3436250686645508, + "learning_rate": 1.0636035665928945e-05, + "loss": 0.47050854563713074, + "step": 366 + }, + { + "epoch": 1.0700091157702827, + "grad_norm": 1.4952112436294556, + "learning_rate": 1.058521351277227e-05, + "loss": 0.43496906757354736, + "step": 367 + }, + { + "epoch": 1.072926162260711, + "grad_norm": 1.549112319946289, + "learning_rate": 1.0534376187615924e-05, + "loss": 0.45711052417755127, + "step": 368 + }, + { + "epoch": 1.0758432087511394, + "grad_norm": 1.3851526975631714, + "learning_rate": 1.048352500844704e-05, + "loss": 0.45045915246009827, + "step": 369 + }, + { + "epoch": 1.0787602552415678, + "grad_norm": 1.6302049160003662, + "learning_rate": 1.0432661293611927e-05, + "loss": 0.3736046254634857, + "step": 370 + }, + { + "epoch": 1.0816773017319963, + "grad_norm": 1.3365869522094727, + "learning_rate": 1.0381786361781885e-05, + "loss": 0.42242100834846497, + "step": 371 + }, + { + "epoch": 1.0845943482224247, + "grad_norm": 1.4369138479232788, + "learning_rate": 1.0330901531919026e-05, + "loss": 0.44570961594581604, + "step": 372 + }, + { + "epoch": 1.0875113947128532, + "grad_norm": 1.3528283834457397, + "learning_rate": 1.0280008123242069e-05, + "loss": 0.43440738320350647, + "step": 373 + }, + { + "epoch": 1.0904284412032816, + "grad_norm": 1.469660997390747, + "learning_rate": 1.0229107455192147e-05, + "loss": 0.3960394263267517, + "step": 374 + }, + { + "epoch": 1.09334548769371, + "grad_norm": 1.4542185068130493, + "learning_rate": 1.0178200847398595e-05, + "loss": 0.47834208607673645, + "step": 375 + }, + { + "epoch": 1.0962625341841385, + "grad_norm": 1.6470292806625366, + "learning_rate": 1.0127289619644737e-05, + "loss": 0.42791086435317993, + "step": 376 + }, + { + "epoch": 1.099179580674567, + "grad_norm": 1.1934021711349487, + "learning_rate": 1.0076375091833681e-05, + "loss": 0.4401305019855499, + "step": 377 + }, + { + "epoch": 1.1020966271649955, + "grad_norm": 0.9786668419837952, + "learning_rate": 1.0025458583954078e-05, + "loss": 0.4816555678844452, + "step": 378 + }, + { + "epoch": 1.105013673655424, + "grad_norm": 1.1348779201507568, + "learning_rate": 9.974541416045924e-06, + "loss": 0.41516968607902527, + "step": 379 + }, + { + "epoch": 1.1079307201458524, + "grad_norm": 1.0188615322113037, + "learning_rate": 9.923624908166322e-06, + "loss": 0.48087278008461, + "step": 380 + }, + { + "epoch": 1.1108477666362808, + "grad_norm": 1.0821740627288818, + "learning_rate": 9.872710380355263e-06, + "loss": 0.41974008083343506, + "step": 381 + }, + { + "epoch": 1.1137648131267093, + "grad_norm": 1.250951886177063, + "learning_rate": 9.82179915260141e-06, + "loss": 0.42703643441200256, + "step": 382 + }, + { + "epoch": 1.1166818596171377, + "grad_norm": 1.4528254270553589, + "learning_rate": 9.770892544807856e-06, + "loss": 0.43801453709602356, + "step": 383 + }, + { + "epoch": 1.1195989061075662, + "grad_norm": 1.813859462738037, + "learning_rate": 9.719991876757934e-06, + "loss": 0.4344240725040436, + "step": 384 + }, + { + "epoch": 1.1225159525979946, + "grad_norm": 1.6681253910064697, + "learning_rate": 9.669098468080976e-06, + "loss": 0.4356998801231384, + "step": 385 + }, + { + "epoch": 1.125432999088423, + "grad_norm": 1.3447953462600708, + "learning_rate": 9.618213638218117e-06, + "loss": 0.43189188838005066, + "step": 386 + }, + { + "epoch": 1.1283500455788513, + "grad_norm": 1.9577926397323608, + "learning_rate": 9.567338706388074e-06, + "loss": 0.34984707832336426, + "step": 387 + }, + { + "epoch": 1.1312670920692798, + "grad_norm": 1.5225576162338257, + "learning_rate": 9.516474991552965e-06, + "loss": 0.4243963062763214, + "step": 388 + }, + { + "epoch": 1.1341841385597082, + "grad_norm": 1.7416809797286987, + "learning_rate": 9.46562381238408e-06, + "loss": 0.3414606750011444, + "step": 389 + }, + { + "epoch": 1.1371011850501367, + "grad_norm": 1.8358951807022095, + "learning_rate": 9.414786487227732e-06, + "loss": 0.387447327375412, + "step": 390 + }, + { + "epoch": 1.1400182315405651, + "grad_norm": 1.9706153869628906, + "learning_rate": 9.363964334071057e-06, + "loss": 0.4599088728427887, + "step": 391 + }, + { + "epoch": 1.1429352780309936, + "grad_norm": 1.0604286193847656, + "learning_rate": 9.313158670507843e-06, + "loss": 0.4633581042289734, + "step": 392 + }, + { + "epoch": 1.145852324521422, + "grad_norm": 1.4851202964782715, + "learning_rate": 9.262370813704379e-06, + "loss": 0.3872259557247162, + "step": 393 + }, + { + "epoch": 1.1487693710118505, + "grad_norm": 1.7839159965515137, + "learning_rate": 9.21160208036531e-06, + "loss": 0.5215944647789001, + "step": 394 + }, + { + "epoch": 1.151686417502279, + "grad_norm": 1.3054656982421875, + "learning_rate": 9.160853786699475e-06, + "loss": 0.4030425548553467, + "step": 395 + }, + { + "epoch": 1.1546034639927074, + "grad_norm": 3.8467981815338135, + "learning_rate": 9.110127248385827e-06, + "loss": 0.4032524824142456, + "step": 396 + }, + { + "epoch": 1.1575205104831359, + "grad_norm": 1.8513801097869873, + "learning_rate": 9.05942378053928e-06, + "loss": 0.46577155590057373, + "step": 397 + }, + { + "epoch": 1.1604375569735643, + "grad_norm": 1.312689185142517, + "learning_rate": 9.008744697676642e-06, + "loss": 0.39114487171173096, + "step": 398 + }, + { + "epoch": 1.1633546034639928, + "grad_norm": 1.1996328830718994, + "learning_rate": 8.958091313682521e-06, + "loss": 0.481199711561203, + "step": 399 + }, + { + "epoch": 1.1662716499544212, + "grad_norm": 5.172409534454346, + "learning_rate": 8.90746494177528e-06, + "loss": 0.3803558945655823, + "step": 400 + }, + { + "epoch": 1.1662716499544212, + "eval_loss": 0.4318464398384094, + "eval_runtime": 1206.0306, + "eval_samples_per_second": 0.524, + "eval_steps_per_second": 0.524, + "step": 400 + } + ], + "logging_steps": 1, + "max_steps": 686, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.6837285277665853e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_devstral_24B/checkpoints/checkpoint-400/training_args.bin b/cpt_devstral_24B/checkpoints/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..48a487f18680e3e5b768fe7ec9ec04e8778fc21e --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f62526ec2433add7ac031c48b1f6ff360f1ade77275765112cbf7cf361d64ca5 +size 5201 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/README.md b/cpt_devstral_24B/checkpoints/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f04c3de935db4cae3da32ab6d1fcbbea11b4e09 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Devstral-Small-2-24B-Instruct-2512 +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Devstral-Small-2-24B-Instruct-2512 +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/adapter_config.json b/cpt_devstral_24B/checkpoints/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a10b9f1b7bb62dced9a7c13375c7ebbeb347c15b --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Devstral-Small-2-24B-Instruct-2512", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/adapter_model.safetensors b/cpt_devstral_24B/checkpoints/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..368f2e5aa1ae913955dbae1e76bfad7a7a07e5e6 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40d8d694c7fc76d670a2720c03cab875ad7ac3e20bfdbf2d2360ed074f2d69a8 +size 364983848 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/chat_template.jinja b/cpt_devstral_24B/checkpoints/checkpoint-500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01c8776b5b3496af72e92a53a3bf92e113f66f2c --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/chat_template.jinja @@ -0,0 +1,121 @@ +{#- Default system message if no system prompt is passed. #} +{%- set default_system_message = '' %} + +{#- Begin of sequence token. #} +{{- bos_token }} + +{#- Handle system prompt if it exists. #} +{#- System prompt supports text content or text chunks. #} +{%- if messages[0]['role'] == 'system' %} + {{- '[SYSTEM_PROMPT]' -}} + {%- if messages[0]['content'] is string %} + {{- messages[0]['content'] -}} + {%- else %} + {%- for block in messages[0]['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in system message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '[/SYSTEM_PROMPT]' -}} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} + {%- if default_system_message != '' %} + {{- '[SYSTEM_PROMPT]' + default_system_message + '[/SYSTEM_PROMPT]' }} + {%- endif %} +{%- endif %} + + +{#- Tools definition #} +{%- set tools_definition = '' %} +{%- set has_tools = false %} +{%- if tools is defined and tools is not none and tools|length > 0 %} + {%- set has_tools = true %} + {%- set tools_definition = '[AVAILABLE_TOOLS]' + (tools| tojson) + '[/AVAILABLE_TOOLS]' %} + {{- tools_definition }} +{%- endif %} + +{#- Checks for alternating user/assistant messages. #} +{%- set ns = namespace(index=0) %} +{%- for message in loop_messages %} + {%- if message.role == 'user' or (message.role == 'assistant' and (message.tool_calls is not defined or message.tool_calls is none or message.tool_calls | length == 0)) %} + {%- if (message['role'] == 'user') != (ns.index % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user and assistant roles except for tool calls and results.') }} + {%- endif %} + {%- set ns.index = ns.index + 1 %} + {%- endif %} +{%- endfor %} + +{#- Handle conversation messages. #} +{%- for message in loop_messages %} + + {#- User messages supports text content or text and image chunks. #} + {%- if message['role'] == 'user' %} + {%- if message['content'] is string %} + {{- '[INST]' + message['content'] + '[/INST]' }} + {%- elif message['content'] | length > 0 %} + {{- '[INST]' }} + {%- if message['content'] | length == 2 %} + {%- set blocks = message['content'] | sort(attribute='type') %} + {%- else %} + {%- set blocks = message['content'] %} + {%- endif %} + {%- for block in blocks %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text, image and image_url chunks are supported in user message content.') }} + {%- endif %} + {%- endfor %} + {{- '[/INST]' }} + {%- else %} + {{- raise_exception('User message must have a string or a list of chunks in content') }} + {%- endif %} + + {#- Assistant messages supports text content or text and image chunks. #} + {%- elif message['role'] == 'assistant' %} + {%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %} + {{- raise_exception('Assistant message must have a string or a list of chunks in content or a list of tool calls.') }} + {%- endif %} + + {%- if message['content'] is string %} + {{- message['content'] }} + {%- elif message['content'] | length > 0 %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in assistant message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + + {%- if message['tool_calls'] is defined and message['tool_calls'] is not none and message['tool_calls']|length > 0 %} + {%- for tool in message['tool_calls'] %} + {%- set arguments = tool['function']['arguments'] %} + {%- if arguments is not string %} + {%- set arguments = arguments|tojson|safe %} + {%- elif arguments == '' %} + {%- set arguments = '{}' %} + {%- endif %} + {{- '[TOOL_CALLS]' + tool['function']['name'] + '[ARGS]' + arguments }} + {%- endfor %} + {%- endif %} + + {#- End of sequence token for each assistant messages. #} + {{- eos_token }} + + {#- Tool messages only supports text content. #} + {%- elif message['role'] == 'tool' %} + {{- '[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]' }} + + {#- Raise exception for unsupported roles. #} + {%- else %} + {{- raise_exception('Only user, assistant and tool roles are supported, got ' + message['role'] + '.') }} + {%- endif %} +{%- endfor %} \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/optimizer.pt b/cpt_devstral_24B/checkpoints/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6149a884fbd5db3bc93b447b47ab031508273651 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:335aa629db8c0490b49c6dbd8a2212e0cb7b06d115ab83a96e1f18d23652855c +size 160131559 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/rng_state.pth b/cpt_devstral_24B/checkpoints/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9f4247a7c07f13ef57db578fd56f9f76d254fbe --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08185a483370e678d79b51807a988cfe41d318265f91634025d2a0d25c5a3615 +size 14645 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/scheduler.pt b/cpt_devstral_24B/checkpoints/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b85d872ff1bd3abae69b4ae9c19cc54a3c4d27b2 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fee4726958e54cc743c86acde73ad8c729bc35b56d33cb9894bdb5eba634ffd9 +size 1465 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/tokenizer.json b/cpt_devstral_24B/checkpoints/checkpoint-500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5b51e255641d3ab81f891f54bd61370fcedf6622 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:286acad9b0e27fce778ac429763536accf618ccb6ed72963b6f94685e531c5c7 +size 17077402 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/tokenizer_config.json b/cpt_devstral_24B/checkpoints/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1bb6b32cec8ab9654d2c84faeb9a332373476017 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/tokenizer_config.json @@ -0,0 +1,1013 @@ +{ + "backend": "tokenizers", + "bos_token": "", + "eos_token": "", + "extra_special_tokens": [ + "", + "", + "", + "[INST]", + "[/INST]", + "[AVAILABLE_TOOLS]", + "[/AVAILABLE_TOOLS]", + "[TOOL_RESULTS]", + "[/TOOL_RESULTS]", + "[TOOL_CALLS]", + "[IMG]", + "", + "[IMG_BREAK]", + "[IMG_END]", + "[PREFIX]", + "[MIDDLE]", + "[SUFFIX]", + "[SYSTEM_PROMPT]", + "[/SYSTEM_PROMPT]", + "[TOOL_CONTENT]", + "", + "", + "", + "", + "[AUDIO]", + "[BEGIN_AUDIO]", + "", + "", + "", + "", + "", + "", + "[ARGS]", + "[CALL_ID]", + "[THINK]", + "[/THINK]", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "is_local": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "PixtralProcessor", + "tokenizer_class": "TokenizersBackend", + "unk_token": "" +} diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/trainer_state.json b/cpt_devstral_24B/checkpoints/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..91e905aef87d26604fc9b5e042a6a78a93863679 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/trainer_state.json @@ -0,0 +1,3614 @@ +{ + "best_global_step": 500, + "best_metric": 0.40706494450569153, + "best_model_checkpoint": "runs/cpt_run_v1/checkpoints/checkpoint-500", + "epoch": 1.4579762989972653, + "eval_steps": 50, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0029170464904284413, + "grad_norm": 1.1577509641647339, + "learning_rate": 0.0, + "loss": 0.9893555045127869, + "step": 1 + }, + { + "epoch": 0.005834092980856883, + "grad_norm": 0.9491796493530273, + "learning_rate": 2.8985507246376816e-07, + "loss": 0.8791205883026123, + "step": 2 + }, + { + "epoch": 0.008751139471285323, + "grad_norm": 1.1600768566131592, + "learning_rate": 5.797101449275363e-07, + "loss": 0.9858248233795166, + "step": 3 + }, + { + "epoch": 0.011668185961713765, + "grad_norm": 1.2298306226730347, + "learning_rate": 8.695652173913044e-07, + "loss": 1.0516364574432373, + "step": 4 + }, + { + "epoch": 0.014585232452142206, + "grad_norm": 0.9520533680915833, + "learning_rate": 1.1594202898550726e-06, + "loss": 0.8392249345779419, + "step": 5 + }, + { + "epoch": 0.017502278942570646, + "grad_norm": 1.2451188564300537, + "learning_rate": 1.4492753623188408e-06, + "loss": 1.0955077409744263, + "step": 6 + }, + { + "epoch": 0.02041932543299909, + "grad_norm": 1.1123991012573242, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.9201866388320923, + "step": 7 + }, + { + "epoch": 0.02333637192342753, + "grad_norm": 0.9283139705657959, + "learning_rate": 2.028985507246377e-06, + "loss": 0.9770950078964233, + "step": 8 + }, + { + "epoch": 0.02625341841385597, + "grad_norm": 0.9589216113090515, + "learning_rate": 2.3188405797101453e-06, + "loss": 0.9442565441131592, + "step": 9 + }, + { + "epoch": 0.02917046490428441, + "grad_norm": 0.8866703510284424, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.9354464411735535, + "step": 10 + }, + { + "epoch": 0.03208751139471285, + "grad_norm": 0.7191241383552551, + "learning_rate": 2.8985507246376816e-06, + "loss": 0.7659736275672913, + "step": 11 + }, + { + "epoch": 0.03500455788514129, + "grad_norm": 0.9110142588615417, + "learning_rate": 3.188405797101449e-06, + "loss": 0.9319326877593994, + "step": 12 + }, + { + "epoch": 0.03792160437556973, + "grad_norm": 0.8754057288169861, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.9819356203079224, + "step": 13 + }, + { + "epoch": 0.04083865086599818, + "grad_norm": 0.896181046962738, + "learning_rate": 3.768115942028986e-06, + "loss": 1.026316523551941, + "step": 14 + }, + { + "epoch": 0.04375569735642662, + "grad_norm": 0.6104832887649536, + "learning_rate": 4.057971014492754e-06, + "loss": 0.8427562713623047, + "step": 15 + }, + { + "epoch": 0.04667274384685506, + "grad_norm": 0.6529208421707153, + "learning_rate": 4.347826086956522e-06, + "loss": 0.8496565222740173, + "step": 16 + }, + { + "epoch": 0.0495897903372835, + "grad_norm": 0.6319335699081421, + "learning_rate": 4.637681159420291e-06, + "loss": 0.9139047861099243, + "step": 17 + }, + { + "epoch": 0.05250683682771194, + "grad_norm": 0.7458649277687073, + "learning_rate": 4.927536231884059e-06, + "loss": 0.8867442011833191, + "step": 18 + }, + { + "epoch": 0.05542388331814038, + "grad_norm": 0.6179773211479187, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.9579408168792725, + "step": 19 + }, + { + "epoch": 0.05834092980856882, + "grad_norm": 0.794481635093689, + "learning_rate": 5.507246376811595e-06, + "loss": 0.8736554980278015, + "step": 20 + }, + { + "epoch": 0.06125797629899726, + "grad_norm": 0.8356145620346069, + "learning_rate": 5.797101449275363e-06, + "loss": 0.9358762502670288, + "step": 21 + }, + { + "epoch": 0.0641750227894257, + "grad_norm": 0.5891932845115662, + "learning_rate": 6.086956521739132e-06, + "loss": 0.8972038626670837, + "step": 22 + }, + { + "epoch": 0.06709206927985414, + "grad_norm": 0.6931268572807312, + "learning_rate": 6.376811594202898e-06, + "loss": 0.9583507776260376, + "step": 23 + }, + { + "epoch": 0.07000911577028258, + "grad_norm": 0.7298229336738586, + "learning_rate": 6.666666666666667e-06, + "loss": 0.8119489550590515, + "step": 24 + }, + { + "epoch": 0.07292616226071102, + "grad_norm": 0.6419956684112549, + "learning_rate": 6.956521739130435e-06, + "loss": 0.9386100769042969, + "step": 25 + }, + { + "epoch": 0.07584320875113947, + "grad_norm": 0.7508338689804077, + "learning_rate": 7.246376811594203e-06, + "loss": 0.9272583723068237, + "step": 26 + }, + { + "epoch": 0.0787602552415679, + "grad_norm": 0.5848079919815063, + "learning_rate": 7.536231884057972e-06, + "loss": 0.8967856168746948, + "step": 27 + }, + { + "epoch": 0.08167730173199636, + "grad_norm": 0.7384837865829468, + "learning_rate": 7.82608695652174e-06, + "loss": 0.8696568012237549, + "step": 28 + }, + { + "epoch": 0.0845943482224248, + "grad_norm": 0.5069604516029358, + "learning_rate": 8.115942028985508e-06, + "loss": 0.9121193885803223, + "step": 29 + }, + { + "epoch": 0.08751139471285324, + "grad_norm": 0.833165168762207, + "learning_rate": 8.405797101449275e-06, + "loss": 0.8180589079856873, + "step": 30 + }, + { + "epoch": 0.09042844120328168, + "grad_norm": 0.6355920433998108, + "learning_rate": 8.695652173913044e-06, + "loss": 0.8640957474708557, + "step": 31 + }, + { + "epoch": 0.09334548769371012, + "grad_norm": 1.0429315567016602, + "learning_rate": 8.985507246376812e-06, + "loss": 0.9517915844917297, + "step": 32 + }, + { + "epoch": 0.09626253418413856, + "grad_norm": 0.5875154733657837, + "learning_rate": 9.275362318840581e-06, + "loss": 0.9443603754043579, + "step": 33 + }, + { + "epoch": 0.099179580674567, + "grad_norm": 1.9913769960403442, + "learning_rate": 9.565217391304349e-06, + "loss": 0.9510866403579712, + "step": 34 + }, + { + "epoch": 0.10209662716499544, + "grad_norm": 0.5310097932815552, + "learning_rate": 9.855072463768118e-06, + "loss": 0.8653419613838196, + "step": 35 + }, + { + "epoch": 0.10501367365542388, + "grad_norm": 0.624421238899231, + "learning_rate": 1.0144927536231885e-05, + "loss": 0.7941208481788635, + "step": 36 + }, + { + "epoch": 0.10793072014585232, + "grad_norm": 0.6314200758934021, + "learning_rate": 1.0434782608695653e-05, + "loss": 0.8931174278259277, + "step": 37 + }, + { + "epoch": 0.11084776663628076, + "grad_norm": 0.6272342205047607, + "learning_rate": 1.0724637681159422e-05, + "loss": 0.8978185057640076, + "step": 38 + }, + { + "epoch": 0.1137648131267092, + "grad_norm": 0.5711184740066528, + "learning_rate": 1.101449275362319e-05, + "loss": 0.808263897895813, + "step": 39 + }, + { + "epoch": 0.11668185961713765, + "grad_norm": 0.7581208944320679, + "learning_rate": 1.1304347826086957e-05, + "loss": 0.7456756830215454, + "step": 40 + }, + { + "epoch": 0.11959890610756609, + "grad_norm": 0.4989977180957794, + "learning_rate": 1.1594202898550726e-05, + "loss": 0.8273333311080933, + "step": 41 + }, + { + "epoch": 0.12251595259799453, + "grad_norm": 0.8602972626686096, + "learning_rate": 1.1884057971014494e-05, + "loss": 0.8514784574508667, + "step": 42 + }, + { + "epoch": 0.12543299908842298, + "grad_norm": 0.6918581128120422, + "learning_rate": 1.2173913043478263e-05, + "loss": 0.8182265162467957, + "step": 43 + }, + { + "epoch": 0.1283500455788514, + "grad_norm": 0.653099536895752, + "learning_rate": 1.2463768115942029e-05, + "loss": 0.8242791891098022, + "step": 44 + }, + { + "epoch": 0.13126709206927986, + "grad_norm": 0.7485584616661072, + "learning_rate": 1.2753623188405797e-05, + "loss": 0.8229591250419617, + "step": 45 + }, + { + "epoch": 0.1341841385597083, + "grad_norm": 0.6724833250045776, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.8146833181381226, + "step": 46 + }, + { + "epoch": 0.13710118505013674, + "grad_norm": 0.857208251953125, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.8154427409172058, + "step": 47 + }, + { + "epoch": 0.14001823154056517, + "grad_norm": 0.5559669137001038, + "learning_rate": 1.3623188405797103e-05, + "loss": 0.879005491733551, + "step": 48 + }, + { + "epoch": 0.14293527803099362, + "grad_norm": 0.5910897850990295, + "learning_rate": 1.391304347826087e-05, + "loss": 0.8148283362388611, + "step": 49 + }, + { + "epoch": 0.14585232452142205, + "grad_norm": 0.6478891372680664, + "learning_rate": 1.420289855072464e-05, + "loss": 0.8293006420135498, + "step": 50 + }, + { + "epoch": 0.14585232452142205, + "eval_loss": 0.7892261147499084, + "eval_runtime": 973.2157, + "eval_samples_per_second": 0.649, + "eval_steps_per_second": 0.649, + "step": 50 + }, + { + "epoch": 0.1487693710118505, + "grad_norm": 0.757882833480835, + "learning_rate": 1.4492753623188407e-05, + "loss": 0.8114852905273438, + "step": 51 + }, + { + "epoch": 0.15168641750227893, + "grad_norm": 0.8496116995811462, + "learning_rate": 1.4782608695652174e-05, + "loss": 0.7886185050010681, + "step": 52 + }, + { + "epoch": 0.15460346399270739, + "grad_norm": 0.6078857183456421, + "learning_rate": 1.5072463768115944e-05, + "loss": 0.7298170924186707, + "step": 53 + }, + { + "epoch": 0.1575205104831358, + "grad_norm": 0.5856835246086121, + "learning_rate": 1.536231884057971e-05, + "loss": 0.7407160997390747, + "step": 54 + }, + { + "epoch": 0.16043755697356427, + "grad_norm": 1.0533701181411743, + "learning_rate": 1.565217391304348e-05, + "loss": 0.7057831287384033, + "step": 55 + }, + { + "epoch": 0.16335460346399272, + "grad_norm": 0.8087610006332397, + "learning_rate": 1.5942028985507246e-05, + "loss": 0.7409019470214844, + "step": 56 + }, + { + "epoch": 0.16627164995442115, + "grad_norm": 0.629945695400238, + "learning_rate": 1.6231884057971015e-05, + "loss": 0.7768293023109436, + "step": 57 + }, + { + "epoch": 0.1691886964448496, + "grad_norm": 0.5187911987304688, + "learning_rate": 1.6521739130434785e-05, + "loss": 0.825718104839325, + "step": 58 + }, + { + "epoch": 0.17210574293527803, + "grad_norm": 0.5866358280181885, + "learning_rate": 1.681159420289855e-05, + "loss": 0.8575979471206665, + "step": 59 + }, + { + "epoch": 0.17502278942570648, + "grad_norm": 1.5098934173583984, + "learning_rate": 1.710144927536232e-05, + "loss": 0.8058848977088928, + "step": 60 + }, + { + "epoch": 0.1779398359161349, + "grad_norm": 0.6981958150863647, + "learning_rate": 1.739130434782609e-05, + "loss": 0.7640778422355652, + "step": 61 + }, + { + "epoch": 0.18085688240656336, + "grad_norm": 0.631349503993988, + "learning_rate": 1.7681159420289858e-05, + "loss": 0.7896331548690796, + "step": 62 + }, + { + "epoch": 0.1837739288969918, + "grad_norm": 0.6930747032165527, + "learning_rate": 1.7971014492753624e-05, + "loss": 0.6762524247169495, + "step": 63 + }, + { + "epoch": 0.18669097538742024, + "grad_norm": 0.599399209022522, + "learning_rate": 1.8260869565217393e-05, + "loss": 0.7285035848617554, + "step": 64 + }, + { + "epoch": 0.18960802187784867, + "grad_norm": 0.6194344758987427, + "learning_rate": 1.8550724637681162e-05, + "loss": 0.7682523131370544, + "step": 65 + }, + { + "epoch": 0.19252506836827712, + "grad_norm": 0.5691342949867249, + "learning_rate": 1.8840579710144928e-05, + "loss": 0.6791993379592896, + "step": 66 + }, + { + "epoch": 0.19544211485870555, + "grad_norm": 0.6257390379905701, + "learning_rate": 1.9130434782608697e-05, + "loss": 0.6744828224182129, + "step": 67 + }, + { + "epoch": 0.198359161349134, + "grad_norm": 0.5871018767356873, + "learning_rate": 1.9420289855072467e-05, + "loss": 0.7317330837249756, + "step": 68 + }, + { + "epoch": 0.20127620783956243, + "grad_norm": 1.0744612216949463, + "learning_rate": 1.9710144927536236e-05, + "loss": 0.6617178916931152, + "step": 69 + }, + { + "epoch": 0.2041932543299909, + "grad_norm": 0.675946831703186, + "learning_rate": 2e-05, + "loss": 0.7615712881088257, + "step": 70 + }, + { + "epoch": 0.2071103008204193, + "grad_norm": 0.7663411498069763, + "learning_rate": 1.9999870372100614e-05, + "loss": 0.7131291627883911, + "step": 71 + }, + { + "epoch": 0.21002734731084777, + "grad_norm": 0.6725395321846008, + "learning_rate": 1.9999481491763123e-05, + "loss": 0.7452989816665649, + "step": 72 + }, + { + "epoch": 0.21294439380127622, + "grad_norm": 0.6505664587020874, + "learning_rate": 1.9998833369069483e-05, + "loss": 0.7477136850357056, + "step": 73 + }, + { + "epoch": 0.21586144029170465, + "grad_norm": 0.7032860517501831, + "learning_rate": 1.9997926020822643e-05, + "loss": 0.6854275465011597, + "step": 74 + }, + { + "epoch": 0.2187784867821331, + "grad_norm": 0.645345151424408, + "learning_rate": 1.999675947054614e-05, + "loss": 0.7552425265312195, + "step": 75 + }, + { + "epoch": 0.22169553327256153, + "grad_norm": 0.6620492935180664, + "learning_rate": 1.9995333748483464e-05, + "loss": 0.7262853384017944, + "step": 76 + }, + { + "epoch": 0.22461257976298998, + "grad_norm": 0.6511455774307251, + "learning_rate": 1.9993648891597284e-05, + "loss": 0.7591732144355774, + "step": 77 + }, + { + "epoch": 0.2275296262534184, + "grad_norm": 0.6775254011154175, + "learning_rate": 1.9991704943568497e-05, + "loss": 0.7498704195022583, + "step": 78 + }, + { + "epoch": 0.23044667274384686, + "grad_norm": 0.8199896216392517, + "learning_rate": 1.9989501954795076e-05, + "loss": 0.7238684296607971, + "step": 79 + }, + { + "epoch": 0.2333637192342753, + "grad_norm": 0.8197569847106934, + "learning_rate": 1.998703998239079e-05, + "loss": 0.7028778195381165, + "step": 80 + }, + { + "epoch": 0.23628076572470375, + "grad_norm": 0.6602625250816345, + "learning_rate": 1.9984319090183692e-05, + "loss": 0.8842703104019165, + "step": 81 + }, + { + "epoch": 0.23919781221513217, + "grad_norm": 0.9587129354476929, + "learning_rate": 1.99813393487145e-05, + "loss": 0.732614278793335, + "step": 82 + }, + { + "epoch": 0.24211485870556063, + "grad_norm": 0.6822189092636108, + "learning_rate": 1.997810083523473e-05, + "loss": 0.7544928193092346, + "step": 83 + }, + { + "epoch": 0.24503190519598905, + "grad_norm": 0.8980082869529724, + "learning_rate": 1.9974603633704726e-05, + "loss": 0.6704054474830627, + "step": 84 + }, + { + "epoch": 0.2479489516864175, + "grad_norm": 0.7413425445556641, + "learning_rate": 1.9970847834791472e-05, + "loss": 0.693661093711853, + "step": 85 + }, + { + "epoch": 0.25086599817684596, + "grad_norm": 0.8314999341964722, + "learning_rate": 1.9966833535866223e-05, + "loss": 0.667654275894165, + "step": 86 + }, + { + "epoch": 0.25378304466727436, + "grad_norm": 0.7972444891929626, + "learning_rate": 1.9962560841002013e-05, + "loss": 0.8403134942054749, + "step": 87 + }, + { + "epoch": 0.2567000911577028, + "grad_norm": 0.8519951701164246, + "learning_rate": 1.995802986097093e-05, + "loss": 0.6897370219230652, + "step": 88 + }, + { + "epoch": 0.25961713764813127, + "grad_norm": 0.8268933892250061, + "learning_rate": 1.995324071324126e-05, + "loss": 0.6690632700920105, + "step": 89 + }, + { + "epoch": 0.2625341841385597, + "grad_norm": 0.7133983969688416, + "learning_rate": 1.9948193521974436e-05, + "loss": 0.6314147114753723, + "step": 90 + }, + { + "epoch": 0.2654512306289881, + "grad_norm": 0.889302134513855, + "learning_rate": 1.9942888418021814e-05, + "loss": 0.7389825582504272, + "step": 91 + }, + { + "epoch": 0.2683682771194166, + "grad_norm": 0.7022432088851929, + "learning_rate": 1.99373255389213e-05, + "loss": 0.6916261911392212, + "step": 92 + }, + { + "epoch": 0.27128532360984503, + "grad_norm": 0.696432888507843, + "learning_rate": 1.9931505028893748e-05, + "loss": 0.6908476948738098, + "step": 93 + }, + { + "epoch": 0.2742023701002735, + "grad_norm": 0.7667419910430908, + "learning_rate": 1.9925427038839267e-05, + "loss": 0.6500837206840515, + "step": 94 + }, + { + "epoch": 0.27711941659070194, + "grad_norm": 0.6974894404411316, + "learning_rate": 1.9919091726333265e-05, + "loss": 0.7059191465377808, + "step": 95 + }, + { + "epoch": 0.28003646308113034, + "grad_norm": 0.7047077417373657, + "learning_rate": 1.9912499255622397e-05, + "loss": 0.6287837624549866, + "step": 96 + }, + { + "epoch": 0.2829535095715588, + "grad_norm": 0.7729557156562805, + "learning_rate": 1.990564979762029e-05, + "loss": 0.6738612055778503, + "step": 97 + }, + { + "epoch": 0.28587055606198725, + "grad_norm": 0.7020529508590698, + "learning_rate": 1.989854352990311e-05, + "loss": 0.662042498588562, + "step": 98 + }, + { + "epoch": 0.2887876025524157, + "grad_norm": 0.7369800209999084, + "learning_rate": 1.9891180636704975e-05, + "loss": 0.6246830821037292, + "step": 99 + }, + { + "epoch": 0.2917046490428441, + "grad_norm": 0.7412623167037964, + "learning_rate": 1.9883561308913154e-05, + "loss": 0.6623879075050354, + "step": 100 + }, + { + "epoch": 0.2917046490428441, + "eval_loss": 0.6552971005439758, + "eval_runtime": 966.7072, + "eval_samples_per_second": 0.654, + "eval_steps_per_second": 0.654, + "step": 100 + }, + { + "epoch": 0.29462169553327255, + "grad_norm": 0.8428792953491211, + "learning_rate": 1.987568574406314e-05, + "loss": 0.6312171816825867, + "step": 101 + }, + { + "epoch": 0.297538742023701, + "grad_norm": 0.6948133707046509, + "learning_rate": 1.9867554146333517e-05, + "loss": 0.6266146898269653, + "step": 102 + }, + { + "epoch": 0.30045578851412946, + "grad_norm": 1.3897597789764404, + "learning_rate": 1.985916672654068e-05, + "loss": 0.6669265031814575, + "step": 103 + }, + { + "epoch": 0.30337283500455786, + "grad_norm": 0.8838400840759277, + "learning_rate": 1.985052370213334e-05, + "loss": 0.6601086854934692, + "step": 104 + }, + { + "epoch": 0.3062898814949863, + "grad_norm": 0.8471395373344421, + "learning_rate": 1.9841625297186925e-05, + "loss": 0.5984431505203247, + "step": 105 + }, + { + "epoch": 0.30920692798541477, + "grad_norm": 0.8940042853355408, + "learning_rate": 1.983247174239774e-05, + "loss": 0.7223822474479675, + "step": 106 + }, + { + "epoch": 0.3121239744758432, + "grad_norm": 0.7833696603775024, + "learning_rate": 1.9823063275076998e-05, + "loss": 0.6868705749511719, + "step": 107 + }, + { + "epoch": 0.3150410209662716, + "grad_norm": 0.8794649243354797, + "learning_rate": 1.9813400139144673e-05, + "loss": 0.6246675848960876, + "step": 108 + }, + { + "epoch": 0.3179580674567001, + "grad_norm": 0.8126057982444763, + "learning_rate": 1.9803482585123165e-05, + "loss": 0.5908697247505188, + "step": 109 + }, + { + "epoch": 0.32087511394712853, + "grad_norm": 0.7947676777839661, + "learning_rate": 1.979331087013082e-05, + "loss": 0.5751246809959412, + "step": 110 + }, + { + "epoch": 0.323792160437557, + "grad_norm": 0.713545560836792, + "learning_rate": 1.978288525787524e-05, + "loss": 0.6081106066703796, + "step": 111 + }, + { + "epoch": 0.32670920692798544, + "grad_norm": 1.011828064918518, + "learning_rate": 1.977220601864647e-05, + "loss": 0.7039169669151306, + "step": 112 + }, + { + "epoch": 0.32962625341841384, + "grad_norm": 0.730570912361145, + "learning_rate": 1.9761273429309982e-05, + "loss": 0.6140255928039551, + "step": 113 + }, + { + "epoch": 0.3325432999088423, + "grad_norm": 1.059688687324524, + "learning_rate": 1.9750087773299492e-05, + "loss": 0.648114025592804, + "step": 114 + }, + { + "epoch": 0.33546034639927075, + "grad_norm": 0.9336895942687988, + "learning_rate": 1.973864934060962e-05, + "loss": 0.622555673122406, + "step": 115 + }, + { + "epoch": 0.3383773928896992, + "grad_norm": 0.7195945978164673, + "learning_rate": 1.9726958427788367e-05, + "loss": 0.70485520362854, + "step": 116 + }, + { + "epoch": 0.3412944393801276, + "grad_norm": 0.8101872801780701, + "learning_rate": 1.971501533792942e-05, + "loss": 0.6958848834037781, + "step": 117 + }, + { + "epoch": 0.34421148587055606, + "grad_norm": 1.6075212955474854, + "learning_rate": 1.970282038066432e-05, + "loss": 0.6021550893783569, + "step": 118 + }, + { + "epoch": 0.3471285323609845, + "grad_norm": 0.7881433963775635, + "learning_rate": 1.9690373872154396e-05, + "loss": 0.6449777483940125, + "step": 119 + }, + { + "epoch": 0.35004557885141296, + "grad_norm": 1.014639973640442, + "learning_rate": 1.9677676135082606e-05, + "loss": 0.5939379930496216, + "step": 120 + }, + { + "epoch": 0.35296262534184136, + "grad_norm": 0.8198449611663818, + "learning_rate": 1.9664727498645144e-05, + "loss": 0.6210286617279053, + "step": 121 + }, + { + "epoch": 0.3558796718322698, + "grad_norm": 1.0194576978683472, + "learning_rate": 1.9651528298542918e-05, + "loss": 0.624247670173645, + "step": 122 + }, + { + "epoch": 0.35879671832269827, + "grad_norm": 0.7963470220565796, + "learning_rate": 1.9638078876972842e-05, + "loss": 0.6479315757751465, + "step": 123 + }, + { + "epoch": 0.3617137648131267, + "grad_norm": 0.9007541537284851, + "learning_rate": 1.9624379582618976e-05, + "loss": 0.6131505370140076, + "step": 124 + }, + { + "epoch": 0.3646308113035551, + "grad_norm": 0.8712120056152344, + "learning_rate": 1.9610430770643464e-05, + "loss": 0.6249448657035828, + "step": 125 + }, + { + "epoch": 0.3675478577939836, + "grad_norm": 1.1482540369033813, + "learning_rate": 1.9596232802677347e-05, + "loss": 0.5844688415527344, + "step": 126 + }, + { + "epoch": 0.37046490428441203, + "grad_norm": 0.8662379384040833, + "learning_rate": 1.9581786046811175e-05, + "loss": 0.6573485732078552, + "step": 127 + }, + { + "epoch": 0.3733819507748405, + "grad_norm": 0.8191388845443726, + "learning_rate": 1.9567090877585477e-05, + "loss": 0.5896862745285034, + "step": 128 + }, + { + "epoch": 0.37629899726526894, + "grad_norm": 1.0187078714370728, + "learning_rate": 1.955214767598103e-05, + "loss": 0.613490879535675, + "step": 129 + }, + { + "epoch": 0.37921604375569734, + "grad_norm": 0.8444119691848755, + "learning_rate": 1.953695682940901e-05, + "loss": 0.727687656879425, + "step": 130 + }, + { + "epoch": 0.3821330902461258, + "grad_norm": 0.74753737449646, + "learning_rate": 1.9521518731700913e-05, + "loss": 0.6102436780929565, + "step": 131 + }, + { + "epoch": 0.38505013673655425, + "grad_norm": 1.0166202783584595, + "learning_rate": 1.9505833783098378e-05, + "loss": 0.6244844198226929, + "step": 132 + }, + { + "epoch": 0.3879671832269827, + "grad_norm": 0.8175772428512573, + "learning_rate": 1.9489902390242793e-05, + "loss": 0.5939282178878784, + "step": 133 + }, + { + "epoch": 0.3908842297174111, + "grad_norm": 1.0177713632583618, + "learning_rate": 1.947372496616476e-05, + "loss": 0.6418229937553406, + "step": 134 + }, + { + "epoch": 0.39380127620783956, + "grad_norm": 0.8652453422546387, + "learning_rate": 1.9457301930273376e-05, + "loss": 0.5870395302772522, + "step": 135 + }, + { + "epoch": 0.396718322698268, + "grad_norm": 0.8378894925117493, + "learning_rate": 1.9440633708345365e-05, + "loss": 0.6480278372764587, + "step": 136 + }, + { + "epoch": 0.39963536918869647, + "grad_norm": 0.8303541541099548, + "learning_rate": 1.9423720732514052e-05, + "loss": 0.6191359758377075, + "step": 137 + }, + { + "epoch": 0.40255241567912486, + "grad_norm": 0.8576734662055969, + "learning_rate": 1.9406563441258145e-05, + "loss": 0.5696198344230652, + "step": 138 + }, + { + "epoch": 0.4054694621695533, + "grad_norm": 0.9558727145195007, + "learning_rate": 1.9389162279390362e-05, + "loss": 0.6177623271942139, + "step": 139 + }, + { + "epoch": 0.4083865086599818, + "grad_norm": 0.7046042084693909, + "learning_rate": 1.9371517698045922e-05, + "loss": 0.5836521983146667, + "step": 140 + }, + { + "epoch": 0.4113035551504102, + "grad_norm": 1.0522717237472534, + "learning_rate": 1.935363015467082e-05, + "loss": 0.5728275775909424, + "step": 141 + }, + { + "epoch": 0.4142206016408386, + "grad_norm": 0.9554787874221802, + "learning_rate": 1.933550011301e-05, + "loss": 0.632586658000946, + "step": 142 + }, + { + "epoch": 0.4171376481312671, + "grad_norm": 0.8874214291572571, + "learning_rate": 1.9317128043095293e-05, + "loss": 0.5850118398666382, + "step": 143 + }, + { + "epoch": 0.42005469462169553, + "grad_norm": 1.0708963871002197, + "learning_rate": 1.9298514421233276e-05, + "loss": 0.6260685324668884, + "step": 144 + }, + { + "epoch": 0.422971741112124, + "grad_norm": 0.8135736584663391, + "learning_rate": 1.9279659729992888e-05, + "loss": 0.6031094193458557, + "step": 145 + }, + { + "epoch": 0.42588878760255244, + "grad_norm": 0.7971774339675903, + "learning_rate": 1.9260564458192926e-05, + "loss": 0.6101322770118713, + "step": 146 + }, + { + "epoch": 0.42880583409298084, + "grad_norm": 0.9374974966049194, + "learning_rate": 1.9241229100889397e-05, + "loss": 0.5836313366889954, + "step": 147 + }, + { + "epoch": 0.4317228805834093, + "grad_norm": 0.8043425679206848, + "learning_rate": 1.9221654159362636e-05, + "loss": 0.6181215047836304, + "step": 148 + }, + { + "epoch": 0.43463992707383775, + "grad_norm": 0.8923380374908447, + "learning_rate": 1.920184014110436e-05, + "loss": 0.6149677634239197, + "step": 149 + }, + { + "epoch": 0.4375569735642662, + "grad_norm": 0.8908132314682007, + "learning_rate": 1.918178755980449e-05, + "loss": 0.5899742841720581, + "step": 150 + }, + { + "epoch": 0.4375569735642662, + "eval_loss": 0.5903874635696411, + "eval_runtime": 1186.9542, + "eval_samples_per_second": 0.532, + "eval_steps_per_second": 0.532, + "step": 150 + }, + { + "epoch": 0.4404740200546946, + "grad_norm": 1.060531497001648, + "learning_rate": 1.9161496935337808e-05, + "loss": 0.5852696895599365, + "step": 151 + }, + { + "epoch": 0.44339106654512306, + "grad_norm": 0.9723032712936401, + "learning_rate": 1.914096879375053e-05, + "loss": 0.5822056531906128, + "step": 152 + }, + { + "epoch": 0.4463081130355515, + "grad_norm": 0.9519931674003601, + "learning_rate": 1.912020366724663e-05, + "loss": 0.6183493137359619, + "step": 153 + }, + { + "epoch": 0.44922515952597997, + "grad_norm": 0.8282918334007263, + "learning_rate": 1.9099202094174055e-05, + "loss": 0.6229860782623291, + "step": 154 + }, + { + "epoch": 0.45214220601640837, + "grad_norm": 0.9251292943954468, + "learning_rate": 1.907796461901076e-05, + "loss": 0.6552959680557251, + "step": 155 + }, + { + "epoch": 0.4550592525068368, + "grad_norm": 1.0349540710449219, + "learning_rate": 1.9056491792350606e-05, + "loss": 0.6170098781585693, + "step": 156 + }, + { + "epoch": 0.4579762989972653, + "grad_norm": 0.8720711469650269, + "learning_rate": 1.9034784170889076e-05, + "loss": 0.5870137810707092, + "step": 157 + }, + { + "epoch": 0.46089334548769373, + "grad_norm": 1.0785977840423584, + "learning_rate": 1.9012842317408843e-05, + "loss": 0.5515124201774597, + "step": 158 + }, + { + "epoch": 0.4638103919781221, + "grad_norm": 1.0634154081344604, + "learning_rate": 1.8990666800765187e-05, + "loss": 0.6073828339576721, + "step": 159 + }, + { + "epoch": 0.4667274384685506, + "grad_norm": 0.8770879507064819, + "learning_rate": 1.896825819587123e-05, + "loss": 0.5960907936096191, + "step": 160 + }, + { + "epoch": 0.46964448495897904, + "grad_norm": 1.1225898265838623, + "learning_rate": 1.894561708368305e-05, + "loss": 0.545990526676178, + "step": 161 + }, + { + "epoch": 0.4725615314494075, + "grad_norm": 0.9373893141746521, + "learning_rate": 1.8922744051184613e-05, + "loss": 0.5566108822822571, + "step": 162 + }, + { + "epoch": 0.4754785779398359, + "grad_norm": 1.5016087293624878, + "learning_rate": 1.8899639691372545e-05, + "loss": 0.558845043182373, + "step": 163 + }, + { + "epoch": 0.47839562443026434, + "grad_norm": 0.903020977973938, + "learning_rate": 1.8876304603240773e-05, + "loss": 0.6824233531951904, + "step": 164 + }, + { + "epoch": 0.4813126709206928, + "grad_norm": 0.8239623308181763, + "learning_rate": 1.8852739391764993e-05, + "loss": 0.5630610585212708, + "step": 165 + }, + { + "epoch": 0.48422971741112125, + "grad_norm": 0.926069438457489, + "learning_rate": 1.882894466788697e-05, + "loss": 0.6211802363395691, + "step": 166 + }, + { + "epoch": 0.4871467639015497, + "grad_norm": 1.0098828077316284, + "learning_rate": 1.8804921048498722e-05, + "loss": 0.5513257384300232, + "step": 167 + }, + { + "epoch": 0.4900638103919781, + "grad_norm": 0.9228141903877258, + "learning_rate": 1.8780669156426517e-05, + "loss": 0.6197121739387512, + "step": 168 + }, + { + "epoch": 0.49298085688240656, + "grad_norm": 1.0551754236221313, + "learning_rate": 1.8756189620414712e-05, + "loss": 0.5221806764602661, + "step": 169 + }, + { + "epoch": 0.495897903372835, + "grad_norm": 0.9017496109008789, + "learning_rate": 1.873148307510948e-05, + "loss": 0.5766995549201965, + "step": 170 + }, + { + "epoch": 0.49881494986326347, + "grad_norm": 0.9704970717430115, + "learning_rate": 1.870655016104233e-05, + "loss": 0.6514763832092285, + "step": 171 + }, + { + "epoch": 0.5017319963536919, + "grad_norm": 0.9972712397575378, + "learning_rate": 1.8681391524613518e-05, + "loss": 0.5273895263671875, + "step": 172 + }, + { + "epoch": 0.5046490428441204, + "grad_norm": 0.9473339319229126, + "learning_rate": 1.8656007818075288e-05, + "loss": 0.5548599362373352, + "step": 173 + }, + { + "epoch": 0.5075660893345487, + "grad_norm": 1.2493574619293213, + "learning_rate": 1.8630399699514944e-05, + "loss": 0.5593586564064026, + "step": 174 + }, + { + "epoch": 0.5104831358249772, + "grad_norm": 1.2766696214675903, + "learning_rate": 1.860456783283781e-05, + "loss": 0.6054630279541016, + "step": 175 + }, + { + "epoch": 0.5134001823154056, + "grad_norm": 0.9555240869522095, + "learning_rate": 1.857851288775002e-05, + "loss": 0.508592963218689, + "step": 176 + }, + { + "epoch": 0.5163172288058341, + "grad_norm": 1.260219931602478, + "learning_rate": 1.8552235539741118e-05, + "loss": 0.5532065629959106, + "step": 177 + }, + { + "epoch": 0.5192342752962625, + "grad_norm": 1.1859954595565796, + "learning_rate": 1.8525736470066595e-05, + "loss": 0.5683344006538391, + "step": 178 + }, + { + "epoch": 0.522151321786691, + "grad_norm": 1.3044344186782837, + "learning_rate": 1.8499016365730203e-05, + "loss": 0.5281959772109985, + "step": 179 + }, + { + "epoch": 0.5250683682771194, + "grad_norm": 1.3049921989440918, + "learning_rate": 1.8472075919466137e-05, + "loss": 0.49621230363845825, + "step": 180 + }, + { + "epoch": 0.5279854147675479, + "grad_norm": 1.0488537549972534, + "learning_rate": 1.844491582972109e-05, + "loss": 0.6194032430648804, + "step": 181 + }, + { + "epoch": 0.5309024612579762, + "grad_norm": 1.5553455352783203, + "learning_rate": 1.8417536800636138e-05, + "loss": 0.5645846724510193, + "step": 182 + }, + { + "epoch": 0.5338195077484047, + "grad_norm": 1.2673912048339844, + "learning_rate": 1.8389939542028484e-05, + "loss": 0.6267315745353699, + "step": 183 + }, + { + "epoch": 0.5367365542388332, + "grad_norm": 1.0273847579956055, + "learning_rate": 1.8362124769373064e-05, + "loss": 0.5256403684616089, + "step": 184 + }, + { + "epoch": 0.5396536007292616, + "grad_norm": 1.006093978881836, + "learning_rate": 1.8334093203783986e-05, + "loss": 0.5916382074356079, + "step": 185 + }, + { + "epoch": 0.5425706472196901, + "grad_norm": 1.2740857601165771, + "learning_rate": 1.8305845571995843e-05, + "loss": 0.581648588180542, + "step": 186 + }, + { + "epoch": 0.5454876937101185, + "grad_norm": 1.494248390197754, + "learning_rate": 1.8277382606344872e-05, + "loss": 0.4824523627758026, + "step": 187 + }, + { + "epoch": 0.548404740200547, + "grad_norm": 1.1862496137619019, + "learning_rate": 1.824870504474996e-05, + "loss": 0.5531858205795288, + "step": 188 + }, + { + "epoch": 0.5513217866909754, + "grad_norm": 3.503049373626709, + "learning_rate": 1.8219813630693523e-05, + "loss": 0.6308296918869019, + "step": 189 + }, + { + "epoch": 0.5542388331814039, + "grad_norm": 1.7544710636138916, + "learning_rate": 1.819070911320222e-05, + "loss": 0.6146273016929626, + "step": 190 + }, + { + "epoch": 0.5571558796718322, + "grad_norm": 1.3367774486541748, + "learning_rate": 1.8161392246827546e-05, + "loss": 0.5848966240882874, + "step": 191 + }, + { + "epoch": 0.5600729261622607, + "grad_norm": 1.696418046951294, + "learning_rate": 1.8131863791626263e-05, + "loss": 0.6621730327606201, + "step": 192 + }, + { + "epoch": 0.5629899726526891, + "grad_norm": 1.360052227973938, + "learning_rate": 1.8102124513140694e-05, + "loss": 0.5972204208374023, + "step": 193 + }, + { + "epoch": 0.5659070191431176, + "grad_norm": 1.5376263856887817, + "learning_rate": 1.807217518237888e-05, + "loss": 0.4938785433769226, + "step": 194 + }, + { + "epoch": 0.568824065633546, + "grad_norm": 1.2249681949615479, + "learning_rate": 1.8042016575794585e-05, + "loss": 0.5366095304489136, + "step": 195 + }, + { + "epoch": 0.5717411121239745, + "grad_norm": 1.7868080139160156, + "learning_rate": 1.8011649475267178e-05, + "loss": 0.5116773843765259, + "step": 196 + }, + { + "epoch": 0.574658158614403, + "grad_norm": 2.369993209838867, + "learning_rate": 1.7981074668081345e-05, + "loss": 0.49072742462158203, + "step": 197 + }, + { + "epoch": 0.5775752051048314, + "grad_norm": 1.0168434381484985, + "learning_rate": 1.7950292946906695e-05, + "loss": 0.5691611170768738, + "step": 198 + }, + { + "epoch": 0.5804922515952597, + "grad_norm": 1.2990851402282715, + "learning_rate": 1.7919305109777195e-05, + "loss": 0.5515039563179016, + "step": 199 + }, + { + "epoch": 0.5834092980856882, + "grad_norm": 1.4859853982925415, + "learning_rate": 1.7888111960070493e-05, + "loss": 0.5017011165618896, + "step": 200 + }, + { + "epoch": 0.5834092980856882, + "eval_loss": 0.5414339303970337, + "eval_runtime": 1180.7894, + "eval_samples_per_second": 0.535, + "eval_steps_per_second": 0.535, + "step": 200 + }, + { + "epoch": 0.5863263445761167, + "grad_norm": 1.0065829753875732, + "learning_rate": 1.7856714306487088e-05, + "loss": 0.5677731037139893, + "step": 201 + }, + { + "epoch": 0.5892433910665451, + "grad_norm": 1.1727538108825684, + "learning_rate": 1.7825112963029352e-05, + "loss": 0.4525509476661682, + "step": 202 + }, + { + "epoch": 0.5921604375569736, + "grad_norm": 1.3376752138137817, + "learning_rate": 1.7793308748980437e-05, + "loss": 0.5208959579467773, + "step": 203 + }, + { + "epoch": 0.595077484047402, + "grad_norm": 0.9196159839630127, + "learning_rate": 1.776130248888304e-05, + "loss": 0.6033903360366821, + "step": 204 + }, + { + "epoch": 0.5979945305378305, + "grad_norm": 1.0750919580459595, + "learning_rate": 1.772909501251801e-05, + "loss": 0.5449609160423279, + "step": 205 + }, + { + "epoch": 0.6009115770282589, + "grad_norm": 1.2459467649459839, + "learning_rate": 1.769668715488285e-05, + "loss": 0.5685338377952576, + "step": 206 + }, + { + "epoch": 0.6038286235186874, + "grad_norm": 1.1690552234649658, + "learning_rate": 1.766407975617006e-05, + "loss": 0.5240382552146912, + "step": 207 + }, + { + "epoch": 0.6067456700091157, + "grad_norm": 1.0816599130630493, + "learning_rate": 1.7631273661745362e-05, + "loss": 0.6802893877029419, + "step": 208 + }, + { + "epoch": 0.6096627164995442, + "grad_norm": 1.3662947416305542, + "learning_rate": 1.7598269722125775e-05, + "loss": 0.48193931579589844, + "step": 209 + }, + { + "epoch": 0.6125797629899726, + "grad_norm": 0.9364766478538513, + "learning_rate": 1.7565068792957576e-05, + "loss": 0.5675849914550781, + "step": 210 + }, + { + "epoch": 0.6154968094804011, + "grad_norm": 1.123828411102295, + "learning_rate": 1.75316717349941e-05, + "loss": 0.5474762916564941, + "step": 211 + }, + { + "epoch": 0.6184138559708295, + "grad_norm": 1.1924363374710083, + "learning_rate": 1.749807941407345e-05, + "loss": 0.4918654263019562, + "step": 212 + }, + { + "epoch": 0.621330902461258, + "grad_norm": 1.101293921470642, + "learning_rate": 1.7464292701096014e-05, + "loss": 0.5742691159248352, + "step": 213 + }, + { + "epoch": 0.6242479489516864, + "grad_norm": 1.7374963760375977, + "learning_rate": 1.7430312472001928e-05, + "loss": 0.5828965902328491, + "step": 214 + }, + { + "epoch": 0.6271649954421149, + "grad_norm": 1.3195666074752808, + "learning_rate": 1.739613960774833e-05, + "loss": 0.5265159010887146, + "step": 215 + }, + { + "epoch": 0.6300820419325432, + "grad_norm": 1.254686713218689, + "learning_rate": 1.7361774994286545e-05, + "loss": 0.4929371476173401, + "step": 216 + }, + { + "epoch": 0.6329990884229717, + "grad_norm": 1.1476380825042725, + "learning_rate": 1.7327219522539102e-05, + "loss": 0.5060417652130127, + "step": 217 + }, + { + "epoch": 0.6359161349134002, + "grad_norm": 1.0914150476455688, + "learning_rate": 1.7292474088376643e-05, + "loss": 0.504043698310852, + "step": 218 + }, + { + "epoch": 0.6388331814038286, + "grad_norm": 1.1339508295059204, + "learning_rate": 1.7257539592594698e-05, + "loss": 0.4797310531139374, + "step": 219 + }, + { + "epoch": 0.6417502278942571, + "grad_norm": 1.0805399417877197, + "learning_rate": 1.722241694089033e-05, + "loss": 0.5878555178642273, + "step": 220 + }, + { + "epoch": 0.6446672743846855, + "grad_norm": 1.8615056276321411, + "learning_rate": 1.718710704383865e-05, + "loss": 0.5005823969841003, + "step": 221 + }, + { + "epoch": 0.647584320875114, + "grad_norm": 1.1445401906967163, + "learning_rate": 1.7151610816869214e-05, + "loss": 0.4949319064617157, + "step": 222 + }, + { + "epoch": 0.6505013673655424, + "grad_norm": 0.9726515412330627, + "learning_rate": 1.711592918024229e-05, + "loss": 0.5073204040527344, + "step": 223 + }, + { + "epoch": 0.6534184138559709, + "grad_norm": 1.4491140842437744, + "learning_rate": 1.7080063059024998e-05, + "loss": 0.47885262966156006, + "step": 224 + }, + { + "epoch": 0.6563354603463992, + "grad_norm": 1.0070592164993286, + "learning_rate": 1.7044013383067327e-05, + "loss": 0.5775837898254395, + "step": 225 + }, + { + "epoch": 0.6592525068368277, + "grad_norm": 0.966221272945404, + "learning_rate": 1.7007781086978037e-05, + "loss": 0.5050399899482727, + "step": 226 + }, + { + "epoch": 0.6621695533272561, + "grad_norm": 0.9808815121650696, + "learning_rate": 1.6971367110100407e-05, + "loss": 0.5737045407295227, + "step": 227 + }, + { + "epoch": 0.6650865998176846, + "grad_norm": 1.0158127546310425, + "learning_rate": 1.6934772396487906e-05, + "loss": 0.48077821731567383, + "step": 228 + }, + { + "epoch": 0.668003646308113, + "grad_norm": 1.32015860080719, + "learning_rate": 1.6897997894879706e-05, + "loss": 0.5614925026893616, + "step": 229 + }, + { + "epoch": 0.6709206927985415, + "grad_norm": 1.1055903434753418, + "learning_rate": 1.686104455867608e-05, + "loss": 0.4970760643482208, + "step": 230 + }, + { + "epoch": 0.67383773928897, + "grad_norm": 1.0804500579833984, + "learning_rate": 1.682391334591371e-05, + "loss": 0.5540452003479004, + "step": 231 + }, + { + "epoch": 0.6767547857793984, + "grad_norm": 1.1906245946884155, + "learning_rate": 1.6786605219240807e-05, + "loss": 0.5778501033782959, + "step": 232 + }, + { + "epoch": 0.6796718322698267, + "grad_norm": 0.9758645296096802, + "learning_rate": 1.6749121145892192e-05, + "loss": 0.49073565006256104, + "step": 233 + }, + { + "epoch": 0.6825888787602552, + "grad_norm": 1.1678364276885986, + "learning_rate": 1.6711462097664207e-05, + "loss": 0.4828741252422333, + "step": 234 + }, + { + "epoch": 0.6855059252506837, + "grad_norm": 1.148301362991333, + "learning_rate": 1.6673629050889507e-05, + "loss": 0.5143818855285645, + "step": 235 + }, + { + "epoch": 0.6884229717411121, + "grad_norm": 1.005898356437683, + "learning_rate": 1.6635622986411776e-05, + "loss": 0.5301160216331482, + "step": 236 + }, + { + "epoch": 0.6913400182315406, + "grad_norm": 1.2227320671081543, + "learning_rate": 1.659744488956027e-05, + "loss": 0.4800386130809784, + "step": 237 + }, + { + "epoch": 0.694257064721969, + "grad_norm": 0.986456573009491, + "learning_rate": 1.6559095750124296e-05, + "loss": 0.5098081827163696, + "step": 238 + }, + { + "epoch": 0.6971741112123975, + "grad_norm": 1.1474376916885376, + "learning_rate": 1.6520576562327518e-05, + "loss": 0.5147273540496826, + "step": 239 + }, + { + "epoch": 0.7000911577028259, + "grad_norm": 1.10917067527771, + "learning_rate": 1.6481888324802223e-05, + "loss": 0.5023190379142761, + "step": 240 + }, + { + "epoch": 0.7030082041932544, + "grad_norm": 1.2339262962341309, + "learning_rate": 1.644303204056341e-05, + "loss": 0.5282092690467834, + "step": 241 + }, + { + "epoch": 0.7059252506836827, + "grad_norm": 0.997941255569458, + "learning_rate": 1.640400871698277e-05, + "loss": 0.5635963082313538, + "step": 242 + }, + { + "epoch": 0.7088422971741112, + "grad_norm": 1.0345823764801025, + "learning_rate": 1.63648193657626e-05, + "loss": 0.5577977895736694, + "step": 243 + }, + { + "epoch": 0.7117593436645396, + "grad_norm": 1.3468303680419922, + "learning_rate": 1.6325465002909554e-05, + "loss": 0.4365362524986267, + "step": 244 + }, + { + "epoch": 0.7146763901549681, + "grad_norm": 1.2817128896713257, + "learning_rate": 1.628594664870831e-05, + "loss": 0.46069926023483276, + "step": 245 + }, + { + "epoch": 0.7175934366453965, + "grad_norm": 1.043311357498169, + "learning_rate": 1.6246265327695117e-05, + "loss": 0.5476971864700317, + "step": 246 + }, + { + "epoch": 0.720510483135825, + "grad_norm": 1.0297389030456543, + "learning_rate": 1.620642206863124e-05, + "loss": 0.48051249980926514, + "step": 247 + }, + { + "epoch": 0.7234275296262535, + "grad_norm": 1.4869836568832397, + "learning_rate": 1.6166417904476257e-05, + "loss": 0.5683314800262451, + "step": 248 + }, + { + "epoch": 0.7263445761166819, + "grad_norm": 1.0628005266189575, + "learning_rate": 1.6126253872361336e-05, + "loss": 0.5277887582778931, + "step": 249 + }, + { + "epoch": 0.7292616226071102, + "grad_norm": 1.2682170867919922, + "learning_rate": 1.608593101356229e-05, + "loss": 0.5048879384994507, + "step": 250 + }, + { + "epoch": 0.7292616226071102, + "eval_loss": 0.5038471221923828, + "eval_runtime": 1175.0375, + "eval_samples_per_second": 0.538, + "eval_steps_per_second": 0.538, + "step": 250 + }, + { + "epoch": 0.7321786690975387, + "grad_norm": 1.7376199960708618, + "learning_rate": 1.6045450373472626e-05, + "loss": 0.5093721151351929, + "step": 251 + }, + { + "epoch": 0.7350957155879672, + "grad_norm": 1.6047718524932861, + "learning_rate": 1.6004813001576405e-05, + "loss": 0.4796055555343628, + "step": 252 + }, + { + "epoch": 0.7380127620783956, + "grad_norm": 1.3582886457443237, + "learning_rate": 1.5964019951421058e-05, + "loss": 0.4733014702796936, + "step": 253 + }, + { + "epoch": 0.7409298085688241, + "grad_norm": 0.9468897581100464, + "learning_rate": 1.5923072280590072e-05, + "loss": 0.5312032103538513, + "step": 254 + }, + { + "epoch": 0.7438468550592525, + "grad_norm": 1.3890198469161987, + "learning_rate": 1.5881971050675547e-05, + "loss": 0.47576645016670227, + "step": 255 + }, + { + "epoch": 0.746763901549681, + "grad_norm": 1.782992959022522, + "learning_rate": 1.584071732725071e-05, + "loss": 0.5555092096328735, + "step": 256 + }, + { + "epoch": 0.7496809480401094, + "grad_norm": 1.1790621280670166, + "learning_rate": 1.5799312179842265e-05, + "loss": 0.5148727893829346, + "step": 257 + }, + { + "epoch": 0.7525979945305379, + "grad_norm": 1.446694254875183, + "learning_rate": 1.5757756681902664e-05, + "loss": 0.49939870834350586, + "step": 258 + }, + { + "epoch": 0.7555150410209662, + "grad_norm": 1.1786166429519653, + "learning_rate": 1.571605191078229e-05, + "loss": 0.562156081199646, + "step": 259 + }, + { + "epoch": 0.7584320875113947, + "grad_norm": 1.16925847530365, + "learning_rate": 1.567419894770151e-05, + "loss": 0.49580734968185425, + "step": 260 + }, + { + "epoch": 0.7613491340018231, + "grad_norm": 1.60944664478302, + "learning_rate": 1.5632198877722676e-05, + "loss": 0.4821680784225464, + "step": 261 + }, + { + "epoch": 0.7642661804922516, + "grad_norm": 1.3957884311676025, + "learning_rate": 1.5590052789721946e-05, + "loss": 0.4392276406288147, + "step": 262 + }, + { + "epoch": 0.76718322698268, + "grad_norm": 1.636195421218872, + "learning_rate": 1.5547761776361096e-05, + "loss": 0.39603114128112793, + "step": 263 + }, + { + "epoch": 0.7701002734731085, + "grad_norm": 1.496766448020935, + "learning_rate": 1.550532693405917e-05, + "loss": 0.4833749234676361, + "step": 264 + }, + { + "epoch": 0.773017319963537, + "grad_norm": 1.3587844371795654, + "learning_rate": 1.5462749362964058e-05, + "loss": 0.43738317489624023, + "step": 265 + }, + { + "epoch": 0.7759343664539654, + "grad_norm": 1.670704960823059, + "learning_rate": 1.5420030166923983e-05, + "loss": 0.4476737380027771, + "step": 266 + }, + { + "epoch": 0.7788514129443938, + "grad_norm": 1.2674932479858398, + "learning_rate": 1.537717045345888e-05, + "loss": 0.42266708612442017, + "step": 267 + }, + { + "epoch": 0.7817684594348222, + "grad_norm": 2.0639536380767822, + "learning_rate": 1.5334171333731666e-05, + "loss": 0.5245381593704224, + "step": 268 + }, + { + "epoch": 0.7846855059252507, + "grad_norm": 1.2091766595840454, + "learning_rate": 1.529103392251946e-05, + "loss": 0.5166443586349487, + "step": 269 + }, + { + "epoch": 0.7876025524156791, + "grad_norm": 1.1021631956100464, + "learning_rate": 1.5247759338184653e-05, + "loss": 0.5674265027046204, + "step": 270 + }, + { + "epoch": 0.7905195989061076, + "grad_norm": 1.3143829107284546, + "learning_rate": 1.520434870264595e-05, + "loss": 0.40855613350868225, + "step": 271 + }, + { + "epoch": 0.793436645396536, + "grad_norm": 1.1784812211990356, + "learning_rate": 1.5160803141349244e-05, + "loss": 0.4308925271034241, + "step": 272 + }, + { + "epoch": 0.7963536918869645, + "grad_norm": 2.1635706424713135, + "learning_rate": 1.5117123783238458e-05, + "loss": 0.45035502314567566, + "step": 273 + }, + { + "epoch": 0.7992707383773929, + "grad_norm": 1.569203495979309, + "learning_rate": 1.5073311760726287e-05, + "loss": 0.5095728635787964, + "step": 274 + }, + { + "epoch": 0.8021877848678214, + "grad_norm": 2.532621383666992, + "learning_rate": 1.5029368209664822e-05, + "loss": 0.496748685836792, + "step": 275 + }, + { + "epoch": 0.8051048313582497, + "grad_norm": 1.6312552690505981, + "learning_rate": 1.4985294269316098e-05, + "loss": 0.4972914159297943, + "step": 276 + }, + { + "epoch": 0.8080218778486782, + "grad_norm": 1.3996756076812744, + "learning_rate": 1.4941091082322579e-05, + "loss": 0.5589750409126282, + "step": 277 + }, + { + "epoch": 0.8109389243391066, + "grad_norm": 1.1288363933563232, + "learning_rate": 1.4896759794677526e-05, + "loss": 0.5349453687667847, + "step": 278 + }, + { + "epoch": 0.8138559708295351, + "grad_norm": 1.6913920640945435, + "learning_rate": 1.4852301555695268e-05, + "loss": 0.46511000394821167, + "step": 279 + }, + { + "epoch": 0.8167730173199635, + "grad_norm": 1.1913212537765503, + "learning_rate": 1.4807717517981439e-05, + "loss": 0.4715422987937927, + "step": 280 + }, + { + "epoch": 0.819690063810392, + "grad_norm": 1.1179691553115845, + "learning_rate": 1.476300883740307e-05, + "loss": 0.53330397605896, + "step": 281 + }, + { + "epoch": 0.8226071103008205, + "grad_norm": 1.7473797798156738, + "learning_rate": 1.4718176673058624e-05, + "loss": 0.47564437985420227, + "step": 282 + }, + { + "epoch": 0.8255241567912489, + "grad_norm": 1.2653177976608276, + "learning_rate": 1.4673222187247963e-05, + "loss": 0.46364277601242065, + "step": 283 + }, + { + "epoch": 0.8284412032816773, + "grad_norm": 1.2567330598831177, + "learning_rate": 1.4628146545442202e-05, + "loss": 0.4778091013431549, + "step": 284 + }, + { + "epoch": 0.8313582497721057, + "grad_norm": 1.5848406553268433, + "learning_rate": 1.4582950916253488e-05, + "loss": 0.4480203688144684, + "step": 285 + }, + { + "epoch": 0.8342752962625342, + "grad_norm": 1.3278183937072754, + "learning_rate": 1.453763647140472e-05, + "loss": 0.37945032119750977, + "step": 286 + }, + { + "epoch": 0.8371923427529626, + "grad_norm": 1.0961651802062988, + "learning_rate": 1.4492204385699155e-05, + "loss": 0.5306747555732727, + "step": 287 + }, + { + "epoch": 0.8401093892433911, + "grad_norm": 1.176276683807373, + "learning_rate": 1.4446655836989961e-05, + "loss": 0.49950045347213745, + "step": 288 + }, + { + "epoch": 0.8430264357338195, + "grad_norm": 1.2228269577026367, + "learning_rate": 1.4400992006149674e-05, + "loss": 0.494475394487381, + "step": 289 + }, + { + "epoch": 0.845943482224248, + "grad_norm": 1.1584209203720093, + "learning_rate": 1.4355214077039592e-05, + "loss": 0.44170859456062317, + "step": 290 + }, + { + "epoch": 0.8488605287146764, + "grad_norm": 1.2041938304901123, + "learning_rate": 1.4309323236479071e-05, + "loss": 0.4359871745109558, + "step": 291 + }, + { + "epoch": 0.8517775752051049, + "grad_norm": 1.279645562171936, + "learning_rate": 1.4263320674214762e-05, + "loss": 0.45031386613845825, + "step": 292 + }, + { + "epoch": 0.8546946216955332, + "grad_norm": 1.3958357572555542, + "learning_rate": 1.4217207582889769e-05, + "loss": 0.4832204580307007, + "step": 293 + }, + { + "epoch": 0.8576116681859617, + "grad_norm": 1.2788586616516113, + "learning_rate": 1.4170985158012725e-05, + "loss": 0.5154346227645874, + "step": 294 + }, + { + "epoch": 0.8605287146763901, + "grad_norm": 1.3634892702102661, + "learning_rate": 1.4124654597926795e-05, + "loss": 0.46777206659317017, + "step": 295 + }, + { + "epoch": 0.8634457611668186, + "grad_norm": 1.2719579935073853, + "learning_rate": 1.4078217103778619e-05, + "loss": 0.4247053265571594, + "step": 296 + }, + { + "epoch": 0.866362807657247, + "grad_norm": 2.890467643737793, + "learning_rate": 1.4031673879487161e-05, + "loss": 0.38349640369415283, + "step": 297 + }, + { + "epoch": 0.8692798541476755, + "grad_norm": 2.4354801177978516, + "learning_rate": 1.3985026131712499e-05, + "loss": 0.4134889543056488, + "step": 298 + }, + { + "epoch": 0.872196900638104, + "grad_norm": 1.0138323307037354, + "learning_rate": 1.3938275069824541e-05, + "loss": 0.5176680684089661, + "step": 299 + }, + { + "epoch": 0.8751139471285324, + "grad_norm": 1.2316186428070068, + "learning_rate": 1.389142190587168e-05, + "loss": 0.4818477928638458, + "step": 300 + }, + { + "epoch": 0.8751139471285324, + "eval_loss": 0.4752846360206604, + "eval_runtime": 1189.1666, + "eval_samples_per_second": 0.531, + "eval_steps_per_second": 0.531, + "step": 300 + }, + { + "epoch": 0.8780309936189608, + "grad_norm": 1.515487551689148, + "learning_rate": 1.384446785454936e-05, + "loss": 0.47766175866127014, + "step": 301 + }, + { + "epoch": 0.8809480401093892, + "grad_norm": 1.4357497692108154, + "learning_rate": 1.3797414133168591e-05, + "loss": 0.49297061562538147, + "step": 302 + }, + { + "epoch": 0.8838650865998177, + "grad_norm": 1.2523037195205688, + "learning_rate": 1.3750261961624383e-05, + "loss": 0.4629015326499939, + "step": 303 + }, + { + "epoch": 0.8867821330902461, + "grad_norm": 3.5790023803710938, + "learning_rate": 1.3703012562364124e-05, + "loss": 0.3773120045661926, + "step": 304 + }, + { + "epoch": 0.8896991795806746, + "grad_norm": 1.9305704832077026, + "learning_rate": 1.3655667160355892e-05, + "loss": 0.496719628572464, + "step": 305 + }, + { + "epoch": 0.892616226071103, + "grad_norm": 1.1506154537200928, + "learning_rate": 1.3608226983056687e-05, + "loss": 0.49487072229385376, + "step": 306 + }, + { + "epoch": 0.8955332725615315, + "grad_norm": 1.8046090602874756, + "learning_rate": 1.3560693260380614e-05, + "loss": 0.4910697937011719, + "step": 307 + }, + { + "epoch": 0.8984503190519599, + "grad_norm": 2.0088653564453125, + "learning_rate": 1.3513067224667e-05, + "loss": 0.508246660232544, + "step": 308 + }, + { + "epoch": 0.9013673655423883, + "grad_norm": 1.2966033220291138, + "learning_rate": 1.3465350110648437e-05, + "loss": 0.5125166177749634, + "step": 309 + }, + { + "epoch": 0.9042844120328167, + "grad_norm": 1.9976309537887573, + "learning_rate": 1.3417543155418775e-05, + "loss": 0.43942537903785706, + "step": 310 + }, + { + "epoch": 0.9072014585232452, + "grad_norm": 1.2663682699203491, + "learning_rate": 1.336964759840105e-05, + "loss": 0.4839101731777191, + "step": 311 + }, + { + "epoch": 0.9101185050136736, + "grad_norm": 1.1223328113555908, + "learning_rate": 1.3321664681315354e-05, + "loss": 0.48008066415786743, + "step": 312 + }, + { + "epoch": 0.9130355515041021, + "grad_norm": 1.5786972045898438, + "learning_rate": 1.3273595648146634e-05, + "loss": 0.47250309586524963, + "step": 313 + }, + { + "epoch": 0.9159525979945305, + "grad_norm": 1.2150241136550903, + "learning_rate": 1.322544174511245e-05, + "loss": 0.5149738788604736, + "step": 314 + }, + { + "epoch": 0.918869644484959, + "grad_norm": 1.3676542043685913, + "learning_rate": 1.3177204220630662e-05, + "loss": 0.4430195093154907, + "step": 315 + }, + { + "epoch": 0.9217866909753875, + "grad_norm": 1.0703285932540894, + "learning_rate": 1.3128884325287064e-05, + "loss": 0.4798983037471771, + "step": 316 + }, + { + "epoch": 0.9247037374658159, + "grad_norm": 1.3131535053253174, + "learning_rate": 1.308048331180296e-05, + "loss": 0.4241073727607727, + "step": 317 + }, + { + "epoch": 0.9276207839562443, + "grad_norm": 1.4485348463058472, + "learning_rate": 1.3032002435002698e-05, + "loss": 0.527199923992157, + "step": 318 + }, + { + "epoch": 0.9305378304466727, + "grad_norm": 1.370936393737793, + "learning_rate": 1.2983442951781114e-05, + "loss": 0.47125962376594543, + "step": 319 + }, + { + "epoch": 0.9334548769371012, + "grad_norm": 1.2369643449783325, + "learning_rate": 1.2934806121070973e-05, + "loss": 0.4814244210720062, + "step": 320 + }, + { + "epoch": 0.9363719234275296, + "grad_norm": 1.2632933855056763, + "learning_rate": 1.2886093203810314e-05, + "loss": 0.4915548264980316, + "step": 321 + }, + { + "epoch": 0.9392889699179581, + "grad_norm": 1.054569959640503, + "learning_rate": 1.2837305462909764e-05, + "loss": 0.5325602293014526, + "step": 322 + }, + { + "epoch": 0.9422060164083865, + "grad_norm": 1.15959632396698, + "learning_rate": 1.27884441632198e-05, + "loss": 0.43607404828071594, + "step": 323 + }, + { + "epoch": 0.945123062898815, + "grad_norm": 1.1667979955673218, + "learning_rate": 1.2739510571497945e-05, + "loss": 0.4631507992744446, + "step": 324 + }, + { + "epoch": 0.9480401093892434, + "grad_norm": 1.6009081602096558, + "learning_rate": 1.2690505956375944e-05, + "loss": 0.4935731887817383, + "step": 325 + }, + { + "epoch": 0.9509571558796718, + "grad_norm": 1.1193996667861938, + "learning_rate": 1.2641431588326858e-05, + "loss": 0.45883435010910034, + "step": 326 + }, + { + "epoch": 0.9538742023701002, + "grad_norm": 1.5365067720413208, + "learning_rate": 1.2592288739632138e-05, + "loss": 0.5206276178359985, + "step": 327 + }, + { + "epoch": 0.9567912488605287, + "grad_norm": 1.0714622735977173, + "learning_rate": 1.2543078684348632e-05, + "loss": 0.5242853760719299, + "step": 328 + }, + { + "epoch": 0.9597082953509571, + "grad_norm": 1.3009248971939087, + "learning_rate": 1.2493802698275557e-05, + "loss": 0.4794357717037201, + "step": 329 + }, + { + "epoch": 0.9626253418413856, + "grad_norm": 1.495771050453186, + "learning_rate": 1.244446205892143e-05, + "loss": 0.5849282145500183, + "step": 330 + }, + { + "epoch": 0.965542388331814, + "grad_norm": 1.2046003341674805, + "learning_rate": 1.2395058045470935e-05, + "loss": 0.47758305072784424, + "step": 331 + }, + { + "epoch": 0.9684594348222425, + "grad_norm": 1.1362569332122803, + "learning_rate": 1.2345591938751772e-05, + "loss": 0.4490663409233093, + "step": 332 + }, + { + "epoch": 0.971376481312671, + "grad_norm": 1.2658129930496216, + "learning_rate": 1.2296065021201438e-05, + "loss": 0.4035309851169586, + "step": 333 + }, + { + "epoch": 0.9742935278030994, + "grad_norm": 4.370306015014648, + "learning_rate": 1.2246478576833993e-05, + "loss": 0.495273619890213, + "step": 334 + }, + { + "epoch": 0.9772105742935278, + "grad_norm": 1.3863654136657715, + "learning_rate": 1.219683389120676e-05, + "loss": 0.46410733461380005, + "step": 335 + }, + { + "epoch": 0.9801276207839562, + "grad_norm": 1.4544321298599243, + "learning_rate": 1.2147132251387004e-05, + "loss": 0.4301709830760956, + "step": 336 + }, + { + "epoch": 0.9830446672743847, + "grad_norm": 1.0852457284927368, + "learning_rate": 1.2097374945918554e-05, + "loss": 0.48892468214035034, + "step": 337 + }, + { + "epoch": 0.9859617137648131, + "grad_norm": 1.5062257051467896, + "learning_rate": 1.2047563264788412e-05, + "loss": 0.4667983055114746, + "step": 338 + }, + { + "epoch": 0.9888787602552416, + "grad_norm": 1.2472951412200928, + "learning_rate": 1.199769849939329e-05, + "loss": 0.4827345013618469, + "step": 339 + }, + { + "epoch": 0.99179580674567, + "grad_norm": 1.2589871883392334, + "learning_rate": 1.1947781942506151e-05, + "loss": 0.405245304107666, + "step": 340 + }, + { + "epoch": 0.9947128532360985, + "grad_norm": 1.25636625289917, + "learning_rate": 1.1897814888242679e-05, + "loss": 0.37956133484840393, + "step": 341 + }, + { + "epoch": 0.9976298997265269, + "grad_norm": 2.7064895629882812, + "learning_rate": 1.1847798632027726e-05, + "loss": 0.489456444978714, + "step": 342 + }, + { + "epoch": 1.0, + "grad_norm": 1.6156240701675415, + "learning_rate": 1.1797734470561744e-05, + "loss": 0.46473199129104614, + "step": 343 + }, + { + "epoch": 1.0029170464904285, + "grad_norm": 1.3046343326568604, + "learning_rate": 1.1747623701787143e-05, + "loss": 0.3504878282546997, + "step": 344 + }, + { + "epoch": 1.005834092980857, + "grad_norm": 1.414828896522522, + "learning_rate": 1.1697467624854666e-05, + "loss": 0.4719260334968567, + "step": 345 + }, + { + "epoch": 1.0087511394712854, + "grad_norm": 1.1873356103897095, + "learning_rate": 1.164726754008969e-05, + "loss": 0.45313555002212524, + "step": 346 + }, + { + "epoch": 1.0116681859617138, + "grad_norm": 1.1382380723953247, + "learning_rate": 1.1597024748958526e-05, + "loss": 0.4365478456020355, + "step": 347 + }, + { + "epoch": 1.0145852324521423, + "grad_norm": 1.8141961097717285, + "learning_rate": 1.1546740554034661e-05, + "loss": 0.3694503605365753, + "step": 348 + }, + { + "epoch": 1.0175022789425707, + "grad_norm": 1.333388328552246, + "learning_rate": 1.1496416258965015e-05, + "loss": 0.4755721688270569, + "step": 349 + }, + { + "epoch": 1.0204193254329992, + "grad_norm": 1.3464443683624268, + "learning_rate": 1.1446053168436117e-05, + "loss": 0.4227846562862396, + "step": 350 + }, + { + "epoch": 1.0204193254329992, + "eval_loss": 0.44924086332321167, + "eval_runtime": 1214.6648, + "eval_samples_per_second": 0.52, + "eval_steps_per_second": 0.52, + "step": 350 + }, + { + "epoch": 1.0233363719234276, + "grad_norm": 1.2682689428329468, + "learning_rate": 1.1395652588140292e-05, + "loss": 0.44300130009651184, + "step": 351 + }, + { + "epoch": 1.0262534184138559, + "grad_norm": 1.7737696170806885, + "learning_rate": 1.1345215824741814e-05, + "loss": 0.5106258988380432, + "step": 352 + }, + { + "epoch": 1.0291704649042843, + "grad_norm": 1.2601238489151, + "learning_rate": 1.1294744185843014e-05, + "loss": 0.45930635929107666, + "step": 353 + }, + { + "epoch": 1.0320875113947128, + "grad_norm": 1.2162678241729736, + "learning_rate": 1.1244238979950406e-05, + "loss": 0.44163084030151367, + "step": 354 + }, + { + "epoch": 1.0350045578851412, + "grad_norm": 1.0905817747116089, + "learning_rate": 1.1193701516440733e-05, + "loss": 0.510662317276001, + "step": 355 + }, + { + "epoch": 1.0379216043755697, + "grad_norm": 0.9624952673912048, + "learning_rate": 1.1143133105527048e-05, + "loss": 0.5297917127609253, + "step": 356 + }, + { + "epoch": 1.0408386508659981, + "grad_norm": 1.2757681608200073, + "learning_rate": 1.1092535058224725e-05, + "loss": 0.4332093596458435, + "step": 357 + }, + { + "epoch": 1.0437556973564266, + "grad_norm": 1.6885719299316406, + "learning_rate": 1.104190868631748e-05, + "loss": 0.4337635040283203, + "step": 358 + }, + { + "epoch": 1.046672743846855, + "grad_norm": 1.175484538078308, + "learning_rate": 1.099125530232336e-05, + "loss": 0.45411020517349243, + "step": 359 + }, + { + "epoch": 1.0495897903372835, + "grad_norm": 1.0964939594268799, + "learning_rate": 1.0940576219460723e-05, + "loss": 0.5333439707756042, + "step": 360 + }, + { + "epoch": 1.052506836827712, + "grad_norm": 1.5493136644363403, + "learning_rate": 1.0889872751614176e-05, + "loss": 0.4400906264781952, + "step": 361 + }, + { + "epoch": 1.0554238833181404, + "grad_norm": 1.2491416931152344, + "learning_rate": 1.0839146213300526e-05, + "loss": 0.31049978733062744, + "step": 362 + }, + { + "epoch": 1.0583409298085689, + "grad_norm": 1.7213693857192993, + "learning_rate": 1.0788397919634694e-05, + "loss": 0.389009028673172, + "step": 363 + }, + { + "epoch": 1.0612579762989973, + "grad_norm": 1.5405336618423462, + "learning_rate": 1.0737629186295621e-05, + "loss": 0.4068562984466553, + "step": 364 + }, + { + "epoch": 1.0641750227894258, + "grad_norm": 1.225455641746521, + "learning_rate": 1.0686841329492159e-05, + "loss": 0.47358617186546326, + "step": 365 + }, + { + "epoch": 1.0670920692798542, + "grad_norm": 1.3436250686645508, + "learning_rate": 1.0636035665928945e-05, + "loss": 0.47050854563713074, + "step": 366 + }, + { + "epoch": 1.0700091157702827, + "grad_norm": 1.4952112436294556, + "learning_rate": 1.058521351277227e-05, + "loss": 0.43496906757354736, + "step": 367 + }, + { + "epoch": 1.072926162260711, + "grad_norm": 1.549112319946289, + "learning_rate": 1.0534376187615924e-05, + "loss": 0.45711052417755127, + "step": 368 + }, + { + "epoch": 1.0758432087511394, + "grad_norm": 1.3851526975631714, + "learning_rate": 1.048352500844704e-05, + "loss": 0.45045915246009827, + "step": 369 + }, + { + "epoch": 1.0787602552415678, + "grad_norm": 1.6302049160003662, + "learning_rate": 1.0432661293611927e-05, + "loss": 0.3736046254634857, + "step": 370 + }, + { + "epoch": 1.0816773017319963, + "grad_norm": 1.3365869522094727, + "learning_rate": 1.0381786361781885e-05, + "loss": 0.42242100834846497, + "step": 371 + }, + { + "epoch": 1.0845943482224247, + "grad_norm": 1.4369138479232788, + "learning_rate": 1.0330901531919026e-05, + "loss": 0.44570961594581604, + "step": 372 + }, + { + "epoch": 1.0875113947128532, + "grad_norm": 1.3528283834457397, + "learning_rate": 1.0280008123242069e-05, + "loss": 0.43440738320350647, + "step": 373 + }, + { + "epoch": 1.0904284412032816, + "grad_norm": 1.469660997390747, + "learning_rate": 1.0229107455192147e-05, + "loss": 0.3960394263267517, + "step": 374 + }, + { + "epoch": 1.09334548769371, + "grad_norm": 1.4542185068130493, + "learning_rate": 1.0178200847398595e-05, + "loss": 0.47834208607673645, + "step": 375 + }, + { + "epoch": 1.0962625341841385, + "grad_norm": 1.6470292806625366, + "learning_rate": 1.0127289619644737e-05, + "loss": 0.42791086435317993, + "step": 376 + }, + { + "epoch": 1.099179580674567, + "grad_norm": 1.1934021711349487, + "learning_rate": 1.0076375091833681e-05, + "loss": 0.4401305019855499, + "step": 377 + }, + { + "epoch": 1.1020966271649955, + "grad_norm": 0.9786668419837952, + "learning_rate": 1.0025458583954078e-05, + "loss": 0.4816555678844452, + "step": 378 + }, + { + "epoch": 1.105013673655424, + "grad_norm": 1.1348779201507568, + "learning_rate": 9.974541416045924e-06, + "loss": 0.41516968607902527, + "step": 379 + }, + { + "epoch": 1.1079307201458524, + "grad_norm": 1.0188615322113037, + "learning_rate": 9.923624908166322e-06, + "loss": 0.48087278008461, + "step": 380 + }, + { + "epoch": 1.1108477666362808, + "grad_norm": 1.0821740627288818, + "learning_rate": 9.872710380355263e-06, + "loss": 0.41974008083343506, + "step": 381 + }, + { + "epoch": 1.1137648131267093, + "grad_norm": 1.250951886177063, + "learning_rate": 9.82179915260141e-06, + "loss": 0.42703643441200256, + "step": 382 + }, + { + "epoch": 1.1166818596171377, + "grad_norm": 1.4528254270553589, + "learning_rate": 9.770892544807856e-06, + "loss": 0.43801453709602356, + "step": 383 + }, + { + "epoch": 1.1195989061075662, + "grad_norm": 1.813859462738037, + "learning_rate": 9.719991876757934e-06, + "loss": 0.4344240725040436, + "step": 384 + }, + { + "epoch": 1.1225159525979946, + "grad_norm": 1.6681253910064697, + "learning_rate": 9.669098468080976e-06, + "loss": 0.4356998801231384, + "step": 385 + }, + { + "epoch": 1.125432999088423, + "grad_norm": 1.3447953462600708, + "learning_rate": 9.618213638218117e-06, + "loss": 0.43189188838005066, + "step": 386 + }, + { + "epoch": 1.1283500455788513, + "grad_norm": 1.9577926397323608, + "learning_rate": 9.567338706388074e-06, + "loss": 0.34984707832336426, + "step": 387 + }, + { + "epoch": 1.1312670920692798, + "grad_norm": 1.5225576162338257, + "learning_rate": 9.516474991552965e-06, + "loss": 0.4243963062763214, + "step": 388 + }, + { + "epoch": 1.1341841385597082, + "grad_norm": 1.7416809797286987, + "learning_rate": 9.46562381238408e-06, + "loss": 0.3414606750011444, + "step": 389 + }, + { + "epoch": 1.1371011850501367, + "grad_norm": 1.8358951807022095, + "learning_rate": 9.414786487227732e-06, + "loss": 0.387447327375412, + "step": 390 + }, + { + "epoch": 1.1400182315405651, + "grad_norm": 1.9706153869628906, + "learning_rate": 9.363964334071057e-06, + "loss": 0.4599088728427887, + "step": 391 + }, + { + "epoch": 1.1429352780309936, + "grad_norm": 1.0604286193847656, + "learning_rate": 9.313158670507843e-06, + "loss": 0.4633581042289734, + "step": 392 + }, + { + "epoch": 1.145852324521422, + "grad_norm": 1.4851202964782715, + "learning_rate": 9.262370813704379e-06, + "loss": 0.3872259557247162, + "step": 393 + }, + { + "epoch": 1.1487693710118505, + "grad_norm": 1.7839159965515137, + "learning_rate": 9.21160208036531e-06, + "loss": 0.5215944647789001, + "step": 394 + }, + { + "epoch": 1.151686417502279, + "grad_norm": 1.3054656982421875, + "learning_rate": 9.160853786699475e-06, + "loss": 0.4030425548553467, + "step": 395 + }, + { + "epoch": 1.1546034639927074, + "grad_norm": 3.8467981815338135, + "learning_rate": 9.110127248385827e-06, + "loss": 0.4032524824142456, + "step": 396 + }, + { + "epoch": 1.1575205104831359, + "grad_norm": 1.8513801097869873, + "learning_rate": 9.05942378053928e-06, + "loss": 0.46577155590057373, + "step": 397 + }, + { + "epoch": 1.1604375569735643, + "grad_norm": 1.312689185142517, + "learning_rate": 9.008744697676642e-06, + "loss": 0.39114487171173096, + "step": 398 + }, + { + "epoch": 1.1633546034639928, + "grad_norm": 1.1996328830718994, + "learning_rate": 8.958091313682521e-06, + "loss": 0.481199711561203, + "step": 399 + }, + { + "epoch": 1.1662716499544212, + "grad_norm": 5.172409534454346, + "learning_rate": 8.90746494177528e-06, + "loss": 0.3803558945655823, + "step": 400 + }, + { + "epoch": 1.1662716499544212, + "eval_loss": 0.4318464398384094, + "eval_runtime": 1206.0306, + "eval_samples_per_second": 0.524, + "eval_steps_per_second": 0.524, + "step": 400 + }, + { + "epoch": 1.1691886964448497, + "grad_norm": 1.0115015506744385, + "learning_rate": 8.856866894472954e-06, + "loss": 0.39636704325675964, + "step": 401 + }, + { + "epoch": 1.172105742935278, + "grad_norm": 1.1557435989379883, + "learning_rate": 8.806298483559268e-06, + "loss": 0.4076298475265503, + "step": 402 + }, + { + "epoch": 1.1750227894257064, + "grad_norm": 1.2802515029907227, + "learning_rate": 8.755761020049597e-06, + "loss": 0.44352248311042786, + "step": 403 + }, + { + "epoch": 1.1779398359161348, + "grad_norm": 1.2755069732666016, + "learning_rate": 8.705255814156988e-06, + "loss": 0.390497624874115, + "step": 404 + }, + { + "epoch": 1.1808568824065633, + "grad_norm": 1.2799782752990723, + "learning_rate": 8.654784175258188e-06, + "loss": 0.35810694098472595, + "step": 405 + }, + { + "epoch": 1.1837739288969917, + "grad_norm": 1.0968674421310425, + "learning_rate": 8.604347411859713e-06, + "loss": 0.3890265226364136, + "step": 406 + }, + { + "epoch": 1.1866909753874202, + "grad_norm": 1.3334455490112305, + "learning_rate": 8.553946831563886e-06, + "loss": 0.3916901648044586, + "step": 407 + }, + { + "epoch": 1.1896080218778486, + "grad_norm": 1.1888184547424316, + "learning_rate": 8.503583741034988e-06, + "loss": 0.5231326222419739, + "step": 408 + }, + { + "epoch": 1.192525068368277, + "grad_norm": 1.1163763999938965, + "learning_rate": 8.45325944596534e-06, + "loss": 0.4249858558177948, + "step": 409 + }, + { + "epoch": 1.1954421148587056, + "grad_norm": 1.3470333814620972, + "learning_rate": 8.40297525104148e-06, + "loss": 0.5201632380485535, + "step": 410 + }, + { + "epoch": 1.198359161349134, + "grad_norm": 1.5412285327911377, + "learning_rate": 8.35273245991031e-06, + "loss": 0.39376699924468994, + "step": 411 + }, + { + "epoch": 1.2012762078395625, + "grad_norm": 1.3408735990524292, + "learning_rate": 8.302532375145339e-06, + "loss": 0.39554283022880554, + "step": 412 + }, + { + "epoch": 1.204193254329991, + "grad_norm": 1.990668773651123, + "learning_rate": 8.25237629821286e-06, + "loss": 0.42424261569976807, + "step": 413 + }, + { + "epoch": 1.2071103008204194, + "grad_norm": 1.6471989154815674, + "learning_rate": 8.202265529438259e-06, + "loss": 0.3234582543373108, + "step": 414 + }, + { + "epoch": 1.2100273473108478, + "grad_norm": 1.1483631134033203, + "learning_rate": 8.152201367972275e-06, + "loss": 0.39163246750831604, + "step": 415 + }, + { + "epoch": 1.2129443938012763, + "grad_norm": 1.800149917602539, + "learning_rate": 8.102185111757323e-06, + "loss": 0.5055042505264282, + "step": 416 + }, + { + "epoch": 1.2158614402917047, + "grad_norm": 1.4394795894622803, + "learning_rate": 8.052218057493849e-06, + "loss": 0.4761751592159271, + "step": 417 + }, + { + "epoch": 1.2187784867821332, + "grad_norm": 1.622689962387085, + "learning_rate": 8.002301500606715e-06, + "loss": 0.4490141272544861, + "step": 418 + }, + { + "epoch": 1.2216955332725616, + "grad_norm": 1.2564961910247803, + "learning_rate": 7.952436735211593e-06, + "loss": 0.3964035212993622, + "step": 419 + }, + { + "epoch": 1.22461257976299, + "grad_norm": 1.3248411417007446, + "learning_rate": 7.902625054081449e-06, + "loss": 0.46039122343063354, + "step": 420 + }, + { + "epoch": 1.2275296262534183, + "grad_norm": 1.568983793258667, + "learning_rate": 7.852867748613e-06, + "loss": 0.49916595220565796, + "step": 421 + }, + { + "epoch": 1.2304466727438468, + "grad_norm": 1.4784491062164307, + "learning_rate": 7.803166108793243e-06, + "loss": 0.4035068154335022, + "step": 422 + }, + { + "epoch": 1.2333637192342752, + "grad_norm": 1.2940057516098022, + "learning_rate": 7.753521423166007e-06, + "loss": 0.4154140055179596, + "step": 423 + }, + { + "epoch": 1.2362807657247037, + "grad_norm": 1.167786717414856, + "learning_rate": 7.703934978798565e-06, + "loss": 0.39541637897491455, + "step": 424 + }, + { + "epoch": 1.2391978122151321, + "grad_norm": 1.5126771926879883, + "learning_rate": 7.65440806124823e-06, + "loss": 0.37744253873825073, + "step": 425 + }, + { + "epoch": 1.2421148587055606, + "grad_norm": 1.2595263719558716, + "learning_rate": 7.604941954529067e-06, + "loss": 0.46380615234375, + "step": 426 + }, + { + "epoch": 1.245031905195989, + "grad_norm": 1.4258298873901367, + "learning_rate": 7.555537941078573e-06, + "loss": 0.3391319513320923, + "step": 427 + }, + { + "epoch": 1.2479489516864175, + "grad_norm": 1.5371774435043335, + "learning_rate": 7.506197301724446e-06, + "loss": 0.39805102348327637, + "step": 428 + }, + { + "epoch": 1.250865998176846, + "grad_norm": 1.3789173364639282, + "learning_rate": 7.456921315651371e-06, + "loss": 0.37969034910202026, + "step": 429 + }, + { + "epoch": 1.2537830446672744, + "grad_norm": 1.32931649684906, + "learning_rate": 7.407711260367867e-06, + "loss": 0.3841526508331299, + "step": 430 + }, + { + "epoch": 1.2567000911577029, + "grad_norm": 1.2836817502975464, + "learning_rate": 7.358568411673145e-06, + "loss": 0.340289443731308, + "step": 431 + }, + { + "epoch": 1.2596171376481313, + "grad_norm": 1.0418318510055542, + "learning_rate": 7.309494043624059e-06, + "loss": 0.44747158885002136, + "step": 432 + }, + { + "epoch": 1.2625341841385598, + "grad_norm": 1.1769362688064575, + "learning_rate": 7.260489428502058e-06, + "loss": 0.45737382769584656, + "step": 433 + }, + { + "epoch": 1.265451230628988, + "grad_norm": 2.2730748653411865, + "learning_rate": 7.211555836780203e-06, + "loss": 0.3827931582927704, + "step": 434 + }, + { + "epoch": 1.2683682771194165, + "grad_norm": 1.263096809387207, + "learning_rate": 7.162694537090235e-06, + "loss": 0.3589435815811157, + "step": 435 + }, + { + "epoch": 1.271285323609845, + "grad_norm": 1.4073514938354492, + "learning_rate": 7.113906796189692e-06, + "loss": 0.45206642150878906, + "step": 436 + }, + { + "epoch": 1.2742023701002734, + "grad_norm": 1.064585566520691, + "learning_rate": 7.0651938789290306e-06, + "loss": 0.5409261584281921, + "step": 437 + }, + { + "epoch": 1.2771194165907018, + "grad_norm": 1.2346999645233154, + "learning_rate": 7.016557048218889e-06, + "loss": 0.40680158138275146, + "step": 438 + }, + { + "epoch": 1.2800364630811303, + "grad_norm": 1.5816547870635986, + "learning_rate": 6.967997564997306e-06, + "loss": 0.38718655705451965, + "step": 439 + }, + { + "epoch": 1.2829535095715587, + "grad_norm": 1.085268259048462, + "learning_rate": 6.919516688197041e-06, + "loss": 0.4863276779651642, + "step": 440 + }, + { + "epoch": 1.2858705560619872, + "grad_norm": 1.0984629392623901, + "learning_rate": 6.871115674712937e-06, + "loss": 0.39562875032424927, + "step": 441 + }, + { + "epoch": 1.2887876025524156, + "grad_norm": 1.3004229068756104, + "learning_rate": 6.822795779369339e-06, + "loss": 0.44437694549560547, + "step": 442 + }, + { + "epoch": 1.291704649042844, + "grad_norm": 1.3541183471679688, + "learning_rate": 6.774558254887553e-06, + "loss": 0.4728967249393463, + "step": 443 + }, + { + "epoch": 1.2946216955332726, + "grad_norm": 1.2485377788543701, + "learning_rate": 6.7264043518533695e-06, + "loss": 0.4052809476852417, + "step": 444 + }, + { + "epoch": 1.297538742023701, + "grad_norm": 1.412827730178833, + "learning_rate": 6.67833531868465e-06, + "loss": 0.40149861574172974, + "step": 445 + }, + { + "epoch": 1.3004557885141295, + "grad_norm": 1.5576224327087402, + "learning_rate": 6.630352401598953e-06, + "loss": 0.44107240438461304, + "step": 446 + }, + { + "epoch": 1.303372835004558, + "grad_norm": 1.1551047563552856, + "learning_rate": 6.582456844581226e-06, + "loss": 0.4898405969142914, + "step": 447 + }, + { + "epoch": 1.3062898814949864, + "grad_norm": 1.9939689636230469, + "learning_rate": 6.5346498893515645e-06, + "loss": 0.4791329801082611, + "step": 448 + }, + { + "epoch": 1.3092069279854148, + "grad_norm": 1.4782553911209106, + "learning_rate": 6.486932775333002e-06, + "loss": 0.472908616065979, + "step": 449 + }, + { + "epoch": 1.3121239744758433, + "grad_norm": 1.2496148347854614, + "learning_rate": 6.439306739619387e-06, + "loss": 0.514995276927948, + "step": 450 + }, + { + "epoch": 1.3121239744758433, + "eval_loss": 0.4178673028945923, + "eval_runtime": 1197.5534, + "eval_samples_per_second": 0.528, + "eval_steps_per_second": 0.528, + "step": 450 + }, + { + "epoch": 1.3150410209662717, + "grad_norm": 1.3996772766113281, + "learning_rate": 6.391773016943316e-06, + "loss": 0.4087896943092346, + "step": 451 + }, + { + "epoch": 1.3179580674567002, + "grad_norm": 1.20390784740448, + "learning_rate": 6.344332839644111e-06, + "loss": 0.43224579095840454, + "step": 452 + }, + { + "epoch": 1.3208751139471286, + "grad_norm": 1.2709496021270752, + "learning_rate": 6.296987437635876e-06, + "loss": 0.44104093313217163, + "step": 453 + }, + { + "epoch": 1.323792160437557, + "grad_norm": 1.0112334489822388, + "learning_rate": 6.249738038375618e-06, + "loss": 0.47084498405456543, + "step": 454 + }, + { + "epoch": 1.3267092069279856, + "grad_norm": 1.0771515369415283, + "learning_rate": 6.202585866831411e-06, + "loss": 0.4700928032398224, + "step": 455 + }, + { + "epoch": 1.3296262534184138, + "grad_norm": 1.4937143325805664, + "learning_rate": 6.15553214545064e-06, + "loss": 0.345747709274292, + "step": 456 + }, + { + "epoch": 1.3325432999088422, + "grad_norm": 1.1348456144332886, + "learning_rate": 6.108578094128321e-06, + "loss": 0.33824583888053894, + "step": 457 + }, + { + "epoch": 1.3354603463992707, + "grad_norm": 1.2502707242965698, + "learning_rate": 6.061724930175461e-06, + "loss": 0.3528832197189331, + "step": 458 + }, + { + "epoch": 1.3383773928896991, + "grad_norm": 1.5359619855880737, + "learning_rate": 6.014973868287504e-06, + "loss": 0.4413869082927704, + "step": 459 + }, + { + "epoch": 1.3412944393801276, + "grad_norm": 0.9747081398963928, + "learning_rate": 5.9683261205128395e-06, + "loss": 0.6849499940872192, + "step": 460 + }, + { + "epoch": 1.344211485870556, + "grad_norm": 1.3150533437728882, + "learning_rate": 5.921782896221383e-06, + "loss": 0.3901931047439575, + "step": 461 + }, + { + "epoch": 1.3471285323609845, + "grad_norm": 1.137770652770996, + "learning_rate": 5.875345402073207e-06, + "loss": 0.37498384714126587, + "step": 462 + }, + { + "epoch": 1.350045578851413, + "grad_norm": 1.2216367721557617, + "learning_rate": 5.829014841987277e-06, + "loss": 0.3874579966068268, + "step": 463 + }, + { + "epoch": 1.3529626253418414, + "grad_norm": 1.135439157485962, + "learning_rate": 5.782792417110233e-06, + "loss": 0.384797066450119, + "step": 464 + }, + { + "epoch": 1.3558796718322699, + "grad_norm": 1.2400696277618408, + "learning_rate": 5.736679325785239e-06, + "loss": 0.46303266286849976, + "step": 465 + }, + { + "epoch": 1.3587967183226983, + "grad_norm": 1.8848882913589478, + "learning_rate": 5.6906767635209304e-06, + "loss": 0.5068309903144836, + "step": 466 + }, + { + "epoch": 1.3617137648131268, + "grad_norm": 1.4707008600234985, + "learning_rate": 5.644785922960412e-06, + "loss": 0.364332914352417, + "step": 467 + }, + { + "epoch": 1.364630811303555, + "grad_norm": 2.4436841011047363, + "learning_rate": 5.599007993850329e-06, + "loss": 0.485107421875, + "step": 468 + }, + { + "epoch": 1.3675478577939835, + "grad_norm": 1.1924740076065063, + "learning_rate": 5.553344163010039e-06, + "loss": 0.34547489881515503, + "step": 469 + }, + { + "epoch": 1.370464904284412, + "grad_norm": 1.1255877017974854, + "learning_rate": 5.507795614300846e-06, + "loss": 0.39645254611968994, + "step": 470 + }, + { + "epoch": 1.3733819507748404, + "grad_norm": 1.0937018394470215, + "learning_rate": 5.4623635285952815e-06, + "loss": 0.4267856478691101, + "step": 471 + }, + { + "epoch": 1.3762989972652688, + "grad_norm": 1.3355520963668823, + "learning_rate": 5.417049083746513e-06, + "loss": 0.3669992983341217, + "step": 472 + }, + { + "epoch": 1.3792160437556973, + "grad_norm": 1.7302504777908325, + "learning_rate": 5.3718534545578035e-06, + "loss": 0.3873697519302368, + "step": 473 + }, + { + "epoch": 1.3821330902461257, + "grad_norm": 1.17263662815094, + "learning_rate": 5.326777812752041e-06, + "loss": 0.4581540524959564, + "step": 474 + }, + { + "epoch": 1.3850501367365542, + "grad_norm": 1.0998128652572632, + "learning_rate": 5.281823326941377e-06, + "loss": 0.43062761425971985, + "step": 475 + }, + { + "epoch": 1.3879671832269826, + "grad_norm": 1.1194556951522827, + "learning_rate": 5.236991162596932e-06, + "loss": 0.381741464138031, + "step": 476 + }, + { + "epoch": 1.390884229717411, + "grad_norm": 1.2759051322937012, + "learning_rate": 5.19228248201856e-06, + "loss": 0.49175748229026794, + "step": 477 + }, + { + "epoch": 1.3938012762078396, + "grad_norm": 1.2134747505187988, + "learning_rate": 5.147698444304732e-06, + "loss": 0.4997562766075134, + "step": 478 + }, + { + "epoch": 1.396718322698268, + "grad_norm": 1.0833078622817993, + "learning_rate": 5.1032402053224804e-06, + "loss": 0.42580488324165344, + "step": 479 + }, + { + "epoch": 1.3996353691886965, + "grad_norm": 1.4838510751724243, + "learning_rate": 5.058908917677426e-06, + "loss": 0.5015593767166138, + "step": 480 + }, + { + "epoch": 1.402552415679125, + "grad_norm": 1.218610167503357, + "learning_rate": 5.014705730683904e-06, + "loss": 0.34739193320274353, + "step": 481 + }, + { + "epoch": 1.4054694621695534, + "grad_norm": 1.1883307695388794, + "learning_rate": 4.970631790335181e-06, + "loss": 0.41708022356033325, + "step": 482 + }, + { + "epoch": 1.4083865086599818, + "grad_norm": 1.209291696548462, + "learning_rate": 4.926688239273713e-06, + "loss": 0.43546172976493835, + "step": 483 + }, + { + "epoch": 1.4113035551504103, + "grad_norm": 1.0801606178283691, + "learning_rate": 4.882876216761543e-06, + "loss": 0.44491735100746155, + "step": 484 + }, + { + "epoch": 1.4142206016408387, + "grad_norm": 1.2746628522872925, + "learning_rate": 4.839196858650763e-06, + "loss": 0.436122864484787, + "step": 485 + }, + { + "epoch": 1.4171376481312672, + "grad_norm": 1.4465962648391724, + "learning_rate": 4.795651297354056e-06, + "loss": 0.3750447630882263, + "step": 486 + }, + { + "epoch": 1.4200546946216956, + "grad_norm": 1.6736211776733398, + "learning_rate": 4.752240661815346e-06, + "loss": 0.38286519050598145, + "step": 487 + }, + { + "epoch": 1.422971741112124, + "grad_norm": 1.1946996450424194, + "learning_rate": 4.708966077480544e-06, + "loss": 0.4488063156604767, + "step": 488 + }, + { + "epoch": 1.4258887876025526, + "grad_norm": 1.42599356174469, + "learning_rate": 4.665828666268335e-06, + "loss": 0.44088613986968994, + "step": 489 + }, + { + "epoch": 1.4288058340929808, + "grad_norm": 1.2281016111373901, + "learning_rate": 4.622829546541121e-06, + "loss": 0.4030645489692688, + "step": 490 + }, + { + "epoch": 1.4317228805834092, + "grad_norm": 1.2875670194625854, + "learning_rate": 4.57996983307602e-06, + "loss": 0.44702020287513733, + "step": 491 + }, + { + "epoch": 1.4346399270738377, + "grad_norm": 1.2456860542297363, + "learning_rate": 4.537250637035947e-06, + "loss": 0.4067370593547821, + "step": 492 + }, + { + "epoch": 1.4375569735642661, + "grad_norm": 1.2822725772857666, + "learning_rate": 4.494673065940833e-06, + "loss": 0.4237740635871887, + "step": 493 + }, + { + "epoch": 1.4404740200546946, + "grad_norm": 1.5517818927764893, + "learning_rate": 4.452238223638906e-06, + "loss": 0.40579724311828613, + "step": 494 + }, + { + "epoch": 1.443391066545123, + "grad_norm": 1.275344967842102, + "learning_rate": 4.409947210278056e-06, + "loss": 0.38880717754364014, + "step": 495 + }, + { + "epoch": 1.4463081130355515, + "grad_norm": 1.22952139377594, + "learning_rate": 4.367801122277327e-06, + "loss": 0.4042310416698456, + "step": 496 + }, + { + "epoch": 1.44922515952598, + "grad_norm": 1.122261643409729, + "learning_rate": 4.325801052298493e-06, + "loss": 0.5408368110656738, + "step": 497 + }, + { + "epoch": 1.4521422060164084, + "grad_norm": 1.5885361433029175, + "learning_rate": 4.283948089217715e-06, + "loss": 0.37697717547416687, + "step": 498 + }, + { + "epoch": 1.4550592525068369, + "grad_norm": 2.3565149307250977, + "learning_rate": 4.242243318097338e-06, + "loss": 0.3811529576778412, + "step": 499 + }, + { + "epoch": 1.4579762989972653, + "grad_norm": 1.1944137811660767, + "learning_rate": 4.200687820157735e-06, + "loss": 0.414781391620636, + "step": 500 + }, + { + "epoch": 1.4579762989972653, + "eval_loss": 0.40706494450569153, + "eval_runtime": 1189.1593, + "eval_samples_per_second": 0.531, + "eval_steps_per_second": 0.531, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 686, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.6050925490932285e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_devstral_24B/checkpoints/checkpoint-500/training_args.bin b/cpt_devstral_24B/checkpoints/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..48a487f18680e3e5b768fe7ec9ec04e8778fc21e --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f62526ec2433add7ac031c48b1f6ff360f1ade77275765112cbf7cf361d64ca5 +size 5201 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/README.md b/cpt_devstral_24B/checkpoints/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f04c3de935db4cae3da32ab6d1fcbbea11b4e09 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Devstral-Small-2-24B-Instruct-2512 +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Devstral-Small-2-24B-Instruct-2512 +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/adapter_config.json b/cpt_devstral_24B/checkpoints/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a10b9f1b7bb62dced9a7c13375c7ebbeb347c15b --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Devstral-Small-2-24B-Instruct-2512", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/adapter_model.safetensors b/cpt_devstral_24B/checkpoints/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..55a3c8fad99c3849ac93d36f5e50dbb9ac430b18 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6528dd74de4fce9bff6c944acd9bc01868d155b1ea5403fe93fb8c5ced4d4ec +size 364983848 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/chat_template.jinja b/cpt_devstral_24B/checkpoints/checkpoint-600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01c8776b5b3496af72e92a53a3bf92e113f66f2c --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/chat_template.jinja @@ -0,0 +1,121 @@ +{#- Default system message if no system prompt is passed. #} +{%- set default_system_message = '' %} + +{#- Begin of sequence token. #} +{{- bos_token }} + +{#- Handle system prompt if it exists. #} +{#- System prompt supports text content or text chunks. #} +{%- if messages[0]['role'] == 'system' %} + {{- '[SYSTEM_PROMPT]' -}} + {%- if messages[0]['content'] is string %} + {{- messages[0]['content'] -}} + {%- else %} + {%- for block in messages[0]['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in system message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '[/SYSTEM_PROMPT]' -}} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} + {%- if default_system_message != '' %} + {{- '[SYSTEM_PROMPT]' + default_system_message + '[/SYSTEM_PROMPT]' }} + {%- endif %} +{%- endif %} + + +{#- Tools definition #} +{%- set tools_definition = '' %} +{%- set has_tools = false %} +{%- if tools is defined and tools is not none and tools|length > 0 %} + {%- set has_tools = true %} + {%- set tools_definition = '[AVAILABLE_TOOLS]' + (tools| tojson) + '[/AVAILABLE_TOOLS]' %} + {{- tools_definition }} +{%- endif %} + +{#- Checks for alternating user/assistant messages. #} +{%- set ns = namespace(index=0) %} +{%- for message in loop_messages %} + {%- if message.role == 'user' or (message.role == 'assistant' and (message.tool_calls is not defined or message.tool_calls is none or message.tool_calls | length == 0)) %} + {%- if (message['role'] == 'user') != (ns.index % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user and assistant roles except for tool calls and results.') }} + {%- endif %} + {%- set ns.index = ns.index + 1 %} + {%- endif %} +{%- endfor %} + +{#- Handle conversation messages. #} +{%- for message in loop_messages %} + + {#- User messages supports text content or text and image chunks. #} + {%- if message['role'] == 'user' %} + {%- if message['content'] is string %} + {{- '[INST]' + message['content'] + '[/INST]' }} + {%- elif message['content'] | length > 0 %} + {{- '[INST]' }} + {%- if message['content'] | length == 2 %} + {%- set blocks = message['content'] | sort(attribute='type') %} + {%- else %} + {%- set blocks = message['content'] %} + {%- endif %} + {%- for block in blocks %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text, image and image_url chunks are supported in user message content.') }} + {%- endif %} + {%- endfor %} + {{- '[/INST]' }} + {%- else %} + {{- raise_exception('User message must have a string or a list of chunks in content') }} + {%- endif %} + + {#- Assistant messages supports text content or text and image chunks. #} + {%- elif message['role'] == 'assistant' %} + {%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %} + {{- raise_exception('Assistant message must have a string or a list of chunks in content or a list of tool calls.') }} + {%- endif %} + + {%- if message['content'] is string %} + {{- message['content'] }} + {%- elif message['content'] | length > 0 %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in assistant message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + + {%- if message['tool_calls'] is defined and message['tool_calls'] is not none and message['tool_calls']|length > 0 %} + {%- for tool in message['tool_calls'] %} + {%- set arguments = tool['function']['arguments'] %} + {%- if arguments is not string %} + {%- set arguments = arguments|tojson|safe %} + {%- elif arguments == '' %} + {%- set arguments = '{}' %} + {%- endif %} + {{- '[TOOL_CALLS]' + tool['function']['name'] + '[ARGS]' + arguments }} + {%- endfor %} + {%- endif %} + + {#- End of sequence token for each assistant messages. #} + {{- eos_token }} + + {#- Tool messages only supports text content. #} + {%- elif message['role'] == 'tool' %} + {{- '[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]' }} + + {#- Raise exception for unsupported roles. #} + {%- else %} + {{- raise_exception('Only user, assistant and tool roles are supported, got ' + message['role'] + '.') }} + {%- endif %} +{%- endfor %} \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/optimizer.pt b/cpt_devstral_24B/checkpoints/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..667fefc61f1423b9a48049aa48e4ea27217a2b7e --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed9b2e44d0ab7459e766b2b426fe5e300025849ada2eb46e1e2d89ca430a99f5 +size 160131559 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/rng_state.pth b/cpt_devstral_24B/checkpoints/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f8cc479fd296e533fb5e69d4a5e30aeba522672e --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225b67663a6b759f77f860fb03e0bd5eaf5759053344c810157aab3c54e1e986 +size 14645 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/scheduler.pt b/cpt_devstral_24B/checkpoints/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..020c77ef9859f55fb5195feae1ce299f68dc1679 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3d2efb23ab02a563acff2988a53e730d4e5d08f3c1c39f1bd998cc5047ea45 +size 1465 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/tokenizer.json b/cpt_devstral_24B/checkpoints/checkpoint-600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5b51e255641d3ab81f891f54bd61370fcedf6622 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:286acad9b0e27fce778ac429763536accf618ccb6ed72963b6f94685e531c5c7 +size 17077402 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/tokenizer_config.json b/cpt_devstral_24B/checkpoints/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1bb6b32cec8ab9654d2c84faeb9a332373476017 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/tokenizer_config.json @@ -0,0 +1,1013 @@ +{ + "backend": "tokenizers", + "bos_token": "", + "eos_token": "", + "extra_special_tokens": [ + "", + "", + "", + "[INST]", + "[/INST]", + "[AVAILABLE_TOOLS]", + "[/AVAILABLE_TOOLS]", + "[TOOL_RESULTS]", + "[/TOOL_RESULTS]", + "[TOOL_CALLS]", + "[IMG]", + "", + "[IMG_BREAK]", + "[IMG_END]", + "[PREFIX]", + "[MIDDLE]", + "[SUFFIX]", + "[SYSTEM_PROMPT]", + "[/SYSTEM_PROMPT]", + "[TOOL_CONTENT]", + "", + "", + "", + "", + "[AUDIO]", + "[BEGIN_AUDIO]", + "", + "", + "", + "", + "", + "", + "[ARGS]", + "[CALL_ID]", + "[THINK]", + "[/THINK]", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "is_local": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "PixtralProcessor", + "tokenizer_class": "TokenizersBackend", + "unk_token": "" +} diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/trainer_state.json b/cpt_devstral_24B/checkpoints/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f432a86deecba4924db2694b2f74be265c6884d8 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/trainer_state.json @@ -0,0 +1,4330 @@ +{ + "best_global_step": 600, + "best_metric": 0.3965963125228882, + "best_model_checkpoint": "runs/cpt_run_v1/checkpoints/checkpoint-600", + "epoch": 1.7496809480401094, + "eval_steps": 50, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0029170464904284413, + "grad_norm": 1.1577509641647339, + "learning_rate": 0.0, + "loss": 0.9893555045127869, + "step": 1 + }, + { + "epoch": 0.005834092980856883, + "grad_norm": 0.9491796493530273, + "learning_rate": 2.8985507246376816e-07, + "loss": 0.8791205883026123, + "step": 2 + }, + { + "epoch": 0.008751139471285323, + "grad_norm": 1.1600768566131592, + "learning_rate": 5.797101449275363e-07, + "loss": 0.9858248233795166, + "step": 3 + }, + { + "epoch": 0.011668185961713765, + "grad_norm": 1.2298306226730347, + "learning_rate": 8.695652173913044e-07, + "loss": 1.0516364574432373, + "step": 4 + }, + { + "epoch": 0.014585232452142206, + "grad_norm": 0.9520533680915833, + "learning_rate": 1.1594202898550726e-06, + "loss": 0.8392249345779419, + "step": 5 + }, + { + "epoch": 0.017502278942570646, + "grad_norm": 1.2451188564300537, + "learning_rate": 1.4492753623188408e-06, + "loss": 1.0955077409744263, + "step": 6 + }, + { + "epoch": 0.02041932543299909, + "grad_norm": 1.1123991012573242, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.9201866388320923, + "step": 7 + }, + { + "epoch": 0.02333637192342753, + "grad_norm": 0.9283139705657959, + "learning_rate": 2.028985507246377e-06, + "loss": 0.9770950078964233, + "step": 8 + }, + { + "epoch": 0.02625341841385597, + "grad_norm": 0.9589216113090515, + "learning_rate": 2.3188405797101453e-06, + "loss": 0.9442565441131592, + "step": 9 + }, + { + "epoch": 0.02917046490428441, + "grad_norm": 0.8866703510284424, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.9354464411735535, + "step": 10 + }, + { + "epoch": 0.03208751139471285, + "grad_norm": 0.7191241383552551, + "learning_rate": 2.8985507246376816e-06, + "loss": 0.7659736275672913, + "step": 11 + }, + { + "epoch": 0.03500455788514129, + "grad_norm": 0.9110142588615417, + "learning_rate": 3.188405797101449e-06, + "loss": 0.9319326877593994, + "step": 12 + }, + { + "epoch": 0.03792160437556973, + "grad_norm": 0.8754057288169861, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.9819356203079224, + "step": 13 + }, + { + "epoch": 0.04083865086599818, + "grad_norm": 0.896181046962738, + "learning_rate": 3.768115942028986e-06, + "loss": 1.026316523551941, + "step": 14 + }, + { + "epoch": 0.04375569735642662, + "grad_norm": 0.6104832887649536, + "learning_rate": 4.057971014492754e-06, + "loss": 0.8427562713623047, + "step": 15 + }, + { + "epoch": 0.04667274384685506, + "grad_norm": 0.6529208421707153, + "learning_rate": 4.347826086956522e-06, + "loss": 0.8496565222740173, + "step": 16 + }, + { + "epoch": 0.0495897903372835, + "grad_norm": 0.6319335699081421, + "learning_rate": 4.637681159420291e-06, + "loss": 0.9139047861099243, + "step": 17 + }, + { + "epoch": 0.05250683682771194, + "grad_norm": 0.7458649277687073, + "learning_rate": 4.927536231884059e-06, + "loss": 0.8867442011833191, + "step": 18 + }, + { + "epoch": 0.05542388331814038, + "grad_norm": 0.6179773211479187, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.9579408168792725, + "step": 19 + }, + { + "epoch": 0.05834092980856882, + "grad_norm": 0.794481635093689, + "learning_rate": 5.507246376811595e-06, + "loss": 0.8736554980278015, + "step": 20 + }, + { + "epoch": 0.06125797629899726, + "grad_norm": 0.8356145620346069, + "learning_rate": 5.797101449275363e-06, + "loss": 0.9358762502670288, + "step": 21 + }, + { + "epoch": 0.0641750227894257, + "grad_norm": 0.5891932845115662, + "learning_rate": 6.086956521739132e-06, + "loss": 0.8972038626670837, + "step": 22 + }, + { + "epoch": 0.06709206927985414, + "grad_norm": 0.6931268572807312, + "learning_rate": 6.376811594202898e-06, + "loss": 0.9583507776260376, + "step": 23 + }, + { + "epoch": 0.07000911577028258, + "grad_norm": 0.7298229336738586, + "learning_rate": 6.666666666666667e-06, + "loss": 0.8119489550590515, + "step": 24 + }, + { + "epoch": 0.07292616226071102, + "grad_norm": 0.6419956684112549, + "learning_rate": 6.956521739130435e-06, + "loss": 0.9386100769042969, + "step": 25 + }, + { + "epoch": 0.07584320875113947, + "grad_norm": 0.7508338689804077, + "learning_rate": 7.246376811594203e-06, + "loss": 0.9272583723068237, + "step": 26 + }, + { + "epoch": 0.0787602552415679, + "grad_norm": 0.5848079919815063, + "learning_rate": 7.536231884057972e-06, + "loss": 0.8967856168746948, + "step": 27 + }, + { + "epoch": 0.08167730173199636, + "grad_norm": 0.7384837865829468, + "learning_rate": 7.82608695652174e-06, + "loss": 0.8696568012237549, + "step": 28 + }, + { + "epoch": 0.0845943482224248, + "grad_norm": 0.5069604516029358, + "learning_rate": 8.115942028985508e-06, + "loss": 0.9121193885803223, + "step": 29 + }, + { + "epoch": 0.08751139471285324, + "grad_norm": 0.833165168762207, + "learning_rate": 8.405797101449275e-06, + "loss": 0.8180589079856873, + "step": 30 + }, + { + "epoch": 0.09042844120328168, + "grad_norm": 0.6355920433998108, + "learning_rate": 8.695652173913044e-06, + "loss": 0.8640957474708557, + "step": 31 + }, + { + "epoch": 0.09334548769371012, + "grad_norm": 1.0429315567016602, + "learning_rate": 8.985507246376812e-06, + "loss": 0.9517915844917297, + "step": 32 + }, + { + "epoch": 0.09626253418413856, + "grad_norm": 0.5875154733657837, + "learning_rate": 9.275362318840581e-06, + "loss": 0.9443603754043579, + "step": 33 + }, + { + "epoch": 0.099179580674567, + "grad_norm": 1.9913769960403442, + "learning_rate": 9.565217391304349e-06, + "loss": 0.9510866403579712, + "step": 34 + }, + { + "epoch": 0.10209662716499544, + "grad_norm": 0.5310097932815552, + "learning_rate": 9.855072463768118e-06, + "loss": 0.8653419613838196, + "step": 35 + }, + { + "epoch": 0.10501367365542388, + "grad_norm": 0.624421238899231, + "learning_rate": 1.0144927536231885e-05, + "loss": 0.7941208481788635, + "step": 36 + }, + { + "epoch": 0.10793072014585232, + "grad_norm": 0.6314200758934021, + "learning_rate": 1.0434782608695653e-05, + "loss": 0.8931174278259277, + "step": 37 + }, + { + "epoch": 0.11084776663628076, + "grad_norm": 0.6272342205047607, + "learning_rate": 1.0724637681159422e-05, + "loss": 0.8978185057640076, + "step": 38 + }, + { + "epoch": 0.1137648131267092, + "grad_norm": 0.5711184740066528, + "learning_rate": 1.101449275362319e-05, + "loss": 0.808263897895813, + "step": 39 + }, + { + "epoch": 0.11668185961713765, + "grad_norm": 0.7581208944320679, + "learning_rate": 1.1304347826086957e-05, + "loss": 0.7456756830215454, + "step": 40 + }, + { + "epoch": 0.11959890610756609, + "grad_norm": 0.4989977180957794, + "learning_rate": 1.1594202898550726e-05, + "loss": 0.8273333311080933, + "step": 41 + }, + { + "epoch": 0.12251595259799453, + "grad_norm": 0.8602972626686096, + "learning_rate": 1.1884057971014494e-05, + "loss": 0.8514784574508667, + "step": 42 + }, + { + "epoch": 0.12543299908842298, + "grad_norm": 0.6918581128120422, + "learning_rate": 1.2173913043478263e-05, + "loss": 0.8182265162467957, + "step": 43 + }, + { + "epoch": 0.1283500455788514, + "grad_norm": 0.653099536895752, + "learning_rate": 1.2463768115942029e-05, + "loss": 0.8242791891098022, + "step": 44 + }, + { + "epoch": 0.13126709206927986, + "grad_norm": 0.7485584616661072, + "learning_rate": 1.2753623188405797e-05, + "loss": 0.8229591250419617, + "step": 45 + }, + { + "epoch": 0.1341841385597083, + "grad_norm": 0.6724833250045776, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.8146833181381226, + "step": 46 + }, + { + "epoch": 0.13710118505013674, + "grad_norm": 0.857208251953125, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.8154427409172058, + "step": 47 + }, + { + "epoch": 0.14001823154056517, + "grad_norm": 0.5559669137001038, + "learning_rate": 1.3623188405797103e-05, + "loss": 0.879005491733551, + "step": 48 + }, + { + "epoch": 0.14293527803099362, + "grad_norm": 0.5910897850990295, + "learning_rate": 1.391304347826087e-05, + "loss": 0.8148283362388611, + "step": 49 + }, + { + "epoch": 0.14585232452142205, + "grad_norm": 0.6478891372680664, + "learning_rate": 1.420289855072464e-05, + "loss": 0.8293006420135498, + "step": 50 + }, + { + "epoch": 0.14585232452142205, + "eval_loss": 0.7892261147499084, + "eval_runtime": 973.2157, + "eval_samples_per_second": 0.649, + "eval_steps_per_second": 0.649, + "step": 50 + }, + { + "epoch": 0.1487693710118505, + "grad_norm": 0.757882833480835, + "learning_rate": 1.4492753623188407e-05, + "loss": 0.8114852905273438, + "step": 51 + }, + { + "epoch": 0.15168641750227893, + "grad_norm": 0.8496116995811462, + "learning_rate": 1.4782608695652174e-05, + "loss": 0.7886185050010681, + "step": 52 + }, + { + "epoch": 0.15460346399270739, + "grad_norm": 0.6078857183456421, + "learning_rate": 1.5072463768115944e-05, + "loss": 0.7298170924186707, + "step": 53 + }, + { + "epoch": 0.1575205104831358, + "grad_norm": 0.5856835246086121, + "learning_rate": 1.536231884057971e-05, + "loss": 0.7407160997390747, + "step": 54 + }, + { + "epoch": 0.16043755697356427, + "grad_norm": 1.0533701181411743, + "learning_rate": 1.565217391304348e-05, + "loss": 0.7057831287384033, + "step": 55 + }, + { + "epoch": 0.16335460346399272, + "grad_norm": 0.8087610006332397, + "learning_rate": 1.5942028985507246e-05, + "loss": 0.7409019470214844, + "step": 56 + }, + { + "epoch": 0.16627164995442115, + "grad_norm": 0.629945695400238, + "learning_rate": 1.6231884057971015e-05, + "loss": 0.7768293023109436, + "step": 57 + }, + { + "epoch": 0.1691886964448496, + "grad_norm": 0.5187911987304688, + "learning_rate": 1.6521739130434785e-05, + "loss": 0.825718104839325, + "step": 58 + }, + { + "epoch": 0.17210574293527803, + "grad_norm": 0.5866358280181885, + "learning_rate": 1.681159420289855e-05, + "loss": 0.8575979471206665, + "step": 59 + }, + { + "epoch": 0.17502278942570648, + "grad_norm": 1.5098934173583984, + "learning_rate": 1.710144927536232e-05, + "loss": 0.8058848977088928, + "step": 60 + }, + { + "epoch": 0.1779398359161349, + "grad_norm": 0.6981958150863647, + "learning_rate": 1.739130434782609e-05, + "loss": 0.7640778422355652, + "step": 61 + }, + { + "epoch": 0.18085688240656336, + "grad_norm": 0.631349503993988, + "learning_rate": 1.7681159420289858e-05, + "loss": 0.7896331548690796, + "step": 62 + }, + { + "epoch": 0.1837739288969918, + "grad_norm": 0.6930747032165527, + "learning_rate": 1.7971014492753624e-05, + "loss": 0.6762524247169495, + "step": 63 + }, + { + "epoch": 0.18669097538742024, + "grad_norm": 0.599399209022522, + "learning_rate": 1.8260869565217393e-05, + "loss": 0.7285035848617554, + "step": 64 + }, + { + "epoch": 0.18960802187784867, + "grad_norm": 0.6194344758987427, + "learning_rate": 1.8550724637681162e-05, + "loss": 0.7682523131370544, + "step": 65 + }, + { + "epoch": 0.19252506836827712, + "grad_norm": 0.5691342949867249, + "learning_rate": 1.8840579710144928e-05, + "loss": 0.6791993379592896, + "step": 66 + }, + { + "epoch": 0.19544211485870555, + "grad_norm": 0.6257390379905701, + "learning_rate": 1.9130434782608697e-05, + "loss": 0.6744828224182129, + "step": 67 + }, + { + "epoch": 0.198359161349134, + "grad_norm": 0.5871018767356873, + "learning_rate": 1.9420289855072467e-05, + "loss": 0.7317330837249756, + "step": 68 + }, + { + "epoch": 0.20127620783956243, + "grad_norm": 1.0744612216949463, + "learning_rate": 1.9710144927536236e-05, + "loss": 0.6617178916931152, + "step": 69 + }, + { + "epoch": 0.2041932543299909, + "grad_norm": 0.675946831703186, + "learning_rate": 2e-05, + "loss": 0.7615712881088257, + "step": 70 + }, + { + "epoch": 0.2071103008204193, + "grad_norm": 0.7663411498069763, + "learning_rate": 1.9999870372100614e-05, + "loss": 0.7131291627883911, + "step": 71 + }, + { + "epoch": 0.21002734731084777, + "grad_norm": 0.6725395321846008, + "learning_rate": 1.9999481491763123e-05, + "loss": 0.7452989816665649, + "step": 72 + }, + { + "epoch": 0.21294439380127622, + "grad_norm": 0.6505664587020874, + "learning_rate": 1.9998833369069483e-05, + "loss": 0.7477136850357056, + "step": 73 + }, + { + "epoch": 0.21586144029170465, + "grad_norm": 0.7032860517501831, + "learning_rate": 1.9997926020822643e-05, + "loss": 0.6854275465011597, + "step": 74 + }, + { + "epoch": 0.2187784867821331, + "grad_norm": 0.645345151424408, + "learning_rate": 1.999675947054614e-05, + "loss": 0.7552425265312195, + "step": 75 + }, + { + "epoch": 0.22169553327256153, + "grad_norm": 0.6620492935180664, + "learning_rate": 1.9995333748483464e-05, + "loss": 0.7262853384017944, + "step": 76 + }, + { + "epoch": 0.22461257976298998, + "grad_norm": 0.6511455774307251, + "learning_rate": 1.9993648891597284e-05, + "loss": 0.7591732144355774, + "step": 77 + }, + { + "epoch": 0.2275296262534184, + "grad_norm": 0.6775254011154175, + "learning_rate": 1.9991704943568497e-05, + "loss": 0.7498704195022583, + "step": 78 + }, + { + "epoch": 0.23044667274384686, + "grad_norm": 0.8199896216392517, + "learning_rate": 1.9989501954795076e-05, + "loss": 0.7238684296607971, + "step": 79 + }, + { + "epoch": 0.2333637192342753, + "grad_norm": 0.8197569847106934, + "learning_rate": 1.998703998239079e-05, + "loss": 0.7028778195381165, + "step": 80 + }, + { + "epoch": 0.23628076572470375, + "grad_norm": 0.6602625250816345, + "learning_rate": 1.9984319090183692e-05, + "loss": 0.8842703104019165, + "step": 81 + }, + { + "epoch": 0.23919781221513217, + "grad_norm": 0.9587129354476929, + "learning_rate": 1.99813393487145e-05, + "loss": 0.732614278793335, + "step": 82 + }, + { + "epoch": 0.24211485870556063, + "grad_norm": 0.6822189092636108, + "learning_rate": 1.997810083523473e-05, + "loss": 0.7544928193092346, + "step": 83 + }, + { + "epoch": 0.24503190519598905, + "grad_norm": 0.8980082869529724, + "learning_rate": 1.9974603633704726e-05, + "loss": 0.6704054474830627, + "step": 84 + }, + { + "epoch": 0.2479489516864175, + "grad_norm": 0.7413425445556641, + "learning_rate": 1.9970847834791472e-05, + "loss": 0.693661093711853, + "step": 85 + }, + { + "epoch": 0.25086599817684596, + "grad_norm": 0.8314999341964722, + "learning_rate": 1.9966833535866223e-05, + "loss": 0.667654275894165, + "step": 86 + }, + { + "epoch": 0.25378304466727436, + "grad_norm": 0.7972444891929626, + "learning_rate": 1.9962560841002013e-05, + "loss": 0.8403134942054749, + "step": 87 + }, + { + "epoch": 0.2567000911577028, + "grad_norm": 0.8519951701164246, + "learning_rate": 1.995802986097093e-05, + "loss": 0.6897370219230652, + "step": 88 + }, + { + "epoch": 0.25961713764813127, + "grad_norm": 0.8268933892250061, + "learning_rate": 1.995324071324126e-05, + "loss": 0.6690632700920105, + "step": 89 + }, + { + "epoch": 0.2625341841385597, + "grad_norm": 0.7133983969688416, + "learning_rate": 1.9948193521974436e-05, + "loss": 0.6314147114753723, + "step": 90 + }, + { + "epoch": 0.2654512306289881, + "grad_norm": 0.889302134513855, + "learning_rate": 1.9942888418021814e-05, + "loss": 0.7389825582504272, + "step": 91 + }, + { + "epoch": 0.2683682771194166, + "grad_norm": 0.7022432088851929, + "learning_rate": 1.99373255389213e-05, + "loss": 0.6916261911392212, + "step": 92 + }, + { + "epoch": 0.27128532360984503, + "grad_norm": 0.696432888507843, + "learning_rate": 1.9931505028893748e-05, + "loss": 0.6908476948738098, + "step": 93 + }, + { + "epoch": 0.2742023701002735, + "grad_norm": 0.7667419910430908, + "learning_rate": 1.9925427038839267e-05, + "loss": 0.6500837206840515, + "step": 94 + }, + { + "epoch": 0.27711941659070194, + "grad_norm": 0.6974894404411316, + "learning_rate": 1.9919091726333265e-05, + "loss": 0.7059191465377808, + "step": 95 + }, + { + "epoch": 0.28003646308113034, + "grad_norm": 0.7047077417373657, + "learning_rate": 1.9912499255622397e-05, + "loss": 0.6287837624549866, + "step": 96 + }, + { + "epoch": 0.2829535095715588, + "grad_norm": 0.7729557156562805, + "learning_rate": 1.990564979762029e-05, + "loss": 0.6738612055778503, + "step": 97 + }, + { + "epoch": 0.28587055606198725, + "grad_norm": 0.7020529508590698, + "learning_rate": 1.989854352990311e-05, + "loss": 0.662042498588562, + "step": 98 + }, + { + "epoch": 0.2887876025524157, + "grad_norm": 0.7369800209999084, + "learning_rate": 1.9891180636704975e-05, + "loss": 0.6246830821037292, + "step": 99 + }, + { + "epoch": 0.2917046490428441, + "grad_norm": 0.7412623167037964, + "learning_rate": 1.9883561308913154e-05, + "loss": 0.6623879075050354, + "step": 100 + }, + { + "epoch": 0.2917046490428441, + "eval_loss": 0.6552971005439758, + "eval_runtime": 966.7072, + "eval_samples_per_second": 0.654, + "eval_steps_per_second": 0.654, + "step": 100 + }, + { + "epoch": 0.29462169553327255, + "grad_norm": 0.8428792953491211, + "learning_rate": 1.987568574406314e-05, + "loss": 0.6312171816825867, + "step": 101 + }, + { + "epoch": 0.297538742023701, + "grad_norm": 0.6948133707046509, + "learning_rate": 1.9867554146333517e-05, + "loss": 0.6266146898269653, + "step": 102 + }, + { + "epoch": 0.30045578851412946, + "grad_norm": 1.3897597789764404, + "learning_rate": 1.985916672654068e-05, + "loss": 0.6669265031814575, + "step": 103 + }, + { + "epoch": 0.30337283500455786, + "grad_norm": 0.8838400840759277, + "learning_rate": 1.985052370213334e-05, + "loss": 0.6601086854934692, + "step": 104 + }, + { + "epoch": 0.3062898814949863, + "grad_norm": 0.8471395373344421, + "learning_rate": 1.9841625297186925e-05, + "loss": 0.5984431505203247, + "step": 105 + }, + { + "epoch": 0.30920692798541477, + "grad_norm": 0.8940042853355408, + "learning_rate": 1.983247174239774e-05, + "loss": 0.7223822474479675, + "step": 106 + }, + { + "epoch": 0.3121239744758432, + "grad_norm": 0.7833696603775024, + "learning_rate": 1.9823063275076998e-05, + "loss": 0.6868705749511719, + "step": 107 + }, + { + "epoch": 0.3150410209662716, + "grad_norm": 0.8794649243354797, + "learning_rate": 1.9813400139144673e-05, + "loss": 0.6246675848960876, + "step": 108 + }, + { + "epoch": 0.3179580674567001, + "grad_norm": 0.8126057982444763, + "learning_rate": 1.9803482585123165e-05, + "loss": 0.5908697247505188, + "step": 109 + }, + { + "epoch": 0.32087511394712853, + "grad_norm": 0.7947676777839661, + "learning_rate": 1.979331087013082e-05, + "loss": 0.5751246809959412, + "step": 110 + }, + { + "epoch": 0.323792160437557, + "grad_norm": 0.713545560836792, + "learning_rate": 1.978288525787524e-05, + "loss": 0.6081106066703796, + "step": 111 + }, + { + "epoch": 0.32670920692798544, + "grad_norm": 1.011828064918518, + "learning_rate": 1.977220601864647e-05, + "loss": 0.7039169669151306, + "step": 112 + }, + { + "epoch": 0.32962625341841384, + "grad_norm": 0.730570912361145, + "learning_rate": 1.9761273429309982e-05, + "loss": 0.6140255928039551, + "step": 113 + }, + { + "epoch": 0.3325432999088423, + "grad_norm": 1.059688687324524, + "learning_rate": 1.9750087773299492e-05, + "loss": 0.648114025592804, + "step": 114 + }, + { + "epoch": 0.33546034639927075, + "grad_norm": 0.9336895942687988, + "learning_rate": 1.973864934060962e-05, + "loss": 0.622555673122406, + "step": 115 + }, + { + "epoch": 0.3383773928896992, + "grad_norm": 0.7195945978164673, + "learning_rate": 1.9726958427788367e-05, + "loss": 0.70485520362854, + "step": 116 + }, + { + "epoch": 0.3412944393801276, + "grad_norm": 0.8101872801780701, + "learning_rate": 1.971501533792942e-05, + "loss": 0.6958848834037781, + "step": 117 + }, + { + "epoch": 0.34421148587055606, + "grad_norm": 1.6075212955474854, + "learning_rate": 1.970282038066432e-05, + "loss": 0.6021550893783569, + "step": 118 + }, + { + "epoch": 0.3471285323609845, + "grad_norm": 0.7881433963775635, + "learning_rate": 1.9690373872154396e-05, + "loss": 0.6449777483940125, + "step": 119 + }, + { + "epoch": 0.35004557885141296, + "grad_norm": 1.014639973640442, + "learning_rate": 1.9677676135082606e-05, + "loss": 0.5939379930496216, + "step": 120 + }, + { + "epoch": 0.35296262534184136, + "grad_norm": 0.8198449611663818, + "learning_rate": 1.9664727498645144e-05, + "loss": 0.6210286617279053, + "step": 121 + }, + { + "epoch": 0.3558796718322698, + "grad_norm": 1.0194576978683472, + "learning_rate": 1.9651528298542918e-05, + "loss": 0.624247670173645, + "step": 122 + }, + { + "epoch": 0.35879671832269827, + "grad_norm": 0.7963470220565796, + "learning_rate": 1.9638078876972842e-05, + "loss": 0.6479315757751465, + "step": 123 + }, + { + "epoch": 0.3617137648131267, + "grad_norm": 0.9007541537284851, + "learning_rate": 1.9624379582618976e-05, + "loss": 0.6131505370140076, + "step": 124 + }, + { + "epoch": 0.3646308113035551, + "grad_norm": 0.8712120056152344, + "learning_rate": 1.9610430770643464e-05, + "loss": 0.6249448657035828, + "step": 125 + }, + { + "epoch": 0.3675478577939836, + "grad_norm": 1.1482540369033813, + "learning_rate": 1.9596232802677347e-05, + "loss": 0.5844688415527344, + "step": 126 + }, + { + "epoch": 0.37046490428441203, + "grad_norm": 0.8662379384040833, + "learning_rate": 1.9581786046811175e-05, + "loss": 0.6573485732078552, + "step": 127 + }, + { + "epoch": 0.3733819507748405, + "grad_norm": 0.8191388845443726, + "learning_rate": 1.9567090877585477e-05, + "loss": 0.5896862745285034, + "step": 128 + }, + { + "epoch": 0.37629899726526894, + "grad_norm": 1.0187078714370728, + "learning_rate": 1.955214767598103e-05, + "loss": 0.613490879535675, + "step": 129 + }, + { + "epoch": 0.37921604375569734, + "grad_norm": 0.8444119691848755, + "learning_rate": 1.953695682940901e-05, + "loss": 0.727687656879425, + "step": 130 + }, + { + "epoch": 0.3821330902461258, + "grad_norm": 0.74753737449646, + "learning_rate": 1.9521518731700913e-05, + "loss": 0.6102436780929565, + "step": 131 + }, + { + "epoch": 0.38505013673655425, + "grad_norm": 1.0166202783584595, + "learning_rate": 1.9505833783098378e-05, + "loss": 0.6244844198226929, + "step": 132 + }, + { + "epoch": 0.3879671832269827, + "grad_norm": 0.8175772428512573, + "learning_rate": 1.9489902390242793e-05, + "loss": 0.5939282178878784, + "step": 133 + }, + { + "epoch": 0.3908842297174111, + "grad_norm": 1.0177713632583618, + "learning_rate": 1.947372496616476e-05, + "loss": 0.6418229937553406, + "step": 134 + }, + { + "epoch": 0.39380127620783956, + "grad_norm": 0.8652453422546387, + "learning_rate": 1.9457301930273376e-05, + "loss": 0.5870395302772522, + "step": 135 + }, + { + "epoch": 0.396718322698268, + "grad_norm": 0.8378894925117493, + "learning_rate": 1.9440633708345365e-05, + "loss": 0.6480278372764587, + "step": 136 + }, + { + "epoch": 0.39963536918869647, + "grad_norm": 0.8303541541099548, + "learning_rate": 1.9423720732514052e-05, + "loss": 0.6191359758377075, + "step": 137 + }, + { + "epoch": 0.40255241567912486, + "grad_norm": 0.8576734662055969, + "learning_rate": 1.9406563441258145e-05, + "loss": 0.5696198344230652, + "step": 138 + }, + { + "epoch": 0.4054694621695533, + "grad_norm": 0.9558727145195007, + "learning_rate": 1.9389162279390362e-05, + "loss": 0.6177623271942139, + "step": 139 + }, + { + "epoch": 0.4083865086599818, + "grad_norm": 0.7046042084693909, + "learning_rate": 1.9371517698045922e-05, + "loss": 0.5836521983146667, + "step": 140 + }, + { + "epoch": 0.4113035551504102, + "grad_norm": 1.0522717237472534, + "learning_rate": 1.935363015467082e-05, + "loss": 0.5728275775909424, + "step": 141 + }, + { + "epoch": 0.4142206016408386, + "grad_norm": 0.9554787874221802, + "learning_rate": 1.933550011301e-05, + "loss": 0.632586658000946, + "step": 142 + }, + { + "epoch": 0.4171376481312671, + "grad_norm": 0.8874214291572571, + "learning_rate": 1.9317128043095293e-05, + "loss": 0.5850118398666382, + "step": 143 + }, + { + "epoch": 0.42005469462169553, + "grad_norm": 1.0708963871002197, + "learning_rate": 1.9298514421233276e-05, + "loss": 0.6260685324668884, + "step": 144 + }, + { + "epoch": 0.422971741112124, + "grad_norm": 0.8135736584663391, + "learning_rate": 1.9279659729992888e-05, + "loss": 0.6031094193458557, + "step": 145 + }, + { + "epoch": 0.42588878760255244, + "grad_norm": 0.7971774339675903, + "learning_rate": 1.9260564458192926e-05, + "loss": 0.6101322770118713, + "step": 146 + }, + { + "epoch": 0.42880583409298084, + "grad_norm": 0.9374974966049194, + "learning_rate": 1.9241229100889397e-05, + "loss": 0.5836313366889954, + "step": 147 + }, + { + "epoch": 0.4317228805834093, + "grad_norm": 0.8043425679206848, + "learning_rate": 1.9221654159362636e-05, + "loss": 0.6181215047836304, + "step": 148 + }, + { + "epoch": 0.43463992707383775, + "grad_norm": 0.8923380374908447, + "learning_rate": 1.920184014110436e-05, + "loss": 0.6149677634239197, + "step": 149 + }, + { + "epoch": 0.4375569735642662, + "grad_norm": 0.8908132314682007, + "learning_rate": 1.918178755980449e-05, + "loss": 0.5899742841720581, + "step": 150 + }, + { + "epoch": 0.4375569735642662, + "eval_loss": 0.5903874635696411, + "eval_runtime": 1186.9542, + "eval_samples_per_second": 0.532, + "eval_steps_per_second": 0.532, + "step": 150 + }, + { + "epoch": 0.4404740200546946, + "grad_norm": 1.060531497001648, + "learning_rate": 1.9161496935337808e-05, + "loss": 0.5852696895599365, + "step": 151 + }, + { + "epoch": 0.44339106654512306, + "grad_norm": 0.9723032712936401, + "learning_rate": 1.914096879375053e-05, + "loss": 0.5822056531906128, + "step": 152 + }, + { + "epoch": 0.4463081130355515, + "grad_norm": 0.9519931674003601, + "learning_rate": 1.912020366724663e-05, + "loss": 0.6183493137359619, + "step": 153 + }, + { + "epoch": 0.44922515952597997, + "grad_norm": 0.8282918334007263, + "learning_rate": 1.9099202094174055e-05, + "loss": 0.6229860782623291, + "step": 154 + }, + { + "epoch": 0.45214220601640837, + "grad_norm": 0.9251292943954468, + "learning_rate": 1.907796461901076e-05, + "loss": 0.6552959680557251, + "step": 155 + }, + { + "epoch": 0.4550592525068368, + "grad_norm": 1.0349540710449219, + "learning_rate": 1.9056491792350606e-05, + "loss": 0.6170098781585693, + "step": 156 + }, + { + "epoch": 0.4579762989972653, + "grad_norm": 0.8720711469650269, + "learning_rate": 1.9034784170889076e-05, + "loss": 0.5870137810707092, + "step": 157 + }, + { + "epoch": 0.46089334548769373, + "grad_norm": 1.0785977840423584, + "learning_rate": 1.9012842317408843e-05, + "loss": 0.5515124201774597, + "step": 158 + }, + { + "epoch": 0.4638103919781221, + "grad_norm": 1.0634154081344604, + "learning_rate": 1.8990666800765187e-05, + "loss": 0.6073828339576721, + "step": 159 + }, + { + "epoch": 0.4667274384685506, + "grad_norm": 0.8770879507064819, + "learning_rate": 1.896825819587123e-05, + "loss": 0.5960907936096191, + "step": 160 + }, + { + "epoch": 0.46964448495897904, + "grad_norm": 1.1225898265838623, + "learning_rate": 1.894561708368305e-05, + "loss": 0.545990526676178, + "step": 161 + }, + { + "epoch": 0.4725615314494075, + "grad_norm": 0.9373893141746521, + "learning_rate": 1.8922744051184613e-05, + "loss": 0.5566108822822571, + "step": 162 + }, + { + "epoch": 0.4754785779398359, + "grad_norm": 1.5016087293624878, + "learning_rate": 1.8899639691372545e-05, + "loss": 0.558845043182373, + "step": 163 + }, + { + "epoch": 0.47839562443026434, + "grad_norm": 0.903020977973938, + "learning_rate": 1.8876304603240773e-05, + "loss": 0.6824233531951904, + "step": 164 + }, + { + "epoch": 0.4813126709206928, + "grad_norm": 0.8239623308181763, + "learning_rate": 1.8852739391764993e-05, + "loss": 0.5630610585212708, + "step": 165 + }, + { + "epoch": 0.48422971741112125, + "grad_norm": 0.926069438457489, + "learning_rate": 1.882894466788697e-05, + "loss": 0.6211802363395691, + "step": 166 + }, + { + "epoch": 0.4871467639015497, + "grad_norm": 1.0098828077316284, + "learning_rate": 1.8804921048498722e-05, + "loss": 0.5513257384300232, + "step": 167 + }, + { + "epoch": 0.4900638103919781, + "grad_norm": 0.9228141903877258, + "learning_rate": 1.8780669156426517e-05, + "loss": 0.6197121739387512, + "step": 168 + }, + { + "epoch": 0.49298085688240656, + "grad_norm": 1.0551754236221313, + "learning_rate": 1.8756189620414712e-05, + "loss": 0.5221806764602661, + "step": 169 + }, + { + "epoch": 0.495897903372835, + "grad_norm": 0.9017496109008789, + "learning_rate": 1.873148307510948e-05, + "loss": 0.5766995549201965, + "step": 170 + }, + { + "epoch": 0.49881494986326347, + "grad_norm": 0.9704970717430115, + "learning_rate": 1.870655016104233e-05, + "loss": 0.6514763832092285, + "step": 171 + }, + { + "epoch": 0.5017319963536919, + "grad_norm": 0.9972712397575378, + "learning_rate": 1.8681391524613518e-05, + "loss": 0.5273895263671875, + "step": 172 + }, + { + "epoch": 0.5046490428441204, + "grad_norm": 0.9473339319229126, + "learning_rate": 1.8656007818075288e-05, + "loss": 0.5548599362373352, + "step": 173 + }, + { + "epoch": 0.5075660893345487, + "grad_norm": 1.2493574619293213, + "learning_rate": 1.8630399699514944e-05, + "loss": 0.5593586564064026, + "step": 174 + }, + { + "epoch": 0.5104831358249772, + "grad_norm": 1.2766696214675903, + "learning_rate": 1.860456783283781e-05, + "loss": 0.6054630279541016, + "step": 175 + }, + { + "epoch": 0.5134001823154056, + "grad_norm": 0.9555240869522095, + "learning_rate": 1.857851288775002e-05, + "loss": 0.508592963218689, + "step": 176 + }, + { + "epoch": 0.5163172288058341, + "grad_norm": 1.260219931602478, + "learning_rate": 1.8552235539741118e-05, + "loss": 0.5532065629959106, + "step": 177 + }, + { + "epoch": 0.5192342752962625, + "grad_norm": 1.1859954595565796, + "learning_rate": 1.8525736470066595e-05, + "loss": 0.5683344006538391, + "step": 178 + }, + { + "epoch": 0.522151321786691, + "grad_norm": 1.3044344186782837, + "learning_rate": 1.8499016365730203e-05, + "loss": 0.5281959772109985, + "step": 179 + }, + { + "epoch": 0.5250683682771194, + "grad_norm": 1.3049921989440918, + "learning_rate": 1.8472075919466137e-05, + "loss": 0.49621230363845825, + "step": 180 + }, + { + "epoch": 0.5279854147675479, + "grad_norm": 1.0488537549972534, + "learning_rate": 1.844491582972109e-05, + "loss": 0.6194032430648804, + "step": 181 + }, + { + "epoch": 0.5309024612579762, + "grad_norm": 1.5553455352783203, + "learning_rate": 1.8417536800636138e-05, + "loss": 0.5645846724510193, + "step": 182 + }, + { + "epoch": 0.5338195077484047, + "grad_norm": 1.2673912048339844, + "learning_rate": 1.8389939542028484e-05, + "loss": 0.6267315745353699, + "step": 183 + }, + { + "epoch": 0.5367365542388332, + "grad_norm": 1.0273847579956055, + "learning_rate": 1.8362124769373064e-05, + "loss": 0.5256403684616089, + "step": 184 + }, + { + "epoch": 0.5396536007292616, + "grad_norm": 1.006093978881836, + "learning_rate": 1.8334093203783986e-05, + "loss": 0.5916382074356079, + "step": 185 + }, + { + "epoch": 0.5425706472196901, + "grad_norm": 1.2740857601165771, + "learning_rate": 1.8305845571995843e-05, + "loss": 0.581648588180542, + "step": 186 + }, + { + "epoch": 0.5454876937101185, + "grad_norm": 1.494248390197754, + "learning_rate": 1.8277382606344872e-05, + "loss": 0.4824523627758026, + "step": 187 + }, + { + "epoch": 0.548404740200547, + "grad_norm": 1.1862496137619019, + "learning_rate": 1.824870504474996e-05, + "loss": 0.5531858205795288, + "step": 188 + }, + { + "epoch": 0.5513217866909754, + "grad_norm": 3.503049373626709, + "learning_rate": 1.8219813630693523e-05, + "loss": 0.6308296918869019, + "step": 189 + }, + { + "epoch": 0.5542388331814039, + "grad_norm": 1.7544710636138916, + "learning_rate": 1.819070911320222e-05, + "loss": 0.6146273016929626, + "step": 190 + }, + { + "epoch": 0.5571558796718322, + "grad_norm": 1.3367774486541748, + "learning_rate": 1.8161392246827546e-05, + "loss": 0.5848966240882874, + "step": 191 + }, + { + "epoch": 0.5600729261622607, + "grad_norm": 1.696418046951294, + "learning_rate": 1.8131863791626263e-05, + "loss": 0.6621730327606201, + "step": 192 + }, + { + "epoch": 0.5629899726526891, + "grad_norm": 1.360052227973938, + "learning_rate": 1.8102124513140694e-05, + "loss": 0.5972204208374023, + "step": 193 + }, + { + "epoch": 0.5659070191431176, + "grad_norm": 1.5376263856887817, + "learning_rate": 1.807217518237888e-05, + "loss": 0.4938785433769226, + "step": 194 + }, + { + "epoch": 0.568824065633546, + "grad_norm": 1.2249681949615479, + "learning_rate": 1.8042016575794585e-05, + "loss": 0.5366095304489136, + "step": 195 + }, + { + "epoch": 0.5717411121239745, + "grad_norm": 1.7868080139160156, + "learning_rate": 1.8011649475267178e-05, + "loss": 0.5116773843765259, + "step": 196 + }, + { + "epoch": 0.574658158614403, + "grad_norm": 2.369993209838867, + "learning_rate": 1.7981074668081345e-05, + "loss": 0.49072742462158203, + "step": 197 + }, + { + "epoch": 0.5775752051048314, + "grad_norm": 1.0168434381484985, + "learning_rate": 1.7950292946906695e-05, + "loss": 0.5691611170768738, + "step": 198 + }, + { + "epoch": 0.5804922515952597, + "grad_norm": 1.2990851402282715, + "learning_rate": 1.7919305109777195e-05, + "loss": 0.5515039563179016, + "step": 199 + }, + { + "epoch": 0.5834092980856882, + "grad_norm": 1.4859853982925415, + "learning_rate": 1.7888111960070493e-05, + "loss": 0.5017011165618896, + "step": 200 + }, + { + "epoch": 0.5834092980856882, + "eval_loss": 0.5414339303970337, + "eval_runtime": 1180.7894, + "eval_samples_per_second": 0.535, + "eval_steps_per_second": 0.535, + "step": 200 + }, + { + "epoch": 0.5863263445761167, + "grad_norm": 1.0065829753875732, + "learning_rate": 1.7856714306487088e-05, + "loss": 0.5677731037139893, + "step": 201 + }, + { + "epoch": 0.5892433910665451, + "grad_norm": 1.1727538108825684, + "learning_rate": 1.7825112963029352e-05, + "loss": 0.4525509476661682, + "step": 202 + }, + { + "epoch": 0.5921604375569736, + "grad_norm": 1.3376752138137817, + "learning_rate": 1.7793308748980437e-05, + "loss": 0.5208959579467773, + "step": 203 + }, + { + "epoch": 0.595077484047402, + "grad_norm": 0.9196159839630127, + "learning_rate": 1.776130248888304e-05, + "loss": 0.6033903360366821, + "step": 204 + }, + { + "epoch": 0.5979945305378305, + "grad_norm": 1.0750919580459595, + "learning_rate": 1.772909501251801e-05, + "loss": 0.5449609160423279, + "step": 205 + }, + { + "epoch": 0.6009115770282589, + "grad_norm": 1.2459467649459839, + "learning_rate": 1.769668715488285e-05, + "loss": 0.5685338377952576, + "step": 206 + }, + { + "epoch": 0.6038286235186874, + "grad_norm": 1.1690552234649658, + "learning_rate": 1.766407975617006e-05, + "loss": 0.5240382552146912, + "step": 207 + }, + { + "epoch": 0.6067456700091157, + "grad_norm": 1.0816599130630493, + "learning_rate": 1.7631273661745362e-05, + "loss": 0.6802893877029419, + "step": 208 + }, + { + "epoch": 0.6096627164995442, + "grad_norm": 1.3662947416305542, + "learning_rate": 1.7598269722125775e-05, + "loss": 0.48193931579589844, + "step": 209 + }, + { + "epoch": 0.6125797629899726, + "grad_norm": 0.9364766478538513, + "learning_rate": 1.7565068792957576e-05, + "loss": 0.5675849914550781, + "step": 210 + }, + { + "epoch": 0.6154968094804011, + "grad_norm": 1.123828411102295, + "learning_rate": 1.75316717349941e-05, + "loss": 0.5474762916564941, + "step": 211 + }, + { + "epoch": 0.6184138559708295, + "grad_norm": 1.1924363374710083, + "learning_rate": 1.749807941407345e-05, + "loss": 0.4918654263019562, + "step": 212 + }, + { + "epoch": 0.621330902461258, + "grad_norm": 1.101293921470642, + "learning_rate": 1.7464292701096014e-05, + "loss": 0.5742691159248352, + "step": 213 + }, + { + "epoch": 0.6242479489516864, + "grad_norm": 1.7374963760375977, + "learning_rate": 1.7430312472001928e-05, + "loss": 0.5828965902328491, + "step": 214 + }, + { + "epoch": 0.6271649954421149, + "grad_norm": 1.3195666074752808, + "learning_rate": 1.739613960774833e-05, + "loss": 0.5265159010887146, + "step": 215 + }, + { + "epoch": 0.6300820419325432, + "grad_norm": 1.254686713218689, + "learning_rate": 1.7361774994286545e-05, + "loss": 0.4929371476173401, + "step": 216 + }, + { + "epoch": 0.6329990884229717, + "grad_norm": 1.1476380825042725, + "learning_rate": 1.7327219522539102e-05, + "loss": 0.5060417652130127, + "step": 217 + }, + { + "epoch": 0.6359161349134002, + "grad_norm": 1.0914150476455688, + "learning_rate": 1.7292474088376643e-05, + "loss": 0.504043698310852, + "step": 218 + }, + { + "epoch": 0.6388331814038286, + "grad_norm": 1.1339508295059204, + "learning_rate": 1.7257539592594698e-05, + "loss": 0.4797310531139374, + "step": 219 + }, + { + "epoch": 0.6417502278942571, + "grad_norm": 1.0805399417877197, + "learning_rate": 1.722241694089033e-05, + "loss": 0.5878555178642273, + "step": 220 + }, + { + "epoch": 0.6446672743846855, + "grad_norm": 1.8615056276321411, + "learning_rate": 1.718710704383865e-05, + "loss": 0.5005823969841003, + "step": 221 + }, + { + "epoch": 0.647584320875114, + "grad_norm": 1.1445401906967163, + "learning_rate": 1.7151610816869214e-05, + "loss": 0.4949319064617157, + "step": 222 + }, + { + "epoch": 0.6505013673655424, + "grad_norm": 0.9726515412330627, + "learning_rate": 1.711592918024229e-05, + "loss": 0.5073204040527344, + "step": 223 + }, + { + "epoch": 0.6534184138559709, + "grad_norm": 1.4491140842437744, + "learning_rate": 1.7080063059024998e-05, + "loss": 0.47885262966156006, + "step": 224 + }, + { + "epoch": 0.6563354603463992, + "grad_norm": 1.0070592164993286, + "learning_rate": 1.7044013383067327e-05, + "loss": 0.5775837898254395, + "step": 225 + }, + { + "epoch": 0.6592525068368277, + "grad_norm": 0.966221272945404, + "learning_rate": 1.7007781086978037e-05, + "loss": 0.5050399899482727, + "step": 226 + }, + { + "epoch": 0.6621695533272561, + "grad_norm": 0.9808815121650696, + "learning_rate": 1.6971367110100407e-05, + "loss": 0.5737045407295227, + "step": 227 + }, + { + "epoch": 0.6650865998176846, + "grad_norm": 1.0158127546310425, + "learning_rate": 1.6934772396487906e-05, + "loss": 0.48077821731567383, + "step": 228 + }, + { + "epoch": 0.668003646308113, + "grad_norm": 1.32015860080719, + "learning_rate": 1.6897997894879706e-05, + "loss": 0.5614925026893616, + "step": 229 + }, + { + "epoch": 0.6709206927985415, + "grad_norm": 1.1055903434753418, + "learning_rate": 1.686104455867608e-05, + "loss": 0.4970760643482208, + "step": 230 + }, + { + "epoch": 0.67383773928897, + "grad_norm": 1.0804500579833984, + "learning_rate": 1.682391334591371e-05, + "loss": 0.5540452003479004, + "step": 231 + }, + { + "epoch": 0.6767547857793984, + "grad_norm": 1.1906245946884155, + "learning_rate": 1.6786605219240807e-05, + "loss": 0.5778501033782959, + "step": 232 + }, + { + "epoch": 0.6796718322698267, + "grad_norm": 0.9758645296096802, + "learning_rate": 1.6749121145892192e-05, + "loss": 0.49073565006256104, + "step": 233 + }, + { + "epoch": 0.6825888787602552, + "grad_norm": 1.1678364276885986, + "learning_rate": 1.6711462097664207e-05, + "loss": 0.4828741252422333, + "step": 234 + }, + { + "epoch": 0.6855059252506837, + "grad_norm": 1.148301362991333, + "learning_rate": 1.6673629050889507e-05, + "loss": 0.5143818855285645, + "step": 235 + }, + { + "epoch": 0.6884229717411121, + "grad_norm": 1.005898356437683, + "learning_rate": 1.6635622986411776e-05, + "loss": 0.5301160216331482, + "step": 236 + }, + { + "epoch": 0.6913400182315406, + "grad_norm": 1.2227320671081543, + "learning_rate": 1.659744488956027e-05, + "loss": 0.4800386130809784, + "step": 237 + }, + { + "epoch": 0.694257064721969, + "grad_norm": 0.986456573009491, + "learning_rate": 1.6559095750124296e-05, + "loss": 0.5098081827163696, + "step": 238 + }, + { + "epoch": 0.6971741112123975, + "grad_norm": 1.1474376916885376, + "learning_rate": 1.6520576562327518e-05, + "loss": 0.5147273540496826, + "step": 239 + }, + { + "epoch": 0.7000911577028259, + "grad_norm": 1.10917067527771, + "learning_rate": 1.6481888324802223e-05, + "loss": 0.5023190379142761, + "step": 240 + }, + { + "epoch": 0.7030082041932544, + "grad_norm": 1.2339262962341309, + "learning_rate": 1.644303204056341e-05, + "loss": 0.5282092690467834, + "step": 241 + }, + { + "epoch": 0.7059252506836827, + "grad_norm": 0.997941255569458, + "learning_rate": 1.640400871698277e-05, + "loss": 0.5635963082313538, + "step": 242 + }, + { + "epoch": 0.7088422971741112, + "grad_norm": 1.0345823764801025, + "learning_rate": 1.63648193657626e-05, + "loss": 0.5577977895736694, + "step": 243 + }, + { + "epoch": 0.7117593436645396, + "grad_norm": 1.3468303680419922, + "learning_rate": 1.6325465002909554e-05, + "loss": 0.4365362524986267, + "step": 244 + }, + { + "epoch": 0.7146763901549681, + "grad_norm": 1.2817128896713257, + "learning_rate": 1.628594664870831e-05, + "loss": 0.46069926023483276, + "step": 245 + }, + { + "epoch": 0.7175934366453965, + "grad_norm": 1.043311357498169, + "learning_rate": 1.6246265327695117e-05, + "loss": 0.5476971864700317, + "step": 246 + }, + { + "epoch": 0.720510483135825, + "grad_norm": 1.0297389030456543, + "learning_rate": 1.620642206863124e-05, + "loss": 0.48051249980926514, + "step": 247 + }, + { + "epoch": 0.7234275296262535, + "grad_norm": 1.4869836568832397, + "learning_rate": 1.6166417904476257e-05, + "loss": 0.5683314800262451, + "step": 248 + }, + { + "epoch": 0.7263445761166819, + "grad_norm": 1.0628005266189575, + "learning_rate": 1.6126253872361336e-05, + "loss": 0.5277887582778931, + "step": 249 + }, + { + "epoch": 0.7292616226071102, + "grad_norm": 1.2682170867919922, + "learning_rate": 1.608593101356229e-05, + "loss": 0.5048879384994507, + "step": 250 + }, + { + "epoch": 0.7292616226071102, + "eval_loss": 0.5038471221923828, + "eval_runtime": 1175.0375, + "eval_samples_per_second": 0.538, + "eval_steps_per_second": 0.538, + "step": 250 + }, + { + "epoch": 0.7321786690975387, + "grad_norm": 1.7376199960708618, + "learning_rate": 1.6045450373472626e-05, + "loss": 0.5093721151351929, + "step": 251 + }, + { + "epoch": 0.7350957155879672, + "grad_norm": 1.6047718524932861, + "learning_rate": 1.6004813001576405e-05, + "loss": 0.4796055555343628, + "step": 252 + }, + { + "epoch": 0.7380127620783956, + "grad_norm": 1.3582886457443237, + "learning_rate": 1.5964019951421058e-05, + "loss": 0.4733014702796936, + "step": 253 + }, + { + "epoch": 0.7409298085688241, + "grad_norm": 0.9468897581100464, + "learning_rate": 1.5923072280590072e-05, + "loss": 0.5312032103538513, + "step": 254 + }, + { + "epoch": 0.7438468550592525, + "grad_norm": 1.3890198469161987, + "learning_rate": 1.5881971050675547e-05, + "loss": 0.47576645016670227, + "step": 255 + }, + { + "epoch": 0.746763901549681, + "grad_norm": 1.782992959022522, + "learning_rate": 1.584071732725071e-05, + "loss": 0.5555092096328735, + "step": 256 + }, + { + "epoch": 0.7496809480401094, + "grad_norm": 1.1790621280670166, + "learning_rate": 1.5799312179842265e-05, + "loss": 0.5148727893829346, + "step": 257 + }, + { + "epoch": 0.7525979945305379, + "grad_norm": 1.446694254875183, + "learning_rate": 1.5757756681902664e-05, + "loss": 0.49939870834350586, + "step": 258 + }, + { + "epoch": 0.7555150410209662, + "grad_norm": 1.1786166429519653, + "learning_rate": 1.571605191078229e-05, + "loss": 0.562156081199646, + "step": 259 + }, + { + "epoch": 0.7584320875113947, + "grad_norm": 1.16925847530365, + "learning_rate": 1.567419894770151e-05, + "loss": 0.49580734968185425, + "step": 260 + }, + { + "epoch": 0.7613491340018231, + "grad_norm": 1.60944664478302, + "learning_rate": 1.5632198877722676e-05, + "loss": 0.4821680784225464, + "step": 261 + }, + { + "epoch": 0.7642661804922516, + "grad_norm": 1.3957884311676025, + "learning_rate": 1.5590052789721946e-05, + "loss": 0.4392276406288147, + "step": 262 + }, + { + "epoch": 0.76718322698268, + "grad_norm": 1.636195421218872, + "learning_rate": 1.5547761776361096e-05, + "loss": 0.39603114128112793, + "step": 263 + }, + { + "epoch": 0.7701002734731085, + "grad_norm": 1.496766448020935, + "learning_rate": 1.550532693405917e-05, + "loss": 0.4833749234676361, + "step": 264 + }, + { + "epoch": 0.773017319963537, + "grad_norm": 1.3587844371795654, + "learning_rate": 1.5462749362964058e-05, + "loss": 0.43738317489624023, + "step": 265 + }, + { + "epoch": 0.7759343664539654, + "grad_norm": 1.670704960823059, + "learning_rate": 1.5420030166923983e-05, + "loss": 0.4476737380027771, + "step": 266 + }, + { + "epoch": 0.7788514129443938, + "grad_norm": 1.2674932479858398, + "learning_rate": 1.537717045345888e-05, + "loss": 0.42266708612442017, + "step": 267 + }, + { + "epoch": 0.7817684594348222, + "grad_norm": 2.0639536380767822, + "learning_rate": 1.5334171333731666e-05, + "loss": 0.5245381593704224, + "step": 268 + }, + { + "epoch": 0.7846855059252507, + "grad_norm": 1.2091766595840454, + "learning_rate": 1.529103392251946e-05, + "loss": 0.5166443586349487, + "step": 269 + }, + { + "epoch": 0.7876025524156791, + "grad_norm": 1.1021631956100464, + "learning_rate": 1.5247759338184653e-05, + "loss": 0.5674265027046204, + "step": 270 + }, + { + "epoch": 0.7905195989061076, + "grad_norm": 1.3143829107284546, + "learning_rate": 1.520434870264595e-05, + "loss": 0.40855613350868225, + "step": 271 + }, + { + "epoch": 0.793436645396536, + "grad_norm": 1.1784812211990356, + "learning_rate": 1.5160803141349244e-05, + "loss": 0.4308925271034241, + "step": 272 + }, + { + "epoch": 0.7963536918869645, + "grad_norm": 2.1635706424713135, + "learning_rate": 1.5117123783238458e-05, + "loss": 0.45035502314567566, + "step": 273 + }, + { + "epoch": 0.7992707383773929, + "grad_norm": 1.569203495979309, + "learning_rate": 1.5073311760726287e-05, + "loss": 0.5095728635787964, + "step": 274 + }, + { + "epoch": 0.8021877848678214, + "grad_norm": 2.532621383666992, + "learning_rate": 1.5029368209664822e-05, + "loss": 0.496748685836792, + "step": 275 + }, + { + "epoch": 0.8051048313582497, + "grad_norm": 1.6312552690505981, + "learning_rate": 1.4985294269316098e-05, + "loss": 0.4972914159297943, + "step": 276 + }, + { + "epoch": 0.8080218778486782, + "grad_norm": 1.3996756076812744, + "learning_rate": 1.4941091082322579e-05, + "loss": 0.5589750409126282, + "step": 277 + }, + { + "epoch": 0.8109389243391066, + "grad_norm": 1.1288363933563232, + "learning_rate": 1.4896759794677526e-05, + "loss": 0.5349453687667847, + "step": 278 + }, + { + "epoch": 0.8138559708295351, + "grad_norm": 1.6913920640945435, + "learning_rate": 1.4852301555695268e-05, + "loss": 0.46511000394821167, + "step": 279 + }, + { + "epoch": 0.8167730173199635, + "grad_norm": 1.1913212537765503, + "learning_rate": 1.4807717517981439e-05, + "loss": 0.4715422987937927, + "step": 280 + }, + { + "epoch": 0.819690063810392, + "grad_norm": 1.1179691553115845, + "learning_rate": 1.476300883740307e-05, + "loss": 0.53330397605896, + "step": 281 + }, + { + "epoch": 0.8226071103008205, + "grad_norm": 1.7473797798156738, + "learning_rate": 1.4718176673058624e-05, + "loss": 0.47564437985420227, + "step": 282 + }, + { + "epoch": 0.8255241567912489, + "grad_norm": 1.2653177976608276, + "learning_rate": 1.4673222187247963e-05, + "loss": 0.46364277601242065, + "step": 283 + }, + { + "epoch": 0.8284412032816773, + "grad_norm": 1.2567330598831177, + "learning_rate": 1.4628146545442202e-05, + "loss": 0.4778091013431549, + "step": 284 + }, + { + "epoch": 0.8313582497721057, + "grad_norm": 1.5848406553268433, + "learning_rate": 1.4582950916253488e-05, + "loss": 0.4480203688144684, + "step": 285 + }, + { + "epoch": 0.8342752962625342, + "grad_norm": 1.3278183937072754, + "learning_rate": 1.453763647140472e-05, + "loss": 0.37945032119750977, + "step": 286 + }, + { + "epoch": 0.8371923427529626, + "grad_norm": 1.0961651802062988, + "learning_rate": 1.4492204385699155e-05, + "loss": 0.5306747555732727, + "step": 287 + }, + { + "epoch": 0.8401093892433911, + "grad_norm": 1.176276683807373, + "learning_rate": 1.4446655836989961e-05, + "loss": 0.49950045347213745, + "step": 288 + }, + { + "epoch": 0.8430264357338195, + "grad_norm": 1.2228269577026367, + "learning_rate": 1.4400992006149674e-05, + "loss": 0.494475394487381, + "step": 289 + }, + { + "epoch": 0.845943482224248, + "grad_norm": 1.1584209203720093, + "learning_rate": 1.4355214077039592e-05, + "loss": 0.44170859456062317, + "step": 290 + }, + { + "epoch": 0.8488605287146764, + "grad_norm": 1.2041938304901123, + "learning_rate": 1.4309323236479071e-05, + "loss": 0.4359871745109558, + "step": 291 + }, + { + "epoch": 0.8517775752051049, + "grad_norm": 1.279645562171936, + "learning_rate": 1.4263320674214762e-05, + "loss": 0.45031386613845825, + "step": 292 + }, + { + "epoch": 0.8546946216955332, + "grad_norm": 1.3958357572555542, + "learning_rate": 1.4217207582889769e-05, + "loss": 0.4832204580307007, + "step": 293 + }, + { + "epoch": 0.8576116681859617, + "grad_norm": 1.2788586616516113, + "learning_rate": 1.4170985158012725e-05, + "loss": 0.5154346227645874, + "step": 294 + }, + { + "epoch": 0.8605287146763901, + "grad_norm": 1.3634892702102661, + "learning_rate": 1.4124654597926795e-05, + "loss": 0.46777206659317017, + "step": 295 + }, + { + "epoch": 0.8634457611668186, + "grad_norm": 1.2719579935073853, + "learning_rate": 1.4078217103778619e-05, + "loss": 0.4247053265571594, + "step": 296 + }, + { + "epoch": 0.866362807657247, + "grad_norm": 2.890467643737793, + "learning_rate": 1.4031673879487161e-05, + "loss": 0.38349640369415283, + "step": 297 + }, + { + "epoch": 0.8692798541476755, + "grad_norm": 2.4354801177978516, + "learning_rate": 1.3985026131712499e-05, + "loss": 0.4134889543056488, + "step": 298 + }, + { + "epoch": 0.872196900638104, + "grad_norm": 1.0138323307037354, + "learning_rate": 1.3938275069824541e-05, + "loss": 0.5176680684089661, + "step": 299 + }, + { + "epoch": 0.8751139471285324, + "grad_norm": 1.2316186428070068, + "learning_rate": 1.389142190587168e-05, + "loss": 0.4818477928638458, + "step": 300 + }, + { + "epoch": 0.8751139471285324, + "eval_loss": 0.4752846360206604, + "eval_runtime": 1189.1666, + "eval_samples_per_second": 0.531, + "eval_steps_per_second": 0.531, + "step": 300 + }, + { + "epoch": 0.8780309936189608, + "grad_norm": 1.515487551689148, + "learning_rate": 1.384446785454936e-05, + "loss": 0.47766175866127014, + "step": 301 + }, + { + "epoch": 0.8809480401093892, + "grad_norm": 1.4357497692108154, + "learning_rate": 1.3797414133168591e-05, + "loss": 0.49297061562538147, + "step": 302 + }, + { + "epoch": 0.8838650865998177, + "grad_norm": 1.2523037195205688, + "learning_rate": 1.3750261961624383e-05, + "loss": 0.4629015326499939, + "step": 303 + }, + { + "epoch": 0.8867821330902461, + "grad_norm": 3.5790023803710938, + "learning_rate": 1.3703012562364124e-05, + "loss": 0.3773120045661926, + "step": 304 + }, + { + "epoch": 0.8896991795806746, + "grad_norm": 1.9305704832077026, + "learning_rate": 1.3655667160355892e-05, + "loss": 0.496719628572464, + "step": 305 + }, + { + "epoch": 0.892616226071103, + "grad_norm": 1.1506154537200928, + "learning_rate": 1.3608226983056687e-05, + "loss": 0.49487072229385376, + "step": 306 + }, + { + "epoch": 0.8955332725615315, + "grad_norm": 1.8046090602874756, + "learning_rate": 1.3560693260380614e-05, + "loss": 0.4910697937011719, + "step": 307 + }, + { + "epoch": 0.8984503190519599, + "grad_norm": 2.0088653564453125, + "learning_rate": 1.3513067224667e-05, + "loss": 0.508246660232544, + "step": 308 + }, + { + "epoch": 0.9013673655423883, + "grad_norm": 1.2966033220291138, + "learning_rate": 1.3465350110648437e-05, + "loss": 0.5125166177749634, + "step": 309 + }, + { + "epoch": 0.9042844120328167, + "grad_norm": 1.9976309537887573, + "learning_rate": 1.3417543155418775e-05, + "loss": 0.43942537903785706, + "step": 310 + }, + { + "epoch": 0.9072014585232452, + "grad_norm": 1.2663682699203491, + "learning_rate": 1.336964759840105e-05, + "loss": 0.4839101731777191, + "step": 311 + }, + { + "epoch": 0.9101185050136736, + "grad_norm": 1.1223328113555908, + "learning_rate": 1.3321664681315354e-05, + "loss": 0.48008066415786743, + "step": 312 + }, + { + "epoch": 0.9130355515041021, + "grad_norm": 1.5786972045898438, + "learning_rate": 1.3273595648146634e-05, + "loss": 0.47250309586524963, + "step": 313 + }, + { + "epoch": 0.9159525979945305, + "grad_norm": 1.2150241136550903, + "learning_rate": 1.322544174511245e-05, + "loss": 0.5149738788604736, + "step": 314 + }, + { + "epoch": 0.918869644484959, + "grad_norm": 1.3676542043685913, + "learning_rate": 1.3177204220630662e-05, + "loss": 0.4430195093154907, + "step": 315 + }, + { + "epoch": 0.9217866909753875, + "grad_norm": 1.0703285932540894, + "learning_rate": 1.3128884325287064e-05, + "loss": 0.4798983037471771, + "step": 316 + }, + { + "epoch": 0.9247037374658159, + "grad_norm": 1.3131535053253174, + "learning_rate": 1.308048331180296e-05, + "loss": 0.4241073727607727, + "step": 317 + }, + { + "epoch": 0.9276207839562443, + "grad_norm": 1.4485348463058472, + "learning_rate": 1.3032002435002698e-05, + "loss": 0.527199923992157, + "step": 318 + }, + { + "epoch": 0.9305378304466727, + "grad_norm": 1.370936393737793, + "learning_rate": 1.2983442951781114e-05, + "loss": 0.47125962376594543, + "step": 319 + }, + { + "epoch": 0.9334548769371012, + "grad_norm": 1.2369643449783325, + "learning_rate": 1.2934806121070973e-05, + "loss": 0.4814244210720062, + "step": 320 + }, + { + "epoch": 0.9363719234275296, + "grad_norm": 1.2632933855056763, + "learning_rate": 1.2886093203810314e-05, + "loss": 0.4915548264980316, + "step": 321 + }, + { + "epoch": 0.9392889699179581, + "grad_norm": 1.054569959640503, + "learning_rate": 1.2837305462909764e-05, + "loss": 0.5325602293014526, + "step": 322 + }, + { + "epoch": 0.9422060164083865, + "grad_norm": 1.15959632396698, + "learning_rate": 1.27884441632198e-05, + "loss": 0.43607404828071594, + "step": 323 + }, + { + "epoch": 0.945123062898815, + "grad_norm": 1.1667979955673218, + "learning_rate": 1.2739510571497945e-05, + "loss": 0.4631507992744446, + "step": 324 + }, + { + "epoch": 0.9480401093892434, + "grad_norm": 1.6009081602096558, + "learning_rate": 1.2690505956375944e-05, + "loss": 0.4935731887817383, + "step": 325 + }, + { + "epoch": 0.9509571558796718, + "grad_norm": 1.1193996667861938, + "learning_rate": 1.2641431588326858e-05, + "loss": 0.45883435010910034, + "step": 326 + }, + { + "epoch": 0.9538742023701002, + "grad_norm": 1.5365067720413208, + "learning_rate": 1.2592288739632138e-05, + "loss": 0.5206276178359985, + "step": 327 + }, + { + "epoch": 0.9567912488605287, + "grad_norm": 1.0714622735977173, + "learning_rate": 1.2543078684348632e-05, + "loss": 0.5242853760719299, + "step": 328 + }, + { + "epoch": 0.9597082953509571, + "grad_norm": 1.3009248971939087, + "learning_rate": 1.2493802698275557e-05, + "loss": 0.4794357717037201, + "step": 329 + }, + { + "epoch": 0.9626253418413856, + "grad_norm": 1.495771050453186, + "learning_rate": 1.244446205892143e-05, + "loss": 0.5849282145500183, + "step": 330 + }, + { + "epoch": 0.965542388331814, + "grad_norm": 1.2046003341674805, + "learning_rate": 1.2395058045470935e-05, + "loss": 0.47758305072784424, + "step": 331 + }, + { + "epoch": 0.9684594348222425, + "grad_norm": 1.1362569332122803, + "learning_rate": 1.2345591938751772e-05, + "loss": 0.4490663409233093, + "step": 332 + }, + { + "epoch": 0.971376481312671, + "grad_norm": 1.2658129930496216, + "learning_rate": 1.2296065021201438e-05, + "loss": 0.4035309851169586, + "step": 333 + }, + { + "epoch": 0.9742935278030994, + "grad_norm": 4.370306015014648, + "learning_rate": 1.2246478576833993e-05, + "loss": 0.495273619890213, + "step": 334 + }, + { + "epoch": 0.9772105742935278, + "grad_norm": 1.3863654136657715, + "learning_rate": 1.219683389120676e-05, + "loss": 0.46410733461380005, + "step": 335 + }, + { + "epoch": 0.9801276207839562, + "grad_norm": 1.4544321298599243, + "learning_rate": 1.2147132251387004e-05, + "loss": 0.4301709830760956, + "step": 336 + }, + { + "epoch": 0.9830446672743847, + "grad_norm": 1.0852457284927368, + "learning_rate": 1.2097374945918554e-05, + "loss": 0.48892468214035034, + "step": 337 + }, + { + "epoch": 0.9859617137648131, + "grad_norm": 1.5062257051467896, + "learning_rate": 1.2047563264788412e-05, + "loss": 0.4667983055114746, + "step": 338 + }, + { + "epoch": 0.9888787602552416, + "grad_norm": 1.2472951412200928, + "learning_rate": 1.199769849939329e-05, + "loss": 0.4827345013618469, + "step": 339 + }, + { + "epoch": 0.99179580674567, + "grad_norm": 1.2589871883392334, + "learning_rate": 1.1947781942506151e-05, + "loss": 0.405245304107666, + "step": 340 + }, + { + "epoch": 0.9947128532360985, + "grad_norm": 1.25636625289917, + "learning_rate": 1.1897814888242679e-05, + "loss": 0.37956133484840393, + "step": 341 + }, + { + "epoch": 0.9976298997265269, + "grad_norm": 2.7064895629882812, + "learning_rate": 1.1847798632027726e-05, + "loss": 0.489456444978714, + "step": 342 + }, + { + "epoch": 1.0, + "grad_norm": 1.6156240701675415, + "learning_rate": 1.1797734470561744e-05, + "loss": 0.46473199129104614, + "step": 343 + }, + { + "epoch": 1.0029170464904285, + "grad_norm": 1.3046343326568604, + "learning_rate": 1.1747623701787143e-05, + "loss": 0.3504878282546997, + "step": 344 + }, + { + "epoch": 1.005834092980857, + "grad_norm": 1.414828896522522, + "learning_rate": 1.1697467624854666e-05, + "loss": 0.4719260334968567, + "step": 345 + }, + { + "epoch": 1.0087511394712854, + "grad_norm": 1.1873356103897095, + "learning_rate": 1.164726754008969e-05, + "loss": 0.45313555002212524, + "step": 346 + }, + { + "epoch": 1.0116681859617138, + "grad_norm": 1.1382380723953247, + "learning_rate": 1.1597024748958526e-05, + "loss": 0.4365478456020355, + "step": 347 + }, + { + "epoch": 1.0145852324521423, + "grad_norm": 1.8141961097717285, + "learning_rate": 1.1546740554034661e-05, + "loss": 0.3694503605365753, + "step": 348 + }, + { + "epoch": 1.0175022789425707, + "grad_norm": 1.333388328552246, + "learning_rate": 1.1496416258965015e-05, + "loss": 0.4755721688270569, + "step": 349 + }, + { + "epoch": 1.0204193254329992, + "grad_norm": 1.3464443683624268, + "learning_rate": 1.1446053168436117e-05, + "loss": 0.4227846562862396, + "step": 350 + }, + { + "epoch": 1.0204193254329992, + "eval_loss": 0.44924086332321167, + "eval_runtime": 1214.6648, + "eval_samples_per_second": 0.52, + "eval_steps_per_second": 0.52, + "step": 350 + }, + { + "epoch": 1.0233363719234276, + "grad_norm": 1.2682689428329468, + "learning_rate": 1.1395652588140292e-05, + "loss": 0.44300130009651184, + "step": 351 + }, + { + "epoch": 1.0262534184138559, + "grad_norm": 1.7737696170806885, + "learning_rate": 1.1345215824741814e-05, + "loss": 0.5106258988380432, + "step": 352 + }, + { + "epoch": 1.0291704649042843, + "grad_norm": 1.2601238489151, + "learning_rate": 1.1294744185843014e-05, + "loss": 0.45930635929107666, + "step": 353 + }, + { + "epoch": 1.0320875113947128, + "grad_norm": 1.2162678241729736, + "learning_rate": 1.1244238979950406e-05, + "loss": 0.44163084030151367, + "step": 354 + }, + { + "epoch": 1.0350045578851412, + "grad_norm": 1.0905817747116089, + "learning_rate": 1.1193701516440733e-05, + "loss": 0.510662317276001, + "step": 355 + }, + { + "epoch": 1.0379216043755697, + "grad_norm": 0.9624952673912048, + "learning_rate": 1.1143133105527048e-05, + "loss": 0.5297917127609253, + "step": 356 + }, + { + "epoch": 1.0408386508659981, + "grad_norm": 1.2757681608200073, + "learning_rate": 1.1092535058224725e-05, + "loss": 0.4332093596458435, + "step": 357 + }, + { + "epoch": 1.0437556973564266, + "grad_norm": 1.6885719299316406, + "learning_rate": 1.104190868631748e-05, + "loss": 0.4337635040283203, + "step": 358 + }, + { + "epoch": 1.046672743846855, + "grad_norm": 1.175484538078308, + "learning_rate": 1.099125530232336e-05, + "loss": 0.45411020517349243, + "step": 359 + }, + { + "epoch": 1.0495897903372835, + "grad_norm": 1.0964939594268799, + "learning_rate": 1.0940576219460723e-05, + "loss": 0.5333439707756042, + "step": 360 + }, + { + "epoch": 1.052506836827712, + "grad_norm": 1.5493136644363403, + "learning_rate": 1.0889872751614176e-05, + "loss": 0.4400906264781952, + "step": 361 + }, + { + "epoch": 1.0554238833181404, + "grad_norm": 1.2491416931152344, + "learning_rate": 1.0839146213300526e-05, + "loss": 0.31049978733062744, + "step": 362 + }, + { + "epoch": 1.0583409298085689, + "grad_norm": 1.7213693857192993, + "learning_rate": 1.0788397919634694e-05, + "loss": 0.389009028673172, + "step": 363 + }, + { + "epoch": 1.0612579762989973, + "grad_norm": 1.5405336618423462, + "learning_rate": 1.0737629186295621e-05, + "loss": 0.4068562984466553, + "step": 364 + }, + { + "epoch": 1.0641750227894258, + "grad_norm": 1.225455641746521, + "learning_rate": 1.0686841329492159e-05, + "loss": 0.47358617186546326, + "step": 365 + }, + { + "epoch": 1.0670920692798542, + "grad_norm": 1.3436250686645508, + "learning_rate": 1.0636035665928945e-05, + "loss": 0.47050854563713074, + "step": 366 + }, + { + "epoch": 1.0700091157702827, + "grad_norm": 1.4952112436294556, + "learning_rate": 1.058521351277227e-05, + "loss": 0.43496906757354736, + "step": 367 + }, + { + "epoch": 1.072926162260711, + "grad_norm": 1.549112319946289, + "learning_rate": 1.0534376187615924e-05, + "loss": 0.45711052417755127, + "step": 368 + }, + { + "epoch": 1.0758432087511394, + "grad_norm": 1.3851526975631714, + "learning_rate": 1.048352500844704e-05, + "loss": 0.45045915246009827, + "step": 369 + }, + { + "epoch": 1.0787602552415678, + "grad_norm": 1.6302049160003662, + "learning_rate": 1.0432661293611927e-05, + "loss": 0.3736046254634857, + "step": 370 + }, + { + "epoch": 1.0816773017319963, + "grad_norm": 1.3365869522094727, + "learning_rate": 1.0381786361781885e-05, + "loss": 0.42242100834846497, + "step": 371 + }, + { + "epoch": 1.0845943482224247, + "grad_norm": 1.4369138479232788, + "learning_rate": 1.0330901531919026e-05, + "loss": 0.44570961594581604, + "step": 372 + }, + { + "epoch": 1.0875113947128532, + "grad_norm": 1.3528283834457397, + "learning_rate": 1.0280008123242069e-05, + "loss": 0.43440738320350647, + "step": 373 + }, + { + "epoch": 1.0904284412032816, + "grad_norm": 1.469660997390747, + "learning_rate": 1.0229107455192147e-05, + "loss": 0.3960394263267517, + "step": 374 + }, + { + "epoch": 1.09334548769371, + "grad_norm": 1.4542185068130493, + "learning_rate": 1.0178200847398595e-05, + "loss": 0.47834208607673645, + "step": 375 + }, + { + "epoch": 1.0962625341841385, + "grad_norm": 1.6470292806625366, + "learning_rate": 1.0127289619644737e-05, + "loss": 0.42791086435317993, + "step": 376 + }, + { + "epoch": 1.099179580674567, + "grad_norm": 1.1934021711349487, + "learning_rate": 1.0076375091833681e-05, + "loss": 0.4401305019855499, + "step": 377 + }, + { + "epoch": 1.1020966271649955, + "grad_norm": 0.9786668419837952, + "learning_rate": 1.0025458583954078e-05, + "loss": 0.4816555678844452, + "step": 378 + }, + { + "epoch": 1.105013673655424, + "grad_norm": 1.1348779201507568, + "learning_rate": 9.974541416045924e-06, + "loss": 0.41516968607902527, + "step": 379 + }, + { + "epoch": 1.1079307201458524, + "grad_norm": 1.0188615322113037, + "learning_rate": 9.923624908166322e-06, + "loss": 0.48087278008461, + "step": 380 + }, + { + "epoch": 1.1108477666362808, + "grad_norm": 1.0821740627288818, + "learning_rate": 9.872710380355263e-06, + "loss": 0.41974008083343506, + "step": 381 + }, + { + "epoch": 1.1137648131267093, + "grad_norm": 1.250951886177063, + "learning_rate": 9.82179915260141e-06, + "loss": 0.42703643441200256, + "step": 382 + }, + { + "epoch": 1.1166818596171377, + "grad_norm": 1.4528254270553589, + "learning_rate": 9.770892544807856e-06, + "loss": 0.43801453709602356, + "step": 383 + }, + { + "epoch": 1.1195989061075662, + "grad_norm": 1.813859462738037, + "learning_rate": 9.719991876757934e-06, + "loss": 0.4344240725040436, + "step": 384 + }, + { + "epoch": 1.1225159525979946, + "grad_norm": 1.6681253910064697, + "learning_rate": 9.669098468080976e-06, + "loss": 0.4356998801231384, + "step": 385 + }, + { + "epoch": 1.125432999088423, + "grad_norm": 1.3447953462600708, + "learning_rate": 9.618213638218117e-06, + "loss": 0.43189188838005066, + "step": 386 + }, + { + "epoch": 1.1283500455788513, + "grad_norm": 1.9577926397323608, + "learning_rate": 9.567338706388074e-06, + "loss": 0.34984707832336426, + "step": 387 + }, + { + "epoch": 1.1312670920692798, + "grad_norm": 1.5225576162338257, + "learning_rate": 9.516474991552965e-06, + "loss": 0.4243963062763214, + "step": 388 + }, + { + "epoch": 1.1341841385597082, + "grad_norm": 1.7416809797286987, + "learning_rate": 9.46562381238408e-06, + "loss": 0.3414606750011444, + "step": 389 + }, + { + "epoch": 1.1371011850501367, + "grad_norm": 1.8358951807022095, + "learning_rate": 9.414786487227732e-06, + "loss": 0.387447327375412, + "step": 390 + }, + { + "epoch": 1.1400182315405651, + "grad_norm": 1.9706153869628906, + "learning_rate": 9.363964334071057e-06, + "loss": 0.4599088728427887, + "step": 391 + }, + { + "epoch": 1.1429352780309936, + "grad_norm": 1.0604286193847656, + "learning_rate": 9.313158670507843e-06, + "loss": 0.4633581042289734, + "step": 392 + }, + { + "epoch": 1.145852324521422, + "grad_norm": 1.4851202964782715, + "learning_rate": 9.262370813704379e-06, + "loss": 0.3872259557247162, + "step": 393 + }, + { + "epoch": 1.1487693710118505, + "grad_norm": 1.7839159965515137, + "learning_rate": 9.21160208036531e-06, + "loss": 0.5215944647789001, + "step": 394 + }, + { + "epoch": 1.151686417502279, + "grad_norm": 1.3054656982421875, + "learning_rate": 9.160853786699475e-06, + "loss": 0.4030425548553467, + "step": 395 + }, + { + "epoch": 1.1546034639927074, + "grad_norm": 3.8467981815338135, + "learning_rate": 9.110127248385827e-06, + "loss": 0.4032524824142456, + "step": 396 + }, + { + "epoch": 1.1575205104831359, + "grad_norm": 1.8513801097869873, + "learning_rate": 9.05942378053928e-06, + "loss": 0.46577155590057373, + "step": 397 + }, + { + "epoch": 1.1604375569735643, + "grad_norm": 1.312689185142517, + "learning_rate": 9.008744697676642e-06, + "loss": 0.39114487171173096, + "step": 398 + }, + { + "epoch": 1.1633546034639928, + "grad_norm": 1.1996328830718994, + "learning_rate": 8.958091313682521e-06, + "loss": 0.481199711561203, + "step": 399 + }, + { + "epoch": 1.1662716499544212, + "grad_norm": 5.172409534454346, + "learning_rate": 8.90746494177528e-06, + "loss": 0.3803558945655823, + "step": 400 + }, + { + "epoch": 1.1662716499544212, + "eval_loss": 0.4318464398384094, + "eval_runtime": 1206.0306, + "eval_samples_per_second": 0.524, + "eval_steps_per_second": 0.524, + "step": 400 + }, + { + "epoch": 1.1691886964448497, + "grad_norm": 1.0115015506744385, + "learning_rate": 8.856866894472954e-06, + "loss": 0.39636704325675964, + "step": 401 + }, + { + "epoch": 1.172105742935278, + "grad_norm": 1.1557435989379883, + "learning_rate": 8.806298483559268e-06, + "loss": 0.4076298475265503, + "step": 402 + }, + { + "epoch": 1.1750227894257064, + "grad_norm": 1.2802515029907227, + "learning_rate": 8.755761020049597e-06, + "loss": 0.44352248311042786, + "step": 403 + }, + { + "epoch": 1.1779398359161348, + "grad_norm": 1.2755069732666016, + "learning_rate": 8.705255814156988e-06, + "loss": 0.390497624874115, + "step": 404 + }, + { + "epoch": 1.1808568824065633, + "grad_norm": 1.2799782752990723, + "learning_rate": 8.654784175258188e-06, + "loss": 0.35810694098472595, + "step": 405 + }, + { + "epoch": 1.1837739288969917, + "grad_norm": 1.0968674421310425, + "learning_rate": 8.604347411859713e-06, + "loss": 0.3890265226364136, + "step": 406 + }, + { + "epoch": 1.1866909753874202, + "grad_norm": 1.3334455490112305, + "learning_rate": 8.553946831563886e-06, + "loss": 0.3916901648044586, + "step": 407 + }, + { + "epoch": 1.1896080218778486, + "grad_norm": 1.1888184547424316, + "learning_rate": 8.503583741034988e-06, + "loss": 0.5231326222419739, + "step": 408 + }, + { + "epoch": 1.192525068368277, + "grad_norm": 1.1163763999938965, + "learning_rate": 8.45325944596534e-06, + "loss": 0.4249858558177948, + "step": 409 + }, + { + "epoch": 1.1954421148587056, + "grad_norm": 1.3470333814620972, + "learning_rate": 8.40297525104148e-06, + "loss": 0.5201632380485535, + "step": 410 + }, + { + "epoch": 1.198359161349134, + "grad_norm": 1.5412285327911377, + "learning_rate": 8.35273245991031e-06, + "loss": 0.39376699924468994, + "step": 411 + }, + { + "epoch": 1.2012762078395625, + "grad_norm": 1.3408735990524292, + "learning_rate": 8.302532375145339e-06, + "loss": 0.39554283022880554, + "step": 412 + }, + { + "epoch": 1.204193254329991, + "grad_norm": 1.990668773651123, + "learning_rate": 8.25237629821286e-06, + "loss": 0.42424261569976807, + "step": 413 + }, + { + "epoch": 1.2071103008204194, + "grad_norm": 1.6471989154815674, + "learning_rate": 8.202265529438259e-06, + "loss": 0.3234582543373108, + "step": 414 + }, + { + "epoch": 1.2100273473108478, + "grad_norm": 1.1483631134033203, + "learning_rate": 8.152201367972275e-06, + "loss": 0.39163246750831604, + "step": 415 + }, + { + "epoch": 1.2129443938012763, + "grad_norm": 1.800149917602539, + "learning_rate": 8.102185111757323e-06, + "loss": 0.5055042505264282, + "step": 416 + }, + { + "epoch": 1.2158614402917047, + "grad_norm": 1.4394795894622803, + "learning_rate": 8.052218057493849e-06, + "loss": 0.4761751592159271, + "step": 417 + }, + { + "epoch": 1.2187784867821332, + "grad_norm": 1.622689962387085, + "learning_rate": 8.002301500606715e-06, + "loss": 0.4490141272544861, + "step": 418 + }, + { + "epoch": 1.2216955332725616, + "grad_norm": 1.2564961910247803, + "learning_rate": 7.952436735211593e-06, + "loss": 0.3964035212993622, + "step": 419 + }, + { + "epoch": 1.22461257976299, + "grad_norm": 1.3248411417007446, + "learning_rate": 7.902625054081449e-06, + "loss": 0.46039122343063354, + "step": 420 + }, + { + "epoch": 1.2275296262534183, + "grad_norm": 1.568983793258667, + "learning_rate": 7.852867748613e-06, + "loss": 0.49916595220565796, + "step": 421 + }, + { + "epoch": 1.2304466727438468, + "grad_norm": 1.4784491062164307, + "learning_rate": 7.803166108793243e-06, + "loss": 0.4035068154335022, + "step": 422 + }, + { + "epoch": 1.2333637192342752, + "grad_norm": 1.2940057516098022, + "learning_rate": 7.753521423166007e-06, + "loss": 0.4154140055179596, + "step": 423 + }, + { + "epoch": 1.2362807657247037, + "grad_norm": 1.167786717414856, + "learning_rate": 7.703934978798565e-06, + "loss": 0.39541637897491455, + "step": 424 + }, + { + "epoch": 1.2391978122151321, + "grad_norm": 1.5126771926879883, + "learning_rate": 7.65440806124823e-06, + "loss": 0.37744253873825073, + "step": 425 + }, + { + "epoch": 1.2421148587055606, + "grad_norm": 1.2595263719558716, + "learning_rate": 7.604941954529067e-06, + "loss": 0.46380615234375, + "step": 426 + }, + { + "epoch": 1.245031905195989, + "grad_norm": 1.4258298873901367, + "learning_rate": 7.555537941078573e-06, + "loss": 0.3391319513320923, + "step": 427 + }, + { + "epoch": 1.2479489516864175, + "grad_norm": 1.5371774435043335, + "learning_rate": 7.506197301724446e-06, + "loss": 0.39805102348327637, + "step": 428 + }, + { + "epoch": 1.250865998176846, + "grad_norm": 1.3789173364639282, + "learning_rate": 7.456921315651371e-06, + "loss": 0.37969034910202026, + "step": 429 + }, + { + "epoch": 1.2537830446672744, + "grad_norm": 1.32931649684906, + "learning_rate": 7.407711260367867e-06, + "loss": 0.3841526508331299, + "step": 430 + }, + { + "epoch": 1.2567000911577029, + "grad_norm": 1.2836817502975464, + "learning_rate": 7.358568411673145e-06, + "loss": 0.340289443731308, + "step": 431 + }, + { + "epoch": 1.2596171376481313, + "grad_norm": 1.0418318510055542, + "learning_rate": 7.309494043624059e-06, + "loss": 0.44747158885002136, + "step": 432 + }, + { + "epoch": 1.2625341841385598, + "grad_norm": 1.1769362688064575, + "learning_rate": 7.260489428502058e-06, + "loss": 0.45737382769584656, + "step": 433 + }, + { + "epoch": 1.265451230628988, + "grad_norm": 2.2730748653411865, + "learning_rate": 7.211555836780203e-06, + "loss": 0.3827931582927704, + "step": 434 + }, + { + "epoch": 1.2683682771194165, + "grad_norm": 1.263096809387207, + "learning_rate": 7.162694537090235e-06, + "loss": 0.3589435815811157, + "step": 435 + }, + { + "epoch": 1.271285323609845, + "grad_norm": 1.4073514938354492, + "learning_rate": 7.113906796189692e-06, + "loss": 0.45206642150878906, + "step": 436 + }, + { + "epoch": 1.2742023701002734, + "grad_norm": 1.064585566520691, + "learning_rate": 7.0651938789290306e-06, + "loss": 0.5409261584281921, + "step": 437 + }, + { + "epoch": 1.2771194165907018, + "grad_norm": 1.2346999645233154, + "learning_rate": 7.016557048218889e-06, + "loss": 0.40680158138275146, + "step": 438 + }, + { + "epoch": 1.2800364630811303, + "grad_norm": 1.5816547870635986, + "learning_rate": 6.967997564997306e-06, + "loss": 0.38718655705451965, + "step": 439 + }, + { + "epoch": 1.2829535095715587, + "grad_norm": 1.085268259048462, + "learning_rate": 6.919516688197041e-06, + "loss": 0.4863276779651642, + "step": 440 + }, + { + "epoch": 1.2858705560619872, + "grad_norm": 1.0984629392623901, + "learning_rate": 6.871115674712937e-06, + "loss": 0.39562875032424927, + "step": 441 + }, + { + "epoch": 1.2887876025524156, + "grad_norm": 1.3004229068756104, + "learning_rate": 6.822795779369339e-06, + "loss": 0.44437694549560547, + "step": 442 + }, + { + "epoch": 1.291704649042844, + "grad_norm": 1.3541183471679688, + "learning_rate": 6.774558254887553e-06, + "loss": 0.4728967249393463, + "step": 443 + }, + { + "epoch": 1.2946216955332726, + "grad_norm": 1.2485377788543701, + "learning_rate": 6.7264043518533695e-06, + "loss": 0.4052809476852417, + "step": 444 + }, + { + "epoch": 1.297538742023701, + "grad_norm": 1.412827730178833, + "learning_rate": 6.67833531868465e-06, + "loss": 0.40149861574172974, + "step": 445 + }, + { + "epoch": 1.3004557885141295, + "grad_norm": 1.5576224327087402, + "learning_rate": 6.630352401598953e-06, + "loss": 0.44107240438461304, + "step": 446 + }, + { + "epoch": 1.303372835004558, + "grad_norm": 1.1551047563552856, + "learning_rate": 6.582456844581226e-06, + "loss": 0.4898405969142914, + "step": 447 + }, + { + "epoch": 1.3062898814949864, + "grad_norm": 1.9939689636230469, + "learning_rate": 6.5346498893515645e-06, + "loss": 0.4791329801082611, + "step": 448 + }, + { + "epoch": 1.3092069279854148, + "grad_norm": 1.4782553911209106, + "learning_rate": 6.486932775333002e-06, + "loss": 0.472908616065979, + "step": 449 + }, + { + "epoch": 1.3121239744758433, + "grad_norm": 1.2496148347854614, + "learning_rate": 6.439306739619387e-06, + "loss": 0.514995276927948, + "step": 450 + }, + { + "epoch": 1.3121239744758433, + "eval_loss": 0.4178673028945923, + "eval_runtime": 1197.5534, + "eval_samples_per_second": 0.528, + "eval_steps_per_second": 0.528, + "step": 450 + }, + { + "epoch": 1.3150410209662717, + "grad_norm": 1.3996772766113281, + "learning_rate": 6.391773016943316e-06, + "loss": 0.4087896943092346, + "step": 451 + }, + { + "epoch": 1.3179580674567002, + "grad_norm": 1.20390784740448, + "learning_rate": 6.344332839644111e-06, + "loss": 0.43224579095840454, + "step": 452 + }, + { + "epoch": 1.3208751139471286, + "grad_norm": 1.2709496021270752, + "learning_rate": 6.296987437635876e-06, + "loss": 0.44104093313217163, + "step": 453 + }, + { + "epoch": 1.323792160437557, + "grad_norm": 1.0112334489822388, + "learning_rate": 6.249738038375618e-06, + "loss": 0.47084498405456543, + "step": 454 + }, + { + "epoch": 1.3267092069279856, + "grad_norm": 1.0771515369415283, + "learning_rate": 6.202585866831411e-06, + "loss": 0.4700928032398224, + "step": 455 + }, + { + "epoch": 1.3296262534184138, + "grad_norm": 1.4937143325805664, + "learning_rate": 6.15553214545064e-06, + "loss": 0.345747709274292, + "step": 456 + }, + { + "epoch": 1.3325432999088422, + "grad_norm": 1.1348456144332886, + "learning_rate": 6.108578094128321e-06, + "loss": 0.33824583888053894, + "step": 457 + }, + { + "epoch": 1.3354603463992707, + "grad_norm": 1.2502707242965698, + "learning_rate": 6.061724930175461e-06, + "loss": 0.3528832197189331, + "step": 458 + }, + { + "epoch": 1.3383773928896991, + "grad_norm": 1.5359619855880737, + "learning_rate": 6.014973868287504e-06, + "loss": 0.4413869082927704, + "step": 459 + }, + { + "epoch": 1.3412944393801276, + "grad_norm": 0.9747081398963928, + "learning_rate": 5.9683261205128395e-06, + "loss": 0.6849499940872192, + "step": 460 + }, + { + "epoch": 1.344211485870556, + "grad_norm": 1.3150533437728882, + "learning_rate": 5.921782896221383e-06, + "loss": 0.3901931047439575, + "step": 461 + }, + { + "epoch": 1.3471285323609845, + "grad_norm": 1.137770652770996, + "learning_rate": 5.875345402073207e-06, + "loss": 0.37498384714126587, + "step": 462 + }, + { + "epoch": 1.350045578851413, + "grad_norm": 1.2216367721557617, + "learning_rate": 5.829014841987277e-06, + "loss": 0.3874579966068268, + "step": 463 + }, + { + "epoch": 1.3529626253418414, + "grad_norm": 1.135439157485962, + "learning_rate": 5.782792417110233e-06, + "loss": 0.384797066450119, + "step": 464 + }, + { + "epoch": 1.3558796718322699, + "grad_norm": 1.2400696277618408, + "learning_rate": 5.736679325785239e-06, + "loss": 0.46303266286849976, + "step": 465 + }, + { + "epoch": 1.3587967183226983, + "grad_norm": 1.8848882913589478, + "learning_rate": 5.6906767635209304e-06, + "loss": 0.5068309903144836, + "step": 466 + }, + { + "epoch": 1.3617137648131268, + "grad_norm": 1.4707008600234985, + "learning_rate": 5.644785922960412e-06, + "loss": 0.364332914352417, + "step": 467 + }, + { + "epoch": 1.364630811303555, + "grad_norm": 2.4436841011047363, + "learning_rate": 5.599007993850329e-06, + "loss": 0.485107421875, + "step": 468 + }, + { + "epoch": 1.3675478577939835, + "grad_norm": 1.1924740076065063, + "learning_rate": 5.553344163010039e-06, + "loss": 0.34547489881515503, + "step": 469 + }, + { + "epoch": 1.370464904284412, + "grad_norm": 1.1255877017974854, + "learning_rate": 5.507795614300846e-06, + "loss": 0.39645254611968994, + "step": 470 + }, + { + "epoch": 1.3733819507748404, + "grad_norm": 1.0937018394470215, + "learning_rate": 5.4623635285952815e-06, + "loss": 0.4267856478691101, + "step": 471 + }, + { + "epoch": 1.3762989972652688, + "grad_norm": 1.3355520963668823, + "learning_rate": 5.417049083746513e-06, + "loss": 0.3669992983341217, + "step": 472 + }, + { + "epoch": 1.3792160437556973, + "grad_norm": 1.7302504777908325, + "learning_rate": 5.3718534545578035e-06, + "loss": 0.3873697519302368, + "step": 473 + }, + { + "epoch": 1.3821330902461257, + "grad_norm": 1.17263662815094, + "learning_rate": 5.326777812752041e-06, + "loss": 0.4581540524959564, + "step": 474 + }, + { + "epoch": 1.3850501367365542, + "grad_norm": 1.0998128652572632, + "learning_rate": 5.281823326941377e-06, + "loss": 0.43062761425971985, + "step": 475 + }, + { + "epoch": 1.3879671832269826, + "grad_norm": 1.1194556951522827, + "learning_rate": 5.236991162596932e-06, + "loss": 0.381741464138031, + "step": 476 + }, + { + "epoch": 1.390884229717411, + "grad_norm": 1.2759051322937012, + "learning_rate": 5.19228248201856e-06, + "loss": 0.49175748229026794, + "step": 477 + }, + { + "epoch": 1.3938012762078396, + "grad_norm": 1.2134747505187988, + "learning_rate": 5.147698444304732e-06, + "loss": 0.4997562766075134, + "step": 478 + }, + { + "epoch": 1.396718322698268, + "grad_norm": 1.0833078622817993, + "learning_rate": 5.1032402053224804e-06, + "loss": 0.42580488324165344, + "step": 479 + }, + { + "epoch": 1.3996353691886965, + "grad_norm": 1.4838510751724243, + "learning_rate": 5.058908917677426e-06, + "loss": 0.5015593767166138, + "step": 480 + }, + { + "epoch": 1.402552415679125, + "grad_norm": 1.218610167503357, + "learning_rate": 5.014705730683904e-06, + "loss": 0.34739193320274353, + "step": 481 + }, + { + "epoch": 1.4054694621695534, + "grad_norm": 1.1883307695388794, + "learning_rate": 4.970631790335181e-06, + "loss": 0.41708022356033325, + "step": 482 + }, + { + "epoch": 1.4083865086599818, + "grad_norm": 1.209291696548462, + "learning_rate": 4.926688239273713e-06, + "loss": 0.43546172976493835, + "step": 483 + }, + { + "epoch": 1.4113035551504103, + "grad_norm": 1.0801606178283691, + "learning_rate": 4.882876216761543e-06, + "loss": 0.44491735100746155, + "step": 484 + }, + { + "epoch": 1.4142206016408387, + "grad_norm": 1.2746628522872925, + "learning_rate": 4.839196858650763e-06, + "loss": 0.436122864484787, + "step": 485 + }, + { + "epoch": 1.4171376481312672, + "grad_norm": 1.4465962648391724, + "learning_rate": 4.795651297354056e-06, + "loss": 0.3750447630882263, + "step": 486 + }, + { + "epoch": 1.4200546946216956, + "grad_norm": 1.6736211776733398, + "learning_rate": 4.752240661815346e-06, + "loss": 0.38286519050598145, + "step": 487 + }, + { + "epoch": 1.422971741112124, + "grad_norm": 1.1946996450424194, + "learning_rate": 4.708966077480544e-06, + "loss": 0.4488063156604767, + "step": 488 + }, + { + "epoch": 1.4258887876025526, + "grad_norm": 1.42599356174469, + "learning_rate": 4.665828666268335e-06, + "loss": 0.44088613986968994, + "step": 489 + }, + { + "epoch": 1.4288058340929808, + "grad_norm": 1.2281016111373901, + "learning_rate": 4.622829546541121e-06, + "loss": 0.4030645489692688, + "step": 490 + }, + { + "epoch": 1.4317228805834092, + "grad_norm": 1.2875670194625854, + "learning_rate": 4.57996983307602e-06, + "loss": 0.44702020287513733, + "step": 491 + }, + { + "epoch": 1.4346399270738377, + "grad_norm": 1.2456860542297363, + "learning_rate": 4.537250637035947e-06, + "loss": 0.4067370593547821, + "step": 492 + }, + { + "epoch": 1.4375569735642661, + "grad_norm": 1.2822725772857666, + "learning_rate": 4.494673065940833e-06, + "loss": 0.4237740635871887, + "step": 493 + }, + { + "epoch": 1.4404740200546946, + "grad_norm": 1.5517818927764893, + "learning_rate": 4.452238223638906e-06, + "loss": 0.40579724311828613, + "step": 494 + }, + { + "epoch": 1.443391066545123, + "grad_norm": 1.275344967842102, + "learning_rate": 4.409947210278056e-06, + "loss": 0.38880717754364014, + "step": 495 + }, + { + "epoch": 1.4463081130355515, + "grad_norm": 1.22952139377594, + "learning_rate": 4.367801122277327e-06, + "loss": 0.4042310416698456, + "step": 496 + }, + { + "epoch": 1.44922515952598, + "grad_norm": 1.122261643409729, + "learning_rate": 4.325801052298493e-06, + "loss": 0.5408368110656738, + "step": 497 + }, + { + "epoch": 1.4521422060164084, + "grad_norm": 1.5885361433029175, + "learning_rate": 4.283948089217715e-06, + "loss": 0.37697717547416687, + "step": 498 + }, + { + "epoch": 1.4550592525068369, + "grad_norm": 2.3565149307250977, + "learning_rate": 4.242243318097338e-06, + "loss": 0.3811529576778412, + "step": 499 + }, + { + "epoch": 1.4579762989972653, + "grad_norm": 1.1944137811660767, + "learning_rate": 4.200687820157735e-06, + "loss": 0.414781391620636, + "step": 500 + }, + { + "epoch": 1.4579762989972653, + "eval_loss": 0.40706494450569153, + "eval_runtime": 1189.1593, + "eval_samples_per_second": 0.531, + "eval_steps_per_second": 0.531, + "step": 500 + }, + { + "epoch": 1.4608933454876938, + "grad_norm": 1.0442464351654053, + "learning_rate": 4.159282672749289e-06, + "loss": 0.38155990839004517, + "step": 501 + }, + { + "epoch": 1.463810391978122, + "grad_norm": 1.7274727821350098, + "learning_rate": 4.118028949324453e-06, + "loss": 0.4830601215362549, + "step": 502 + }, + { + "epoch": 1.4667274384685505, + "grad_norm": 2.064513921737671, + "learning_rate": 4.0769277194099345e-06, + "loss": 0.3975123167037964, + "step": 503 + }, + { + "epoch": 1.469644484958979, + "grad_norm": 1.7695534229278564, + "learning_rate": 4.035980048578942e-06, + "loss": 0.37033841013908386, + "step": 504 + }, + { + "epoch": 1.4725615314494074, + "grad_norm": 1.4455046653747559, + "learning_rate": 3.995186998423597e-06, + "loss": 0.39567673206329346, + "step": 505 + }, + { + "epoch": 1.4754785779398358, + "grad_norm": 1.1791958808898926, + "learning_rate": 3.9545496265273765e-06, + "loss": 0.44786664843559265, + "step": 506 + }, + { + "epoch": 1.4783956244302643, + "grad_norm": 2.0874717235565186, + "learning_rate": 3.9140689864377105e-06, + "loss": 0.3333263099193573, + "step": 507 + }, + { + "epoch": 1.4813126709206927, + "grad_norm": 1.5897501707077026, + "learning_rate": 3.873746127638668e-06, + "loss": 0.5105943083763123, + "step": 508 + }, + { + "epoch": 1.4842297174111212, + "grad_norm": 1.5059760808944702, + "learning_rate": 3.833582095523749e-06, + "loss": 0.43922683596611023, + "step": 509 + }, + { + "epoch": 1.4871467639015497, + "grad_norm": 1.379347562789917, + "learning_rate": 3.7935779313687648e-06, + "loss": 0.4584790766239166, + "step": 510 + }, + { + "epoch": 1.490063810391978, + "grad_norm": 1.0984690189361572, + "learning_rate": 3.7537346723048816e-06, + "loss": 0.5217512249946594, + "step": 511 + }, + { + "epoch": 1.4929808568824066, + "grad_norm": 1.5944225788116455, + "learning_rate": 3.71405335129169e-06, + "loss": 0.4180052876472473, + "step": 512 + }, + { + "epoch": 1.495897903372835, + "grad_norm": 1.2745033502578735, + "learning_rate": 3.6745349970904465e-06, + "loss": 0.4584833085536957, + "step": 513 + }, + { + "epoch": 1.4988149498632635, + "grad_norm": 1.2746814489364624, + "learning_rate": 3.6351806342374007e-06, + "loss": 0.3202287554740906, + "step": 514 + }, + { + "epoch": 1.501731996353692, + "grad_norm": 1.409638524055481, + "learning_rate": 3.5959912830172348e-06, + "loss": 0.37963351607322693, + "step": 515 + }, + { + "epoch": 1.5046490428441204, + "grad_norm": 1.1655553579330444, + "learning_rate": 3.556967959436591e-06, + "loss": 0.43133026361465454, + "step": 516 + }, + { + "epoch": 1.5075660893345488, + "grad_norm": 1.0495020151138306, + "learning_rate": 3.518111675197776e-06, + "loss": 0.3739299178123474, + "step": 517 + }, + { + "epoch": 1.5104831358249773, + "grad_norm": 1.3055057525634766, + "learning_rate": 3.4794234376724835e-06, + "loss": 0.4099601209163666, + "step": 518 + }, + { + "epoch": 1.5134001823154057, + "grad_norm": 1.2252463102340698, + "learning_rate": 3.4409042498757084e-06, + "loss": 0.380616158246994, + "step": 519 + }, + { + "epoch": 1.5163172288058342, + "grad_norm": 1.2728638648986816, + "learning_rate": 3.4025551104397294e-06, + "loss": 0.3510003685951233, + "step": 520 + }, + { + "epoch": 1.5192342752962626, + "grad_norm": 2.70664644241333, + "learning_rate": 3.3643770135882282e-06, + "loss": 0.4087940752506256, + "step": 521 + }, + { + "epoch": 1.522151321786691, + "grad_norm": 1.6197112798690796, + "learning_rate": 3.3263709491104933e-06, + "loss": 0.45614126324653625, + "step": 522 + }, + { + "epoch": 1.5250683682771196, + "grad_norm": 1.3596103191375732, + "learning_rate": 3.2885379023357956e-06, + "loss": 0.3824586272239685, + "step": 523 + }, + { + "epoch": 1.527985414767548, + "grad_norm": 1.1768635511398315, + "learning_rate": 3.2508788541078097e-06, + "loss": 0.47717779874801636, + "step": 524 + }, + { + "epoch": 1.5309024612579762, + "grad_norm": 1.669474482536316, + "learning_rate": 3.2133947807591958e-06, + "loss": 0.4013281762599945, + "step": 525 + }, + { + "epoch": 1.5338195077484047, + "grad_norm": 1.600868582725525, + "learning_rate": 3.1760866540862932e-06, + "loss": 0.367280513048172, + "step": 526 + }, + { + "epoch": 1.5367365542388332, + "grad_norm": 1.1689515113830566, + "learning_rate": 3.138955441323923e-06, + "loss": 0.4432409405708313, + "step": 527 + }, + { + "epoch": 1.5396536007292616, + "grad_norm": 2.361961603164673, + "learning_rate": 3.1020021051202973e-06, + "loss": 0.4219942092895508, + "step": 528 + }, + { + "epoch": 1.54257064721969, + "grad_norm": 1.1962230205535889, + "learning_rate": 3.0652276035120964e-06, + "loss": 0.3672596514225006, + "step": 529 + }, + { + "epoch": 1.5454876937101185, + "grad_norm": 1.4149441719055176, + "learning_rate": 3.0286328898995963e-06, + "loss": 0.42919260263442993, + "step": 530 + }, + { + "epoch": 1.548404740200547, + "grad_norm": 1.2668434381484985, + "learning_rate": 2.992218913021966e-06, + "loss": 0.4499061107635498, + "step": 531 + }, + { + "epoch": 1.5513217866909754, + "grad_norm": 1.268114686012268, + "learning_rate": 2.9559866169326734e-06, + "loss": 0.34660714864730835, + "step": 532 + }, + { + "epoch": 1.5542388331814039, + "grad_norm": 1.0086419582366943, + "learning_rate": 2.919936940975007e-06, + "loss": 0.38239023089408875, + "step": 533 + }, + { + "epoch": 1.557155879671832, + "grad_norm": 1.0700170993804932, + "learning_rate": 2.884070819757712e-06, + "loss": 0.48240017890930176, + "step": 534 + }, + { + "epoch": 1.5600729261622606, + "grad_norm": 1.2101227045059204, + "learning_rate": 2.8483891831307873e-06, + "loss": 0.4098761975765228, + "step": 535 + }, + { + "epoch": 1.562989972652689, + "grad_norm": 1.2731400728225708, + "learning_rate": 2.8128929561613505e-06, + "loss": 0.45641395449638367, + "step": 536 + }, + { + "epoch": 1.5659070191431175, + "grad_norm": 1.1474392414093018, + "learning_rate": 2.777583059109671e-06, + "loss": 0.42283985018730164, + "step": 537 + }, + { + "epoch": 1.568824065633546, + "grad_norm": 1.789881944656372, + "learning_rate": 2.7424604074053028e-06, + "loss": 0.3469158113002777, + "step": 538 + }, + { + "epoch": 1.5717411121239744, + "grad_norm": 1.3426933288574219, + "learning_rate": 2.707525911623362e-06, + "loss": 0.35837510228157043, + "step": 539 + }, + { + "epoch": 1.5746581586144028, + "grad_norm": 1.2343578338623047, + "learning_rate": 2.672780477460901e-06, + "loss": 0.4736083745956421, + "step": 540 + }, + { + "epoch": 1.5775752051048313, + "grad_norm": 1.516298770904541, + "learning_rate": 2.638225005713457e-06, + "loss": 0.34345340728759766, + "step": 541 + }, + { + "epoch": 1.5804922515952597, + "grad_norm": 1.1488829851150513, + "learning_rate": 2.6038603922516705e-06, + "loss": 0.4134179949760437, + "step": 542 + }, + { + "epoch": 1.5834092980856882, + "grad_norm": 1.4486491680145264, + "learning_rate": 2.569687527998073e-06, + "loss": 0.3297592103481293, + "step": 543 + }, + { + "epoch": 1.5863263445761167, + "grad_norm": 1.272691011428833, + "learning_rate": 2.5357072989039855e-06, + "loss": 0.3958476185798645, + "step": 544 + }, + { + "epoch": 1.589243391066545, + "grad_norm": 1.244240641593933, + "learning_rate": 2.501920585926555e-06, + "loss": 0.4125611186027527, + "step": 545 + }, + { + "epoch": 1.5921604375569736, + "grad_norm": 1.5844073295593262, + "learning_rate": 2.4683282650058992e-06, + "loss": 0.3762253224849701, + "step": 546 + }, + { + "epoch": 1.595077484047402, + "grad_norm": 1.8209946155548096, + "learning_rate": 2.4349312070424258e-06, + "loss": 0.37053319811820984, + "step": 547 + }, + { + "epoch": 1.5979945305378305, + "grad_norm": 1.3752915859222412, + "learning_rate": 2.4017302778742247e-06, + "loss": 0.5004774332046509, + "step": 548 + }, + { + "epoch": 1.600911577028259, + "grad_norm": 5.143753528594971, + "learning_rate": 2.36872633825464e-06, + "loss": 0.39014023542404175, + "step": 549 + }, + { + "epoch": 1.6038286235186874, + "grad_norm": 1.0730944871902466, + "learning_rate": 2.335920243829941e-06, + "loss": 0.378440260887146, + "step": 550 + }, + { + "epoch": 1.6038286235186874, + "eval_loss": 0.40037089586257935, + "eval_runtime": 893.7411, + "eval_samples_per_second": 0.707, + "eval_steps_per_second": 0.707, + "step": 550 + }, + { + "epoch": 1.6067456700091158, + "grad_norm": 1.5507797002792358, + "learning_rate": 2.3033128451171548e-06, + "loss": 0.4471960663795471, + "step": 551 + }, + { + "epoch": 1.6096627164995443, + "grad_norm": 1.9462968111038208, + "learning_rate": 2.2709049874819924e-06, + "loss": 0.3658301830291748, + "step": 552 + }, + { + "epoch": 1.6125797629899727, + "grad_norm": 1.2034238576889038, + "learning_rate": 2.238697511116962e-06, + "loss": 0.3911179304122925, + "step": 553 + }, + { + "epoch": 1.6154968094804012, + "grad_norm": 1.3574327230453491, + "learning_rate": 2.2066912510195636e-06, + "loss": 0.3998897671699524, + "step": 554 + }, + { + "epoch": 1.6184138559708297, + "grad_norm": 1.1973012685775757, + "learning_rate": 2.1748870369706507e-06, + "loss": 0.38577449321746826, + "step": 555 + }, + { + "epoch": 1.621330902461258, + "grad_norm": 1.9365874528884888, + "learning_rate": 2.1432856935129144e-06, + "loss": 0.411307156085968, + "step": 556 + }, + { + "epoch": 1.6242479489516866, + "grad_norm": 1.3558642864227295, + "learning_rate": 2.1118880399295106e-06, + "loss": 0.38424253463745117, + "step": 557 + }, + { + "epoch": 1.627164995442115, + "grad_norm": 1.4368890523910522, + "learning_rate": 2.0806948902228075e-06, + "loss": 0.39943546056747437, + "step": 558 + }, + { + "epoch": 1.6300820419325432, + "grad_norm": 1.6266753673553467, + "learning_rate": 2.0497070530933084e-06, + "loss": 0.36787641048431396, + "step": 559 + }, + { + "epoch": 1.6329990884229717, + "grad_norm": 1.2600938081741333, + "learning_rate": 2.0189253319186576e-06, + "loss": 0.3781934380531311, + "step": 560 + }, + { + "epoch": 1.6359161349134002, + "grad_norm": 1.975071907043457, + "learning_rate": 1.9883505247328237e-06, + "loss": 0.4132305383682251, + "step": 561 + }, + { + "epoch": 1.6388331814038286, + "grad_norm": 1.4095909595489502, + "learning_rate": 1.9579834242054154e-06, + "loss": 0.3727574646472931, + "step": 562 + }, + { + "epoch": 1.641750227894257, + "grad_norm": 1.4271371364593506, + "learning_rate": 1.9278248176211243e-06, + "loss": 0.33786773681640625, + "step": 563 + }, + { + "epoch": 1.6446672743846855, + "grad_norm": 1.5907646417617798, + "learning_rate": 1.8978754868593074e-06, + "loss": 0.33035099506378174, + "step": 564 + }, + { + "epoch": 1.647584320875114, + "grad_norm": 1.1315702199935913, + "learning_rate": 1.8681362083737387e-06, + "loss": 0.41707149147987366, + "step": 565 + }, + { + "epoch": 1.6505013673655424, + "grad_norm": 1.4737143516540527, + "learning_rate": 1.8386077531724556e-06, + "loss": 0.43079230189323425, + "step": 566 + }, + { + "epoch": 1.6534184138559709, + "grad_norm": 1.1006760597229004, + "learning_rate": 1.8092908867977822e-06, + "loss": 0.3524904251098633, + "step": 567 + }, + { + "epoch": 1.6563354603463991, + "grad_norm": 1.4066118001937866, + "learning_rate": 1.780186369306479e-06, + "loss": 0.3695681691169739, + "step": 568 + }, + { + "epoch": 1.6592525068368276, + "grad_norm": 1.6444640159606934, + "learning_rate": 1.7512949552500412e-06, + "loss": 0.35596007108688354, + "step": 569 + }, + { + "epoch": 1.662169553327256, + "grad_norm": 1.159480094909668, + "learning_rate": 1.7226173936551282e-06, + "loss": 0.4520571827888489, + "step": 570 + }, + { + "epoch": 1.6650865998176845, + "grad_norm": 1.5874221324920654, + "learning_rate": 1.6941544280041567e-06, + "loss": 0.4702282249927521, + "step": 571 + }, + { + "epoch": 1.668003646308113, + "grad_norm": 1.6153535842895508, + "learning_rate": 1.6659067962160157e-06, + "loss": 0.3803800046443939, + "step": 572 + }, + { + "epoch": 1.6709206927985414, + "grad_norm": 1.0748940706253052, + "learning_rate": 1.6378752306269386e-06, + "loss": 0.4368419051170349, + "step": 573 + }, + { + "epoch": 1.6738377392889698, + "grad_norm": 1.5286788940429688, + "learning_rate": 1.6100604579715185e-06, + "loss": 0.4195623993873596, + "step": 574 + }, + { + "epoch": 1.6767547857793983, + "grad_norm": 1.1433510780334473, + "learning_rate": 1.5824631993638651e-06, + "loss": 0.4366849660873413, + "step": 575 + }, + { + "epoch": 1.6796718322698267, + "grad_norm": 1.9694907665252686, + "learning_rate": 1.5550841702789122e-06, + "loss": 0.5555303692817688, + "step": 576 + }, + { + "epoch": 1.6825888787602552, + "grad_norm": 1.7587188482284546, + "learning_rate": 1.5279240805338647e-06, + "loss": 0.40394848585128784, + "step": 577 + }, + { + "epoch": 1.6855059252506837, + "grad_norm": 1.063381314277649, + "learning_rate": 1.5009836342697993e-06, + "loss": 0.49564215540885925, + "step": 578 + }, + { + "epoch": 1.688422971741112, + "grad_norm": 1.1742531061172485, + "learning_rate": 1.4742635299334063e-06, + "loss": 0.3891904950141907, + "step": 579 + }, + { + "epoch": 1.6913400182315406, + "grad_norm": 1.499934196472168, + "learning_rate": 1.4477644602588848e-06, + "loss": 0.35497623682022095, + "step": 580 + }, + { + "epoch": 1.694257064721969, + "grad_norm": 1.5112360715866089, + "learning_rate": 1.421487112249984e-06, + "loss": 0.4062272012233734, + "step": 581 + }, + { + "epoch": 1.6971741112123975, + "grad_norm": 1.3583141565322876, + "learning_rate": 1.3954321671621885e-06, + "loss": 0.3655265271663666, + "step": 582 + }, + { + "epoch": 1.700091157702826, + "grad_norm": 2.8181653022766113, + "learning_rate": 1.3696003004850577e-06, + "loss": 0.37418332695961, + "step": 583 + }, + { + "epoch": 1.7030082041932544, + "grad_norm": 0.967166543006897, + "learning_rate": 1.3439921819247138e-06, + "loss": 0.4946930408477783, + "step": 584 + }, + { + "epoch": 1.7059252506836828, + "grad_norm": 1.2773699760437012, + "learning_rate": 1.3186084753864813e-06, + "loss": 0.5101871490478516, + "step": 585 + }, + { + "epoch": 1.7088422971741113, + "grad_norm": 1.2814991474151611, + "learning_rate": 1.293449838957671e-06, + "loss": 0.3688133656978607, + "step": 586 + }, + { + "epoch": 1.7117593436645397, + "grad_norm": 1.594966173171997, + "learning_rate": 1.2685169248905228e-06, + "loss": 0.4739398956298828, + "step": 587 + }, + { + "epoch": 1.7146763901549682, + "grad_norm": 1.1471531391143799, + "learning_rate": 1.2438103795852885e-06, + "loss": 0.3719588816165924, + "step": 588 + }, + { + "epoch": 1.7175934366453967, + "grad_norm": 1.1657356023788452, + "learning_rate": 1.2193308435734852e-06, + "loss": 0.4119298458099365, + "step": 589 + }, + { + "epoch": 1.720510483135825, + "grad_norm": 1.1239042282104492, + "learning_rate": 1.1950789515012783e-06, + "loss": 0.38277503848075867, + "step": 590 + }, + { + "epoch": 1.7234275296262536, + "grad_norm": 1.149478554725647, + "learning_rate": 1.1710553321130324e-06, + "loss": 0.35080626606941223, + "step": 591 + }, + { + "epoch": 1.726344576116682, + "grad_norm": 1.2020260095596313, + "learning_rate": 1.1472606082350112e-06, + "loss": 0.3991318345069885, + "step": 592 + }, + { + "epoch": 1.7292616226071102, + "grad_norm": 1.101475477218628, + "learning_rate": 1.123695396759229e-06, + "loss": 0.45791420340538025, + "step": 593 + }, + { + "epoch": 1.7321786690975387, + "grad_norm": 0.9617101550102234, + "learning_rate": 1.1003603086274584e-06, + "loss": 0.39805036783218384, + "step": 594 + }, + { + "epoch": 1.7350957155879672, + "grad_norm": 1.1439731121063232, + "learning_rate": 1.07725594881539e-06, + "loss": 0.35753339529037476, + "step": 595 + }, + { + "epoch": 1.7380127620783956, + "grad_norm": 1.0350618362426758, + "learning_rate": 1.0543829163169516e-06, + "loss": 0.42581748962402344, + "step": 596 + }, + { + "epoch": 1.740929808568824, + "grad_norm": 1.2865227460861206, + "learning_rate": 1.031741804128773e-06, + "loss": 0.34685325622558594, + "step": 597 + }, + { + "epoch": 1.7438468550592525, + "grad_norm": 1.2079373598098755, + "learning_rate": 1.0093331992348154e-06, + "loss": 0.48401936888694763, + "step": 598 + }, + { + "epoch": 1.746763901549681, + "grad_norm": 1.1684436798095703, + "learning_rate": 9.871576825911577e-07, + "loss": 0.387456476688385, + "step": 599 + }, + { + "epoch": 1.7496809480401094, + "grad_norm": 1.298045039176941, + "learning_rate": 9.65215829110927e-07, + "loss": 0.40196847915649414, + "step": 600 + }, + { + "epoch": 1.7496809480401094, + "eval_loss": 0.3965963125228882, + "eval_runtime": 912.3102, + "eval_samples_per_second": 0.693, + "eval_steps_per_second": 0.693, + "step": 600 + } + ], + "logging_steps": 1, + "max_steps": 686, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.526456570419872e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_devstral_24B/checkpoints/checkpoint-600/training_args.bin b/cpt_devstral_24B/checkpoints/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..48a487f18680e3e5b768fe7ec9ec04e8778fc21e --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f62526ec2433add7ac031c48b1f6ff360f1ade77275765112cbf7cf361d64ca5 +size 5201 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/README.md b/cpt_devstral_24B/checkpoints/checkpoint-686/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f04c3de935db4cae3da32ab6d1fcbbea11b4e09 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Devstral-Small-2-24B-Instruct-2512 +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Devstral-Small-2-24B-Instruct-2512 +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/adapter_config.json b/cpt_devstral_24B/checkpoints/checkpoint-686/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a10b9f1b7bb62dced9a7c13375c7ebbeb347c15b --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Devstral-Small-2-24B-Instruct-2512", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/adapter_model.safetensors b/cpt_devstral_24B/checkpoints/checkpoint-686/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92e4e56f58a7eff600d77ca7cddf998a2eb2a801 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03251a15616d79fc9469a2a39740e675ba2e6c64b84f82511ff46a4917a1a103 +size 364983848 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/chat_template.jinja b/cpt_devstral_24B/checkpoints/checkpoint-686/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01c8776b5b3496af72e92a53a3bf92e113f66f2c --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/chat_template.jinja @@ -0,0 +1,121 @@ +{#- Default system message if no system prompt is passed. #} +{%- set default_system_message = '' %} + +{#- Begin of sequence token. #} +{{- bos_token }} + +{#- Handle system prompt if it exists. #} +{#- System prompt supports text content or text chunks. #} +{%- if messages[0]['role'] == 'system' %} + {{- '[SYSTEM_PROMPT]' -}} + {%- if messages[0]['content'] is string %} + {{- messages[0]['content'] -}} + {%- else %} + {%- for block in messages[0]['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in system message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '[/SYSTEM_PROMPT]' -}} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} + {%- if default_system_message != '' %} + {{- '[SYSTEM_PROMPT]' + default_system_message + '[/SYSTEM_PROMPT]' }} + {%- endif %} +{%- endif %} + + +{#- Tools definition #} +{%- set tools_definition = '' %} +{%- set has_tools = false %} +{%- if tools is defined and tools is not none and tools|length > 0 %} + {%- set has_tools = true %} + {%- set tools_definition = '[AVAILABLE_TOOLS]' + (tools| tojson) + '[/AVAILABLE_TOOLS]' %} + {{- tools_definition }} +{%- endif %} + +{#- Checks for alternating user/assistant messages. #} +{%- set ns = namespace(index=0) %} +{%- for message in loop_messages %} + {%- if message.role == 'user' or (message.role == 'assistant' and (message.tool_calls is not defined or message.tool_calls is none or message.tool_calls | length == 0)) %} + {%- if (message['role'] == 'user') != (ns.index % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user and assistant roles except for tool calls and results.') }} + {%- endif %} + {%- set ns.index = ns.index + 1 %} + {%- endif %} +{%- endfor %} + +{#- Handle conversation messages. #} +{%- for message in loop_messages %} + + {#- User messages supports text content or text and image chunks. #} + {%- if message['role'] == 'user' %} + {%- if message['content'] is string %} + {{- '[INST]' + message['content'] + '[/INST]' }} + {%- elif message['content'] | length > 0 %} + {{- '[INST]' }} + {%- if message['content'] | length == 2 %} + {%- set blocks = message['content'] | sort(attribute='type') %} + {%- else %} + {%- set blocks = message['content'] %} + {%- endif %} + {%- for block in blocks %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text, image and image_url chunks are supported in user message content.') }} + {%- endif %} + {%- endfor %} + {{- '[/INST]' }} + {%- else %} + {{- raise_exception('User message must have a string or a list of chunks in content') }} + {%- endif %} + + {#- Assistant messages supports text content or text and image chunks. #} + {%- elif message['role'] == 'assistant' %} + {%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %} + {{- raise_exception('Assistant message must have a string or a list of chunks in content or a list of tool calls.') }} + {%- endif %} + + {%- if message['content'] is string %} + {{- message['content'] }} + {%- elif message['content'] | length > 0 %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- else %} + {{- raise_exception('Only text chunks are supported in assistant message contents.') }} + {%- endif %} + {%- endfor %} + {%- endif %} + + {%- if message['tool_calls'] is defined and message['tool_calls'] is not none and message['tool_calls']|length > 0 %} + {%- for tool in message['tool_calls'] %} + {%- set arguments = tool['function']['arguments'] %} + {%- if arguments is not string %} + {%- set arguments = arguments|tojson|safe %} + {%- elif arguments == '' %} + {%- set arguments = '{}' %} + {%- endif %} + {{- '[TOOL_CALLS]' + tool['function']['name'] + '[ARGS]' + arguments }} + {%- endfor %} + {%- endif %} + + {#- End of sequence token for each assistant messages. #} + {{- eos_token }} + + {#- Tool messages only supports text content. #} + {%- elif message['role'] == 'tool' %} + {{- '[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]' }} + + {#- Raise exception for unsupported roles. #} + {%- else %} + {{- raise_exception('Only user, assistant and tool roles are supported, got ' + message['role'] + '.') }} + {%- endif %} +{%- endfor %} \ No newline at end of file diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/optimizer.pt b/cpt_devstral_24B/checkpoints/checkpoint-686/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd9b4de30adaa7fca0c6e911accb751b92791998 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03f935371d01b010e5f58a9120e2561936056ae567db9fe04fb52a1b63061363 +size 160131559 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/rng_state.pth b/cpt_devstral_24B/checkpoints/checkpoint-686/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b135527c30a97861a0eaa050f1df8d9ed675830 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e706874d7a72e865503ef7a4aaf06a0ded1324badabfab0b0223627edcf671a8 +size 14645 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/scheduler.pt b/cpt_devstral_24B/checkpoints/checkpoint-686/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae0b68f1cc9c8d19c633c0148f998c5d1d3c0903 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac75df5c8a3fb8b97098c85f14d52dc911b665df45efe2b05fbc8192aba4e49f +size 1465 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/tokenizer.json b/cpt_devstral_24B/checkpoints/checkpoint-686/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5b51e255641d3ab81f891f54bd61370fcedf6622 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:286acad9b0e27fce778ac429763536accf618ccb6ed72963b6f94685e531c5c7 +size 17077402 diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/tokenizer_config.json b/cpt_devstral_24B/checkpoints/checkpoint-686/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1bb6b32cec8ab9654d2c84faeb9a332373476017 --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/tokenizer_config.json @@ -0,0 +1,1013 @@ +{ + "backend": "tokenizers", + "bos_token": "", + "eos_token": "", + "extra_special_tokens": [ + "", + "", + "", + "[INST]", + "[/INST]", + "[AVAILABLE_TOOLS]", + "[/AVAILABLE_TOOLS]", + "[TOOL_RESULTS]", + "[/TOOL_RESULTS]", + "[TOOL_CALLS]", + "[IMG]", + "", + "[IMG_BREAK]", + "[IMG_END]", + "[PREFIX]", + "[MIDDLE]", + "[SUFFIX]", + "[SYSTEM_PROMPT]", + "[/SYSTEM_PROMPT]", + "[TOOL_CONTENT]", + "", + "", + "", + "", + "[AUDIO]", + "[BEGIN_AUDIO]", + "", + "", + "", + "", + "", + "", + "[ARGS]", + "[CALL_ID]", + "[THINK]", + "[/THINK]", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "is_local": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "PixtralProcessor", + "tokenizer_class": "TokenizersBackend", + "unk_token": "" +} diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/trainer_state.json b/cpt_devstral_24B/checkpoints/checkpoint-686/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c5826b66c4042cd22c496f6a809f36abae023fef --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/trainer_state.json @@ -0,0 +1,4940 @@ +{ + "best_global_step": 650, + "best_metric": 0.3949255049228668, + "best_model_checkpoint": "runs/cpt_run_v1/checkpoints/checkpoint-600", + "epoch": 2.0, + "eval_steps": 50, + "global_step": 686, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0029170464904284413, + "grad_norm": 1.1577509641647339, + "learning_rate": 0.0, + "loss": 0.9893555045127869, + "step": 1 + }, + { + "epoch": 0.005834092980856883, + "grad_norm": 0.9491796493530273, + "learning_rate": 2.8985507246376816e-07, + "loss": 0.8791205883026123, + "step": 2 + }, + { + "epoch": 0.008751139471285323, + "grad_norm": 1.1600768566131592, + "learning_rate": 5.797101449275363e-07, + "loss": 0.9858248233795166, + "step": 3 + }, + { + "epoch": 0.011668185961713765, + "grad_norm": 1.2298306226730347, + "learning_rate": 8.695652173913044e-07, + "loss": 1.0516364574432373, + "step": 4 + }, + { + "epoch": 0.014585232452142206, + "grad_norm": 0.9520533680915833, + "learning_rate": 1.1594202898550726e-06, + "loss": 0.8392249345779419, + "step": 5 + }, + { + "epoch": 0.017502278942570646, + "grad_norm": 1.2451188564300537, + "learning_rate": 1.4492753623188408e-06, + "loss": 1.0955077409744263, + "step": 6 + }, + { + "epoch": 0.02041932543299909, + "grad_norm": 1.1123991012573242, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.9201866388320923, + "step": 7 + }, + { + "epoch": 0.02333637192342753, + "grad_norm": 0.9283139705657959, + "learning_rate": 2.028985507246377e-06, + "loss": 0.9770950078964233, + "step": 8 + }, + { + "epoch": 0.02625341841385597, + "grad_norm": 0.9589216113090515, + "learning_rate": 2.3188405797101453e-06, + "loss": 0.9442565441131592, + "step": 9 + }, + { + "epoch": 0.02917046490428441, + "grad_norm": 0.8866703510284424, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.9354464411735535, + "step": 10 + }, + { + "epoch": 0.03208751139471285, + "grad_norm": 0.7191241383552551, + "learning_rate": 2.8985507246376816e-06, + "loss": 0.7659736275672913, + "step": 11 + }, + { + "epoch": 0.03500455788514129, + "grad_norm": 0.9110142588615417, + "learning_rate": 3.188405797101449e-06, + "loss": 0.9319326877593994, + "step": 12 + }, + { + "epoch": 0.03792160437556973, + "grad_norm": 0.8754057288169861, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.9819356203079224, + "step": 13 + }, + { + "epoch": 0.04083865086599818, + "grad_norm": 0.896181046962738, + "learning_rate": 3.768115942028986e-06, + "loss": 1.026316523551941, + "step": 14 + }, + { + "epoch": 0.04375569735642662, + "grad_norm": 0.6104832887649536, + "learning_rate": 4.057971014492754e-06, + "loss": 0.8427562713623047, + "step": 15 + }, + { + "epoch": 0.04667274384685506, + "grad_norm": 0.6529208421707153, + "learning_rate": 4.347826086956522e-06, + "loss": 0.8496565222740173, + "step": 16 + }, + { + "epoch": 0.0495897903372835, + "grad_norm": 0.6319335699081421, + "learning_rate": 4.637681159420291e-06, + "loss": 0.9139047861099243, + "step": 17 + }, + { + "epoch": 0.05250683682771194, + "grad_norm": 0.7458649277687073, + "learning_rate": 4.927536231884059e-06, + "loss": 0.8867442011833191, + "step": 18 + }, + { + "epoch": 0.05542388331814038, + "grad_norm": 0.6179773211479187, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.9579408168792725, + "step": 19 + }, + { + "epoch": 0.05834092980856882, + "grad_norm": 0.794481635093689, + "learning_rate": 5.507246376811595e-06, + "loss": 0.8736554980278015, + "step": 20 + }, + { + "epoch": 0.06125797629899726, + "grad_norm": 0.8356145620346069, + "learning_rate": 5.797101449275363e-06, + "loss": 0.9358762502670288, + "step": 21 + }, + { + "epoch": 0.0641750227894257, + "grad_norm": 0.5891932845115662, + "learning_rate": 6.086956521739132e-06, + "loss": 0.8972038626670837, + "step": 22 + }, + { + "epoch": 0.06709206927985414, + "grad_norm": 0.6931268572807312, + "learning_rate": 6.376811594202898e-06, + "loss": 0.9583507776260376, + "step": 23 + }, + { + "epoch": 0.07000911577028258, + "grad_norm": 0.7298229336738586, + "learning_rate": 6.666666666666667e-06, + "loss": 0.8119489550590515, + "step": 24 + }, + { + "epoch": 0.07292616226071102, + "grad_norm": 0.6419956684112549, + "learning_rate": 6.956521739130435e-06, + "loss": 0.9386100769042969, + "step": 25 + }, + { + "epoch": 0.07584320875113947, + "grad_norm": 0.7508338689804077, + "learning_rate": 7.246376811594203e-06, + "loss": 0.9272583723068237, + "step": 26 + }, + { + "epoch": 0.0787602552415679, + "grad_norm": 0.5848079919815063, + "learning_rate": 7.536231884057972e-06, + "loss": 0.8967856168746948, + "step": 27 + }, + { + "epoch": 0.08167730173199636, + "grad_norm": 0.7384837865829468, + "learning_rate": 7.82608695652174e-06, + "loss": 0.8696568012237549, + "step": 28 + }, + { + "epoch": 0.0845943482224248, + "grad_norm": 0.5069604516029358, + "learning_rate": 8.115942028985508e-06, + "loss": 0.9121193885803223, + "step": 29 + }, + { + "epoch": 0.08751139471285324, + "grad_norm": 0.833165168762207, + "learning_rate": 8.405797101449275e-06, + "loss": 0.8180589079856873, + "step": 30 + }, + { + "epoch": 0.09042844120328168, + "grad_norm": 0.6355920433998108, + "learning_rate": 8.695652173913044e-06, + "loss": 0.8640957474708557, + "step": 31 + }, + { + "epoch": 0.09334548769371012, + "grad_norm": 1.0429315567016602, + "learning_rate": 8.985507246376812e-06, + "loss": 0.9517915844917297, + "step": 32 + }, + { + "epoch": 0.09626253418413856, + "grad_norm": 0.5875154733657837, + "learning_rate": 9.275362318840581e-06, + "loss": 0.9443603754043579, + "step": 33 + }, + { + "epoch": 0.099179580674567, + "grad_norm": 1.9913769960403442, + "learning_rate": 9.565217391304349e-06, + "loss": 0.9510866403579712, + "step": 34 + }, + { + "epoch": 0.10209662716499544, + "grad_norm": 0.5310097932815552, + "learning_rate": 9.855072463768118e-06, + "loss": 0.8653419613838196, + "step": 35 + }, + { + "epoch": 0.10501367365542388, + "grad_norm": 0.624421238899231, + "learning_rate": 1.0144927536231885e-05, + "loss": 0.7941208481788635, + "step": 36 + }, + { + "epoch": 0.10793072014585232, + "grad_norm": 0.6314200758934021, + "learning_rate": 1.0434782608695653e-05, + "loss": 0.8931174278259277, + "step": 37 + }, + { + "epoch": 0.11084776663628076, + "grad_norm": 0.6272342205047607, + "learning_rate": 1.0724637681159422e-05, + "loss": 0.8978185057640076, + "step": 38 + }, + { + "epoch": 0.1137648131267092, + "grad_norm": 0.5711184740066528, + "learning_rate": 1.101449275362319e-05, + "loss": 0.808263897895813, + "step": 39 + }, + { + "epoch": 0.11668185961713765, + "grad_norm": 0.7581208944320679, + "learning_rate": 1.1304347826086957e-05, + "loss": 0.7456756830215454, + "step": 40 + }, + { + "epoch": 0.11959890610756609, + "grad_norm": 0.4989977180957794, + "learning_rate": 1.1594202898550726e-05, + "loss": 0.8273333311080933, + "step": 41 + }, + { + "epoch": 0.12251595259799453, + "grad_norm": 0.8602972626686096, + "learning_rate": 1.1884057971014494e-05, + "loss": 0.8514784574508667, + "step": 42 + }, + { + "epoch": 0.12543299908842298, + "grad_norm": 0.6918581128120422, + "learning_rate": 1.2173913043478263e-05, + "loss": 0.8182265162467957, + "step": 43 + }, + { + "epoch": 0.1283500455788514, + "grad_norm": 0.653099536895752, + "learning_rate": 1.2463768115942029e-05, + "loss": 0.8242791891098022, + "step": 44 + }, + { + "epoch": 0.13126709206927986, + "grad_norm": 0.7485584616661072, + "learning_rate": 1.2753623188405797e-05, + "loss": 0.8229591250419617, + "step": 45 + }, + { + "epoch": 0.1341841385597083, + "grad_norm": 0.6724833250045776, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.8146833181381226, + "step": 46 + }, + { + "epoch": 0.13710118505013674, + "grad_norm": 0.857208251953125, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.8154427409172058, + "step": 47 + }, + { + "epoch": 0.14001823154056517, + "grad_norm": 0.5559669137001038, + "learning_rate": 1.3623188405797103e-05, + "loss": 0.879005491733551, + "step": 48 + }, + { + "epoch": 0.14293527803099362, + "grad_norm": 0.5910897850990295, + "learning_rate": 1.391304347826087e-05, + "loss": 0.8148283362388611, + "step": 49 + }, + { + "epoch": 0.14585232452142205, + "grad_norm": 0.6478891372680664, + "learning_rate": 1.420289855072464e-05, + "loss": 0.8293006420135498, + "step": 50 + }, + { + "epoch": 0.14585232452142205, + "eval_loss": 0.7892261147499084, + "eval_runtime": 973.2157, + "eval_samples_per_second": 0.649, + "eval_steps_per_second": 0.649, + "step": 50 + }, + { + "epoch": 0.1487693710118505, + "grad_norm": 0.757882833480835, + "learning_rate": 1.4492753623188407e-05, + "loss": 0.8114852905273438, + "step": 51 + }, + { + "epoch": 0.15168641750227893, + "grad_norm": 0.8496116995811462, + "learning_rate": 1.4782608695652174e-05, + "loss": 0.7886185050010681, + "step": 52 + }, + { + "epoch": 0.15460346399270739, + "grad_norm": 0.6078857183456421, + "learning_rate": 1.5072463768115944e-05, + "loss": 0.7298170924186707, + "step": 53 + }, + { + "epoch": 0.1575205104831358, + "grad_norm": 0.5856835246086121, + "learning_rate": 1.536231884057971e-05, + "loss": 0.7407160997390747, + "step": 54 + }, + { + "epoch": 0.16043755697356427, + "grad_norm": 1.0533701181411743, + "learning_rate": 1.565217391304348e-05, + "loss": 0.7057831287384033, + "step": 55 + }, + { + "epoch": 0.16335460346399272, + "grad_norm": 0.8087610006332397, + "learning_rate": 1.5942028985507246e-05, + "loss": 0.7409019470214844, + "step": 56 + }, + { + "epoch": 0.16627164995442115, + "grad_norm": 0.629945695400238, + "learning_rate": 1.6231884057971015e-05, + "loss": 0.7768293023109436, + "step": 57 + }, + { + "epoch": 0.1691886964448496, + "grad_norm": 0.5187911987304688, + "learning_rate": 1.6521739130434785e-05, + "loss": 0.825718104839325, + "step": 58 + }, + { + "epoch": 0.17210574293527803, + "grad_norm": 0.5866358280181885, + "learning_rate": 1.681159420289855e-05, + "loss": 0.8575979471206665, + "step": 59 + }, + { + "epoch": 0.17502278942570648, + "grad_norm": 1.5098934173583984, + "learning_rate": 1.710144927536232e-05, + "loss": 0.8058848977088928, + "step": 60 + }, + { + "epoch": 0.1779398359161349, + "grad_norm": 0.6981958150863647, + "learning_rate": 1.739130434782609e-05, + "loss": 0.7640778422355652, + "step": 61 + }, + { + "epoch": 0.18085688240656336, + "grad_norm": 0.631349503993988, + "learning_rate": 1.7681159420289858e-05, + "loss": 0.7896331548690796, + "step": 62 + }, + { + "epoch": 0.1837739288969918, + "grad_norm": 0.6930747032165527, + "learning_rate": 1.7971014492753624e-05, + "loss": 0.6762524247169495, + "step": 63 + }, + { + "epoch": 0.18669097538742024, + "grad_norm": 0.599399209022522, + "learning_rate": 1.8260869565217393e-05, + "loss": 0.7285035848617554, + "step": 64 + }, + { + "epoch": 0.18960802187784867, + "grad_norm": 0.6194344758987427, + "learning_rate": 1.8550724637681162e-05, + "loss": 0.7682523131370544, + "step": 65 + }, + { + "epoch": 0.19252506836827712, + "grad_norm": 0.5691342949867249, + "learning_rate": 1.8840579710144928e-05, + "loss": 0.6791993379592896, + "step": 66 + }, + { + "epoch": 0.19544211485870555, + "grad_norm": 0.6257390379905701, + "learning_rate": 1.9130434782608697e-05, + "loss": 0.6744828224182129, + "step": 67 + }, + { + "epoch": 0.198359161349134, + "grad_norm": 0.5871018767356873, + "learning_rate": 1.9420289855072467e-05, + "loss": 0.7317330837249756, + "step": 68 + }, + { + "epoch": 0.20127620783956243, + "grad_norm": 1.0744612216949463, + "learning_rate": 1.9710144927536236e-05, + "loss": 0.6617178916931152, + "step": 69 + }, + { + "epoch": 0.2041932543299909, + "grad_norm": 0.675946831703186, + "learning_rate": 2e-05, + "loss": 0.7615712881088257, + "step": 70 + }, + { + "epoch": 0.2071103008204193, + "grad_norm": 0.7663411498069763, + "learning_rate": 1.9999870372100614e-05, + "loss": 0.7131291627883911, + "step": 71 + }, + { + "epoch": 0.21002734731084777, + "grad_norm": 0.6725395321846008, + "learning_rate": 1.9999481491763123e-05, + "loss": 0.7452989816665649, + "step": 72 + }, + { + "epoch": 0.21294439380127622, + "grad_norm": 0.6505664587020874, + "learning_rate": 1.9998833369069483e-05, + "loss": 0.7477136850357056, + "step": 73 + }, + { + "epoch": 0.21586144029170465, + "grad_norm": 0.7032860517501831, + "learning_rate": 1.9997926020822643e-05, + "loss": 0.6854275465011597, + "step": 74 + }, + { + "epoch": 0.2187784867821331, + "grad_norm": 0.645345151424408, + "learning_rate": 1.999675947054614e-05, + "loss": 0.7552425265312195, + "step": 75 + }, + { + "epoch": 0.22169553327256153, + "grad_norm": 0.6620492935180664, + "learning_rate": 1.9995333748483464e-05, + "loss": 0.7262853384017944, + "step": 76 + }, + { + "epoch": 0.22461257976298998, + "grad_norm": 0.6511455774307251, + "learning_rate": 1.9993648891597284e-05, + "loss": 0.7591732144355774, + "step": 77 + }, + { + "epoch": 0.2275296262534184, + "grad_norm": 0.6775254011154175, + "learning_rate": 1.9991704943568497e-05, + "loss": 0.7498704195022583, + "step": 78 + }, + { + "epoch": 0.23044667274384686, + "grad_norm": 0.8199896216392517, + "learning_rate": 1.9989501954795076e-05, + "loss": 0.7238684296607971, + "step": 79 + }, + { + "epoch": 0.2333637192342753, + "grad_norm": 0.8197569847106934, + "learning_rate": 1.998703998239079e-05, + "loss": 0.7028778195381165, + "step": 80 + }, + { + "epoch": 0.23628076572470375, + "grad_norm": 0.6602625250816345, + "learning_rate": 1.9984319090183692e-05, + "loss": 0.8842703104019165, + "step": 81 + }, + { + "epoch": 0.23919781221513217, + "grad_norm": 0.9587129354476929, + "learning_rate": 1.99813393487145e-05, + "loss": 0.732614278793335, + "step": 82 + }, + { + "epoch": 0.24211485870556063, + "grad_norm": 0.6822189092636108, + "learning_rate": 1.997810083523473e-05, + "loss": 0.7544928193092346, + "step": 83 + }, + { + "epoch": 0.24503190519598905, + "grad_norm": 0.8980082869529724, + "learning_rate": 1.9974603633704726e-05, + "loss": 0.6704054474830627, + "step": 84 + }, + { + "epoch": 0.2479489516864175, + "grad_norm": 0.7413425445556641, + "learning_rate": 1.9970847834791472e-05, + "loss": 0.693661093711853, + "step": 85 + }, + { + "epoch": 0.25086599817684596, + "grad_norm": 0.8314999341964722, + "learning_rate": 1.9966833535866223e-05, + "loss": 0.667654275894165, + "step": 86 + }, + { + "epoch": 0.25378304466727436, + "grad_norm": 0.7972444891929626, + "learning_rate": 1.9962560841002013e-05, + "loss": 0.8403134942054749, + "step": 87 + }, + { + "epoch": 0.2567000911577028, + "grad_norm": 0.8519951701164246, + "learning_rate": 1.995802986097093e-05, + "loss": 0.6897370219230652, + "step": 88 + }, + { + "epoch": 0.25961713764813127, + "grad_norm": 0.8268933892250061, + "learning_rate": 1.995324071324126e-05, + "loss": 0.6690632700920105, + "step": 89 + }, + { + "epoch": 0.2625341841385597, + "grad_norm": 0.7133983969688416, + "learning_rate": 1.9948193521974436e-05, + "loss": 0.6314147114753723, + "step": 90 + }, + { + "epoch": 0.2654512306289881, + "grad_norm": 0.889302134513855, + "learning_rate": 1.9942888418021814e-05, + "loss": 0.7389825582504272, + "step": 91 + }, + { + "epoch": 0.2683682771194166, + "grad_norm": 0.7022432088851929, + "learning_rate": 1.99373255389213e-05, + "loss": 0.6916261911392212, + "step": 92 + }, + { + "epoch": 0.27128532360984503, + "grad_norm": 0.696432888507843, + "learning_rate": 1.9931505028893748e-05, + "loss": 0.6908476948738098, + "step": 93 + }, + { + "epoch": 0.2742023701002735, + "grad_norm": 0.7667419910430908, + "learning_rate": 1.9925427038839267e-05, + "loss": 0.6500837206840515, + "step": 94 + }, + { + "epoch": 0.27711941659070194, + "grad_norm": 0.6974894404411316, + "learning_rate": 1.9919091726333265e-05, + "loss": 0.7059191465377808, + "step": 95 + }, + { + "epoch": 0.28003646308113034, + "grad_norm": 0.7047077417373657, + "learning_rate": 1.9912499255622397e-05, + "loss": 0.6287837624549866, + "step": 96 + }, + { + "epoch": 0.2829535095715588, + "grad_norm": 0.7729557156562805, + "learning_rate": 1.990564979762029e-05, + "loss": 0.6738612055778503, + "step": 97 + }, + { + "epoch": 0.28587055606198725, + "grad_norm": 0.7020529508590698, + "learning_rate": 1.989854352990311e-05, + "loss": 0.662042498588562, + "step": 98 + }, + { + "epoch": 0.2887876025524157, + "grad_norm": 0.7369800209999084, + "learning_rate": 1.9891180636704975e-05, + "loss": 0.6246830821037292, + "step": 99 + }, + { + "epoch": 0.2917046490428441, + "grad_norm": 0.7412623167037964, + "learning_rate": 1.9883561308913154e-05, + "loss": 0.6623879075050354, + "step": 100 + }, + { + "epoch": 0.2917046490428441, + "eval_loss": 0.6552971005439758, + "eval_runtime": 966.7072, + "eval_samples_per_second": 0.654, + "eval_steps_per_second": 0.654, + "step": 100 + }, + { + "epoch": 0.29462169553327255, + "grad_norm": 0.8428792953491211, + "learning_rate": 1.987568574406314e-05, + "loss": 0.6312171816825867, + "step": 101 + }, + { + "epoch": 0.297538742023701, + "grad_norm": 0.6948133707046509, + "learning_rate": 1.9867554146333517e-05, + "loss": 0.6266146898269653, + "step": 102 + }, + { + "epoch": 0.30045578851412946, + "grad_norm": 1.3897597789764404, + "learning_rate": 1.985916672654068e-05, + "loss": 0.6669265031814575, + "step": 103 + }, + { + "epoch": 0.30337283500455786, + "grad_norm": 0.8838400840759277, + "learning_rate": 1.985052370213334e-05, + "loss": 0.6601086854934692, + "step": 104 + }, + { + "epoch": 0.3062898814949863, + "grad_norm": 0.8471395373344421, + "learning_rate": 1.9841625297186925e-05, + "loss": 0.5984431505203247, + "step": 105 + }, + { + "epoch": 0.30920692798541477, + "grad_norm": 0.8940042853355408, + "learning_rate": 1.983247174239774e-05, + "loss": 0.7223822474479675, + "step": 106 + }, + { + "epoch": 0.3121239744758432, + "grad_norm": 0.7833696603775024, + "learning_rate": 1.9823063275076998e-05, + "loss": 0.6868705749511719, + "step": 107 + }, + { + "epoch": 0.3150410209662716, + "grad_norm": 0.8794649243354797, + "learning_rate": 1.9813400139144673e-05, + "loss": 0.6246675848960876, + "step": 108 + }, + { + "epoch": 0.3179580674567001, + "grad_norm": 0.8126057982444763, + "learning_rate": 1.9803482585123165e-05, + "loss": 0.5908697247505188, + "step": 109 + }, + { + "epoch": 0.32087511394712853, + "grad_norm": 0.7947676777839661, + "learning_rate": 1.979331087013082e-05, + "loss": 0.5751246809959412, + "step": 110 + }, + { + "epoch": 0.323792160437557, + "grad_norm": 0.713545560836792, + "learning_rate": 1.978288525787524e-05, + "loss": 0.6081106066703796, + "step": 111 + }, + { + "epoch": 0.32670920692798544, + "grad_norm": 1.011828064918518, + "learning_rate": 1.977220601864647e-05, + "loss": 0.7039169669151306, + "step": 112 + }, + { + "epoch": 0.32962625341841384, + "grad_norm": 0.730570912361145, + "learning_rate": 1.9761273429309982e-05, + "loss": 0.6140255928039551, + "step": 113 + }, + { + "epoch": 0.3325432999088423, + "grad_norm": 1.059688687324524, + "learning_rate": 1.9750087773299492e-05, + "loss": 0.648114025592804, + "step": 114 + }, + { + "epoch": 0.33546034639927075, + "grad_norm": 0.9336895942687988, + "learning_rate": 1.973864934060962e-05, + "loss": 0.622555673122406, + "step": 115 + }, + { + "epoch": 0.3383773928896992, + "grad_norm": 0.7195945978164673, + "learning_rate": 1.9726958427788367e-05, + "loss": 0.70485520362854, + "step": 116 + }, + { + "epoch": 0.3412944393801276, + "grad_norm": 0.8101872801780701, + "learning_rate": 1.971501533792942e-05, + "loss": 0.6958848834037781, + "step": 117 + }, + { + "epoch": 0.34421148587055606, + "grad_norm": 1.6075212955474854, + "learning_rate": 1.970282038066432e-05, + "loss": 0.6021550893783569, + "step": 118 + }, + { + "epoch": 0.3471285323609845, + "grad_norm": 0.7881433963775635, + "learning_rate": 1.9690373872154396e-05, + "loss": 0.6449777483940125, + "step": 119 + }, + { + "epoch": 0.35004557885141296, + "grad_norm": 1.014639973640442, + "learning_rate": 1.9677676135082606e-05, + "loss": 0.5939379930496216, + "step": 120 + }, + { + "epoch": 0.35296262534184136, + "grad_norm": 0.8198449611663818, + "learning_rate": 1.9664727498645144e-05, + "loss": 0.6210286617279053, + "step": 121 + }, + { + "epoch": 0.3558796718322698, + "grad_norm": 1.0194576978683472, + "learning_rate": 1.9651528298542918e-05, + "loss": 0.624247670173645, + "step": 122 + }, + { + "epoch": 0.35879671832269827, + "grad_norm": 0.7963470220565796, + "learning_rate": 1.9638078876972842e-05, + "loss": 0.6479315757751465, + "step": 123 + }, + { + "epoch": 0.3617137648131267, + "grad_norm": 0.9007541537284851, + "learning_rate": 1.9624379582618976e-05, + "loss": 0.6131505370140076, + "step": 124 + }, + { + "epoch": 0.3646308113035551, + "grad_norm": 0.8712120056152344, + "learning_rate": 1.9610430770643464e-05, + "loss": 0.6249448657035828, + "step": 125 + }, + { + "epoch": 0.3675478577939836, + "grad_norm": 1.1482540369033813, + "learning_rate": 1.9596232802677347e-05, + "loss": 0.5844688415527344, + "step": 126 + }, + { + "epoch": 0.37046490428441203, + "grad_norm": 0.8662379384040833, + "learning_rate": 1.9581786046811175e-05, + "loss": 0.6573485732078552, + "step": 127 + }, + { + "epoch": 0.3733819507748405, + "grad_norm": 0.8191388845443726, + "learning_rate": 1.9567090877585477e-05, + "loss": 0.5896862745285034, + "step": 128 + }, + { + "epoch": 0.37629899726526894, + "grad_norm": 1.0187078714370728, + "learning_rate": 1.955214767598103e-05, + "loss": 0.613490879535675, + "step": 129 + }, + { + "epoch": 0.37921604375569734, + "grad_norm": 0.8444119691848755, + "learning_rate": 1.953695682940901e-05, + "loss": 0.727687656879425, + "step": 130 + }, + { + "epoch": 0.3821330902461258, + "grad_norm": 0.74753737449646, + "learning_rate": 1.9521518731700913e-05, + "loss": 0.6102436780929565, + "step": 131 + }, + { + "epoch": 0.38505013673655425, + "grad_norm": 1.0166202783584595, + "learning_rate": 1.9505833783098378e-05, + "loss": 0.6244844198226929, + "step": 132 + }, + { + "epoch": 0.3879671832269827, + "grad_norm": 0.8175772428512573, + "learning_rate": 1.9489902390242793e-05, + "loss": 0.5939282178878784, + "step": 133 + }, + { + "epoch": 0.3908842297174111, + "grad_norm": 1.0177713632583618, + "learning_rate": 1.947372496616476e-05, + "loss": 0.6418229937553406, + "step": 134 + }, + { + "epoch": 0.39380127620783956, + "grad_norm": 0.8652453422546387, + "learning_rate": 1.9457301930273376e-05, + "loss": 0.5870395302772522, + "step": 135 + }, + { + "epoch": 0.396718322698268, + "grad_norm": 0.8378894925117493, + "learning_rate": 1.9440633708345365e-05, + "loss": 0.6480278372764587, + "step": 136 + }, + { + "epoch": 0.39963536918869647, + "grad_norm": 0.8303541541099548, + "learning_rate": 1.9423720732514052e-05, + "loss": 0.6191359758377075, + "step": 137 + }, + { + "epoch": 0.40255241567912486, + "grad_norm": 0.8576734662055969, + "learning_rate": 1.9406563441258145e-05, + "loss": 0.5696198344230652, + "step": 138 + }, + { + "epoch": 0.4054694621695533, + "grad_norm": 0.9558727145195007, + "learning_rate": 1.9389162279390362e-05, + "loss": 0.6177623271942139, + "step": 139 + }, + { + "epoch": 0.4083865086599818, + "grad_norm": 0.7046042084693909, + "learning_rate": 1.9371517698045922e-05, + "loss": 0.5836521983146667, + "step": 140 + }, + { + "epoch": 0.4113035551504102, + "grad_norm": 1.0522717237472534, + "learning_rate": 1.935363015467082e-05, + "loss": 0.5728275775909424, + "step": 141 + }, + { + "epoch": 0.4142206016408386, + "grad_norm": 0.9554787874221802, + "learning_rate": 1.933550011301e-05, + "loss": 0.632586658000946, + "step": 142 + }, + { + "epoch": 0.4171376481312671, + "grad_norm": 0.8874214291572571, + "learning_rate": 1.9317128043095293e-05, + "loss": 0.5850118398666382, + "step": 143 + }, + { + "epoch": 0.42005469462169553, + "grad_norm": 1.0708963871002197, + "learning_rate": 1.9298514421233276e-05, + "loss": 0.6260685324668884, + "step": 144 + }, + { + "epoch": 0.422971741112124, + "grad_norm": 0.8135736584663391, + "learning_rate": 1.9279659729992888e-05, + "loss": 0.6031094193458557, + "step": 145 + }, + { + "epoch": 0.42588878760255244, + "grad_norm": 0.7971774339675903, + "learning_rate": 1.9260564458192926e-05, + "loss": 0.6101322770118713, + "step": 146 + }, + { + "epoch": 0.42880583409298084, + "grad_norm": 0.9374974966049194, + "learning_rate": 1.9241229100889397e-05, + "loss": 0.5836313366889954, + "step": 147 + }, + { + "epoch": 0.4317228805834093, + "grad_norm": 0.8043425679206848, + "learning_rate": 1.9221654159362636e-05, + "loss": 0.6181215047836304, + "step": 148 + }, + { + "epoch": 0.43463992707383775, + "grad_norm": 0.8923380374908447, + "learning_rate": 1.920184014110436e-05, + "loss": 0.6149677634239197, + "step": 149 + }, + { + "epoch": 0.4375569735642662, + "grad_norm": 0.8908132314682007, + "learning_rate": 1.918178755980449e-05, + "loss": 0.5899742841720581, + "step": 150 + }, + { + "epoch": 0.4375569735642662, + "eval_loss": 0.5903874635696411, + "eval_runtime": 1186.9542, + "eval_samples_per_second": 0.532, + "eval_steps_per_second": 0.532, + "step": 150 + }, + { + "epoch": 0.4404740200546946, + "grad_norm": 1.060531497001648, + "learning_rate": 1.9161496935337808e-05, + "loss": 0.5852696895599365, + "step": 151 + }, + { + "epoch": 0.44339106654512306, + "grad_norm": 0.9723032712936401, + "learning_rate": 1.914096879375053e-05, + "loss": 0.5822056531906128, + "step": 152 + }, + { + "epoch": 0.4463081130355515, + "grad_norm": 0.9519931674003601, + "learning_rate": 1.912020366724663e-05, + "loss": 0.6183493137359619, + "step": 153 + }, + { + "epoch": 0.44922515952597997, + "grad_norm": 0.8282918334007263, + "learning_rate": 1.9099202094174055e-05, + "loss": 0.6229860782623291, + "step": 154 + }, + { + "epoch": 0.45214220601640837, + "grad_norm": 0.9251292943954468, + "learning_rate": 1.907796461901076e-05, + "loss": 0.6552959680557251, + "step": 155 + }, + { + "epoch": 0.4550592525068368, + "grad_norm": 1.0349540710449219, + "learning_rate": 1.9056491792350606e-05, + "loss": 0.6170098781585693, + "step": 156 + }, + { + "epoch": 0.4579762989972653, + "grad_norm": 0.8720711469650269, + "learning_rate": 1.9034784170889076e-05, + "loss": 0.5870137810707092, + "step": 157 + }, + { + "epoch": 0.46089334548769373, + "grad_norm": 1.0785977840423584, + "learning_rate": 1.9012842317408843e-05, + "loss": 0.5515124201774597, + "step": 158 + }, + { + "epoch": 0.4638103919781221, + "grad_norm": 1.0634154081344604, + "learning_rate": 1.8990666800765187e-05, + "loss": 0.6073828339576721, + "step": 159 + }, + { + "epoch": 0.4667274384685506, + "grad_norm": 0.8770879507064819, + "learning_rate": 1.896825819587123e-05, + "loss": 0.5960907936096191, + "step": 160 + }, + { + "epoch": 0.46964448495897904, + "grad_norm": 1.1225898265838623, + "learning_rate": 1.894561708368305e-05, + "loss": 0.545990526676178, + "step": 161 + }, + { + "epoch": 0.4725615314494075, + "grad_norm": 0.9373893141746521, + "learning_rate": 1.8922744051184613e-05, + "loss": 0.5566108822822571, + "step": 162 + }, + { + "epoch": 0.4754785779398359, + "grad_norm": 1.5016087293624878, + "learning_rate": 1.8899639691372545e-05, + "loss": 0.558845043182373, + "step": 163 + }, + { + "epoch": 0.47839562443026434, + "grad_norm": 0.903020977973938, + "learning_rate": 1.8876304603240773e-05, + "loss": 0.6824233531951904, + "step": 164 + }, + { + "epoch": 0.4813126709206928, + "grad_norm": 0.8239623308181763, + "learning_rate": 1.8852739391764993e-05, + "loss": 0.5630610585212708, + "step": 165 + }, + { + "epoch": 0.48422971741112125, + "grad_norm": 0.926069438457489, + "learning_rate": 1.882894466788697e-05, + "loss": 0.6211802363395691, + "step": 166 + }, + { + "epoch": 0.4871467639015497, + "grad_norm": 1.0098828077316284, + "learning_rate": 1.8804921048498722e-05, + "loss": 0.5513257384300232, + "step": 167 + }, + { + "epoch": 0.4900638103919781, + "grad_norm": 0.9228141903877258, + "learning_rate": 1.8780669156426517e-05, + "loss": 0.6197121739387512, + "step": 168 + }, + { + "epoch": 0.49298085688240656, + "grad_norm": 1.0551754236221313, + "learning_rate": 1.8756189620414712e-05, + "loss": 0.5221806764602661, + "step": 169 + }, + { + "epoch": 0.495897903372835, + "grad_norm": 0.9017496109008789, + "learning_rate": 1.873148307510948e-05, + "loss": 0.5766995549201965, + "step": 170 + }, + { + "epoch": 0.49881494986326347, + "grad_norm": 0.9704970717430115, + "learning_rate": 1.870655016104233e-05, + "loss": 0.6514763832092285, + "step": 171 + }, + { + "epoch": 0.5017319963536919, + "grad_norm": 0.9972712397575378, + "learning_rate": 1.8681391524613518e-05, + "loss": 0.5273895263671875, + "step": 172 + }, + { + "epoch": 0.5046490428441204, + "grad_norm": 0.9473339319229126, + "learning_rate": 1.8656007818075288e-05, + "loss": 0.5548599362373352, + "step": 173 + }, + { + "epoch": 0.5075660893345487, + "grad_norm": 1.2493574619293213, + "learning_rate": 1.8630399699514944e-05, + "loss": 0.5593586564064026, + "step": 174 + }, + { + "epoch": 0.5104831358249772, + "grad_norm": 1.2766696214675903, + "learning_rate": 1.860456783283781e-05, + "loss": 0.6054630279541016, + "step": 175 + }, + { + "epoch": 0.5134001823154056, + "grad_norm": 0.9555240869522095, + "learning_rate": 1.857851288775002e-05, + "loss": 0.508592963218689, + "step": 176 + }, + { + "epoch": 0.5163172288058341, + "grad_norm": 1.260219931602478, + "learning_rate": 1.8552235539741118e-05, + "loss": 0.5532065629959106, + "step": 177 + }, + { + "epoch": 0.5192342752962625, + "grad_norm": 1.1859954595565796, + "learning_rate": 1.8525736470066595e-05, + "loss": 0.5683344006538391, + "step": 178 + }, + { + "epoch": 0.522151321786691, + "grad_norm": 1.3044344186782837, + "learning_rate": 1.8499016365730203e-05, + "loss": 0.5281959772109985, + "step": 179 + }, + { + "epoch": 0.5250683682771194, + "grad_norm": 1.3049921989440918, + "learning_rate": 1.8472075919466137e-05, + "loss": 0.49621230363845825, + "step": 180 + }, + { + "epoch": 0.5279854147675479, + "grad_norm": 1.0488537549972534, + "learning_rate": 1.844491582972109e-05, + "loss": 0.6194032430648804, + "step": 181 + }, + { + "epoch": 0.5309024612579762, + "grad_norm": 1.5553455352783203, + "learning_rate": 1.8417536800636138e-05, + "loss": 0.5645846724510193, + "step": 182 + }, + { + "epoch": 0.5338195077484047, + "grad_norm": 1.2673912048339844, + "learning_rate": 1.8389939542028484e-05, + "loss": 0.6267315745353699, + "step": 183 + }, + { + "epoch": 0.5367365542388332, + "grad_norm": 1.0273847579956055, + "learning_rate": 1.8362124769373064e-05, + "loss": 0.5256403684616089, + "step": 184 + }, + { + "epoch": 0.5396536007292616, + "grad_norm": 1.006093978881836, + "learning_rate": 1.8334093203783986e-05, + "loss": 0.5916382074356079, + "step": 185 + }, + { + "epoch": 0.5425706472196901, + "grad_norm": 1.2740857601165771, + "learning_rate": 1.8305845571995843e-05, + "loss": 0.581648588180542, + "step": 186 + }, + { + "epoch": 0.5454876937101185, + "grad_norm": 1.494248390197754, + "learning_rate": 1.8277382606344872e-05, + "loss": 0.4824523627758026, + "step": 187 + }, + { + "epoch": 0.548404740200547, + "grad_norm": 1.1862496137619019, + "learning_rate": 1.824870504474996e-05, + "loss": 0.5531858205795288, + "step": 188 + }, + { + "epoch": 0.5513217866909754, + "grad_norm": 3.503049373626709, + "learning_rate": 1.8219813630693523e-05, + "loss": 0.6308296918869019, + "step": 189 + }, + { + "epoch": 0.5542388331814039, + "grad_norm": 1.7544710636138916, + "learning_rate": 1.819070911320222e-05, + "loss": 0.6146273016929626, + "step": 190 + }, + { + "epoch": 0.5571558796718322, + "grad_norm": 1.3367774486541748, + "learning_rate": 1.8161392246827546e-05, + "loss": 0.5848966240882874, + "step": 191 + }, + { + "epoch": 0.5600729261622607, + "grad_norm": 1.696418046951294, + "learning_rate": 1.8131863791626263e-05, + "loss": 0.6621730327606201, + "step": 192 + }, + { + "epoch": 0.5629899726526891, + "grad_norm": 1.360052227973938, + "learning_rate": 1.8102124513140694e-05, + "loss": 0.5972204208374023, + "step": 193 + }, + { + "epoch": 0.5659070191431176, + "grad_norm": 1.5376263856887817, + "learning_rate": 1.807217518237888e-05, + "loss": 0.4938785433769226, + "step": 194 + }, + { + "epoch": 0.568824065633546, + "grad_norm": 1.2249681949615479, + "learning_rate": 1.8042016575794585e-05, + "loss": 0.5366095304489136, + "step": 195 + }, + { + "epoch": 0.5717411121239745, + "grad_norm": 1.7868080139160156, + "learning_rate": 1.8011649475267178e-05, + "loss": 0.5116773843765259, + "step": 196 + }, + { + "epoch": 0.574658158614403, + "grad_norm": 2.369993209838867, + "learning_rate": 1.7981074668081345e-05, + "loss": 0.49072742462158203, + "step": 197 + }, + { + "epoch": 0.5775752051048314, + "grad_norm": 1.0168434381484985, + "learning_rate": 1.7950292946906695e-05, + "loss": 0.5691611170768738, + "step": 198 + }, + { + "epoch": 0.5804922515952597, + "grad_norm": 1.2990851402282715, + "learning_rate": 1.7919305109777195e-05, + "loss": 0.5515039563179016, + "step": 199 + }, + { + "epoch": 0.5834092980856882, + "grad_norm": 1.4859853982925415, + "learning_rate": 1.7888111960070493e-05, + "loss": 0.5017011165618896, + "step": 200 + }, + { + "epoch": 0.5834092980856882, + "eval_loss": 0.5414339303970337, + "eval_runtime": 1180.7894, + "eval_samples_per_second": 0.535, + "eval_steps_per_second": 0.535, + "step": 200 + }, + { + "epoch": 0.5863263445761167, + "grad_norm": 1.0065829753875732, + "learning_rate": 1.7856714306487088e-05, + "loss": 0.5677731037139893, + "step": 201 + }, + { + "epoch": 0.5892433910665451, + "grad_norm": 1.1727538108825684, + "learning_rate": 1.7825112963029352e-05, + "loss": 0.4525509476661682, + "step": 202 + }, + { + "epoch": 0.5921604375569736, + "grad_norm": 1.3376752138137817, + "learning_rate": 1.7793308748980437e-05, + "loss": 0.5208959579467773, + "step": 203 + }, + { + "epoch": 0.595077484047402, + "grad_norm": 0.9196159839630127, + "learning_rate": 1.776130248888304e-05, + "loss": 0.6033903360366821, + "step": 204 + }, + { + "epoch": 0.5979945305378305, + "grad_norm": 1.0750919580459595, + "learning_rate": 1.772909501251801e-05, + "loss": 0.5449609160423279, + "step": 205 + }, + { + "epoch": 0.6009115770282589, + "grad_norm": 1.2459467649459839, + "learning_rate": 1.769668715488285e-05, + "loss": 0.5685338377952576, + "step": 206 + }, + { + "epoch": 0.6038286235186874, + "grad_norm": 1.1690552234649658, + "learning_rate": 1.766407975617006e-05, + "loss": 0.5240382552146912, + "step": 207 + }, + { + "epoch": 0.6067456700091157, + "grad_norm": 1.0816599130630493, + "learning_rate": 1.7631273661745362e-05, + "loss": 0.6802893877029419, + "step": 208 + }, + { + "epoch": 0.6096627164995442, + "grad_norm": 1.3662947416305542, + "learning_rate": 1.7598269722125775e-05, + "loss": 0.48193931579589844, + "step": 209 + }, + { + "epoch": 0.6125797629899726, + "grad_norm": 0.9364766478538513, + "learning_rate": 1.7565068792957576e-05, + "loss": 0.5675849914550781, + "step": 210 + }, + { + "epoch": 0.6154968094804011, + "grad_norm": 1.123828411102295, + "learning_rate": 1.75316717349941e-05, + "loss": 0.5474762916564941, + "step": 211 + }, + { + "epoch": 0.6184138559708295, + "grad_norm": 1.1924363374710083, + "learning_rate": 1.749807941407345e-05, + "loss": 0.4918654263019562, + "step": 212 + }, + { + "epoch": 0.621330902461258, + "grad_norm": 1.101293921470642, + "learning_rate": 1.7464292701096014e-05, + "loss": 0.5742691159248352, + "step": 213 + }, + { + "epoch": 0.6242479489516864, + "grad_norm": 1.7374963760375977, + "learning_rate": 1.7430312472001928e-05, + "loss": 0.5828965902328491, + "step": 214 + }, + { + "epoch": 0.6271649954421149, + "grad_norm": 1.3195666074752808, + "learning_rate": 1.739613960774833e-05, + "loss": 0.5265159010887146, + "step": 215 + }, + { + "epoch": 0.6300820419325432, + "grad_norm": 1.254686713218689, + "learning_rate": 1.7361774994286545e-05, + "loss": 0.4929371476173401, + "step": 216 + }, + { + "epoch": 0.6329990884229717, + "grad_norm": 1.1476380825042725, + "learning_rate": 1.7327219522539102e-05, + "loss": 0.5060417652130127, + "step": 217 + }, + { + "epoch": 0.6359161349134002, + "grad_norm": 1.0914150476455688, + "learning_rate": 1.7292474088376643e-05, + "loss": 0.504043698310852, + "step": 218 + }, + { + "epoch": 0.6388331814038286, + "grad_norm": 1.1339508295059204, + "learning_rate": 1.7257539592594698e-05, + "loss": 0.4797310531139374, + "step": 219 + }, + { + "epoch": 0.6417502278942571, + "grad_norm": 1.0805399417877197, + "learning_rate": 1.722241694089033e-05, + "loss": 0.5878555178642273, + "step": 220 + }, + { + "epoch": 0.6446672743846855, + "grad_norm": 1.8615056276321411, + "learning_rate": 1.718710704383865e-05, + "loss": 0.5005823969841003, + "step": 221 + }, + { + "epoch": 0.647584320875114, + "grad_norm": 1.1445401906967163, + "learning_rate": 1.7151610816869214e-05, + "loss": 0.4949319064617157, + "step": 222 + }, + { + "epoch": 0.6505013673655424, + "grad_norm": 0.9726515412330627, + "learning_rate": 1.711592918024229e-05, + "loss": 0.5073204040527344, + "step": 223 + }, + { + "epoch": 0.6534184138559709, + "grad_norm": 1.4491140842437744, + "learning_rate": 1.7080063059024998e-05, + "loss": 0.47885262966156006, + "step": 224 + }, + { + "epoch": 0.6563354603463992, + "grad_norm": 1.0070592164993286, + "learning_rate": 1.7044013383067327e-05, + "loss": 0.5775837898254395, + "step": 225 + }, + { + "epoch": 0.6592525068368277, + "grad_norm": 0.966221272945404, + "learning_rate": 1.7007781086978037e-05, + "loss": 0.5050399899482727, + "step": 226 + }, + { + "epoch": 0.6621695533272561, + "grad_norm": 0.9808815121650696, + "learning_rate": 1.6971367110100407e-05, + "loss": 0.5737045407295227, + "step": 227 + }, + { + "epoch": 0.6650865998176846, + "grad_norm": 1.0158127546310425, + "learning_rate": 1.6934772396487906e-05, + "loss": 0.48077821731567383, + "step": 228 + }, + { + "epoch": 0.668003646308113, + "grad_norm": 1.32015860080719, + "learning_rate": 1.6897997894879706e-05, + "loss": 0.5614925026893616, + "step": 229 + }, + { + "epoch": 0.6709206927985415, + "grad_norm": 1.1055903434753418, + "learning_rate": 1.686104455867608e-05, + "loss": 0.4970760643482208, + "step": 230 + }, + { + "epoch": 0.67383773928897, + "grad_norm": 1.0804500579833984, + "learning_rate": 1.682391334591371e-05, + "loss": 0.5540452003479004, + "step": 231 + }, + { + "epoch": 0.6767547857793984, + "grad_norm": 1.1906245946884155, + "learning_rate": 1.6786605219240807e-05, + "loss": 0.5778501033782959, + "step": 232 + }, + { + "epoch": 0.6796718322698267, + "grad_norm": 0.9758645296096802, + "learning_rate": 1.6749121145892192e-05, + "loss": 0.49073565006256104, + "step": 233 + }, + { + "epoch": 0.6825888787602552, + "grad_norm": 1.1678364276885986, + "learning_rate": 1.6711462097664207e-05, + "loss": 0.4828741252422333, + "step": 234 + }, + { + "epoch": 0.6855059252506837, + "grad_norm": 1.148301362991333, + "learning_rate": 1.6673629050889507e-05, + "loss": 0.5143818855285645, + "step": 235 + }, + { + "epoch": 0.6884229717411121, + "grad_norm": 1.005898356437683, + "learning_rate": 1.6635622986411776e-05, + "loss": 0.5301160216331482, + "step": 236 + }, + { + "epoch": 0.6913400182315406, + "grad_norm": 1.2227320671081543, + "learning_rate": 1.659744488956027e-05, + "loss": 0.4800386130809784, + "step": 237 + }, + { + "epoch": 0.694257064721969, + "grad_norm": 0.986456573009491, + "learning_rate": 1.6559095750124296e-05, + "loss": 0.5098081827163696, + "step": 238 + }, + { + "epoch": 0.6971741112123975, + "grad_norm": 1.1474376916885376, + "learning_rate": 1.6520576562327518e-05, + "loss": 0.5147273540496826, + "step": 239 + }, + { + "epoch": 0.7000911577028259, + "grad_norm": 1.10917067527771, + "learning_rate": 1.6481888324802223e-05, + "loss": 0.5023190379142761, + "step": 240 + }, + { + "epoch": 0.7030082041932544, + "grad_norm": 1.2339262962341309, + "learning_rate": 1.644303204056341e-05, + "loss": 0.5282092690467834, + "step": 241 + }, + { + "epoch": 0.7059252506836827, + "grad_norm": 0.997941255569458, + "learning_rate": 1.640400871698277e-05, + "loss": 0.5635963082313538, + "step": 242 + }, + { + "epoch": 0.7088422971741112, + "grad_norm": 1.0345823764801025, + "learning_rate": 1.63648193657626e-05, + "loss": 0.5577977895736694, + "step": 243 + }, + { + "epoch": 0.7117593436645396, + "grad_norm": 1.3468303680419922, + "learning_rate": 1.6325465002909554e-05, + "loss": 0.4365362524986267, + "step": 244 + }, + { + "epoch": 0.7146763901549681, + "grad_norm": 1.2817128896713257, + "learning_rate": 1.628594664870831e-05, + "loss": 0.46069926023483276, + "step": 245 + }, + { + "epoch": 0.7175934366453965, + "grad_norm": 1.043311357498169, + "learning_rate": 1.6246265327695117e-05, + "loss": 0.5476971864700317, + "step": 246 + }, + { + "epoch": 0.720510483135825, + "grad_norm": 1.0297389030456543, + "learning_rate": 1.620642206863124e-05, + "loss": 0.48051249980926514, + "step": 247 + }, + { + "epoch": 0.7234275296262535, + "grad_norm": 1.4869836568832397, + "learning_rate": 1.6166417904476257e-05, + "loss": 0.5683314800262451, + "step": 248 + }, + { + "epoch": 0.7263445761166819, + "grad_norm": 1.0628005266189575, + "learning_rate": 1.6126253872361336e-05, + "loss": 0.5277887582778931, + "step": 249 + }, + { + "epoch": 0.7292616226071102, + "grad_norm": 1.2682170867919922, + "learning_rate": 1.608593101356229e-05, + "loss": 0.5048879384994507, + "step": 250 + }, + { + "epoch": 0.7292616226071102, + "eval_loss": 0.5038471221923828, + "eval_runtime": 1175.0375, + "eval_samples_per_second": 0.538, + "eval_steps_per_second": 0.538, + "step": 250 + }, + { + "epoch": 0.7321786690975387, + "grad_norm": 1.7376199960708618, + "learning_rate": 1.6045450373472626e-05, + "loss": 0.5093721151351929, + "step": 251 + }, + { + "epoch": 0.7350957155879672, + "grad_norm": 1.6047718524932861, + "learning_rate": 1.6004813001576405e-05, + "loss": 0.4796055555343628, + "step": 252 + }, + { + "epoch": 0.7380127620783956, + "grad_norm": 1.3582886457443237, + "learning_rate": 1.5964019951421058e-05, + "loss": 0.4733014702796936, + "step": 253 + }, + { + "epoch": 0.7409298085688241, + "grad_norm": 0.9468897581100464, + "learning_rate": 1.5923072280590072e-05, + "loss": 0.5312032103538513, + "step": 254 + }, + { + "epoch": 0.7438468550592525, + "grad_norm": 1.3890198469161987, + "learning_rate": 1.5881971050675547e-05, + "loss": 0.47576645016670227, + "step": 255 + }, + { + "epoch": 0.746763901549681, + "grad_norm": 1.782992959022522, + "learning_rate": 1.584071732725071e-05, + "loss": 0.5555092096328735, + "step": 256 + }, + { + "epoch": 0.7496809480401094, + "grad_norm": 1.1790621280670166, + "learning_rate": 1.5799312179842265e-05, + "loss": 0.5148727893829346, + "step": 257 + }, + { + "epoch": 0.7525979945305379, + "grad_norm": 1.446694254875183, + "learning_rate": 1.5757756681902664e-05, + "loss": 0.49939870834350586, + "step": 258 + }, + { + "epoch": 0.7555150410209662, + "grad_norm": 1.1786166429519653, + "learning_rate": 1.571605191078229e-05, + "loss": 0.562156081199646, + "step": 259 + }, + { + "epoch": 0.7584320875113947, + "grad_norm": 1.16925847530365, + "learning_rate": 1.567419894770151e-05, + "loss": 0.49580734968185425, + "step": 260 + }, + { + "epoch": 0.7613491340018231, + "grad_norm": 1.60944664478302, + "learning_rate": 1.5632198877722676e-05, + "loss": 0.4821680784225464, + "step": 261 + }, + { + "epoch": 0.7642661804922516, + "grad_norm": 1.3957884311676025, + "learning_rate": 1.5590052789721946e-05, + "loss": 0.4392276406288147, + "step": 262 + }, + { + "epoch": 0.76718322698268, + "grad_norm": 1.636195421218872, + "learning_rate": 1.5547761776361096e-05, + "loss": 0.39603114128112793, + "step": 263 + }, + { + "epoch": 0.7701002734731085, + "grad_norm": 1.496766448020935, + "learning_rate": 1.550532693405917e-05, + "loss": 0.4833749234676361, + "step": 264 + }, + { + "epoch": 0.773017319963537, + "grad_norm": 1.3587844371795654, + "learning_rate": 1.5462749362964058e-05, + "loss": 0.43738317489624023, + "step": 265 + }, + { + "epoch": 0.7759343664539654, + "grad_norm": 1.670704960823059, + "learning_rate": 1.5420030166923983e-05, + "loss": 0.4476737380027771, + "step": 266 + }, + { + "epoch": 0.7788514129443938, + "grad_norm": 1.2674932479858398, + "learning_rate": 1.537717045345888e-05, + "loss": 0.42266708612442017, + "step": 267 + }, + { + "epoch": 0.7817684594348222, + "grad_norm": 2.0639536380767822, + "learning_rate": 1.5334171333731666e-05, + "loss": 0.5245381593704224, + "step": 268 + }, + { + "epoch": 0.7846855059252507, + "grad_norm": 1.2091766595840454, + "learning_rate": 1.529103392251946e-05, + "loss": 0.5166443586349487, + "step": 269 + }, + { + "epoch": 0.7876025524156791, + "grad_norm": 1.1021631956100464, + "learning_rate": 1.5247759338184653e-05, + "loss": 0.5674265027046204, + "step": 270 + }, + { + "epoch": 0.7905195989061076, + "grad_norm": 1.3143829107284546, + "learning_rate": 1.520434870264595e-05, + "loss": 0.40855613350868225, + "step": 271 + }, + { + "epoch": 0.793436645396536, + "grad_norm": 1.1784812211990356, + "learning_rate": 1.5160803141349244e-05, + "loss": 0.4308925271034241, + "step": 272 + }, + { + "epoch": 0.7963536918869645, + "grad_norm": 2.1635706424713135, + "learning_rate": 1.5117123783238458e-05, + "loss": 0.45035502314567566, + "step": 273 + }, + { + "epoch": 0.7992707383773929, + "grad_norm": 1.569203495979309, + "learning_rate": 1.5073311760726287e-05, + "loss": 0.5095728635787964, + "step": 274 + }, + { + "epoch": 0.8021877848678214, + "grad_norm": 2.532621383666992, + "learning_rate": 1.5029368209664822e-05, + "loss": 0.496748685836792, + "step": 275 + }, + { + "epoch": 0.8051048313582497, + "grad_norm": 1.6312552690505981, + "learning_rate": 1.4985294269316098e-05, + "loss": 0.4972914159297943, + "step": 276 + }, + { + "epoch": 0.8080218778486782, + "grad_norm": 1.3996756076812744, + "learning_rate": 1.4941091082322579e-05, + "loss": 0.5589750409126282, + "step": 277 + }, + { + "epoch": 0.8109389243391066, + "grad_norm": 1.1288363933563232, + "learning_rate": 1.4896759794677526e-05, + "loss": 0.5349453687667847, + "step": 278 + }, + { + "epoch": 0.8138559708295351, + "grad_norm": 1.6913920640945435, + "learning_rate": 1.4852301555695268e-05, + "loss": 0.46511000394821167, + "step": 279 + }, + { + "epoch": 0.8167730173199635, + "grad_norm": 1.1913212537765503, + "learning_rate": 1.4807717517981439e-05, + "loss": 0.4715422987937927, + "step": 280 + }, + { + "epoch": 0.819690063810392, + "grad_norm": 1.1179691553115845, + "learning_rate": 1.476300883740307e-05, + "loss": 0.53330397605896, + "step": 281 + }, + { + "epoch": 0.8226071103008205, + "grad_norm": 1.7473797798156738, + "learning_rate": 1.4718176673058624e-05, + "loss": 0.47564437985420227, + "step": 282 + }, + { + "epoch": 0.8255241567912489, + "grad_norm": 1.2653177976608276, + "learning_rate": 1.4673222187247963e-05, + "loss": 0.46364277601242065, + "step": 283 + }, + { + "epoch": 0.8284412032816773, + "grad_norm": 1.2567330598831177, + "learning_rate": 1.4628146545442202e-05, + "loss": 0.4778091013431549, + "step": 284 + }, + { + "epoch": 0.8313582497721057, + "grad_norm": 1.5848406553268433, + "learning_rate": 1.4582950916253488e-05, + "loss": 0.4480203688144684, + "step": 285 + }, + { + "epoch": 0.8342752962625342, + "grad_norm": 1.3278183937072754, + "learning_rate": 1.453763647140472e-05, + "loss": 0.37945032119750977, + "step": 286 + }, + { + "epoch": 0.8371923427529626, + "grad_norm": 1.0961651802062988, + "learning_rate": 1.4492204385699155e-05, + "loss": 0.5306747555732727, + "step": 287 + }, + { + "epoch": 0.8401093892433911, + "grad_norm": 1.176276683807373, + "learning_rate": 1.4446655836989961e-05, + "loss": 0.49950045347213745, + "step": 288 + }, + { + "epoch": 0.8430264357338195, + "grad_norm": 1.2228269577026367, + "learning_rate": 1.4400992006149674e-05, + "loss": 0.494475394487381, + "step": 289 + }, + { + "epoch": 0.845943482224248, + "grad_norm": 1.1584209203720093, + "learning_rate": 1.4355214077039592e-05, + "loss": 0.44170859456062317, + "step": 290 + }, + { + "epoch": 0.8488605287146764, + "grad_norm": 1.2041938304901123, + "learning_rate": 1.4309323236479071e-05, + "loss": 0.4359871745109558, + "step": 291 + }, + { + "epoch": 0.8517775752051049, + "grad_norm": 1.279645562171936, + "learning_rate": 1.4263320674214762e-05, + "loss": 0.45031386613845825, + "step": 292 + }, + { + "epoch": 0.8546946216955332, + "grad_norm": 1.3958357572555542, + "learning_rate": 1.4217207582889769e-05, + "loss": 0.4832204580307007, + "step": 293 + }, + { + "epoch": 0.8576116681859617, + "grad_norm": 1.2788586616516113, + "learning_rate": 1.4170985158012725e-05, + "loss": 0.5154346227645874, + "step": 294 + }, + { + "epoch": 0.8605287146763901, + "grad_norm": 1.3634892702102661, + "learning_rate": 1.4124654597926795e-05, + "loss": 0.46777206659317017, + "step": 295 + }, + { + "epoch": 0.8634457611668186, + "grad_norm": 1.2719579935073853, + "learning_rate": 1.4078217103778619e-05, + "loss": 0.4247053265571594, + "step": 296 + }, + { + "epoch": 0.866362807657247, + "grad_norm": 2.890467643737793, + "learning_rate": 1.4031673879487161e-05, + "loss": 0.38349640369415283, + "step": 297 + }, + { + "epoch": 0.8692798541476755, + "grad_norm": 2.4354801177978516, + "learning_rate": 1.3985026131712499e-05, + "loss": 0.4134889543056488, + "step": 298 + }, + { + "epoch": 0.872196900638104, + "grad_norm": 1.0138323307037354, + "learning_rate": 1.3938275069824541e-05, + "loss": 0.5176680684089661, + "step": 299 + }, + { + "epoch": 0.8751139471285324, + "grad_norm": 1.2316186428070068, + "learning_rate": 1.389142190587168e-05, + "loss": 0.4818477928638458, + "step": 300 + }, + { + "epoch": 0.8751139471285324, + "eval_loss": 0.4752846360206604, + "eval_runtime": 1189.1666, + "eval_samples_per_second": 0.531, + "eval_steps_per_second": 0.531, + "step": 300 + }, + { + "epoch": 0.8780309936189608, + "grad_norm": 1.515487551689148, + "learning_rate": 1.384446785454936e-05, + "loss": 0.47766175866127014, + "step": 301 + }, + { + "epoch": 0.8809480401093892, + "grad_norm": 1.4357497692108154, + "learning_rate": 1.3797414133168591e-05, + "loss": 0.49297061562538147, + "step": 302 + }, + { + "epoch": 0.8838650865998177, + "grad_norm": 1.2523037195205688, + "learning_rate": 1.3750261961624383e-05, + "loss": 0.4629015326499939, + "step": 303 + }, + { + "epoch": 0.8867821330902461, + "grad_norm": 3.5790023803710938, + "learning_rate": 1.3703012562364124e-05, + "loss": 0.3773120045661926, + "step": 304 + }, + { + "epoch": 0.8896991795806746, + "grad_norm": 1.9305704832077026, + "learning_rate": 1.3655667160355892e-05, + "loss": 0.496719628572464, + "step": 305 + }, + { + "epoch": 0.892616226071103, + "grad_norm": 1.1506154537200928, + "learning_rate": 1.3608226983056687e-05, + "loss": 0.49487072229385376, + "step": 306 + }, + { + "epoch": 0.8955332725615315, + "grad_norm": 1.8046090602874756, + "learning_rate": 1.3560693260380614e-05, + "loss": 0.4910697937011719, + "step": 307 + }, + { + "epoch": 0.8984503190519599, + "grad_norm": 2.0088653564453125, + "learning_rate": 1.3513067224667e-05, + "loss": 0.508246660232544, + "step": 308 + }, + { + "epoch": 0.9013673655423883, + "grad_norm": 1.2966033220291138, + "learning_rate": 1.3465350110648437e-05, + "loss": 0.5125166177749634, + "step": 309 + }, + { + "epoch": 0.9042844120328167, + "grad_norm": 1.9976309537887573, + "learning_rate": 1.3417543155418775e-05, + "loss": 0.43942537903785706, + "step": 310 + }, + { + "epoch": 0.9072014585232452, + "grad_norm": 1.2663682699203491, + "learning_rate": 1.336964759840105e-05, + "loss": 0.4839101731777191, + "step": 311 + }, + { + "epoch": 0.9101185050136736, + "grad_norm": 1.1223328113555908, + "learning_rate": 1.3321664681315354e-05, + "loss": 0.48008066415786743, + "step": 312 + }, + { + "epoch": 0.9130355515041021, + "grad_norm": 1.5786972045898438, + "learning_rate": 1.3273595648146634e-05, + "loss": 0.47250309586524963, + "step": 313 + }, + { + "epoch": 0.9159525979945305, + "grad_norm": 1.2150241136550903, + "learning_rate": 1.322544174511245e-05, + "loss": 0.5149738788604736, + "step": 314 + }, + { + "epoch": 0.918869644484959, + "grad_norm": 1.3676542043685913, + "learning_rate": 1.3177204220630662e-05, + "loss": 0.4430195093154907, + "step": 315 + }, + { + "epoch": 0.9217866909753875, + "grad_norm": 1.0703285932540894, + "learning_rate": 1.3128884325287064e-05, + "loss": 0.4798983037471771, + "step": 316 + }, + { + "epoch": 0.9247037374658159, + "grad_norm": 1.3131535053253174, + "learning_rate": 1.308048331180296e-05, + "loss": 0.4241073727607727, + "step": 317 + }, + { + "epoch": 0.9276207839562443, + "grad_norm": 1.4485348463058472, + "learning_rate": 1.3032002435002698e-05, + "loss": 0.527199923992157, + "step": 318 + }, + { + "epoch": 0.9305378304466727, + "grad_norm": 1.370936393737793, + "learning_rate": 1.2983442951781114e-05, + "loss": 0.47125962376594543, + "step": 319 + }, + { + "epoch": 0.9334548769371012, + "grad_norm": 1.2369643449783325, + "learning_rate": 1.2934806121070973e-05, + "loss": 0.4814244210720062, + "step": 320 + }, + { + "epoch": 0.9363719234275296, + "grad_norm": 1.2632933855056763, + "learning_rate": 1.2886093203810314e-05, + "loss": 0.4915548264980316, + "step": 321 + }, + { + "epoch": 0.9392889699179581, + "grad_norm": 1.054569959640503, + "learning_rate": 1.2837305462909764e-05, + "loss": 0.5325602293014526, + "step": 322 + }, + { + "epoch": 0.9422060164083865, + "grad_norm": 1.15959632396698, + "learning_rate": 1.27884441632198e-05, + "loss": 0.43607404828071594, + "step": 323 + }, + { + "epoch": 0.945123062898815, + "grad_norm": 1.1667979955673218, + "learning_rate": 1.2739510571497945e-05, + "loss": 0.4631507992744446, + "step": 324 + }, + { + "epoch": 0.9480401093892434, + "grad_norm": 1.6009081602096558, + "learning_rate": 1.2690505956375944e-05, + "loss": 0.4935731887817383, + "step": 325 + }, + { + "epoch": 0.9509571558796718, + "grad_norm": 1.1193996667861938, + "learning_rate": 1.2641431588326858e-05, + "loss": 0.45883435010910034, + "step": 326 + }, + { + "epoch": 0.9538742023701002, + "grad_norm": 1.5365067720413208, + "learning_rate": 1.2592288739632138e-05, + "loss": 0.5206276178359985, + "step": 327 + }, + { + "epoch": 0.9567912488605287, + "grad_norm": 1.0714622735977173, + "learning_rate": 1.2543078684348632e-05, + "loss": 0.5242853760719299, + "step": 328 + }, + { + "epoch": 0.9597082953509571, + "grad_norm": 1.3009248971939087, + "learning_rate": 1.2493802698275557e-05, + "loss": 0.4794357717037201, + "step": 329 + }, + { + "epoch": 0.9626253418413856, + "grad_norm": 1.495771050453186, + "learning_rate": 1.244446205892143e-05, + "loss": 0.5849282145500183, + "step": 330 + }, + { + "epoch": 0.965542388331814, + "grad_norm": 1.2046003341674805, + "learning_rate": 1.2395058045470935e-05, + "loss": 0.47758305072784424, + "step": 331 + }, + { + "epoch": 0.9684594348222425, + "grad_norm": 1.1362569332122803, + "learning_rate": 1.2345591938751772e-05, + "loss": 0.4490663409233093, + "step": 332 + }, + { + "epoch": 0.971376481312671, + "grad_norm": 1.2658129930496216, + "learning_rate": 1.2296065021201438e-05, + "loss": 0.4035309851169586, + "step": 333 + }, + { + "epoch": 0.9742935278030994, + "grad_norm": 4.370306015014648, + "learning_rate": 1.2246478576833993e-05, + "loss": 0.495273619890213, + "step": 334 + }, + { + "epoch": 0.9772105742935278, + "grad_norm": 1.3863654136657715, + "learning_rate": 1.219683389120676e-05, + "loss": 0.46410733461380005, + "step": 335 + }, + { + "epoch": 0.9801276207839562, + "grad_norm": 1.4544321298599243, + "learning_rate": 1.2147132251387004e-05, + "loss": 0.4301709830760956, + "step": 336 + }, + { + "epoch": 0.9830446672743847, + "grad_norm": 1.0852457284927368, + "learning_rate": 1.2097374945918554e-05, + "loss": 0.48892468214035034, + "step": 337 + }, + { + "epoch": 0.9859617137648131, + "grad_norm": 1.5062257051467896, + "learning_rate": 1.2047563264788412e-05, + "loss": 0.4667983055114746, + "step": 338 + }, + { + "epoch": 0.9888787602552416, + "grad_norm": 1.2472951412200928, + "learning_rate": 1.199769849939329e-05, + "loss": 0.4827345013618469, + "step": 339 + }, + { + "epoch": 0.99179580674567, + "grad_norm": 1.2589871883392334, + "learning_rate": 1.1947781942506151e-05, + "loss": 0.405245304107666, + "step": 340 + }, + { + "epoch": 0.9947128532360985, + "grad_norm": 1.25636625289917, + "learning_rate": 1.1897814888242679e-05, + "loss": 0.37956133484840393, + "step": 341 + }, + { + "epoch": 0.9976298997265269, + "grad_norm": 2.7064895629882812, + "learning_rate": 1.1847798632027726e-05, + "loss": 0.489456444978714, + "step": 342 + }, + { + "epoch": 1.0, + "grad_norm": 1.6156240701675415, + "learning_rate": 1.1797734470561744e-05, + "loss": 0.46473199129104614, + "step": 343 + }, + { + "epoch": 1.0029170464904285, + "grad_norm": 1.3046343326568604, + "learning_rate": 1.1747623701787143e-05, + "loss": 0.3504878282546997, + "step": 344 + }, + { + "epoch": 1.005834092980857, + "grad_norm": 1.414828896522522, + "learning_rate": 1.1697467624854666e-05, + "loss": 0.4719260334968567, + "step": 345 + }, + { + "epoch": 1.0087511394712854, + "grad_norm": 1.1873356103897095, + "learning_rate": 1.164726754008969e-05, + "loss": 0.45313555002212524, + "step": 346 + }, + { + "epoch": 1.0116681859617138, + "grad_norm": 1.1382380723953247, + "learning_rate": 1.1597024748958526e-05, + "loss": 0.4365478456020355, + "step": 347 + }, + { + "epoch": 1.0145852324521423, + "grad_norm": 1.8141961097717285, + "learning_rate": 1.1546740554034661e-05, + "loss": 0.3694503605365753, + "step": 348 + }, + { + "epoch": 1.0175022789425707, + "grad_norm": 1.333388328552246, + "learning_rate": 1.1496416258965015e-05, + "loss": 0.4755721688270569, + "step": 349 + }, + { + "epoch": 1.0204193254329992, + "grad_norm": 1.3464443683624268, + "learning_rate": 1.1446053168436117e-05, + "loss": 0.4227846562862396, + "step": 350 + }, + { + "epoch": 1.0204193254329992, + "eval_loss": 0.44924086332321167, + "eval_runtime": 1214.6648, + "eval_samples_per_second": 0.52, + "eval_steps_per_second": 0.52, + "step": 350 + }, + { + "epoch": 1.0233363719234276, + "grad_norm": 1.2682689428329468, + "learning_rate": 1.1395652588140292e-05, + "loss": 0.44300130009651184, + "step": 351 + }, + { + "epoch": 1.0262534184138559, + "grad_norm": 1.7737696170806885, + "learning_rate": 1.1345215824741814e-05, + "loss": 0.5106258988380432, + "step": 352 + }, + { + "epoch": 1.0291704649042843, + "grad_norm": 1.2601238489151, + "learning_rate": 1.1294744185843014e-05, + "loss": 0.45930635929107666, + "step": 353 + }, + { + "epoch": 1.0320875113947128, + "grad_norm": 1.2162678241729736, + "learning_rate": 1.1244238979950406e-05, + "loss": 0.44163084030151367, + "step": 354 + }, + { + "epoch": 1.0350045578851412, + "grad_norm": 1.0905817747116089, + "learning_rate": 1.1193701516440733e-05, + "loss": 0.510662317276001, + "step": 355 + }, + { + "epoch": 1.0379216043755697, + "grad_norm": 0.9624952673912048, + "learning_rate": 1.1143133105527048e-05, + "loss": 0.5297917127609253, + "step": 356 + }, + { + "epoch": 1.0408386508659981, + "grad_norm": 1.2757681608200073, + "learning_rate": 1.1092535058224725e-05, + "loss": 0.4332093596458435, + "step": 357 + }, + { + "epoch": 1.0437556973564266, + "grad_norm": 1.6885719299316406, + "learning_rate": 1.104190868631748e-05, + "loss": 0.4337635040283203, + "step": 358 + }, + { + "epoch": 1.046672743846855, + "grad_norm": 1.175484538078308, + "learning_rate": 1.099125530232336e-05, + "loss": 0.45411020517349243, + "step": 359 + }, + { + "epoch": 1.0495897903372835, + "grad_norm": 1.0964939594268799, + "learning_rate": 1.0940576219460723e-05, + "loss": 0.5333439707756042, + "step": 360 + }, + { + "epoch": 1.052506836827712, + "grad_norm": 1.5493136644363403, + "learning_rate": 1.0889872751614176e-05, + "loss": 0.4400906264781952, + "step": 361 + }, + { + "epoch": 1.0554238833181404, + "grad_norm": 1.2491416931152344, + "learning_rate": 1.0839146213300526e-05, + "loss": 0.31049978733062744, + "step": 362 + }, + { + "epoch": 1.0583409298085689, + "grad_norm": 1.7213693857192993, + "learning_rate": 1.0788397919634694e-05, + "loss": 0.389009028673172, + "step": 363 + }, + { + "epoch": 1.0612579762989973, + "grad_norm": 1.5405336618423462, + "learning_rate": 1.0737629186295621e-05, + "loss": 0.4068562984466553, + "step": 364 + }, + { + "epoch": 1.0641750227894258, + "grad_norm": 1.225455641746521, + "learning_rate": 1.0686841329492159e-05, + "loss": 0.47358617186546326, + "step": 365 + }, + { + "epoch": 1.0670920692798542, + "grad_norm": 1.3436250686645508, + "learning_rate": 1.0636035665928945e-05, + "loss": 0.47050854563713074, + "step": 366 + }, + { + "epoch": 1.0700091157702827, + "grad_norm": 1.4952112436294556, + "learning_rate": 1.058521351277227e-05, + "loss": 0.43496906757354736, + "step": 367 + }, + { + "epoch": 1.072926162260711, + "grad_norm": 1.549112319946289, + "learning_rate": 1.0534376187615924e-05, + "loss": 0.45711052417755127, + "step": 368 + }, + { + "epoch": 1.0758432087511394, + "grad_norm": 1.3851526975631714, + "learning_rate": 1.048352500844704e-05, + "loss": 0.45045915246009827, + "step": 369 + }, + { + "epoch": 1.0787602552415678, + "grad_norm": 1.6302049160003662, + "learning_rate": 1.0432661293611927e-05, + "loss": 0.3736046254634857, + "step": 370 + }, + { + "epoch": 1.0816773017319963, + "grad_norm": 1.3365869522094727, + "learning_rate": 1.0381786361781885e-05, + "loss": 0.42242100834846497, + "step": 371 + }, + { + "epoch": 1.0845943482224247, + "grad_norm": 1.4369138479232788, + "learning_rate": 1.0330901531919026e-05, + "loss": 0.44570961594581604, + "step": 372 + }, + { + "epoch": 1.0875113947128532, + "grad_norm": 1.3528283834457397, + "learning_rate": 1.0280008123242069e-05, + "loss": 0.43440738320350647, + "step": 373 + }, + { + "epoch": 1.0904284412032816, + "grad_norm": 1.469660997390747, + "learning_rate": 1.0229107455192147e-05, + "loss": 0.3960394263267517, + "step": 374 + }, + { + "epoch": 1.09334548769371, + "grad_norm": 1.4542185068130493, + "learning_rate": 1.0178200847398595e-05, + "loss": 0.47834208607673645, + "step": 375 + }, + { + "epoch": 1.0962625341841385, + "grad_norm": 1.6470292806625366, + "learning_rate": 1.0127289619644737e-05, + "loss": 0.42791086435317993, + "step": 376 + }, + { + "epoch": 1.099179580674567, + "grad_norm": 1.1934021711349487, + "learning_rate": 1.0076375091833681e-05, + "loss": 0.4401305019855499, + "step": 377 + }, + { + "epoch": 1.1020966271649955, + "grad_norm": 0.9786668419837952, + "learning_rate": 1.0025458583954078e-05, + "loss": 0.4816555678844452, + "step": 378 + }, + { + "epoch": 1.105013673655424, + "grad_norm": 1.1348779201507568, + "learning_rate": 9.974541416045924e-06, + "loss": 0.41516968607902527, + "step": 379 + }, + { + "epoch": 1.1079307201458524, + "grad_norm": 1.0188615322113037, + "learning_rate": 9.923624908166322e-06, + "loss": 0.48087278008461, + "step": 380 + }, + { + "epoch": 1.1108477666362808, + "grad_norm": 1.0821740627288818, + "learning_rate": 9.872710380355263e-06, + "loss": 0.41974008083343506, + "step": 381 + }, + { + "epoch": 1.1137648131267093, + "grad_norm": 1.250951886177063, + "learning_rate": 9.82179915260141e-06, + "loss": 0.42703643441200256, + "step": 382 + }, + { + "epoch": 1.1166818596171377, + "grad_norm": 1.4528254270553589, + "learning_rate": 9.770892544807856e-06, + "loss": 0.43801453709602356, + "step": 383 + }, + { + "epoch": 1.1195989061075662, + "grad_norm": 1.813859462738037, + "learning_rate": 9.719991876757934e-06, + "loss": 0.4344240725040436, + "step": 384 + }, + { + "epoch": 1.1225159525979946, + "grad_norm": 1.6681253910064697, + "learning_rate": 9.669098468080976e-06, + "loss": 0.4356998801231384, + "step": 385 + }, + { + "epoch": 1.125432999088423, + "grad_norm": 1.3447953462600708, + "learning_rate": 9.618213638218117e-06, + "loss": 0.43189188838005066, + "step": 386 + }, + { + "epoch": 1.1283500455788513, + "grad_norm": 1.9577926397323608, + "learning_rate": 9.567338706388074e-06, + "loss": 0.34984707832336426, + "step": 387 + }, + { + "epoch": 1.1312670920692798, + "grad_norm": 1.5225576162338257, + "learning_rate": 9.516474991552965e-06, + "loss": 0.4243963062763214, + "step": 388 + }, + { + "epoch": 1.1341841385597082, + "grad_norm": 1.7416809797286987, + "learning_rate": 9.46562381238408e-06, + "loss": 0.3414606750011444, + "step": 389 + }, + { + "epoch": 1.1371011850501367, + "grad_norm": 1.8358951807022095, + "learning_rate": 9.414786487227732e-06, + "loss": 0.387447327375412, + "step": 390 + }, + { + "epoch": 1.1400182315405651, + "grad_norm": 1.9706153869628906, + "learning_rate": 9.363964334071057e-06, + "loss": 0.4599088728427887, + "step": 391 + }, + { + "epoch": 1.1429352780309936, + "grad_norm": 1.0604286193847656, + "learning_rate": 9.313158670507843e-06, + "loss": 0.4633581042289734, + "step": 392 + }, + { + "epoch": 1.145852324521422, + "grad_norm": 1.4851202964782715, + "learning_rate": 9.262370813704379e-06, + "loss": 0.3872259557247162, + "step": 393 + }, + { + "epoch": 1.1487693710118505, + "grad_norm": 1.7839159965515137, + "learning_rate": 9.21160208036531e-06, + "loss": 0.5215944647789001, + "step": 394 + }, + { + "epoch": 1.151686417502279, + "grad_norm": 1.3054656982421875, + "learning_rate": 9.160853786699475e-06, + "loss": 0.4030425548553467, + "step": 395 + }, + { + "epoch": 1.1546034639927074, + "grad_norm": 3.8467981815338135, + "learning_rate": 9.110127248385827e-06, + "loss": 0.4032524824142456, + "step": 396 + }, + { + "epoch": 1.1575205104831359, + "grad_norm": 1.8513801097869873, + "learning_rate": 9.05942378053928e-06, + "loss": 0.46577155590057373, + "step": 397 + }, + { + "epoch": 1.1604375569735643, + "grad_norm": 1.312689185142517, + "learning_rate": 9.008744697676642e-06, + "loss": 0.39114487171173096, + "step": 398 + }, + { + "epoch": 1.1633546034639928, + "grad_norm": 1.1996328830718994, + "learning_rate": 8.958091313682521e-06, + "loss": 0.481199711561203, + "step": 399 + }, + { + "epoch": 1.1662716499544212, + "grad_norm": 5.172409534454346, + "learning_rate": 8.90746494177528e-06, + "loss": 0.3803558945655823, + "step": 400 + }, + { + "epoch": 1.1662716499544212, + "eval_loss": 0.4318464398384094, + "eval_runtime": 1206.0306, + "eval_samples_per_second": 0.524, + "eval_steps_per_second": 0.524, + "step": 400 + }, + { + "epoch": 1.1691886964448497, + "grad_norm": 1.0115015506744385, + "learning_rate": 8.856866894472954e-06, + "loss": 0.39636704325675964, + "step": 401 + }, + { + "epoch": 1.172105742935278, + "grad_norm": 1.1557435989379883, + "learning_rate": 8.806298483559268e-06, + "loss": 0.4076298475265503, + "step": 402 + }, + { + "epoch": 1.1750227894257064, + "grad_norm": 1.2802515029907227, + "learning_rate": 8.755761020049597e-06, + "loss": 0.44352248311042786, + "step": 403 + }, + { + "epoch": 1.1779398359161348, + "grad_norm": 1.2755069732666016, + "learning_rate": 8.705255814156988e-06, + "loss": 0.390497624874115, + "step": 404 + }, + { + "epoch": 1.1808568824065633, + "grad_norm": 1.2799782752990723, + "learning_rate": 8.654784175258188e-06, + "loss": 0.35810694098472595, + "step": 405 + }, + { + "epoch": 1.1837739288969917, + "grad_norm": 1.0968674421310425, + "learning_rate": 8.604347411859713e-06, + "loss": 0.3890265226364136, + "step": 406 + }, + { + "epoch": 1.1866909753874202, + "grad_norm": 1.3334455490112305, + "learning_rate": 8.553946831563886e-06, + "loss": 0.3916901648044586, + "step": 407 + }, + { + "epoch": 1.1896080218778486, + "grad_norm": 1.1888184547424316, + "learning_rate": 8.503583741034988e-06, + "loss": 0.5231326222419739, + "step": 408 + }, + { + "epoch": 1.192525068368277, + "grad_norm": 1.1163763999938965, + "learning_rate": 8.45325944596534e-06, + "loss": 0.4249858558177948, + "step": 409 + }, + { + "epoch": 1.1954421148587056, + "grad_norm": 1.3470333814620972, + "learning_rate": 8.40297525104148e-06, + "loss": 0.5201632380485535, + "step": 410 + }, + { + "epoch": 1.198359161349134, + "grad_norm": 1.5412285327911377, + "learning_rate": 8.35273245991031e-06, + "loss": 0.39376699924468994, + "step": 411 + }, + { + "epoch": 1.2012762078395625, + "grad_norm": 1.3408735990524292, + "learning_rate": 8.302532375145339e-06, + "loss": 0.39554283022880554, + "step": 412 + }, + { + "epoch": 1.204193254329991, + "grad_norm": 1.990668773651123, + "learning_rate": 8.25237629821286e-06, + "loss": 0.42424261569976807, + "step": 413 + }, + { + "epoch": 1.2071103008204194, + "grad_norm": 1.6471989154815674, + "learning_rate": 8.202265529438259e-06, + "loss": 0.3234582543373108, + "step": 414 + }, + { + "epoch": 1.2100273473108478, + "grad_norm": 1.1483631134033203, + "learning_rate": 8.152201367972275e-06, + "loss": 0.39163246750831604, + "step": 415 + }, + { + "epoch": 1.2129443938012763, + "grad_norm": 1.800149917602539, + "learning_rate": 8.102185111757323e-06, + "loss": 0.5055042505264282, + "step": 416 + }, + { + "epoch": 1.2158614402917047, + "grad_norm": 1.4394795894622803, + "learning_rate": 8.052218057493849e-06, + "loss": 0.4761751592159271, + "step": 417 + }, + { + "epoch": 1.2187784867821332, + "grad_norm": 1.622689962387085, + "learning_rate": 8.002301500606715e-06, + "loss": 0.4490141272544861, + "step": 418 + }, + { + "epoch": 1.2216955332725616, + "grad_norm": 1.2564961910247803, + "learning_rate": 7.952436735211593e-06, + "loss": 0.3964035212993622, + "step": 419 + }, + { + "epoch": 1.22461257976299, + "grad_norm": 1.3248411417007446, + "learning_rate": 7.902625054081449e-06, + "loss": 0.46039122343063354, + "step": 420 + }, + { + "epoch": 1.2275296262534183, + "grad_norm": 1.568983793258667, + "learning_rate": 7.852867748613e-06, + "loss": 0.49916595220565796, + "step": 421 + }, + { + "epoch": 1.2304466727438468, + "grad_norm": 1.4784491062164307, + "learning_rate": 7.803166108793243e-06, + "loss": 0.4035068154335022, + "step": 422 + }, + { + "epoch": 1.2333637192342752, + "grad_norm": 1.2940057516098022, + "learning_rate": 7.753521423166007e-06, + "loss": 0.4154140055179596, + "step": 423 + }, + { + "epoch": 1.2362807657247037, + "grad_norm": 1.167786717414856, + "learning_rate": 7.703934978798565e-06, + "loss": 0.39541637897491455, + "step": 424 + }, + { + "epoch": 1.2391978122151321, + "grad_norm": 1.5126771926879883, + "learning_rate": 7.65440806124823e-06, + "loss": 0.37744253873825073, + "step": 425 + }, + { + "epoch": 1.2421148587055606, + "grad_norm": 1.2595263719558716, + "learning_rate": 7.604941954529067e-06, + "loss": 0.46380615234375, + "step": 426 + }, + { + "epoch": 1.245031905195989, + "grad_norm": 1.4258298873901367, + "learning_rate": 7.555537941078573e-06, + "loss": 0.3391319513320923, + "step": 427 + }, + { + "epoch": 1.2479489516864175, + "grad_norm": 1.5371774435043335, + "learning_rate": 7.506197301724446e-06, + "loss": 0.39805102348327637, + "step": 428 + }, + { + "epoch": 1.250865998176846, + "grad_norm": 1.3789173364639282, + "learning_rate": 7.456921315651371e-06, + "loss": 0.37969034910202026, + "step": 429 + }, + { + "epoch": 1.2537830446672744, + "grad_norm": 1.32931649684906, + "learning_rate": 7.407711260367867e-06, + "loss": 0.3841526508331299, + "step": 430 + }, + { + "epoch": 1.2567000911577029, + "grad_norm": 1.2836817502975464, + "learning_rate": 7.358568411673145e-06, + "loss": 0.340289443731308, + "step": 431 + }, + { + "epoch": 1.2596171376481313, + "grad_norm": 1.0418318510055542, + "learning_rate": 7.309494043624059e-06, + "loss": 0.44747158885002136, + "step": 432 + }, + { + "epoch": 1.2625341841385598, + "grad_norm": 1.1769362688064575, + "learning_rate": 7.260489428502058e-06, + "loss": 0.45737382769584656, + "step": 433 + }, + { + "epoch": 1.265451230628988, + "grad_norm": 2.2730748653411865, + "learning_rate": 7.211555836780203e-06, + "loss": 0.3827931582927704, + "step": 434 + }, + { + "epoch": 1.2683682771194165, + "grad_norm": 1.263096809387207, + "learning_rate": 7.162694537090235e-06, + "loss": 0.3589435815811157, + "step": 435 + }, + { + "epoch": 1.271285323609845, + "grad_norm": 1.4073514938354492, + "learning_rate": 7.113906796189692e-06, + "loss": 0.45206642150878906, + "step": 436 + }, + { + "epoch": 1.2742023701002734, + "grad_norm": 1.064585566520691, + "learning_rate": 7.0651938789290306e-06, + "loss": 0.5409261584281921, + "step": 437 + }, + { + "epoch": 1.2771194165907018, + "grad_norm": 1.2346999645233154, + "learning_rate": 7.016557048218889e-06, + "loss": 0.40680158138275146, + "step": 438 + }, + { + "epoch": 1.2800364630811303, + "grad_norm": 1.5816547870635986, + "learning_rate": 6.967997564997306e-06, + "loss": 0.38718655705451965, + "step": 439 + }, + { + "epoch": 1.2829535095715587, + "grad_norm": 1.085268259048462, + "learning_rate": 6.919516688197041e-06, + "loss": 0.4863276779651642, + "step": 440 + }, + { + "epoch": 1.2858705560619872, + "grad_norm": 1.0984629392623901, + "learning_rate": 6.871115674712937e-06, + "loss": 0.39562875032424927, + "step": 441 + }, + { + "epoch": 1.2887876025524156, + "grad_norm": 1.3004229068756104, + "learning_rate": 6.822795779369339e-06, + "loss": 0.44437694549560547, + "step": 442 + }, + { + "epoch": 1.291704649042844, + "grad_norm": 1.3541183471679688, + "learning_rate": 6.774558254887553e-06, + "loss": 0.4728967249393463, + "step": 443 + }, + { + "epoch": 1.2946216955332726, + "grad_norm": 1.2485377788543701, + "learning_rate": 6.7264043518533695e-06, + "loss": 0.4052809476852417, + "step": 444 + }, + { + "epoch": 1.297538742023701, + "grad_norm": 1.412827730178833, + "learning_rate": 6.67833531868465e-06, + "loss": 0.40149861574172974, + "step": 445 + }, + { + "epoch": 1.3004557885141295, + "grad_norm": 1.5576224327087402, + "learning_rate": 6.630352401598953e-06, + "loss": 0.44107240438461304, + "step": 446 + }, + { + "epoch": 1.303372835004558, + "grad_norm": 1.1551047563552856, + "learning_rate": 6.582456844581226e-06, + "loss": 0.4898405969142914, + "step": 447 + }, + { + "epoch": 1.3062898814949864, + "grad_norm": 1.9939689636230469, + "learning_rate": 6.5346498893515645e-06, + "loss": 0.4791329801082611, + "step": 448 + }, + { + "epoch": 1.3092069279854148, + "grad_norm": 1.4782553911209106, + "learning_rate": 6.486932775333002e-06, + "loss": 0.472908616065979, + "step": 449 + }, + { + "epoch": 1.3121239744758433, + "grad_norm": 1.2496148347854614, + "learning_rate": 6.439306739619387e-06, + "loss": 0.514995276927948, + "step": 450 + }, + { + "epoch": 1.3121239744758433, + "eval_loss": 0.4178673028945923, + "eval_runtime": 1197.5534, + "eval_samples_per_second": 0.528, + "eval_steps_per_second": 0.528, + "step": 450 + }, + { + "epoch": 1.3150410209662717, + "grad_norm": 1.3996772766113281, + "learning_rate": 6.391773016943316e-06, + "loss": 0.4087896943092346, + "step": 451 + }, + { + "epoch": 1.3179580674567002, + "grad_norm": 1.20390784740448, + "learning_rate": 6.344332839644111e-06, + "loss": 0.43224579095840454, + "step": 452 + }, + { + "epoch": 1.3208751139471286, + "grad_norm": 1.2709496021270752, + "learning_rate": 6.296987437635876e-06, + "loss": 0.44104093313217163, + "step": 453 + }, + { + "epoch": 1.323792160437557, + "grad_norm": 1.0112334489822388, + "learning_rate": 6.249738038375618e-06, + "loss": 0.47084498405456543, + "step": 454 + }, + { + "epoch": 1.3267092069279856, + "grad_norm": 1.0771515369415283, + "learning_rate": 6.202585866831411e-06, + "loss": 0.4700928032398224, + "step": 455 + }, + { + "epoch": 1.3296262534184138, + "grad_norm": 1.4937143325805664, + "learning_rate": 6.15553214545064e-06, + "loss": 0.345747709274292, + "step": 456 + }, + { + "epoch": 1.3325432999088422, + "grad_norm": 1.1348456144332886, + "learning_rate": 6.108578094128321e-06, + "loss": 0.33824583888053894, + "step": 457 + }, + { + "epoch": 1.3354603463992707, + "grad_norm": 1.2502707242965698, + "learning_rate": 6.061724930175461e-06, + "loss": 0.3528832197189331, + "step": 458 + }, + { + "epoch": 1.3383773928896991, + "grad_norm": 1.5359619855880737, + "learning_rate": 6.014973868287504e-06, + "loss": 0.4413869082927704, + "step": 459 + }, + { + "epoch": 1.3412944393801276, + "grad_norm": 0.9747081398963928, + "learning_rate": 5.9683261205128395e-06, + "loss": 0.6849499940872192, + "step": 460 + }, + { + "epoch": 1.344211485870556, + "grad_norm": 1.3150533437728882, + "learning_rate": 5.921782896221383e-06, + "loss": 0.3901931047439575, + "step": 461 + }, + { + "epoch": 1.3471285323609845, + "grad_norm": 1.137770652770996, + "learning_rate": 5.875345402073207e-06, + "loss": 0.37498384714126587, + "step": 462 + }, + { + "epoch": 1.350045578851413, + "grad_norm": 1.2216367721557617, + "learning_rate": 5.829014841987277e-06, + "loss": 0.3874579966068268, + "step": 463 + }, + { + "epoch": 1.3529626253418414, + "grad_norm": 1.135439157485962, + "learning_rate": 5.782792417110233e-06, + "loss": 0.384797066450119, + "step": 464 + }, + { + "epoch": 1.3558796718322699, + "grad_norm": 1.2400696277618408, + "learning_rate": 5.736679325785239e-06, + "loss": 0.46303266286849976, + "step": 465 + }, + { + "epoch": 1.3587967183226983, + "grad_norm": 1.8848882913589478, + "learning_rate": 5.6906767635209304e-06, + "loss": 0.5068309903144836, + "step": 466 + }, + { + "epoch": 1.3617137648131268, + "grad_norm": 1.4707008600234985, + "learning_rate": 5.644785922960412e-06, + "loss": 0.364332914352417, + "step": 467 + }, + { + "epoch": 1.364630811303555, + "grad_norm": 2.4436841011047363, + "learning_rate": 5.599007993850329e-06, + "loss": 0.485107421875, + "step": 468 + }, + { + "epoch": 1.3675478577939835, + "grad_norm": 1.1924740076065063, + "learning_rate": 5.553344163010039e-06, + "loss": 0.34547489881515503, + "step": 469 + }, + { + "epoch": 1.370464904284412, + "grad_norm": 1.1255877017974854, + "learning_rate": 5.507795614300846e-06, + "loss": 0.39645254611968994, + "step": 470 + }, + { + "epoch": 1.3733819507748404, + "grad_norm": 1.0937018394470215, + "learning_rate": 5.4623635285952815e-06, + "loss": 0.4267856478691101, + "step": 471 + }, + { + "epoch": 1.3762989972652688, + "grad_norm": 1.3355520963668823, + "learning_rate": 5.417049083746513e-06, + "loss": 0.3669992983341217, + "step": 472 + }, + { + "epoch": 1.3792160437556973, + "grad_norm": 1.7302504777908325, + "learning_rate": 5.3718534545578035e-06, + "loss": 0.3873697519302368, + "step": 473 + }, + { + "epoch": 1.3821330902461257, + "grad_norm": 1.17263662815094, + "learning_rate": 5.326777812752041e-06, + "loss": 0.4581540524959564, + "step": 474 + }, + { + "epoch": 1.3850501367365542, + "grad_norm": 1.0998128652572632, + "learning_rate": 5.281823326941377e-06, + "loss": 0.43062761425971985, + "step": 475 + }, + { + "epoch": 1.3879671832269826, + "grad_norm": 1.1194556951522827, + "learning_rate": 5.236991162596932e-06, + "loss": 0.381741464138031, + "step": 476 + }, + { + "epoch": 1.390884229717411, + "grad_norm": 1.2759051322937012, + "learning_rate": 5.19228248201856e-06, + "loss": 0.49175748229026794, + "step": 477 + }, + { + "epoch": 1.3938012762078396, + "grad_norm": 1.2134747505187988, + "learning_rate": 5.147698444304732e-06, + "loss": 0.4997562766075134, + "step": 478 + }, + { + "epoch": 1.396718322698268, + "grad_norm": 1.0833078622817993, + "learning_rate": 5.1032402053224804e-06, + "loss": 0.42580488324165344, + "step": 479 + }, + { + "epoch": 1.3996353691886965, + "grad_norm": 1.4838510751724243, + "learning_rate": 5.058908917677426e-06, + "loss": 0.5015593767166138, + "step": 480 + }, + { + "epoch": 1.402552415679125, + "grad_norm": 1.218610167503357, + "learning_rate": 5.014705730683904e-06, + "loss": 0.34739193320274353, + "step": 481 + }, + { + "epoch": 1.4054694621695534, + "grad_norm": 1.1883307695388794, + "learning_rate": 4.970631790335181e-06, + "loss": 0.41708022356033325, + "step": 482 + }, + { + "epoch": 1.4083865086599818, + "grad_norm": 1.209291696548462, + "learning_rate": 4.926688239273713e-06, + "loss": 0.43546172976493835, + "step": 483 + }, + { + "epoch": 1.4113035551504103, + "grad_norm": 1.0801606178283691, + "learning_rate": 4.882876216761543e-06, + "loss": 0.44491735100746155, + "step": 484 + }, + { + "epoch": 1.4142206016408387, + "grad_norm": 1.2746628522872925, + "learning_rate": 4.839196858650763e-06, + "loss": 0.436122864484787, + "step": 485 + }, + { + "epoch": 1.4171376481312672, + "grad_norm": 1.4465962648391724, + "learning_rate": 4.795651297354056e-06, + "loss": 0.3750447630882263, + "step": 486 + }, + { + "epoch": 1.4200546946216956, + "grad_norm": 1.6736211776733398, + "learning_rate": 4.752240661815346e-06, + "loss": 0.38286519050598145, + "step": 487 + }, + { + "epoch": 1.422971741112124, + "grad_norm": 1.1946996450424194, + "learning_rate": 4.708966077480544e-06, + "loss": 0.4488063156604767, + "step": 488 + }, + { + "epoch": 1.4258887876025526, + "grad_norm": 1.42599356174469, + "learning_rate": 4.665828666268335e-06, + "loss": 0.44088613986968994, + "step": 489 + }, + { + "epoch": 1.4288058340929808, + "grad_norm": 1.2281016111373901, + "learning_rate": 4.622829546541121e-06, + "loss": 0.4030645489692688, + "step": 490 + }, + { + "epoch": 1.4317228805834092, + "grad_norm": 1.2875670194625854, + "learning_rate": 4.57996983307602e-06, + "loss": 0.44702020287513733, + "step": 491 + }, + { + "epoch": 1.4346399270738377, + "grad_norm": 1.2456860542297363, + "learning_rate": 4.537250637035947e-06, + "loss": 0.4067370593547821, + "step": 492 + }, + { + "epoch": 1.4375569735642661, + "grad_norm": 1.2822725772857666, + "learning_rate": 4.494673065940833e-06, + "loss": 0.4237740635871887, + "step": 493 + }, + { + "epoch": 1.4404740200546946, + "grad_norm": 1.5517818927764893, + "learning_rate": 4.452238223638906e-06, + "loss": 0.40579724311828613, + "step": 494 + }, + { + "epoch": 1.443391066545123, + "grad_norm": 1.275344967842102, + "learning_rate": 4.409947210278056e-06, + "loss": 0.38880717754364014, + "step": 495 + }, + { + "epoch": 1.4463081130355515, + "grad_norm": 1.22952139377594, + "learning_rate": 4.367801122277327e-06, + "loss": 0.4042310416698456, + "step": 496 + }, + { + "epoch": 1.44922515952598, + "grad_norm": 1.122261643409729, + "learning_rate": 4.325801052298493e-06, + "loss": 0.5408368110656738, + "step": 497 + }, + { + "epoch": 1.4521422060164084, + "grad_norm": 1.5885361433029175, + "learning_rate": 4.283948089217715e-06, + "loss": 0.37697717547416687, + "step": 498 + }, + { + "epoch": 1.4550592525068369, + "grad_norm": 2.3565149307250977, + "learning_rate": 4.242243318097338e-06, + "loss": 0.3811529576778412, + "step": 499 + }, + { + "epoch": 1.4579762989972653, + "grad_norm": 1.1944137811660767, + "learning_rate": 4.200687820157735e-06, + "loss": 0.414781391620636, + "step": 500 + }, + { + "epoch": 1.4579762989972653, + "eval_loss": 0.40706494450569153, + "eval_runtime": 1189.1593, + "eval_samples_per_second": 0.531, + "eval_steps_per_second": 0.531, + "step": 500 + }, + { + "epoch": 1.4608933454876938, + "grad_norm": 1.0442464351654053, + "learning_rate": 4.159282672749289e-06, + "loss": 0.38155990839004517, + "step": 501 + }, + { + "epoch": 1.463810391978122, + "grad_norm": 1.7274727821350098, + "learning_rate": 4.118028949324453e-06, + "loss": 0.4830601215362549, + "step": 502 + }, + { + "epoch": 1.4667274384685505, + "grad_norm": 2.064513921737671, + "learning_rate": 4.0769277194099345e-06, + "loss": 0.3975123167037964, + "step": 503 + }, + { + "epoch": 1.469644484958979, + "grad_norm": 1.7695534229278564, + "learning_rate": 4.035980048578942e-06, + "loss": 0.37033841013908386, + "step": 504 + }, + { + "epoch": 1.4725615314494074, + "grad_norm": 1.4455046653747559, + "learning_rate": 3.995186998423597e-06, + "loss": 0.39567673206329346, + "step": 505 + }, + { + "epoch": 1.4754785779398358, + "grad_norm": 1.1791958808898926, + "learning_rate": 3.9545496265273765e-06, + "loss": 0.44786664843559265, + "step": 506 + }, + { + "epoch": 1.4783956244302643, + "grad_norm": 2.0874717235565186, + "learning_rate": 3.9140689864377105e-06, + "loss": 0.3333263099193573, + "step": 507 + }, + { + "epoch": 1.4813126709206927, + "grad_norm": 1.5897501707077026, + "learning_rate": 3.873746127638668e-06, + "loss": 0.5105943083763123, + "step": 508 + }, + { + "epoch": 1.4842297174111212, + "grad_norm": 1.5059760808944702, + "learning_rate": 3.833582095523749e-06, + "loss": 0.43922683596611023, + "step": 509 + }, + { + "epoch": 1.4871467639015497, + "grad_norm": 1.379347562789917, + "learning_rate": 3.7935779313687648e-06, + "loss": 0.4584790766239166, + "step": 510 + }, + { + "epoch": 1.490063810391978, + "grad_norm": 1.0984690189361572, + "learning_rate": 3.7537346723048816e-06, + "loss": 0.5217512249946594, + "step": 511 + }, + { + "epoch": 1.4929808568824066, + "grad_norm": 1.5944225788116455, + "learning_rate": 3.71405335129169e-06, + "loss": 0.4180052876472473, + "step": 512 + }, + { + "epoch": 1.495897903372835, + "grad_norm": 1.2745033502578735, + "learning_rate": 3.6745349970904465e-06, + "loss": 0.4584833085536957, + "step": 513 + }, + { + "epoch": 1.4988149498632635, + "grad_norm": 1.2746814489364624, + "learning_rate": 3.6351806342374007e-06, + "loss": 0.3202287554740906, + "step": 514 + }, + { + "epoch": 1.501731996353692, + "grad_norm": 1.409638524055481, + "learning_rate": 3.5959912830172348e-06, + "loss": 0.37963351607322693, + "step": 515 + }, + { + "epoch": 1.5046490428441204, + "grad_norm": 1.1655553579330444, + "learning_rate": 3.556967959436591e-06, + "loss": 0.43133026361465454, + "step": 516 + }, + { + "epoch": 1.5075660893345488, + "grad_norm": 1.0495020151138306, + "learning_rate": 3.518111675197776e-06, + "loss": 0.3739299178123474, + "step": 517 + }, + { + "epoch": 1.5104831358249773, + "grad_norm": 1.3055057525634766, + "learning_rate": 3.4794234376724835e-06, + "loss": 0.4099601209163666, + "step": 518 + }, + { + "epoch": 1.5134001823154057, + "grad_norm": 1.2252463102340698, + "learning_rate": 3.4409042498757084e-06, + "loss": 0.380616158246994, + "step": 519 + }, + { + "epoch": 1.5163172288058342, + "grad_norm": 1.2728638648986816, + "learning_rate": 3.4025551104397294e-06, + "loss": 0.3510003685951233, + "step": 520 + }, + { + "epoch": 1.5192342752962626, + "grad_norm": 2.70664644241333, + "learning_rate": 3.3643770135882282e-06, + "loss": 0.4087940752506256, + "step": 521 + }, + { + "epoch": 1.522151321786691, + "grad_norm": 1.6197112798690796, + "learning_rate": 3.3263709491104933e-06, + "loss": 0.45614126324653625, + "step": 522 + }, + { + "epoch": 1.5250683682771196, + "grad_norm": 1.3596103191375732, + "learning_rate": 3.2885379023357956e-06, + "loss": 0.3824586272239685, + "step": 523 + }, + { + "epoch": 1.527985414767548, + "grad_norm": 1.1768635511398315, + "learning_rate": 3.2508788541078097e-06, + "loss": 0.47717779874801636, + "step": 524 + }, + { + "epoch": 1.5309024612579762, + "grad_norm": 1.669474482536316, + "learning_rate": 3.2133947807591958e-06, + "loss": 0.4013281762599945, + "step": 525 + }, + { + "epoch": 1.5338195077484047, + "grad_norm": 1.600868582725525, + "learning_rate": 3.1760866540862932e-06, + "loss": 0.367280513048172, + "step": 526 + }, + { + "epoch": 1.5367365542388332, + "grad_norm": 1.1689515113830566, + "learning_rate": 3.138955441323923e-06, + "loss": 0.4432409405708313, + "step": 527 + }, + { + "epoch": 1.5396536007292616, + "grad_norm": 2.361961603164673, + "learning_rate": 3.1020021051202973e-06, + "loss": 0.4219942092895508, + "step": 528 + }, + { + "epoch": 1.54257064721969, + "grad_norm": 1.1962230205535889, + "learning_rate": 3.0652276035120964e-06, + "loss": 0.3672596514225006, + "step": 529 + }, + { + "epoch": 1.5454876937101185, + "grad_norm": 1.4149441719055176, + "learning_rate": 3.0286328898995963e-06, + "loss": 0.42919260263442993, + "step": 530 + }, + { + "epoch": 1.548404740200547, + "grad_norm": 1.2668434381484985, + "learning_rate": 2.992218913021966e-06, + "loss": 0.4499061107635498, + "step": 531 + }, + { + "epoch": 1.5513217866909754, + "grad_norm": 1.268114686012268, + "learning_rate": 2.9559866169326734e-06, + "loss": 0.34660714864730835, + "step": 532 + }, + { + "epoch": 1.5542388331814039, + "grad_norm": 1.0086419582366943, + "learning_rate": 2.919936940975007e-06, + "loss": 0.38239023089408875, + "step": 533 + }, + { + "epoch": 1.557155879671832, + "grad_norm": 1.0700170993804932, + "learning_rate": 2.884070819757712e-06, + "loss": 0.48240017890930176, + "step": 534 + }, + { + "epoch": 1.5600729261622606, + "grad_norm": 1.2101227045059204, + "learning_rate": 2.8483891831307873e-06, + "loss": 0.4098761975765228, + "step": 535 + }, + { + "epoch": 1.562989972652689, + "grad_norm": 1.2731400728225708, + "learning_rate": 2.8128929561613505e-06, + "loss": 0.45641395449638367, + "step": 536 + }, + { + "epoch": 1.5659070191431175, + "grad_norm": 1.1474392414093018, + "learning_rate": 2.777583059109671e-06, + "loss": 0.42283985018730164, + "step": 537 + }, + { + "epoch": 1.568824065633546, + "grad_norm": 1.789881944656372, + "learning_rate": 2.7424604074053028e-06, + "loss": 0.3469158113002777, + "step": 538 + }, + { + "epoch": 1.5717411121239744, + "grad_norm": 1.3426933288574219, + "learning_rate": 2.707525911623362e-06, + "loss": 0.35837510228157043, + "step": 539 + }, + { + "epoch": 1.5746581586144028, + "grad_norm": 1.2343578338623047, + "learning_rate": 2.672780477460901e-06, + "loss": 0.4736083745956421, + "step": 540 + }, + { + "epoch": 1.5775752051048313, + "grad_norm": 1.516298770904541, + "learning_rate": 2.638225005713457e-06, + "loss": 0.34345340728759766, + "step": 541 + }, + { + "epoch": 1.5804922515952597, + "grad_norm": 1.1488829851150513, + "learning_rate": 2.6038603922516705e-06, + "loss": 0.4134179949760437, + "step": 542 + }, + { + "epoch": 1.5834092980856882, + "grad_norm": 1.4486491680145264, + "learning_rate": 2.569687527998073e-06, + "loss": 0.3297592103481293, + "step": 543 + }, + { + "epoch": 1.5863263445761167, + "grad_norm": 1.272691011428833, + "learning_rate": 2.5357072989039855e-06, + "loss": 0.3958476185798645, + "step": 544 + }, + { + "epoch": 1.589243391066545, + "grad_norm": 1.244240641593933, + "learning_rate": 2.501920585926555e-06, + "loss": 0.4125611186027527, + "step": 545 + }, + { + "epoch": 1.5921604375569736, + "grad_norm": 1.5844073295593262, + "learning_rate": 2.4683282650058992e-06, + "loss": 0.3762253224849701, + "step": 546 + }, + { + "epoch": 1.595077484047402, + "grad_norm": 1.8209946155548096, + "learning_rate": 2.4349312070424258e-06, + "loss": 0.37053319811820984, + "step": 547 + }, + { + "epoch": 1.5979945305378305, + "grad_norm": 1.3752915859222412, + "learning_rate": 2.4017302778742247e-06, + "loss": 0.5004774332046509, + "step": 548 + }, + { + "epoch": 1.600911577028259, + "grad_norm": 5.143753528594971, + "learning_rate": 2.36872633825464e-06, + "loss": 0.39014023542404175, + "step": 549 + }, + { + "epoch": 1.6038286235186874, + "grad_norm": 1.0730944871902466, + "learning_rate": 2.335920243829941e-06, + "loss": 0.378440260887146, + "step": 550 + }, + { + "epoch": 1.6038286235186874, + "eval_loss": 0.40037089586257935, + "eval_runtime": 893.7411, + "eval_samples_per_second": 0.707, + "eval_steps_per_second": 0.707, + "step": 550 + }, + { + "epoch": 1.6067456700091158, + "grad_norm": 1.5507797002792358, + "learning_rate": 2.3033128451171548e-06, + "loss": 0.4471960663795471, + "step": 551 + }, + { + "epoch": 1.6096627164995443, + "grad_norm": 1.9462968111038208, + "learning_rate": 2.2709049874819924e-06, + "loss": 0.3658301830291748, + "step": 552 + }, + { + "epoch": 1.6125797629899727, + "grad_norm": 1.2034238576889038, + "learning_rate": 2.238697511116962e-06, + "loss": 0.3911179304122925, + "step": 553 + }, + { + "epoch": 1.6154968094804012, + "grad_norm": 1.3574327230453491, + "learning_rate": 2.2066912510195636e-06, + "loss": 0.3998897671699524, + "step": 554 + }, + { + "epoch": 1.6184138559708297, + "grad_norm": 1.1973012685775757, + "learning_rate": 2.1748870369706507e-06, + "loss": 0.38577449321746826, + "step": 555 + }, + { + "epoch": 1.621330902461258, + "grad_norm": 1.9365874528884888, + "learning_rate": 2.1432856935129144e-06, + "loss": 0.411307156085968, + "step": 556 + }, + { + "epoch": 1.6242479489516866, + "grad_norm": 1.3558642864227295, + "learning_rate": 2.1118880399295106e-06, + "loss": 0.38424253463745117, + "step": 557 + }, + { + "epoch": 1.627164995442115, + "grad_norm": 1.4368890523910522, + "learning_rate": 2.0806948902228075e-06, + "loss": 0.39943546056747437, + "step": 558 + }, + { + "epoch": 1.6300820419325432, + "grad_norm": 1.6266753673553467, + "learning_rate": 2.0497070530933084e-06, + "loss": 0.36787641048431396, + "step": 559 + }, + { + "epoch": 1.6329990884229717, + "grad_norm": 1.2600938081741333, + "learning_rate": 2.0189253319186576e-06, + "loss": 0.3781934380531311, + "step": 560 + }, + { + "epoch": 1.6359161349134002, + "grad_norm": 1.975071907043457, + "learning_rate": 1.9883505247328237e-06, + "loss": 0.4132305383682251, + "step": 561 + }, + { + "epoch": 1.6388331814038286, + "grad_norm": 1.4095909595489502, + "learning_rate": 1.9579834242054154e-06, + "loss": 0.3727574646472931, + "step": 562 + }, + { + "epoch": 1.641750227894257, + "grad_norm": 1.4271371364593506, + "learning_rate": 1.9278248176211243e-06, + "loss": 0.33786773681640625, + "step": 563 + }, + { + "epoch": 1.6446672743846855, + "grad_norm": 1.5907646417617798, + "learning_rate": 1.8978754868593074e-06, + "loss": 0.33035099506378174, + "step": 564 + }, + { + "epoch": 1.647584320875114, + "grad_norm": 1.1315702199935913, + "learning_rate": 1.8681362083737387e-06, + "loss": 0.41707149147987366, + "step": 565 + }, + { + "epoch": 1.6505013673655424, + "grad_norm": 1.4737143516540527, + "learning_rate": 1.8386077531724556e-06, + "loss": 0.43079230189323425, + "step": 566 + }, + { + "epoch": 1.6534184138559709, + "grad_norm": 1.1006760597229004, + "learning_rate": 1.8092908867977822e-06, + "loss": 0.3524904251098633, + "step": 567 + }, + { + "epoch": 1.6563354603463991, + "grad_norm": 1.4066118001937866, + "learning_rate": 1.780186369306479e-06, + "loss": 0.3695681691169739, + "step": 568 + }, + { + "epoch": 1.6592525068368276, + "grad_norm": 1.6444640159606934, + "learning_rate": 1.7512949552500412e-06, + "loss": 0.35596007108688354, + "step": 569 + }, + { + "epoch": 1.662169553327256, + "grad_norm": 1.159480094909668, + "learning_rate": 1.7226173936551282e-06, + "loss": 0.4520571827888489, + "step": 570 + }, + { + "epoch": 1.6650865998176845, + "grad_norm": 1.5874221324920654, + "learning_rate": 1.6941544280041567e-06, + "loss": 0.4702282249927521, + "step": 571 + }, + { + "epoch": 1.668003646308113, + "grad_norm": 1.6153535842895508, + "learning_rate": 1.6659067962160157e-06, + "loss": 0.3803800046443939, + "step": 572 + }, + { + "epoch": 1.6709206927985414, + "grad_norm": 1.0748940706253052, + "learning_rate": 1.6378752306269386e-06, + "loss": 0.4368419051170349, + "step": 573 + }, + { + "epoch": 1.6738377392889698, + "grad_norm": 1.5286788940429688, + "learning_rate": 1.6100604579715185e-06, + "loss": 0.4195623993873596, + "step": 574 + }, + { + "epoch": 1.6767547857793983, + "grad_norm": 1.1433510780334473, + "learning_rate": 1.5824631993638651e-06, + "loss": 0.4366849660873413, + "step": 575 + }, + { + "epoch": 1.6796718322698267, + "grad_norm": 1.9694907665252686, + "learning_rate": 1.5550841702789122e-06, + "loss": 0.5555303692817688, + "step": 576 + }, + { + "epoch": 1.6825888787602552, + "grad_norm": 1.7587188482284546, + "learning_rate": 1.5279240805338647e-06, + "loss": 0.40394848585128784, + "step": 577 + }, + { + "epoch": 1.6855059252506837, + "grad_norm": 1.063381314277649, + "learning_rate": 1.5009836342697993e-06, + "loss": 0.49564215540885925, + "step": 578 + }, + { + "epoch": 1.688422971741112, + "grad_norm": 1.1742531061172485, + "learning_rate": 1.4742635299334063e-06, + "loss": 0.3891904950141907, + "step": 579 + }, + { + "epoch": 1.6913400182315406, + "grad_norm": 1.499934196472168, + "learning_rate": 1.4477644602588848e-06, + "loss": 0.35497623682022095, + "step": 580 + }, + { + "epoch": 1.694257064721969, + "grad_norm": 1.5112360715866089, + "learning_rate": 1.421487112249984e-06, + "loss": 0.4062272012233734, + "step": 581 + }, + { + "epoch": 1.6971741112123975, + "grad_norm": 1.3583141565322876, + "learning_rate": 1.3954321671621885e-06, + "loss": 0.3655265271663666, + "step": 582 + }, + { + "epoch": 1.700091157702826, + "grad_norm": 2.8181653022766113, + "learning_rate": 1.3696003004850577e-06, + "loss": 0.37418332695961, + "step": 583 + }, + { + "epoch": 1.7030082041932544, + "grad_norm": 0.967166543006897, + "learning_rate": 1.3439921819247138e-06, + "loss": 0.4946930408477783, + "step": 584 + }, + { + "epoch": 1.7059252506836828, + "grad_norm": 1.2773699760437012, + "learning_rate": 1.3186084753864813e-06, + "loss": 0.5101871490478516, + "step": 585 + }, + { + "epoch": 1.7088422971741113, + "grad_norm": 1.2814991474151611, + "learning_rate": 1.293449838957671e-06, + "loss": 0.3688133656978607, + "step": 586 + }, + { + "epoch": 1.7117593436645397, + "grad_norm": 1.594966173171997, + "learning_rate": 1.2685169248905228e-06, + "loss": 0.4739398956298828, + "step": 587 + }, + { + "epoch": 1.7146763901549682, + "grad_norm": 1.1471531391143799, + "learning_rate": 1.2438103795852885e-06, + "loss": 0.3719588816165924, + "step": 588 + }, + { + "epoch": 1.7175934366453967, + "grad_norm": 1.1657356023788452, + "learning_rate": 1.2193308435734852e-06, + "loss": 0.4119298458099365, + "step": 589 + }, + { + "epoch": 1.720510483135825, + "grad_norm": 1.1239042282104492, + "learning_rate": 1.1950789515012783e-06, + "loss": 0.38277503848075867, + "step": 590 + }, + { + "epoch": 1.7234275296262536, + "grad_norm": 1.149478554725647, + "learning_rate": 1.1710553321130324e-06, + "loss": 0.35080626606941223, + "step": 591 + }, + { + "epoch": 1.726344576116682, + "grad_norm": 1.2020260095596313, + "learning_rate": 1.1472606082350112e-06, + "loss": 0.3991318345069885, + "step": 592 + }, + { + "epoch": 1.7292616226071102, + "grad_norm": 1.101475477218628, + "learning_rate": 1.123695396759229e-06, + "loss": 0.45791420340538025, + "step": 593 + }, + { + "epoch": 1.7321786690975387, + "grad_norm": 0.9617101550102234, + "learning_rate": 1.1003603086274584e-06, + "loss": 0.39805036783218384, + "step": 594 + }, + { + "epoch": 1.7350957155879672, + "grad_norm": 1.1439731121063232, + "learning_rate": 1.07725594881539e-06, + "loss": 0.35753339529037476, + "step": 595 + }, + { + "epoch": 1.7380127620783956, + "grad_norm": 1.0350618362426758, + "learning_rate": 1.0543829163169516e-06, + "loss": 0.42581748962402344, + "step": 596 + }, + { + "epoch": 1.740929808568824, + "grad_norm": 1.2865227460861206, + "learning_rate": 1.031741804128773e-06, + "loss": 0.34685325622558594, + "step": 597 + }, + { + "epoch": 1.7438468550592525, + "grad_norm": 1.2079373598098755, + "learning_rate": 1.0093331992348154e-06, + "loss": 0.48401936888694763, + "step": 598 + }, + { + "epoch": 1.746763901549681, + "grad_norm": 1.1684436798095703, + "learning_rate": 9.871576825911577e-07, + "loss": 0.387456476688385, + "step": 599 + }, + { + "epoch": 1.7496809480401094, + "grad_norm": 1.298045039176941, + "learning_rate": 9.65215829110927e-07, + "loss": 0.40196847915649414, + "step": 600 + }, + { + "epoch": 1.7496809480401094, + "eval_loss": 0.3965963125228882, + "eval_runtime": 912.3102, + "eval_samples_per_second": 0.693, + "eval_steps_per_second": 0.693, + "step": 600 + }, + { + "epoch": 1.7525979945305379, + "grad_norm": 1.24501371383667, + "learning_rate": 9.435082076493974e-07, + "loss": 0.3990224003791809, + "step": 601 + }, + { + "epoch": 1.7555150410209661, + "grad_norm": 1.0634632110595703, + "learning_rate": 9.220353809892435e-07, + "loss": 0.44232451915740967, + "step": 602 + }, + { + "epoch": 1.7584320875113946, + "grad_norm": 1.0276325941085815, + "learning_rate": 9.007979058259475e-07, + "loss": 0.5336061716079712, + "step": 603 + }, + { + "epoch": 1.761349134001823, + "grad_norm": 1.1488786935806274, + "learning_rate": 8.797963327533698e-07, + "loss": 0.35023194551467896, + "step": 604 + }, + { + "epoch": 1.7642661804922515, + "grad_norm": 1.171109676361084, + "learning_rate": 8.590312062494699e-07, + "loss": 0.4461829662322998, + "step": 605 + }, + { + "epoch": 1.76718322698268, + "grad_norm": 1.3948134183883667, + "learning_rate": 8.385030646621938e-07, + "loss": 0.3448236584663391, + "step": 606 + }, + { + "epoch": 1.7701002734731084, + "grad_norm": 1.144608497619629, + "learning_rate": 8.18212440195515e-07, + "loss": 0.39913487434387207, + "step": 607 + }, + { + "epoch": 1.7730173199635368, + "grad_norm": 1.1941088438034058, + "learning_rate": 7.981598588956396e-07, + "loss": 0.40005186200141907, + "step": 608 + }, + { + "epoch": 1.7759343664539653, + "grad_norm": 1.1087690591812134, + "learning_rate": 7.783458406373656e-07, + "loss": 0.38895174860954285, + "step": 609 + }, + { + "epoch": 1.7788514129443938, + "grad_norm": 1.1787676811218262, + "learning_rate": 7.587708991106069e-07, + "loss": 0.36259594559669495, + "step": 610 + }, + { + "epoch": 1.7817684594348222, + "grad_norm": 1.1265360116958618, + "learning_rate": 7.394355418070731e-07, + "loss": 0.44475269317626953, + "step": 611 + }, + { + "epoch": 1.7846855059252507, + "grad_norm": 1.2230898141860962, + "learning_rate": 7.203402700071138e-07, + "loss": 0.3823542594909668, + "step": 612 + }, + { + "epoch": 1.7876025524156791, + "grad_norm": 1.0893492698669434, + "learning_rate": 7.01485578766724e-07, + "loss": 0.43276944756507874, + "step": 613 + }, + { + "epoch": 1.7905195989061076, + "grad_norm": 1.039494514465332, + "learning_rate": 6.828719569047082e-07, + "loss": 0.5362570881843567, + "step": 614 + }, + { + "epoch": 1.793436645396536, + "grad_norm": 1.0307413339614868, + "learning_rate": 6.644998869900054e-07, + "loss": 0.34828731417655945, + "step": 615 + }, + { + "epoch": 1.7963536918869645, + "grad_norm": 1.1253540515899658, + "learning_rate": 6.463698453291823e-07, + "loss": 0.3669811487197876, + "step": 616 + }, + { + "epoch": 1.799270738377393, + "grad_norm": 1.1103028059005737, + "learning_rate": 6.28482301954082e-07, + "loss": 0.3868233561515808, + "step": 617 + }, + { + "epoch": 1.8021877848678214, + "grad_norm": 1.0804798603057861, + "learning_rate": 6.108377206096394e-07, + "loss": 0.4123673439025879, + "step": 618 + }, + { + "epoch": 1.8051048313582498, + "grad_norm": 1.1068788766860962, + "learning_rate": 5.934365587418567e-07, + "loss": 0.44468799233436584, + "step": 619 + }, + { + "epoch": 1.8080218778486783, + "grad_norm": 1.0318645238876343, + "learning_rate": 5.762792674859474e-07, + "loss": 0.3586595356464386, + "step": 620 + }, + { + "epoch": 1.8109389243391067, + "grad_norm": 1.1553035974502563, + "learning_rate": 5.593662916546361e-07, + "loss": 0.4580552577972412, + "step": 621 + }, + { + "epoch": 1.8138559708295352, + "grad_norm": 1.3010531663894653, + "learning_rate": 5.426980697266271e-07, + "loss": 0.42412641644477844, + "step": 622 + }, + { + "epoch": 1.8167730173199637, + "grad_norm": 1.1858006715774536, + "learning_rate": 5.262750338352418e-07, + "loss": 0.38257676362991333, + "step": 623 + }, + { + "epoch": 1.8196900638103921, + "grad_norm": 1.1341536045074463, + "learning_rate": 5.100976097572074e-07, + "loss": 0.48365846276283264, + "step": 624 + }, + { + "epoch": 1.8226071103008206, + "grad_norm": 1.112844467163086, + "learning_rate": 4.941662169016237e-07, + "loss": 0.3893233835697174, + "step": 625 + }, + { + "epoch": 1.825524156791249, + "grad_norm": 1.1846497058868408, + "learning_rate": 4.784812682990903e-07, + "loss": 0.38869139552116394, + "step": 626 + }, + { + "epoch": 1.8284412032816773, + "grad_norm": 1.1383928060531616, + "learning_rate": 4.6304317059099326e-07, + "loss": 0.36156678199768066, + "step": 627 + }, + { + "epoch": 1.8313582497721057, + "grad_norm": 1.0891298055648804, + "learning_rate": 4.478523240189703e-07, + "loss": 0.40910348296165466, + "step": 628 + }, + { + "epoch": 1.8342752962625342, + "grad_norm": 1.1337662935256958, + "learning_rate": 4.3290912241452545e-07, + "loss": 0.3360365629196167, + "step": 629 + }, + { + "epoch": 1.8371923427529626, + "grad_norm": 1.280463695526123, + "learning_rate": 4.182139531888263e-07, + "loss": 0.44318532943725586, + "step": 630 + }, + { + "epoch": 1.840109389243391, + "grad_norm": 1.1408170461654663, + "learning_rate": 4.0376719732265647e-07, + "loss": 0.37003564834594727, + "step": 631 + }, + { + "epoch": 1.8430264357338195, + "grad_norm": 0.9730168581008911, + "learning_rate": 3.8956922935653895e-07, + "loss": 0.355985552072525, + "step": 632 + }, + { + "epoch": 1.845943482224248, + "grad_norm": 1.0643151998519897, + "learning_rate": 3.756204173810263e-07, + "loss": 0.3911808729171753, + "step": 633 + }, + { + "epoch": 1.8488605287146764, + "grad_norm": 1.1769851446151733, + "learning_rate": 3.61921123027158e-07, + "loss": 0.314385324716568, + "step": 634 + }, + { + "epoch": 1.8517775752051049, + "grad_norm": 0.921336829662323, + "learning_rate": 3.484717014570838e-07, + "loss": 0.3375144302845001, + "step": 635 + }, + { + "epoch": 1.8546946216955331, + "grad_norm": 0.9904773235321045, + "learning_rate": 3.3527250135485744e-07, + "loss": 0.4461369514465332, + "step": 636 + }, + { + "epoch": 1.8576116681859616, + "grad_norm": 1.0844534635543823, + "learning_rate": 3.223238649173954e-07, + "loss": 0.398414671421051, + "step": 637 + }, + { + "epoch": 1.86052871467639, + "grad_norm": 0.9829220771789551, + "learning_rate": 3.096261278456048e-07, + "loss": 0.35938704013824463, + "step": 638 + }, + { + "epoch": 1.8634457611668185, + "grad_norm": 1.13048255443573, + "learning_rate": 2.971796193356835e-07, + "loss": 0.3783624768257141, + "step": 639 + }, + { + "epoch": 1.866362807657247, + "grad_norm": 1.4307893514633179, + "learning_rate": 2.8498466207058095e-07, + "loss": 0.3601874113082886, + "step": 640 + }, + { + "epoch": 1.8692798541476754, + "grad_norm": 1.1835116147994995, + "learning_rate": 2.7304157221163753e-07, + "loss": 0.43897169828414917, + "step": 641 + }, + { + "epoch": 1.8721969006381038, + "grad_norm": 1.0730469226837158, + "learning_rate": 2.613506593903825e-07, + "loss": 0.4407995343208313, + "step": 642 + }, + { + "epoch": 1.8751139471285323, + "grad_norm": 0.9504678845405579, + "learning_rate": 2.499122267005105e-07, + "loss": 0.4105035960674286, + "step": 643 + }, + { + "epoch": 1.8780309936189608, + "grad_norm": 1.2599385976791382, + "learning_rate": 2.387265706900199e-07, + "loss": 0.41521430015563965, + "step": 644 + }, + { + "epoch": 1.8809480401093892, + "grad_norm": 1.035783052444458, + "learning_rate": 2.2779398135353127e-07, + "loss": 0.33491846919059753, + "step": 645 + }, + { + "epoch": 1.8838650865998177, + "grad_norm": 1.1612690687179565, + "learning_rate": 2.1711474212476325e-07, + "loss": 0.3367970287799835, + "step": 646 + }, + { + "epoch": 1.8867821330902461, + "grad_norm": 1.2541207075119019, + "learning_rate": 2.066891298691831e-07, + "loss": 0.46374717354774475, + "step": 647 + }, + { + "epoch": 1.8896991795806746, + "grad_norm": 1.1037088632583618, + "learning_rate": 1.9651741487683562e-07, + "loss": 0.3799871802330017, + "step": 648 + }, + { + "epoch": 1.892616226071103, + "grad_norm": 1.3611476421356201, + "learning_rate": 1.8659986085532988e-07, + "loss": 0.40523889660835266, + "step": 649 + }, + { + "epoch": 1.8955332725615315, + "grad_norm": 1.1628823280334473, + "learning_rate": 1.7693672492300473e-07, + "loss": 0.38399839401245117, + "step": 650 + }, + { + "epoch": 1.8955332725615315, + "eval_loss": 0.3949255049228668, + "eval_runtime": 903.6455, + "eval_samples_per_second": 0.699, + "eval_steps_per_second": 0.699, + "step": 650 + }, + { + "epoch": 1.89845031905196, + "grad_norm": 1.1185522079467773, + "learning_rate": 1.675282576022641e-07, + "loss": 0.4280855059623718, + "step": 651 + }, + { + "epoch": 1.9013673655423884, + "grad_norm": 1.1962717771530151, + "learning_rate": 1.5837470281307666e-07, + "loss": 0.3026162087917328, + "step": 652 + }, + { + "epoch": 1.9042844120328168, + "grad_norm": 1.1818240880966187, + "learning_rate": 1.4947629786666084e-07, + "loss": 0.43283963203430176, + "step": 653 + }, + { + "epoch": 1.9072014585232453, + "grad_norm": 1.161944031715393, + "learning_rate": 1.4083327345932208e-07, + "loss": 0.435259610414505, + "step": 654 + }, + { + "epoch": 1.9101185050136738, + "grad_norm": 1.1311709880828857, + "learning_rate": 1.32445853666483e-07, + "loss": 0.3258042633533478, + "step": 655 + }, + { + "epoch": 1.9130355515041022, + "grad_norm": 1.0152852535247803, + "learning_rate": 1.2431425593686263e-07, + "loss": 0.40951770544052124, + "step": 656 + }, + { + "epoch": 1.9159525979945307, + "grad_norm": 1.2698794603347778, + "learning_rate": 1.164386910868498e-07, + "loss": 0.3610893785953522, + "step": 657 + }, + { + "epoch": 1.9188696444849591, + "grad_norm": 1.1092722415924072, + "learning_rate": 1.0881936329502851e-07, + "loss": 0.31951773166656494, + "step": 658 + }, + { + "epoch": 1.9217866909753876, + "grad_norm": 1.2378597259521484, + "learning_rate": 1.0145647009689008e-07, + "loss": 0.3756055235862732, + "step": 659 + }, + { + "epoch": 1.924703737465816, + "grad_norm": 1.0100237131118774, + "learning_rate": 9.43502023797116e-08, + "loss": 0.26117536425590515, + "step": 660 + }, + { + "epoch": 1.9276207839562443, + "grad_norm": 1.2368487119674683, + "learning_rate": 8.750074437760325e-08, + "loss": 0.3092282712459564, + "step": 661 + }, + { + "epoch": 1.9305378304466727, + "grad_norm": 1.0328837633132935, + "learning_rate": 8.090827366673548e-08, + "loss": 0.4076297879219055, + "step": 662 + }, + { + "epoch": 1.9334548769371012, + "grad_norm": 0.9885771870613098, + "learning_rate": 7.457296116073487e-08, + "loss": 0.40007251501083374, + "step": 663 + }, + { + "epoch": 1.9363719234275296, + "grad_norm": 1.19287109375, + "learning_rate": 6.849497110625214e-08, + "loss": 0.3751019239425659, + "step": 664 + }, + { + "epoch": 1.939288969917958, + "grad_norm": 1.134682536125183, + "learning_rate": 6.267446107870334e-08, + "loss": 0.4558236300945282, + "step": 665 + }, + { + "epoch": 1.9422060164083865, + "grad_norm": 3.414883852005005, + "learning_rate": 5.7111581978185336e-08, + "loss": 0.5070392489433289, + "step": 666 + }, + { + "epoch": 1.945123062898815, + "grad_norm": 1.179479956626892, + "learning_rate": 5.180647802556671e-08, + "loss": 0.389989972114563, + "step": 667 + }, + { + "epoch": 1.9480401093892434, + "grad_norm": 1.1473273038864136, + "learning_rate": 4.675928675874186e-08, + "loss": 0.460910826921463, + "step": 668 + }, + { + "epoch": 1.9509571558796717, + "grad_norm": 0.9269355535507202, + "learning_rate": 4.197013902907165e-08, + "loss": 0.5488728284835815, + "step": 669 + }, + { + "epoch": 1.9538742023701001, + "grad_norm": 1.1781370639801025, + "learning_rate": 3.7439158997989445e-08, + "loss": 0.39483463764190674, + "step": 670 + }, + { + "epoch": 1.9567912488605286, + "grad_norm": 1.1759430170059204, + "learning_rate": 3.316646413377811e-08, + "loss": 0.38600990176200867, + "step": 671 + }, + { + "epoch": 1.959708295350957, + "grad_norm": 1.1981792449951172, + "learning_rate": 2.9152165208529147e-08, + "loss": 0.4657193422317505, + "step": 672 + }, + { + "epoch": 1.9626253418413855, + "grad_norm": 1.186043620109558, + "learning_rate": 2.5396366295272756e-08, + "loss": 0.46212077140808105, + "step": 673 + }, + { + "epoch": 1.965542388331814, + "grad_norm": 1.115103840827942, + "learning_rate": 2.1899164765271096e-08, + "loss": 0.4416077733039856, + "step": 674 + }, + { + "epoch": 1.9684594348222424, + "grad_norm": 1.2150691747665405, + "learning_rate": 1.866065128550365e-08, + "loss": 0.3557685911655426, + "step": 675 + }, + { + "epoch": 1.9713764813126708, + "grad_norm": 1.096506953239441, + "learning_rate": 1.5680909816309098e-08, + "loss": 0.32865390181541443, + "step": 676 + }, + { + "epoch": 1.9742935278030993, + "grad_norm": 1.0974191427230835, + "learning_rate": 1.2960017609213727e-08, + "loss": 0.37568721175193787, + "step": 677 + }, + { + "epoch": 1.9772105742935278, + "grad_norm": 1.1290082931518555, + "learning_rate": 1.0498045204924145e-08, + "loss": 0.329836905002594, + "step": 678 + }, + { + "epoch": 1.9801276207839562, + "grad_norm": 1.0609803199768066, + "learning_rate": 8.295056431504301e-09, + "loss": 0.2694982886314392, + "step": 679 + }, + { + "epoch": 1.9830446672743847, + "grad_norm": 0.9838472604751587, + "learning_rate": 6.3511084027156885e-09, + "loss": 0.4270719587802887, + "step": 680 + }, + { + "epoch": 1.9859617137648131, + "grad_norm": 1.1900098323822021, + "learning_rate": 4.666251516536324e-09, + "loss": 0.4060650169849396, + "step": 681 + }, + { + "epoch": 1.9888787602552416, + "grad_norm": 0.9812174439430237, + "learning_rate": 3.2405294538606637e-09, + "loss": 0.3900409936904907, + "step": 682 + }, + { + "epoch": 1.99179580674567, + "grad_norm": 1.1988210678100586, + "learning_rate": 2.073979177357188e-09, + "loss": 0.3999583125114441, + "step": 683 + }, + { + "epoch": 1.9947128532360985, + "grad_norm": 0.9738736152648926, + "learning_rate": 1.1666309305202738e-09, + "loss": 0.46780622005462646, + "step": 684 + }, + { + "epoch": 1.997629899726527, + "grad_norm": 0.9841824173927307, + "learning_rate": 5.18508236878601e-10, + "loss": 0.4595794975757599, + "step": 685 + }, + { + "epoch": 2.0, + "grad_norm": 1.0865421295166016, + "learning_rate": 1.2962789938897323e-10, + "loss": 0.5136060118675232, + "step": 686 + } + ], + "logging_steps": 1, + "max_steps": 686, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.317102071220797e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_devstral_24B/checkpoints/checkpoint-686/training_args.bin b/cpt_devstral_24B/checkpoints/checkpoint-686/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..48a487f18680e3e5b768fe7ec9ec04e8778fc21e --- /dev/null +++ b/cpt_devstral_24B/checkpoints/checkpoint-686/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f62526ec2433add7ac031c48b1f6ff360f1ade77275765112cbf7cf361d64ca5 +size 5201 diff --git a/cpt_devstral_24B/config_resolved.yaml b/cpt_devstral_24B/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a68770254306ce2a8741b9348b3766095f60ef4 --- /dev/null +++ b/cpt_devstral_24B/config_resolved.yaml @@ -0,0 +1,57 @@ +run: + run_dir: ./runs/cpt_run_v1 + seed: 42 +model: + repo_id: /workspace/Models/Devstral-Small-2-24B-Instruct-2512 + revision: null + base_local_dir: base_model + trust_remote_code: true + tokenizer_use_fast: true + device_map: auto + torch_dtype: bfloat16 + use_4bit: false + bnb_4bit_quant_type: nf4 + bnb_4bit_use_double_quant: false + bnb_4bit_compute_dtype: bfloat16 + attn_implementation: null +data: + train_jsonl: /workspace/all_data_with_descriptions.jsonl + eval_jsonl: null + eval_split_ratio: 0.1 + text_field: text + block_size: 4096 + shuffle: true + num_proc: 4 + pack_mode: pad +peft: + enabled: true + r: 64 + lora_alpha: 128 + lora_dropout: 0.05 + bias: none + target_modules: auto +train: + num_train_epochs: 2 + per_device_train_batch_size: 1 + per_device_eval_batch_size: 1 + gradient_accumulation_steps: 16 + learning_rate: 2e-5 + weight_decay: 0.0 + warmup_ratio: 0.1 + lr_scheduler_type: cosine + optim: paged_adamw_8bit + max_grad_norm: 1.0 + gradient_checkpointing: true + logging_steps: 1 + save_strategy: steps + save_steps: 100 + save_total_limit: 4 + evaluation_strategy: steps + eval_steps: 50 + load_best_model_at_end: true + resume_from_checkpoint: auto +merge: + enabled: true + merged_dtype: float16 + max_shard_size: 2GB + output_dir: ./merged_24b_cpt_lora diff --git a/cpt_devstral_24B/eval_final.json b/cpt_devstral_24B/eval_final.json new file mode 100644 index 0000000000000000000000000000000000000000..d176e9f5183f7708e5f4460d8f968b8d80aa5fbf --- /dev/null +++ b/cpt_devstral_24B/eval_final.json @@ -0,0 +1,8 @@ +{ + "eval_loss": 0.3965963125228882, + "eval_runtime": 916.1187, + "eval_samples_per_second": 0.69, + "eval_steps_per_second": 0.69, + "epoch": 2.0, + "perplexity": 1.4867556242644535 +} \ No newline at end of file diff --git a/cpt_devstral_24B/logs/eval.jsonl b/cpt_devstral_24B/logs/eval.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bbc3c3e9631f4ffa454da40141648c01ca886826 --- /dev/null +++ b/cpt_devstral_24B/logs/eval.jsonl @@ -0,0 +1,14 @@ +{"ts": "2025-12-22T12:54:44", "event": "eval", "step": 50, "epoch": 0.14585232452142205, "eval_loss": 0.7892261147499084, "eval_runtime": 973.2157, "eval_samples_per_second": 0.649, "eval_steps_per_second": 0.649, "perplexity": 2.2016919098966565} +{"ts": "2025-12-22T14:16:39", "event": "eval", "step": 100, "epoch": 0.2917046490428441, "eval_loss": 0.6552971005439758, "eval_runtime": 966.7072, "eval_samples_per_second": 0.654, "eval_steps_per_second": 0.654, "perplexity": 1.9257145632388668} +{"ts": "2025-12-22T15:50:59", "event": "eval", "step": 150, "epoch": 0.4375569735642662, "eval_loss": 0.5903874635696411, "eval_runtime": 1186.9542, "eval_samples_per_second": 0.532, "eval_steps_per_second": 0.532, "perplexity": 1.8046875306209718} +{"ts": "2025-12-22T17:31:15", "event": "eval", "step": 200, "epoch": 0.5834092980856882, "eval_loss": 0.5414339303970337, "eval_runtime": 1180.7894, "eval_samples_per_second": 0.535, "eval_steps_per_second": 0.535, "perplexity": 1.7184692616188395} +{"ts": "2025-12-22T19:11:10", "event": "eval", "step": 250, "epoch": 0.7292616226071102, "eval_loss": 0.5038471221923828, "eval_runtime": 1175.0375, "eval_samples_per_second": 0.538, "eval_steps_per_second": 0.538, "perplexity": 1.6550763193760132} +{"ts": "2025-12-22T20:51:33", "event": "eval", "step": 300, "epoch": 0.8751139471285324, "eval_loss": 0.4752846360206604, "eval_runtime": 1189.1666, "eval_samples_per_second": 0.531, "eval_steps_per_second": 0.531, "perplexity": 1.6084719613930785} +{"ts": "2025-12-22T22:31:42", "event": "eval", "step": 350, "epoch": 1.0204193254329992, "eval_loss": 0.44924086332321167, "eval_runtime": 1214.6648, "eval_samples_per_second": 0.52, "eval_steps_per_second": 0.52, "perplexity": 1.5671220739753133} +{"ts": "2025-12-23T00:12:28", "event": "eval", "step": 400, "epoch": 1.1662716499544212, "eval_loss": 0.4318464398384094, "eval_runtime": 1206.0306, "eval_samples_per_second": 0.524, "eval_steps_per_second": 0.524, "perplexity": 1.5400985992121177} +{"ts": "2025-12-23T01:52:27", "event": "eval", "step": 450, "epoch": 1.3121239744758433, "eval_loss": 0.4178673028945923, "eval_runtime": 1197.5534, "eval_samples_per_second": 0.528, "eval_steps_per_second": 0.528, "perplexity": 1.5187191313977852} +{"ts": "2025-12-23T03:32:49", "event": "eval", "step": 500, "epoch": 1.4579762989972653, "eval_loss": 0.40706494450569153, "eval_runtime": 1189.1593, "eval_samples_per_second": 0.531, "eval_steps_per_second": 0.531, "perplexity": 1.5024016752277602} +{"ts": "2025-12-23T04:52:38", "event": "eval", "step": 550, "epoch": 1.6038286235186874, "eval_loss": 0.40037089586257935, "eval_runtime": 893.7411, "eval_samples_per_second": 0.707, "eval_steps_per_second": 0.707, "perplexity": 1.4923781118724992} +{"ts": "2025-12-23T06:06:46", "event": "eval", "step": 600, "epoch": 1.7496809480401094, "eval_loss": 0.3965963125228882, "eval_runtime": 912.3102, "eval_samples_per_second": 0.693, "eval_steps_per_second": 0.693, "perplexity": 1.4867556242644535} +{"ts": "2025-12-23T07:20:44", "event": "eval", "step": 650, "epoch": 1.8955332725615315, "eval_loss": 0.3949255049228668, "eval_runtime": 903.6455, "eval_samples_per_second": 0.699, "eval_steps_per_second": 0.699, "perplexity": 1.484273615724821} +{"ts": "2025-12-23T08:18:17", "event": "eval", "step": 686, "epoch": 2.0, "eval_loss": 0.3965963125228882, "eval_runtime": 916.1187, "eval_samples_per_second": 0.69, "eval_steps_per_second": 0.69, "perplexity": 1.4867556242644535} diff --git a/cpt_devstral_24B/logs/train.jsonl b/cpt_devstral_24B/logs/train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4019144e0a29322a055b6fed134b134f2ebffd86 --- /dev/null +++ b/cpt_devstral_24B/logs/train.jsonl @@ -0,0 +1,701 @@ +{"ts": "2025-12-22T11:34:07", "event": "train_log", "step": 1, "epoch": 0.0029170464904284413, "progress_pct": 0.15, "epoch_pct": 0.15, "eta": "15:18:24", "max_grad_norm": 1.0, "loss": 0.9893555045127869, "grad_norm": 1.1577509641647339, "learning_rate": 0.0} +{"ts": "2025-12-22T11:35:27", "event": "train_log", "step": 2, "epoch": 0.005834092980856883, "progress_pct": 0.29, "epoch_pct": 0.29, "eta": "15:18:08", "max_grad_norm": 1.0, "loss": 0.8791205883026123, "grad_norm": 0.9491796493530273, "learning_rate": 2.8985507246376816e-07} +{"ts": "2025-12-22T11:36:45", "event": "train_log", "step": 3, "epoch": 0.008751139471285323, "progress_pct": 0.44, "epoch_pct": 0.44, "eta": "15:06:28", "max_grad_norm": 1.0, "loss": 0.9858248233795166, "grad_norm": 1.1600768566131592, "learning_rate": 5.797101449275363e-07} +{"ts": "2025-12-22T11:38:05", "event": "train_log", "step": 4, "epoch": 0.011668185961713765, "progress_pct": 0.58, "epoch_pct": 0.58, "eta": "15:05:29", "max_grad_norm": 1.0, "loss": 1.0516364574432373, "grad_norm": 1.2298306226730347, "learning_rate": 8.695652173913044e-07} +{"ts": "2025-12-22T11:39:23", "event": "train_log", "step": 5, "epoch": 0.014585232452142206, "progress_pct": 0.73, "epoch_pct": 0.73, "eta": "15:01:23", "max_grad_norm": 1.0, "loss": 0.8392249345779419, "grad_norm": 0.9520533680915833, "learning_rate": 1.1594202898550726e-06} +{"ts": "2025-12-22T11:40:42", "event": "train_log", "step": 6, "epoch": 0.017502278942570646, "progress_pct": 0.87, "epoch_pct": 0.88, "eta": "14:57:40", "max_grad_norm": 1.0, "loss": 1.0955077409744263, "grad_norm": 1.2451188564300537, "learning_rate": 1.4492753623188408e-06} +{"ts": "2025-12-22T11:42:01", "event": "train_log", "step": 7, "epoch": 0.02041932543299909, "progress_pct": 1.02, "epoch_pct": 1.02, "eta": "14:57:15", "max_grad_norm": 1.0, "loss": 0.9201866388320923, "grad_norm": 1.1123991012573242, "learning_rate": 1.7391304347826088e-06} +{"ts": "2025-12-22T11:43:21", "event": "train_log", "step": 8, "epoch": 0.02333637192342753, "progress_pct": 1.17, "epoch_pct": 1.17, "eta": "14:56:13", "max_grad_norm": 1.0, "loss": 0.9770950078964233, "grad_norm": 0.9283139705657959, "learning_rate": 2.028985507246377e-06} +{"ts": "2025-12-22T11:44:38", "event": "train_log", "step": 9, "epoch": 0.02625341841385597, "progress_pct": 1.31, "epoch_pct": 1.31, "eta": "14:52:43", "max_grad_norm": 1.0, "loss": 0.9442565441131592, "grad_norm": 0.9589216113090515, "learning_rate": 2.3188405797101453e-06} +{"ts": "2025-12-22T11:45:58", "event": "train_log", "step": 10, "epoch": 0.02917046490428441, "progress_pct": 1.46, "epoch_pct": 1.46, "eta": "14:51:41", "max_grad_norm": 1.0, "loss": 0.9354464411735535, "grad_norm": 0.8866703510284424, "learning_rate": 2.6086956521739132e-06} +{"ts": "2025-12-22T11:47:16", "event": "train_log", "step": 11, "epoch": 0.03208751139471285, "progress_pct": 1.6, "epoch_pct": 1.6, "eta": "14:49:12", "max_grad_norm": 1.0, "loss": 0.7659736275672913, "grad_norm": 0.7191241383552551, "learning_rate": 2.8985507246376816e-06} +{"ts": "2025-12-22T11:48:35", "event": "train_log", "step": 12, "epoch": 0.03500455788514129, "progress_pct": 1.75, "epoch_pct": 1.75, "eta": "14:47:51", "max_grad_norm": 1.0, "loss": 0.9319326877593994, "grad_norm": 0.9110142588615417, "learning_rate": 3.188405797101449e-06} +{"ts": "2025-12-22T11:49:53", "event": "train_log", "step": 13, "epoch": 0.03792160437556973, "progress_pct": 1.9, "epoch_pct": 1.9, "eta": "14:46:04", "max_grad_norm": 1.0, "loss": 0.9819356203079224, "grad_norm": 0.8754057288169861, "learning_rate": 3.4782608695652175e-06} +{"ts": "2025-12-22T11:51:14", "event": "train_log", "step": 14, "epoch": 0.04083865086599818, "progress_pct": 2.04, "epoch_pct": 2.04, "eta": "14:45:49", "max_grad_norm": 1.0, "loss": 1.026316523551941, "grad_norm": 0.896181046962738, "learning_rate": 3.768115942028986e-06} +{"ts": "2025-12-22T11:52:33", "event": "train_log", "step": 15, "epoch": 0.04375569735642662, "progress_pct": 2.19, "epoch_pct": 2.19, "eta": "14:44:32", "max_grad_norm": 1.0, "loss": 0.8427562713623047, "grad_norm": 0.6104832887649536, "learning_rate": 4.057971014492754e-06} +{"ts": "2025-12-22T11:53:51", "event": "train_log", "step": 16, "epoch": 0.04667274384685506, "progress_pct": 2.33, "epoch_pct": 2.33, "eta": "14:42:49", "max_grad_norm": 1.0, "loss": 0.8496565222740173, "grad_norm": 0.6529208421707153, "learning_rate": 4.347826086956522e-06} +{"ts": "2025-12-22T11:55:09", "event": "train_log", "step": 17, "epoch": 0.0495897903372835, "progress_pct": 2.48, "epoch_pct": 2.48, "eta": "14:40:25", "max_grad_norm": 1.0, "loss": 0.9139047861099243, "grad_norm": 0.6319335699081421, "learning_rate": 4.637681159420291e-06} +{"ts": "2025-12-22T11:56:29", "event": "train_log", "step": 18, "epoch": 0.05250683682771194, "progress_pct": 2.62, "epoch_pct": 2.63, "eta": "14:40:16", "max_grad_norm": 1.0, "loss": 0.8867442011833191, "grad_norm": 0.7458649277687073, "learning_rate": 4.927536231884059e-06} +{"ts": "2025-12-22T11:57:48", "event": "train_log", "step": 19, "epoch": 0.05542388331814038, "progress_pct": 2.77, "epoch_pct": 2.77, "eta": "14:38:35", "max_grad_norm": 1.0, "loss": 0.9579408168792725, "grad_norm": 0.6179773211479187, "learning_rate": 5.2173913043478265e-06} +{"ts": "2025-12-22T11:59:08", "event": "train_log", "step": 20, "epoch": 0.05834092980856882, "progress_pct": 2.92, "epoch_pct": 2.92, "eta": "14:37:37", "max_grad_norm": 1.0, "loss": 0.8736554980278015, "grad_norm": 0.794481635093689, "learning_rate": 5.507246376811595e-06} +{"ts": "2025-12-22T12:00:27", "event": "train_log", "step": 21, "epoch": 0.06125797629899726, "progress_pct": 3.06, "epoch_pct": 3.06, "eta": "14:36:13", "max_grad_norm": 1.0, "loss": 0.9358762502670288, "grad_norm": 0.8356145620346069, "learning_rate": 5.797101449275363e-06} +{"ts": "2025-12-22T12:01:46", "event": "train_log", "step": 22, "epoch": 0.0641750227894257, "progress_pct": 3.21, "epoch_pct": 3.21, "eta": "14:34:55", "max_grad_norm": 1.0, "loss": 0.8972038626670837, "grad_norm": 0.5891932845115662, "learning_rate": 6.086956521739132e-06} +{"ts": "2025-12-22T12:03:04", "event": "train_log", "step": 23, "epoch": 0.06709206927985414, "progress_pct": 3.35, "epoch_pct": 3.35, "eta": "14:33:11", "max_grad_norm": 1.0, "loss": 0.9583507776260376, "grad_norm": 0.6931268572807312, "learning_rate": 6.376811594202898e-06} +{"ts": "2025-12-22T12:04:24", "event": "train_log", "step": 24, "epoch": 0.07000911577028258, "progress_pct": 3.5, "epoch_pct": 3.5, "eta": "14:32:25", "max_grad_norm": 1.0, "loss": 0.8119489550590515, "grad_norm": 0.7298229336738586, "learning_rate": 6.666666666666667e-06} +{"ts": "2025-12-22T12:05:42", "event": "train_log", "step": 25, "epoch": 0.07292616226071102, "progress_pct": 3.64, "epoch_pct": 3.65, "eta": "14:30:29", "max_grad_norm": 1.0, "loss": 0.9386100769042969, "grad_norm": 0.6419956684112549, "learning_rate": 6.956521739130435e-06} +{"ts": "2025-12-22T12:07:01", "event": "train_log", "step": 26, "epoch": 0.07584320875113947, "progress_pct": 3.79, "epoch_pct": 3.79, "eta": "14:29:29", "max_grad_norm": 1.0, "loss": 0.9272583723068237, "grad_norm": 0.7508338689804077, "learning_rate": 7.246376811594203e-06} +{"ts": "2025-12-22T12:08:21", "event": "train_log", "step": 27, "epoch": 0.0787602552415679, "progress_pct": 3.94, "epoch_pct": 3.94, "eta": "14:28:27", "max_grad_norm": 1.0, "loss": 0.8967856168746948, "grad_norm": 0.5848079919815063, "learning_rate": 7.536231884057972e-06} +{"ts": "2025-12-22T12:09:40", "event": "train_log", "step": 28, "epoch": 0.08167730173199636, "progress_pct": 4.08, "epoch_pct": 4.08, "eta": "14:26:51", "max_grad_norm": 1.0, "loss": 0.8696568012237549, "grad_norm": 0.7384837865829468, "learning_rate": 7.82608695652174e-06} +{"ts": "2025-12-22T12:10:59", "event": "train_log", "step": 29, "epoch": 0.0845943482224248, "progress_pct": 4.23, "epoch_pct": 4.23, "eta": "14:25:41", "max_grad_norm": 1.0, "loss": 0.9121193885803223, "grad_norm": 0.5069604516029358, "learning_rate": 8.115942028985508e-06} +{"ts": "2025-12-22T12:12:18", "event": "train_log", "step": 30, "epoch": 0.08751139471285324, "progress_pct": 4.37, "epoch_pct": 4.38, "eta": "14:24:13", "max_grad_norm": 1.0, "loss": 0.8180589079856873, "grad_norm": 0.833165168762207, "learning_rate": 8.405797101449275e-06} +{"ts": "2025-12-22T12:13:37", "event": "train_log", "step": 31, "epoch": 0.09042844120328168, "progress_pct": 4.52, "epoch_pct": 4.52, "eta": "14:23:02", "max_grad_norm": 1.0, "loss": 0.8640957474708557, "grad_norm": 0.6355920433998108, "learning_rate": 8.695652173913044e-06} +{"ts": "2025-12-22T12:14:55", "event": "train_log", "step": 32, "epoch": 0.09334548769371012, "progress_pct": 4.66, "epoch_pct": 4.67, "eta": "14:21:12", "max_grad_norm": 1.0, "loss": 0.9517915844917297, "grad_norm": 1.0429315567016602, "learning_rate": 8.985507246376812e-06} +{"ts": "2025-12-22T12:16:14", "event": "train_log", "step": 33, "epoch": 0.09626253418413856, "progress_pct": 4.81, "epoch_pct": 4.81, "eta": "14:20:05", "max_grad_norm": 1.0, "loss": 0.9443603754043579, "grad_norm": 0.5875154733657837, "learning_rate": 9.275362318840581e-06} +{"ts": "2025-12-22T12:17:34", "event": "train_log", "step": 34, "epoch": 0.099179580674567, "progress_pct": 4.96, "epoch_pct": 4.96, "eta": "14:18:58", "max_grad_norm": 1.0, "loss": 0.9510866403579712, "grad_norm": 1.9913769960403442, "learning_rate": 9.565217391304349e-06} +{"ts": "2025-12-22T12:18:52", "event": "train_log", "step": 35, "epoch": 0.10209662716499544, "progress_pct": 5.1, "epoch_pct": 5.1, "eta": "14:17:27", "max_grad_norm": 1.0, "loss": 0.8653419613838196, "grad_norm": 0.5310097932815552, "learning_rate": 9.855072463768118e-06} +{"ts": "2025-12-22T12:20:12", "event": "train_log", "step": 36, "epoch": 0.10501367365542388, "progress_pct": 5.25, "epoch_pct": 5.25, "eta": "14:16:16", "max_grad_norm": 1.0, "loss": 0.7941208481788635, "grad_norm": 0.624421238899231, "learning_rate": 1.0144927536231885e-05} +{"ts": "2025-12-22T12:21:31", "event": "train_log", "step": 37, "epoch": 0.10793072014585232, "progress_pct": 5.39, "epoch_pct": 5.4, "eta": "14:14:57", "max_grad_norm": 1.0, "loss": 0.8931174278259277, "grad_norm": 0.6314200758934021, "learning_rate": 1.0434782608695653e-05} +{"ts": "2025-12-22T12:22:49", "event": "train_log", "step": 38, "epoch": 0.11084776663628076, "progress_pct": 5.54, "epoch_pct": 5.54, "eta": "14:13:20", "max_grad_norm": 1.0, "loss": 0.8978185057640076, "grad_norm": 0.6272342205047607, "learning_rate": 1.0724637681159422e-05} +{"ts": "2025-12-22T12:24:07", "event": "train_log", "step": 39, "epoch": 0.1137648131267092, "progress_pct": 5.69, "epoch_pct": 5.69, "eta": "14:11:45", "max_grad_norm": 1.0, "loss": 0.808263897895813, "grad_norm": 0.5711184740066528, "learning_rate": 1.101449275362319e-05} +{"ts": "2025-12-22T12:25:25", "event": "train_log", "step": 40, "epoch": 0.11668185961713765, "progress_pct": 5.83, "epoch_pct": 5.83, "eta": "14:10:11", "max_grad_norm": 1.0, "loss": 0.7456756830215454, "grad_norm": 0.7581208944320679, "learning_rate": 1.1304347826086957e-05} +{"ts": "2025-12-22T12:26:43", "event": "train_log", "step": 41, "epoch": 0.11959890610756609, "progress_pct": 5.98, "epoch_pct": 5.98, "eta": "14:08:35", "max_grad_norm": 1.0, "loss": 0.8273333311080933, "grad_norm": 0.4989977180957794, "learning_rate": 1.1594202898550726e-05} +{"ts": "2025-12-22T12:28:01", "event": "train_log", "step": 42, "epoch": 0.12251595259799453, "progress_pct": 6.12, "epoch_pct": 6.13, "eta": "14:07:05", "max_grad_norm": 1.0, "loss": 0.8514784574508667, "grad_norm": 0.8602972626686096, "learning_rate": 1.1884057971014494e-05} +{"ts": "2025-12-22T12:29:19", "event": "train_log", "step": 43, "epoch": 0.12543299908842298, "progress_pct": 6.27, "epoch_pct": 6.27, "eta": "14:05:32", "max_grad_norm": 1.0, "loss": 0.8182265162467957, "grad_norm": 0.6918581128120422, "learning_rate": 1.2173913043478263e-05} +{"ts": "2025-12-22T12:30:37", "event": "train_log", "step": 44, "epoch": 0.1283500455788514, "progress_pct": 6.41, "epoch_pct": 6.42, "eta": "14:04:02", "max_grad_norm": 1.0, "loss": 0.8242791891098022, "grad_norm": 0.653099536895752, "learning_rate": 1.2463768115942029e-05} +{"ts": "2025-12-22T12:31:55", "event": "train_log", "step": 45, "epoch": 0.13126709206927986, "progress_pct": 6.56, "epoch_pct": 6.56, "eta": "14:02:30", "max_grad_norm": 1.0, "loss": 0.8229591250419617, "grad_norm": 0.7485584616661072, "learning_rate": 1.2753623188405797e-05} +{"ts": "2025-12-22T12:33:13", "event": "train_log", "step": 46, "epoch": 0.1341841385597083, "progress_pct": 6.71, "epoch_pct": 6.71, "eta": "14:01:01", "max_grad_norm": 1.0, "loss": 0.8146833181381226, "grad_norm": 0.6724833250045776, "learning_rate": 1.3043478260869566e-05} +{"ts": "2025-12-22T12:34:33", "event": "train_log", "step": 47, "epoch": 0.13710118505013674, "progress_pct": 6.85, "epoch_pct": 6.86, "eta": "13:59:55", "max_grad_norm": 1.0, "loss": 0.8154427409172058, "grad_norm": 0.857208251953125, "learning_rate": 1.3333333333333333e-05} +{"ts": "2025-12-22T12:35:53", "event": "train_log", "step": 48, "epoch": 0.14001823154056517, "progress_pct": 7.0, "epoch_pct": 7.0, "eta": "13:58:47", "max_grad_norm": 1.0, "loss": 0.879005491733551, "grad_norm": 0.5559669137001038, "learning_rate": 1.3623188405797103e-05} +{"ts": "2025-12-22T12:37:12", "event": "train_log", "step": 49, "epoch": 0.14293527803099362, "progress_pct": 7.14, "epoch_pct": 7.15, "eta": "13:57:28", "max_grad_norm": 1.0, "loss": 0.8148283362388611, "grad_norm": 0.5910897850990295, "learning_rate": 1.391304347826087e-05} +{"ts": "2025-12-22T12:38:31", "event": "train_log", "step": 50, "epoch": 0.14585232452142205, "progress_pct": 7.29, "epoch_pct": 7.29, "eta": "13:56:10", "max_grad_norm": 1.0, "loss": 0.8293006420135498, "grad_norm": 0.6478891372680664, "learning_rate": 1.420289855072464e-05} +{"ts": "2025-12-22T12:54:44", "event": "train_log", "step": 50, "epoch": 0.14585232452142205, "progress_pct": 7.29, "epoch_pct": 7.29, "eta": "17:22:30", "max_grad_norm": 1.0, "eval_loss": 0.7892261147499084, "eval_runtime": 973.2157, "eval_samples_per_second": 0.649, "eval_steps_per_second": 0.649} +{"ts": "2025-12-22T12:56:05", "event": "train_log", "step": 51, "epoch": 0.1487693710118505, "progress_pct": 7.43, "epoch_pct": 7.44, "eta": "17:17:18", "max_grad_norm": 1.0, "loss": 0.8114852905273438, "grad_norm": 0.757882833480835, "learning_rate": 1.4492753623188407e-05} +{"ts": "2025-12-22T12:57:24", "event": "train_log", "step": 52, "epoch": 0.15168641750227893, "progress_pct": 7.58, "epoch_pct": 7.58, "eta": "17:11:48", "max_grad_norm": 1.0, "loss": 0.7886185050010681, "grad_norm": 0.8496116995811462, "learning_rate": 1.4782608695652174e-05} +{"ts": "2025-12-22T12:58:43", "event": "train_log", "step": 53, "epoch": 0.15460346399270739, "progress_pct": 7.73, "epoch_pct": 7.73, "eta": "17:06:27", "max_grad_norm": 1.0, "loss": 0.7298170924186707, "grad_norm": 0.6078857183456421, "learning_rate": 1.5072463768115944e-05} +{"ts": "2025-12-22T13:00:01", "event": "train_log", "step": 54, "epoch": 0.1575205104831358, "progress_pct": 7.87, "epoch_pct": 7.88, "eta": "17:01:08", "max_grad_norm": 1.0, "loss": 0.7407160997390747, "grad_norm": 0.5856835246086121, "learning_rate": 1.536231884057971e-05} +{"ts": "2025-12-22T13:01:21", "event": "train_log", "step": 55, "epoch": 0.16043755697356427, "progress_pct": 8.02, "epoch_pct": 8.02, "eta": "16:56:17", "max_grad_norm": 1.0, "loss": 0.7057831287384033, "grad_norm": 1.0533701181411743, "learning_rate": 1.565217391304348e-05} +{"ts": "2025-12-22T13:02:39", "event": "train_log", "step": 56, "epoch": 0.16335460346399272, "progress_pct": 8.16, "epoch_pct": 8.17, "eta": "16:51:04", "max_grad_norm": 1.0, "loss": 0.7409019470214844, "grad_norm": 0.8087610006332397, "learning_rate": 1.5942028985507246e-05} +{"ts": "2025-12-22T13:03:58", "event": "train_log", "step": 57, "epoch": 0.16627164995442115, "progress_pct": 8.31, "epoch_pct": 8.31, "eta": "16:46:24", "max_grad_norm": 1.0, "loss": 0.7768293023109436, "grad_norm": 0.629945695400238, "learning_rate": 1.6231884057971015e-05} +{"ts": "2025-12-22T13:05:18", "event": "train_log", "step": 58, "epoch": 0.1691886964448496, "progress_pct": 8.45, "epoch_pct": 8.46, "eta": "16:41:50", "max_grad_norm": 1.0, "loss": 0.825718104839325, "grad_norm": 0.5187911987304688, "learning_rate": 1.6521739130434785e-05} +{"ts": "2025-12-22T13:06:36", "event": "train_log", "step": 59, "epoch": 0.17210574293527803, "progress_pct": 8.6, "epoch_pct": 8.61, "eta": "16:37:06", "max_grad_norm": 1.0, "loss": 0.8575979471206665, "grad_norm": 0.5866358280181885, "learning_rate": 1.681159420289855e-05} +{"ts": "2025-12-22T13:07:56", "event": "train_log", "step": 60, "epoch": 0.17502278942570648, "progress_pct": 8.75, "epoch_pct": 8.75, "eta": "16:32:46", "max_grad_norm": 1.0, "loss": 0.8058848977088928, "grad_norm": 1.5098934173583984, "learning_rate": 1.710144927536232e-05} +{"ts": "2025-12-22T13:09:14", "event": "train_log", "step": 61, "epoch": 0.1779398359161349, "progress_pct": 8.89, "epoch_pct": 8.9, "eta": "16:28:17", "max_grad_norm": 1.0, "loss": 0.7640778422355652, "grad_norm": 0.6981958150863647, "learning_rate": 1.739130434782609e-05} +{"ts": "2025-12-22T13:10:32", "event": "train_log", "step": 62, "epoch": 0.18085688240656336, "progress_pct": 9.04, "epoch_pct": 9.04, "eta": "16:23:54", "max_grad_norm": 1.0, "loss": 0.7896331548690796, "grad_norm": 0.631349503993988, "learning_rate": 1.7681159420289858e-05} +{"ts": "2025-12-22T13:11:51", "event": "train_log", "step": 63, "epoch": 0.1837739288969918, "progress_pct": 9.18, "epoch_pct": 9.19, "eta": "16:19:51", "max_grad_norm": 1.0, "loss": 0.6762524247169495, "grad_norm": 0.6930747032165527, "learning_rate": 1.7971014492753624e-05} +{"ts": "2025-12-22T13:13:11", "event": "train_log", "step": 64, "epoch": 0.18669097538742024, "progress_pct": 9.33, "epoch_pct": 9.33, "eta": "16:15:53", "max_grad_norm": 1.0, "loss": 0.7285035848617554, "grad_norm": 0.599399209022522, "learning_rate": 1.8260869565217393e-05} +{"ts": "2025-12-22T13:14:29", "event": "train_log", "step": 65, "epoch": 0.18960802187784867, "progress_pct": 9.48, "epoch_pct": 9.48, "eta": "16:11:40", "max_grad_norm": 1.0, "loss": 0.7682523131370544, "grad_norm": 0.6194344758987427, "learning_rate": 1.8550724637681162e-05} +{"ts": "2025-12-22T13:15:48", "event": "train_log", "step": 66, "epoch": 0.19252506836827712, "progress_pct": 9.62, "epoch_pct": 9.63, "eta": "16:07:50", "max_grad_norm": 1.0, "loss": 0.6791993379592896, "grad_norm": 0.5691342949867249, "learning_rate": 1.8840579710144928e-05} +{"ts": "2025-12-22T13:17:08", "event": "train_log", "step": 67, "epoch": 0.19544211485870555, "progress_pct": 9.77, "epoch_pct": 9.77, "eta": "16:04:13", "max_grad_norm": 1.0, "loss": 0.6744828224182129, "grad_norm": 0.6257390379905701, "learning_rate": 1.9130434782608697e-05} +{"ts": "2025-12-22T13:18:26", "event": "train_log", "step": 68, "epoch": 0.198359161349134, "progress_pct": 9.91, "epoch_pct": 9.92, "eta": "16:00:17", "max_grad_norm": 1.0, "loss": 0.7317330837249756, "grad_norm": 0.5871018767356873, "learning_rate": 1.9420289855072467e-05} +{"ts": "2025-12-22T13:19:44", "event": "train_log", "step": 69, "epoch": 0.20127620783956243, "progress_pct": 10.06, "epoch_pct": 10.06, "eta": "15:56:29", "max_grad_norm": 1.0, "loss": 0.6617178916931152, "grad_norm": 1.0744612216949463, "learning_rate": 1.9710144927536236e-05} +{"ts": "2025-12-22T13:21:05", "event": "train_log", "step": 70, "epoch": 0.2041932543299909, "progress_pct": 10.2, "epoch_pct": 10.21, "eta": "15:53:04", "max_grad_norm": 1.0, "loss": 0.7615712881088257, "grad_norm": 0.675946831703186, "learning_rate": 2e-05} +{"ts": "2025-12-22T13:22:23", "event": "train_log", "step": 71, "epoch": 0.2071103008204193, "progress_pct": 10.35, "epoch_pct": 10.36, "eta": "15:49:25", "max_grad_norm": 1.0, "loss": 0.7131291627883911, "grad_norm": 0.7663411498069763, "learning_rate": 1.9999870372100614e-05} +{"ts": "2025-12-22T13:23:42", "event": "train_log", "step": 72, "epoch": 0.21002734731084777, "progress_pct": 10.5, "epoch_pct": 10.5, "eta": "15:46:02", "max_grad_norm": 1.0, "loss": 0.7452989816665649, "grad_norm": 0.6725395321846008, "learning_rate": 1.9999481491763123e-05} +{"ts": "2025-12-22T13:25:01", "event": "train_log", "step": 73, "epoch": 0.21294439380127622, "progress_pct": 10.64, "epoch_pct": 10.65, "eta": "15:42:30", "max_grad_norm": 1.0, "loss": 0.7477136850357056, "grad_norm": 0.6505664587020874, "learning_rate": 1.9998833369069483e-05} +{"ts": "2025-12-22T13:26:19", "event": "train_log", "step": 74, "epoch": 0.21586144029170465, "progress_pct": 10.79, "epoch_pct": 10.79, "eta": "15:38:59", "max_grad_norm": 1.0, "loss": 0.6854275465011597, "grad_norm": 0.7032860517501831, "learning_rate": 1.9997926020822643e-05} +{"ts": "2025-12-22T13:27:37", "event": "train_log", "step": 75, "epoch": 0.2187784867821331, "progress_pct": 10.93, "epoch_pct": 10.94, "eta": "15:35:36", "max_grad_norm": 1.0, "loss": 0.7552425265312195, "grad_norm": 0.645345151424408, "learning_rate": 1.999675947054614e-05} +{"ts": "2025-12-22T13:28:55", "event": "train_log", "step": 76, "epoch": 0.22169553327256153, "progress_pct": 11.08, "epoch_pct": 11.08, "eta": "15:32:12", "max_grad_norm": 1.0, "loss": 0.7262853384017944, "grad_norm": 0.6620492935180664, "learning_rate": 1.9995333748483464e-05} +{"ts": "2025-12-22T13:30:13", "event": "train_log", "step": 77, "epoch": 0.22461257976298998, "progress_pct": 11.22, "epoch_pct": 11.23, "eta": "15:28:51", "max_grad_norm": 1.0, "loss": 0.7591732144355774, "grad_norm": 0.6511455774307251, "learning_rate": 1.9993648891597284e-05} +{"ts": "2025-12-22T13:31:31", "event": "train_log", "step": 78, "epoch": 0.2275296262534184, "progress_pct": 11.37, "epoch_pct": 11.38, "eta": "15:25:32", "max_grad_norm": 1.0, "loss": 0.7498704195022583, "grad_norm": 0.6775254011154175, "learning_rate": 1.9991704943568497e-05} +{"ts": "2025-12-22T13:32:48", "event": "train_log", "step": 79, "epoch": 0.23044667274384686, "progress_pct": 11.52, "epoch_pct": 11.52, "eta": "15:22:18", "max_grad_norm": 1.0, "loss": 0.7238684296607971, "grad_norm": 0.8199896216392517, "learning_rate": 1.9989501954795076e-05} +{"ts": "2025-12-22T13:34:07", "event": "train_log", "step": 80, "epoch": 0.2333637192342753, "progress_pct": 11.66, "epoch_pct": 11.67, "eta": "15:19:10", "max_grad_norm": 1.0, "loss": 0.7028778195381165, "grad_norm": 0.8197569847106934, "learning_rate": 1.998703998239079e-05} +{"ts": "2025-12-22T13:35:25", "event": "train_log", "step": 81, "epoch": 0.23628076572470375, "progress_pct": 11.81, "epoch_pct": 11.81, "eta": "15:16:02", "max_grad_norm": 1.0, "loss": 0.8842703104019165, "grad_norm": 0.6602625250816345, "learning_rate": 1.9984319090183692e-05} +{"ts": "2025-12-22T13:36:43", "event": "train_log", "step": 82, "epoch": 0.23919781221513217, "progress_pct": 11.95, "epoch_pct": 11.96, "eta": "15:12:59", "max_grad_norm": 1.0, "loss": 0.732614278793335, "grad_norm": 0.9587129354476929, "learning_rate": 1.99813393487145e-05} +{"ts": "2025-12-22T13:38:04", "event": "train_log", "step": 83, "epoch": 0.24211485870556063, "progress_pct": 12.1, "epoch_pct": 12.11, "eta": "15:10:13", "max_grad_norm": 1.0, "loss": 0.7544928193092346, "grad_norm": 0.6822189092636108, "learning_rate": 1.997810083523473e-05} +{"ts": "2025-12-22T13:39:23", "event": "train_log", "step": 84, "epoch": 0.24503190519598905, "progress_pct": 12.24, "epoch_pct": 12.25, "eta": "15:07:22", "max_grad_norm": 1.0, "loss": 0.6704054474830627, "grad_norm": 0.8980082869529724, "learning_rate": 1.9974603633704726e-05} +{"ts": "2025-12-22T13:40:43", "event": "train_log", "step": 85, "epoch": 0.2479489516864175, "progress_pct": 12.39, "epoch_pct": 12.4, "eta": "15:04:39", "max_grad_norm": 1.0, "loss": 0.693661093711853, "grad_norm": 0.7413425445556641, "learning_rate": 1.9970847834791472e-05} +{"ts": "2025-12-22T13:42:01", "event": "train_log", "step": 86, "epoch": 0.25086599817684596, "progress_pct": 12.54, "epoch_pct": 12.54, "eta": "15:01:40", "max_grad_norm": 1.0, "loss": 0.667654275894165, "grad_norm": 0.8314999341964722, "learning_rate": 1.9966833535866223e-05} +{"ts": "2025-12-22T13:43:21", "event": "train_log", "step": 87, "epoch": 0.25378304466727436, "progress_pct": 12.68, "epoch_pct": 12.69, "eta": "14:59:00", "max_grad_norm": 1.0, "loss": 0.8403134942054749, "grad_norm": 0.7972444891929626, "learning_rate": 1.9962560841002013e-05} +{"ts": "2025-12-22T13:44:38", "event": "train_log", "step": 88, "epoch": 0.2567000911577028, "progress_pct": 12.83, "epoch_pct": 12.84, "eta": "14:56:05", "max_grad_norm": 1.0, "loss": 0.6897370219230652, "grad_norm": 0.8519951701164246, "learning_rate": 1.995802986097093e-05} +{"ts": "2025-12-22T13:45:58", "event": "train_log", "step": 89, "epoch": 0.25961713764813127, "progress_pct": 12.97, "epoch_pct": 12.98, "eta": "14:53:25", "max_grad_norm": 1.0, "loss": 0.6690632700920105, "grad_norm": 0.8268933892250061, "learning_rate": 1.995324071324126e-05} +{"ts": "2025-12-22T13:47:16", "event": "train_log", "step": 90, "epoch": 0.2625341841385597, "progress_pct": 13.12, "epoch_pct": 13.13, "eta": "14:50:38", "max_grad_norm": 1.0, "loss": 0.6314147114753723, "grad_norm": 0.7133983969688416, "learning_rate": 1.9948193521974436e-05} +{"ts": "2025-12-22T13:48:36", "event": "train_log", "step": 91, "epoch": 0.2654512306289881, "progress_pct": 13.27, "epoch_pct": 13.27, "eta": "14:48:07", "max_grad_norm": 1.0, "loss": 0.7389825582504272, "grad_norm": 0.889302134513855, "learning_rate": 1.9942888418021814e-05} +{"ts": "2025-12-22T13:49:54", "event": "train_log", "step": 92, "epoch": 0.2683682771194166, "progress_pct": 13.41, "epoch_pct": 13.42, "eta": "14:45:24", "max_grad_norm": 1.0, "loss": 0.6916261911392212, "grad_norm": 0.7022432088851929, "learning_rate": 1.99373255389213e-05} +{"ts": "2025-12-22T13:51:14", "event": "train_log", "step": 93, "epoch": 0.27128532360984503, "progress_pct": 13.56, "epoch_pct": 13.56, "eta": "14:42:54", "max_grad_norm": 1.0, "loss": 0.6908476948738098, "grad_norm": 0.696432888507843, "learning_rate": 1.9931505028893748e-05} +{"ts": "2025-12-22T13:52:32", "event": "train_log", "step": 94, "epoch": 0.2742023701002735, "progress_pct": 13.7, "epoch_pct": 13.71, "eta": "14:40:11", "max_grad_norm": 1.0, "loss": 0.6500837206840515, "grad_norm": 0.7667419910430908, "learning_rate": 1.9925427038839267e-05} +{"ts": "2025-12-22T13:53:53", "event": "train_log", "step": 95, "epoch": 0.27711941659070194, "progress_pct": 13.85, "epoch_pct": 13.86, "eta": "14:37:49", "max_grad_norm": 1.0, "loss": 0.7059191465377808, "grad_norm": 0.6974894404411316, "learning_rate": 1.9919091726333265e-05} +{"ts": "2025-12-22T13:55:13", "event": "train_log", "step": 96, "epoch": 0.28003646308113034, "progress_pct": 13.99, "epoch_pct": 14.0, "eta": "14:35:27", "max_grad_norm": 1.0, "loss": 0.6287837624549866, "grad_norm": 0.7047077417373657, "learning_rate": 1.9912499255622397e-05} +{"ts": "2025-12-22T13:56:37", "event": "train_log", "step": 97, "epoch": 0.2829535095715588, "progress_pct": 14.14, "epoch_pct": 14.15, "eta": "14:33:25", "max_grad_norm": 1.0, "loss": 0.6738612055778503, "grad_norm": 0.7729557156562805, "learning_rate": 1.990564979762029e-05} +{"ts": "2025-12-22T13:57:55", "event": "train_log", "step": 98, "epoch": 0.28587055606198725, "progress_pct": 14.29, "epoch_pct": 14.29, "eta": "14:30:51", "max_grad_norm": 1.0, "loss": 0.662042498588562, "grad_norm": 0.7020529508590698, "learning_rate": 1.989854352990311e-05} +{"ts": "2025-12-22T13:59:15", "event": "train_log", "step": 99, "epoch": 0.2887876025524157, "progress_pct": 14.43, "epoch_pct": 14.44, "eta": "14:28:29", "max_grad_norm": 1.0, "loss": 0.6246830821037292, "grad_norm": 0.7369800209999084, "learning_rate": 1.9891180636704975e-05} +{"ts": "2025-12-22T14:00:33", "event": "train_log", "step": 100, "epoch": 0.2917046490428441, "progress_pct": 14.58, "epoch_pct": 14.59, "eta": "14:25:57", "max_grad_norm": 1.0, "loss": 0.6623879075050354, "grad_norm": 0.7412623167037964, "learning_rate": 1.9883561308913154e-05} +{"ts": "2025-12-22T14:16:39", "event": "train_log", "step": 100, "epoch": 0.2917046490428441, "progress_pct": 14.58, "epoch_pct": 14.59, "eta": "16:00:22", "max_grad_norm": 1.0, "eval_loss": 0.6552971005439758, "eval_runtime": 966.7072, "eval_samples_per_second": 0.654, "eval_steps_per_second": 0.654} +{"ts": "2025-12-22T14:18:00", "event": "train_log", "step": 101, "epoch": 0.29462169553327255, "progress_pct": 14.72, "epoch_pct": 14.73, "eta": "15:57:00", "max_grad_norm": 1.0, "loss": 0.6312171816825867, "grad_norm": 0.8428792953491211, "learning_rate": 1.987568574406314e-05} +{"ts": "2025-12-22T14:19:18", "event": "train_log", "step": 102, "epoch": 0.297538742023701, "progress_pct": 14.87, "epoch_pct": 14.88, "eta": "15:53:28", "max_grad_norm": 1.0, "loss": 0.6266146898269653, "grad_norm": 0.6948133707046509, "learning_rate": 1.9867554146333517e-05} +{"ts": "2025-12-22T14:20:38", "event": "train_log", "step": 103, "epoch": 0.30045578851412946, "progress_pct": 15.01, "epoch_pct": 15.02, "eta": "15:50:06", "max_grad_norm": 1.0, "loss": 0.6669265031814575, "grad_norm": 1.3897597789764404, "learning_rate": 1.985916672654068e-05} +{"ts": "2025-12-22T14:21:56", "event": "train_log", "step": 104, "epoch": 0.30337283500455786, "progress_pct": 15.16, "epoch_pct": 15.17, "eta": "15:46:37", "max_grad_norm": 1.0, "loss": 0.6601086854934692, "grad_norm": 0.8838400840759277, "learning_rate": 1.985052370213334e-05} +{"ts": "2025-12-22T14:23:16", "event": "train_log", "step": 105, "epoch": 0.3062898814949863, "progress_pct": 15.31, "epoch_pct": 15.31, "eta": "15:43:26", "max_grad_norm": 1.0, "loss": 0.5984431505203247, "grad_norm": 0.8471395373344421, "learning_rate": 1.9841625297186925e-05} +{"ts": "2025-12-22T14:24:46", "event": "train_log", "step": 106, "epoch": 0.30920692798541477, "progress_pct": 15.45, "epoch_pct": 15.46, "eta": "15:41:08", "max_grad_norm": 1.0, "loss": 0.7223822474479675, "grad_norm": 0.8940042853355408, "learning_rate": 1.983247174239774e-05} +{"ts": "2025-12-22T14:26:09", "event": "train_log", "step": 107, "epoch": 0.3121239744758432, "progress_pct": 15.6, "epoch_pct": 15.61, "eta": "15:38:10", "max_grad_norm": 1.0, "loss": 0.6868705749511719, "grad_norm": 0.7833696603775024, "learning_rate": 1.9823063275076998e-05} +{"ts": "2025-12-22T14:27:28", "event": "train_log", "step": 108, "epoch": 0.3150410209662716, "progress_pct": 15.74, "epoch_pct": 15.75, "eta": "15:34:57", "max_grad_norm": 1.0, "loss": 0.6246675848960876, "grad_norm": 0.8794649243354797, "learning_rate": 1.9813400139144673e-05} +{"ts": "2025-12-22T14:28:50", "event": "train_log", "step": 109, "epoch": 0.3179580674567001, "progress_pct": 15.89, "epoch_pct": 15.9, "eta": "15:32:00", "max_grad_norm": 1.0, "loss": 0.5908697247505188, "grad_norm": 0.8126057982444763, "learning_rate": 1.9803482585123165e-05} +{"ts": "2025-12-22T14:30:08", "event": "train_log", "step": 110, "epoch": 0.32087511394712853, "progress_pct": 16.03, "epoch_pct": 16.04, "eta": "15:28:44", "max_grad_norm": 1.0, "loss": 0.5751246809959412, "grad_norm": 0.7947676777839661, "learning_rate": 1.979331087013082e-05} +{"ts": "2025-12-22T14:31:27", "event": "train_log", "step": 111, "epoch": 0.323792160437557, "progress_pct": 16.18, "epoch_pct": 16.19, "eta": "15:25:33", "max_grad_norm": 1.0, "loss": 0.6081106066703796, "grad_norm": 0.713545560836792, "learning_rate": 1.978288525787524e-05} +{"ts": "2025-12-22T14:32:59", "event": "train_log", "step": 112, "epoch": 0.32670920692798544, "progress_pct": 16.33, "epoch_pct": 16.34, "eta": "15:23:33", "max_grad_norm": 1.0, "loss": 0.7039169669151306, "grad_norm": 1.011828064918518, "learning_rate": 1.977220601864647e-05} +{"ts": "2025-12-22T14:34:38", "event": "train_log", "step": 113, "epoch": 0.32962625341841384, "progress_pct": 16.47, "epoch_pct": 16.48, "eta": "15:22:13", "max_grad_norm": 1.0, "loss": 0.6140255928039551, "grad_norm": 0.730570912361145, "learning_rate": 1.9761273429309982e-05} +{"ts": "2025-12-22T14:36:17", "event": "train_log", "step": 114, "epoch": 0.3325432999088423, "progress_pct": 16.62, "epoch_pct": 16.63, "eta": "15:20:46", "max_grad_norm": 1.0, "loss": 0.648114025592804, "grad_norm": 1.059688687324524, "learning_rate": 1.9750087773299492e-05} +{"ts": "2025-12-22T14:37:58", "event": "train_log", "step": 115, "epoch": 0.33546034639927075, "progress_pct": 16.76, "epoch_pct": 16.77, "eta": "15:19:30", "max_grad_norm": 1.0, "loss": 0.622555673122406, "grad_norm": 0.9336895942687988, "learning_rate": 1.973864934060962e-05} +{"ts": "2025-12-22T14:39:38", "event": "train_log", "step": 116, "epoch": 0.3383773928896992, "progress_pct": 16.91, "epoch_pct": 16.92, "eta": "15:18:09", "max_grad_norm": 1.0, "loss": 0.70485520362854, "grad_norm": 0.7195945978164673, "learning_rate": 1.9726958427788367e-05} +{"ts": "2025-12-22T14:41:21", "event": "train_log", "step": 117, "epoch": 0.3412944393801276, "progress_pct": 17.06, "epoch_pct": 17.06, "eta": "15:17:06", "max_grad_norm": 1.0, "loss": 0.6958848834037781, "grad_norm": 0.8101872801780701, "learning_rate": 1.971501533792942e-05} +{"ts": "2025-12-22T14:43:00", "event": "train_log", "step": 118, "epoch": 0.34421148587055606, "progress_pct": 17.2, "epoch_pct": 17.21, "eta": "15:15:42", "max_grad_norm": 1.0, "loss": 0.6021550893783569, "grad_norm": 1.6075212955474854, "learning_rate": 1.970282038066432e-05} +{"ts": "2025-12-22T14:44:35", "event": "train_log", "step": 119, "epoch": 0.3471285323609845, "progress_pct": 17.35, "epoch_pct": 17.36, "eta": "15:13:56", "max_grad_norm": 1.0, "loss": 0.6449777483940125, "grad_norm": 0.7881433963775635, "learning_rate": 1.9690373872154396e-05} +{"ts": "2025-12-22T14:46:12", "event": "train_log", "step": 120, "epoch": 0.35004557885141296, "progress_pct": 17.49, "epoch_pct": 17.5, "eta": "15:12:18", "max_grad_norm": 1.0, "loss": 0.5939379930496216, "grad_norm": 1.014639973640442, "learning_rate": 1.9677676135082606e-05} +{"ts": "2025-12-22T14:47:47", "event": "train_log", "step": 121, "epoch": 0.35296262534184136, "progress_pct": 17.64, "epoch_pct": 17.65, "eta": "15:10:36", "max_grad_norm": 1.0, "loss": 0.6210286617279053, "grad_norm": 0.8198449611663818, "learning_rate": 1.9664727498645144e-05} +{"ts": "2025-12-22T14:49:23", "event": "train_log", "step": 122, "epoch": 0.3558796718322698, "progress_pct": 17.78, "epoch_pct": 17.79, "eta": "15:08:53", "max_grad_norm": 1.0, "loss": 0.624247670173645, "grad_norm": 1.0194576978683472, "learning_rate": 1.9651528298542918e-05} +{"ts": "2025-12-22T14:50:59", "event": "train_log", "step": 123, "epoch": 0.35879671832269827, "progress_pct": 17.93, "epoch_pct": 17.94, "eta": "15:07:16", "max_grad_norm": 1.0, "loss": 0.6479315757751465, "grad_norm": 0.7963470220565796, "learning_rate": 1.9638078876972842e-05} +{"ts": "2025-12-22T14:52:37", "event": "train_log", "step": 124, "epoch": 0.3617137648131267, "progress_pct": 18.08, "epoch_pct": 18.09, "eta": "15:05:45", "max_grad_norm": 1.0, "loss": 0.6131505370140076, "grad_norm": 0.9007541537284851, "learning_rate": 1.9624379582618976e-05} +{"ts": "2025-12-22T14:54:13", "event": "train_log", "step": 125, "epoch": 0.3646308113035551, "progress_pct": 18.22, "epoch_pct": 18.23, "eta": "15:04:03", "max_grad_norm": 1.0, "loss": 0.6249448657035828, "grad_norm": 0.8712120056152344, "learning_rate": 1.9610430770643464e-05} +{"ts": "2025-12-22T14:55:47", "event": "train_log", "step": 126, "epoch": 0.3675478577939836, "progress_pct": 18.37, "epoch_pct": 18.38, "eta": "15:02:17", "max_grad_norm": 1.0, "loss": 0.5844688415527344, "grad_norm": 1.1482540369033813, "learning_rate": 1.9596232802677347e-05} +{"ts": "2025-12-22T14:57:23", "event": "train_log", "step": 127, "epoch": 0.37046490428441203, "progress_pct": 18.51, "epoch_pct": 18.52, "eta": "15:00:36", "max_grad_norm": 1.0, "loss": 0.6573485732078552, "grad_norm": 0.8662379384040833, "learning_rate": 1.9581786046811175e-05} +{"ts": "2025-12-22T14:59:01", "event": "train_log", "step": 128, "epoch": 0.3733819507748405, "progress_pct": 18.66, "epoch_pct": 18.67, "eta": "14:59:05", "max_grad_norm": 1.0, "loss": 0.5896862745285034, "grad_norm": 0.8191388845443726, "learning_rate": 1.9567090877585477e-05} +{"ts": "2025-12-22T15:00:39", "event": "train_log", "step": 129, "epoch": 0.37629899726526894, "progress_pct": 18.8, "epoch_pct": 18.81, "eta": "14:57:35", "max_grad_norm": 1.0, "loss": 0.613490879535675, "grad_norm": 1.0187078714370728, "learning_rate": 1.955214767598103e-05} +{"ts": "2025-12-22T15:02:16", "event": "train_log", "step": 130, "epoch": 0.37921604375569734, "progress_pct": 18.95, "epoch_pct": 18.96, "eta": "14:55:57", "max_grad_norm": 1.0, "loss": 0.727687656879425, "grad_norm": 0.8444119691848755, "learning_rate": 1.953695682940901e-05} +{"ts": "2025-12-22T15:03:52", "event": "train_log", "step": 131, "epoch": 0.3821330902461258, "progress_pct": 19.1, "epoch_pct": 19.11, "eta": "14:54:19", "max_grad_norm": 1.0, "loss": 0.6102436780929565, "grad_norm": 0.74753737449646, "learning_rate": 1.9521518731700913e-05} +{"ts": "2025-12-22T15:05:26", "event": "train_log", "step": 132, "epoch": 0.38505013673655425, "progress_pct": 19.24, "epoch_pct": 19.25, "eta": "14:52:32", "max_grad_norm": 1.0, "loss": 0.6244844198226929, "grad_norm": 1.0166202783584595, "learning_rate": 1.9505833783098378e-05} +{"ts": "2025-12-22T15:06:45", "event": "train_log", "step": 133, "epoch": 0.3879671832269827, "progress_pct": 19.39, "epoch_pct": 19.4, "eta": "14:49:42", "max_grad_norm": 1.0, "loss": 0.5939282178878784, "grad_norm": 0.8175772428512573, "learning_rate": 1.9489902390242793e-05} +{"ts": "2025-12-22T15:08:05", "event": "train_log", "step": 134, "epoch": 0.3908842297174111, "progress_pct": 19.53, "epoch_pct": 19.54, "eta": "14:46:56", "max_grad_norm": 1.0, "loss": 0.6418229937553406, "grad_norm": 1.0177713632583618, "learning_rate": 1.947372496616476e-05} +{"ts": "2025-12-22T15:09:24", "event": "train_log", "step": 135, "epoch": 0.39380127620783956, "progress_pct": 19.68, "epoch_pct": 19.69, "eta": "14:44:08", "max_grad_norm": 1.0, "loss": 0.5870395302772522, "grad_norm": 0.8652453422546387, "learning_rate": 1.9457301930273376e-05} +{"ts": "2025-12-22T15:10:43", "event": "train_log", "step": 136, "epoch": 0.396718322698268, "progress_pct": 19.83, "epoch_pct": 19.84, "eta": "14:41:22", "max_grad_norm": 1.0, "loss": 0.6480278372764587, "grad_norm": 0.8378894925117493, "learning_rate": 1.9440633708345365e-05} +{"ts": "2025-12-22T15:12:01", "event": "train_log", "step": 137, "epoch": 0.39963536918869647, "progress_pct": 19.97, "epoch_pct": 19.98, "eta": "14:38:34", "max_grad_norm": 1.0, "loss": 0.6191359758377075, "grad_norm": 0.8303541541099548, "learning_rate": 1.9423720732514052e-05} +{"ts": "2025-12-22T15:13:23", "event": "train_log", "step": 138, "epoch": 0.40255241567912486, "progress_pct": 20.12, "epoch_pct": 20.13, "eta": "14:36:04", "max_grad_norm": 1.0, "loss": 0.5696198344230652, "grad_norm": 0.8576734662055969, "learning_rate": 1.9406563441258145e-05} +{"ts": "2025-12-22T15:14:43", "event": "train_log", "step": 139, "epoch": 0.4054694621695533, "progress_pct": 20.26, "epoch_pct": 20.27, "eta": "14:33:24", "max_grad_norm": 1.0, "loss": 0.6177623271942139, "grad_norm": 0.9558727145195007, "learning_rate": 1.9389162279390362e-05} +{"ts": "2025-12-22T15:16:03", "event": "train_log", "step": 140, "epoch": 0.4083865086599818, "progress_pct": 20.41, "epoch_pct": 20.42, "eta": "14:30:46", "max_grad_norm": 1.0, "loss": 0.5836521983146667, "grad_norm": 0.7046042084693909, "learning_rate": 1.9371517698045922e-05} +{"ts": "2025-12-22T15:17:20", "event": "train_log", "step": 141, "epoch": 0.4113035551504102, "progress_pct": 20.55, "epoch_pct": 20.57, "eta": "14:28:00", "max_grad_norm": 1.0, "loss": 0.5728275775909424, "grad_norm": 1.0522717237472534, "learning_rate": 1.935363015467082e-05} +{"ts": "2025-12-22T15:18:40", "event": "train_log", "step": 142, "epoch": 0.4142206016408386, "progress_pct": 20.7, "epoch_pct": 20.71, "eta": "14:25:23", "max_grad_norm": 1.0, "loss": 0.632586658000946, "grad_norm": 0.9554787874221802, "learning_rate": 1.933550011301e-05} +{"ts": "2025-12-22T15:20:14", "event": "train_log", "step": 143, "epoch": 0.4171376481312671, "progress_pct": 20.85, "epoch_pct": 20.86, "eta": "14:23:43", "max_grad_norm": 1.0, "loss": 0.5850118398666382, "grad_norm": 0.8874214291572571, "learning_rate": 1.9317128043095293e-05} +{"ts": "2025-12-22T15:21:43", "event": "train_log", "step": 144, "epoch": 0.42005469462169553, "progress_pct": 20.99, "epoch_pct": 21.0, "eta": "14:21:45", "max_grad_norm": 1.0, "loss": 0.6260685324668884, "grad_norm": 1.0708963871002197, "learning_rate": 1.9298514421233276e-05} +{"ts": "2025-12-22T15:23:05", "event": "train_log", "step": 145, "epoch": 0.422971741112124, "progress_pct": 21.14, "epoch_pct": 21.15, "eta": "14:19:17", "max_grad_norm": 1.0, "loss": 0.6031094193458557, "grad_norm": 0.8135736584663391, "learning_rate": 1.9279659729992888e-05} +{"ts": "2025-12-22T15:24:45", "event": "train_log", "step": 146, "epoch": 0.42588878760255244, "progress_pct": 21.28, "epoch_pct": 21.29, "eta": "14:18:01", "max_grad_norm": 1.0, "loss": 0.6101322770118713, "grad_norm": 0.7971774339675903, "learning_rate": 1.9260564458192926e-05} +{"ts": "2025-12-22T15:26:22", "event": "train_log", "step": 147, "epoch": 0.42880583409298084, "progress_pct": 21.43, "epoch_pct": 21.44, "eta": "14:16:31", "max_grad_norm": 1.0, "loss": 0.5836313366889954, "grad_norm": 0.9374974966049194, "learning_rate": 1.9241229100889397e-05} +{"ts": "2025-12-22T15:27:57", "event": "train_log", "step": 148, "epoch": 0.4317228805834093, "progress_pct": 21.57, "epoch_pct": 21.59, "eta": "14:14:55", "max_grad_norm": 1.0, "loss": 0.6181215047836304, "grad_norm": 0.8043425679206848, "learning_rate": 1.9221654159362636e-05} +{"ts": "2025-12-22T15:29:34", "event": "train_log", "step": 149, "epoch": 0.43463992707383775, "progress_pct": 21.72, "epoch_pct": 21.73, "eta": "14:13:23", "max_grad_norm": 1.0, "loss": 0.6149677634239197, "grad_norm": 0.8923380374908447, "learning_rate": 1.920184014110436e-05} +{"ts": "2025-12-22T15:31:12", "event": "train_log", "step": 150, "epoch": 0.4375569735642662, "progress_pct": 21.87, "epoch_pct": 21.88, "eta": "14:12:00", "max_grad_norm": 1.0, "loss": 0.5899742841720581, "grad_norm": 0.8908132314682007, "learning_rate": 1.918178755980449e-05} +{"ts": "2025-12-22T15:50:59", "event": "train_log", "step": 150, "epoch": 0.4375569735642662, "progress_pct": 21.87, "epoch_pct": 21.88, "eta": "15:22:41", "max_grad_norm": 1.0, "eval_loss": 0.5903874635696411, "eval_runtime": 1186.9542, "eval_samples_per_second": 0.532, "eval_steps_per_second": 0.532} +{"ts": "2025-12-22T15:52:35", "event": "train_log", "step": 151, "epoch": 0.4404740200546946, "progress_pct": 22.01, "epoch_pct": 22.02, "eta": "15:20:30", "max_grad_norm": 1.0, "loss": 0.5852696895599365, "grad_norm": 1.060531497001648, "learning_rate": 1.9161496935337808e-05} +{"ts": "2025-12-22T15:54:07", "event": "train_log", "step": 152, "epoch": 0.44339106654512306, "progress_pct": 22.16, "epoch_pct": 22.17, "eta": "15:18:09", "max_grad_norm": 1.0, "loss": 0.5822056531906128, "grad_norm": 0.9723032712936401, "learning_rate": 1.914096879375053e-05} +{"ts": "2025-12-22T15:55:42", "event": "train_log", "step": 153, "epoch": 0.4463081130355515, "progress_pct": 22.3, "epoch_pct": 22.32, "eta": "15:15:55", "max_grad_norm": 1.0, "loss": 0.6183493137359619, "grad_norm": 0.9519931674003601, "learning_rate": 1.912020366724663e-05} +{"ts": "2025-12-22T15:57:22", "event": "train_log", "step": 154, "epoch": 0.44922515952597997, "progress_pct": 22.45, "epoch_pct": 22.46, "eta": "15:14:02", "max_grad_norm": 1.0, "loss": 0.6229860782623291, "grad_norm": 0.8282918334007263, "learning_rate": 1.9099202094174055e-05} +{"ts": "2025-12-22T15:59:00", "event": "train_log", "step": 155, "epoch": 0.45214220601640837, "progress_pct": 22.59, "epoch_pct": 22.61, "eta": "15:12:03", "max_grad_norm": 1.0, "loss": 0.6552959680557251, "grad_norm": 0.9251292943954468, "learning_rate": 1.907796461901076e-05} +{"ts": "2025-12-22T16:00:43", "event": "train_log", "step": 156, "epoch": 0.4550592525068368, "progress_pct": 22.74, "epoch_pct": 22.75, "eta": "15:10:17", "max_grad_norm": 1.0, "loss": 0.6170098781585693, "grad_norm": 1.0349540710449219, "learning_rate": 1.9056491792350606e-05} +{"ts": "2025-12-22T16:02:17", "event": "train_log", "step": 157, "epoch": 0.4579762989972653, "progress_pct": 22.89, "epoch_pct": 22.9, "eta": "15:08:06", "max_grad_norm": 1.0, "loss": 0.5870137810707092, "grad_norm": 0.8720711469650269, "learning_rate": 1.9034784170889076e-05} +{"ts": "2025-12-22T16:03:50", "event": "train_log", "step": 158, "epoch": 0.46089334548769373, "progress_pct": 23.03, "epoch_pct": 23.04, "eta": "15:05:51", "max_grad_norm": 1.0, "loss": 0.5515124201774597, "grad_norm": 1.0785977840423584, "learning_rate": 1.9012842317408843e-05} +{"ts": "2025-12-22T16:05:30", "event": "train_log", "step": 159, "epoch": 0.4638103919781221, "progress_pct": 23.18, "epoch_pct": 23.19, "eta": "15:03:56", "max_grad_norm": 1.0, "loss": 0.6073828339576721, "grad_norm": 1.0634154081344604, "learning_rate": 1.8990666800765187e-05} +{"ts": "2025-12-22T16:07:08", "event": "train_log", "step": 160, "epoch": 0.4667274384685506, "progress_pct": 23.32, "epoch_pct": 23.34, "eta": "15:01:56", "max_grad_norm": 1.0, "loss": 0.5960907936096191, "grad_norm": 0.8770879507064819, "learning_rate": 1.896825819587123e-05} +{"ts": "2025-12-22T16:08:44", "event": "train_log", "step": 161, "epoch": 0.46964448495897904, "progress_pct": 23.47, "epoch_pct": 23.48, "eta": "14:59:52", "max_grad_norm": 1.0, "loss": 0.545990526676178, "grad_norm": 1.1225898265838623, "learning_rate": 1.894561708368305e-05} +{"ts": "2025-12-22T16:10:17", "event": "train_log", "step": 162, "epoch": 0.4725615314494075, "progress_pct": 23.62, "epoch_pct": 23.63, "eta": "14:57:39", "max_grad_norm": 1.0, "loss": 0.5566108822822571, "grad_norm": 0.9373893141746521, "learning_rate": 1.8922744051184613e-05} +{"ts": "2025-12-22T16:11:53", "event": "train_log", "step": 163, "epoch": 0.4754785779398359, "progress_pct": 23.76, "epoch_pct": 23.77, "eta": "14:55:33", "max_grad_norm": 1.0, "loss": 0.558845043182373, "grad_norm": 1.5016087293624878, "learning_rate": 1.8899639691372545e-05} +{"ts": "2025-12-22T16:13:30", "event": "train_log", "step": 164, "epoch": 0.47839562443026434, "progress_pct": 23.91, "epoch_pct": 23.92, "eta": "14:53:31", "max_grad_norm": 1.0, "loss": 0.6824233531951904, "grad_norm": 0.903020977973938, "learning_rate": 1.8876304603240773e-05} +{"ts": "2025-12-22T16:15:06", "event": "train_log", "step": 165, "epoch": 0.4813126709206928, "progress_pct": 24.05, "epoch_pct": 24.07, "eta": "14:51:27", "max_grad_norm": 1.0, "loss": 0.5630610585212708, "grad_norm": 0.8239623308181763, "learning_rate": 1.8852739391764993e-05} +{"ts": "2025-12-22T16:16:41", "event": "train_log", "step": 166, "epoch": 0.48422971741112125, "progress_pct": 24.2, "epoch_pct": 24.21, "eta": "14:49:21", "max_grad_norm": 1.0, "loss": 0.6211802363395691, "grad_norm": 0.926069438457489, "learning_rate": 1.882894466788697e-05} +{"ts": "2025-12-22T16:18:17", "event": "train_log", "step": 167, "epoch": 0.4871467639015497, "progress_pct": 24.34, "epoch_pct": 24.36, "eta": "14:47:19", "max_grad_norm": 1.0, "loss": 0.5513257384300232, "grad_norm": 1.0098828077316284, "learning_rate": 1.8804921048498722e-05} +{"ts": "2025-12-22T16:19:53", "event": "train_log", "step": 168, "epoch": 0.4900638103919781, "progress_pct": 24.49, "epoch_pct": 24.5, "eta": "14:45:16", "max_grad_norm": 1.0, "loss": 0.6197121739387512, "grad_norm": 0.9228141903877258, "learning_rate": 1.8780669156426517e-05} +{"ts": "2025-12-22T16:21:29", "event": "train_log", "step": 169, "epoch": 0.49298085688240656, "progress_pct": 24.64, "epoch_pct": 24.65, "eta": "14:43:14", "max_grad_norm": 1.0, "loss": 0.5221806764602661, "grad_norm": 1.0551754236221313, "learning_rate": 1.8756189620414712e-05} +{"ts": "2025-12-22T16:23:07", "event": "train_log", "step": 170, "epoch": 0.495897903372835, "progress_pct": 24.78, "epoch_pct": 24.79, "eta": "14:41:17", "max_grad_norm": 1.0, "loss": 0.5766995549201965, "grad_norm": 0.9017496109008789, "learning_rate": 1.873148307510948e-05} +{"ts": "2025-12-22T16:24:41", "event": "train_log", "step": 171, "epoch": 0.49881494986326347, "progress_pct": 24.93, "epoch_pct": 24.94, "eta": "14:39:10", "max_grad_norm": 1.0, "loss": 0.6514763832092285, "grad_norm": 0.9704970717430115, "learning_rate": 1.870655016104233e-05} +{"ts": "2025-12-22T16:26:20", "event": "train_log", "step": 172, "epoch": 0.5017319963536919, "progress_pct": 25.07, "epoch_pct": 25.09, "eta": "14:37:15", "max_grad_norm": 1.0, "loss": 0.5273895263671875, "grad_norm": 0.9972712397575378, "learning_rate": 1.8681391524613518e-05} +{"ts": "2025-12-22T16:27:53", "event": "train_log", "step": 173, "epoch": 0.5046490428441204, "progress_pct": 25.22, "epoch_pct": 25.23, "eta": "14:35:07", "max_grad_norm": 1.0, "loss": 0.5548599362373352, "grad_norm": 0.9473339319229126, "learning_rate": 1.8656007818075288e-05} +{"ts": "2025-12-22T16:29:31", "event": "train_log", "step": 174, "epoch": 0.5075660893345487, "progress_pct": 25.36, "epoch_pct": 25.38, "eta": "14:33:11", "max_grad_norm": 1.0, "loss": 0.5593586564064026, "grad_norm": 1.2493574619293213, "learning_rate": 1.8630399699514944e-05} +{"ts": "2025-12-22T16:31:03", "event": "train_log", "step": 175, "epoch": 0.5104831358249772, "progress_pct": 25.51, "epoch_pct": 25.52, "eta": "14:30:56", "max_grad_norm": 1.0, "loss": 0.6054630279541016, "grad_norm": 1.2766696214675903, "learning_rate": 1.860456783283781e-05} +{"ts": "2025-12-22T16:32:42", "event": "train_log", "step": 176, "epoch": 0.5134001823154056, "progress_pct": 25.66, "epoch_pct": 25.67, "eta": "14:29:07", "max_grad_norm": 1.0, "loss": 0.508592963218689, "grad_norm": 0.9555240869522095, "learning_rate": 1.857851288775002e-05} +{"ts": "2025-12-22T16:34:19", "event": "train_log", "step": 177, "epoch": 0.5163172288058341, "progress_pct": 25.8, "epoch_pct": 25.82, "eta": "14:27:10", "max_grad_norm": 1.0, "loss": 0.5532065629959106, "grad_norm": 1.260219931602478, "learning_rate": 1.8552235539741118e-05} +{"ts": "2025-12-22T16:35:54", "event": "train_log", "step": 178, "epoch": 0.5192342752962625, "progress_pct": 25.95, "epoch_pct": 25.96, "eta": "14:25:06", "max_grad_norm": 1.0, "loss": 0.5683344006538391, "grad_norm": 1.1859954595565796, "learning_rate": 1.8525736470066595e-05} +{"ts": "2025-12-22T16:37:34", "event": "train_log", "step": 179, "epoch": 0.522151321786691, "progress_pct": 26.09, "epoch_pct": 26.11, "eta": "14:23:17", "max_grad_norm": 1.0, "loss": 0.5281959772109985, "grad_norm": 1.3044344186782837, "learning_rate": 1.8499016365730203e-05} +{"ts": "2025-12-22T16:39:12", "event": "train_log", "step": 180, "epoch": 0.5250683682771194, "progress_pct": 26.24, "epoch_pct": 26.25, "eta": "14:21:23", "max_grad_norm": 1.0, "loss": 0.49621230363845825, "grad_norm": 1.3049921989440918, "learning_rate": 1.8472075919466137e-05} +{"ts": "2025-12-22T16:40:48", "event": "train_log", "step": 181, "epoch": 0.5279854147675479, "progress_pct": 26.38, "epoch_pct": 26.4, "eta": "14:19:25", "max_grad_norm": 1.0, "loss": 0.6194032430648804, "grad_norm": 1.0488537549972534, "learning_rate": 1.844491582972109e-05} +{"ts": "2025-12-22T16:42:25", "event": "train_log", "step": 182, "epoch": 0.5309024612579762, "progress_pct": 26.53, "epoch_pct": 26.55, "eta": "14:17:28", "max_grad_norm": 1.0, "loss": 0.5645846724510193, "grad_norm": 1.5553455352783203, "learning_rate": 1.8417536800636138e-05} +{"ts": "2025-12-22T16:44:01", "event": "train_log", "step": 183, "epoch": 0.5338195077484047, "progress_pct": 26.68, "epoch_pct": 26.69, "eta": "14:15:30", "max_grad_norm": 1.0, "loss": 0.6267315745353699, "grad_norm": 1.2673912048339844, "learning_rate": 1.8389939542028484e-05} +{"ts": "2025-12-22T16:45:39", "event": "train_log", "step": 184, "epoch": 0.5367365542388332, "progress_pct": 26.82, "epoch_pct": 26.84, "eta": "14:13:36", "max_grad_norm": 1.0, "loss": 0.5256403684616089, "grad_norm": 1.0273847579956055, "learning_rate": 1.8362124769373064e-05} +{"ts": "2025-12-22T16:47:16", "event": "train_log", "step": 185, "epoch": 0.5396536007292616, "progress_pct": 26.97, "epoch_pct": 26.98, "eta": "14:11:40", "max_grad_norm": 1.0, "loss": 0.5916382074356079, "grad_norm": 1.006093978881836, "learning_rate": 1.8334093203783986e-05} +{"ts": "2025-12-22T16:48:51", "event": "train_log", "step": 186, "epoch": 0.5425706472196901, "progress_pct": 27.11, "epoch_pct": 27.13, "eta": "14:09:41", "max_grad_norm": 1.0, "loss": 0.581648588180542, "grad_norm": 1.2740857601165771, "learning_rate": 1.8305845571995843e-05} +{"ts": "2025-12-22T16:50:29", "event": "train_log", "step": 187, "epoch": 0.5454876937101185, "progress_pct": 27.26, "epoch_pct": 27.27, "eta": "14:07:49", "max_grad_norm": 1.0, "loss": 0.4824523627758026, "grad_norm": 1.494248390197754, "learning_rate": 1.8277382606344872e-05} +{"ts": "2025-12-22T16:52:05", "event": "train_log", "step": 188, "epoch": 0.548404740200547, "progress_pct": 27.41, "epoch_pct": 27.42, "eta": "14:05:49", "max_grad_norm": 1.0, "loss": 0.5531858205795288, "grad_norm": 1.1862496137619019, "learning_rate": 1.824870504474996e-05} +{"ts": "2025-12-22T16:53:40", "event": "train_log", "step": 189, "epoch": 0.5513217866909754, "progress_pct": 27.55, "epoch_pct": 27.57, "eta": "14:03:50", "max_grad_norm": 1.0, "loss": 0.6308296918869019, "grad_norm": 3.503049373626709, "learning_rate": 1.8219813630693523e-05} +{"ts": "2025-12-22T16:55:15", "event": "train_log", "step": 190, "epoch": 0.5542388331814039, "progress_pct": 27.7, "epoch_pct": 27.71, "eta": "14:01:50", "max_grad_norm": 1.0, "loss": 0.6146273016929626, "grad_norm": 1.7544710636138916, "learning_rate": 1.819070911320222e-05} +{"ts": "2025-12-22T16:56:52", "event": "train_log", "step": 191, "epoch": 0.5571558796718322, "progress_pct": 27.84, "epoch_pct": 27.86, "eta": "13:59:56", "max_grad_norm": 1.0, "loss": 0.5848966240882874, "grad_norm": 1.3367774486541748, "learning_rate": 1.8161392246827546e-05} +{"ts": "2025-12-22T16:58:28", "event": "train_log", "step": 192, "epoch": 0.5600729261622607, "progress_pct": 27.99, "epoch_pct": 28.0, "eta": "13:57:58", "max_grad_norm": 1.0, "loss": 0.6621730327606201, "grad_norm": 1.696418046951294, "learning_rate": 1.8131863791626263e-05} +{"ts": "2025-12-22T17:00:04", "event": "train_log", "step": 193, "epoch": 0.5629899726526891, "progress_pct": 28.13, "epoch_pct": 28.15, "eta": "13:56:02", "max_grad_norm": 1.0, "loss": 0.5972204208374023, "grad_norm": 1.360052227973938, "learning_rate": 1.8102124513140694e-05} +{"ts": "2025-12-22T17:01:43", "event": "train_log", "step": 194, "epoch": 0.5659070191431176, "progress_pct": 28.28, "epoch_pct": 28.3, "eta": "13:54:13", "max_grad_norm": 1.0, "loss": 0.4938785433769226, "grad_norm": 1.5376263856887817, "learning_rate": 1.807217518237888e-05} +{"ts": "2025-12-22T17:03:21", "event": "train_log", "step": 195, "epoch": 0.568824065633546, "progress_pct": 28.43, "epoch_pct": 28.44, "eta": "13:52:22", "max_grad_norm": 1.0, "loss": 0.5366095304489136, "grad_norm": 1.2249681949615479, "learning_rate": 1.8042016575794585e-05} +{"ts": "2025-12-22T17:04:59", "event": "train_log", "step": 196, "epoch": 0.5717411121239745, "progress_pct": 28.57, "epoch_pct": 28.59, "eta": "13:50:31", "max_grad_norm": 1.0, "loss": 0.5116773843765259, "grad_norm": 1.7868080139160156, "learning_rate": 1.8011649475267178e-05} +{"ts": "2025-12-22T17:06:39", "event": "train_log", "step": 197, "epoch": 0.574658158614403, "progress_pct": 28.72, "epoch_pct": 28.73, "eta": "13:48:45", "max_grad_norm": 1.0, "loss": 0.49072742462158203, "grad_norm": 2.369993209838867, "learning_rate": 1.7981074668081345e-05} +{"ts": "2025-12-22T17:08:19", "event": "train_log", "step": 198, "epoch": 0.5775752051048314, "progress_pct": 28.86, "epoch_pct": 28.88, "eta": "13:46:59", "max_grad_norm": 1.0, "loss": 0.5691611170768738, "grad_norm": 1.0168434381484985, "learning_rate": 1.7950292946906695e-05} +{"ts": "2025-12-22T17:09:56", "event": "train_log", "step": 199, "epoch": 0.5804922515952597, "progress_pct": 29.01, "epoch_pct": 29.02, "eta": "13:45:07", "max_grad_norm": 1.0, "loss": 0.5515039563179016, "grad_norm": 1.2990851402282715, "learning_rate": 1.7919305109777195e-05} +{"ts": "2025-12-22T17:11:34", "event": "train_log", "step": 200, "epoch": 0.5834092980856882, "progress_pct": 29.15, "epoch_pct": 29.17, "eta": "13:43:15", "max_grad_norm": 1.0, "loss": 0.5017011165618896, "grad_norm": 1.4859853982925415, "learning_rate": 1.7888111960070493e-05} +{"ts": "2025-12-22T17:31:15", "event": "train_log", "step": 200, "epoch": 0.5834092980856882, "progress_pct": 29.15, "epoch_pct": 29.17, "eta": "14:31:05", "max_grad_norm": 1.0, "eval_loss": 0.5414339303970337, "eval_runtime": 1180.7894, "eval_samples_per_second": 0.535, "eval_steps_per_second": 0.535} +{"ts": "2025-12-22T17:32:55", "event": "train_log", "step": 201, "epoch": 0.5863263445761167, "progress_pct": 29.3, "epoch_pct": 29.32, "eta": "14:28:59", "max_grad_norm": 1.0, "loss": 0.5677731037139893, "grad_norm": 1.0065829753875732, "learning_rate": 1.7856714306487088e-05} +{"ts": "2025-12-22T17:34:31", "event": "train_log", "step": 202, "epoch": 0.5892433910665451, "progress_pct": 29.45, "epoch_pct": 29.46, "eta": "14:26:45", "max_grad_norm": 1.0, "loss": 0.4525509476661682, "grad_norm": 1.1727538108825684, "learning_rate": 1.7825112963029352e-05} +{"ts": "2025-12-22T17:36:10", "event": "train_log", "step": 203, "epoch": 0.5921604375569736, "progress_pct": 29.59, "epoch_pct": 29.61, "eta": "14:24:36", "max_grad_norm": 1.0, "loss": 0.5208959579467773, "grad_norm": 1.3376752138137817, "learning_rate": 1.7793308748980437e-05} +{"ts": "2025-12-22T17:37:46", "event": "train_log", "step": 204, "epoch": 0.595077484047402, "progress_pct": 29.74, "epoch_pct": 29.75, "eta": "14:22:22", "max_grad_norm": 1.0, "loss": 0.6033903360366821, "grad_norm": 0.9196159839630127, "learning_rate": 1.776130248888304e-05} +{"ts": "2025-12-22T17:39:25", "event": "train_log", "step": 205, "epoch": 0.5979945305378305, "progress_pct": 29.88, "epoch_pct": 29.9, "eta": "14:20:17", "max_grad_norm": 1.0, "loss": 0.5449609160423279, "grad_norm": 1.0750919580459595, "learning_rate": 1.772909501251801e-05} +{"ts": "2025-12-22T17:41:03", "event": "train_log", "step": 206, "epoch": 0.6009115770282589, "progress_pct": 30.03, "epoch_pct": 30.05, "eta": "14:18:07", "max_grad_norm": 1.0, "loss": 0.5685338377952576, "grad_norm": 1.2459467649459839, "learning_rate": 1.769668715488285e-05} +{"ts": "2025-12-22T17:42:41", "event": "train_log", "step": 207, "epoch": 0.6038286235186874, "progress_pct": 30.17, "epoch_pct": 30.19, "eta": "14:15:58", "max_grad_norm": 1.0, "loss": 0.5240382552146912, "grad_norm": 1.1690552234649658, "learning_rate": 1.766407975617006e-05} +{"ts": "2025-12-22T17:44:18", "event": "train_log", "step": 208, "epoch": 0.6067456700091157, "progress_pct": 30.32, "epoch_pct": 30.34, "eta": "14:13:48", "max_grad_norm": 1.0, "loss": 0.6802893877029419, "grad_norm": 1.0816599130630493, "learning_rate": 1.7631273661745362e-05} +{"ts": "2025-12-22T17:45:53", "event": "train_log", "step": 209, "epoch": 0.6096627164995442, "progress_pct": 30.47, "epoch_pct": 30.48, "eta": "14:11:33", "max_grad_norm": 1.0, "loss": 0.48193931579589844, "grad_norm": 1.3662947416305542, "learning_rate": 1.7598269722125775e-05} +{"ts": "2025-12-22T17:47:29", "event": "train_log", "step": 210, "epoch": 0.6125797629899726, "progress_pct": 30.61, "epoch_pct": 30.63, "eta": "14:09:20", "max_grad_norm": 1.0, "loss": 0.5675849914550781, "grad_norm": 0.9364766478538513, "learning_rate": 1.7565068792957576e-05} +{"ts": "2025-12-22T17:49:06", "event": "train_log", "step": 211, "epoch": 0.6154968094804011, "progress_pct": 30.76, "epoch_pct": 30.77, "eta": "14:07:10", "max_grad_norm": 1.0, "loss": 0.5474762916564941, "grad_norm": 1.123828411102295, "learning_rate": 1.75316717349941e-05} +{"ts": "2025-12-22T17:50:41", "event": "train_log", "step": 212, "epoch": 0.6184138559708295, "progress_pct": 30.9, "epoch_pct": 30.92, "eta": "14:04:58", "max_grad_norm": 1.0, "loss": 0.4918654263019562, "grad_norm": 1.1924363374710083, "learning_rate": 1.749807941407345e-05} +{"ts": "2025-12-22T17:52:17", "event": "train_log", "step": 213, "epoch": 0.621330902461258, "progress_pct": 31.05, "epoch_pct": 31.07, "eta": "14:02:45", "max_grad_norm": 1.0, "loss": 0.5742691159248352, "grad_norm": 1.101293921470642, "learning_rate": 1.7464292701096014e-05} +{"ts": "2025-12-22T17:53:54", "event": "train_log", "step": 214, "epoch": 0.6242479489516864, "progress_pct": 31.2, "epoch_pct": 31.21, "eta": "14:00:36", "max_grad_norm": 1.0, "loss": 0.5828965902328491, "grad_norm": 1.7374963760375977, "learning_rate": 1.7430312472001928e-05} +{"ts": "2025-12-22T17:55:28", "event": "train_log", "step": 215, "epoch": 0.6271649954421149, "progress_pct": 31.34, "epoch_pct": 31.36, "eta": "13:58:22", "max_grad_norm": 1.0, "loss": 0.5265159010887146, "grad_norm": 1.3195666074752808, "learning_rate": 1.739613960774833e-05} +{"ts": "2025-12-22T17:57:07", "event": "train_log", "step": 216, "epoch": 0.6300820419325432, "progress_pct": 31.49, "epoch_pct": 31.5, "eta": "13:56:18", "max_grad_norm": 1.0, "loss": 0.4929371476173401, "grad_norm": 1.254686713218689, "learning_rate": 1.7361774994286545e-05} +{"ts": "2025-12-22T17:58:43", "event": "train_log", "step": 217, "epoch": 0.6329990884229717, "progress_pct": 31.63, "epoch_pct": 31.65, "eta": "13:54:08", "max_grad_norm": 1.0, "loss": 0.5060417652130127, "grad_norm": 1.1476380825042725, "learning_rate": 1.7327219522539102e-05} +{"ts": "2025-12-22T18:00:20", "event": "train_log", "step": 218, "epoch": 0.6359161349134002, "progress_pct": 31.78, "epoch_pct": 31.8, "eta": "13:52:01", "max_grad_norm": 1.0, "loss": 0.504043698310852, "grad_norm": 1.0914150476455688, "learning_rate": 1.7292474088376643e-05} +{"ts": "2025-12-22T18:01:56", "event": "train_log", "step": 219, "epoch": 0.6388331814038286, "progress_pct": 31.92, "epoch_pct": 31.94, "eta": "13:49:50", "max_grad_norm": 1.0, "loss": 0.4797310531139374, "grad_norm": 1.1339508295059204, "learning_rate": 1.7257539592594698e-05} +{"ts": "2025-12-22T18:03:33", "event": "train_log", "step": 220, "epoch": 0.6417502278942571, "progress_pct": 32.07, "epoch_pct": 32.09, "eta": "13:47:44", "max_grad_norm": 1.0, "loss": 0.5878555178642273, "grad_norm": 1.0805399417877197, "learning_rate": 1.722241694089033e-05} +{"ts": "2025-12-22T18:05:10", "event": "train_log", "step": 221, "epoch": 0.6446672743846855, "progress_pct": 32.22, "epoch_pct": 32.23, "eta": "13:45:37", "max_grad_norm": 1.0, "loss": 0.5005823969841003, "grad_norm": 1.8615056276321411, "learning_rate": 1.718710704383865e-05} +{"ts": "2025-12-22T18:06:47", "event": "train_log", "step": 222, "epoch": 0.647584320875114, "progress_pct": 32.36, "epoch_pct": 32.38, "eta": "13:43:31", "max_grad_norm": 1.0, "loss": 0.4949319064617157, "grad_norm": 1.1445401906967163, "learning_rate": 1.7151610816869214e-05} +{"ts": "2025-12-22T18:08:25", "event": "train_log", "step": 223, "epoch": 0.6505013673655424, "progress_pct": 32.51, "epoch_pct": 32.53, "eta": "13:41:26", "max_grad_norm": 1.0, "loss": 0.5073204040527344, "grad_norm": 0.9726515412330627, "learning_rate": 1.711592918024229e-05} +{"ts": "2025-12-22T18:10:04", "event": "train_log", "step": 224, "epoch": 0.6534184138559709, "progress_pct": 32.65, "epoch_pct": 32.67, "eta": "13:39:25", "max_grad_norm": 1.0, "loss": 0.47885262966156006, "grad_norm": 1.4491140842437744, "learning_rate": 1.7080063059024998e-05} +{"ts": "2025-12-22T18:11:43", "event": "train_log", "step": 225, "epoch": 0.6563354603463992, "progress_pct": 32.8, "epoch_pct": 32.82, "eta": "13:37:23", "max_grad_norm": 1.0, "loss": 0.5775837898254395, "grad_norm": 1.0070592164993286, "learning_rate": 1.7044013383067327e-05} +{"ts": "2025-12-22T18:13:20", "event": "train_log", "step": 226, "epoch": 0.6592525068368277, "progress_pct": 32.94, "epoch_pct": 32.96, "eta": "13:35:18", "max_grad_norm": 1.0, "loss": 0.5050399899482727, "grad_norm": 0.966221272945404, "learning_rate": 1.7007781086978037e-05} +{"ts": "2025-12-22T18:14:58", "event": "train_log", "step": 227, "epoch": 0.6621695533272561, "progress_pct": 33.09, "epoch_pct": 33.11, "eta": "13:33:15", "max_grad_norm": 1.0, "loss": 0.5737045407295227, "grad_norm": 0.9808815121650696, "learning_rate": 1.6971367110100407e-05} +{"ts": "2025-12-22T18:16:34", "event": "train_log", "step": 228, "epoch": 0.6650865998176846, "progress_pct": 33.24, "epoch_pct": 33.25, "eta": "13:31:08", "max_grad_norm": 1.0, "loss": 0.48077821731567383, "grad_norm": 1.0158127546310425, "learning_rate": 1.6934772396487906e-05} +{"ts": "2025-12-22T18:18:07", "event": "train_log", "step": 229, "epoch": 0.668003646308113, "progress_pct": 33.38, "epoch_pct": 33.4, "eta": "13:28:55", "max_grad_norm": 1.0, "loss": 0.5614925026893616, "grad_norm": 1.32015860080719, "learning_rate": 1.6897997894879706e-05} +{"ts": "2025-12-22T18:19:39", "event": "train_log", "step": 230, "epoch": 0.6709206927985415, "progress_pct": 33.53, "epoch_pct": 33.55, "eta": "13:26:41", "max_grad_norm": 1.0, "loss": 0.4970760643482208, "grad_norm": 1.1055903434753418, "learning_rate": 1.686104455867608e-05} +{"ts": "2025-12-22T18:21:10", "event": "train_log", "step": 231, "epoch": 0.67383773928897, "progress_pct": 33.67, "epoch_pct": 33.69, "eta": "13:24:25", "max_grad_norm": 1.0, "loss": 0.5540452003479004, "grad_norm": 1.0804500579833984, "learning_rate": 1.682391334591371e-05} +{"ts": "2025-12-22T18:22:42", "event": "train_log", "step": 232, "epoch": 0.6767547857793984, "progress_pct": 33.82, "epoch_pct": 33.84, "eta": "13:22:11", "max_grad_norm": 1.0, "loss": 0.5778501033782959, "grad_norm": 1.1906245946884155, "learning_rate": 1.6786605219240807e-05} +{"ts": "2025-12-22T18:24:16", "event": "train_log", "step": 233, "epoch": 0.6796718322698267, "progress_pct": 33.97, "epoch_pct": 33.98, "eta": "13:20:01", "max_grad_norm": 1.0, "loss": 0.49073565006256104, "grad_norm": 0.9758645296096802, "learning_rate": 1.6749121145892192e-05} +{"ts": "2025-12-22T18:25:50", "event": "train_log", "step": 234, "epoch": 0.6825888787602552, "progress_pct": 34.11, "epoch_pct": 34.13, "eta": "13:17:53", "max_grad_norm": 1.0, "loss": 0.4828741252422333, "grad_norm": 1.1678364276885986, "learning_rate": 1.6711462097664207e-05} +{"ts": "2025-12-22T18:27:21", "event": "train_log", "step": 235, "epoch": 0.6855059252506837, "progress_pct": 34.26, "epoch_pct": 34.28, "eta": "13:15:38", "max_grad_norm": 1.0, "loss": 0.5143818855285645, "grad_norm": 1.148301362991333, "learning_rate": 1.6673629050889507e-05} +{"ts": "2025-12-22T18:28:55", "event": "train_log", "step": 236, "epoch": 0.6884229717411121, "progress_pct": 34.4, "epoch_pct": 34.42, "eta": "13:13:29", "max_grad_norm": 1.0, "loss": 0.5301160216331482, "grad_norm": 1.005898356437683, "learning_rate": 1.6635622986411776e-05} +{"ts": "2025-12-22T18:30:30", "event": "train_log", "step": 237, "epoch": 0.6913400182315406, "progress_pct": 34.55, "epoch_pct": 34.57, "eta": "13:11:23", "max_grad_norm": 1.0, "loss": 0.4800386130809784, "grad_norm": 1.2227320671081543, "learning_rate": 1.659744488956027e-05} +{"ts": "2025-12-22T18:32:06", "event": "train_log", "step": 238, "epoch": 0.694257064721969, "progress_pct": 34.69, "epoch_pct": 34.71, "eta": "13:09:20", "max_grad_norm": 1.0, "loss": 0.5098081827163696, "grad_norm": 0.986456573009491, "learning_rate": 1.6559095750124296e-05} +{"ts": "2025-12-22T18:33:42", "event": "train_log", "step": 239, "epoch": 0.6971741112123975, "progress_pct": 34.84, "epoch_pct": 34.86, "eta": "13:07:15", "max_grad_norm": 1.0, "loss": 0.5147273540496826, "grad_norm": 1.1474376916885376, "learning_rate": 1.6520576562327518e-05} +{"ts": "2025-12-22T18:35:19", "event": "train_log", "step": 240, "epoch": 0.7000911577028259, "progress_pct": 34.99, "epoch_pct": 35.0, "eta": "13:05:13", "max_grad_norm": 1.0, "loss": 0.5023190379142761, "grad_norm": 1.10917067527771, "learning_rate": 1.6481888324802223e-05} +{"ts": "2025-12-22T18:36:55", "event": "train_log", "step": 241, "epoch": 0.7030082041932544, "progress_pct": 35.13, "epoch_pct": 35.15, "eta": "13:03:10", "max_grad_norm": 1.0, "loss": 0.5282092690467834, "grad_norm": 1.2339262962341309, "learning_rate": 1.644303204056341e-05} +{"ts": "2025-12-22T18:38:35", "event": "train_log", "step": 242, "epoch": 0.7059252506836827, "progress_pct": 35.28, "epoch_pct": 35.3, "eta": "13:01:14", "max_grad_norm": 1.0, "loss": 0.5635963082313538, "grad_norm": 0.997941255569458, "learning_rate": 1.640400871698277e-05} +{"ts": "2025-12-22T18:40:15", "event": "train_log", "step": 243, "epoch": 0.7088422971741112, "progress_pct": 35.42, "epoch_pct": 35.44, "eta": "12:59:18", "max_grad_norm": 1.0, "loss": 0.5577977895736694, "grad_norm": 1.0345823764801025, "learning_rate": 1.63648193657626e-05} +{"ts": "2025-12-22T18:41:52", "event": "train_log", "step": 244, "epoch": 0.7117593436645396, "progress_pct": 35.57, "epoch_pct": 35.59, "eta": "12:57:16", "max_grad_norm": 1.0, "loss": 0.4365362524986267, "grad_norm": 1.3468303680419922, "learning_rate": 1.6325465002909554e-05} +{"ts": "2025-12-22T18:43:34", "event": "train_log", "step": 245, "epoch": 0.7146763901549681, "progress_pct": 35.71, "epoch_pct": 35.73, "eta": "12:55:25", "max_grad_norm": 1.0, "loss": 0.46069926023483276, "grad_norm": 1.2817128896713257, "learning_rate": 1.628594664870831e-05} +{"ts": "2025-12-22T18:45:07", "event": "train_log", "step": 246, "epoch": 0.7175934366453965, "progress_pct": 35.86, "epoch_pct": 35.88, "eta": "12:53:17", "max_grad_norm": 1.0, "loss": 0.5476971864700317, "grad_norm": 1.043311357498169, "learning_rate": 1.6246265327695117e-05} +{"ts": "2025-12-22T18:46:47", "event": "train_log", "step": 247, "epoch": 0.720510483135825, "progress_pct": 36.01, "epoch_pct": 36.03, "eta": "12:51:23", "max_grad_norm": 1.0, "loss": 0.48051249980926514, "grad_norm": 1.0297389030456543, "learning_rate": 1.620642206863124e-05} +{"ts": "2025-12-22T18:48:25", "event": "train_log", "step": 248, "epoch": 0.7234275296262535, "progress_pct": 36.15, "epoch_pct": 36.17, "eta": "12:49:24", "max_grad_norm": 1.0, "loss": 0.5683314800262451, "grad_norm": 1.4869836568832397, "learning_rate": 1.6166417904476257e-05} +{"ts": "2025-12-22T18:50:00", "event": "train_log", "step": 249, "epoch": 0.7263445761166819, "progress_pct": 36.3, "epoch_pct": 36.32, "eta": "12:47:21", "max_grad_norm": 1.0, "loss": 0.5277887582778931, "grad_norm": 1.0628005266189575, "learning_rate": 1.6126253872361336e-05} +{"ts": "2025-12-22T18:51:35", "event": "train_log", "step": 250, "epoch": 0.7292616226071102, "progress_pct": 36.44, "epoch_pct": 36.46, "eta": "12:45:16", "max_grad_norm": 1.0, "loss": 0.5048879384994507, "grad_norm": 1.2682170867919922, "learning_rate": 1.608593101356229e-05} +{"ts": "2025-12-22T19:11:10", "event": "train_log", "step": 250, "epoch": 0.7292616226071102, "progress_pct": 36.44, "epoch_pct": 36.46, "eta": "13:19:26", "max_grad_norm": 1.0, "eval_loss": 0.5038471221923828, "eval_runtime": 1175.0375, "eval_samples_per_second": 0.538, "eval_steps_per_second": 0.538} +{"ts": "2025-12-22T19:12:52", "event": "train_log", "step": 251, "epoch": 0.7321786690975387, "progress_pct": 36.59, "epoch_pct": 36.61, "eta": "13:17:22", "max_grad_norm": 1.0, "loss": 0.5093721151351929, "grad_norm": 1.7376199960708618, "learning_rate": 1.6045450373472626e-05} +{"ts": "2025-12-22T19:14:28", "event": "train_log", "step": 252, "epoch": 0.7350957155879672, "progress_pct": 36.73, "epoch_pct": 36.75, "eta": "13:15:07", "max_grad_norm": 1.0, "loss": 0.4796055555343628, "grad_norm": 1.6047718524932861, "learning_rate": 1.6004813001576405e-05} +{"ts": "2025-12-22T19:16:06", "event": "train_log", "step": 253, "epoch": 0.7380127620783956, "progress_pct": 36.88, "epoch_pct": 36.9, "eta": "13:12:58", "max_grad_norm": 1.0, "loss": 0.4733014702796936, "grad_norm": 1.3582886457443237, "learning_rate": 1.5964019951421058e-05} +{"ts": "2025-12-22T19:17:45", "event": "train_log", "step": 254, "epoch": 0.7409298085688241, "progress_pct": 37.03, "epoch_pct": 37.05, "eta": "13:10:49", "max_grad_norm": 1.0, "loss": 0.5312032103538513, "grad_norm": 0.9468897581100464, "learning_rate": 1.5923072280590072e-05} +{"ts": "2025-12-22T19:19:20", "event": "train_log", "step": 255, "epoch": 0.7438468550592525, "progress_pct": 37.17, "epoch_pct": 37.19, "eta": "13:08:35", "max_grad_norm": 1.0, "loss": 0.47576645016670227, "grad_norm": 1.3890198469161987, "learning_rate": 1.5881971050675547e-05} +{"ts": "2025-12-22T19:20:55", "event": "train_log", "step": 256, "epoch": 0.746763901549681, "progress_pct": 37.32, "epoch_pct": 37.34, "eta": "13:06:20", "max_grad_norm": 1.0, "loss": 0.5555092096328735, "grad_norm": 1.782992959022522, "learning_rate": 1.584071732725071e-05} +{"ts": "2025-12-22T19:22:33", "event": "train_log", "step": 257, "epoch": 0.7496809480401094, "progress_pct": 37.46, "epoch_pct": 37.48, "eta": "13:04:11", "max_grad_norm": 1.0, "loss": 0.5148727893829346, "grad_norm": 1.1790621280670166, "learning_rate": 1.5799312179842265e-05} +{"ts": "2025-12-22T19:24:07", "event": "train_log", "step": 258, "epoch": 0.7525979945305379, "progress_pct": 37.61, "epoch_pct": 37.63, "eta": "13:01:55", "max_grad_norm": 1.0, "loss": 0.49939870834350586, "grad_norm": 1.446694254875183, "learning_rate": 1.5757756681902664e-05} +{"ts": "2025-12-22T19:25:44", "event": "train_log", "step": 259, "epoch": 0.7555150410209662, "progress_pct": 37.76, "epoch_pct": 37.78, "eta": "12:59:45", "max_grad_norm": 1.0, "loss": 0.562156081199646, "grad_norm": 1.1786166429519653, "learning_rate": 1.571605191078229e-05} +{"ts": "2025-12-22T19:27:22", "event": "train_log", "step": 260, "epoch": 0.7584320875113947, "progress_pct": 37.9, "epoch_pct": 37.92, "eta": "12:57:36", "max_grad_norm": 1.0, "loss": 0.49580734968185425, "grad_norm": 1.16925847530365, "learning_rate": 1.567419894770151e-05} +{"ts": "2025-12-22T19:28:58", "event": "train_log", "step": 261, "epoch": 0.7613491340018231, "progress_pct": 38.05, "epoch_pct": 38.07, "eta": "12:55:24", "max_grad_norm": 1.0, "loss": 0.4821680784225464, "grad_norm": 1.60944664478302, "learning_rate": 1.5632198877722676e-05} +{"ts": "2025-12-22T19:30:34", "event": "train_log", "step": 262, "epoch": 0.7642661804922516, "progress_pct": 38.19, "epoch_pct": 38.21, "eta": "12:53:13", "max_grad_norm": 1.0, "loss": 0.4392276406288147, "grad_norm": 1.3957884311676025, "learning_rate": 1.5590052789721946e-05} +{"ts": "2025-12-22T19:32:09", "event": "train_log", "step": 263, "epoch": 0.76718322698268, "progress_pct": 38.34, "epoch_pct": 38.36, "eta": "12:51:00", "max_grad_norm": 1.0, "loss": 0.39603114128112793, "grad_norm": 1.636195421218872, "learning_rate": 1.5547761776361096e-05} +{"ts": "2025-12-22T19:33:44", "event": "train_log", "step": 264, "epoch": 0.7701002734731085, "progress_pct": 38.48, "epoch_pct": 38.51, "eta": "12:48:49", "max_grad_norm": 1.0, "loss": 0.4833749234676361, "grad_norm": 1.496766448020935, "learning_rate": 1.550532693405917e-05} +{"ts": "2025-12-22T19:35:18", "event": "train_log", "step": 265, "epoch": 0.773017319963537, "progress_pct": 38.63, "epoch_pct": 38.65, "eta": "12:46:34", "max_grad_norm": 1.0, "loss": 0.43738317489624023, "grad_norm": 1.3587844371795654, "learning_rate": 1.5462749362964058e-05} +{"ts": "2025-12-22T19:36:51", "event": "train_log", "step": 266, "epoch": 0.7759343664539654, "progress_pct": 38.78, "epoch_pct": 38.8, "eta": "12:44:19", "max_grad_norm": 1.0, "loss": 0.4476737380027771, "grad_norm": 1.670704960823059, "learning_rate": 1.5420030166923983e-05} +{"ts": "2025-12-22T19:38:23", "event": "train_log", "step": 267, "epoch": 0.7788514129443938, "progress_pct": 38.92, "epoch_pct": 38.94, "eta": "12:42:03", "max_grad_norm": 1.0, "loss": 0.42266708612442017, "grad_norm": 1.2674932479858398, "learning_rate": 1.537717045345888e-05} +{"ts": "2025-12-22T19:39:55", "event": "train_log", "step": 268, "epoch": 0.7817684594348222, "progress_pct": 39.07, "epoch_pct": 39.09, "eta": "12:39:47", "max_grad_norm": 1.0, "loss": 0.5245381593704224, "grad_norm": 2.0639536380767822, "learning_rate": 1.5334171333731666e-05} +{"ts": "2025-12-22T19:41:28", "event": "train_log", "step": 269, "epoch": 0.7846855059252507, "progress_pct": 39.21, "epoch_pct": 39.23, "eta": "12:37:33", "max_grad_norm": 1.0, "loss": 0.5166443586349487, "grad_norm": 1.2091766595840454, "learning_rate": 1.529103392251946e-05} +{"ts": "2025-12-22T19:43:08", "event": "train_log", "step": 270, "epoch": 0.7876025524156791, "progress_pct": 39.36, "epoch_pct": 39.38, "eta": "12:35:31", "max_grad_norm": 1.0, "loss": 0.5674265027046204, "grad_norm": 1.1021631956100464, "learning_rate": 1.5247759338184653e-05} +{"ts": "2025-12-22T19:44:50", "event": "train_log", "step": 271, "epoch": 0.7905195989061076, "progress_pct": 39.5, "epoch_pct": 39.53, "eta": "12:33:31", "max_grad_norm": 1.0, "loss": 0.40855613350868225, "grad_norm": 1.3143829107284546, "learning_rate": 1.520434870264595e-05} +{"ts": "2025-12-22T19:46:33", "event": "train_log", "step": 272, "epoch": 0.793436645396536, "progress_pct": 39.65, "epoch_pct": 39.67, "eta": "12:31:33", "max_grad_norm": 1.0, "loss": 0.4308925271034241, "grad_norm": 1.1784812211990356, "learning_rate": 1.5160803141349244e-05} +{"ts": "2025-12-22T19:48:13", "event": "train_log", "step": 273, "epoch": 0.7963536918869645, "progress_pct": 39.8, "epoch_pct": 39.82, "eta": "12:29:31", "max_grad_norm": 1.0, "loss": 0.45035502314567566, "grad_norm": 2.1635706424713135, "learning_rate": 1.5117123783238458e-05} +{"ts": "2025-12-22T19:49:54", "event": "train_log", "step": 274, "epoch": 0.7992707383773929, "progress_pct": 39.94, "epoch_pct": 39.96, "eta": "12:27:29", "max_grad_norm": 1.0, "loss": 0.5095728635787964, "grad_norm": 1.569203495979309, "learning_rate": 1.5073311760726287e-05} +{"ts": "2025-12-22T19:51:32", "event": "train_log", "step": 275, "epoch": 0.8021877848678214, "progress_pct": 40.09, "epoch_pct": 40.11, "eta": "12:25:25", "max_grad_norm": 1.0, "loss": 0.496748685836792, "grad_norm": 2.532621383666992, "learning_rate": 1.5029368209664822e-05} +{"ts": "2025-12-22T19:53:10", "event": "train_log", "step": 276, "epoch": 0.8051048313582497, "progress_pct": 40.23, "epoch_pct": 40.26, "eta": "12:23:20", "max_grad_norm": 1.0, "loss": 0.4972914159297943, "grad_norm": 1.6312552690505981, "learning_rate": 1.4985294269316098e-05} +{"ts": "2025-12-22T19:54:47", "event": "train_log", "step": 277, "epoch": 0.8080218778486782, "progress_pct": 40.38, "epoch_pct": 40.4, "eta": "12:21:14", "max_grad_norm": 1.0, "loss": 0.5589750409126282, "grad_norm": 1.3996756076812744, "learning_rate": 1.4941091082322579e-05} +{"ts": "2025-12-22T19:56:23", "event": "train_log", "step": 278, "epoch": 0.8109389243391066, "progress_pct": 40.52, "epoch_pct": 40.55, "eta": "12:19:06", "max_grad_norm": 1.0, "loss": 0.5349453687667847, "grad_norm": 1.1288363933563232, "learning_rate": 1.4896759794677526e-05} +{"ts": "2025-12-22T19:57:59", "event": "train_log", "step": 279, "epoch": 0.8138559708295351, "progress_pct": 40.67, "epoch_pct": 40.69, "eta": "12:16:59", "max_grad_norm": 1.0, "loss": 0.46511000394821167, "grad_norm": 1.6913920640945435, "learning_rate": 1.4852301555695268e-05} +{"ts": "2025-12-22T19:59:36", "event": "train_log", "step": 280, "epoch": 0.8167730173199635, "progress_pct": 40.82, "epoch_pct": 40.84, "eta": "12:14:53", "max_grad_norm": 1.0, "loss": 0.4715422987937927, "grad_norm": 1.1913212537765503, "learning_rate": 1.4807717517981439e-05} +{"ts": "2025-12-22T20:01:10", "event": "train_log", "step": 281, "epoch": 0.819690063810392, "progress_pct": 40.96, "epoch_pct": 40.98, "eta": "12:12:44", "max_grad_norm": 1.0, "loss": 0.53330397605896, "grad_norm": 1.1179691553115845, "learning_rate": 1.476300883740307e-05} +{"ts": "2025-12-22T20:02:47", "event": "train_log", "step": 282, "epoch": 0.8226071103008205, "progress_pct": 41.11, "epoch_pct": 41.13, "eta": "12:10:38", "max_grad_norm": 1.0, "loss": 0.47564437985420227, "grad_norm": 1.7473797798156738, "learning_rate": 1.4718176673058624e-05} +{"ts": "2025-12-22T20:04:22", "event": "train_log", "step": 283, "epoch": 0.8255241567912489, "progress_pct": 41.25, "epoch_pct": 41.28, "eta": "12:08:31", "max_grad_norm": 1.0, "loss": 0.46364277601242065, "grad_norm": 1.2653177976608276, "learning_rate": 1.4673222187247963e-05} +{"ts": "2025-12-22T20:06:01", "event": "train_log", "step": 284, "epoch": 0.8284412032816773, "progress_pct": 41.4, "epoch_pct": 41.42, "eta": "12:06:29", "max_grad_norm": 1.0, "loss": 0.4778091013431549, "grad_norm": 1.2567330598831177, "learning_rate": 1.4628146545442202e-05} +{"ts": "2025-12-22T20:07:37", "event": "train_log", "step": 285, "epoch": 0.8313582497721057, "progress_pct": 41.55, "epoch_pct": 41.57, "eta": "12:04:24", "max_grad_norm": 1.0, "loss": 0.4480203688144684, "grad_norm": 1.5848406553268433, "learning_rate": 1.4582950916253488e-05} +{"ts": "2025-12-22T20:09:12", "event": "train_log", "step": 286, "epoch": 0.8342752962625342, "progress_pct": 41.69, "epoch_pct": 41.71, "eta": "12:02:16", "max_grad_norm": 1.0, "loss": 0.37945032119750977, "grad_norm": 1.3278183937072754, "learning_rate": 1.453763647140472e-05} +{"ts": "2025-12-22T20:10:47", "event": "train_log", "step": 287, "epoch": 0.8371923427529626, "progress_pct": 41.84, "epoch_pct": 41.86, "eta": "12:00:09", "max_grad_norm": 1.0, "loss": 0.5306747555732727, "grad_norm": 1.0961651802062988, "learning_rate": 1.4492204385699155e-05} +{"ts": "2025-12-22T20:12:22", "event": "train_log", "step": 288, "epoch": 0.8401093892433911, "progress_pct": 41.98, "epoch_pct": 42.01, "eta": "11:58:03", "max_grad_norm": 1.0, "loss": 0.49950045347213745, "grad_norm": 1.176276683807373, "learning_rate": 1.4446655836989961e-05} +{"ts": "2025-12-22T20:14:03", "event": "train_log", "step": 289, "epoch": 0.8430264357338195, "progress_pct": 42.13, "epoch_pct": 42.15, "eta": "11:56:04", "max_grad_norm": 1.0, "loss": 0.494475394487381, "grad_norm": 1.2228269577026367, "learning_rate": 1.4400992006149674e-05} +{"ts": "2025-12-22T20:15:42", "event": "train_log", "step": 290, "epoch": 0.845943482224248, "progress_pct": 42.27, "epoch_pct": 42.3, "eta": "11:54:03", "max_grad_norm": 1.0, "loss": 0.44170859456062317, "grad_norm": 1.1584209203720093, "learning_rate": 1.4355214077039592e-05} +{"ts": "2025-12-22T20:17:17", "event": "train_log", "step": 291, "epoch": 0.8488605287146764, "progress_pct": 42.42, "epoch_pct": 42.44, "eta": "11:51:57", "max_grad_norm": 1.0, "loss": 0.4359871745109558, "grad_norm": 1.2041938304901123, "learning_rate": 1.4309323236479071e-05} +{"ts": "2025-12-22T20:18:55", "event": "train_log", "step": 292, "epoch": 0.8517775752051049, "progress_pct": 42.57, "epoch_pct": 42.59, "eta": "11:49:56", "max_grad_norm": 1.0, "loss": 0.45031386613845825, "grad_norm": 1.279645562171936, "learning_rate": 1.4263320674214762e-05} +{"ts": "2025-12-22T20:20:30", "event": "train_log", "step": 293, "epoch": 0.8546946216955332, "progress_pct": 42.71, "epoch_pct": 42.73, "eta": "11:47:50", "max_grad_norm": 1.0, "loss": 0.4832204580307007, "grad_norm": 1.3958357572555542, "learning_rate": 1.4217207582889769e-05} +{"ts": "2025-12-22T20:22:08", "event": "train_log", "step": 294, "epoch": 0.8576116681859617, "progress_pct": 42.86, "epoch_pct": 42.88, "eta": "11:45:48", "max_grad_norm": 1.0, "loss": 0.5154346227645874, "grad_norm": 1.2788586616516113, "learning_rate": 1.4170985158012725e-05} +{"ts": "2025-12-22T20:23:43", "event": "train_log", "step": 295, "epoch": 0.8605287146763901, "progress_pct": 43.0, "epoch_pct": 43.03, "eta": "11:43:44", "max_grad_norm": 1.0, "loss": 0.46777206659317017, "grad_norm": 1.3634892702102661, "learning_rate": 1.4124654597926795e-05} +{"ts": "2025-12-22T20:25:23", "event": "train_log", "step": 296, "epoch": 0.8634457611668186, "progress_pct": 43.15, "epoch_pct": 43.17, "eta": "11:41:45", "max_grad_norm": 1.0, "loss": 0.4247053265571594, "grad_norm": 1.2719579935073853, "learning_rate": 1.4078217103778619e-05} +{"ts": "2025-12-22T20:26:57", "event": "train_log", "step": 297, "epoch": 0.866362807657247, "progress_pct": 43.29, "epoch_pct": 43.32, "eta": "11:39:39", "max_grad_norm": 1.0, "loss": 0.38349640369415283, "grad_norm": 2.890467643737793, "learning_rate": 1.4031673879487161e-05} +{"ts": "2025-12-22T20:28:33", "event": "train_log", "step": 298, "epoch": 0.8692798541476755, "progress_pct": 43.44, "epoch_pct": 43.46, "eta": "11:37:36", "max_grad_norm": 1.0, "loss": 0.4134889543056488, "grad_norm": 2.4354801177978516, "learning_rate": 1.3985026131712499e-05} +{"ts": "2025-12-22T20:30:09", "event": "train_log", "step": 299, "epoch": 0.872196900638104, "progress_pct": 43.59, "epoch_pct": 43.61, "eta": "11:35:31", "max_grad_norm": 1.0, "loss": 0.5176680684089661, "grad_norm": 1.0138323307037354, "learning_rate": 1.3938275069824541e-05} +{"ts": "2025-12-22T20:31:44", "event": "train_log", "step": 300, "epoch": 0.8751139471285324, "progress_pct": 43.73, "epoch_pct": 43.76, "eta": "11:33:28", "max_grad_norm": 1.0, "loss": 0.4818477928638458, "grad_norm": 1.2316186428070068, "learning_rate": 1.389142190587168e-05} +{"ts": "2025-12-22T20:51:33", "event": "train_log", "step": 300, "epoch": 0.8751139471285324, "progress_pct": 43.73, "epoch_pct": 43.76, "eta": "11:58:58", "max_grad_norm": 1.0, "eval_loss": 0.4752846360206604, "eval_runtime": 1189.1666, "eval_samples_per_second": 0.531, "eval_steps_per_second": 0.531} +{"ts": "2025-12-22T20:53:13", "event": "train_log", "step": 301, "epoch": 0.8780309936189608, "progress_pct": 43.88, "epoch_pct": 43.9, "eta": "11:56:51", "max_grad_norm": 1.0, "loss": 0.47766175866127014, "grad_norm": 1.515487551689148, "learning_rate": 1.384446785454936e-05} +{"ts": "2025-12-22T20:54:53", "event": "train_log", "step": 302, "epoch": 0.8809480401093892, "progress_pct": 44.02, "epoch_pct": 44.05, "eta": "11:54:43", "max_grad_norm": 1.0, "loss": 0.49297061562538147, "grad_norm": 1.4357497692108154, "learning_rate": 1.3797414133168591e-05} +{"ts": "2025-12-22T20:56:33", "event": "train_log", "step": 303, "epoch": 0.8838650865998177, "progress_pct": 44.17, "epoch_pct": 44.19, "eta": "11:52:37", "max_grad_norm": 1.0, "loss": 0.4629015326499939, "grad_norm": 1.2523037195205688, "learning_rate": 1.3750261961624383e-05} +{"ts": "2025-12-22T20:58:11", "event": "train_log", "step": 304, "epoch": 0.8867821330902461, "progress_pct": 44.31, "epoch_pct": 44.34, "eta": "11:50:29", "max_grad_norm": 1.0, "loss": 0.3773120045661926, "grad_norm": 3.5790023803710938, "learning_rate": 1.3703012562364124e-05} +{"ts": "2025-12-22T20:59:45", "event": "train_log", "step": 305, "epoch": 0.8896991795806746, "progress_pct": 44.46, "epoch_pct": 44.48, "eta": "11:48:15", "max_grad_norm": 1.0, "loss": 0.496719628572464, "grad_norm": 1.9305704832077026, "learning_rate": 1.3655667160355892e-05} +{"ts": "2025-12-22T21:01:18", "event": "train_log", "step": 306, "epoch": 0.892616226071103, "progress_pct": 44.61, "epoch_pct": 44.63, "eta": "11:46:00", "max_grad_norm": 1.0, "loss": 0.49487072229385376, "grad_norm": 1.1506154537200928, "learning_rate": 1.3608226983056687e-05} +{"ts": "2025-12-22T21:02:50", "event": "train_log", "step": 307, "epoch": 0.8955332725615315, "progress_pct": 44.75, "epoch_pct": 44.78, "eta": "11:43:45", "max_grad_norm": 1.0, "loss": 0.4910697937011719, "grad_norm": 1.8046090602874756, "learning_rate": 1.3560693260380614e-05} +{"ts": "2025-12-22T21:04:22", "event": "train_log", "step": 308, "epoch": 0.8984503190519599, "progress_pct": 44.9, "epoch_pct": 44.92, "eta": "11:41:30", "max_grad_norm": 1.0, "loss": 0.508246660232544, "grad_norm": 2.0088653564453125, "learning_rate": 1.3513067224667e-05} +{"ts": "2025-12-22T21:05:58", "event": "train_log", "step": 309, "epoch": 0.9013673655423883, "progress_pct": 45.04, "epoch_pct": 45.07, "eta": "11:39:19", "max_grad_norm": 1.0, "loss": 0.5125166177749634, "grad_norm": 1.2966033220291138, "learning_rate": 1.3465350110648437e-05} +{"ts": "2025-12-22T21:07:33", "event": "train_log", "step": 310, "epoch": 0.9042844120328167, "progress_pct": 45.19, "epoch_pct": 45.21, "eta": "11:37:08", "max_grad_norm": 1.0, "loss": 0.43942537903785706, "grad_norm": 1.9976309537887573, "learning_rate": 1.3417543155418775e-05} +{"ts": "2025-12-22T21:09:09", "event": "train_log", "step": 311, "epoch": 0.9072014585232452, "progress_pct": 45.34, "epoch_pct": 45.36, "eta": "11:34:59", "max_grad_norm": 1.0, "loss": 0.4839101731777191, "grad_norm": 1.2663682699203491, "learning_rate": 1.336964759840105e-05} +{"ts": "2025-12-22T21:10:44", "event": "train_log", "step": 312, "epoch": 0.9101185050136736, "progress_pct": 45.48, "epoch_pct": 45.51, "eta": "11:32:48", "max_grad_norm": 1.0, "loss": 0.48008066415786743, "grad_norm": 1.1223328113555908, "learning_rate": 1.3321664681315354e-05} +{"ts": "2025-12-22T21:12:22", "event": "train_log", "step": 313, "epoch": 0.9130355515041021, "progress_pct": 45.63, "epoch_pct": 45.65, "eta": "11:30:41", "max_grad_norm": 1.0, "loss": 0.47250309586524963, "grad_norm": 1.5786972045898438, "learning_rate": 1.3273595648146634e-05} +{"ts": "2025-12-22T21:13:58", "event": "train_log", "step": 314, "epoch": 0.9159525979945305, "progress_pct": 45.77, "epoch_pct": 45.8, "eta": "11:28:33", "max_grad_norm": 1.0, "loss": 0.5149738788604736, "grad_norm": 1.2150241136550903, "learning_rate": 1.322544174511245e-05} +{"ts": "2025-12-22T21:15:36", "event": "train_log", "step": 315, "epoch": 0.918869644484959, "progress_pct": 45.92, "epoch_pct": 45.94, "eta": "11:26:26", "max_grad_norm": 1.0, "loss": 0.4430195093154907, "grad_norm": 1.3676542043685913, "learning_rate": 1.3177204220630662e-05} +{"ts": "2025-12-22T21:17:10", "event": "train_log", "step": 316, "epoch": 0.9217866909753875, "progress_pct": 46.06, "epoch_pct": 46.09, "eta": "11:24:15", "max_grad_norm": 1.0, "loss": 0.4798983037471771, "grad_norm": 1.0703285932540894, "learning_rate": 1.3128884325287064e-05} +{"ts": "2025-12-22T21:18:46", "event": "train_log", "step": 317, "epoch": 0.9247037374658159, "progress_pct": 46.21, "epoch_pct": 46.24, "eta": "11:22:07", "max_grad_norm": 1.0, "loss": 0.4241073727607727, "grad_norm": 1.3131535053253174, "learning_rate": 1.308048331180296e-05} +{"ts": "2025-12-22T21:20:29", "event": "train_log", "step": 318, "epoch": 0.9276207839562443, "progress_pct": 46.36, "epoch_pct": 46.38, "eta": "11:20:07", "max_grad_norm": 1.0, "loss": 0.527199923992157, "grad_norm": 1.4485348463058472, "learning_rate": 1.3032002435002698e-05} +{"ts": "2025-12-22T21:22:02", "event": "train_log", "step": 319, "epoch": 0.9305378304466727, "progress_pct": 46.5, "epoch_pct": 46.53, "eta": "11:17:55", "max_grad_norm": 1.0, "loss": 0.47125962376594543, "grad_norm": 1.370936393737793, "learning_rate": 1.2983442951781114e-05} +{"ts": "2025-12-22T21:23:42", "event": "train_log", "step": 320, "epoch": 0.9334548769371012, "progress_pct": 46.65, "epoch_pct": 46.67, "eta": "11:15:52", "max_grad_norm": 1.0, "loss": 0.4814244210720062, "grad_norm": 1.2369643449783325, "learning_rate": 1.2934806121070973e-05} +{"ts": "2025-12-22T21:25:20", "event": "train_log", "step": 321, "epoch": 0.9363719234275296, "progress_pct": 46.79, "epoch_pct": 46.82, "eta": "11:13:46", "max_grad_norm": 1.0, "loss": 0.4915548264980316, "grad_norm": 1.2632933855056763, "learning_rate": 1.2886093203810314e-05} +{"ts": "2025-12-22T21:27:00", "event": "train_log", "step": 322, "epoch": 0.9392889699179581, "progress_pct": 46.94, "epoch_pct": 46.96, "eta": "11:11:44", "max_grad_norm": 1.0, "loss": 0.5325602293014526, "grad_norm": 1.054569959640503, "learning_rate": 1.2837305462909764e-05} +{"ts": "2025-12-22T21:28:36", "event": "train_log", "step": 323, "epoch": 0.9422060164083865, "progress_pct": 47.08, "epoch_pct": 47.11, "eta": "11:09:37", "max_grad_norm": 1.0, "loss": 0.43607404828071594, "grad_norm": 1.15959632396698, "learning_rate": 1.27884441632198e-05} +{"ts": "2025-12-22T21:30:11", "event": "train_log", "step": 324, "epoch": 0.945123062898815, "progress_pct": 47.23, "epoch_pct": 47.26, "eta": "11:07:28", "max_grad_norm": 1.0, "loss": 0.4631507992744446, "grad_norm": 1.1667979955673218, "learning_rate": 1.2739510571497945e-05} +{"ts": "2025-12-22T21:31:46", "event": "train_log", "step": 325, "epoch": 0.9480401093892434, "progress_pct": 47.38, "epoch_pct": 47.4, "eta": "11:05:20", "max_grad_norm": 1.0, "loss": 0.4935731887817383, "grad_norm": 1.6009081602096558, "learning_rate": 1.2690505956375944e-05} +{"ts": "2025-12-22T21:33:21", "event": "train_log", "step": 326, "epoch": 0.9509571558796718, "progress_pct": 47.52, "epoch_pct": 47.55, "eta": "11:03:13", "max_grad_norm": 1.0, "loss": 0.45883435010910034, "grad_norm": 1.1193996667861938, "learning_rate": 1.2641431588326858e-05} +{"ts": "2025-12-22T21:34:59", "event": "train_log", "step": 327, "epoch": 0.9538742023701002, "progress_pct": 47.67, "epoch_pct": 47.69, "eta": "11:01:08", "max_grad_norm": 1.0, "loss": 0.5206276178359985, "grad_norm": 1.5365067720413208, "learning_rate": 1.2592288739632138e-05} +{"ts": "2025-12-22T21:36:33", "event": "train_log", "step": 328, "epoch": 0.9567912488605287, "progress_pct": 47.81, "epoch_pct": 47.84, "eta": "10:59:00", "max_grad_norm": 1.0, "loss": 0.5242853760719299, "grad_norm": 1.0714622735977173, "learning_rate": 1.2543078684348632e-05} +{"ts": "2025-12-22T21:38:11", "event": "train_log", "step": 329, "epoch": 0.9597082953509571, "progress_pct": 47.96, "epoch_pct": 47.99, "eta": "10:56:55", "max_grad_norm": 1.0, "loss": 0.4794357717037201, "grad_norm": 1.3009248971939087, "learning_rate": 1.2493802698275557e-05} +{"ts": "2025-12-22T21:39:45", "event": "train_log", "step": 330, "epoch": 0.9626253418413856, "progress_pct": 48.1, "epoch_pct": 48.13, "eta": "10:54:47", "max_grad_norm": 1.0, "loss": 0.5849282145500183, "grad_norm": 1.495771050453186, "learning_rate": 1.244446205892143e-05} +{"ts": "2025-12-22T21:41:24", "event": "train_log", "step": 331, "epoch": 0.965542388331814, "progress_pct": 48.25, "epoch_pct": 48.28, "eta": "10:52:45", "max_grad_norm": 1.0, "loss": 0.47758305072784424, "grad_norm": 1.2046003341674805, "learning_rate": 1.2395058045470935e-05} +{"ts": "2025-12-22T21:43:01", "event": "train_log", "step": 332, "epoch": 0.9684594348222425, "progress_pct": 48.4, "epoch_pct": 48.42, "eta": "10:50:40", "max_grad_norm": 1.0, "loss": 0.4490663409233093, "grad_norm": 1.1362569332122803, "learning_rate": 1.2345591938751772e-05} +{"ts": "2025-12-22T21:44:39", "event": "train_log", "step": 333, "epoch": 0.971376481312671, "progress_pct": 48.54, "epoch_pct": 48.57, "eta": "10:48:37", "max_grad_norm": 1.0, "loss": 0.4035309851169586, "grad_norm": 1.2658129930496216, "learning_rate": 1.2296065021201438e-05} +{"ts": "2025-12-22T21:46:14", "event": "train_log", "step": 334, "epoch": 0.9742935278030994, "progress_pct": 48.69, "epoch_pct": 48.71, "eta": "10:46:31", "max_grad_norm": 1.0, "loss": 0.495273619890213, "grad_norm": 4.370306015014648, "learning_rate": 1.2246478576833993e-05} +{"ts": "2025-12-22T21:47:51", "event": "train_log", "step": 335, "epoch": 0.9772105742935278, "progress_pct": 48.83, "epoch_pct": 48.86, "eta": "10:44:27", "max_grad_norm": 1.0, "loss": 0.46410733461380005, "grad_norm": 1.3863654136657715, "learning_rate": 1.219683389120676e-05} +{"ts": "2025-12-22T21:49:30", "event": "train_log", "step": 336, "epoch": 0.9801276207839562, "progress_pct": 48.98, "epoch_pct": 49.01, "eta": "10:42:25", "max_grad_norm": 1.0, "loss": 0.4301709830760956, "grad_norm": 1.4544321298599243, "learning_rate": 1.2147132251387004e-05} +{"ts": "2025-12-22T21:51:04", "event": "train_log", "step": 337, "epoch": 0.9830446672743847, "progress_pct": 49.13, "epoch_pct": 49.15, "eta": "10:40:18", "max_grad_norm": 1.0, "loss": 0.48892468214035034, "grad_norm": 1.0852457284927368, "learning_rate": 1.2097374945918554e-05} +{"ts": "2025-12-22T21:52:43", "event": "train_log", "step": 338, "epoch": 0.9859617137648131, "progress_pct": 49.27, "epoch_pct": 49.3, "eta": "10:38:17", "max_grad_norm": 1.0, "loss": 0.4667983055114746, "grad_norm": 1.5062257051467896, "learning_rate": 1.2047563264788412e-05} +{"ts": "2025-12-22T21:54:17", "event": "train_log", "step": 339, "epoch": 0.9888787602552416, "progress_pct": 49.42, "epoch_pct": 49.44, "eta": "10:36:11", "max_grad_norm": 1.0, "loss": 0.4827345013618469, "grad_norm": 1.2472951412200928, "learning_rate": 1.199769849939329e-05} +{"ts": "2025-12-22T21:55:54", "event": "train_log", "step": 340, "epoch": 0.99179580674567, "progress_pct": 49.56, "epoch_pct": 49.59, "eta": "10:34:07", "max_grad_norm": 1.0, "loss": 0.405245304107666, "grad_norm": 1.2589871883392334, "learning_rate": 1.1947781942506151e-05} +{"ts": "2025-12-22T21:57:29", "event": "train_log", "step": 341, "epoch": 0.9947128532360985, "progress_pct": 49.71, "epoch_pct": 49.74, "eta": "10:32:02", "max_grad_norm": 1.0, "loss": 0.37956133484840393, "grad_norm": 1.25636625289917, "learning_rate": 1.1897814888242679e-05} +{"ts": "2025-12-22T21:59:04", "event": "train_log", "step": 342, "epoch": 0.9976298997265269, "progress_pct": 49.85, "epoch_pct": 49.88, "eta": "10:29:57", "max_grad_norm": 1.0, "loss": 0.489456444978714, "grad_norm": 2.7064895629882812, "learning_rate": 1.1847798632027726e-05} +{"ts": "2025-12-22T22:00:20", "event": "train_log", "step": 343, "epoch": 1.0, "progress_pct": 50.0, "epoch_pct": 50.0, "eta": "10:27:33", "max_grad_norm": 1.0, "loss": 0.46473199129104614, "grad_norm": 1.6156240701675415, "learning_rate": 1.1797734470561744e-05} +{"ts": "2025-12-22T22:01:59", "event": "train_log", "step": 344, "epoch": 1.0029170464904285, "progress_pct": 50.15, "epoch_pct": 50.15, "eta": "10:25:32", "max_grad_norm": 1.0, "loss": 0.3504878282546997, "grad_norm": 1.3046343326568604, "learning_rate": 1.1747623701787143e-05} +{"ts": "2025-12-22T22:03:33", "event": "train_log", "step": 345, "epoch": 1.005834092980857, "progress_pct": 50.29, "epoch_pct": 50.29, "eta": "10:23:28", "max_grad_norm": 1.0, "loss": 0.4719260334968567, "grad_norm": 1.414828896522522, "learning_rate": 1.1697467624854666e-05} +{"ts": "2025-12-22T22:05:09", "event": "train_log", "step": 346, "epoch": 1.0087511394712854, "progress_pct": 50.44, "epoch_pct": 50.44, "eta": "10:21:24", "max_grad_norm": 1.0, "loss": 0.45313555002212524, "grad_norm": 1.1873356103897095, "learning_rate": 1.164726754008969e-05} +{"ts": "2025-12-22T22:06:42", "event": "train_log", "step": 347, "epoch": 1.0116681859617138, "progress_pct": 50.58, "epoch_pct": 50.58, "eta": "10:19:19", "max_grad_norm": 1.0, "loss": 0.4365478456020355, "grad_norm": 1.1382380723953247, "learning_rate": 1.1597024748958526e-05} +{"ts": "2025-12-22T22:08:20", "event": "train_log", "step": 348, "epoch": 1.0145852324521423, "progress_pct": 50.73, "epoch_pct": 50.73, "eta": "10:17:18", "max_grad_norm": 1.0, "loss": 0.3694503605365753, "grad_norm": 1.8141961097717285, "learning_rate": 1.1546740554034661e-05} +{"ts": "2025-12-22T22:09:55", "event": "train_log", "step": 349, "epoch": 1.0175022789425707, "progress_pct": 50.87, "epoch_pct": 50.88, "eta": "10:15:14", "max_grad_norm": 1.0, "loss": 0.4755721688270569, "grad_norm": 1.333388328552246, "learning_rate": 1.1496416258965015e-05} +{"ts": "2025-12-22T22:11:28", "event": "train_log", "step": 350, "epoch": 1.0204193254329992, "progress_pct": 51.02, "epoch_pct": 51.02, "eta": "10:13:08", "max_grad_norm": 1.0, "loss": 0.4227846562862396, "grad_norm": 1.3464443683624268, "learning_rate": 1.1446053168436117e-05} +{"ts": "2025-12-22T22:31:42", "event": "train_log", "step": 350, "epoch": 1.0204193254329992, "progress_pct": 51.02, "epoch_pct": 51.02, "eta": "10:32:34", "max_grad_norm": 1.0, "eval_loss": 0.44924086332321167, "eval_runtime": 1214.6648, "eval_samples_per_second": 0.52, "eval_steps_per_second": 0.52} +{"ts": "2025-12-22T22:33:22", "event": "train_log", "step": 351, "epoch": 1.0233363719234276, "progress_pct": 51.17, "epoch_pct": 51.17, "eta": "10:30:28", "max_grad_norm": 1.0, "loss": 0.44300130009651184, "grad_norm": 1.2682689428329468, "learning_rate": 1.1395652588140292e-05} +{"ts": "2025-12-22T22:35:01", "event": "train_log", "step": 352, "epoch": 1.0262534184138559, "progress_pct": 51.31, "epoch_pct": 51.31, "eta": "10:28:22", "max_grad_norm": 1.0, "loss": 0.5106258988380432, "grad_norm": 1.7737696170806885, "learning_rate": 1.1345215824741814e-05} +{"ts": "2025-12-22T22:36:38", "event": "train_log", "step": 353, "epoch": 1.0291704649042843, "progress_pct": 51.46, "epoch_pct": 51.46, "eta": "10:26:14", "max_grad_norm": 1.0, "loss": 0.45930635929107666, "grad_norm": 1.2601238489151, "learning_rate": 1.1294744185843014e-05} +{"ts": "2025-12-22T22:38:14", "event": "train_log", "step": 354, "epoch": 1.0320875113947128, "progress_pct": 51.6, "epoch_pct": 51.6, "eta": "10:24:06", "max_grad_norm": 1.0, "loss": 0.44163084030151367, "grad_norm": 1.2162678241729736, "learning_rate": 1.1244238979950406e-05} +{"ts": "2025-12-22T22:39:50", "event": "train_log", "step": 355, "epoch": 1.0350045578851412, "progress_pct": 51.75, "epoch_pct": 51.75, "eta": "10:21:57", "max_grad_norm": 1.0, "loss": 0.510662317276001, "grad_norm": 1.0905817747116089, "learning_rate": 1.1193701516440733e-05} +{"ts": "2025-12-22T22:41:27", "event": "train_log", "step": 356, "epoch": 1.0379216043755697, "progress_pct": 51.9, "epoch_pct": 51.9, "eta": "10:19:50", "max_grad_norm": 1.0, "loss": 0.5297917127609253, "grad_norm": 0.9624952673912048, "learning_rate": 1.1143133105527048e-05} +{"ts": "2025-12-22T22:43:04", "event": "train_log", "step": 357, "epoch": 1.0408386508659981, "progress_pct": 52.04, "epoch_pct": 52.04, "eta": "10:17:43", "max_grad_norm": 1.0, "loss": 0.4332093596458435, "grad_norm": 1.2757681608200073, "learning_rate": 1.1092535058224725e-05} +{"ts": "2025-12-22T22:44:40", "event": "train_log", "step": 358, "epoch": 1.0437556973564266, "progress_pct": 52.19, "epoch_pct": 52.19, "eta": "10:15:35", "max_grad_norm": 1.0, "loss": 0.4337635040283203, "grad_norm": 1.6885719299316406, "learning_rate": 1.104190868631748e-05} +{"ts": "2025-12-22T22:46:15", "event": "train_log", "step": 359, "epoch": 1.046672743846855, "progress_pct": 52.33, "epoch_pct": 52.33, "eta": "10:13:26", "max_grad_norm": 1.0, "loss": 0.45411020517349243, "grad_norm": 1.175484538078308, "learning_rate": 1.099125530232336e-05} +{"ts": "2025-12-22T22:47:54", "event": "train_log", "step": 360, "epoch": 1.0495897903372835, "progress_pct": 52.48, "epoch_pct": 52.48, "eta": "10:11:21", "max_grad_norm": 1.0, "loss": 0.5333439707756042, "grad_norm": 1.0964939594268799, "learning_rate": 1.0940576219460723e-05} +{"ts": "2025-12-22T22:49:31", "event": "train_log", "step": 361, "epoch": 1.052506836827712, "progress_pct": 52.62, "epoch_pct": 52.63, "eta": "10:09:15", "max_grad_norm": 1.0, "loss": 0.4400906264781952, "grad_norm": 1.5493136644363403, "learning_rate": 1.0889872751614176e-05} +{"ts": "2025-12-22T22:51:06", "event": "train_log", "step": 362, "epoch": 1.0554238833181404, "progress_pct": 52.77, "epoch_pct": 52.77, "eta": "10:07:07", "max_grad_norm": 1.0, "loss": 0.31049978733062744, "grad_norm": 1.2491416931152344, "learning_rate": 1.0839146213300526e-05} +{"ts": "2025-12-22T22:52:42", "event": "train_log", "step": 363, "epoch": 1.0583409298085689, "progress_pct": 52.92, "epoch_pct": 52.92, "eta": "10:05:00", "max_grad_norm": 1.0, "loss": 0.389009028673172, "grad_norm": 1.7213693857192993, "learning_rate": 1.0788397919634694e-05} +{"ts": "2025-12-22T22:54:20", "event": "train_log", "step": 364, "epoch": 1.0612579762989973, "progress_pct": 53.06, "epoch_pct": 53.06, "eta": "10:02:54", "max_grad_norm": 1.0, "loss": 0.4068562984466553, "grad_norm": 1.5405336618423462, "learning_rate": 1.0737629186295621e-05} +{"ts": "2025-12-22T22:55:57", "event": "train_log", "step": 365, "epoch": 1.0641750227894258, "progress_pct": 53.21, "epoch_pct": 53.21, "eta": "10:00:49", "max_grad_norm": 1.0, "loss": 0.47358617186546326, "grad_norm": 1.225455641746521, "learning_rate": 1.0686841329492159e-05} +{"ts": "2025-12-22T22:57:35", "event": "train_log", "step": 366, "epoch": 1.0670920692798542, "progress_pct": 53.35, "epoch_pct": 53.35, "eta": "09:58:44", "max_grad_norm": 1.0, "loss": 0.47050854563713074, "grad_norm": 1.3436250686645508, "learning_rate": 1.0636035665928945e-05} +{"ts": "2025-12-22T22:59:10", "event": "train_log", "step": 367, "epoch": 1.0700091157702827, "progress_pct": 53.5, "epoch_pct": 53.5, "eta": "09:56:37", "max_grad_norm": 1.0, "loss": 0.43496906757354736, "grad_norm": 1.4952112436294556, "learning_rate": 1.058521351277227e-05} +{"ts": "2025-12-22T23:00:48", "event": "train_log", "step": 368, "epoch": 1.072926162260711, "progress_pct": 53.64, "epoch_pct": 53.65, "eta": "09:54:32", "max_grad_norm": 1.0, "loss": 0.45711052417755127, "grad_norm": 1.549112319946289, "learning_rate": 1.0534376187615924e-05} +{"ts": "2025-12-22T23:02:28", "event": "train_log", "step": 369, "epoch": 1.0758432087511394, "progress_pct": 53.79, "epoch_pct": 53.79, "eta": "09:52:29", "max_grad_norm": 1.0, "loss": 0.45045915246009827, "grad_norm": 1.3851526975631714, "learning_rate": 1.048352500844704e-05} +{"ts": "2025-12-22T23:04:02", "event": "train_log", "step": 370, "epoch": 1.0787602552415678, "progress_pct": 53.94, "epoch_pct": 53.94, "eta": "09:50:22", "max_grad_norm": 1.0, "loss": 0.3736046254634857, "grad_norm": 1.6302049160003662, "learning_rate": 1.0432661293611927e-05} +{"ts": "2025-12-22T23:05:40", "event": "train_log", "step": 371, "epoch": 1.0816773017319963, "progress_pct": 54.08, "epoch_pct": 54.08, "eta": "09:48:18", "max_grad_norm": 1.0, "loss": 0.42242100834846497, "grad_norm": 1.3365869522094727, "learning_rate": 1.0381786361781885e-05} +{"ts": "2025-12-22T23:07:20", "event": "train_log", "step": 372, "epoch": 1.0845943482224247, "progress_pct": 54.23, "epoch_pct": 54.23, "eta": "09:46:16", "max_grad_norm": 1.0, "loss": 0.44570961594581604, "grad_norm": 1.4369138479232788, "learning_rate": 1.0330901531919026e-05} +{"ts": "2025-12-22T23:08:56", "event": "train_log", "step": 373, "epoch": 1.0875113947128532, "progress_pct": 54.37, "epoch_pct": 54.38, "eta": "09:44:10", "max_grad_norm": 1.0, "loss": 0.43440738320350647, "grad_norm": 1.3528283834457397, "learning_rate": 1.0280008123242069e-05} +{"ts": "2025-12-22T23:10:30", "event": "train_log", "step": 374, "epoch": 1.0904284412032816, "progress_pct": 54.52, "epoch_pct": 54.52, "eta": "09:42:04", "max_grad_norm": 1.0, "loss": 0.3960394263267517, "grad_norm": 1.469660997390747, "learning_rate": 1.0229107455192147e-05} +{"ts": "2025-12-22T23:12:08", "event": "train_log", "step": 375, "epoch": 1.09334548769371, "progress_pct": 54.66, "epoch_pct": 54.67, "eta": "09:40:00", "max_grad_norm": 1.0, "loss": 0.47834208607673645, "grad_norm": 1.4542185068130493, "learning_rate": 1.0178200847398595e-05} +{"ts": "2025-12-22T23:13:43", "event": "train_log", "step": 376, "epoch": 1.0962625341841385, "progress_pct": 54.81, "epoch_pct": 54.81, "eta": "09:37:54", "max_grad_norm": 1.0, "loss": 0.42791086435317993, "grad_norm": 1.6470292806625366, "learning_rate": 1.0127289619644737e-05} +{"ts": "2025-12-22T23:15:20", "event": "train_log", "step": 377, "epoch": 1.099179580674567, "progress_pct": 54.96, "epoch_pct": 54.96, "eta": "09:35:50", "max_grad_norm": 1.0, "loss": 0.4401305019855499, "grad_norm": 1.1934021711349487, "learning_rate": 1.0076375091833681e-05} +{"ts": "2025-12-22T23:16:55", "event": "train_log", "step": 378, "epoch": 1.1020966271649955, "progress_pct": 55.1, "epoch_pct": 55.1, "eta": "09:33:45", "max_grad_norm": 1.0, "loss": 0.4816555678844452, "grad_norm": 0.9786668419837952, "learning_rate": 1.0025458583954078e-05} +{"ts": "2025-12-22T23:18:32", "event": "train_log", "step": 379, "epoch": 1.105013673655424, "progress_pct": 55.25, "epoch_pct": 55.25, "eta": "09:31:41", "max_grad_norm": 1.0, "loss": 0.41516968607902527, "grad_norm": 1.1348779201507568, "learning_rate": 9.974541416045924e-06} +{"ts": "2025-12-22T23:20:08", "event": "train_log", "step": 380, "epoch": 1.1079307201458524, "progress_pct": 55.39, "epoch_pct": 55.4, "eta": "09:29:36", "max_grad_norm": 1.0, "loss": 0.48087278008461, "grad_norm": 1.0188615322113037, "learning_rate": 9.923624908166322e-06} +{"ts": "2025-12-22T23:21:45", "event": "train_log", "step": 381, "epoch": 1.1108477666362808, "progress_pct": 55.54, "epoch_pct": 55.54, "eta": "09:27:33", "max_grad_norm": 1.0, "loss": 0.41974008083343506, "grad_norm": 1.0821740627288818, "learning_rate": 9.872710380355263e-06} +{"ts": "2025-12-22T23:23:23", "event": "train_log", "step": 382, "epoch": 1.1137648131267093, "progress_pct": 55.69, "epoch_pct": 55.69, "eta": "09:25:30", "max_grad_norm": 1.0, "loss": 0.42703643441200256, "grad_norm": 1.250951886177063, "learning_rate": 9.82179915260141e-06} +{"ts": "2025-12-22T23:24:58", "event": "train_log", "step": 383, "epoch": 1.1166818596171377, "progress_pct": 55.83, "epoch_pct": 55.83, "eta": "09:23:25", "max_grad_norm": 1.0, "loss": 0.43801453709602356, "grad_norm": 1.4528254270553589, "learning_rate": 9.770892544807856e-06} +{"ts": "2025-12-22T23:26:35", "event": "train_log", "step": 384, "epoch": 1.1195989061075662, "progress_pct": 55.98, "epoch_pct": 55.98, "eta": "09:21:22", "max_grad_norm": 1.0, "loss": 0.4344240725040436, "grad_norm": 1.813859462738037, "learning_rate": 9.719991876757934e-06} +{"ts": "2025-12-22T23:28:08", "event": "train_log", "step": 385, "epoch": 1.1225159525979946, "progress_pct": 56.12, "epoch_pct": 56.13, "eta": "09:19:17", "max_grad_norm": 1.0, "loss": 0.4356998801231384, "grad_norm": 1.6681253910064697, "learning_rate": 9.669098468080976e-06} +{"ts": "2025-12-22T23:29:45", "event": "train_log", "step": 386, "epoch": 1.125432999088423, "progress_pct": 56.27, "epoch_pct": 56.27, "eta": "09:17:14", "max_grad_norm": 1.0, "loss": 0.43189188838005066, "grad_norm": 1.3447953462600708, "learning_rate": 9.618213638218117e-06} +{"ts": "2025-12-22T23:31:27", "event": "train_log", "step": 387, "epoch": 1.1283500455788513, "progress_pct": 56.41, "epoch_pct": 56.42, "eta": "09:15:15", "max_grad_norm": 1.0, "loss": 0.34984707832336426, "grad_norm": 1.9577926397323608, "learning_rate": 9.567338706388074e-06} +{"ts": "2025-12-22T23:33:07", "event": "train_log", "step": 388, "epoch": 1.1312670920692798, "progress_pct": 56.56, "epoch_pct": 56.56, "eta": "09:13:14", "max_grad_norm": 1.0, "loss": 0.4243963062763214, "grad_norm": 1.5225576162338257, "learning_rate": 9.516474991552965e-06} +{"ts": "2025-12-22T23:34:45", "event": "train_log", "step": 389, "epoch": 1.1341841385597082, "progress_pct": 56.71, "epoch_pct": 56.71, "eta": "09:11:14", "max_grad_norm": 1.0, "loss": 0.3414606750011444, "grad_norm": 1.7416809797286987, "learning_rate": 9.46562381238408e-06} +{"ts": "2025-12-22T23:36:24", "event": "train_log", "step": 390, "epoch": 1.1371011850501367, "progress_pct": 56.85, "epoch_pct": 56.86, "eta": "09:09:13", "max_grad_norm": 1.0, "loss": 0.387447327375412, "grad_norm": 1.8358951807022095, "learning_rate": 9.414786487227732e-06} +{"ts": "2025-12-22T23:38:05", "event": "train_log", "step": 391, "epoch": 1.1400182315405651, "progress_pct": 57.0, "epoch_pct": 57.0, "eta": "09:07:13", "max_grad_norm": 1.0, "loss": 0.4599088728427887, "grad_norm": 1.9706153869628906, "learning_rate": 9.363964334071057e-06} +{"ts": "2025-12-22T23:39:41", "event": "train_log", "step": 392, "epoch": 1.1429352780309936, "progress_pct": 57.14, "epoch_pct": 57.15, "eta": "09:05:10", "max_grad_norm": 1.0, "loss": 0.4633581042289734, "grad_norm": 1.0604286193847656, "learning_rate": 9.313158670507843e-06} +{"ts": "2025-12-22T23:41:15", "event": "train_log", "step": 393, "epoch": 1.145852324521422, "progress_pct": 57.29, "epoch_pct": 57.29, "eta": "09:03:07", "max_grad_norm": 1.0, "loss": 0.3872259557247162, "grad_norm": 1.4851202964782715, "learning_rate": 9.262370813704379e-06} +{"ts": "2025-12-22T23:42:49", "event": "train_log", "step": 394, "epoch": 1.1487693710118505, "progress_pct": 57.43, "epoch_pct": 57.44, "eta": "09:01:02", "max_grad_norm": 1.0, "loss": 0.5215944647789001, "grad_norm": 1.7839159965515137, "learning_rate": 9.21160208036531e-06} +{"ts": "2025-12-22T23:44:24", "event": "train_log", "step": 395, "epoch": 1.151686417502279, "progress_pct": 57.58, "epoch_pct": 57.58, "eta": "08:59:00", "max_grad_norm": 1.0, "loss": 0.4030425548553467, "grad_norm": 1.3054656982421875, "learning_rate": 9.160853786699475e-06} +{"ts": "2025-12-22T23:45:59", "event": "train_log", "step": 396, "epoch": 1.1546034639927074, "progress_pct": 57.73, "epoch_pct": 57.73, "eta": "08:56:57", "max_grad_norm": 1.0, "loss": 0.4032524824142456, "grad_norm": 3.8467981815338135, "learning_rate": 9.110127248385827e-06} +{"ts": "2025-12-22T23:47:36", "event": "train_log", "step": 397, "epoch": 1.1575205104831359, "progress_pct": 57.87, "epoch_pct": 57.88, "eta": "08:54:55", "max_grad_norm": 1.0, "loss": 0.46577155590057373, "grad_norm": 1.8513801097869873, "learning_rate": 9.05942378053928e-06} +{"ts": "2025-12-22T23:49:11", "event": "train_log", "step": 398, "epoch": 1.1604375569735643, "progress_pct": 58.02, "epoch_pct": 58.02, "eta": "08:52:53", "max_grad_norm": 1.0, "loss": 0.39114487171173096, "grad_norm": 1.312689185142517, "learning_rate": 9.008744697676642e-06} +{"ts": "2025-12-22T23:50:47", "event": "train_log", "step": 399, "epoch": 1.1633546034639928, "progress_pct": 58.16, "epoch_pct": 58.17, "eta": "08:50:51", "max_grad_norm": 1.0, "loss": 0.481199711561203, "grad_norm": 1.1996328830718994, "learning_rate": 8.958091313682521e-06} +{"ts": "2025-12-22T23:52:22", "event": "train_log", "step": 400, "epoch": 1.1662716499544212, "progress_pct": 58.31, "epoch_pct": 58.31, "eta": "08:48:48", "max_grad_norm": 1.0, "loss": 0.3803558945655823, "grad_norm": 5.172409534454346, "learning_rate": 8.90746494177528e-06} +{"ts": "2025-12-23T00:12:28", "event": "train_log", "step": 400, "epoch": 1.1662716499544212, "progress_pct": 58.31, "epoch_pct": 58.31, "eta": "09:03:10", "max_grad_norm": 1.0, "eval_loss": 0.4318464398384094, "eval_runtime": 1206.0306, "eval_samples_per_second": 0.524, "eval_steps_per_second": 0.524} +{"ts": "2025-12-23T00:14:08", "event": "train_log", "step": 401, "epoch": 1.1691886964448497, "progress_pct": 58.45, "epoch_pct": 58.46, "eta": "09:01:06", "max_grad_norm": 1.0, "loss": 0.39636704325675964, "grad_norm": 1.0115015506744385, "learning_rate": 8.856866894472954e-06} +{"ts": "2025-12-23T00:15:45", "event": "train_log", "step": 402, "epoch": 1.172105742935278, "progress_pct": 58.6, "epoch_pct": 58.61, "eta": "08:59:01", "max_grad_norm": 1.0, "loss": 0.4076298475265503, "grad_norm": 1.1557435989379883, "learning_rate": 8.806298483559268e-06} +{"ts": "2025-12-23T00:17:21", "event": "train_log", "step": 403, "epoch": 1.1750227894257064, "progress_pct": 58.75, "epoch_pct": 58.75, "eta": "08:56:54", "max_grad_norm": 1.0, "loss": 0.44352248311042786, "grad_norm": 1.2802515029907227, "learning_rate": 8.755761020049597e-06} +{"ts": "2025-12-23T00:18:57", "event": "train_log", "step": 404, "epoch": 1.1779398359161348, "progress_pct": 58.89, "epoch_pct": 58.9, "eta": "08:54:48", "max_grad_norm": 1.0, "loss": 0.390497624874115, "grad_norm": 1.2755069732666016, "learning_rate": 8.705255814156988e-06} +{"ts": "2025-12-23T00:20:31", "event": "train_log", "step": 405, "epoch": 1.1808568824065633, "progress_pct": 59.04, "epoch_pct": 59.04, "eta": "08:52:40", "max_grad_norm": 1.0, "loss": 0.35810694098472595, "grad_norm": 1.2799782752990723, "learning_rate": 8.654784175258188e-06} +{"ts": "2025-12-23T00:22:08", "event": "train_log", "step": 406, "epoch": 1.1837739288969917, "progress_pct": 59.18, "epoch_pct": 59.19, "eta": "08:50:36", "max_grad_norm": 1.0, "loss": 0.3890265226364136, "grad_norm": 1.0968674421310425, "learning_rate": 8.604347411859713e-06} +{"ts": "2025-12-23T00:23:45", "event": "train_log", "step": 407, "epoch": 1.1866909753874202, "progress_pct": 59.33, "epoch_pct": 59.33, "eta": "08:48:30", "max_grad_norm": 1.0, "loss": 0.3916901648044586, "grad_norm": 1.3334455490112305, "learning_rate": 8.553946831563886e-06} +{"ts": "2025-12-23T00:25:22", "event": "train_log", "step": 408, "epoch": 1.1896080218778486, "progress_pct": 59.48, "epoch_pct": 59.48, "eta": "08:46:25", "max_grad_norm": 1.0, "loss": 0.5231326222419739, "grad_norm": 1.1888184547424316, "learning_rate": 8.503583741034988e-06} +{"ts": "2025-12-23T00:26:58", "event": "train_log", "step": 409, "epoch": 1.192525068368277, "progress_pct": 59.62, "epoch_pct": 59.63, "eta": "08:44:20", "max_grad_norm": 1.0, "loss": 0.4249858558177948, "grad_norm": 1.1163763999938965, "learning_rate": 8.45325944596534e-06} +{"ts": "2025-12-23T00:28:34", "event": "train_log", "step": 410, "epoch": 1.1954421148587056, "progress_pct": 59.77, "epoch_pct": 59.77, "eta": "08:42:14", "max_grad_norm": 1.0, "loss": 0.5201632380485535, "grad_norm": 1.3470333814620972, "learning_rate": 8.40297525104148e-06} +{"ts": "2025-12-23T00:30:11", "event": "train_log", "step": 411, "epoch": 1.198359161349134, "progress_pct": 59.91, "epoch_pct": 59.92, "eta": "08:40:09", "max_grad_norm": 1.0, "loss": 0.39376699924468994, "grad_norm": 1.5412285327911377, "learning_rate": 8.35273245991031e-06} +{"ts": "2025-12-23T00:31:46", "event": "train_log", "step": 412, "epoch": 1.2012762078395625, "progress_pct": 60.06, "epoch_pct": 60.06, "eta": "08:38:04", "max_grad_norm": 1.0, "loss": 0.39554283022880554, "grad_norm": 1.3408735990524292, "learning_rate": 8.302532375145339e-06} +{"ts": "2025-12-23T00:33:23", "event": "train_log", "step": 413, "epoch": 1.204193254329991, "progress_pct": 60.2, "epoch_pct": 60.21, "eta": "08:36:00", "max_grad_norm": 1.0, "loss": 0.42424261569976807, "grad_norm": 1.990668773651123, "learning_rate": 8.25237629821286e-06} +{"ts": "2025-12-23T00:35:01", "event": "train_log", "step": 414, "epoch": 1.2071103008204194, "progress_pct": 60.35, "epoch_pct": 60.36, "eta": "08:33:56", "max_grad_norm": 1.0, "loss": 0.3234582543373108, "grad_norm": 1.6471989154815674, "learning_rate": 8.202265529438259e-06} +{"ts": "2025-12-23T00:36:41", "event": "train_log", "step": 415, "epoch": 1.2100273473108478, "progress_pct": 60.5, "epoch_pct": 60.5, "eta": "08:31:54", "max_grad_norm": 1.0, "loss": 0.39163246750831604, "grad_norm": 1.1483631134033203, "learning_rate": 8.152201367972275e-06} +{"ts": "2025-12-23T00:38:21", "event": "train_log", "step": 416, "epoch": 1.2129443938012763, "progress_pct": 60.64, "epoch_pct": 60.65, "eta": "08:29:51", "max_grad_norm": 1.0, "loss": 0.5055042505264282, "grad_norm": 1.800149917602539, "learning_rate": 8.102185111757323e-06} +{"ts": "2025-12-23T00:39:56", "event": "train_log", "step": 417, "epoch": 1.2158614402917047, "progress_pct": 60.79, "epoch_pct": 60.79, "eta": "08:27:47", "max_grad_norm": 1.0, "loss": 0.4761751592159271, "grad_norm": 1.4394795894622803, "learning_rate": 8.052218057493849e-06} +{"ts": "2025-12-23T00:41:35", "event": "train_log", "step": 418, "epoch": 1.2187784867821332, "progress_pct": 60.93, "epoch_pct": 60.94, "eta": "08:25:45", "max_grad_norm": 1.0, "loss": 0.4490141272544861, "grad_norm": 1.622689962387085, "learning_rate": 8.002301500606715e-06} +{"ts": "2025-12-23T00:43:10", "event": "train_log", "step": 419, "epoch": 1.2216955332725616, "progress_pct": 61.08, "epoch_pct": 61.08, "eta": "08:23:40", "max_grad_norm": 1.0, "loss": 0.3964035212993622, "grad_norm": 1.2564961910247803, "learning_rate": 7.952436735211593e-06} +{"ts": "2025-12-23T00:44:46", "event": "train_log", "step": 420, "epoch": 1.22461257976299, "progress_pct": 61.22, "epoch_pct": 61.23, "eta": "08:21:35", "max_grad_norm": 1.0, "loss": 0.46039122343063354, "grad_norm": 1.3248411417007446, "learning_rate": 7.902625054081449e-06} +{"ts": "2025-12-23T00:46:22", "event": "train_log", "step": 421, "epoch": 1.2275296262534183, "progress_pct": 61.37, "epoch_pct": 61.38, "eta": "08:19:31", "max_grad_norm": 1.0, "loss": 0.49916595220565796, "grad_norm": 1.568983793258667, "learning_rate": 7.852867748613e-06} +{"ts": "2025-12-23T00:47:54", "event": "train_log", "step": 422, "epoch": 1.2304466727438468, "progress_pct": 61.52, "epoch_pct": 61.52, "eta": "08:17:25", "max_grad_norm": 1.0, "loss": 0.4035068154335022, "grad_norm": 1.4784491062164307, "learning_rate": 7.803166108793243e-06} +{"ts": "2025-12-23T00:49:25", "event": "train_log", "step": 423, "epoch": 1.2333637192342752, "progress_pct": 61.66, "epoch_pct": 61.67, "eta": "08:15:19", "max_grad_norm": 1.0, "loss": 0.4154140055179596, "grad_norm": 1.2940057516098022, "learning_rate": 7.753521423166007e-06} +{"ts": "2025-12-23T00:50:57", "event": "train_log", "step": 424, "epoch": 1.2362807657247037, "progress_pct": 61.81, "epoch_pct": 61.81, "eta": "08:13:13", "max_grad_norm": 1.0, "loss": 0.39541637897491455, "grad_norm": 1.167786717414856, "learning_rate": 7.703934978798565e-06} +{"ts": "2025-12-23T00:52:29", "event": "train_log", "step": 425, "epoch": 1.2391978122151321, "progress_pct": 61.95, "epoch_pct": 61.96, "eta": "08:11:07", "max_grad_norm": 1.0, "loss": 0.37744253873825073, "grad_norm": 1.5126771926879883, "learning_rate": 7.65440806124823e-06} +{"ts": "2025-12-23T00:54:05", "event": "train_log", "step": 426, "epoch": 1.2421148587055606, "progress_pct": 62.1, "epoch_pct": 62.11, "eta": "08:09:03", "max_grad_norm": 1.0, "loss": 0.46380615234375, "grad_norm": 1.2595263719558716, "learning_rate": 7.604941954529067e-06} +{"ts": "2025-12-23T00:55:38", "event": "train_log", "step": 427, "epoch": 1.245031905195989, "progress_pct": 62.24, "epoch_pct": 62.25, "eta": "08:06:59", "max_grad_norm": 1.0, "loss": 0.3391319513320923, "grad_norm": 1.4258298873901367, "learning_rate": 7.555537941078573e-06} +{"ts": "2025-12-23T00:57:10", "event": "train_log", "step": 428, "epoch": 1.2479489516864175, "progress_pct": 62.39, "epoch_pct": 62.4, "eta": "08:04:53", "max_grad_norm": 1.0, "loss": 0.39805102348327637, "grad_norm": 1.5371774435043335, "learning_rate": 7.506197301724446e-06} +{"ts": "2025-12-23T00:58:44", "event": "train_log", "step": 429, "epoch": 1.250865998176846, "progress_pct": 62.54, "epoch_pct": 62.54, "eta": "08:02:49", "max_grad_norm": 1.0, "loss": 0.37969034910202026, "grad_norm": 1.3789173364639282, "learning_rate": 7.456921315651371e-06} +{"ts": "2025-12-23T01:00:17", "event": "train_log", "step": 430, "epoch": 1.2537830446672744, "progress_pct": 62.68, "epoch_pct": 62.69, "eta": "08:00:45", "max_grad_norm": 1.0, "loss": 0.3841526508331299, "grad_norm": 1.32931649684906, "learning_rate": 7.407711260367867e-06} +{"ts": "2025-12-23T01:01:55", "event": "train_log", "step": 431, "epoch": 1.2567000911577029, "progress_pct": 62.83, "epoch_pct": 62.84, "eta": "07:58:43", "max_grad_norm": 1.0, "loss": 0.340289443731308, "grad_norm": 1.2836817502975464, "learning_rate": 7.358568411673145e-06} +{"ts": "2025-12-23T01:03:28", "event": "train_log", "step": 432, "epoch": 1.2596171376481313, "progress_pct": 62.97, "epoch_pct": 62.98, "eta": "07:56:39", "max_grad_norm": 1.0, "loss": 0.44747158885002136, "grad_norm": 1.0418318510055542, "learning_rate": 7.309494043624059e-06} +{"ts": "2025-12-23T01:05:06", "event": "train_log", "step": 433, "epoch": 1.2625341841385598, "progress_pct": 63.12, "epoch_pct": 63.13, "eta": "07:54:38", "max_grad_norm": 1.0, "loss": 0.45737382769584656, "grad_norm": 1.1769362688064575, "learning_rate": 7.260489428502058e-06} +{"ts": "2025-12-23T01:06:42", "event": "train_log", "step": 434, "epoch": 1.265451230628988, "progress_pct": 63.27, "epoch_pct": 63.27, "eta": "07:52:36", "max_grad_norm": 1.0, "loss": 0.3827931582927704, "grad_norm": 2.2730748653411865, "learning_rate": 7.211555836780203e-06} +{"ts": "2025-12-23T01:08:22", "event": "train_log", "step": 435, "epoch": 1.2683682771194165, "progress_pct": 63.41, "epoch_pct": 63.42, "eta": "07:50:36", "max_grad_norm": 1.0, "loss": 0.3589435815811157, "grad_norm": 1.263096809387207, "learning_rate": 7.162694537090235e-06} +{"ts": "2025-12-23T01:09:57", "event": "train_log", "step": 436, "epoch": 1.271285323609845, "progress_pct": 63.56, "epoch_pct": 63.56, "eta": "07:48:33", "max_grad_norm": 1.0, "loss": 0.45206642150878906, "grad_norm": 1.4073514938354492, "learning_rate": 7.113906796189692e-06} +{"ts": "2025-12-23T01:11:36", "event": "train_log", "step": 437, "epoch": 1.2742023701002734, "progress_pct": 63.7, "epoch_pct": 63.71, "eta": "07:46:34", "max_grad_norm": 1.0, "loss": 0.5409261584281921, "grad_norm": 1.064585566520691, "learning_rate": 7.0651938789290306e-06} +{"ts": "2025-12-23T01:13:13", "event": "train_log", "step": 438, "epoch": 1.2771194165907018, "progress_pct": 63.85, "epoch_pct": 63.86, "eta": "07:44:32", "max_grad_norm": 1.0, "loss": 0.40680158138275146, "grad_norm": 1.2346999645233154, "learning_rate": 7.016557048218889e-06} +{"ts": "2025-12-23T01:14:50", "event": "train_log", "step": 439, "epoch": 1.2800364630811303, "progress_pct": 63.99, "epoch_pct": 64.0, "eta": "07:42:31", "max_grad_norm": 1.0, "loss": 0.38718655705451965, "grad_norm": 1.5816547870635986, "learning_rate": 6.967997564997306e-06} +{"ts": "2025-12-23T01:16:25", "event": "train_log", "step": 440, "epoch": 1.2829535095715587, "progress_pct": 64.14, "epoch_pct": 64.15, "eta": "07:40:29", "max_grad_norm": 1.0, "loss": 0.4863276779651642, "grad_norm": 1.085268259048462, "learning_rate": 6.919516688197041e-06} +{"ts": "2025-12-23T01:18:04", "event": "train_log", "step": 441, "epoch": 1.2858705560619872, "progress_pct": 64.29, "epoch_pct": 64.29, "eta": "07:38:29", "max_grad_norm": 1.0, "loss": 0.39562875032424927, "grad_norm": 1.0984629392623901, "learning_rate": 6.871115674712937e-06} +{"ts": "2025-12-23T01:19:40", "event": "train_log", "step": 442, "epoch": 1.2887876025524156, "progress_pct": 64.43, "epoch_pct": 64.44, "eta": "07:36:28", "max_grad_norm": 1.0, "loss": 0.44437694549560547, "grad_norm": 1.3004229068756104, "learning_rate": 6.822795779369339e-06} +{"ts": "2025-12-23T01:21:17", "event": "train_log", "step": 443, "epoch": 1.291704649042844, "progress_pct": 64.58, "epoch_pct": 64.59, "eta": "07:34:27", "max_grad_norm": 1.0, "loss": 0.4728967249393463, "grad_norm": 1.3541183471679688, "learning_rate": 6.774558254887553e-06} +{"ts": "2025-12-23T01:22:51", "event": "train_log", "step": 444, "epoch": 1.2946216955332726, "progress_pct": 64.72, "epoch_pct": 64.73, "eta": "07:32:25", "max_grad_norm": 1.0, "loss": 0.4052809476852417, "grad_norm": 1.2485377788543701, "learning_rate": 6.7264043518533695e-06} +{"ts": "2025-12-23T01:24:31", "event": "train_log", "step": 445, "epoch": 1.297538742023701, "progress_pct": 64.87, "epoch_pct": 64.88, "eta": "07:30:26", "max_grad_norm": 1.0, "loss": 0.40149861574172974, "grad_norm": 1.412827730178833, "learning_rate": 6.67833531868465e-06} +{"ts": "2025-12-23T01:26:05", "event": "train_log", "step": 446, "epoch": 1.3004557885141295, "progress_pct": 65.01, "epoch_pct": 65.02, "eta": "07:28:24", "max_grad_norm": 1.0, "loss": 0.44107240438461304, "grad_norm": 1.5576224327087402, "learning_rate": 6.630352401598953e-06} +{"ts": "2025-12-23T01:27:42", "event": "train_log", "step": 447, "epoch": 1.303372835004558, "progress_pct": 65.16, "epoch_pct": 65.17, "eta": "07:26:25", "max_grad_norm": 1.0, "loss": 0.4898405969142914, "grad_norm": 1.1551047563552856, "learning_rate": 6.582456844581226e-06} +{"ts": "2025-12-23T01:29:17", "event": "train_log", "step": 448, "epoch": 1.3062898814949864, "progress_pct": 65.31, "epoch_pct": 65.31, "eta": "07:24:23", "max_grad_norm": 1.0, "loss": 0.4791329801082611, "grad_norm": 1.9939689636230469, "learning_rate": 6.5346498893515645e-06} +{"ts": "2025-12-23T01:30:54", "event": "train_log", "step": 449, "epoch": 1.3092069279854148, "progress_pct": 65.45, "epoch_pct": 65.46, "eta": "07:22:23", "max_grad_norm": 1.0, "loss": 0.472908616065979, "grad_norm": 1.4782553911209106, "learning_rate": 6.486932775333002e-06} +{"ts": "2025-12-23T01:32:30", "event": "train_log", "step": 450, "epoch": 1.3121239744758433, "progress_pct": 65.6, "epoch_pct": 65.61, "eta": "07:20:23", "max_grad_norm": 1.0, "loss": 0.514995276927948, "grad_norm": 1.2496148347854614, "learning_rate": 6.439306739619387e-06} +{"ts": "2025-12-23T01:52:27", "event": "train_log", "step": 450, "epoch": 1.3121239744758433, "progress_pct": 65.6, "epoch_pct": 65.61, "eta": "07:30:51", "max_grad_norm": 1.0, "eval_loss": 0.4178673028945923, "eval_runtime": 1197.5534, "eval_samples_per_second": 0.528, "eval_steps_per_second": 0.528} +{"ts": "2025-12-23T01:54:04", "event": "train_log", "step": 451, "epoch": 1.3150410209662717, "progress_pct": 65.74, "epoch_pct": 65.75, "eta": "07:28:47", "max_grad_norm": 1.0, "loss": 0.4087896943092346, "grad_norm": 1.3996772766113281, "learning_rate": 6.391773016943316e-06} +{"ts": "2025-12-23T01:55:40", "event": "train_log", "step": 452, "epoch": 1.3179580674567002, "progress_pct": 65.89, "epoch_pct": 65.9, "eta": "07:26:43", "max_grad_norm": 1.0, "loss": 0.43224579095840454, "grad_norm": 1.20390784740448, "learning_rate": 6.344332839644111e-06} +{"ts": "2025-12-23T01:57:15", "event": "train_log", "step": 453, "epoch": 1.3208751139471286, "progress_pct": 66.03, "epoch_pct": 66.04, "eta": "07:24:38", "max_grad_norm": 1.0, "loss": 0.44104093313217163, "grad_norm": 1.2709496021270752, "learning_rate": 6.296987437635876e-06} +{"ts": "2025-12-23T01:58:52", "event": "train_log", "step": 454, "epoch": 1.323792160437557, "progress_pct": 66.18, "epoch_pct": 66.19, "eta": "07:22:35", "max_grad_norm": 1.0, "loss": 0.47084498405456543, "grad_norm": 1.0112334489822388, "learning_rate": 6.249738038375618e-06} +{"ts": "2025-12-23T02:00:28", "event": "train_log", "step": 455, "epoch": 1.3267092069279856, "progress_pct": 66.33, "epoch_pct": 66.34, "eta": "07:20:31", "max_grad_norm": 1.0, "loss": 0.4700928032398224, "grad_norm": 1.0771515369415283, "learning_rate": 6.202585866831411e-06} +{"ts": "2025-12-23T02:02:05", "event": "train_log", "step": 456, "epoch": 1.3296262534184138, "progress_pct": 66.47, "epoch_pct": 66.48, "eta": "07:18:27", "max_grad_norm": 1.0, "loss": 0.345747709274292, "grad_norm": 1.4937143325805664, "learning_rate": 6.15553214545064e-06} +{"ts": "2025-12-23T02:03:38", "event": "train_log", "step": 457, "epoch": 1.3325432999088422, "progress_pct": 66.62, "epoch_pct": 66.63, "eta": "07:16:22", "max_grad_norm": 1.0, "loss": 0.33824583888053894, "grad_norm": 1.1348456144332886, "learning_rate": 6.108578094128321e-06} +{"ts": "2025-12-23T02:05:10", "event": "train_log", "step": 458, "epoch": 1.3354603463992707, "progress_pct": 66.76, "epoch_pct": 66.77, "eta": "07:14:17", "max_grad_norm": 1.0, "loss": 0.3528832197189331, "grad_norm": 1.2502707242965698, "learning_rate": 6.061724930175461e-06} +{"ts": "2025-12-23T02:06:44", "event": "train_log", "step": 459, "epoch": 1.3383773928896991, "progress_pct": 66.91, "epoch_pct": 66.92, "eta": "07:12:13", "max_grad_norm": 1.0, "loss": 0.4413869082927704, "grad_norm": 1.5359619855880737, "learning_rate": 6.014973868287504e-06} +{"ts": "2025-12-23T02:08:17", "event": "train_log", "step": 460, "epoch": 1.3412944393801276, "progress_pct": 67.06, "epoch_pct": 67.06, "eta": "07:10:08", "max_grad_norm": 1.0, "loss": 0.6849499940872192, "grad_norm": 0.9747081398963928, "learning_rate": 5.9683261205128395e-06} +{"ts": "2025-12-23T02:09:50", "event": "train_log", "step": 461, "epoch": 1.344211485870556, "progress_pct": 67.2, "epoch_pct": 67.21, "eta": "07:08:04", "max_grad_norm": 1.0, "loss": 0.3901931047439575, "grad_norm": 1.3150533437728882, "learning_rate": 5.921782896221383e-06} +{"ts": "2025-12-23T02:11:23", "event": "train_log", "step": 462, "epoch": 1.3471285323609845, "progress_pct": 67.35, "epoch_pct": 67.36, "eta": "07:05:59", "max_grad_norm": 1.0, "loss": 0.37498384714126587, "grad_norm": 1.137770652770996, "learning_rate": 5.875345402073207e-06} +{"ts": "2025-12-23T02:13:04", "event": "train_log", "step": 463, "epoch": 1.350045578851413, "progress_pct": 67.49, "epoch_pct": 67.5, "eta": "07:03:59", "max_grad_norm": 1.0, "loss": 0.3874579966068268, "grad_norm": 1.2216367721557617, "learning_rate": 5.829014841987277e-06} +{"ts": "2025-12-23T02:14:44", "event": "train_log", "step": 464, "epoch": 1.3529626253418414, "progress_pct": 67.64, "epoch_pct": 67.65, "eta": "07:01:58", "max_grad_norm": 1.0, "loss": 0.384797066450119, "grad_norm": 1.135439157485962, "learning_rate": 5.782792417110233e-06} +{"ts": "2025-12-23T02:16:26", "event": "train_log", "step": 465, "epoch": 1.3558796718322699, "progress_pct": 67.78, "epoch_pct": 67.79, "eta": "06:59:58", "max_grad_norm": 1.0, "loss": 0.46303266286849976, "grad_norm": 1.2400696277618408, "learning_rate": 5.736679325785239e-06} +{"ts": "2025-12-23T02:18:01", "event": "train_log", "step": 466, "epoch": 1.3587967183226983, "progress_pct": 67.93, "epoch_pct": 67.94, "eta": "06:57:55", "max_grad_norm": 1.0, "loss": 0.5068309903144836, "grad_norm": 1.8848882913589478, "learning_rate": 5.6906767635209304e-06} +{"ts": "2025-12-23T02:19:43", "event": "train_log", "step": 467, "epoch": 1.3617137648131268, "progress_pct": 68.08, "epoch_pct": 68.09, "eta": "06:55:56", "max_grad_norm": 1.0, "loss": 0.364332914352417, "grad_norm": 1.4707008600234985, "learning_rate": 5.644785922960412e-06} +{"ts": "2025-12-23T02:21:18", "event": "train_log", "step": 468, "epoch": 1.364630811303555, "progress_pct": 68.22, "epoch_pct": 68.23, "eta": "06:53:53", "max_grad_norm": 1.0, "loss": 0.485107421875, "grad_norm": 2.4436841011047363, "learning_rate": 5.599007993850329e-06} +{"ts": "2025-12-23T02:22:57", "event": "train_log", "step": 469, "epoch": 1.3675478577939835, "progress_pct": 68.37, "epoch_pct": 68.38, "eta": "06:51:52", "max_grad_norm": 1.0, "loss": 0.34547489881515503, "grad_norm": 1.1924740076065063, "learning_rate": 5.553344163010039e-06} +{"ts": "2025-12-23T02:24:34", "event": "train_log", "step": 470, "epoch": 1.370464904284412, "progress_pct": 68.51, "epoch_pct": 68.52, "eta": "06:49:51", "max_grad_norm": 1.0, "loss": 0.39645254611968994, "grad_norm": 1.1255877017974854, "learning_rate": 5.507795614300846e-06} +{"ts": "2025-12-23T02:26:10", "event": "train_log", "step": 471, "epoch": 1.3733819507748404, "progress_pct": 68.66, "epoch_pct": 68.67, "eta": "06:47:48", "max_grad_norm": 1.0, "loss": 0.4267856478691101, "grad_norm": 1.0937018394470215, "learning_rate": 5.4623635285952815e-06} +{"ts": "2025-12-23T02:27:49", "event": "train_log", "step": 472, "epoch": 1.3762989972652688, "progress_pct": 68.8, "epoch_pct": 68.81, "eta": "06:45:48", "max_grad_norm": 1.0, "loss": 0.3669992983341217, "grad_norm": 1.3355520963668823, "learning_rate": 5.417049083746513e-06} +{"ts": "2025-12-23T02:29:23", "event": "train_log", "step": 473, "epoch": 1.3792160437556973, "progress_pct": 68.95, "epoch_pct": 68.96, "eta": "06:43:45", "max_grad_norm": 1.0, "loss": 0.3873697519302368, "grad_norm": 1.7302504777908325, "learning_rate": 5.3718534545578035e-06} +{"ts": "2025-12-23T02:31:00", "event": "train_log", "step": 474, "epoch": 1.3821330902461257, "progress_pct": 69.1, "epoch_pct": 69.11, "eta": "06:41:44", "max_grad_norm": 1.0, "loss": 0.4581540524959564, "grad_norm": 1.17263662815094, "learning_rate": 5.326777812752041e-06} +{"ts": "2025-12-23T02:32:40", "event": "train_log", "step": 475, "epoch": 1.3850501367365542, "progress_pct": 69.24, "epoch_pct": 69.25, "eta": "06:39:44", "max_grad_norm": 1.0, "loss": 0.43062761425971985, "grad_norm": 1.0998128652572632, "learning_rate": 5.281823326941377e-06} +{"ts": "2025-12-23T02:34:17", "event": "train_log", "step": 476, "epoch": 1.3879671832269826, "progress_pct": 69.39, "epoch_pct": 69.4, "eta": "06:37:43", "max_grad_norm": 1.0, "loss": 0.381741464138031, "grad_norm": 1.1194556951522827, "learning_rate": 5.236991162596932e-06} +{"ts": "2025-12-23T02:35:52", "event": "train_log", "step": 477, "epoch": 1.390884229717411, "progress_pct": 69.53, "epoch_pct": 69.54, "eta": "06:35:41", "max_grad_norm": 1.0, "loss": 0.49175748229026794, "grad_norm": 1.2759051322937012, "learning_rate": 5.19228248201856e-06} +{"ts": "2025-12-23T02:37:27", "event": "train_log", "step": 478, "epoch": 1.3938012762078396, "progress_pct": 69.68, "epoch_pct": 69.69, "eta": "06:33:39", "max_grad_norm": 1.0, "loss": 0.4997562766075134, "grad_norm": 1.2134747505187988, "learning_rate": 5.147698444304732e-06} +{"ts": "2025-12-23T02:39:02", "event": "train_log", "step": 479, "epoch": 1.396718322698268, "progress_pct": 69.83, "epoch_pct": 69.84, "eta": "06:31:38", "max_grad_norm": 1.0, "loss": 0.42580488324165344, "grad_norm": 1.0833078622817993, "learning_rate": 5.1032402053224804e-06} +{"ts": "2025-12-23T02:40:39", "event": "train_log", "step": 480, "epoch": 1.3996353691886965, "progress_pct": 69.97, "epoch_pct": 69.98, "eta": "06:29:38", "max_grad_norm": 1.0, "loss": 0.5015593767166138, "grad_norm": 1.4838510751724243, "learning_rate": 5.058908917677426e-06} +{"ts": "2025-12-23T02:42:15", "event": "train_log", "step": 481, "epoch": 1.402552415679125, "progress_pct": 70.12, "epoch_pct": 70.13, "eta": "06:27:36", "max_grad_norm": 1.0, "loss": 0.34739193320274353, "grad_norm": 1.218610167503357, "learning_rate": 5.014705730683904e-06} +{"ts": "2025-12-23T02:43:52", "event": "train_log", "step": 482, "epoch": 1.4054694621695534, "progress_pct": 70.26, "epoch_pct": 70.27, "eta": "06:25:36", "max_grad_norm": 1.0, "loss": 0.41708022356033325, "grad_norm": 1.1883307695388794, "learning_rate": 4.970631790335181e-06} +{"ts": "2025-12-23T02:45:30", "event": "train_log", "step": 483, "epoch": 1.4083865086599818, "progress_pct": 70.41, "epoch_pct": 70.42, "eta": "06:23:36", "max_grad_norm": 1.0, "loss": 0.43546172976493835, "grad_norm": 1.209291696548462, "learning_rate": 4.926688239273713e-06} +{"ts": "2025-12-23T02:47:07", "event": "train_log", "step": 484, "epoch": 1.4113035551504103, "progress_pct": 70.55, "epoch_pct": 70.57, "eta": "06:21:36", "max_grad_norm": 1.0, "loss": 0.44491735100746155, "grad_norm": 1.0801606178283691, "learning_rate": 4.882876216761543e-06} +{"ts": "2025-12-23T02:48:45", "event": "train_log", "step": 485, "epoch": 1.4142206016408387, "progress_pct": 70.7, "epoch_pct": 70.71, "eta": "06:19:36", "max_grad_norm": 1.0, "loss": 0.436122864484787, "grad_norm": 1.2746628522872925, "learning_rate": 4.839196858650763e-06} +{"ts": "2025-12-23T02:50:26", "event": "train_log", "step": 486, "epoch": 1.4171376481312672, "progress_pct": 70.85, "epoch_pct": 70.86, "eta": "06:17:38", "max_grad_norm": 1.0, "loss": 0.3750447630882263, "grad_norm": 1.4465962648391724, "learning_rate": 4.795651297354056e-06} +{"ts": "2025-12-23T02:52:03", "event": "train_log", "step": 487, "epoch": 1.4200546946216956, "progress_pct": 70.99, "epoch_pct": 71.0, "eta": "06:15:38", "max_grad_norm": 1.0, "loss": 0.38286519050598145, "grad_norm": 1.6736211776733398, "learning_rate": 4.752240661815346e-06} +{"ts": "2025-12-23T02:53:43", "event": "train_log", "step": 488, "epoch": 1.422971741112124, "progress_pct": 71.14, "epoch_pct": 71.15, "eta": "06:13:39", "max_grad_norm": 1.0, "loss": 0.4488063156604767, "grad_norm": 1.1946996450424194, "learning_rate": 4.708966077480544e-06} +{"ts": "2025-12-23T02:55:19", "event": "train_log", "step": 489, "epoch": 1.4258887876025526, "progress_pct": 71.28, "epoch_pct": 71.29, "eta": "06:11:39", "max_grad_norm": 1.0, "loss": 0.44088613986968994, "grad_norm": 1.42599356174469, "learning_rate": 4.665828666268335e-06} +{"ts": "2025-12-23T02:56:54", "event": "train_log", "step": 490, "epoch": 1.4288058340929808, "progress_pct": 71.43, "epoch_pct": 71.44, "eta": "06:09:38", "max_grad_norm": 1.0, "loss": 0.4030645489692688, "grad_norm": 1.2281016111373901, "learning_rate": 4.622829546541121e-06} +{"ts": "2025-12-23T02:58:30", "event": "train_log", "step": 491, "epoch": 1.4317228805834092, "progress_pct": 71.57, "epoch_pct": 71.59, "eta": "06:07:39", "max_grad_norm": 1.0, "loss": 0.44702020287513733, "grad_norm": 1.2875670194625854, "learning_rate": 4.57996983307602e-06} +{"ts": "2025-12-23T03:00:06", "event": "train_log", "step": 492, "epoch": 1.4346399270738377, "progress_pct": 71.72, "epoch_pct": 71.73, "eta": "06:05:39", "max_grad_norm": 1.0, "loss": 0.4067370593547821, "grad_norm": 1.2456860542297363, "learning_rate": 4.537250637035947e-06} +{"ts": "2025-12-23T03:01:44", "event": "train_log", "step": 493, "epoch": 1.4375569735642661, "progress_pct": 71.87, "epoch_pct": 71.88, "eta": "06:03:40", "max_grad_norm": 1.0, "loss": 0.4237740635871887, "grad_norm": 1.2822725772857666, "learning_rate": 4.494673065940833e-06} +{"ts": "2025-12-23T03:03:21", "event": "train_log", "step": 494, "epoch": 1.4404740200546946, "progress_pct": 72.01, "epoch_pct": 72.02, "eta": "06:01:41", "max_grad_norm": 1.0, "loss": 0.40579724311828613, "grad_norm": 1.5517818927764893, "learning_rate": 4.452238223638906e-06} +{"ts": "2025-12-23T03:04:58", "event": "train_log", "step": 495, "epoch": 1.443391066545123, "progress_pct": 72.16, "epoch_pct": 72.17, "eta": "05:59:41", "max_grad_norm": 1.0, "loss": 0.38880717754364014, "grad_norm": 1.275344967842102, "learning_rate": 4.409947210278056e-06} +{"ts": "2025-12-23T03:06:32", "event": "train_log", "step": 496, "epoch": 1.4463081130355515, "progress_pct": 72.3, "epoch_pct": 72.32, "eta": "05:57:41", "max_grad_norm": 1.0, "loss": 0.4042310416698456, "grad_norm": 1.22952139377594, "learning_rate": 4.367801122277327e-06} +{"ts": "2025-12-23T03:08:11", "event": "train_log", "step": 497, "epoch": 1.44922515952598, "progress_pct": 72.45, "epoch_pct": 72.46, "eta": "05:55:43", "max_grad_norm": 1.0, "loss": 0.5408368110656738, "grad_norm": 1.122261643409729, "learning_rate": 4.325801052298493e-06} +{"ts": "2025-12-23T03:09:47", "event": "train_log", "step": 498, "epoch": 1.4521422060164084, "progress_pct": 72.59, "epoch_pct": 72.61, "eta": "05:53:43", "max_grad_norm": 1.0, "loss": 0.37697717547416687, "grad_norm": 1.5885361433029175, "learning_rate": 4.283948089217715e-06} +{"ts": "2025-12-23T03:11:22", "event": "train_log", "step": 499, "epoch": 1.4550592525068369, "progress_pct": 72.74, "epoch_pct": 72.75, "eta": "05:51:44", "max_grad_norm": 1.0, "loss": 0.3811529576778412, "grad_norm": 2.3565149307250977, "learning_rate": 4.242243318097338e-06} +{"ts": "2025-12-23T03:13:00", "event": "train_log", "step": 500, "epoch": 1.4579762989972653, "progress_pct": 72.89, "epoch_pct": 72.9, "eta": "05:49:45", "max_grad_norm": 1.0, "loss": 0.414781391620636, "grad_norm": 1.1944137811660767, "learning_rate": 4.200687820157735e-06} +{"ts": "2025-12-23T03:32:49", "event": "train_log", "step": 500, "epoch": 1.4579762989972653, "progress_pct": 72.89, "epoch_pct": 72.9, "eta": "05:57:08", "max_grad_norm": 1.0, "eval_loss": 0.40706494450569153, "eval_runtime": 1189.1593, "eval_samples_per_second": 0.531, "eval_steps_per_second": 0.531} +{"ts": "2025-12-23T03:34:22", "event": "train_log", "step": 501, "epoch": 1.4608933454876938, "progress_pct": 73.03, "epoch_pct": 73.04, "eta": "05:55:04", "max_grad_norm": 1.0, "loss": 0.38155990839004517, "grad_norm": 1.0442464351654053, "learning_rate": 4.159282672749289e-06} +{"ts": "2025-12-23T03:35:57", "event": "train_log", "step": 502, "epoch": 1.463810391978122, "progress_pct": 73.18, "epoch_pct": 73.19, "eta": "05:53:02", "max_grad_norm": 1.0, "loss": 0.4830601215362549, "grad_norm": 1.7274727821350098, "learning_rate": 4.118028949324453e-06} +{"ts": "2025-12-23T03:37:32", "event": "train_log", "step": 503, "epoch": 1.4667274384685505, "progress_pct": 73.32, "epoch_pct": 73.34, "eta": "05:50:59", "max_grad_norm": 1.0, "loss": 0.3975123167037964, "grad_norm": 2.064513921737671, "learning_rate": 4.0769277194099345e-06} +{"ts": "2025-12-23T03:39:08", "event": "train_log", "step": 504, "epoch": 1.469644484958979, "progress_pct": 73.47, "epoch_pct": 73.48, "eta": "05:48:58", "max_grad_norm": 1.0, "loss": 0.37033841013908386, "grad_norm": 1.7695534229278564, "learning_rate": 4.035980048578942e-06} +{"ts": "2025-12-23T03:40:45", "event": "train_log", "step": 505, "epoch": 1.4725615314494074, "progress_pct": 73.62, "epoch_pct": 73.63, "eta": "05:46:56", "max_grad_norm": 1.0, "loss": 0.39567673206329346, "grad_norm": 1.4455046653747559, "learning_rate": 3.995186998423597e-06} +{"ts": "2025-12-23T03:42:17", "event": "train_log", "step": 506, "epoch": 1.4754785779398358, "progress_pct": 73.76, "epoch_pct": 73.77, "eta": "05:44:53", "max_grad_norm": 1.0, "loss": 0.44786664843559265, "grad_norm": 1.1791958808898926, "learning_rate": 3.9545496265273765e-06} +{"ts": "2025-12-23T03:43:51", "event": "train_log", "step": 507, "epoch": 1.4783956244302643, "progress_pct": 73.91, "epoch_pct": 73.92, "eta": "05:42:50", "max_grad_norm": 1.0, "loss": 0.3333263099193573, "grad_norm": 2.0874717235565186, "learning_rate": 3.9140689864377105e-06} +{"ts": "2025-12-23T03:45:23", "event": "train_log", "step": 508, "epoch": 1.4813126709206927, "progress_pct": 74.05, "epoch_pct": 74.07, "eta": "05:40:47", "max_grad_norm": 1.0, "loss": 0.5105943083763123, "grad_norm": 1.5897501707077026, "learning_rate": 3.873746127638668e-06} +{"ts": "2025-12-23T03:46:55", "event": "train_log", "step": 509, "epoch": 1.4842297174111212, "progress_pct": 74.2, "epoch_pct": 74.21, "eta": "05:38:44", "max_grad_norm": 1.0, "loss": 0.43922683596611023, "grad_norm": 1.5059760808944702, "learning_rate": 3.833582095523749e-06} +{"ts": "2025-12-23T03:48:30", "event": "train_log", "step": 510, "epoch": 1.4871467639015497, "progress_pct": 74.34, "epoch_pct": 74.36, "eta": "05:36:43", "max_grad_norm": 1.0, "loss": 0.4584790766239166, "grad_norm": 1.379347562789917, "learning_rate": 3.7935779313687648e-06} +{"ts": "2025-12-23T03:50:10", "event": "train_log", "step": 511, "epoch": 1.490063810391978, "progress_pct": 74.49, "epoch_pct": 74.5, "eta": "05:34:43", "max_grad_norm": 1.0, "loss": 0.5217512249946594, "grad_norm": 1.0984690189361572, "learning_rate": 3.7537346723048816e-06} +{"ts": "2025-12-23T03:51:50", "event": "train_log", "step": 512, "epoch": 1.4929808568824066, "progress_pct": 74.64, "epoch_pct": 74.65, "eta": "05:32:43", "max_grad_norm": 1.0, "loss": 0.4180052876472473, "grad_norm": 1.5944225788116455, "learning_rate": 3.71405335129169e-06} +{"ts": "2025-12-23T03:53:28", "event": "train_log", "step": 513, "epoch": 1.495897903372835, "progress_pct": 74.78, "epoch_pct": 74.79, "eta": "05:30:43", "max_grad_norm": 1.0, "loss": 0.4584833085536957, "grad_norm": 1.2745033502578735, "learning_rate": 3.6745349970904465e-06} +{"ts": "2025-12-23T03:55:08", "event": "train_log", "step": 514, "epoch": 1.4988149498632635, "progress_pct": 74.93, "epoch_pct": 74.94, "eta": "05:28:43", "max_grad_norm": 1.0, "loss": 0.3202287554740906, "grad_norm": 1.2746814489364624, "learning_rate": 3.6351806342374007e-06} +{"ts": "2025-12-23T03:56:30", "event": "train_log", "step": 515, "epoch": 1.501731996353692, "progress_pct": 75.07, "epoch_pct": 75.09, "eta": "05:26:38", "max_grad_norm": 1.0, "loss": 0.37963351607322693, "grad_norm": 1.409638524055481, "learning_rate": 3.5959912830172348e-06} +{"ts": "2025-12-23T03:57:42", "event": "train_log", "step": 516, "epoch": 1.5046490428441204, "progress_pct": 75.22, "epoch_pct": 75.23, "eta": "05:24:29", "max_grad_norm": 1.0, "loss": 0.43133026361465454, "grad_norm": 1.1655553579330444, "learning_rate": 3.556967959436591e-06} +{"ts": "2025-12-23T03:58:53", "event": "train_log", "step": 517, "epoch": 1.5075660893345488, "progress_pct": 75.36, "epoch_pct": 75.38, "eta": "05:22:20", "max_grad_norm": 1.0, "loss": 0.3739299178123474, "grad_norm": 1.0495020151138306, "learning_rate": 3.518111675197776e-06} +{"ts": "2025-12-23T04:00:03", "event": "train_log", "step": 518, "epoch": 1.5104831358249773, "progress_pct": 75.51, "epoch_pct": 75.52, "eta": "05:20:11", "max_grad_norm": 1.0, "loss": 0.4099601209163666, "grad_norm": 1.3055057525634766, "learning_rate": 3.4794234376724835e-06} +{"ts": "2025-12-23T04:01:13", "event": "train_log", "step": 519, "epoch": 1.5134001823154057, "progress_pct": 75.66, "epoch_pct": 75.67, "eta": "05:18:03", "max_grad_norm": 1.0, "loss": 0.380616158246994, "grad_norm": 1.2252463102340698, "learning_rate": 3.4409042498757084e-06} +{"ts": "2025-12-23T04:02:23", "event": "train_log", "step": 520, "epoch": 1.5163172288058342, "progress_pct": 75.8, "epoch_pct": 75.82, "eta": "05:15:54", "max_grad_norm": 1.0, "loss": 0.3510003685951233, "grad_norm": 1.2728638648986816, "learning_rate": 3.4025551104397294e-06} +{"ts": "2025-12-23T04:03:33", "event": "train_log", "step": 521, "epoch": 1.5192342752962626, "progress_pct": 75.95, "epoch_pct": 75.96, "eta": "05:13:46", "max_grad_norm": 1.0, "loss": 0.4087940752506256, "grad_norm": 2.70664644241333, "learning_rate": 3.3643770135882282e-06} +{"ts": "2025-12-23T04:04:43", "event": "train_log", "step": 522, "epoch": 1.522151321786691, "progress_pct": 76.09, "epoch_pct": 76.11, "eta": "05:11:38", "max_grad_norm": 1.0, "loss": 0.45614126324653625, "grad_norm": 1.6197112798690796, "learning_rate": 3.3263709491104933e-06} +{"ts": "2025-12-23T04:05:53", "event": "train_log", "step": 523, "epoch": 1.5250683682771196, "progress_pct": 76.24, "epoch_pct": 76.25, "eta": "05:09:31", "max_grad_norm": 1.0, "loss": 0.3824586272239685, "grad_norm": 1.3596103191375732, "learning_rate": 3.2885379023357956e-06} +{"ts": "2025-12-23T04:07:04", "event": "train_log", "step": 524, "epoch": 1.527985414767548, "progress_pct": 76.38, "epoch_pct": 76.4, "eta": "05:07:23", "max_grad_norm": 1.0, "loss": 0.47717779874801636, "grad_norm": 1.1768635511398315, "learning_rate": 3.2508788541078097e-06} +{"ts": "2025-12-23T04:08:14", "event": "train_log", "step": 525, "epoch": 1.5309024612579762, "progress_pct": 76.53, "epoch_pct": 76.55, "eta": "05:05:16", "max_grad_norm": 1.0, "loss": 0.4013281762599945, "grad_norm": 1.669474482536316, "learning_rate": 3.2133947807591958e-06} +{"ts": "2025-12-23T04:09:24", "event": "train_log", "step": 526, "epoch": 1.5338195077484047, "progress_pct": 76.68, "epoch_pct": 76.69, "eta": "05:03:09", "max_grad_norm": 1.0, "loss": 0.367280513048172, "grad_norm": 1.600868582725525, "learning_rate": 3.1760866540862932e-06} +{"ts": "2025-12-23T04:10:34", "event": "train_log", "step": 527, "epoch": 1.5367365542388332, "progress_pct": 76.82, "epoch_pct": 76.84, "eta": "05:01:02", "max_grad_norm": 1.0, "loss": 0.4432409405708313, "grad_norm": 1.1689515113830566, "learning_rate": 3.138955441323923e-06} +{"ts": "2025-12-23T04:11:45", "event": "train_log", "step": 528, "epoch": 1.5396536007292616, "progress_pct": 76.97, "epoch_pct": 76.98, "eta": "04:58:56", "max_grad_norm": 1.0, "loss": 0.4219942092895508, "grad_norm": 2.361961603164673, "learning_rate": 3.1020021051202973e-06} +{"ts": "2025-12-23T04:12:57", "event": "train_log", "step": 529, "epoch": 1.54257064721969, "progress_pct": 77.11, "epoch_pct": 77.13, "eta": "04:56:50", "max_grad_norm": 1.0, "loss": 0.3672596514225006, "grad_norm": 1.1962230205535889, "learning_rate": 3.0652276035120964e-06} +{"ts": "2025-12-23T04:14:09", "event": "train_log", "step": 530, "epoch": 1.5454876937101185, "progress_pct": 77.26, "epoch_pct": 77.27, "eta": "04:54:44", "max_grad_norm": 1.0, "loss": 0.42919260263442993, "grad_norm": 1.4149441719055176, "learning_rate": 3.0286328898995963e-06} +{"ts": "2025-12-23T04:15:21", "event": "train_log", "step": 531, "epoch": 1.548404740200547, "progress_pct": 77.41, "epoch_pct": 77.42, "eta": "04:52:39", "max_grad_norm": 1.0, "loss": 0.4499061107635498, "grad_norm": 1.2668434381484985, "learning_rate": 2.992218913021966e-06} +{"ts": "2025-12-23T04:16:32", "event": "train_log", "step": 532, "epoch": 1.5513217866909754, "progress_pct": 77.55, "epoch_pct": 77.57, "eta": "04:50:33", "max_grad_norm": 1.0, "loss": 0.34660714864730835, "grad_norm": 1.268114686012268, "learning_rate": 2.9559866169326734e-06} +{"ts": "2025-12-23T04:17:43", "event": "train_log", "step": 533, "epoch": 1.5542388331814039, "progress_pct": 77.7, "epoch_pct": 77.71, "eta": "04:48:28", "max_grad_norm": 1.0, "loss": 0.38239023089408875, "grad_norm": 1.0086419582366943, "learning_rate": 2.919936940975007e-06} +{"ts": "2025-12-23T04:18:56", "event": "train_log", "step": 534, "epoch": 1.557155879671832, "progress_pct": 77.84, "epoch_pct": 77.86, "eta": "04:46:23", "max_grad_norm": 1.0, "loss": 0.48240017890930176, "grad_norm": 1.0700170993804932, "learning_rate": 2.884070819757712e-06} +{"ts": "2025-12-23T04:20:07", "event": "train_log", "step": 535, "epoch": 1.5600729261622606, "progress_pct": 77.99, "epoch_pct": 78.0, "eta": "04:44:18", "max_grad_norm": 1.0, "loss": 0.4098761975765228, "grad_norm": 1.2101227045059204, "learning_rate": 2.8483891831307873e-06} +{"ts": "2025-12-23T04:21:17", "event": "train_log", "step": 536, "epoch": 1.562989972652689, "progress_pct": 78.13, "epoch_pct": 78.15, "eta": "04:42:13", "max_grad_norm": 1.0, "loss": 0.45641395449638367, "grad_norm": 1.2731400728225708, "learning_rate": 2.8128929561613505e-06} +{"ts": "2025-12-23T04:22:27", "event": "train_log", "step": 537, "epoch": 1.5659070191431175, "progress_pct": 78.28, "epoch_pct": 78.3, "eta": "04:40:09", "max_grad_norm": 1.0, "loss": 0.42283985018730164, "grad_norm": 1.1474392414093018, "learning_rate": 2.777583059109671e-06} +{"ts": "2025-12-23T04:23:38", "event": "train_log", "step": 538, "epoch": 1.568824065633546, "progress_pct": 78.43, "epoch_pct": 78.44, "eta": "04:38:04", "max_grad_norm": 1.0, "loss": 0.3469158113002777, "grad_norm": 1.789881944656372, "learning_rate": 2.7424604074053028e-06} +{"ts": "2025-12-23T04:24:48", "event": "train_log", "step": 539, "epoch": 1.5717411121239744, "progress_pct": 78.57, "epoch_pct": 78.59, "eta": "04:36:00", "max_grad_norm": 1.0, "loss": 0.35837510228157043, "grad_norm": 1.3426933288574219, "learning_rate": 2.707525911623362e-06} +{"ts": "2025-12-23T04:25:58", "event": "train_log", "step": 540, "epoch": 1.5746581586144028, "progress_pct": 78.72, "epoch_pct": 78.73, "eta": "04:33:56", "max_grad_norm": 1.0, "loss": 0.4736083745956421, "grad_norm": 1.2343578338623047, "learning_rate": 2.672780477460901e-06} +{"ts": "2025-12-23T04:27:09", "event": "train_log", "step": 541, "epoch": 1.5775752051048313, "progress_pct": 78.86, "epoch_pct": 78.88, "eta": "04:31:52", "max_grad_norm": 1.0, "loss": 0.34345340728759766, "grad_norm": 1.516298770904541, "learning_rate": 2.638225005713457e-06} +{"ts": "2025-12-23T04:28:19", "event": "train_log", "step": 542, "epoch": 1.5804922515952597, "progress_pct": 79.01, "epoch_pct": 79.02, "eta": "04:29:48", "max_grad_norm": 1.0, "loss": 0.4134179949760437, "grad_norm": 1.1488829851150513, "learning_rate": 2.6038603922516705e-06} +{"ts": "2025-12-23T04:29:29", "event": "train_log", "step": 543, "epoch": 1.5834092980856882, "progress_pct": 79.15, "epoch_pct": 79.17, "eta": "04:27:45", "max_grad_norm": 1.0, "loss": 0.3297592103481293, "grad_norm": 1.4486491680145264, "learning_rate": 2.569687527998073e-06} +{"ts": "2025-12-23T04:30:39", "event": "train_log", "step": 544, "epoch": 1.5863263445761167, "progress_pct": 79.3, "epoch_pct": 79.32, "eta": "04:25:41", "max_grad_norm": 1.0, "loss": 0.3958476185798645, "grad_norm": 1.272691011428833, "learning_rate": 2.5357072989039855e-06} +{"ts": "2025-12-23T04:31:50", "event": "train_log", "step": 545, "epoch": 1.589243391066545, "progress_pct": 79.45, "epoch_pct": 79.46, "eta": "04:23:38", "max_grad_norm": 1.0, "loss": 0.4125611186027527, "grad_norm": 1.244240641593933, "learning_rate": 2.501920585926555e-06} +{"ts": "2025-12-23T04:32:59", "event": "train_log", "step": 546, "epoch": 1.5921604375569736, "progress_pct": 79.59, "epoch_pct": 79.61, "eta": "04:21:35", "max_grad_norm": 1.0, "loss": 0.3762253224849701, "grad_norm": 1.5844073295593262, "learning_rate": 2.4683282650058992e-06} +{"ts": "2025-12-23T04:34:10", "event": "train_log", "step": 547, "epoch": 1.595077484047402, "progress_pct": 79.74, "epoch_pct": 79.75, "eta": "04:19:33", "max_grad_norm": 1.0, "loss": 0.37053319811820984, "grad_norm": 1.8209946155548096, "learning_rate": 2.4349312070424258e-06} +{"ts": "2025-12-23T04:35:21", "event": "train_log", "step": 548, "epoch": 1.5979945305378305, "progress_pct": 79.88, "epoch_pct": 79.9, "eta": "04:17:30", "max_grad_norm": 1.0, "loss": 0.5004774332046509, "grad_norm": 1.3752915859222412, "learning_rate": 2.4017302778742247e-06} +{"ts": "2025-12-23T04:36:32", "event": "train_log", "step": 549, "epoch": 1.600911577028259, "progress_pct": 80.03, "epoch_pct": 80.05, "eta": "04:15:28", "max_grad_norm": 1.0, "loss": 0.39014023542404175, "grad_norm": 5.143753528594971, "learning_rate": 2.36872633825464e-06} +{"ts": "2025-12-23T04:37:44", "event": "train_log", "step": 550, "epoch": 1.6038286235186874, "progress_pct": 80.17, "epoch_pct": 80.19, "eta": "04:13:26", "max_grad_norm": 1.0, "loss": 0.378440260887146, "grad_norm": 1.0730944871902466, "learning_rate": 2.335920243829941e-06} +{"ts": "2025-12-23T04:52:38", "event": "train_log", "step": 550, "epoch": 1.6038286235186874, "progress_pct": 80.17, "epoch_pct": 80.19, "eta": "04:17:07", "max_grad_norm": 1.0, "eval_loss": 0.40037089586257935, "eval_runtime": 893.7411, "eval_samples_per_second": 0.707, "eval_steps_per_second": 0.707} +{"ts": "2025-12-23T04:53:49", "event": "train_log", "step": 551, "epoch": 1.6067456700091158, "progress_pct": 80.32, "epoch_pct": 80.34, "eta": "04:15:03", "max_grad_norm": 1.0, "loss": 0.4471960663795471, "grad_norm": 1.5507797002792358, "learning_rate": 2.3033128451171548e-06} +{"ts": "2025-12-23T04:54:59", "event": "train_log", "step": 552, "epoch": 1.6096627164995443, "progress_pct": 80.47, "epoch_pct": 80.48, "eta": "04:12:59", "max_grad_norm": 1.0, "loss": 0.3658301830291748, "grad_norm": 1.9462968111038208, "learning_rate": 2.2709049874819924e-06} +{"ts": "2025-12-23T04:56:10", "event": "train_log", "step": 553, "epoch": 1.6125797629899727, "progress_pct": 80.61, "epoch_pct": 80.63, "eta": "04:10:56", "max_grad_norm": 1.0, "loss": 0.3911179304122925, "grad_norm": 1.2034238576889038, "learning_rate": 2.238697511116962e-06} +{"ts": "2025-12-23T04:57:21", "event": "train_log", "step": 554, "epoch": 1.6154968094804012, "progress_pct": 80.76, "epoch_pct": 80.77, "eta": "04:08:53", "max_grad_norm": 1.0, "loss": 0.3998897671699524, "grad_norm": 1.3574327230453491, "learning_rate": 2.2066912510195636e-06} +{"ts": "2025-12-23T04:58:34", "event": "train_log", "step": 555, "epoch": 1.6184138559708297, "progress_pct": 80.9, "epoch_pct": 80.92, "eta": "04:06:50", "max_grad_norm": 1.0, "loss": 0.38577449321746826, "grad_norm": 1.1973012685775757, "learning_rate": 2.1748870369706507e-06} +{"ts": "2025-12-23T04:59:46", "event": "train_log", "step": 556, "epoch": 1.621330902461258, "progress_pct": 81.05, "epoch_pct": 81.07, "eta": "04:04:47", "max_grad_norm": 1.0, "loss": 0.411307156085968, "grad_norm": 1.9365874528884888, "learning_rate": 2.1432856935129144e-06} +{"ts": "2025-12-23T05:00:57", "event": "train_log", "step": 557, "epoch": 1.6242479489516866, "progress_pct": 81.2, "epoch_pct": 81.21, "eta": "04:02:45", "max_grad_norm": 1.0, "loss": 0.38424253463745117, "grad_norm": 1.3558642864227295, "learning_rate": 2.1118880399295106e-06} +{"ts": "2025-12-23T05:02:09", "event": "train_log", "step": 558, "epoch": 1.627164995442115, "progress_pct": 81.34, "epoch_pct": 81.36, "eta": "04:00:43", "max_grad_norm": 1.0, "loss": 0.39943546056747437, "grad_norm": 1.4368890523910522, "learning_rate": 2.0806948902228075e-06} +{"ts": "2025-12-23T05:03:19", "event": "train_log", "step": 559, "epoch": 1.6300820419325432, "progress_pct": 81.49, "epoch_pct": 81.5, "eta": "03:58:40", "max_grad_norm": 1.0, "loss": 0.36787641048431396, "grad_norm": 1.6266753673553467, "learning_rate": 2.0497070530933084e-06} +{"ts": "2025-12-23T05:04:29", "event": "train_log", "step": 560, "epoch": 1.6329990884229717, "progress_pct": 81.63, "epoch_pct": 81.65, "eta": "03:56:38", "max_grad_norm": 1.0, "loss": 0.3781934380531311, "grad_norm": 1.2600938081741333, "learning_rate": 2.0189253319186576e-06} +{"ts": "2025-12-23T05:05:40", "event": "train_log", "step": 561, "epoch": 1.6359161349134002, "progress_pct": 81.78, "epoch_pct": 81.8, "eta": "03:54:36", "max_grad_norm": 1.0, "loss": 0.4132305383682251, "grad_norm": 1.975071907043457, "learning_rate": 1.9883505247328237e-06} +{"ts": "2025-12-23T05:06:50", "event": "train_log", "step": 562, "epoch": 1.6388331814038286, "progress_pct": 81.92, "epoch_pct": 81.94, "eta": "03:52:34", "max_grad_norm": 1.0, "loss": 0.3727574646472931, "grad_norm": 1.4095909595489502, "learning_rate": 1.9579834242054154e-06} +{"ts": "2025-12-23T05:08:00", "event": "train_log", "step": 563, "epoch": 1.641750227894257, "progress_pct": 82.07, "epoch_pct": 82.09, "eta": "03:50:32", "max_grad_norm": 1.0, "loss": 0.33786773681640625, "grad_norm": 1.4271371364593506, "learning_rate": 1.9278248176211243e-06} +{"ts": "2025-12-23T05:09:10", "event": "train_log", "step": 564, "epoch": 1.6446672743846855, "progress_pct": 82.22, "epoch_pct": 82.23, "eta": "03:48:30", "max_grad_norm": 1.0, "loss": 0.33035099506378174, "grad_norm": 1.5907646417617798, "learning_rate": 1.8978754868593074e-06} +{"ts": "2025-12-23T05:10:20", "event": "train_log", "step": 565, "epoch": 1.647584320875114, "progress_pct": 82.36, "epoch_pct": 82.38, "eta": "03:46:29", "max_grad_norm": 1.0, "loss": 0.41707149147987366, "grad_norm": 1.1315702199935913, "learning_rate": 1.8681362083737387e-06} +{"ts": "2025-12-23T05:11:31", "event": "train_log", "step": 566, "epoch": 1.6505013673655424, "progress_pct": 82.51, "epoch_pct": 82.53, "eta": "03:44:28", "max_grad_norm": 1.0, "loss": 0.43079230189323425, "grad_norm": 1.4737143516540527, "learning_rate": 1.8386077531724556e-06} +{"ts": "2025-12-23T05:12:41", "event": "train_log", "step": 567, "epoch": 1.6534184138559709, "progress_pct": 82.65, "epoch_pct": 82.67, "eta": "03:42:27", "max_grad_norm": 1.0, "loss": 0.3524904251098633, "grad_norm": 1.1006760597229004, "learning_rate": 1.8092908867977822e-06} +{"ts": "2025-12-23T05:13:51", "event": "train_log", "step": 568, "epoch": 1.6563354603463991, "progress_pct": 82.8, "epoch_pct": 82.82, "eta": "03:40:26", "max_grad_norm": 1.0, "loss": 0.3695681691169739, "grad_norm": 1.4066118001937866, "learning_rate": 1.780186369306479e-06} +{"ts": "2025-12-23T05:15:01", "event": "train_log", "step": 569, "epoch": 1.6592525068368276, "progress_pct": 82.94, "epoch_pct": 82.96, "eta": "03:38:25", "max_grad_norm": 1.0, "loss": 0.35596007108688354, "grad_norm": 1.6444640159606934, "learning_rate": 1.7512949552500412e-06} +{"ts": "2025-12-23T05:16:12", "event": "train_log", "step": 570, "epoch": 1.662169553327256, "progress_pct": 83.09, "epoch_pct": 83.11, "eta": "03:36:25", "max_grad_norm": 1.0, "loss": 0.4520571827888489, "grad_norm": 1.159480094909668, "learning_rate": 1.7226173936551282e-06} +{"ts": "2025-12-23T05:17:23", "event": "train_log", "step": 571, "epoch": 1.6650865998176845, "progress_pct": 83.24, "epoch_pct": 83.25, "eta": "03:34:24", "max_grad_norm": 1.0, "loss": 0.4702282249927521, "grad_norm": 1.5874221324920654, "learning_rate": 1.6941544280041567e-06} +{"ts": "2025-12-23T05:18:34", "event": "train_log", "step": 572, "epoch": 1.668003646308113, "progress_pct": 83.38, "epoch_pct": 83.4, "eta": "03:32:24", "max_grad_norm": 1.0, "loss": 0.3803800046443939, "grad_norm": 1.6153535842895508, "learning_rate": 1.6659067962160157e-06} +{"ts": "2025-12-23T05:19:46", "event": "train_log", "step": 573, "epoch": 1.6709206927985414, "progress_pct": 83.53, "epoch_pct": 83.55, "eta": "03:30:25", "max_grad_norm": 1.0, "loss": 0.4368419051170349, "grad_norm": 1.0748940706253052, "learning_rate": 1.6378752306269386e-06} +{"ts": "2025-12-23T05:20:57", "event": "train_log", "step": 574, "epoch": 1.6738377392889698, "progress_pct": 83.67, "epoch_pct": 83.69, "eta": "03:28:25", "max_grad_norm": 1.0, "loss": 0.4195623993873596, "grad_norm": 1.5286788940429688, "learning_rate": 1.6100604579715185e-06} +{"ts": "2025-12-23T05:22:08", "event": "train_log", "step": 575, "epoch": 1.6767547857793983, "progress_pct": 83.82, "epoch_pct": 83.84, "eta": "03:26:26", "max_grad_norm": 1.0, "loss": 0.4366849660873413, "grad_norm": 1.1433510780334473, "learning_rate": 1.5824631993638651e-06} +{"ts": "2025-12-23T05:23:20", "event": "train_log", "step": 576, "epoch": 1.6796718322698267, "progress_pct": 83.97, "epoch_pct": 83.98, "eta": "03:24:26", "max_grad_norm": 1.0, "loss": 0.5555303692817688, "grad_norm": 1.9694907665252686, "learning_rate": 1.5550841702789122e-06} +{"ts": "2025-12-23T05:24:30", "event": "train_log", "step": 577, "epoch": 1.6825888787602552, "progress_pct": 84.11, "epoch_pct": 84.13, "eta": "03:22:27", "max_grad_norm": 1.0, "loss": 0.40394848585128784, "grad_norm": 1.7587188482284546, "learning_rate": 1.5279240805338647e-06} +{"ts": "2025-12-23T05:25:40", "event": "train_log", "step": 578, "epoch": 1.6855059252506837, "progress_pct": 84.26, "epoch_pct": 84.28, "eta": "03:20:28", "max_grad_norm": 1.0, "loss": 0.49564215540885925, "grad_norm": 1.063381314277649, "learning_rate": 1.5009836342697993e-06} +{"ts": "2025-12-23T05:26:50", "event": "train_log", "step": 579, "epoch": 1.688422971741112, "progress_pct": 84.4, "epoch_pct": 84.42, "eta": "03:18:29", "max_grad_norm": 1.0, "loss": 0.3891904950141907, "grad_norm": 1.1742531061172485, "learning_rate": 1.4742635299334063e-06} +{"ts": "2025-12-23T05:28:00", "event": "train_log", "step": 580, "epoch": 1.6913400182315406, "progress_pct": 84.55, "epoch_pct": 84.57, "eta": "03:16:30", "max_grad_norm": 1.0, "loss": 0.35497623682022095, "grad_norm": 1.499934196472168, "learning_rate": 1.4477644602588848e-06} +{"ts": "2025-12-23T05:29:12", "event": "train_log", "step": 581, "epoch": 1.694257064721969, "progress_pct": 84.69, "epoch_pct": 84.71, "eta": "03:14:32", "max_grad_norm": 1.0, "loss": 0.4062272012233734, "grad_norm": 1.5112360715866089, "learning_rate": 1.421487112249984e-06} +{"ts": "2025-12-23T05:30:22", "event": "train_log", "step": 582, "epoch": 1.6971741112123975, "progress_pct": 84.84, "epoch_pct": 84.86, "eta": "03:12:33", "max_grad_norm": 1.0, "loss": 0.3655265271663666, "grad_norm": 1.3583141565322876, "learning_rate": 1.3954321671621885e-06} +{"ts": "2025-12-23T05:31:32", "event": "train_log", "step": 583, "epoch": 1.700091157702826, "progress_pct": 84.99, "epoch_pct": 85.0, "eta": "03:10:35", "max_grad_norm": 1.0, "loss": 0.37418332695961, "grad_norm": 2.8181653022766113, "learning_rate": 1.3696003004850577e-06} +{"ts": "2025-12-23T05:32:42", "event": "train_log", "step": 584, "epoch": 1.7030082041932544, "progress_pct": 85.13, "epoch_pct": 85.15, "eta": "03:08:37", "max_grad_norm": 1.0, "loss": 0.4946930408477783, "grad_norm": 0.967166543006897, "learning_rate": 1.3439921819247138e-06} +{"ts": "2025-12-23T05:33:52", "event": "train_log", "step": 585, "epoch": 1.7059252506836828, "progress_pct": 85.28, "epoch_pct": 85.3, "eta": "03:06:39", "max_grad_norm": 1.0, "loss": 0.5101871490478516, "grad_norm": 1.2773699760437012, "learning_rate": 1.3186084753864813e-06} +{"ts": "2025-12-23T05:35:02", "event": "train_log", "step": 586, "epoch": 1.7088422971741113, "progress_pct": 85.42, "epoch_pct": 85.44, "eta": "03:04:41", "max_grad_norm": 1.0, "loss": 0.3688133656978607, "grad_norm": 1.2814991474151611, "learning_rate": 1.293449838957671e-06} +{"ts": "2025-12-23T05:36:13", "event": "train_log", "step": 587, "epoch": 1.7117593436645397, "progress_pct": 85.57, "epoch_pct": 85.59, "eta": "03:02:43", "max_grad_norm": 1.0, "loss": 0.4739398956298828, "grad_norm": 1.594966173171997, "learning_rate": 1.2685169248905228e-06} +{"ts": "2025-12-23T05:37:24", "event": "train_log", "step": 588, "epoch": 1.7146763901549682, "progress_pct": 85.71, "epoch_pct": 85.73, "eta": "03:00:46", "max_grad_norm": 1.0, "loss": 0.3719588816165924, "grad_norm": 1.1471531391143799, "learning_rate": 1.2438103795852885e-06} +{"ts": "2025-12-23T05:38:35", "event": "train_log", "step": 589, "epoch": 1.7175934366453967, "progress_pct": 85.86, "epoch_pct": 85.88, "eta": "02:58:49", "max_grad_norm": 1.0, "loss": 0.4119298458099365, "grad_norm": 1.1657356023788452, "learning_rate": 1.2193308435734852e-06} +{"ts": "2025-12-23T05:39:47", "event": "train_log", "step": 590, "epoch": 1.720510483135825, "progress_pct": 86.01, "epoch_pct": 86.03, "eta": "02:56:52", "max_grad_norm": 1.0, "loss": 0.38277503848075867, "grad_norm": 1.1239042282104492, "learning_rate": 1.1950789515012783e-06} +{"ts": "2025-12-23T05:40:57", "event": "train_log", "step": 591, "epoch": 1.7234275296262536, "progress_pct": 86.15, "epoch_pct": 86.17, "eta": "02:54:55", "max_grad_norm": 1.0, "loss": 0.35080626606941223, "grad_norm": 1.149478554725647, "learning_rate": 1.1710553321130324e-06} +{"ts": "2025-12-23T05:42:09", "event": "train_log", "step": 592, "epoch": 1.726344576116682, "progress_pct": 86.3, "epoch_pct": 86.32, "eta": "02:52:58", "max_grad_norm": 1.0, "loss": 0.3991318345069885, "grad_norm": 1.2020260095596313, "learning_rate": 1.1472606082350112e-06} +{"ts": "2025-12-23T05:43:20", "event": "train_log", "step": 593, "epoch": 1.7292616226071102, "progress_pct": 86.44, "epoch_pct": 86.46, "eta": "02:51:01", "max_grad_norm": 1.0, "loss": 0.45791420340538025, "grad_norm": 1.101475477218628, "learning_rate": 1.123695396759229e-06} +{"ts": "2025-12-23T05:44:32", "event": "train_log", "step": 594, "epoch": 1.7321786690975387, "progress_pct": 86.59, "epoch_pct": 86.61, "eta": "02:49:05", "max_grad_norm": 1.0, "loss": 0.39805036783218384, "grad_norm": 0.9617101550102234, "learning_rate": 1.1003603086274584e-06} +{"ts": "2025-12-23T05:45:42", "event": "train_log", "step": 595, "epoch": 1.7350957155879672, "progress_pct": 86.73, "epoch_pct": 86.75, "eta": "02:47:09", "max_grad_norm": 1.0, "loss": 0.35753339529037476, "grad_norm": 1.1439731121063232, "learning_rate": 1.07725594881539e-06} +{"ts": "2025-12-23T05:46:53", "event": "train_log", "step": 596, "epoch": 1.7380127620783956, "progress_pct": 86.88, "epoch_pct": 86.9, "eta": "02:45:13", "max_grad_norm": 1.0, "loss": 0.42581748962402344, "grad_norm": 1.0350618362426758, "learning_rate": 1.0543829163169516e-06} +{"ts": "2025-12-23T05:48:03", "event": "train_log", "step": 597, "epoch": 1.740929808568824, "progress_pct": 87.03, "epoch_pct": 87.05, "eta": "02:43:16", "max_grad_norm": 1.0, "loss": 0.34685325622558594, "grad_norm": 1.2865227460861206, "learning_rate": 1.031741804128773e-06} +{"ts": "2025-12-23T05:49:13", "event": "train_log", "step": 598, "epoch": 1.7438468550592525, "progress_pct": 87.17, "epoch_pct": 87.19, "eta": "02:41:21", "max_grad_norm": 1.0, "loss": 0.48401936888694763, "grad_norm": 1.2079373598098755, "learning_rate": 1.0093331992348154e-06} +{"ts": "2025-12-23T05:50:24", "event": "train_log", "step": 599, "epoch": 1.746763901549681, "progress_pct": 87.32, "epoch_pct": 87.34, "eta": "02:39:25", "max_grad_norm": 1.0, "loss": 0.387456476688385, "grad_norm": 1.1684436798095703, "learning_rate": 9.871576825911577e-07} +{"ts": "2025-12-23T05:51:34", "event": "train_log", "step": 600, "epoch": 1.7496809480401094, "progress_pct": 87.46, "epoch_pct": 87.48, "eta": "02:37:29", "max_grad_norm": 1.0, "loss": 0.40196847915649414, "grad_norm": 1.298045039176941, "learning_rate": 9.65215829110927e-07} +{"ts": "2025-12-23T06:06:46", "event": "train_log", "step": 600, "epoch": 1.7496809480401094, "progress_pct": 87.46, "epoch_pct": 87.48, "eta": "02:39:40", "max_grad_norm": 1.0, "eval_loss": 0.3965963125228882, "eval_runtime": 912.3102, "eval_samples_per_second": 0.693, "eval_steps_per_second": 0.693} +{"ts": "2025-12-23T06:07:58", "event": "train_log", "step": 601, "epoch": 1.7525979945305379, "progress_pct": 87.61, "epoch_pct": 87.63, "eta": "02:37:43", "max_grad_norm": 1.0, "loss": 0.3990224003791809, "grad_norm": 1.24501371383667, "learning_rate": 9.435082076493974e-07} +{"ts": "2025-12-23T06:09:08", "event": "train_log", "step": 602, "epoch": 1.7555150410209661, "progress_pct": 87.76, "epoch_pct": 87.78, "eta": "02:35:46", "max_grad_norm": 1.0, "loss": 0.44232451915740967, "grad_norm": 1.0634632110595703, "learning_rate": 9.220353809892435e-07} +{"ts": "2025-12-23T06:10:18", "event": "train_log", "step": 603, "epoch": 1.7584320875113946, "progress_pct": 87.9, "epoch_pct": 87.92, "eta": "02:33:49", "max_grad_norm": 1.0, "loss": 0.5336061716079712, "grad_norm": 1.0276325941085815, "learning_rate": 9.007979058259475e-07} +{"ts": "2025-12-23T06:11:29", "event": "train_log", "step": 604, "epoch": 1.761349134001823, "progress_pct": 88.05, "epoch_pct": 88.07, "eta": "02:31:52", "max_grad_norm": 1.0, "loss": 0.35023194551467896, "grad_norm": 1.1488786935806274, "learning_rate": 8.797963327533698e-07} +{"ts": "2025-12-23T06:12:39", "event": "train_log", "step": 605, "epoch": 1.7642661804922515, "progress_pct": 88.19, "epoch_pct": 88.21, "eta": "02:29:56", "max_grad_norm": 1.0, "loss": 0.4461829662322998, "grad_norm": 1.171109676361084, "learning_rate": 8.590312062494699e-07} +{"ts": "2025-12-23T06:13:50", "event": "train_log", "step": 606, "epoch": 1.76718322698268, "progress_pct": 88.34, "epoch_pct": 88.36, "eta": "02:27:59", "max_grad_norm": 1.0, "loss": 0.3448236584663391, "grad_norm": 1.3948134183883667, "learning_rate": 8.385030646621938e-07} +{"ts": "2025-12-23T06:15:00", "event": "train_log", "step": 607, "epoch": 1.7701002734731084, "progress_pct": 88.48, "epoch_pct": 88.51, "eta": "02:26:03", "max_grad_norm": 1.0, "loss": 0.39913487434387207, "grad_norm": 1.144608497619629, "learning_rate": 8.18212440195515e-07} +{"ts": "2025-12-23T06:16:10", "event": "train_log", "step": 608, "epoch": 1.7730173199635368, "progress_pct": 88.63, "epoch_pct": 88.65, "eta": "02:24:07", "max_grad_norm": 1.0, "loss": 0.40005186200141907, "grad_norm": 1.1941088438034058, "learning_rate": 7.981598588956396e-07} +{"ts": "2025-12-23T06:17:21", "event": "train_log", "step": 609, "epoch": 1.7759343664539653, "progress_pct": 88.78, "epoch_pct": 88.8, "eta": "02:22:11", "max_grad_norm": 1.0, "loss": 0.38895174860954285, "grad_norm": 1.1087690591812134, "learning_rate": 7.783458406373656e-07} +{"ts": "2025-12-23T06:18:31", "event": "train_log", "step": 610, "epoch": 1.7788514129443938, "progress_pct": 88.92, "epoch_pct": 88.94, "eta": "02:20:15", "max_grad_norm": 1.0, "loss": 0.36259594559669495, "grad_norm": 1.1787676811218262, "learning_rate": 7.587708991106069e-07} +{"ts": "2025-12-23T06:19:41", "event": "train_log", "step": 611, "epoch": 1.7817684594348222, "progress_pct": 89.07, "epoch_pct": 89.09, "eta": "02:18:19", "max_grad_norm": 1.0, "loss": 0.44475269317626953, "grad_norm": 1.1265360116958618, "learning_rate": 7.394355418070731e-07} +{"ts": "2025-12-23T06:20:51", "event": "train_log", "step": 612, "epoch": 1.7846855059252507, "progress_pct": 89.21, "epoch_pct": 89.23, "eta": "02:16:24", "max_grad_norm": 1.0, "loss": 0.3823542594909668, "grad_norm": 1.2230898141860962, "learning_rate": 7.203402700071138e-07} +{"ts": "2025-12-23T06:22:03", "event": "train_log", "step": 613, "epoch": 1.7876025524156791, "progress_pct": 89.36, "epoch_pct": 89.38, "eta": "02:14:28", "max_grad_norm": 1.0, "loss": 0.43276944756507874, "grad_norm": 1.0893492698669434, "learning_rate": 7.01485578766724e-07} +{"ts": "2025-12-23T06:23:15", "event": "train_log", "step": 614, "epoch": 1.7905195989061076, "progress_pct": 89.5, "epoch_pct": 89.53, "eta": "02:12:33", "max_grad_norm": 1.0, "loss": 0.5362570881843567, "grad_norm": 1.039494514465332, "learning_rate": 6.828719569047082e-07} +{"ts": "2025-12-23T06:24:26", "event": "train_log", "step": 615, "epoch": 1.793436645396536, "progress_pct": 89.65, "epoch_pct": 89.67, "eta": "02:10:38", "max_grad_norm": 1.0, "loss": 0.34828731417655945, "grad_norm": 1.0307413339614868, "learning_rate": 6.644998869900054e-07} +{"ts": "2025-12-23T06:25:37", "event": "train_log", "step": 616, "epoch": 1.7963536918869645, "progress_pct": 89.8, "epoch_pct": 89.82, "eta": "02:08:43", "max_grad_norm": 1.0, "loss": 0.3669811487197876, "grad_norm": 1.1253540515899658, "learning_rate": 6.463698453291823e-07} +{"ts": "2025-12-23T06:26:48", "event": "train_log", "step": 617, "epoch": 1.799270738377393, "progress_pct": 89.94, "epoch_pct": 89.96, "eta": "02:06:49", "max_grad_norm": 1.0, "loss": 0.3868233561515808, "grad_norm": 1.1103028059005737, "learning_rate": 6.28482301954082e-07} +{"ts": "2025-12-23T06:28:00", "event": "train_log", "step": 618, "epoch": 1.8021877848678214, "progress_pct": 90.09, "epoch_pct": 90.11, "eta": "02:04:54", "max_grad_norm": 1.0, "loss": 0.4123673439025879, "grad_norm": 1.0804798603057861, "learning_rate": 6.108377206096394e-07} +{"ts": "2025-12-23T06:29:11", "event": "train_log", "step": 619, "epoch": 1.8051048313582498, "progress_pct": 90.23, "epoch_pct": 90.26, "eta": "02:03:00", "max_grad_norm": 1.0, "loss": 0.44468799233436584, "grad_norm": 1.1068788766860962, "learning_rate": 5.934365587418567e-07} +{"ts": "2025-12-23T06:30:21", "event": "train_log", "step": 620, "epoch": 1.8080218778486783, "progress_pct": 90.38, "epoch_pct": 90.4, "eta": "02:01:05", "max_grad_norm": 1.0, "loss": 0.3586595356464386, "grad_norm": 1.0318645238876343, "learning_rate": 5.762792674859474e-07} +{"ts": "2025-12-23T06:31:31", "event": "train_log", "step": 621, "epoch": 1.8109389243391067, "progress_pct": 90.52, "epoch_pct": 90.55, "eta": "01:59:11", "max_grad_norm": 1.0, "loss": 0.4580552577972412, "grad_norm": 1.1553035974502563, "learning_rate": 5.593662916546361e-07} +{"ts": "2025-12-23T06:32:42", "event": "train_log", "step": 622, "epoch": 1.8138559708295352, "progress_pct": 90.67, "epoch_pct": 90.69, "eta": "01:57:17", "max_grad_norm": 1.0, "loss": 0.42412641644477844, "grad_norm": 1.3010531663894653, "learning_rate": 5.426980697266271e-07} +{"ts": "2025-12-23T06:33:52", "event": "train_log", "step": 623, "epoch": 1.8167730173199637, "progress_pct": 90.82, "epoch_pct": 90.84, "eta": "01:55:23", "max_grad_norm": 1.0, "loss": 0.38257676362991333, "grad_norm": 1.1858006715774536, "learning_rate": 5.262750338352418e-07} +{"ts": "2025-12-23T06:35:02", "event": "train_log", "step": 624, "epoch": 1.8196900638103921, "progress_pct": 90.96, "epoch_pct": 90.98, "eta": "01:53:29", "max_grad_norm": 1.0, "loss": 0.48365846276283264, "grad_norm": 1.1341536045074463, "learning_rate": 5.100976097572074e-07} +{"ts": "2025-12-23T06:36:12", "event": "train_log", "step": 625, "epoch": 1.8226071103008206, "progress_pct": 91.11, "epoch_pct": 91.13, "eta": "01:51:35", "max_grad_norm": 1.0, "loss": 0.3893233835697174, "grad_norm": 1.112844467163086, "learning_rate": 4.941662169016237e-07} +{"ts": "2025-12-23T06:37:22", "event": "train_log", "step": 626, "epoch": 1.825524156791249, "progress_pct": 91.25, "epoch_pct": 91.28, "eta": "01:49:42", "max_grad_norm": 1.0, "loss": 0.38869139552116394, "grad_norm": 1.1846497058868408, "learning_rate": 4.784812682990903e-07} +{"ts": "2025-12-23T06:38:33", "event": "train_log", "step": 627, "epoch": 1.8284412032816773, "progress_pct": 91.4, "epoch_pct": 91.42, "eta": "01:47:48", "max_grad_norm": 1.0, "loss": 0.36156678199768066, "grad_norm": 1.1383928060531616, "learning_rate": 4.6304317059099326e-07} +{"ts": "2025-12-23T06:39:43", "event": "train_log", "step": 628, "epoch": 1.8313582497721057, "progress_pct": 91.55, "epoch_pct": 91.57, "eta": "01:45:55", "max_grad_norm": 1.0, "loss": 0.40910348296165466, "grad_norm": 1.0891298055648804, "learning_rate": 4.478523240189703e-07} +{"ts": "2025-12-23T06:40:53", "event": "train_log", "step": 629, "epoch": 1.8342752962625342, "progress_pct": 91.69, "epoch_pct": 91.71, "eta": "01:44:02", "max_grad_norm": 1.0, "loss": 0.3360365629196167, "grad_norm": 1.1337662935256958, "learning_rate": 4.3290912241452545e-07} +{"ts": "2025-12-23T06:42:04", "event": "train_log", "step": 630, "epoch": 1.8371923427529626, "progress_pct": 91.84, "epoch_pct": 91.86, "eta": "01:42:09", "max_grad_norm": 1.0, "loss": 0.44318532943725586, "grad_norm": 1.280463695526123, "learning_rate": 4.182139531888263e-07} +{"ts": "2025-12-23T06:43:15", "event": "train_log", "step": 631, "epoch": 1.840109389243391, "progress_pct": 91.98, "epoch_pct": 92.01, "eta": "01:40:16", "max_grad_norm": 1.0, "loss": 0.37003564834594727, "grad_norm": 1.1408170461654663, "learning_rate": 4.0376719732265647e-07} +{"ts": "2025-12-23T06:44:26", "event": "train_log", "step": 632, "epoch": 1.8430264357338195, "progress_pct": 92.13, "epoch_pct": 92.15, "eta": "01:38:24", "max_grad_norm": 1.0, "loss": 0.355985552072525, "grad_norm": 0.9730168581008911, "learning_rate": 3.8956922935653895e-07} +{"ts": "2025-12-23T06:45:38", "event": "train_log", "step": 633, "epoch": 1.845943482224248, "progress_pct": 92.27, "epoch_pct": 92.3, "eta": "01:36:31", "max_grad_norm": 1.0, "loss": 0.3911808729171753, "grad_norm": 1.0643151998519897, "learning_rate": 3.756204173810263e-07} +{"ts": "2025-12-23T06:46:50", "event": "train_log", "step": 634, "epoch": 1.8488605287146764, "progress_pct": 92.42, "epoch_pct": 92.44, "eta": "01:34:39", "max_grad_norm": 1.0, "loss": 0.314385324716568, "grad_norm": 1.1769851446151733, "learning_rate": 3.61921123027158e-07} +{"ts": "2025-12-23T06:48:00", "event": "train_log", "step": 635, "epoch": 1.8517775752051049, "progress_pct": 92.57, "epoch_pct": 92.59, "eta": "01:32:46", "max_grad_norm": 1.0, "loss": 0.3375144302845001, "grad_norm": 0.921336829662323, "learning_rate": 3.484717014570838e-07} +{"ts": "2025-12-23T06:49:12", "event": "train_log", "step": 636, "epoch": 1.8546946216955331, "progress_pct": 92.71, "epoch_pct": 92.73, "eta": "01:30:54", "max_grad_norm": 1.0, "loss": 0.4461369514465332, "grad_norm": 0.9904773235321045, "learning_rate": 3.3527250135485744e-07} +{"ts": "2025-12-23T06:50:22", "event": "train_log", "step": 637, "epoch": 1.8576116681859616, "progress_pct": 92.86, "epoch_pct": 92.88, "eta": "01:29:02", "max_grad_norm": 1.0, "loss": 0.398414671421051, "grad_norm": 1.0844534635543823, "learning_rate": 3.223238649173954e-07} +{"ts": "2025-12-23T06:51:33", "event": "train_log", "step": 638, "epoch": 1.86052871467639, "progress_pct": 93.0, "epoch_pct": 93.03, "eta": "01:27:10", "max_grad_norm": 1.0, "loss": 0.35938704013824463, "grad_norm": 0.9829220771789551, "learning_rate": 3.096261278456048e-07} +{"ts": "2025-12-23T06:52:44", "event": "train_log", "step": 639, "epoch": 1.8634457611668185, "progress_pct": 93.15, "epoch_pct": 93.17, "eta": "01:25:19", "max_grad_norm": 1.0, "loss": 0.3783624768257141, "grad_norm": 1.13048255443573, "learning_rate": 2.971796193356835e-07} +{"ts": "2025-12-23T06:53:54", "event": "train_log", "step": 640, "epoch": 1.866362807657247, "progress_pct": 93.29, "epoch_pct": 93.32, "eta": "01:23:27", "max_grad_norm": 1.0, "loss": 0.3601874113082886, "grad_norm": 1.4307893514633179, "learning_rate": 2.8498466207058095e-07} +{"ts": "2025-12-23T06:55:04", "event": "train_log", "step": 641, "epoch": 1.8692798541476754, "progress_pct": 93.44, "epoch_pct": 93.46, "eta": "01:21:35", "max_grad_norm": 1.0, "loss": 0.43897169828414917, "grad_norm": 1.1835116147994995, "learning_rate": 2.7304157221163753e-07} +{"ts": "2025-12-23T06:56:16", "event": "train_log", "step": 642, "epoch": 1.8721969006381038, "progress_pct": 93.59, "epoch_pct": 93.61, "eta": "01:19:44", "max_grad_norm": 1.0, "loss": 0.4407995343208313, "grad_norm": 1.0730469226837158, "learning_rate": 2.613506593903825e-07} +{"ts": "2025-12-23T06:57:26", "event": "train_log", "step": 643, "epoch": 1.8751139471285323, "progress_pct": 93.73, "epoch_pct": 93.76, "eta": "01:17:53", "max_grad_norm": 1.0, "loss": 0.4105035960674286, "grad_norm": 0.9504678845405579, "learning_rate": 2.499122267005105e-07} +{"ts": "2025-12-23T06:58:36", "event": "train_log", "step": 644, "epoch": 1.8780309936189608, "progress_pct": 93.88, "epoch_pct": 93.9, "eta": "01:16:01", "max_grad_norm": 1.0, "loss": 0.41521430015563965, "grad_norm": 1.2599385976791382, "learning_rate": 2.387265706900199e-07} +{"ts": "2025-12-23T06:59:47", "event": "train_log", "step": 645, "epoch": 1.8809480401093892, "progress_pct": 94.02, "epoch_pct": 94.05, "eta": "01:14:10", "max_grad_norm": 1.0, "loss": 0.33491846919059753, "grad_norm": 1.035783052444458, "learning_rate": 2.2779398135353127e-07} +{"ts": "2025-12-23T07:00:57", "event": "train_log", "step": 646, "epoch": 1.8838650865998177, "progress_pct": 94.17, "epoch_pct": 94.19, "eta": "01:12:19", "max_grad_norm": 1.0, "loss": 0.3367970287799835, "grad_norm": 1.1612690687179565, "learning_rate": 2.1711474212476325e-07} +{"ts": "2025-12-23T07:02:07", "event": "train_log", "step": 647, "epoch": 1.8867821330902461, "progress_pct": 94.31, "epoch_pct": 94.34, "eta": "01:10:29", "max_grad_norm": 1.0, "loss": 0.46374717354774475, "grad_norm": 1.2541207075119019, "learning_rate": 2.066891298691831e-07} +{"ts": "2025-12-23T07:03:17", "event": "train_log", "step": 648, "epoch": 1.8896991795806746, "progress_pct": 94.46, "epoch_pct": 94.48, "eta": "01:08:38", "max_grad_norm": 1.0, "loss": 0.3799871802330017, "grad_norm": 1.1037088632583618, "learning_rate": 1.9651741487683562e-07} +{"ts": "2025-12-23T07:04:28", "event": "train_log", "step": 649, "epoch": 1.892616226071103, "progress_pct": 94.61, "epoch_pct": 94.63, "eta": "01:06:47", "max_grad_norm": 1.0, "loss": 0.40523889660835266, "grad_norm": 1.3611476421356201, "learning_rate": 1.8659986085532988e-07} +{"ts": "2025-12-23T07:05:40", "event": "train_log", "step": 650, "epoch": 1.8955332725615315, "progress_pct": 94.75, "epoch_pct": 94.78, "eta": "01:04:57", "max_grad_norm": 1.0, "loss": 0.38399839401245117, "grad_norm": 1.1628823280334473, "learning_rate": 1.7693672492300473e-07} +{"ts": "2025-12-23T07:20:44", "event": "train_log", "step": 650, "epoch": 1.8955332725615315, "progress_pct": 94.75, "epoch_pct": 94.78, "eta": "01:05:47", "max_grad_norm": 1.0, "eval_loss": 0.3949255049228668, "eval_runtime": 903.6455, "eval_samples_per_second": 0.699, "eval_steps_per_second": 0.699} +{"ts": "2025-12-23T07:21:54", "event": "train_log", "step": 651, "epoch": 1.89845031905196, "progress_pct": 94.9, "epoch_pct": 94.92, "eta": "01:03:55", "max_grad_norm": 1.0, "loss": 0.4280855059623718, "grad_norm": 1.1185522079467773, "learning_rate": 1.675282576022641e-07} +{"ts": "2025-12-23T07:23:04", "event": "train_log", "step": 652, "epoch": 1.9013673655423884, "progress_pct": 95.04, "epoch_pct": 95.07, "eta": "01:02:04", "max_grad_norm": 1.0, "loss": 0.3026162087917328, "grad_norm": 1.1962717771530151, "learning_rate": 1.5837470281307666e-07} +{"ts": "2025-12-23T07:24:15", "event": "train_log", "step": 653, "epoch": 1.9042844120328168, "progress_pct": 95.19, "epoch_pct": 95.21, "eta": "01:00:12", "max_grad_norm": 1.0, "loss": 0.43283963203430176, "grad_norm": 1.1818240880966187, "learning_rate": 1.4947629786666084e-07} +{"ts": "2025-12-23T07:25:25", "event": "train_log", "step": 654, "epoch": 1.9072014585232453, "progress_pct": 95.34, "epoch_pct": 95.36, "eta": "00:58:21", "max_grad_norm": 1.0, "loss": 0.435259610414505, "grad_norm": 1.161944031715393, "learning_rate": 1.4083327345932208e-07} +{"ts": "2025-12-23T07:26:36", "event": "train_log", "step": 655, "epoch": 1.9101185050136738, "progress_pct": 95.48, "epoch_pct": 95.51, "eta": "00:56:30", "max_grad_norm": 1.0, "loss": 0.3258042633533478, "grad_norm": 1.1311709880828857, "learning_rate": 1.32445853666483e-07} +{"ts": "2025-12-23T07:27:48", "event": "train_log", "step": 656, "epoch": 1.9130355515041022, "progress_pct": 95.63, "epoch_pct": 95.65, "eta": "00:54:39", "max_grad_norm": 1.0, "loss": 0.40951770544052124, "grad_norm": 1.0152852535247803, "learning_rate": 1.2431425593686263e-07} +{"ts": "2025-12-23T07:29:01", "event": "train_log", "step": 657, "epoch": 1.9159525979945307, "progress_pct": 95.77, "epoch_pct": 95.8, "eta": "00:52:48", "max_grad_norm": 1.0, "loss": 0.3610893785953522, "grad_norm": 1.2698794603347778, "learning_rate": 1.164386910868498e-07} +{"ts": "2025-12-23T07:30:13", "event": "train_log", "step": 658, "epoch": 1.9188696444849591, "progress_pct": 95.92, "epoch_pct": 95.94, "eta": "00:50:57", "max_grad_norm": 1.0, "loss": 0.31951773166656494, "grad_norm": 1.1092722415924072, "learning_rate": 1.0881936329502851e-07} +{"ts": "2025-12-23T07:31:25", "event": "train_log", "step": 659, "epoch": 1.9217866909753876, "progress_pct": 96.06, "epoch_pct": 96.09, "eta": "00:49:06", "max_grad_norm": 1.0, "loss": 0.3756055235862732, "grad_norm": 1.2378597259521484, "learning_rate": 1.0145647009689008e-07} +{"ts": "2025-12-23T07:32:36", "event": "train_log", "step": 660, "epoch": 1.924703737465816, "progress_pct": 96.21, "epoch_pct": 96.24, "eta": "00:47:15", "max_grad_norm": 1.0, "loss": 0.26117536425590515, "grad_norm": 1.0100237131118774, "learning_rate": 9.43502023797116e-08} +{"ts": "2025-12-23T07:33:47", "event": "train_log", "step": 661, "epoch": 1.9276207839562443, "progress_pct": 96.36, "epoch_pct": 96.38, "eta": "00:45:25", "max_grad_norm": 1.0, "loss": 0.3092282712459564, "grad_norm": 1.2368487119674683, "learning_rate": 8.750074437760325e-08} +{"ts": "2025-12-23T07:34:57", "event": "train_log", "step": 662, "epoch": 1.9305378304466727, "progress_pct": 96.5, "epoch_pct": 96.53, "eta": "00:43:35", "max_grad_norm": 1.0, "loss": 0.4076297879219055, "grad_norm": 1.0328837633132935, "learning_rate": 8.090827366673548e-08} +{"ts": "2025-12-23T07:36:07", "event": "train_log", "step": 663, "epoch": 1.9334548769371012, "progress_pct": 96.65, "epoch_pct": 96.67, "eta": "00:41:44", "max_grad_norm": 1.0, "loss": 0.40007251501083374, "grad_norm": 0.9885771870613098, "learning_rate": 7.457296116073487e-08} +{"ts": "2025-12-23T07:37:17", "event": "train_log", "step": 664, "epoch": 1.9363719234275296, "progress_pct": 96.79, "epoch_pct": 96.82, "eta": "00:39:54", "max_grad_norm": 1.0, "loss": 0.3751019239425659, "grad_norm": 1.19287109375, "learning_rate": 6.849497110625214e-08} +{"ts": "2025-12-23T07:38:28", "event": "train_log", "step": 665, "epoch": 1.939288969917958, "progress_pct": 96.94, "epoch_pct": 96.96, "eta": "00:38:04", "max_grad_norm": 1.0, "loss": 0.4558236300945282, "grad_norm": 1.134682536125183, "learning_rate": 6.267446107870334e-08} +{"ts": "2025-12-23T07:39:39", "event": "train_log", "step": 666, "epoch": 1.9422060164083865, "progress_pct": 97.08, "epoch_pct": 97.11, "eta": "00:36:14", "max_grad_norm": 1.0, "loss": 0.5070392489433289, "grad_norm": 3.414883852005005, "learning_rate": 5.7111581978185336e-08} +{"ts": "2025-12-23T07:40:49", "event": "train_log", "step": 667, "epoch": 1.945123062898815, "progress_pct": 97.23, "epoch_pct": 97.26, "eta": "00:34:24", "max_grad_norm": 1.0, "loss": 0.389989972114563, "grad_norm": 1.179479956626892, "learning_rate": 5.180647802556671e-08} +{"ts": "2025-12-23T07:41:59", "event": "train_log", "step": 668, "epoch": 1.9480401093892434, "progress_pct": 97.38, "epoch_pct": 97.4, "eta": "00:32:35", "max_grad_norm": 1.0, "loss": 0.460910826921463, "grad_norm": 1.1473273038864136, "learning_rate": 4.675928675874186e-08} +{"ts": "2025-12-23T07:43:10", "event": "train_log", "step": 669, "epoch": 1.9509571558796717, "progress_pct": 97.52, "epoch_pct": 97.55, "eta": "00:30:45", "max_grad_norm": 1.0, "loss": 0.5488728284835815, "grad_norm": 0.9269355535507202, "learning_rate": 4.197013902907165e-08} +{"ts": "2025-12-23T07:44:20", "event": "train_log", "step": 670, "epoch": 1.9538742023701001, "progress_pct": 97.67, "epoch_pct": 97.69, "eta": "00:28:55", "max_grad_norm": 1.0, "loss": 0.39483463764190674, "grad_norm": 1.1781370639801025, "learning_rate": 3.7439158997989445e-08} +{"ts": "2025-12-23T07:45:30", "event": "train_log", "step": 671, "epoch": 1.9567912488605286, "progress_pct": 97.81, "epoch_pct": 97.84, "eta": "00:27:06", "max_grad_norm": 1.0, "loss": 0.38600990176200867, "grad_norm": 1.1759430170059204, "learning_rate": 3.316646413377811e-08} +{"ts": "2025-12-23T07:46:41", "event": "train_log", "step": 672, "epoch": 1.959708295350957, "progress_pct": 97.96, "epoch_pct": 97.99, "eta": "00:25:17", "max_grad_norm": 1.0, "loss": 0.4657193422317505, "grad_norm": 1.1981792449951172, "learning_rate": 2.9152165208529147e-08} +{"ts": "2025-12-23T07:47:52", "event": "train_log", "step": 673, "epoch": 1.9626253418413855, "progress_pct": 98.1, "epoch_pct": 98.13, "eta": "00:23:28", "max_grad_norm": 1.0, "loss": 0.46212077140808105, "grad_norm": 1.186043620109558, "learning_rate": 2.5396366295272756e-08} +{"ts": "2025-12-23T07:49:03", "event": "train_log", "step": 674, "epoch": 1.965542388331814, "progress_pct": 98.25, "epoch_pct": 98.28, "eta": "00:21:39", "max_grad_norm": 1.0, "loss": 0.4416077733039856, "grad_norm": 1.115103840827942, "learning_rate": 2.1899164765271096e-08} +{"ts": "2025-12-23T07:50:15", "event": "train_log", "step": 675, "epoch": 1.9684594348222424, "progress_pct": 98.4, "epoch_pct": 98.42, "eta": "00:19:50", "max_grad_norm": 1.0, "loss": 0.3557685911655426, "grad_norm": 1.2150691747665405, "learning_rate": 1.866065128550365e-08} +{"ts": "2025-12-23T07:51:26", "event": "train_log", "step": 676, "epoch": 1.9713764813126708, "progress_pct": 98.54, "epoch_pct": 98.57, "eta": "00:18:01", "max_grad_norm": 1.0, "loss": 0.32865390181541443, "grad_norm": 1.096506953239441, "learning_rate": 1.5680909816309098e-08} +{"ts": "2025-12-23T07:52:39", "event": "train_log", "step": 677, "epoch": 1.9742935278030993, "progress_pct": 98.69, "epoch_pct": 98.71, "eta": "00:16:13", "max_grad_norm": 1.0, "loss": 0.37568721175193787, "grad_norm": 1.0974191427230835, "learning_rate": 1.2960017609213727e-08} +{"ts": "2025-12-23T07:53:50", "event": "train_log", "step": 678, "epoch": 1.9772105742935278, "progress_pct": 98.83, "epoch_pct": 98.86, "eta": "00:14:24", "max_grad_norm": 1.0, "loss": 0.329836905002594, "grad_norm": 1.1290082931518555, "learning_rate": 1.0498045204924145e-08} +{"ts": "2025-12-23T07:55:00", "event": "train_log", "step": 679, "epoch": 1.9801276207839562, "progress_pct": 98.98, "epoch_pct": 99.01, "eta": "00:12:36", "max_grad_norm": 1.0, "loss": 0.2694982886314392, "grad_norm": 1.0609803199768066, "learning_rate": 8.295056431504301e-09} +{"ts": "2025-12-23T07:56:10", "event": "train_log", "step": 680, "epoch": 1.9830446672743847, "progress_pct": 99.13, "epoch_pct": 99.15, "eta": "00:10:47", "max_grad_norm": 1.0, "loss": 0.4270719587802887, "grad_norm": 0.9838472604751587, "learning_rate": 6.3511084027156885e-09} +{"ts": "2025-12-23T07:57:20", "event": "train_log", "step": 681, "epoch": 1.9859617137648131, "progress_pct": 99.27, "epoch_pct": 99.3, "eta": "00:08:59", "max_grad_norm": 1.0, "loss": 0.4060650169849396, "grad_norm": 1.1900098323822021, "learning_rate": 4.666251516536324e-09} +{"ts": "2025-12-23T07:58:30", "event": "train_log", "step": 682, "epoch": 1.9888787602552416, "progress_pct": 99.42, "epoch_pct": 99.44, "eta": "00:07:11", "max_grad_norm": 1.0, "loss": 0.3900409936904907, "grad_norm": 0.9812174439430237, "learning_rate": 3.2405294538606637e-09} +{"ts": "2025-12-23T07:59:41", "event": "train_log", "step": 683, "epoch": 1.99179580674567, "progress_pct": 99.56, "epoch_pct": 99.59, "eta": "00:05:23", "max_grad_norm": 1.0, "loss": 0.3999583125114441, "grad_norm": 1.1988210678100586, "learning_rate": 2.073979177357188e-09} +{"ts": "2025-12-23T08:00:51", "event": "train_log", "step": 684, "epoch": 1.9947128532360985, "progress_pct": 99.71, "epoch_pct": 99.74, "eta": "00:03:35", "max_grad_norm": 1.0, "loss": 0.46780622005462646, "grad_norm": 0.9738736152648926, "learning_rate": 1.1666309305202738e-09} +{"ts": "2025-12-23T08:02:01", "event": "train_log", "step": 685, "epoch": 1.997629899726527, "progress_pct": 99.85, "epoch_pct": 99.88, "eta": "00:01:47", "max_grad_norm": 1.0, "loss": 0.4595794975757599, "grad_norm": 0.9841824173927307, "learning_rate": 5.18508236878601e-10} +{"ts": "2025-12-23T08:02:59", "event": "train_log", "step": 686, "epoch": 2.0, "progress_pct": 100.0, "epoch_pct": 100.0, "eta": "00:00:00", "max_grad_norm": 1.0, "loss": 0.5136060118675232, "grad_norm": 1.0865421295166016, "learning_rate": 1.2962789938897323e-10} +{"ts": "2025-12-23T08:03:00", "event": "train_log", "step": 686, "epoch": 2.0, "progress_pct": 100.0, "epoch_pct": 100.0, "eta": "00:00:00", "max_grad_norm": 1.0, "train_runtime": 73813.7847, "train_samples_per_second": 0.149, "train_steps_per_second": 0.009, "total_flos": 6.317102071220797e+18, "train_loss": 0.5158847309757599} +{"ts": "2025-12-23T08:18:17", "event": "train_log", "step": 686, "epoch": 2.0, "progress_pct": 100.0, "epoch_pct": 100.0, "eta": "00:00:00", "max_grad_norm": 1.0, "eval_loss": 0.3965963125228882, "eval_runtime": 916.1187, "eval_samples_per_second": 0.69, "eval_steps_per_second": 0.69}