diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..4dad65ab84b63900ed6c1615183430c6f9d4cc55 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +cpt_qwen_14B/best_adapter/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_qwen_14B/checkpoints/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_qwen_14B/checkpoints/checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_qwen_14B/checkpoints/checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_qwen_14B/checkpoints/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_qwen_14B/checkpoints/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer.json filter=lfs diff=lfs merge=lfs -text +cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/run-g6vlcw0j.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/cpt_qwen_14B/best_adapter/README.md b/cpt_qwen_14B/best_adapter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8dfda26032514233f3e70a4012f1cfd1ddbbb609 --- /dev/null +++ b/cpt_qwen_14B/best_adapter/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Qwen2.5-Coder-14B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Qwen2.5-Coder-14B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_qwen_14B/best_adapter/adapter_config.json b/cpt_qwen_14B/best_adapter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7111d3f1eb9dd0b4b1f5c431d5a0f99fb0799d88 --- /dev/null +++ b/cpt_qwen_14B/best_adapter/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Qwen2.5-Coder-14B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_qwen_14B/best_adapter/adapter_model.safetensors b/cpt_qwen_14B/best_adapter/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99d5939e2605b44abdbcc01e0cdccdd954c4b7ce --- /dev/null +++ b/cpt_qwen_14B/best_adapter/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:732e678c9e22bba352641afc71ed5fc2394671dd0d66707e288224822a906558 +size 201378736 diff --git a/cpt_qwen_14B/best_adapter/chat_template.jinja b/cpt_qwen_14B/best_adapter/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..28028c056af412405debd878cdda0171e35fa5d1 --- /dev/null +++ b/cpt_qwen_14B/best_adapter/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/cpt_qwen_14B/best_adapter/tokenizer.json b/cpt_qwen_14B/best_adapter/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/cpt_qwen_14B/best_adapter/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/cpt_qwen_14B/best_adapter/tokenizer_config.json b/cpt_qwen_14B/best_adapter/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..217274ef8275420e4bf3b976f3948901cd3d176f --- /dev/null +++ b/cpt_qwen_14B/best_adapter/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/cpt_qwen_14B/best_adapter/training_args.bin b/cpt_qwen_14B/best_adapter/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..048bda796c842f6323a1b07ab055ba4ed7ed862c --- /dev/null +++ b/cpt_qwen_14B/best_adapter/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a501e145e724e2b92f08ecb0badd762f4f3a8472eb7b53a96edc24d2ff6150ce +size 5201 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/README.md b/cpt_qwen_14B/checkpoints/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8dfda26032514233f3e70a4012f1cfd1ddbbb609 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Qwen2.5-Coder-14B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Qwen2.5-Coder-14B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/adapter_config.json b/cpt_qwen_14B/checkpoints/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81c31359285f7e351a44275c30b6882f4c6b50c0 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Qwen2.5-Coder-14B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/adapter_model.safetensors b/cpt_qwen_14B/checkpoints/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d72e8ead397bc6aad8a0a78e6e666a5cfc7c2dda --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eb9828020a915a338a62e297a70ad08859fd2caf23d051f1106384bd1013c18 +size 201378736 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/chat_template.jinja b/cpt_qwen_14B/checkpoints/checkpoint-100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..28028c056af412405debd878cdda0171e35fa5d1 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/optimizer.pt b/cpt_qwen_14B/checkpoints/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0d888548d6c07f94a83eeff74932c2e2457ef48 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7210a9536a1373d78b688acc34126df7e1e110a1466aba22b339760271bbd078 +size 102698471 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/rng_state.pth b/cpt_qwen_14B/checkpoints/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5ec8fe45e0b0e6c0c970bb302d9a3907304812f6 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f9c723f3b3ec93075e4df80d4b9fca594bf5084eeca69dfc4bee734176b2011 +size 14645 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/scheduler.pt b/cpt_qwen_14B/checkpoints/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5eeed4ed99e406ada2e15be657ff71b5a3e5ad60 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da9afcff37b2d4b9e726256840193d30b8c1205dbf5b286ebb18b671682f2424 +size 1465 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/tokenizer.json b/cpt_qwen_14B/checkpoints/checkpoint-100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/tokenizer_config.json b/cpt_qwen_14B/checkpoints/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..217274ef8275420e4bf3b976f3948901cd3d176f --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/trainer_state.json b/cpt_qwen_14B/checkpoints/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e736c05df93b9feb9db523415b7712ebaa535155 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/trainer_state.json @@ -0,0 +1,750 @@ +{ + "best_global_step": 100, + "best_metric": 0.884428083896637, + "best_model_checkpoint": "runs/cpt_run_14b/checkpoints/checkpoint-100", + "epoch": 0.30569354222392053, + "eval_steps": 50, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003056935422239205, + "grad_norm": 0.06516239047050476, + "learning_rate": 0.0, + "loss": 1.138384461402893, + "step": 1 + }, + { + "epoch": 0.00611387084447841, + "grad_norm": 0.05343673378229141, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.983342707157135, + "step": 2 + }, + { + "epoch": 0.009170806266717615, + "grad_norm": 0.05608418956398964, + "learning_rate": 6.060606060606061e-07, + "loss": 1.0762118101119995, + "step": 3 + }, + { + "epoch": 0.01222774168895682, + "grad_norm": 0.06523486226797104, + "learning_rate": 9.090909090909091e-07, + "loss": 1.084489345550537, + "step": 4 + }, + { + "epoch": 0.015284677111196026, + "grad_norm": 0.06582186371088028, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.2037022113800049, + "step": 5 + }, + { + "epoch": 0.01834161253343523, + "grad_norm": 0.06097998470067978, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.10005784034729, + "step": 6 + }, + { + "epoch": 0.021398547955674436, + "grad_norm": 0.10365528613328934, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.0895193815231323, + "step": 7 + }, + { + "epoch": 0.02445548337791364, + "grad_norm": 0.06312141567468643, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.0593242645263672, + "step": 8 + }, + { + "epoch": 0.027512418800152847, + "grad_norm": 0.05508403480052948, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.9772955179214478, + "step": 9 + }, + { + "epoch": 0.030569354222392053, + "grad_norm": 0.06006711348891258, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.084238886833191, + "step": 10 + }, + { + "epoch": 0.033626289644631255, + "grad_norm": 0.0588749423623085, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.0786534547805786, + "step": 11 + }, + { + "epoch": 0.03668322506687046, + "grad_norm": 0.046551357954740524, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.0370622873306274, + "step": 12 + }, + { + "epoch": 0.039740160489109666, + "grad_norm": 0.061659567058086395, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.0646986961364746, + "step": 13 + }, + { + "epoch": 0.04279709591134887, + "grad_norm": 0.06007347255945206, + "learning_rate": 3.93939393939394e-06, + "loss": 1.0311307907104492, + "step": 14 + }, + { + "epoch": 0.04585403133358808, + "grad_norm": 0.07314135134220123, + "learning_rate": 4.242424242424243e-06, + "loss": 1.1300500631332397, + "step": 15 + }, + { + "epoch": 0.04891096675582728, + "grad_norm": 0.060934022068977356, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0197452306747437, + "step": 16 + }, + { + "epoch": 0.05196790217806649, + "grad_norm": 0.056856051087379456, + "learning_rate": 4.848484848484849e-06, + "loss": 1.0438549518585205, + "step": 17 + }, + { + "epoch": 0.055024837600305694, + "grad_norm": 0.05908689647912979, + "learning_rate": 5.151515151515152e-06, + "loss": 1.0398856401443481, + "step": 18 + }, + { + "epoch": 0.0580817730225449, + "grad_norm": 0.07411840558052063, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.107885479927063, + "step": 19 + }, + { + "epoch": 0.061138708444784105, + "grad_norm": 0.0749165341258049, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.1060967445373535, + "step": 20 + }, + { + "epoch": 0.06419564386702331, + "grad_norm": 0.06720177084207535, + "learning_rate": 6.060606060606061e-06, + "loss": 1.0471720695495605, + "step": 21 + }, + { + "epoch": 0.06725257928926251, + "grad_norm": 0.05990725755691528, + "learning_rate": 6.363636363636364e-06, + "loss": 1.0944981575012207, + "step": 22 + }, + { + "epoch": 0.07030951471150172, + "grad_norm": 0.06672193855047226, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1477092504501343, + "step": 23 + }, + { + "epoch": 0.07336645013374092, + "grad_norm": 0.06145205348730087, + "learning_rate": 6.969696969696971e-06, + "loss": 1.0591784715652466, + "step": 24 + }, + { + "epoch": 0.07642338555598013, + "grad_norm": 0.0757482647895813, + "learning_rate": 7.272727272727273e-06, + "loss": 1.0500165224075317, + "step": 25 + }, + { + "epoch": 0.07948032097821933, + "grad_norm": 0.07848478108644485, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.0747522115707397, + "step": 26 + }, + { + "epoch": 0.08253725640045854, + "grad_norm": 0.07740631699562073, + "learning_rate": 7.87878787878788e-06, + "loss": 1.132310152053833, + "step": 27 + }, + { + "epoch": 0.08559419182269774, + "grad_norm": 0.07476603239774704, + "learning_rate": 8.181818181818183e-06, + "loss": 1.0339502096176147, + "step": 28 + }, + { + "epoch": 0.08865112724493696, + "grad_norm": 0.0779196098446846, + "learning_rate": 8.484848484848486e-06, + "loss": 1.1047282218933105, + "step": 29 + }, + { + "epoch": 0.09170806266717615, + "grad_norm": 0.06962384283542633, + "learning_rate": 8.787878787878788e-06, + "loss": 1.004916787147522, + "step": 30 + }, + { + "epoch": 0.09476499808941537, + "grad_norm": 0.06369175016880035, + "learning_rate": 9.090909090909091e-06, + "loss": 0.9296417832374573, + "step": 31 + }, + { + "epoch": 0.09782193351165457, + "grad_norm": 0.07470260560512543, + "learning_rate": 9.393939393939396e-06, + "loss": 1.0721708536148071, + "step": 32 + }, + { + "epoch": 0.10087886893389378, + "grad_norm": 0.07948213815689087, + "learning_rate": 9.696969696969698e-06, + "loss": 1.0350117683410645, + "step": 33 + }, + { + "epoch": 0.10393580435613298, + "grad_norm": 0.07066022604703903, + "learning_rate": 1e-05, + "loss": 1.026305913925171, + "step": 34 + }, + { + "epoch": 0.10699273977837218, + "grad_norm": 0.07774543762207031, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.0509816408157349, + "step": 35 + }, + { + "epoch": 0.11004967520061139, + "grad_norm": 0.07501248270273209, + "learning_rate": 1.0606060606060606e-05, + "loss": 1.0011574029922485, + "step": 36 + }, + { + "epoch": 0.11310661062285059, + "grad_norm": 0.6622501611709595, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.9754424691200256, + "step": 37 + }, + { + "epoch": 0.1161635460450898, + "grad_norm": 0.07566080242395401, + "learning_rate": 1.1212121212121212e-05, + "loss": 1.0342774391174316, + "step": 38 + }, + { + "epoch": 0.119220481467329, + "grad_norm": 0.07573831081390381, + "learning_rate": 1.1515151515151517e-05, + "loss": 0.9714518785476685, + "step": 39 + }, + { + "epoch": 0.12227741688956821, + "grad_norm": 0.08083852380514145, + "learning_rate": 1.181818181818182e-05, + "loss": 1.1050316095352173, + "step": 40 + }, + { + "epoch": 0.12533435231180742, + "grad_norm": 0.08540588617324829, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.0871070623397827, + "step": 41 + }, + { + "epoch": 0.12839128773404662, + "grad_norm": 0.07391592115163803, + "learning_rate": 1.2424242424242425e-05, + "loss": 1.0206722021102905, + "step": 42 + }, + { + "epoch": 0.13144822315628582, + "grad_norm": 0.07063689082860947, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.9775047898292542, + "step": 43 + }, + { + "epoch": 0.13450515857852502, + "grad_norm": 0.07288888841867447, + "learning_rate": 1.3030303030303032e-05, + "loss": 1.1132858991622925, + "step": 44 + }, + { + "epoch": 0.13756209400076425, + "grad_norm": 0.07641777396202087, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.0707701444625854, + "step": 45 + }, + { + "epoch": 0.14061902942300344, + "grad_norm": 0.06990326195955276, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.9328265190124512, + "step": 46 + }, + { + "epoch": 0.14367596484524264, + "grad_norm": 0.0834241658449173, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.0131721496582031, + "step": 47 + }, + { + "epoch": 0.14673290026748184, + "grad_norm": 0.0714937075972557, + "learning_rate": 1.4242424242424245e-05, + "loss": 0.940493106842041, + "step": 48 + }, + { + "epoch": 0.14978983568972107, + "grad_norm": 0.07770547270774841, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.0435771942138672, + "step": 49 + }, + { + "epoch": 0.15284677111196027, + "grad_norm": 0.07950945198535919, + "learning_rate": 1.484848484848485e-05, + "loss": 1.0382137298583984, + "step": 50 + }, + { + "epoch": 0.15284677111196027, + "eval_loss": 1.0129202604293823, + "eval_runtime": 724.3664, + "eval_samples_per_second": 0.832, + "eval_steps_per_second": 0.832, + "step": 50 + }, + { + "epoch": 0.15590370653419947, + "grad_norm": 0.06961936503648758, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.9690049886703491, + "step": 51 + }, + { + "epoch": 0.15896064195643866, + "grad_norm": 0.069523885846138, + "learning_rate": 1.5454545454545454e-05, + "loss": 0.9830482006072998, + "step": 52 + }, + { + "epoch": 0.16201757737867786, + "grad_norm": 0.0764622762799263, + "learning_rate": 1.575757575757576e-05, + "loss": 1.0895472764968872, + "step": 53 + }, + { + "epoch": 0.1650745128009171, + "grad_norm": 0.1413721889257431, + "learning_rate": 1.606060606060606e-05, + "loss": 1.0354574918746948, + "step": 54 + }, + { + "epoch": 0.1681314482231563, + "grad_norm": 0.06818042695522308, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.8534265160560608, + "step": 55 + }, + { + "epoch": 0.1711883836453955, + "grad_norm": 0.0722246989607811, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.9580274820327759, + "step": 56 + }, + { + "epoch": 0.17424531906763469, + "grad_norm": 0.07113443315029144, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.0721848011016846, + "step": 57 + }, + { + "epoch": 0.1773022544898739, + "grad_norm": 0.08412107080221176, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.1180150508880615, + "step": 58 + }, + { + "epoch": 0.1803591899121131, + "grad_norm": 0.07381036877632141, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.0384547710418701, + "step": 59 + }, + { + "epoch": 0.1834161253343523, + "grad_norm": 0.07089001685380936, + "learning_rate": 1.787878787878788e-05, + "loss": 1.0446016788482666, + "step": 60 + }, + { + "epoch": 0.1864730607565915, + "grad_norm": 0.11576953530311584, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0015051364898682, + "step": 61 + }, + { + "epoch": 0.18952999617883073, + "grad_norm": 0.08030868321657181, + "learning_rate": 1.8484848484848487e-05, + "loss": 0.9642710089683533, + "step": 62 + }, + { + "epoch": 0.19258693160106993, + "grad_norm": 0.08332342654466629, + "learning_rate": 1.8787878787878792e-05, + "loss": 1.0722991228103638, + "step": 63 + }, + { + "epoch": 0.19564386702330913, + "grad_norm": 0.08000365644693375, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.0104647874832153, + "step": 64 + }, + { + "epoch": 0.19870080244554833, + "grad_norm": 0.08139508217573166, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9445061087608337, + "step": 65 + }, + { + "epoch": 0.20175773786778756, + "grad_norm": 0.08749893307685852, + "learning_rate": 1.96969696969697e-05, + "loss": 1.080810308456421, + "step": 66 + }, + { + "epoch": 0.20481467329002676, + "grad_norm": 0.0786912813782692, + "learning_rate": 2e-05, + "loss": 0.9705753922462463, + "step": 67 + }, + { + "epoch": 0.20787160871226595, + "grad_norm": 0.08962028473615646, + "learning_rate": 1.9999858236410775e-05, + "loss": 0.962783694267273, + "step": 68 + }, + { + "epoch": 0.21092854413450515, + "grad_norm": 0.08402887731790543, + "learning_rate": 1.9999432949662483e-05, + "loss": 0.9959614872932434, + "step": 69 + }, + { + "epoch": 0.21398547955674435, + "grad_norm": 0.08036444336175919, + "learning_rate": 1.9998724151813157e-05, + "loss": 0.9569960832595825, + "step": 70 + }, + { + "epoch": 0.21704241497898358, + "grad_norm": 0.08247046917676926, + "learning_rate": 1.9997731862959143e-05, + "loss": 1.0012171268463135, + "step": 71 + }, + { + "epoch": 0.22009935040122278, + "grad_norm": 0.08966264873743057, + "learning_rate": 1.999645611123453e-05, + "loss": 1.0403809547424316, + "step": 72 + }, + { + "epoch": 0.22315628582346198, + "grad_norm": 0.08061660826206207, + "learning_rate": 1.999489693281034e-05, + "loss": 1.0089740753173828, + "step": 73 + }, + { + "epoch": 0.22621322124570117, + "grad_norm": 0.09005365520715714, + "learning_rate": 1.9993054371893526e-05, + "loss": 0.9333044290542603, + "step": 74 + }, + { + "epoch": 0.2292701566679404, + "grad_norm": 0.08651519566774368, + "learning_rate": 1.9990928480725694e-05, + "loss": 0.9284015893936157, + "step": 75 + }, + { + "epoch": 0.2323270920901796, + "grad_norm": 0.08141147345304489, + "learning_rate": 1.9988519319581637e-05, + "loss": 0.9782730340957642, + "step": 76 + }, + { + "epoch": 0.2353840275124188, + "grad_norm": 0.08344405144453049, + "learning_rate": 1.998582695676762e-05, + "loss": 0.9723064303398132, + "step": 77 + }, + { + "epoch": 0.238440962934658, + "grad_norm": 0.08019903302192688, + "learning_rate": 1.998285146861945e-05, + "loss": 0.9648997783660889, + "step": 78 + }, + { + "epoch": 0.24149789835689722, + "grad_norm": 0.08113416284322739, + "learning_rate": 1.99795929395003e-05, + "loss": 0.9263214468955994, + "step": 79 + }, + { + "epoch": 0.24455483377913642, + "grad_norm": 0.08127513527870178, + "learning_rate": 1.997605146179833e-05, + "loss": 0.8745232224464417, + "step": 80 + }, + { + "epoch": 0.24761176920137562, + "grad_norm": 0.09934187680482864, + "learning_rate": 1.997222713592405e-05, + "loss": 0.8722782135009766, + "step": 81 + }, + { + "epoch": 0.25066870462361485, + "grad_norm": 0.09701363742351532, + "learning_rate": 1.9968120070307503e-05, + "loss": 1.0084266662597656, + "step": 82 + }, + { + "epoch": 0.253725640045854, + "grad_norm": 0.08335654437541962, + "learning_rate": 1.9963730381395154e-05, + "loss": 0.9239332675933838, + "step": 83 + }, + { + "epoch": 0.25678257546809324, + "grad_norm": 0.09161650389432907, + "learning_rate": 1.9959058193646618e-05, + "loss": 0.9878032207489014, + "step": 84 + }, + { + "epoch": 0.2598395108903324, + "grad_norm": 0.08067663013935089, + "learning_rate": 1.9954103639531116e-05, + "loss": 0.9113098382949829, + "step": 85 + }, + { + "epoch": 0.26289644631257164, + "grad_norm": 0.09619539976119995, + "learning_rate": 1.9948866859523717e-05, + "loss": 0.9527600407600403, + "step": 86 + }, + { + "epoch": 0.26595338173481087, + "grad_norm": 0.10015493631362915, + "learning_rate": 1.9943348002101374e-05, + "loss": 0.9569152593612671, + "step": 87 + }, + { + "epoch": 0.26901031715705004, + "grad_norm": 0.09012345969676971, + "learning_rate": 1.993754722373869e-05, + "loss": 0.8912045359611511, + "step": 88 + }, + { + "epoch": 0.27206725257928926, + "grad_norm": 0.10342805832624435, + "learning_rate": 1.9931464688903502e-05, + "loss": 0.856104850769043, + "step": 89 + }, + { + "epoch": 0.2751241880015285, + "grad_norm": 0.10218493640422821, + "learning_rate": 1.9925100570052194e-05, + "loss": 0.9631397128105164, + "step": 90 + }, + { + "epoch": 0.27818112342376766, + "grad_norm": 0.10909046977758408, + "learning_rate": 1.9918455047624847e-05, + "loss": 0.8532565236091614, + "step": 91 + }, + { + "epoch": 0.2812380588460069, + "grad_norm": 0.10714197903871536, + "learning_rate": 1.9911528310040073e-05, + "loss": 0.9691859483718872, + "step": 92 + }, + { + "epoch": 0.28429499426824606, + "grad_norm": 0.1108694076538086, + "learning_rate": 1.990432055368971e-05, + "loss": 0.9374334812164307, + "step": 93 + }, + { + "epoch": 0.2873519296904853, + "grad_norm": 0.10037308186292648, + "learning_rate": 1.989683198293324e-05, + "loss": 0.9166896343231201, + "step": 94 + }, + { + "epoch": 0.2904088651127245, + "grad_norm": 0.10246684402227402, + "learning_rate": 1.9889062810092002e-05, + "loss": 1.0059239864349365, + "step": 95 + }, + { + "epoch": 0.2934658005349637, + "grad_norm": 0.09954962879419327, + "learning_rate": 1.9881013255443152e-05, + "loss": 1.00413179397583, + "step": 96 + }, + { + "epoch": 0.2965227359572029, + "grad_norm": 0.11006761342287064, + "learning_rate": 1.9872683547213446e-05, + "loss": 0.9414035677909851, + "step": 97 + }, + { + "epoch": 0.29957967137944214, + "grad_norm": 0.1014382541179657, + "learning_rate": 1.9864073921572756e-05, + "loss": 0.9155468940734863, + "step": 98 + }, + { + "epoch": 0.3026366068016813, + "grad_norm": 0.09883157908916473, + "learning_rate": 1.9855184622627362e-05, + "loss": 0.9429305195808411, + "step": 99 + }, + { + "epoch": 0.30569354222392053, + "grad_norm": 0.11199072748422623, + "learning_rate": 1.9846015902413053e-05, + "loss": 0.9143528342247009, + "step": 100 + }, + { + "epoch": 0.30569354222392053, + "eval_loss": 0.884428083896637, + "eval_runtime": 723.8143, + "eval_samples_per_second": 0.833, + "eval_steps_per_second": 0.833, + "step": 100 + } + ], + "logging_steps": 1, + "max_steps": 656, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.521459497664512e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-100/training_args.bin b/cpt_qwen_14B/checkpoints/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eddbb43a2cebb928dbed6e955a37ebfa3174f4b5 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a8e308e47eb936f678712445b19ddc52638f354c37c813ecaa432f69120a2e +size 5201 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/README.md b/cpt_qwen_14B/checkpoints/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8dfda26032514233f3e70a4012f1cfd1ddbbb609 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Qwen2.5-Coder-14B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Qwen2.5-Coder-14B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/adapter_config.json b/cpt_qwen_14B/checkpoints/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81c31359285f7e351a44275c30b6882f4c6b50c0 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Qwen2.5-Coder-14B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/adapter_model.safetensors b/cpt_qwen_14B/checkpoints/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5b2751a8bc960068f215cf7d87ee20cbd531fff3 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e32cd39a05adef845494ef625c330566280da483f1df43a7d896cde3d72e625 +size 201378736 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/chat_template.jinja b/cpt_qwen_14B/checkpoints/checkpoint-200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..28028c056af412405debd878cdda0171e35fa5d1 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/optimizer.pt b/cpt_qwen_14B/checkpoints/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e81efcc2057d31ee1e24b2d47edab5b4381a2a2 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cec9971f0f83d3992bfdb8378f1f654b8058c6e92f4735b925ed926ebefea84 +size 102698471 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/rng_state.pth b/cpt_qwen_14B/checkpoints/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa3c89f30ac6f9559d46d5fd70e6dd21ed13add0 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e70b27b141a53396d5df1322c439b8190c0da577dedafcca185e03254d54da1 +size 14645 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/scheduler.pt b/cpt_qwen_14B/checkpoints/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a87f1870019cae1c73195f21e5330497ceb22cd9 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2ea6b9b029d63c1e2f3731428c97dfbc99ba9388eac107e6ecbed49452b1af9 +size 1465 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/tokenizer.json b/cpt_qwen_14B/checkpoints/checkpoint-200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/tokenizer_config.json b/cpt_qwen_14B/checkpoints/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..217274ef8275420e4bf3b976f3948901cd3d176f --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/trainer_state.json b/cpt_qwen_14B/checkpoints/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8ce3f73989cd916d77d59fa0f83b36ddbbb64095 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/trainer_state.json @@ -0,0 +1,1466 @@ +{ + "best_global_step": 200, + "best_metric": 0.7551760673522949, + "best_model_checkpoint": "runs/cpt_run_14b/checkpoints/checkpoint-200", + "epoch": 0.6113870844478411, + "eval_steps": 50, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003056935422239205, + "grad_norm": 0.06516239047050476, + "learning_rate": 0.0, + "loss": 1.138384461402893, + "step": 1 + }, + { + "epoch": 0.00611387084447841, + "grad_norm": 0.05343673378229141, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.983342707157135, + "step": 2 + }, + { + "epoch": 0.009170806266717615, + "grad_norm": 0.05608418956398964, + "learning_rate": 6.060606060606061e-07, + "loss": 1.0762118101119995, + "step": 3 + }, + { + "epoch": 0.01222774168895682, + "grad_norm": 0.06523486226797104, + "learning_rate": 9.090909090909091e-07, + "loss": 1.084489345550537, + "step": 4 + }, + { + "epoch": 0.015284677111196026, + "grad_norm": 0.06582186371088028, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.2037022113800049, + "step": 5 + }, + { + "epoch": 0.01834161253343523, + "grad_norm": 0.06097998470067978, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.10005784034729, + "step": 6 + }, + { + "epoch": 0.021398547955674436, + "grad_norm": 0.10365528613328934, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.0895193815231323, + "step": 7 + }, + { + "epoch": 0.02445548337791364, + "grad_norm": 0.06312141567468643, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.0593242645263672, + "step": 8 + }, + { + "epoch": 0.027512418800152847, + "grad_norm": 0.05508403480052948, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.9772955179214478, + "step": 9 + }, + { + "epoch": 0.030569354222392053, + "grad_norm": 0.06006711348891258, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.084238886833191, + "step": 10 + }, + { + "epoch": 0.033626289644631255, + "grad_norm": 0.0588749423623085, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.0786534547805786, + "step": 11 + }, + { + "epoch": 0.03668322506687046, + "grad_norm": 0.046551357954740524, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.0370622873306274, + "step": 12 + }, + { + "epoch": 0.039740160489109666, + "grad_norm": 0.061659567058086395, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.0646986961364746, + "step": 13 + }, + { + "epoch": 0.04279709591134887, + "grad_norm": 0.06007347255945206, + "learning_rate": 3.93939393939394e-06, + "loss": 1.0311307907104492, + "step": 14 + }, + { + "epoch": 0.04585403133358808, + "grad_norm": 0.07314135134220123, + "learning_rate": 4.242424242424243e-06, + "loss": 1.1300500631332397, + "step": 15 + }, + { + "epoch": 0.04891096675582728, + "grad_norm": 0.060934022068977356, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0197452306747437, + "step": 16 + }, + { + "epoch": 0.05196790217806649, + "grad_norm": 0.056856051087379456, + "learning_rate": 4.848484848484849e-06, + "loss": 1.0438549518585205, + "step": 17 + }, + { + "epoch": 0.055024837600305694, + "grad_norm": 0.05908689647912979, + "learning_rate": 5.151515151515152e-06, + "loss": 1.0398856401443481, + "step": 18 + }, + { + "epoch": 0.0580817730225449, + "grad_norm": 0.07411840558052063, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.107885479927063, + "step": 19 + }, + { + "epoch": 0.061138708444784105, + "grad_norm": 0.0749165341258049, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.1060967445373535, + "step": 20 + }, + { + "epoch": 0.06419564386702331, + "grad_norm": 0.06720177084207535, + "learning_rate": 6.060606060606061e-06, + "loss": 1.0471720695495605, + "step": 21 + }, + { + "epoch": 0.06725257928926251, + "grad_norm": 0.05990725755691528, + "learning_rate": 6.363636363636364e-06, + "loss": 1.0944981575012207, + "step": 22 + }, + { + "epoch": 0.07030951471150172, + "grad_norm": 0.06672193855047226, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1477092504501343, + "step": 23 + }, + { + "epoch": 0.07336645013374092, + "grad_norm": 0.06145205348730087, + "learning_rate": 6.969696969696971e-06, + "loss": 1.0591784715652466, + "step": 24 + }, + { + "epoch": 0.07642338555598013, + "grad_norm": 0.0757482647895813, + "learning_rate": 7.272727272727273e-06, + "loss": 1.0500165224075317, + "step": 25 + }, + { + "epoch": 0.07948032097821933, + "grad_norm": 0.07848478108644485, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.0747522115707397, + "step": 26 + }, + { + "epoch": 0.08253725640045854, + "grad_norm": 0.07740631699562073, + "learning_rate": 7.87878787878788e-06, + "loss": 1.132310152053833, + "step": 27 + }, + { + "epoch": 0.08559419182269774, + "grad_norm": 0.07476603239774704, + "learning_rate": 8.181818181818183e-06, + "loss": 1.0339502096176147, + "step": 28 + }, + { + "epoch": 0.08865112724493696, + "grad_norm": 0.0779196098446846, + "learning_rate": 8.484848484848486e-06, + "loss": 1.1047282218933105, + "step": 29 + }, + { + "epoch": 0.09170806266717615, + "grad_norm": 0.06962384283542633, + "learning_rate": 8.787878787878788e-06, + "loss": 1.004916787147522, + "step": 30 + }, + { + "epoch": 0.09476499808941537, + "grad_norm": 0.06369175016880035, + "learning_rate": 9.090909090909091e-06, + "loss": 0.9296417832374573, + "step": 31 + }, + { + "epoch": 0.09782193351165457, + "grad_norm": 0.07470260560512543, + "learning_rate": 9.393939393939396e-06, + "loss": 1.0721708536148071, + "step": 32 + }, + { + "epoch": 0.10087886893389378, + "grad_norm": 0.07948213815689087, + "learning_rate": 9.696969696969698e-06, + "loss": 1.0350117683410645, + "step": 33 + }, + { + "epoch": 0.10393580435613298, + "grad_norm": 0.07066022604703903, + "learning_rate": 1e-05, + "loss": 1.026305913925171, + "step": 34 + }, + { + "epoch": 0.10699273977837218, + "grad_norm": 0.07774543762207031, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.0509816408157349, + "step": 35 + }, + { + "epoch": 0.11004967520061139, + "grad_norm": 0.07501248270273209, + "learning_rate": 1.0606060606060606e-05, + "loss": 1.0011574029922485, + "step": 36 + }, + { + "epoch": 0.11310661062285059, + "grad_norm": 0.6622501611709595, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.9754424691200256, + "step": 37 + }, + { + "epoch": 0.1161635460450898, + "grad_norm": 0.07566080242395401, + "learning_rate": 1.1212121212121212e-05, + "loss": 1.0342774391174316, + "step": 38 + }, + { + "epoch": 0.119220481467329, + "grad_norm": 0.07573831081390381, + "learning_rate": 1.1515151515151517e-05, + "loss": 0.9714518785476685, + "step": 39 + }, + { + "epoch": 0.12227741688956821, + "grad_norm": 0.08083852380514145, + "learning_rate": 1.181818181818182e-05, + "loss": 1.1050316095352173, + "step": 40 + }, + { + "epoch": 0.12533435231180742, + "grad_norm": 0.08540588617324829, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.0871070623397827, + "step": 41 + }, + { + "epoch": 0.12839128773404662, + "grad_norm": 0.07391592115163803, + "learning_rate": 1.2424242424242425e-05, + "loss": 1.0206722021102905, + "step": 42 + }, + { + "epoch": 0.13144822315628582, + "grad_norm": 0.07063689082860947, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.9775047898292542, + "step": 43 + }, + { + "epoch": 0.13450515857852502, + "grad_norm": 0.07288888841867447, + "learning_rate": 1.3030303030303032e-05, + "loss": 1.1132858991622925, + "step": 44 + }, + { + "epoch": 0.13756209400076425, + "grad_norm": 0.07641777396202087, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.0707701444625854, + "step": 45 + }, + { + "epoch": 0.14061902942300344, + "grad_norm": 0.06990326195955276, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.9328265190124512, + "step": 46 + }, + { + "epoch": 0.14367596484524264, + "grad_norm": 0.0834241658449173, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.0131721496582031, + "step": 47 + }, + { + "epoch": 0.14673290026748184, + "grad_norm": 0.0714937075972557, + "learning_rate": 1.4242424242424245e-05, + "loss": 0.940493106842041, + "step": 48 + }, + { + "epoch": 0.14978983568972107, + "grad_norm": 0.07770547270774841, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.0435771942138672, + "step": 49 + }, + { + "epoch": 0.15284677111196027, + "grad_norm": 0.07950945198535919, + "learning_rate": 1.484848484848485e-05, + "loss": 1.0382137298583984, + "step": 50 + }, + { + "epoch": 0.15284677111196027, + "eval_loss": 1.0129202604293823, + "eval_runtime": 724.3664, + "eval_samples_per_second": 0.832, + "eval_steps_per_second": 0.832, + "step": 50 + }, + { + "epoch": 0.15590370653419947, + "grad_norm": 0.06961936503648758, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.9690049886703491, + "step": 51 + }, + { + "epoch": 0.15896064195643866, + "grad_norm": 0.069523885846138, + "learning_rate": 1.5454545454545454e-05, + "loss": 0.9830482006072998, + "step": 52 + }, + { + "epoch": 0.16201757737867786, + "grad_norm": 0.0764622762799263, + "learning_rate": 1.575757575757576e-05, + "loss": 1.0895472764968872, + "step": 53 + }, + { + "epoch": 0.1650745128009171, + "grad_norm": 0.1413721889257431, + "learning_rate": 1.606060606060606e-05, + "loss": 1.0354574918746948, + "step": 54 + }, + { + "epoch": 0.1681314482231563, + "grad_norm": 0.06818042695522308, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.8534265160560608, + "step": 55 + }, + { + "epoch": 0.1711883836453955, + "grad_norm": 0.0722246989607811, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.9580274820327759, + "step": 56 + }, + { + "epoch": 0.17424531906763469, + "grad_norm": 0.07113443315029144, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.0721848011016846, + "step": 57 + }, + { + "epoch": 0.1773022544898739, + "grad_norm": 0.08412107080221176, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.1180150508880615, + "step": 58 + }, + { + "epoch": 0.1803591899121131, + "grad_norm": 0.07381036877632141, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.0384547710418701, + "step": 59 + }, + { + "epoch": 0.1834161253343523, + "grad_norm": 0.07089001685380936, + "learning_rate": 1.787878787878788e-05, + "loss": 1.0446016788482666, + "step": 60 + }, + { + "epoch": 0.1864730607565915, + "grad_norm": 0.11576953530311584, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0015051364898682, + "step": 61 + }, + { + "epoch": 0.18952999617883073, + "grad_norm": 0.08030868321657181, + "learning_rate": 1.8484848484848487e-05, + "loss": 0.9642710089683533, + "step": 62 + }, + { + "epoch": 0.19258693160106993, + "grad_norm": 0.08332342654466629, + "learning_rate": 1.8787878787878792e-05, + "loss": 1.0722991228103638, + "step": 63 + }, + { + "epoch": 0.19564386702330913, + "grad_norm": 0.08000365644693375, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.0104647874832153, + "step": 64 + }, + { + "epoch": 0.19870080244554833, + "grad_norm": 0.08139508217573166, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9445061087608337, + "step": 65 + }, + { + "epoch": 0.20175773786778756, + "grad_norm": 0.08749893307685852, + "learning_rate": 1.96969696969697e-05, + "loss": 1.080810308456421, + "step": 66 + }, + { + "epoch": 0.20481467329002676, + "grad_norm": 0.0786912813782692, + "learning_rate": 2e-05, + "loss": 0.9705753922462463, + "step": 67 + }, + { + "epoch": 0.20787160871226595, + "grad_norm": 0.08962028473615646, + "learning_rate": 1.9999858236410775e-05, + "loss": 0.962783694267273, + "step": 68 + }, + { + "epoch": 0.21092854413450515, + "grad_norm": 0.08402887731790543, + "learning_rate": 1.9999432949662483e-05, + "loss": 0.9959614872932434, + "step": 69 + }, + { + "epoch": 0.21398547955674435, + "grad_norm": 0.08036444336175919, + "learning_rate": 1.9998724151813157e-05, + "loss": 0.9569960832595825, + "step": 70 + }, + { + "epoch": 0.21704241497898358, + "grad_norm": 0.08247046917676926, + "learning_rate": 1.9997731862959143e-05, + "loss": 1.0012171268463135, + "step": 71 + }, + { + "epoch": 0.22009935040122278, + "grad_norm": 0.08966264873743057, + "learning_rate": 1.999645611123453e-05, + "loss": 1.0403809547424316, + "step": 72 + }, + { + "epoch": 0.22315628582346198, + "grad_norm": 0.08061660826206207, + "learning_rate": 1.999489693281034e-05, + "loss": 1.0089740753173828, + "step": 73 + }, + { + "epoch": 0.22621322124570117, + "grad_norm": 0.09005365520715714, + "learning_rate": 1.9993054371893526e-05, + "loss": 0.9333044290542603, + "step": 74 + }, + { + "epoch": 0.2292701566679404, + "grad_norm": 0.08651519566774368, + "learning_rate": 1.9990928480725694e-05, + "loss": 0.9284015893936157, + "step": 75 + }, + { + "epoch": 0.2323270920901796, + "grad_norm": 0.08141147345304489, + "learning_rate": 1.9988519319581637e-05, + "loss": 0.9782730340957642, + "step": 76 + }, + { + "epoch": 0.2353840275124188, + "grad_norm": 0.08344405144453049, + "learning_rate": 1.998582695676762e-05, + "loss": 0.9723064303398132, + "step": 77 + }, + { + "epoch": 0.238440962934658, + "grad_norm": 0.08019903302192688, + "learning_rate": 1.998285146861945e-05, + "loss": 0.9648997783660889, + "step": 78 + }, + { + "epoch": 0.24149789835689722, + "grad_norm": 0.08113416284322739, + "learning_rate": 1.99795929395003e-05, + "loss": 0.9263214468955994, + "step": 79 + }, + { + "epoch": 0.24455483377913642, + "grad_norm": 0.08127513527870178, + "learning_rate": 1.997605146179833e-05, + "loss": 0.8745232224464417, + "step": 80 + }, + { + "epoch": 0.24761176920137562, + "grad_norm": 0.09934187680482864, + "learning_rate": 1.997222713592405e-05, + "loss": 0.8722782135009766, + "step": 81 + }, + { + "epoch": 0.25066870462361485, + "grad_norm": 0.09701363742351532, + "learning_rate": 1.9968120070307503e-05, + "loss": 1.0084266662597656, + "step": 82 + }, + { + "epoch": 0.253725640045854, + "grad_norm": 0.08335654437541962, + "learning_rate": 1.9963730381395154e-05, + "loss": 0.9239332675933838, + "step": 83 + }, + { + "epoch": 0.25678257546809324, + "grad_norm": 0.09161650389432907, + "learning_rate": 1.9959058193646618e-05, + "loss": 0.9878032207489014, + "step": 84 + }, + { + "epoch": 0.2598395108903324, + "grad_norm": 0.08067663013935089, + "learning_rate": 1.9954103639531116e-05, + "loss": 0.9113098382949829, + "step": 85 + }, + { + "epoch": 0.26289644631257164, + "grad_norm": 0.09619539976119995, + "learning_rate": 1.9948866859523717e-05, + "loss": 0.9527600407600403, + "step": 86 + }, + { + "epoch": 0.26595338173481087, + "grad_norm": 0.10015493631362915, + "learning_rate": 1.9943348002101374e-05, + "loss": 0.9569152593612671, + "step": 87 + }, + { + "epoch": 0.26901031715705004, + "grad_norm": 0.09012345969676971, + "learning_rate": 1.993754722373869e-05, + "loss": 0.8912045359611511, + "step": 88 + }, + { + "epoch": 0.27206725257928926, + "grad_norm": 0.10342805832624435, + "learning_rate": 1.9931464688903502e-05, + "loss": 0.856104850769043, + "step": 89 + }, + { + "epoch": 0.2751241880015285, + "grad_norm": 0.10218493640422821, + "learning_rate": 1.9925100570052194e-05, + "loss": 0.9631397128105164, + "step": 90 + }, + { + "epoch": 0.27818112342376766, + "grad_norm": 0.10909046977758408, + "learning_rate": 1.9918455047624847e-05, + "loss": 0.8532565236091614, + "step": 91 + }, + { + "epoch": 0.2812380588460069, + "grad_norm": 0.10714197903871536, + "learning_rate": 1.9911528310040073e-05, + "loss": 0.9691859483718872, + "step": 92 + }, + { + "epoch": 0.28429499426824606, + "grad_norm": 0.1108694076538086, + "learning_rate": 1.990432055368971e-05, + "loss": 0.9374334812164307, + "step": 93 + }, + { + "epoch": 0.2873519296904853, + "grad_norm": 0.10037308186292648, + "learning_rate": 1.989683198293324e-05, + "loss": 0.9166896343231201, + "step": 94 + }, + { + "epoch": 0.2904088651127245, + "grad_norm": 0.10246684402227402, + "learning_rate": 1.9889062810092002e-05, + "loss": 1.0059239864349365, + "step": 95 + }, + { + "epoch": 0.2934658005349637, + "grad_norm": 0.09954962879419327, + "learning_rate": 1.9881013255443152e-05, + "loss": 1.00413179397583, + "step": 96 + }, + { + "epoch": 0.2965227359572029, + "grad_norm": 0.11006761342287064, + "learning_rate": 1.9872683547213446e-05, + "loss": 0.9414035677909851, + "step": 97 + }, + { + "epoch": 0.29957967137944214, + "grad_norm": 0.1014382541179657, + "learning_rate": 1.9864073921572756e-05, + "loss": 0.9155468940734863, + "step": 98 + }, + { + "epoch": 0.3026366068016813, + "grad_norm": 0.09883157908916473, + "learning_rate": 1.9855184622627362e-05, + "loss": 0.9429305195808411, + "step": 99 + }, + { + "epoch": 0.30569354222392053, + "grad_norm": 0.11199072748422623, + "learning_rate": 1.9846015902413053e-05, + "loss": 0.9143528342247009, + "step": 100 + }, + { + "epoch": 0.30569354222392053, + "eval_loss": 0.884428083896637, + "eval_runtime": 723.8143, + "eval_samples_per_second": 0.833, + "eval_steps_per_second": 0.833, + "step": 100 + }, + { + "epoch": 0.3087504776461597, + "grad_norm": 0.10796016454696655, + "learning_rate": 1.9836568020887963e-05, + "loss": 0.9726455211639404, + "step": 101 + }, + { + "epoch": 0.31180741306839893, + "grad_norm": 0.10056383162736893, + "learning_rate": 1.982684124592521e-05, + "loss": 0.8932135701179504, + "step": 102 + }, + { + "epoch": 0.31486434849063816, + "grad_norm": 0.10836594551801682, + "learning_rate": 1.9816835853305306e-05, + "loss": 0.919749915599823, + "step": 103 + }, + { + "epoch": 0.31792128391287733, + "grad_norm": 0.12032149732112885, + "learning_rate": 1.9806552126708322e-05, + "loss": 0.871781587600708, + "step": 104 + }, + { + "epoch": 0.32097821933511655, + "grad_norm": 0.10854160040616989, + "learning_rate": 1.9795990357705853e-05, + "loss": 0.8587784171104431, + "step": 105 + }, + { + "epoch": 0.3240351547573557, + "grad_norm": 0.10819399356842041, + "learning_rate": 1.978515084575276e-05, + "loss": 0.8524806499481201, + "step": 106 + }, + { + "epoch": 0.32709209017959495, + "grad_norm": 0.10226067155599594, + "learning_rate": 1.9774033898178668e-05, + "loss": 0.7892144918441772, + "step": 107 + }, + { + "epoch": 0.3301490256018342, + "grad_norm": 0.1071159616112709, + "learning_rate": 1.976263983017925e-05, + "loss": 0.8833234906196594, + "step": 108 + }, + { + "epoch": 0.33320596102407335, + "grad_norm": 0.11434526741504669, + "learning_rate": 1.9750968964807305e-05, + "loss": 0.861842155456543, + "step": 109 + }, + { + "epoch": 0.3362628964463126, + "grad_norm": 0.1159641221165657, + "learning_rate": 1.9739021632963584e-05, + "loss": 0.8987889289855957, + "step": 110 + }, + { + "epoch": 0.3393198318685518, + "grad_norm": 0.12371373921632767, + "learning_rate": 1.9726798173387417e-05, + "loss": 0.9710193872451782, + "step": 111 + }, + { + "epoch": 0.342376767290791, + "grad_norm": 0.11441531032323837, + "learning_rate": 1.97142989326471e-05, + "loss": 0.8199151158332825, + "step": 112 + }, + { + "epoch": 0.3454337027130302, + "grad_norm": 0.11842846125364304, + "learning_rate": 1.9701524265130088e-05, + "loss": 0.8845276236534119, + "step": 113 + }, + { + "epoch": 0.34849063813526937, + "grad_norm": 0.10813732445240021, + "learning_rate": 1.9688474533032916e-05, + "loss": 0.7964264750480652, + "step": 114 + }, + { + "epoch": 0.3515475735575086, + "grad_norm": 0.11050347238779068, + "learning_rate": 1.9675150106350957e-05, + "loss": 0.9630422592163086, + "step": 115 + }, + { + "epoch": 0.3546045089797478, + "grad_norm": 0.10537250339984894, + "learning_rate": 1.9661551362867926e-05, + "loss": 0.7706905007362366, + "step": 116 + }, + { + "epoch": 0.357661444401987, + "grad_norm": 0.11390368640422821, + "learning_rate": 1.9647678688145163e-05, + "loss": 0.8541204929351807, + "step": 117 + }, + { + "epoch": 0.3607183798242262, + "grad_norm": 0.10318922251462936, + "learning_rate": 1.963353247551069e-05, + "loss": 0.7400562763214111, + "step": 118 + }, + { + "epoch": 0.3637753152464654, + "grad_norm": 0.1347586214542389, + "learning_rate": 1.9619113126048086e-05, + "loss": 0.9232871532440186, + "step": 119 + }, + { + "epoch": 0.3668322506687046, + "grad_norm": 0.11458177119493484, + "learning_rate": 1.96044210485851e-05, + "loss": 0.833285927772522, + "step": 120 + }, + { + "epoch": 0.36988918609094384, + "grad_norm": 0.12361041456460953, + "learning_rate": 1.958945665968206e-05, + "loss": 0.7887391448020935, + "step": 121 + }, + { + "epoch": 0.372946121513183, + "grad_norm": 0.11985408514738083, + "learning_rate": 1.9574220383620054e-05, + "loss": 0.8206446170806885, + "step": 122 + }, + { + "epoch": 0.37600305693542224, + "grad_norm": 0.1355939507484436, + "learning_rate": 1.9558712652388932e-05, + "loss": 0.7648542523384094, + "step": 123 + }, + { + "epoch": 0.37905999235766147, + "grad_norm": 0.1229313388466835, + "learning_rate": 1.954293390567501e-05, + "loss": 0.8573335409164429, + "step": 124 + }, + { + "epoch": 0.38211692777990064, + "grad_norm": 0.11425124108791351, + "learning_rate": 1.9526884590848646e-05, + "loss": 0.7412531971931458, + "step": 125 + }, + { + "epoch": 0.38517386320213987, + "grad_norm": 0.12430041283369064, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.8098543882369995, + "step": 126 + }, + { + "epoch": 0.38823079862437904, + "grad_norm": 0.12492368370294571, + "learning_rate": 1.9493976084683814e-05, + "loss": 0.8814713954925537, + "step": 127 + }, + { + "epoch": 0.39128773404661826, + "grad_norm": 0.14428824186325073, + "learning_rate": 1.9477117826390934e-05, + "loss": 0.8231979608535767, + "step": 128 + }, + { + "epoch": 0.3943446694688575, + "grad_norm": 0.12010085582733154, + "learning_rate": 1.9459990866050337e-05, + "loss": 0.7015627026557922, + "step": 129 + }, + { + "epoch": 0.39740160489109666, + "grad_norm": 0.11819776892662048, + "learning_rate": 1.9442595689257898e-05, + "loss": 0.8086729645729065, + "step": 130 + }, + { + "epoch": 0.4004585403133359, + "grad_norm": 0.12211033701896667, + "learning_rate": 1.9424932789214158e-05, + "loss": 0.8234002590179443, + "step": 131 + }, + { + "epoch": 0.4035154757355751, + "grad_norm": 0.14926476776599884, + "learning_rate": 1.9407002666710334e-05, + "loss": 0.874608039855957, + "step": 132 + }, + { + "epoch": 0.4065724111578143, + "grad_norm": 0.13012923300266266, + "learning_rate": 1.9388805830114132e-05, + "loss": 0.8491607904434204, + "step": 133 + }, + { + "epoch": 0.4096293465800535, + "grad_norm": 0.12012261897325516, + "learning_rate": 1.937034279535533e-05, + "loss": 0.7269159555435181, + "step": 134 + }, + { + "epoch": 0.4126862820022927, + "grad_norm": 0.15302567183971405, + "learning_rate": 1.9351614085911134e-05, + "loss": 0.8560839891433716, + "step": 135 + }, + { + "epoch": 0.4157432174245319, + "grad_norm": 0.12234190106391907, + "learning_rate": 1.933262023279137e-05, + "loss": 0.8211904764175415, + "step": 136 + }, + { + "epoch": 0.41880015284677113, + "grad_norm": 0.14427296817302704, + "learning_rate": 1.9313361774523387e-05, + "loss": 0.8500057458877563, + "step": 137 + }, + { + "epoch": 0.4218570882690103, + "grad_norm": 0.1314094066619873, + "learning_rate": 1.929383925713682e-05, + "loss": 0.7589091658592224, + "step": 138 + }, + { + "epoch": 0.42491402369124953, + "grad_norm": 0.1576734483242035, + "learning_rate": 1.92740532341481e-05, + "loss": 0.7581073641777039, + "step": 139 + }, + { + "epoch": 0.4279709591134887, + "grad_norm": 0.15788713097572327, + "learning_rate": 1.925400426654475e-05, + "loss": 0.809050440788269, + "step": 140 + }, + { + "epoch": 0.43102789453572793, + "grad_norm": 0.13364559412002563, + "learning_rate": 1.9233692922769497e-05, + "loss": 0.7990086078643799, + "step": 141 + }, + { + "epoch": 0.43408482995796716, + "grad_norm": 0.14786465466022491, + "learning_rate": 1.921311977870413e-05, + "loss": 0.8675815463066101, + "step": 142 + }, + { + "epoch": 0.4371417653802063, + "grad_norm": 0.14621882140636444, + "learning_rate": 1.9192285417653208e-05, + "loss": 0.8713765740394592, + "step": 143 + }, + { + "epoch": 0.44019870080244555, + "grad_norm": 0.12874048948287964, + "learning_rate": 1.917119043032749e-05, + "loss": 0.7361871004104614, + "step": 144 + }, + { + "epoch": 0.4432556362246848, + "grad_norm": 0.12183775007724762, + "learning_rate": 1.9149835414827193e-05, + "loss": 0.7311941385269165, + "step": 145 + }, + { + "epoch": 0.44631257164692395, + "grad_norm": 0.1397160291671753, + "learning_rate": 1.912822097662505e-05, + "loss": 0.8189159035682678, + "step": 146 + }, + { + "epoch": 0.4493695070691632, + "grad_norm": 0.1458273082971573, + "learning_rate": 1.9106347728549134e-05, + "loss": 0.8288135528564453, + "step": 147 + }, + { + "epoch": 0.45242644249140235, + "grad_norm": 0.16898781061172485, + "learning_rate": 1.908421629076547e-05, + "loss": 0.7878037095069885, + "step": 148 + }, + { + "epoch": 0.4554833779136416, + "grad_norm": 0.1638474315404892, + "learning_rate": 1.9061827290760466e-05, + "loss": 0.8059952259063721, + "step": 149 + }, + { + "epoch": 0.4585403133358808, + "grad_norm": 0.14130882918834686, + "learning_rate": 1.9039181363323128e-05, + "loss": 0.7346830368041992, + "step": 150 + }, + { + "epoch": 0.4585403133358808, + "eval_loss": 0.7979016900062561, + "eval_runtime": 828.6295, + "eval_samples_per_second": 0.728, + "eval_steps_per_second": 0.728, + "step": 150 + }, + { + "epoch": 0.46159724875811997, + "grad_norm": 0.14427433907985687, + "learning_rate": 1.9016279150527044e-05, + "loss": 0.7583403587341309, + "step": 151 + }, + { + "epoch": 0.4646541841803592, + "grad_norm": 0.1515798568725586, + "learning_rate": 1.8993121301712194e-05, + "loss": 0.7908380031585693, + "step": 152 + }, + { + "epoch": 0.46771111960259837, + "grad_norm": 0.14444488286972046, + "learning_rate": 1.896970847346653e-05, + "loss": 0.7916130423545837, + "step": 153 + }, + { + "epoch": 0.4707680550248376, + "grad_norm": 0.1460912823677063, + "learning_rate": 1.8946041329607364e-05, + "loss": 0.7750643491744995, + "step": 154 + }, + { + "epoch": 0.4738249904470768, + "grad_norm": 0.13896244764328003, + "learning_rate": 1.892212054116255e-05, + "loss": 0.8059666156768799, + "step": 155 + }, + { + "epoch": 0.476881925869316, + "grad_norm": 0.16133630275726318, + "learning_rate": 1.889794678635145e-05, + "loss": 0.8327827453613281, + "step": 156 + }, + { + "epoch": 0.4799388612915552, + "grad_norm": 0.1474636346101761, + "learning_rate": 1.8873520750565716e-05, + "loss": 0.8498989343643188, + "step": 157 + }, + { + "epoch": 0.48299579671379445, + "grad_norm": 0.17222349345684052, + "learning_rate": 1.884884312634985e-05, + "loss": 0.7750177979469299, + "step": 158 + }, + { + "epoch": 0.4860527321360336, + "grad_norm": 0.15558090806007385, + "learning_rate": 1.8823914613381568e-05, + "loss": 0.7326169013977051, + "step": 159 + }, + { + "epoch": 0.48910966755827284, + "grad_norm": 0.13808321952819824, + "learning_rate": 1.8798735918451963e-05, + "loss": 0.8308709859848022, + "step": 160 + }, + { + "epoch": 0.492166602980512, + "grad_norm": 0.1761898398399353, + "learning_rate": 1.8773307755445468e-05, + "loss": 0.7805465459823608, + "step": 161 + }, + { + "epoch": 0.49522353840275124, + "grad_norm": 0.160477414727211, + "learning_rate": 1.874763084531961e-05, + "loss": 0.8538846969604492, + "step": 162 + }, + { + "epoch": 0.49828047382499047, + "grad_norm": 0.15238745510578156, + "learning_rate": 1.872170591608459e-05, + "loss": 0.8801217675209045, + "step": 163 + }, + { + "epoch": 0.5013374092472297, + "grad_norm": 0.1567080318927765, + "learning_rate": 1.86955337027826e-05, + "loss": 0.7205259799957275, + "step": 164 + }, + { + "epoch": 0.5043943446694689, + "grad_norm": 0.13637851178646088, + "learning_rate": 1.866911494746702e-05, + "loss": 0.7636491656303406, + "step": 165 + }, + { + "epoch": 0.507451280091708, + "grad_norm": 0.15563489496707916, + "learning_rate": 1.8642450399181373e-05, + "loss": 0.7982497811317444, + "step": 166 + }, + { + "epoch": 0.5105082155139473, + "grad_norm": 0.15503396093845367, + "learning_rate": 1.8615540813938063e-05, + "loss": 0.8737778067588806, + "step": 167 + }, + { + "epoch": 0.5135651509361865, + "grad_norm": 0.16095557808876038, + "learning_rate": 1.8588386954696972e-05, + "loss": 0.796604335308075, + "step": 168 + }, + { + "epoch": 0.5166220863584257, + "grad_norm": 0.1713593453168869, + "learning_rate": 1.856098959134381e-05, + "loss": 0.8247392177581787, + "step": 169 + }, + { + "epoch": 0.5196790217806648, + "grad_norm": 0.18239113688468933, + "learning_rate": 1.8533349500668295e-05, + "loss": 0.7838484644889832, + "step": 170 + }, + { + "epoch": 0.5227359572029041, + "grad_norm": 0.15745767951011658, + "learning_rate": 1.850546746634211e-05, + "loss": 0.7856907248497009, + "step": 171 + }, + { + "epoch": 0.5257928926251433, + "grad_norm": 0.16820666193962097, + "learning_rate": 1.8477344278896708e-05, + "loss": 0.7829679846763611, + "step": 172 + }, + { + "epoch": 0.5288498280473825, + "grad_norm": 0.16975544393062592, + "learning_rate": 1.84489807357009e-05, + "loss": 0.7374375462532043, + "step": 173 + }, + { + "epoch": 0.5319067634696217, + "grad_norm": 0.167228102684021, + "learning_rate": 1.8420377640938204e-05, + "loss": 0.712837815284729, + "step": 174 + }, + { + "epoch": 0.5349636988918609, + "grad_norm": 0.15955154597759247, + "learning_rate": 1.839153580558411e-05, + "loss": 0.7645693421363831, + "step": 175 + }, + { + "epoch": 0.5380206343141001, + "grad_norm": 0.18378689885139465, + "learning_rate": 1.8362456047383032e-05, + "loss": 0.7974956631660461, + "step": 176 + }, + { + "epoch": 0.5410775697363394, + "grad_norm": 0.15777672827243805, + "learning_rate": 1.833313919082515e-05, + "loss": 0.8957571983337402, + "step": 177 + }, + { + "epoch": 0.5441345051585785, + "grad_norm": 0.15292386710643768, + "learning_rate": 1.8303586067123028e-05, + "loss": 0.7635619044303894, + "step": 178 + }, + { + "epoch": 0.5471914405808177, + "grad_norm": 0.178152397274971, + "learning_rate": 1.8273797514188043e-05, + "loss": 0.7849246263504028, + "step": 179 + }, + { + "epoch": 0.550248376003057, + "grad_norm": 0.15916013717651367, + "learning_rate": 1.824377437660663e-05, + "loss": 0.6975343227386475, + "step": 180 + }, + { + "epoch": 0.5533053114252962, + "grad_norm": 0.18172231316566467, + "learning_rate": 1.821351750561634e-05, + "loss": 0.7675164341926575, + "step": 181 + }, + { + "epoch": 0.5563622468475353, + "grad_norm": 0.16241903603076935, + "learning_rate": 1.818302775908169e-05, + "loss": 0.7950343489646912, + "step": 182 + }, + { + "epoch": 0.5594191822697746, + "grad_norm": 0.18727579712867737, + "learning_rate": 1.8152306001469875e-05, + "loss": 0.787315309047699, + "step": 183 + }, + { + "epoch": 0.5624761176920138, + "grad_norm": 0.1627933531999588, + "learning_rate": 1.8121353103826213e-05, + "loss": 0.7141211628913879, + "step": 184 + }, + { + "epoch": 0.565533053114253, + "grad_norm": 0.4369247555732727, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.8476608395576477, + "step": 185 + }, + { + "epoch": 0.5685899885364921, + "grad_norm": 0.16494786739349365, + "learning_rate": 1.8058757405367003e-05, + "loss": 0.720562756061554, + "step": 186 + }, + { + "epoch": 0.5716469239587314, + "grad_norm": 0.175015389919281, + "learning_rate": 1.8027116379309637e-05, + "loss": 0.7589252591133118, + "step": 187 + }, + { + "epoch": 0.5747038593809706, + "grad_norm": 0.1769978553056717, + "learning_rate": 1.799524776268646e-05, + "loss": 0.7644155621528625, + "step": 188 + }, + { + "epoch": 0.5777607948032097, + "grad_norm": 0.18481792509555817, + "learning_rate": 1.796315245905936e-05, + "loss": 0.7885835766792297, + "step": 189 + }, + { + "epoch": 0.580817730225449, + "grad_norm": 0.1668689250946045, + "learning_rate": 1.7930831378417437e-05, + "loss": 0.7377231121063232, + "step": 190 + }, + { + "epoch": 0.5838746656476882, + "grad_norm": 0.178734689950943, + "learning_rate": 1.7898285437151163e-05, + "loss": 0.7388894557952881, + "step": 191 + }, + { + "epoch": 0.5869316010699274, + "grad_norm": 0.1740068644285202, + "learning_rate": 1.786551555802643e-05, + "loss": 0.8209859728813171, + "step": 192 + }, + { + "epoch": 0.5899885364921666, + "grad_norm": 0.19211041927337646, + "learning_rate": 1.783252267015837e-05, + "loss": 0.7305737733840942, + "step": 193 + }, + { + "epoch": 0.5930454719144058, + "grad_norm": 0.16644936800003052, + "learning_rate": 1.779930770898503e-05, + "loss": 0.7760804891586304, + "step": 194 + }, + { + "epoch": 0.596102407336645, + "grad_norm": 0.1773686707019806, + "learning_rate": 1.776587161624083e-05, + "loss": 0.7879236936569214, + "step": 195 + }, + { + "epoch": 0.5991593427588843, + "grad_norm": 0.17508819699287415, + "learning_rate": 1.7732215339929874e-05, + "loss": 0.7307407259941101, + "step": 196 + }, + { + "epoch": 0.6022162781811234, + "grad_norm": 0.17211101949214935, + "learning_rate": 1.7698339834299064e-05, + "loss": 0.7293214797973633, + "step": 197 + }, + { + "epoch": 0.6052732136033626, + "grad_norm": 0.18085215985774994, + "learning_rate": 1.7664246059811058e-05, + "loss": 0.763083279132843, + "step": 198 + }, + { + "epoch": 0.6083301490256018, + "grad_norm": 0.20243075489997864, + "learning_rate": 1.7629934983117025e-05, + "loss": 0.7372676134109497, + "step": 199 + }, + { + "epoch": 0.6113870844478411, + "grad_norm": 0.18152795732021332, + "learning_rate": 1.759540757702924e-05, + "loss": 0.7121898531913757, + "step": 200 + }, + { + "epoch": 0.6113870844478411, + "eval_loss": 0.7551760673522949, + "eval_runtime": 900.209, + "eval_samples_per_second": 0.67, + "eval_steps_per_second": 0.67, + "step": 200 + } + ], + "logging_steps": 1, + "max_steps": 656, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1042918995329024e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-200/training_args.bin b/cpt_qwen_14B/checkpoints/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eddbb43a2cebb928dbed6e955a37ebfa3174f4b5 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a8e308e47eb936f678712445b19ddc52638f354c37c813ecaa432f69120a2e +size 5201 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/README.md b/cpt_qwen_14B/checkpoints/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8dfda26032514233f3e70a4012f1cfd1ddbbb609 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Qwen2.5-Coder-14B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Qwen2.5-Coder-14B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/adapter_config.json b/cpt_qwen_14B/checkpoints/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81c31359285f7e351a44275c30b6882f4c6b50c0 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Qwen2.5-Coder-14B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/adapter_model.safetensors b/cpt_qwen_14B/checkpoints/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e690ade5c4014d2e0f077e28b7c3d44e96bae478 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c370210851f30b5de992e131f4276973cf2b5feb6969fb7638ee7128b6b9674 +size 201378736 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/chat_template.jinja b/cpt_qwen_14B/checkpoints/checkpoint-300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..28028c056af412405debd878cdda0171e35fa5d1 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/optimizer.pt b/cpt_qwen_14B/checkpoints/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..00952b4ccf2a98ca984039b4f6d91d8b4c128b7c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad69f3254fd02c5361123fcfc2dea4b905fca69152def982a3356e7b8afd4ed +size 102698855 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/rng_state.pth b/cpt_qwen_14B/checkpoints/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3f637b180bf457a8380b43cdf6ff944178c29898 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11c8108834e0d26f3ed6483fb470f43109ba4656c99180d6c32043763dd0a2df +size 14645 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/scheduler.pt b/cpt_qwen_14B/checkpoints/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..24a826a916bfb86c966e03fb4948ce4214c1086d --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd5d6b57b6230ca0af9ae6da5d18fafe1b14b1ea92b2fe466790a371d1f85fe +size 1465 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/tokenizer.json b/cpt_qwen_14B/checkpoints/checkpoint-300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/tokenizer_config.json b/cpt_qwen_14B/checkpoints/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..217274ef8275420e4bf3b976f3948901cd3d176f --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/trainer_state.json b/cpt_qwen_14B/checkpoints/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9b77ea45ff7386d06cf03fd31a260adb82f932d7 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/trainer_state.json @@ -0,0 +1,2182 @@ +{ + "best_global_step": 300, + "best_metric": 0.7063615918159485, + "best_model_checkpoint": "runs/cpt_run_14b/checkpoints/checkpoint-300", + "epoch": 0.9170806266717616, + "eval_steps": 50, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003056935422239205, + "grad_norm": 0.06516239047050476, + "learning_rate": 0.0, + "loss": 1.138384461402893, + "step": 1 + }, + { + "epoch": 0.00611387084447841, + "grad_norm": 0.05343673378229141, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.983342707157135, + "step": 2 + }, + { + "epoch": 0.009170806266717615, + "grad_norm": 0.05608418956398964, + "learning_rate": 6.060606060606061e-07, + "loss": 1.0762118101119995, + "step": 3 + }, + { + "epoch": 0.01222774168895682, + "grad_norm": 0.06523486226797104, + "learning_rate": 9.090909090909091e-07, + "loss": 1.084489345550537, + "step": 4 + }, + { + "epoch": 0.015284677111196026, + "grad_norm": 0.06582186371088028, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.2037022113800049, + "step": 5 + }, + { + "epoch": 0.01834161253343523, + "grad_norm": 0.06097998470067978, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.10005784034729, + "step": 6 + }, + { + "epoch": 0.021398547955674436, + "grad_norm": 0.10365528613328934, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.0895193815231323, + "step": 7 + }, + { + "epoch": 0.02445548337791364, + "grad_norm": 0.06312141567468643, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.0593242645263672, + "step": 8 + }, + { + "epoch": 0.027512418800152847, + "grad_norm": 0.05508403480052948, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.9772955179214478, + "step": 9 + }, + { + "epoch": 0.030569354222392053, + "grad_norm": 0.06006711348891258, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.084238886833191, + "step": 10 + }, + { + "epoch": 0.033626289644631255, + "grad_norm": 0.0588749423623085, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.0786534547805786, + "step": 11 + }, + { + "epoch": 0.03668322506687046, + "grad_norm": 0.046551357954740524, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.0370622873306274, + "step": 12 + }, + { + "epoch": 0.039740160489109666, + "grad_norm": 0.061659567058086395, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.0646986961364746, + "step": 13 + }, + { + "epoch": 0.04279709591134887, + "grad_norm": 0.06007347255945206, + "learning_rate": 3.93939393939394e-06, + "loss": 1.0311307907104492, + "step": 14 + }, + { + "epoch": 0.04585403133358808, + "grad_norm": 0.07314135134220123, + "learning_rate": 4.242424242424243e-06, + "loss": 1.1300500631332397, + "step": 15 + }, + { + "epoch": 0.04891096675582728, + "grad_norm": 0.060934022068977356, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0197452306747437, + "step": 16 + }, + { + "epoch": 0.05196790217806649, + "grad_norm": 0.056856051087379456, + "learning_rate": 4.848484848484849e-06, + "loss": 1.0438549518585205, + "step": 17 + }, + { + "epoch": 0.055024837600305694, + "grad_norm": 0.05908689647912979, + "learning_rate": 5.151515151515152e-06, + "loss": 1.0398856401443481, + "step": 18 + }, + { + "epoch": 0.0580817730225449, + "grad_norm": 0.07411840558052063, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.107885479927063, + "step": 19 + }, + { + "epoch": 0.061138708444784105, + "grad_norm": 0.0749165341258049, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.1060967445373535, + "step": 20 + }, + { + "epoch": 0.06419564386702331, + "grad_norm": 0.06720177084207535, + "learning_rate": 6.060606060606061e-06, + "loss": 1.0471720695495605, + "step": 21 + }, + { + "epoch": 0.06725257928926251, + "grad_norm": 0.05990725755691528, + "learning_rate": 6.363636363636364e-06, + "loss": 1.0944981575012207, + "step": 22 + }, + { + "epoch": 0.07030951471150172, + "grad_norm": 0.06672193855047226, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1477092504501343, + "step": 23 + }, + { + "epoch": 0.07336645013374092, + "grad_norm": 0.06145205348730087, + "learning_rate": 6.969696969696971e-06, + "loss": 1.0591784715652466, + "step": 24 + }, + { + "epoch": 0.07642338555598013, + "grad_norm": 0.0757482647895813, + "learning_rate": 7.272727272727273e-06, + "loss": 1.0500165224075317, + "step": 25 + }, + { + "epoch": 0.07948032097821933, + "grad_norm": 0.07848478108644485, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.0747522115707397, + "step": 26 + }, + { + "epoch": 0.08253725640045854, + "grad_norm": 0.07740631699562073, + "learning_rate": 7.87878787878788e-06, + "loss": 1.132310152053833, + "step": 27 + }, + { + "epoch": 0.08559419182269774, + "grad_norm": 0.07476603239774704, + "learning_rate": 8.181818181818183e-06, + "loss": 1.0339502096176147, + "step": 28 + }, + { + "epoch": 0.08865112724493696, + "grad_norm": 0.0779196098446846, + "learning_rate": 8.484848484848486e-06, + "loss": 1.1047282218933105, + "step": 29 + }, + { + "epoch": 0.09170806266717615, + "grad_norm": 0.06962384283542633, + "learning_rate": 8.787878787878788e-06, + "loss": 1.004916787147522, + "step": 30 + }, + { + "epoch": 0.09476499808941537, + "grad_norm": 0.06369175016880035, + "learning_rate": 9.090909090909091e-06, + "loss": 0.9296417832374573, + "step": 31 + }, + { + "epoch": 0.09782193351165457, + "grad_norm": 0.07470260560512543, + "learning_rate": 9.393939393939396e-06, + "loss": 1.0721708536148071, + "step": 32 + }, + { + "epoch": 0.10087886893389378, + "grad_norm": 0.07948213815689087, + "learning_rate": 9.696969696969698e-06, + "loss": 1.0350117683410645, + "step": 33 + }, + { + "epoch": 0.10393580435613298, + "grad_norm": 0.07066022604703903, + "learning_rate": 1e-05, + "loss": 1.026305913925171, + "step": 34 + }, + { + "epoch": 0.10699273977837218, + "grad_norm": 0.07774543762207031, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.0509816408157349, + "step": 35 + }, + { + "epoch": 0.11004967520061139, + "grad_norm": 0.07501248270273209, + "learning_rate": 1.0606060606060606e-05, + "loss": 1.0011574029922485, + "step": 36 + }, + { + "epoch": 0.11310661062285059, + "grad_norm": 0.6622501611709595, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.9754424691200256, + "step": 37 + }, + { + "epoch": 0.1161635460450898, + "grad_norm": 0.07566080242395401, + "learning_rate": 1.1212121212121212e-05, + "loss": 1.0342774391174316, + "step": 38 + }, + { + "epoch": 0.119220481467329, + "grad_norm": 0.07573831081390381, + "learning_rate": 1.1515151515151517e-05, + "loss": 0.9714518785476685, + "step": 39 + }, + { + "epoch": 0.12227741688956821, + "grad_norm": 0.08083852380514145, + "learning_rate": 1.181818181818182e-05, + "loss": 1.1050316095352173, + "step": 40 + }, + { + "epoch": 0.12533435231180742, + "grad_norm": 0.08540588617324829, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.0871070623397827, + "step": 41 + }, + { + "epoch": 0.12839128773404662, + "grad_norm": 0.07391592115163803, + "learning_rate": 1.2424242424242425e-05, + "loss": 1.0206722021102905, + "step": 42 + }, + { + "epoch": 0.13144822315628582, + "grad_norm": 0.07063689082860947, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.9775047898292542, + "step": 43 + }, + { + "epoch": 0.13450515857852502, + "grad_norm": 0.07288888841867447, + "learning_rate": 1.3030303030303032e-05, + "loss": 1.1132858991622925, + "step": 44 + }, + { + "epoch": 0.13756209400076425, + "grad_norm": 0.07641777396202087, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.0707701444625854, + "step": 45 + }, + { + "epoch": 0.14061902942300344, + "grad_norm": 0.06990326195955276, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.9328265190124512, + "step": 46 + }, + { + "epoch": 0.14367596484524264, + "grad_norm": 0.0834241658449173, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.0131721496582031, + "step": 47 + }, + { + "epoch": 0.14673290026748184, + "grad_norm": 0.0714937075972557, + "learning_rate": 1.4242424242424245e-05, + "loss": 0.940493106842041, + "step": 48 + }, + { + "epoch": 0.14978983568972107, + "grad_norm": 0.07770547270774841, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.0435771942138672, + "step": 49 + }, + { + "epoch": 0.15284677111196027, + "grad_norm": 0.07950945198535919, + "learning_rate": 1.484848484848485e-05, + "loss": 1.0382137298583984, + "step": 50 + }, + { + "epoch": 0.15284677111196027, + "eval_loss": 1.0129202604293823, + "eval_runtime": 724.3664, + "eval_samples_per_second": 0.832, + "eval_steps_per_second": 0.832, + "step": 50 + }, + { + "epoch": 0.15590370653419947, + "grad_norm": 0.06961936503648758, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.9690049886703491, + "step": 51 + }, + { + "epoch": 0.15896064195643866, + "grad_norm": 0.069523885846138, + "learning_rate": 1.5454545454545454e-05, + "loss": 0.9830482006072998, + "step": 52 + }, + { + "epoch": 0.16201757737867786, + "grad_norm": 0.0764622762799263, + "learning_rate": 1.575757575757576e-05, + "loss": 1.0895472764968872, + "step": 53 + }, + { + "epoch": 0.1650745128009171, + "grad_norm": 0.1413721889257431, + "learning_rate": 1.606060606060606e-05, + "loss": 1.0354574918746948, + "step": 54 + }, + { + "epoch": 0.1681314482231563, + "grad_norm": 0.06818042695522308, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.8534265160560608, + "step": 55 + }, + { + "epoch": 0.1711883836453955, + "grad_norm": 0.0722246989607811, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.9580274820327759, + "step": 56 + }, + { + "epoch": 0.17424531906763469, + "grad_norm": 0.07113443315029144, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.0721848011016846, + "step": 57 + }, + { + "epoch": 0.1773022544898739, + "grad_norm": 0.08412107080221176, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.1180150508880615, + "step": 58 + }, + { + "epoch": 0.1803591899121131, + "grad_norm": 0.07381036877632141, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.0384547710418701, + "step": 59 + }, + { + "epoch": 0.1834161253343523, + "grad_norm": 0.07089001685380936, + "learning_rate": 1.787878787878788e-05, + "loss": 1.0446016788482666, + "step": 60 + }, + { + "epoch": 0.1864730607565915, + "grad_norm": 0.11576953530311584, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0015051364898682, + "step": 61 + }, + { + "epoch": 0.18952999617883073, + "grad_norm": 0.08030868321657181, + "learning_rate": 1.8484848484848487e-05, + "loss": 0.9642710089683533, + "step": 62 + }, + { + "epoch": 0.19258693160106993, + "grad_norm": 0.08332342654466629, + "learning_rate": 1.8787878787878792e-05, + "loss": 1.0722991228103638, + "step": 63 + }, + { + "epoch": 0.19564386702330913, + "grad_norm": 0.08000365644693375, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.0104647874832153, + "step": 64 + }, + { + "epoch": 0.19870080244554833, + "grad_norm": 0.08139508217573166, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9445061087608337, + "step": 65 + }, + { + "epoch": 0.20175773786778756, + "grad_norm": 0.08749893307685852, + "learning_rate": 1.96969696969697e-05, + "loss": 1.080810308456421, + "step": 66 + }, + { + "epoch": 0.20481467329002676, + "grad_norm": 0.0786912813782692, + "learning_rate": 2e-05, + "loss": 0.9705753922462463, + "step": 67 + }, + { + "epoch": 0.20787160871226595, + "grad_norm": 0.08962028473615646, + "learning_rate": 1.9999858236410775e-05, + "loss": 0.962783694267273, + "step": 68 + }, + { + "epoch": 0.21092854413450515, + "grad_norm": 0.08402887731790543, + "learning_rate": 1.9999432949662483e-05, + "loss": 0.9959614872932434, + "step": 69 + }, + { + "epoch": 0.21398547955674435, + "grad_norm": 0.08036444336175919, + "learning_rate": 1.9998724151813157e-05, + "loss": 0.9569960832595825, + "step": 70 + }, + { + "epoch": 0.21704241497898358, + "grad_norm": 0.08247046917676926, + "learning_rate": 1.9997731862959143e-05, + "loss": 1.0012171268463135, + "step": 71 + }, + { + "epoch": 0.22009935040122278, + "grad_norm": 0.08966264873743057, + "learning_rate": 1.999645611123453e-05, + "loss": 1.0403809547424316, + "step": 72 + }, + { + "epoch": 0.22315628582346198, + "grad_norm": 0.08061660826206207, + "learning_rate": 1.999489693281034e-05, + "loss": 1.0089740753173828, + "step": 73 + }, + { + "epoch": 0.22621322124570117, + "grad_norm": 0.09005365520715714, + "learning_rate": 1.9993054371893526e-05, + "loss": 0.9333044290542603, + "step": 74 + }, + { + "epoch": 0.2292701566679404, + "grad_norm": 0.08651519566774368, + "learning_rate": 1.9990928480725694e-05, + "loss": 0.9284015893936157, + "step": 75 + }, + { + "epoch": 0.2323270920901796, + "grad_norm": 0.08141147345304489, + "learning_rate": 1.9988519319581637e-05, + "loss": 0.9782730340957642, + "step": 76 + }, + { + "epoch": 0.2353840275124188, + "grad_norm": 0.08344405144453049, + "learning_rate": 1.998582695676762e-05, + "loss": 0.9723064303398132, + "step": 77 + }, + { + "epoch": 0.238440962934658, + "grad_norm": 0.08019903302192688, + "learning_rate": 1.998285146861945e-05, + "loss": 0.9648997783660889, + "step": 78 + }, + { + "epoch": 0.24149789835689722, + "grad_norm": 0.08113416284322739, + "learning_rate": 1.99795929395003e-05, + "loss": 0.9263214468955994, + "step": 79 + }, + { + "epoch": 0.24455483377913642, + "grad_norm": 0.08127513527870178, + "learning_rate": 1.997605146179833e-05, + "loss": 0.8745232224464417, + "step": 80 + }, + { + "epoch": 0.24761176920137562, + "grad_norm": 0.09934187680482864, + "learning_rate": 1.997222713592405e-05, + "loss": 0.8722782135009766, + "step": 81 + }, + { + "epoch": 0.25066870462361485, + "grad_norm": 0.09701363742351532, + "learning_rate": 1.9968120070307503e-05, + "loss": 1.0084266662597656, + "step": 82 + }, + { + "epoch": 0.253725640045854, + "grad_norm": 0.08335654437541962, + "learning_rate": 1.9963730381395154e-05, + "loss": 0.9239332675933838, + "step": 83 + }, + { + "epoch": 0.25678257546809324, + "grad_norm": 0.09161650389432907, + "learning_rate": 1.9959058193646618e-05, + "loss": 0.9878032207489014, + "step": 84 + }, + { + "epoch": 0.2598395108903324, + "grad_norm": 0.08067663013935089, + "learning_rate": 1.9954103639531116e-05, + "loss": 0.9113098382949829, + "step": 85 + }, + { + "epoch": 0.26289644631257164, + "grad_norm": 0.09619539976119995, + "learning_rate": 1.9948866859523717e-05, + "loss": 0.9527600407600403, + "step": 86 + }, + { + "epoch": 0.26595338173481087, + "grad_norm": 0.10015493631362915, + "learning_rate": 1.9943348002101374e-05, + "loss": 0.9569152593612671, + "step": 87 + }, + { + "epoch": 0.26901031715705004, + "grad_norm": 0.09012345969676971, + "learning_rate": 1.993754722373869e-05, + "loss": 0.8912045359611511, + "step": 88 + }, + { + "epoch": 0.27206725257928926, + "grad_norm": 0.10342805832624435, + "learning_rate": 1.9931464688903502e-05, + "loss": 0.856104850769043, + "step": 89 + }, + { + "epoch": 0.2751241880015285, + "grad_norm": 0.10218493640422821, + "learning_rate": 1.9925100570052194e-05, + "loss": 0.9631397128105164, + "step": 90 + }, + { + "epoch": 0.27818112342376766, + "grad_norm": 0.10909046977758408, + "learning_rate": 1.9918455047624847e-05, + "loss": 0.8532565236091614, + "step": 91 + }, + { + "epoch": 0.2812380588460069, + "grad_norm": 0.10714197903871536, + "learning_rate": 1.9911528310040073e-05, + "loss": 0.9691859483718872, + "step": 92 + }, + { + "epoch": 0.28429499426824606, + "grad_norm": 0.1108694076538086, + "learning_rate": 1.990432055368971e-05, + "loss": 0.9374334812164307, + "step": 93 + }, + { + "epoch": 0.2873519296904853, + "grad_norm": 0.10037308186292648, + "learning_rate": 1.989683198293324e-05, + "loss": 0.9166896343231201, + "step": 94 + }, + { + "epoch": 0.2904088651127245, + "grad_norm": 0.10246684402227402, + "learning_rate": 1.9889062810092002e-05, + "loss": 1.0059239864349365, + "step": 95 + }, + { + "epoch": 0.2934658005349637, + "grad_norm": 0.09954962879419327, + "learning_rate": 1.9881013255443152e-05, + "loss": 1.00413179397583, + "step": 96 + }, + { + "epoch": 0.2965227359572029, + "grad_norm": 0.11006761342287064, + "learning_rate": 1.9872683547213446e-05, + "loss": 0.9414035677909851, + "step": 97 + }, + { + "epoch": 0.29957967137944214, + "grad_norm": 0.1014382541179657, + "learning_rate": 1.9864073921572756e-05, + "loss": 0.9155468940734863, + "step": 98 + }, + { + "epoch": 0.3026366068016813, + "grad_norm": 0.09883157908916473, + "learning_rate": 1.9855184622627362e-05, + "loss": 0.9429305195808411, + "step": 99 + }, + { + "epoch": 0.30569354222392053, + "grad_norm": 0.11199072748422623, + "learning_rate": 1.9846015902413053e-05, + "loss": 0.9143528342247009, + "step": 100 + }, + { + "epoch": 0.30569354222392053, + "eval_loss": 0.884428083896637, + "eval_runtime": 723.8143, + "eval_samples_per_second": 0.833, + "eval_steps_per_second": 0.833, + "step": 100 + }, + { + "epoch": 0.3087504776461597, + "grad_norm": 0.10796016454696655, + "learning_rate": 1.9836568020887963e-05, + "loss": 0.9726455211639404, + "step": 101 + }, + { + "epoch": 0.31180741306839893, + "grad_norm": 0.10056383162736893, + "learning_rate": 1.982684124592521e-05, + "loss": 0.8932135701179504, + "step": 102 + }, + { + "epoch": 0.31486434849063816, + "grad_norm": 0.10836594551801682, + "learning_rate": 1.9816835853305306e-05, + "loss": 0.919749915599823, + "step": 103 + }, + { + "epoch": 0.31792128391287733, + "grad_norm": 0.12032149732112885, + "learning_rate": 1.9806552126708322e-05, + "loss": 0.871781587600708, + "step": 104 + }, + { + "epoch": 0.32097821933511655, + "grad_norm": 0.10854160040616989, + "learning_rate": 1.9795990357705853e-05, + "loss": 0.8587784171104431, + "step": 105 + }, + { + "epoch": 0.3240351547573557, + "grad_norm": 0.10819399356842041, + "learning_rate": 1.978515084575276e-05, + "loss": 0.8524806499481201, + "step": 106 + }, + { + "epoch": 0.32709209017959495, + "grad_norm": 0.10226067155599594, + "learning_rate": 1.9774033898178668e-05, + "loss": 0.7892144918441772, + "step": 107 + }, + { + "epoch": 0.3301490256018342, + "grad_norm": 0.1071159616112709, + "learning_rate": 1.976263983017925e-05, + "loss": 0.8833234906196594, + "step": 108 + }, + { + "epoch": 0.33320596102407335, + "grad_norm": 0.11434526741504669, + "learning_rate": 1.9750968964807305e-05, + "loss": 0.861842155456543, + "step": 109 + }, + { + "epoch": 0.3362628964463126, + "grad_norm": 0.1159641221165657, + "learning_rate": 1.9739021632963584e-05, + "loss": 0.8987889289855957, + "step": 110 + }, + { + "epoch": 0.3393198318685518, + "grad_norm": 0.12371373921632767, + "learning_rate": 1.9726798173387417e-05, + "loss": 0.9710193872451782, + "step": 111 + }, + { + "epoch": 0.342376767290791, + "grad_norm": 0.11441531032323837, + "learning_rate": 1.97142989326471e-05, + "loss": 0.8199151158332825, + "step": 112 + }, + { + "epoch": 0.3454337027130302, + "grad_norm": 0.11842846125364304, + "learning_rate": 1.9701524265130088e-05, + "loss": 0.8845276236534119, + "step": 113 + }, + { + "epoch": 0.34849063813526937, + "grad_norm": 0.10813732445240021, + "learning_rate": 1.9688474533032916e-05, + "loss": 0.7964264750480652, + "step": 114 + }, + { + "epoch": 0.3515475735575086, + "grad_norm": 0.11050347238779068, + "learning_rate": 1.9675150106350957e-05, + "loss": 0.9630422592163086, + "step": 115 + }, + { + "epoch": 0.3546045089797478, + "grad_norm": 0.10537250339984894, + "learning_rate": 1.9661551362867926e-05, + "loss": 0.7706905007362366, + "step": 116 + }, + { + "epoch": 0.357661444401987, + "grad_norm": 0.11390368640422821, + "learning_rate": 1.9647678688145163e-05, + "loss": 0.8541204929351807, + "step": 117 + }, + { + "epoch": 0.3607183798242262, + "grad_norm": 0.10318922251462936, + "learning_rate": 1.963353247551069e-05, + "loss": 0.7400562763214111, + "step": 118 + }, + { + "epoch": 0.3637753152464654, + "grad_norm": 0.1347586214542389, + "learning_rate": 1.9619113126048086e-05, + "loss": 0.9232871532440186, + "step": 119 + }, + { + "epoch": 0.3668322506687046, + "grad_norm": 0.11458177119493484, + "learning_rate": 1.96044210485851e-05, + "loss": 0.833285927772522, + "step": 120 + }, + { + "epoch": 0.36988918609094384, + "grad_norm": 0.12361041456460953, + "learning_rate": 1.958945665968206e-05, + "loss": 0.7887391448020935, + "step": 121 + }, + { + "epoch": 0.372946121513183, + "grad_norm": 0.11985408514738083, + "learning_rate": 1.9574220383620054e-05, + "loss": 0.8206446170806885, + "step": 122 + }, + { + "epoch": 0.37600305693542224, + "grad_norm": 0.1355939507484436, + "learning_rate": 1.9558712652388932e-05, + "loss": 0.7648542523384094, + "step": 123 + }, + { + "epoch": 0.37905999235766147, + "grad_norm": 0.1229313388466835, + "learning_rate": 1.954293390567501e-05, + "loss": 0.8573335409164429, + "step": 124 + }, + { + "epoch": 0.38211692777990064, + "grad_norm": 0.11425124108791351, + "learning_rate": 1.9526884590848646e-05, + "loss": 0.7412531971931458, + "step": 125 + }, + { + "epoch": 0.38517386320213987, + "grad_norm": 0.12430041283369064, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.8098543882369995, + "step": 126 + }, + { + "epoch": 0.38823079862437904, + "grad_norm": 0.12492368370294571, + "learning_rate": 1.9493976084683814e-05, + "loss": 0.8814713954925537, + "step": 127 + }, + { + "epoch": 0.39128773404661826, + "grad_norm": 0.14428824186325073, + "learning_rate": 1.9477117826390934e-05, + "loss": 0.8231979608535767, + "step": 128 + }, + { + "epoch": 0.3943446694688575, + "grad_norm": 0.12010085582733154, + "learning_rate": 1.9459990866050337e-05, + "loss": 0.7015627026557922, + "step": 129 + }, + { + "epoch": 0.39740160489109666, + "grad_norm": 0.11819776892662048, + "learning_rate": 1.9442595689257898e-05, + "loss": 0.8086729645729065, + "step": 130 + }, + { + "epoch": 0.4004585403133359, + "grad_norm": 0.12211033701896667, + "learning_rate": 1.9424932789214158e-05, + "loss": 0.8234002590179443, + "step": 131 + }, + { + "epoch": 0.4035154757355751, + "grad_norm": 0.14926476776599884, + "learning_rate": 1.9407002666710334e-05, + "loss": 0.874608039855957, + "step": 132 + }, + { + "epoch": 0.4065724111578143, + "grad_norm": 0.13012923300266266, + "learning_rate": 1.9388805830114132e-05, + "loss": 0.8491607904434204, + "step": 133 + }, + { + "epoch": 0.4096293465800535, + "grad_norm": 0.12012261897325516, + "learning_rate": 1.937034279535533e-05, + "loss": 0.7269159555435181, + "step": 134 + }, + { + "epoch": 0.4126862820022927, + "grad_norm": 0.15302567183971405, + "learning_rate": 1.9351614085911134e-05, + "loss": 0.8560839891433716, + "step": 135 + }, + { + "epoch": 0.4157432174245319, + "grad_norm": 0.12234190106391907, + "learning_rate": 1.933262023279137e-05, + "loss": 0.8211904764175415, + "step": 136 + }, + { + "epoch": 0.41880015284677113, + "grad_norm": 0.14427296817302704, + "learning_rate": 1.9313361774523387e-05, + "loss": 0.8500057458877563, + "step": 137 + }, + { + "epoch": 0.4218570882690103, + "grad_norm": 0.1314094066619873, + "learning_rate": 1.929383925713682e-05, + "loss": 0.7589091658592224, + "step": 138 + }, + { + "epoch": 0.42491402369124953, + "grad_norm": 0.1576734483242035, + "learning_rate": 1.92740532341481e-05, + "loss": 0.7581073641777039, + "step": 139 + }, + { + "epoch": 0.4279709591134887, + "grad_norm": 0.15788713097572327, + "learning_rate": 1.925400426654475e-05, + "loss": 0.809050440788269, + "step": 140 + }, + { + "epoch": 0.43102789453572793, + "grad_norm": 0.13364559412002563, + "learning_rate": 1.9233692922769497e-05, + "loss": 0.7990086078643799, + "step": 141 + }, + { + "epoch": 0.43408482995796716, + "grad_norm": 0.14786465466022491, + "learning_rate": 1.921311977870413e-05, + "loss": 0.8675815463066101, + "step": 142 + }, + { + "epoch": 0.4371417653802063, + "grad_norm": 0.14621882140636444, + "learning_rate": 1.9192285417653208e-05, + "loss": 0.8713765740394592, + "step": 143 + }, + { + "epoch": 0.44019870080244555, + "grad_norm": 0.12874048948287964, + "learning_rate": 1.917119043032749e-05, + "loss": 0.7361871004104614, + "step": 144 + }, + { + "epoch": 0.4432556362246848, + "grad_norm": 0.12183775007724762, + "learning_rate": 1.9149835414827193e-05, + "loss": 0.7311941385269165, + "step": 145 + }, + { + "epoch": 0.44631257164692395, + "grad_norm": 0.1397160291671753, + "learning_rate": 1.912822097662505e-05, + "loss": 0.8189159035682678, + "step": 146 + }, + { + "epoch": 0.4493695070691632, + "grad_norm": 0.1458273082971573, + "learning_rate": 1.9106347728549134e-05, + "loss": 0.8288135528564453, + "step": 147 + }, + { + "epoch": 0.45242644249140235, + "grad_norm": 0.16898781061172485, + "learning_rate": 1.908421629076547e-05, + "loss": 0.7878037095069885, + "step": 148 + }, + { + "epoch": 0.4554833779136416, + "grad_norm": 0.1638474315404892, + "learning_rate": 1.9061827290760466e-05, + "loss": 0.8059952259063721, + "step": 149 + }, + { + "epoch": 0.4585403133358808, + "grad_norm": 0.14130882918834686, + "learning_rate": 1.9039181363323128e-05, + "loss": 0.7346830368041992, + "step": 150 + }, + { + "epoch": 0.4585403133358808, + "eval_loss": 0.7979016900062561, + "eval_runtime": 828.6295, + "eval_samples_per_second": 0.728, + "eval_steps_per_second": 0.728, + "step": 150 + }, + { + "epoch": 0.46159724875811997, + "grad_norm": 0.14427433907985687, + "learning_rate": 1.9016279150527044e-05, + "loss": 0.7583403587341309, + "step": 151 + }, + { + "epoch": 0.4646541841803592, + "grad_norm": 0.1515798568725586, + "learning_rate": 1.8993121301712194e-05, + "loss": 0.7908380031585693, + "step": 152 + }, + { + "epoch": 0.46771111960259837, + "grad_norm": 0.14444488286972046, + "learning_rate": 1.896970847346653e-05, + "loss": 0.7916130423545837, + "step": 153 + }, + { + "epoch": 0.4707680550248376, + "grad_norm": 0.1460912823677063, + "learning_rate": 1.8946041329607364e-05, + "loss": 0.7750643491744995, + "step": 154 + }, + { + "epoch": 0.4738249904470768, + "grad_norm": 0.13896244764328003, + "learning_rate": 1.892212054116255e-05, + "loss": 0.8059666156768799, + "step": 155 + }, + { + "epoch": 0.476881925869316, + "grad_norm": 0.16133630275726318, + "learning_rate": 1.889794678635145e-05, + "loss": 0.8327827453613281, + "step": 156 + }, + { + "epoch": 0.4799388612915552, + "grad_norm": 0.1474636346101761, + "learning_rate": 1.8873520750565716e-05, + "loss": 0.8498989343643188, + "step": 157 + }, + { + "epoch": 0.48299579671379445, + "grad_norm": 0.17222349345684052, + "learning_rate": 1.884884312634985e-05, + "loss": 0.7750177979469299, + "step": 158 + }, + { + "epoch": 0.4860527321360336, + "grad_norm": 0.15558090806007385, + "learning_rate": 1.8823914613381568e-05, + "loss": 0.7326169013977051, + "step": 159 + }, + { + "epoch": 0.48910966755827284, + "grad_norm": 0.13808321952819824, + "learning_rate": 1.8798735918451963e-05, + "loss": 0.8308709859848022, + "step": 160 + }, + { + "epoch": 0.492166602980512, + "grad_norm": 0.1761898398399353, + "learning_rate": 1.8773307755445468e-05, + "loss": 0.7805465459823608, + "step": 161 + }, + { + "epoch": 0.49522353840275124, + "grad_norm": 0.160477414727211, + "learning_rate": 1.874763084531961e-05, + "loss": 0.8538846969604492, + "step": 162 + }, + { + "epoch": 0.49828047382499047, + "grad_norm": 0.15238745510578156, + "learning_rate": 1.872170591608459e-05, + "loss": 0.8801217675209045, + "step": 163 + }, + { + "epoch": 0.5013374092472297, + "grad_norm": 0.1567080318927765, + "learning_rate": 1.86955337027826e-05, + "loss": 0.7205259799957275, + "step": 164 + }, + { + "epoch": 0.5043943446694689, + "grad_norm": 0.13637851178646088, + "learning_rate": 1.866911494746702e-05, + "loss": 0.7636491656303406, + "step": 165 + }, + { + "epoch": 0.507451280091708, + "grad_norm": 0.15563489496707916, + "learning_rate": 1.8642450399181373e-05, + "loss": 0.7982497811317444, + "step": 166 + }, + { + "epoch": 0.5105082155139473, + "grad_norm": 0.15503396093845367, + "learning_rate": 1.8615540813938063e-05, + "loss": 0.8737778067588806, + "step": 167 + }, + { + "epoch": 0.5135651509361865, + "grad_norm": 0.16095557808876038, + "learning_rate": 1.8588386954696972e-05, + "loss": 0.796604335308075, + "step": 168 + }, + { + "epoch": 0.5166220863584257, + "grad_norm": 0.1713593453168869, + "learning_rate": 1.856098959134381e-05, + "loss": 0.8247392177581787, + "step": 169 + }, + { + "epoch": 0.5196790217806648, + "grad_norm": 0.18239113688468933, + "learning_rate": 1.8533349500668295e-05, + "loss": 0.7838484644889832, + "step": 170 + }, + { + "epoch": 0.5227359572029041, + "grad_norm": 0.15745767951011658, + "learning_rate": 1.850546746634211e-05, + "loss": 0.7856907248497009, + "step": 171 + }, + { + "epoch": 0.5257928926251433, + "grad_norm": 0.16820666193962097, + "learning_rate": 1.8477344278896708e-05, + "loss": 0.7829679846763611, + "step": 172 + }, + { + "epoch": 0.5288498280473825, + "grad_norm": 0.16975544393062592, + "learning_rate": 1.84489807357009e-05, + "loss": 0.7374375462532043, + "step": 173 + }, + { + "epoch": 0.5319067634696217, + "grad_norm": 0.167228102684021, + "learning_rate": 1.8420377640938204e-05, + "loss": 0.712837815284729, + "step": 174 + }, + { + "epoch": 0.5349636988918609, + "grad_norm": 0.15955154597759247, + "learning_rate": 1.839153580558411e-05, + "loss": 0.7645693421363831, + "step": 175 + }, + { + "epoch": 0.5380206343141001, + "grad_norm": 0.18378689885139465, + "learning_rate": 1.8362456047383032e-05, + "loss": 0.7974956631660461, + "step": 176 + }, + { + "epoch": 0.5410775697363394, + "grad_norm": 0.15777672827243805, + "learning_rate": 1.833313919082515e-05, + "loss": 0.8957571983337402, + "step": 177 + }, + { + "epoch": 0.5441345051585785, + "grad_norm": 0.15292386710643768, + "learning_rate": 1.8303586067123028e-05, + "loss": 0.7635619044303894, + "step": 178 + }, + { + "epoch": 0.5471914405808177, + "grad_norm": 0.178152397274971, + "learning_rate": 1.8273797514188043e-05, + "loss": 0.7849246263504028, + "step": 179 + }, + { + "epoch": 0.550248376003057, + "grad_norm": 0.15916013717651367, + "learning_rate": 1.824377437660663e-05, + "loss": 0.6975343227386475, + "step": 180 + }, + { + "epoch": 0.5533053114252962, + "grad_norm": 0.18172231316566467, + "learning_rate": 1.821351750561634e-05, + "loss": 0.7675164341926575, + "step": 181 + }, + { + "epoch": 0.5563622468475353, + "grad_norm": 0.16241903603076935, + "learning_rate": 1.818302775908169e-05, + "loss": 0.7950343489646912, + "step": 182 + }, + { + "epoch": 0.5594191822697746, + "grad_norm": 0.18727579712867737, + "learning_rate": 1.8152306001469875e-05, + "loss": 0.787315309047699, + "step": 183 + }, + { + "epoch": 0.5624761176920138, + "grad_norm": 0.1627933531999588, + "learning_rate": 1.8121353103826213e-05, + "loss": 0.7141211628913879, + "step": 184 + }, + { + "epoch": 0.565533053114253, + "grad_norm": 0.4369247555732727, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.8476608395576477, + "step": 185 + }, + { + "epoch": 0.5685899885364921, + "grad_norm": 0.16494786739349365, + "learning_rate": 1.8058757405367003e-05, + "loss": 0.720562756061554, + "step": 186 + }, + { + "epoch": 0.5716469239587314, + "grad_norm": 0.175015389919281, + "learning_rate": 1.8027116379309637e-05, + "loss": 0.7589252591133118, + "step": 187 + }, + { + "epoch": 0.5747038593809706, + "grad_norm": 0.1769978553056717, + "learning_rate": 1.799524776268646e-05, + "loss": 0.7644155621528625, + "step": 188 + }, + { + "epoch": 0.5777607948032097, + "grad_norm": 0.18481792509555817, + "learning_rate": 1.796315245905936e-05, + "loss": 0.7885835766792297, + "step": 189 + }, + { + "epoch": 0.580817730225449, + "grad_norm": 0.1668689250946045, + "learning_rate": 1.7930831378417437e-05, + "loss": 0.7377231121063232, + "step": 190 + }, + { + "epoch": 0.5838746656476882, + "grad_norm": 0.178734689950943, + "learning_rate": 1.7898285437151163e-05, + "loss": 0.7388894557952881, + "step": 191 + }, + { + "epoch": 0.5869316010699274, + "grad_norm": 0.1740068644285202, + "learning_rate": 1.786551555802643e-05, + "loss": 0.8209859728813171, + "step": 192 + }, + { + "epoch": 0.5899885364921666, + "grad_norm": 0.19211041927337646, + "learning_rate": 1.783252267015837e-05, + "loss": 0.7305737733840942, + "step": 193 + }, + { + "epoch": 0.5930454719144058, + "grad_norm": 0.16644936800003052, + "learning_rate": 1.779930770898503e-05, + "loss": 0.7760804891586304, + "step": 194 + }, + { + "epoch": 0.596102407336645, + "grad_norm": 0.1773686707019806, + "learning_rate": 1.776587161624083e-05, + "loss": 0.7879236936569214, + "step": 195 + }, + { + "epoch": 0.5991593427588843, + "grad_norm": 0.17508819699287415, + "learning_rate": 1.7732215339929874e-05, + "loss": 0.7307407259941101, + "step": 196 + }, + { + "epoch": 0.6022162781811234, + "grad_norm": 0.17211101949214935, + "learning_rate": 1.7698339834299064e-05, + "loss": 0.7293214797973633, + "step": 197 + }, + { + "epoch": 0.6052732136033626, + "grad_norm": 0.18085215985774994, + "learning_rate": 1.7664246059811058e-05, + "loss": 0.763083279132843, + "step": 198 + }, + { + "epoch": 0.6083301490256018, + "grad_norm": 0.20243075489997864, + "learning_rate": 1.7629934983117025e-05, + "loss": 0.7372676134109497, + "step": 199 + }, + { + "epoch": 0.6113870844478411, + "grad_norm": 0.18152795732021332, + "learning_rate": 1.759540757702924e-05, + "loss": 0.7121898531913757, + "step": 200 + }, + { + "epoch": 0.6113870844478411, + "eval_loss": 0.7551760673522949, + "eval_runtime": 900.209, + "eval_samples_per_second": 0.67, + "eval_steps_per_second": 0.67, + "step": 200 + }, + { + "epoch": 0.6144440198700802, + "grad_norm": 0.18808062374591827, + "learning_rate": 1.7560664820493502e-05, + "loss": 0.734307050704956, + "step": 201 + }, + { + "epoch": 0.6175009552923194, + "grad_norm": 0.18151243031024933, + "learning_rate": 1.7525707698561383e-05, + "loss": 0.7998429536819458, + "step": 202 + }, + { + "epoch": 0.6205578907145587, + "grad_norm": 0.19583043456077576, + "learning_rate": 1.7490537202362313e-05, + "loss": 0.7546265721321106, + "step": 203 + }, + { + "epoch": 0.6236148261367979, + "grad_norm": 0.2508557140827179, + "learning_rate": 1.7455154329075427e-05, + "loss": 0.7810050249099731, + "step": 204 + }, + { + "epoch": 0.626671761559037, + "grad_norm": 0.1685105562210083, + "learning_rate": 1.741956008190136e-05, + "loss": 0.7558917999267578, + "step": 205 + }, + { + "epoch": 0.6297286969812763, + "grad_norm": 0.18195222318172455, + "learning_rate": 1.7383755470033756e-05, + "loss": 0.7216942310333252, + "step": 206 + }, + { + "epoch": 0.6327856324035155, + "grad_norm": 0.1878063678741455, + "learning_rate": 1.7347741508630673e-05, + "loss": 0.7417092323303223, + "step": 207 + }, + { + "epoch": 0.6358425678257547, + "grad_norm": 0.25273698568344116, + "learning_rate": 1.73115192187858e-05, + "loss": 0.807498037815094, + "step": 208 + }, + { + "epoch": 0.6388995032479939, + "grad_norm": 0.2451465129852295, + "learning_rate": 1.7275089627499493e-05, + "loss": 0.7557163238525391, + "step": 209 + }, + { + "epoch": 0.6419564386702331, + "grad_norm": 0.19272617995738983, + "learning_rate": 1.7238453767649683e-05, + "loss": 0.8285109996795654, + "step": 210 + }, + { + "epoch": 0.6450133740924723, + "grad_norm": 0.1869518756866455, + "learning_rate": 1.720161267796256e-05, + "loss": 0.7824444770812988, + "step": 211 + }, + { + "epoch": 0.6480703095147115, + "grad_norm": 0.2029627561569214, + "learning_rate": 1.7164567402983153e-05, + "loss": 0.7018642425537109, + "step": 212 + }, + { + "epoch": 0.6511272449369507, + "grad_norm": 0.23215501010417938, + "learning_rate": 1.7127318993045686e-05, + "loss": 0.7263948917388916, + "step": 213 + }, + { + "epoch": 0.6541841803591899, + "grad_norm": 0.19869184494018555, + "learning_rate": 1.7089868504243816e-05, + "loss": 0.8285576105117798, + "step": 214 + }, + { + "epoch": 0.6572411157814291, + "grad_norm": 0.22871531546115875, + "learning_rate": 1.705221699840069e-05, + "loss": 0.7871490716934204, + "step": 215 + }, + { + "epoch": 0.6602980512036684, + "grad_norm": 0.17945580184459686, + "learning_rate": 1.701436554303882e-05, + "loss": 0.740180492401123, + "step": 216 + }, + { + "epoch": 0.6633549866259075, + "grad_norm": 0.20516762137413025, + "learning_rate": 1.6976315211349848e-05, + "loss": 0.7542892098426819, + "step": 217 + }, + { + "epoch": 0.6664119220481467, + "grad_norm": 0.22108283638954163, + "learning_rate": 1.6938067082164093e-05, + "loss": 0.8117404580116272, + "step": 218 + }, + { + "epoch": 0.669468857470386, + "grad_norm": 0.22329698503017426, + "learning_rate": 1.6899622239919965e-05, + "loss": 0.8002716898918152, + "step": 219 + }, + { + "epoch": 0.6725257928926252, + "grad_norm": 0.23545362055301666, + "learning_rate": 1.6860981774633228e-05, + "loss": 0.7750573754310608, + "step": 220 + }, + { + "epoch": 0.6755827283148643, + "grad_norm": 0.21816480159759521, + "learning_rate": 1.6822146781866097e-05, + "loss": 0.8051223754882812, + "step": 221 + }, + { + "epoch": 0.6786396637371036, + "grad_norm": 0.18638508021831512, + "learning_rate": 1.6783118362696162e-05, + "loss": 0.7286484241485596, + "step": 222 + }, + { + "epoch": 0.6816965991593428, + "grad_norm": 0.16794732213020325, + "learning_rate": 1.6743897623685178e-05, + "loss": 0.7001460194587708, + "step": 223 + }, + { + "epoch": 0.684753534581582, + "grad_norm": 0.21157318353652954, + "learning_rate": 1.6704485676847695e-05, + "loss": 0.7479901313781738, + "step": 224 + }, + { + "epoch": 0.6878104700038211, + "grad_norm": 0.35601308941841125, + "learning_rate": 1.666488363961952e-05, + "loss": 0.7660019397735596, + "step": 225 + }, + { + "epoch": 0.6908674054260604, + "grad_norm": 0.17416611313819885, + "learning_rate": 1.662509263482604e-05, + "loss": 0.7157142162322998, + "step": 226 + }, + { + "epoch": 0.6939243408482996, + "grad_norm": 0.19655123353004456, + "learning_rate": 1.658511379065039e-05, + "loss": 0.7894638776779175, + "step": 227 + }, + { + "epoch": 0.6969812762705387, + "grad_norm": 0.2034345269203186, + "learning_rate": 1.6544948240601453e-05, + "loss": 0.6853711009025574, + "step": 228 + }, + { + "epoch": 0.700038211692778, + "grad_norm": 0.199235200881958, + "learning_rate": 1.6504597123481737e-05, + "loss": 0.7487372756004333, + "step": 229 + }, + { + "epoch": 0.7030951471150172, + "grad_norm": 0.20407404005527496, + "learning_rate": 1.6464061583355088e-05, + "loss": 0.7335573434829712, + "step": 230 + }, + { + "epoch": 0.7061520825372564, + "grad_norm": 0.22096174955368042, + "learning_rate": 1.6423342769514227e-05, + "loss": 0.7659798264503479, + "step": 231 + }, + { + "epoch": 0.7092090179594956, + "grad_norm": 0.1916825920343399, + "learning_rate": 1.6382441836448203e-05, + "loss": 0.7162011861801147, + "step": 232 + }, + { + "epoch": 0.7122659533817348, + "grad_norm": 0.20505093038082123, + "learning_rate": 1.6341359943809626e-05, + "loss": 0.6957600116729736, + "step": 233 + }, + { + "epoch": 0.715322888803974, + "grad_norm": 0.19968082010746002, + "learning_rate": 1.6300098256381807e-05, + "loss": 0.6724053025245667, + "step": 234 + }, + { + "epoch": 0.7183798242262133, + "grad_norm": 0.19768832623958588, + "learning_rate": 1.625865794404573e-05, + "loss": 0.774741530418396, + "step": 235 + }, + { + "epoch": 0.7214367596484524, + "grad_norm": 0.19257694482803345, + "learning_rate": 1.621704018174688e-05, + "loss": 0.6658651828765869, + "step": 236 + }, + { + "epoch": 0.7244936950706916, + "grad_norm": 0.21594858169555664, + "learning_rate": 1.617524614946192e-05, + "loss": 0.810744047164917, + "step": 237 + }, + { + "epoch": 0.7275506304929308, + "grad_norm": 0.2107633650302887, + "learning_rate": 1.6133277032165264e-05, + "loss": 0.7623897194862366, + "step": 238 + }, + { + "epoch": 0.7306075659151701, + "grad_norm": 0.20114055275917053, + "learning_rate": 1.6091134019795447e-05, + "loss": 0.7082816362380981, + "step": 239 + }, + { + "epoch": 0.7336645013374092, + "grad_norm": 0.2542732059955597, + "learning_rate": 1.604881830722141e-05, + "loss": 0.7051193714141846, + "step": 240 + }, + { + "epoch": 0.7367214367596484, + "grad_norm": 0.19180485606193542, + "learning_rate": 1.600633109420861e-05, + "loss": 0.7895385026931763, + "step": 241 + }, + { + "epoch": 0.7397783721818877, + "grad_norm": 0.368756502866745, + "learning_rate": 1.5963673585385016e-05, + "loss": 0.7146293520927429, + "step": 242 + }, + { + "epoch": 0.7428353076041269, + "grad_norm": 0.18490125238895416, + "learning_rate": 1.5920846990206934e-05, + "loss": 0.650428056716919, + "step": 243 + }, + { + "epoch": 0.745892243026366, + "grad_norm": 0.23592503368854523, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.6367110013961792, + "step": 244 + }, + { + "epoch": 0.7489491784486053, + "grad_norm": 0.20223264396190643, + "learning_rate": 1.5834691402548415e-05, + "loss": 0.6563615798950195, + "step": 245 + }, + { + "epoch": 0.7520061138708445, + "grad_norm": 0.27459946274757385, + "learning_rate": 1.5791364852813047e-05, + "loss": 0.7361881136894226, + "step": 246 + }, + { + "epoch": 0.7550630492930837, + "grad_norm": 0.21085411310195923, + "learning_rate": 1.5747874102144073e-05, + "loss": 0.7373813390731812, + "step": 247 + }, + { + "epoch": 0.7581199847153229, + "grad_norm": 0.23332320153713226, + "learning_rate": 1.5704220383622464e-05, + "loss": 0.6971457004547119, + "step": 248 + }, + { + "epoch": 0.7611769201375621, + "grad_norm": 0.23525936901569366, + "learning_rate": 1.5660404934949798e-05, + "loss": 0.6756627559661865, + "step": 249 + }, + { + "epoch": 0.7642338555598013, + "grad_norm": 0.2150791585445404, + "learning_rate": 1.5616428998413122e-05, + "loss": 0.7029792666435242, + "step": 250 + }, + { + "epoch": 0.7642338555598013, + "eval_loss": 0.7269901633262634, + "eval_runtime": 877.665, + "eval_samples_per_second": 0.687, + "eval_steps_per_second": 0.687, + "step": 250 + }, + { + "epoch": 0.7672907909820404, + "grad_norm": 0.19510552287101746, + "learning_rate": 1.5572293820849754e-05, + "loss": 0.715162992477417, + "step": 251 + }, + { + "epoch": 0.7703477264042797, + "grad_norm": 0.25246763229370117, + "learning_rate": 1.5528000653611935e-05, + "loss": 0.634660542011261, + "step": 252 + }, + { + "epoch": 0.7734046618265189, + "grad_norm": 0.2980027496814728, + "learning_rate": 1.5483550752531337e-05, + "loss": 0.7154463529586792, + "step": 253 + }, + { + "epoch": 0.7764615972487581, + "grad_norm": 0.2730556130409241, + "learning_rate": 1.5438945377883463e-05, + "loss": 0.8110946416854858, + "step": 254 + }, + { + "epoch": 0.7795185326709974, + "grad_norm": 0.17258886992931366, + "learning_rate": 1.5394185794351914e-05, + "loss": 0.72202467918396, + "step": 255 + }, + { + "epoch": 0.7825754680932365, + "grad_norm": 0.19966280460357666, + "learning_rate": 1.5349273270992537e-05, + "loss": 0.7368704080581665, + "step": 256 + }, + { + "epoch": 0.7856324035154757, + "grad_norm": 0.23305682837963104, + "learning_rate": 1.5304209081197425e-05, + "loss": 0.7429723143577576, + "step": 257 + }, + { + "epoch": 0.788689338937715, + "grad_norm": 0.21786810457706451, + "learning_rate": 1.5258994502658846e-05, + "loss": 0.6498424410820007, + "step": 258 + }, + { + "epoch": 0.7917462743599541, + "grad_norm": 0.2370925396680832, + "learning_rate": 1.5213630817332985e-05, + "loss": 0.7379459142684937, + "step": 259 + }, + { + "epoch": 0.7948032097821933, + "grad_norm": 0.25566384196281433, + "learning_rate": 1.5168119311403611e-05, + "loss": 0.6742876172065735, + "step": 260 + }, + { + "epoch": 0.7978601452044326, + "grad_norm": 0.2171633243560791, + "learning_rate": 1.512246127524561e-05, + "loss": 0.72329181432724, + "step": 261 + }, + { + "epoch": 0.8009170806266718, + "grad_norm": 0.23292019963264465, + "learning_rate": 1.50766580033884e-05, + "loss": 0.765812873840332, + "step": 262 + }, + { + "epoch": 0.8039740160489109, + "grad_norm": 0.19427980482578278, + "learning_rate": 1.5030710794479226e-05, + "loss": 0.7872639298439026, + "step": 263 + }, + { + "epoch": 0.8070309514711502, + "grad_norm": 0.2460346817970276, + "learning_rate": 1.4984620951246333e-05, + "loss": 0.6940722465515137, + "step": 264 + }, + { + "epoch": 0.8100878868933894, + "grad_norm": 0.2493411898612976, + "learning_rate": 1.4938389780462044e-05, + "loss": 0.7680137157440186, + "step": 265 + }, + { + "epoch": 0.8131448223156286, + "grad_norm": 0.23873573541641235, + "learning_rate": 1.4892018592905702e-05, + "loss": 0.6780916452407837, + "step": 266 + }, + { + "epoch": 0.8162017577378677, + "grad_norm": 0.2580571174621582, + "learning_rate": 1.4845508703326504e-05, + "loss": 0.7183764576911926, + "step": 267 + }, + { + "epoch": 0.819258693160107, + "grad_norm": 0.2125079482793808, + "learning_rate": 1.4798861430406221e-05, + "loss": 0.8207096457481384, + "step": 268 + }, + { + "epoch": 0.8223156285823462, + "grad_norm": 0.21065691113471985, + "learning_rate": 1.4752078096721827e-05, + "loss": 0.7414214611053467, + "step": 269 + }, + { + "epoch": 0.8253725640045854, + "grad_norm": 0.25807511806488037, + "learning_rate": 1.4705160028707976e-05, + "loss": 0.7086384296417236, + "step": 270 + }, + { + "epoch": 0.8284294994268246, + "grad_norm": 0.2444671094417572, + "learning_rate": 1.4658108556619417e-05, + "loss": 0.7065964937210083, + "step": 271 + }, + { + "epoch": 0.8314864348490638, + "grad_norm": 0.200303316116333, + "learning_rate": 1.461092501449326e-05, + "loss": 0.7533905506134033, + "step": 272 + }, + { + "epoch": 0.834543370271303, + "grad_norm": 0.2807226777076721, + "learning_rate": 1.4563610740111163e-05, + "loss": 0.756553053855896, + "step": 273 + }, + { + "epoch": 0.8376003056935423, + "grad_norm": 0.2516884207725525, + "learning_rate": 1.4516167074961394e-05, + "loss": 0.8125098347663879, + "step": 274 + }, + { + "epoch": 0.8406572411157814, + "grad_norm": 0.22799813747406006, + "learning_rate": 1.4468595364200808e-05, + "loss": 0.7360811829566956, + "step": 275 + }, + { + "epoch": 0.8437141765380206, + "grad_norm": 0.27390384674072266, + "learning_rate": 1.4420896956616698e-05, + "loss": 0.7135312557220459, + "step": 276 + }, + { + "epoch": 0.8467711119602599, + "grad_norm": 0.2811775505542755, + "learning_rate": 1.4373073204588556e-05, + "loss": 0.7489083409309387, + "step": 277 + }, + { + "epoch": 0.8498280473824991, + "grad_norm": 0.2652314603328705, + "learning_rate": 1.4325125464049725e-05, + "loss": 0.752477765083313, + "step": 278 + }, + { + "epoch": 0.8528849828047382, + "grad_norm": 0.2218960076570511, + "learning_rate": 1.427705509444897e-05, + "loss": 0.6534979939460754, + "step": 279 + }, + { + "epoch": 0.8559419182269774, + "grad_norm": 0.23746474087238312, + "learning_rate": 1.4228863458711915e-05, + "loss": 0.7061883211135864, + "step": 280 + }, + { + "epoch": 0.8589988536492167, + "grad_norm": 0.21507228910923004, + "learning_rate": 1.4180551923202406e-05, + "loss": 0.7044329643249512, + "step": 281 + }, + { + "epoch": 0.8620557890714559, + "grad_norm": 0.2412186861038208, + "learning_rate": 1.4132121857683782e-05, + "loss": 0.706013023853302, + "step": 282 + }, + { + "epoch": 0.865112724493695, + "grad_norm": 0.2832106947898865, + "learning_rate": 1.4083574635280029e-05, + "loss": 0.6572445631027222, + "step": 283 + }, + { + "epoch": 0.8681696599159343, + "grad_norm": 0.21925900876522064, + "learning_rate": 1.403491163243684e-05, + "loss": 0.675041139125824, + "step": 284 + }, + { + "epoch": 0.8712265953381735, + "grad_norm": 0.22488665580749512, + "learning_rate": 1.3986134228882607e-05, + "loss": 0.7474229335784912, + "step": 285 + }, + { + "epoch": 0.8742835307604127, + "grad_norm": 0.2221737653017044, + "learning_rate": 1.3937243807589291e-05, + "loss": 0.7394901514053345, + "step": 286 + }, + { + "epoch": 0.8773404661826519, + "grad_norm": 0.29034581780433655, + "learning_rate": 1.388824175473321e-05, + "loss": 0.7346636056900024, + "step": 287 + }, + { + "epoch": 0.8803974016048911, + "grad_norm": 0.2580259144306183, + "learning_rate": 1.383912945965574e-05, + "loss": 0.8125481009483337, + "step": 288 + }, + { + "epoch": 0.8834543370271303, + "grad_norm": 0.2533118724822998, + "learning_rate": 1.3789908314823932e-05, + "loss": 0.6768131256103516, + "step": 289 + }, + { + "epoch": 0.8865112724493696, + "grad_norm": 0.2074616551399231, + "learning_rate": 1.3740579715791017e-05, + "loss": 0.7096269726753235, + "step": 290 + }, + { + "epoch": 0.8895682078716087, + "grad_norm": 0.29789987206459045, + "learning_rate": 1.3691145061156843e-05, + "loss": 0.6973364353179932, + "step": 291 + }, + { + "epoch": 0.8926251432938479, + "grad_norm": 0.2937224805355072, + "learning_rate": 1.3641605752528225e-05, + "loss": 0.7693608999252319, + "step": 292 + }, + { + "epoch": 0.8956820787160871, + "grad_norm": 0.27355870604515076, + "learning_rate": 1.3591963194479198e-05, + "loss": 0.6870795488357544, + "step": 293 + }, + { + "epoch": 0.8987390141383264, + "grad_norm": 0.22792251408100128, + "learning_rate": 1.3542218794511212e-05, + "loss": 0.7095532417297363, + "step": 294 + }, + { + "epoch": 0.9017959495605655, + "grad_norm": 0.2855125665664673, + "learning_rate": 1.3492373963013199e-05, + "loss": 0.7536489963531494, + "step": 295 + }, + { + "epoch": 0.9048528849828047, + "grad_norm": 0.24969056248664856, + "learning_rate": 1.3442430113221602e-05, + "loss": 0.7433043718338013, + "step": 296 + }, + { + "epoch": 0.907909820405044, + "grad_norm": 0.24534980952739716, + "learning_rate": 1.3392388661180303e-05, + "loss": 0.7204138040542603, + "step": 297 + }, + { + "epoch": 0.9109667558272831, + "grad_norm": 0.2540739178657532, + "learning_rate": 1.3342251025700474e-05, + "loss": 0.7114053964614868, + "step": 298 + }, + { + "epoch": 0.9140236912495223, + "grad_norm": 0.2494630217552185, + "learning_rate": 1.3292018628320346e-05, + "loss": 0.7337151169776917, + "step": 299 + }, + { + "epoch": 0.9170806266717616, + "grad_norm": 0.3079741597175598, + "learning_rate": 1.3241692893264909e-05, + "loss": 0.7486672401428223, + "step": 300 + }, + { + "epoch": 0.9170806266717616, + "eval_loss": 0.7063615918159485, + "eval_runtime": 882.246, + "eval_samples_per_second": 0.683, + "eval_steps_per_second": 0.683, + "step": 300 + } + ], + "logging_steps": 1, + "max_steps": 656, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6564378492993536e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-300/training_args.bin b/cpt_qwen_14B/checkpoints/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eddbb43a2cebb928dbed6e955a37ebfa3174f4b5 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a8e308e47eb936f678712445b19ddc52638f354c37c813ecaa432f69120a2e +size 5201 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/README.md b/cpt_qwen_14B/checkpoints/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8dfda26032514233f3e70a4012f1cfd1ddbbb609 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Qwen2.5-Coder-14B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Qwen2.5-Coder-14B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/adapter_config.json b/cpt_qwen_14B/checkpoints/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81c31359285f7e351a44275c30b6882f4c6b50c0 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Qwen2.5-Coder-14B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/adapter_model.safetensors b/cpt_qwen_14B/checkpoints/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6904bc8dd2bb3e24f3b0df113f537c355b2a41f3 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc35c49614987939ccf4e73733555cd12c1d55e627db1e2f836d2341ca58bd60 +size 201378736 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/chat_template.jinja b/cpt_qwen_14B/checkpoints/checkpoint-400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..28028c056af412405debd878cdda0171e35fa5d1 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/optimizer.pt b/cpt_qwen_14B/checkpoints/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf3321ba58ff90c8bcdd968c2dd3fb406c98048c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be4e4b976fbe87d4881b1cf906d435f007d6f2c6775114b03ca77718ebb3e099 +size 102698855 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/rng_state.pth b/cpt_qwen_14B/checkpoints/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2a7354c0d86f5737a12ef584abf0c9e37238b394 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54f252272095f008d9c3adc5557d863356ce442db9820129678a2dbdeb028a30 +size 14645 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/scheduler.pt b/cpt_qwen_14B/checkpoints/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5679761f7b23ec5dfcc6e06e49f23530e7710c39 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c58aed3093aa166a91339aaba61e306c07de6b4c4581b6fcc79de090acb72707 +size 1465 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/tokenizer.json b/cpt_qwen_14B/checkpoints/checkpoint-400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/tokenizer_config.json b/cpt_qwen_14B/checkpoints/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..217274ef8275420e4bf3b976f3948901cd3d176f --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/trainer_state.json b/cpt_qwen_14B/checkpoints/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..724bf1529f08b9a656df6bffe85fa5c2b2dc775e --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/trainer_state.json @@ -0,0 +1,2898 @@ +{ + "best_global_step": 400, + "best_metric": 0.6789794564247131, + "best_model_checkpoint": "runs/cpt_run_14b/checkpoints/checkpoint-400", + "epoch": 1.2200993504012227, + "eval_steps": 50, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003056935422239205, + "grad_norm": 0.06516239047050476, + "learning_rate": 0.0, + "loss": 1.138384461402893, + "step": 1 + }, + { + "epoch": 0.00611387084447841, + "grad_norm": 0.05343673378229141, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.983342707157135, + "step": 2 + }, + { + "epoch": 0.009170806266717615, + "grad_norm": 0.05608418956398964, + "learning_rate": 6.060606060606061e-07, + "loss": 1.0762118101119995, + "step": 3 + }, + { + "epoch": 0.01222774168895682, + "grad_norm": 0.06523486226797104, + "learning_rate": 9.090909090909091e-07, + "loss": 1.084489345550537, + "step": 4 + }, + { + "epoch": 0.015284677111196026, + "grad_norm": 0.06582186371088028, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.2037022113800049, + "step": 5 + }, + { + "epoch": 0.01834161253343523, + "grad_norm": 0.06097998470067978, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.10005784034729, + "step": 6 + }, + { + "epoch": 0.021398547955674436, + "grad_norm": 0.10365528613328934, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.0895193815231323, + "step": 7 + }, + { + "epoch": 0.02445548337791364, + "grad_norm": 0.06312141567468643, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.0593242645263672, + "step": 8 + }, + { + "epoch": 0.027512418800152847, + "grad_norm": 0.05508403480052948, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.9772955179214478, + "step": 9 + }, + { + "epoch": 0.030569354222392053, + "grad_norm": 0.06006711348891258, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.084238886833191, + "step": 10 + }, + { + "epoch": 0.033626289644631255, + "grad_norm": 0.0588749423623085, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.0786534547805786, + "step": 11 + }, + { + "epoch": 0.03668322506687046, + "grad_norm": 0.046551357954740524, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.0370622873306274, + "step": 12 + }, + { + "epoch": 0.039740160489109666, + "grad_norm": 0.061659567058086395, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.0646986961364746, + "step": 13 + }, + { + "epoch": 0.04279709591134887, + "grad_norm": 0.06007347255945206, + "learning_rate": 3.93939393939394e-06, + "loss": 1.0311307907104492, + "step": 14 + }, + { + "epoch": 0.04585403133358808, + "grad_norm": 0.07314135134220123, + "learning_rate": 4.242424242424243e-06, + "loss": 1.1300500631332397, + "step": 15 + }, + { + "epoch": 0.04891096675582728, + "grad_norm": 0.060934022068977356, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0197452306747437, + "step": 16 + }, + { + "epoch": 0.05196790217806649, + "grad_norm": 0.056856051087379456, + "learning_rate": 4.848484848484849e-06, + "loss": 1.0438549518585205, + "step": 17 + }, + { + "epoch": 0.055024837600305694, + "grad_norm": 0.05908689647912979, + "learning_rate": 5.151515151515152e-06, + "loss": 1.0398856401443481, + "step": 18 + }, + { + "epoch": 0.0580817730225449, + "grad_norm": 0.07411840558052063, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.107885479927063, + "step": 19 + }, + { + "epoch": 0.061138708444784105, + "grad_norm": 0.0749165341258049, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.1060967445373535, + "step": 20 + }, + { + "epoch": 0.06419564386702331, + "grad_norm": 0.06720177084207535, + "learning_rate": 6.060606060606061e-06, + "loss": 1.0471720695495605, + "step": 21 + }, + { + "epoch": 0.06725257928926251, + "grad_norm": 0.05990725755691528, + "learning_rate": 6.363636363636364e-06, + "loss": 1.0944981575012207, + "step": 22 + }, + { + "epoch": 0.07030951471150172, + "grad_norm": 0.06672193855047226, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1477092504501343, + "step": 23 + }, + { + "epoch": 0.07336645013374092, + "grad_norm": 0.06145205348730087, + "learning_rate": 6.969696969696971e-06, + "loss": 1.0591784715652466, + "step": 24 + }, + { + "epoch": 0.07642338555598013, + "grad_norm": 0.0757482647895813, + "learning_rate": 7.272727272727273e-06, + "loss": 1.0500165224075317, + "step": 25 + }, + { + "epoch": 0.07948032097821933, + "grad_norm": 0.07848478108644485, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.0747522115707397, + "step": 26 + }, + { + "epoch": 0.08253725640045854, + "grad_norm": 0.07740631699562073, + "learning_rate": 7.87878787878788e-06, + "loss": 1.132310152053833, + "step": 27 + }, + { + "epoch": 0.08559419182269774, + "grad_norm": 0.07476603239774704, + "learning_rate": 8.181818181818183e-06, + "loss": 1.0339502096176147, + "step": 28 + }, + { + "epoch": 0.08865112724493696, + "grad_norm": 0.0779196098446846, + "learning_rate": 8.484848484848486e-06, + "loss": 1.1047282218933105, + "step": 29 + }, + { + "epoch": 0.09170806266717615, + "grad_norm": 0.06962384283542633, + "learning_rate": 8.787878787878788e-06, + "loss": 1.004916787147522, + "step": 30 + }, + { + "epoch": 0.09476499808941537, + "grad_norm": 0.06369175016880035, + "learning_rate": 9.090909090909091e-06, + "loss": 0.9296417832374573, + "step": 31 + }, + { + "epoch": 0.09782193351165457, + "grad_norm": 0.07470260560512543, + "learning_rate": 9.393939393939396e-06, + "loss": 1.0721708536148071, + "step": 32 + }, + { + "epoch": 0.10087886893389378, + "grad_norm": 0.07948213815689087, + "learning_rate": 9.696969696969698e-06, + "loss": 1.0350117683410645, + "step": 33 + }, + { + "epoch": 0.10393580435613298, + "grad_norm": 0.07066022604703903, + "learning_rate": 1e-05, + "loss": 1.026305913925171, + "step": 34 + }, + { + "epoch": 0.10699273977837218, + "grad_norm": 0.07774543762207031, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.0509816408157349, + "step": 35 + }, + { + "epoch": 0.11004967520061139, + "grad_norm": 0.07501248270273209, + "learning_rate": 1.0606060606060606e-05, + "loss": 1.0011574029922485, + "step": 36 + }, + { + "epoch": 0.11310661062285059, + "grad_norm": 0.6622501611709595, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.9754424691200256, + "step": 37 + }, + { + "epoch": 0.1161635460450898, + "grad_norm": 0.07566080242395401, + "learning_rate": 1.1212121212121212e-05, + "loss": 1.0342774391174316, + "step": 38 + }, + { + "epoch": 0.119220481467329, + "grad_norm": 0.07573831081390381, + "learning_rate": 1.1515151515151517e-05, + "loss": 0.9714518785476685, + "step": 39 + }, + { + "epoch": 0.12227741688956821, + "grad_norm": 0.08083852380514145, + "learning_rate": 1.181818181818182e-05, + "loss": 1.1050316095352173, + "step": 40 + }, + { + "epoch": 0.12533435231180742, + "grad_norm": 0.08540588617324829, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.0871070623397827, + "step": 41 + }, + { + "epoch": 0.12839128773404662, + "grad_norm": 0.07391592115163803, + "learning_rate": 1.2424242424242425e-05, + "loss": 1.0206722021102905, + "step": 42 + }, + { + "epoch": 0.13144822315628582, + "grad_norm": 0.07063689082860947, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.9775047898292542, + "step": 43 + }, + { + "epoch": 0.13450515857852502, + "grad_norm": 0.07288888841867447, + "learning_rate": 1.3030303030303032e-05, + "loss": 1.1132858991622925, + "step": 44 + }, + { + "epoch": 0.13756209400076425, + "grad_norm": 0.07641777396202087, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.0707701444625854, + "step": 45 + }, + { + "epoch": 0.14061902942300344, + "grad_norm": 0.06990326195955276, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.9328265190124512, + "step": 46 + }, + { + "epoch": 0.14367596484524264, + "grad_norm": 0.0834241658449173, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.0131721496582031, + "step": 47 + }, + { + "epoch": 0.14673290026748184, + "grad_norm": 0.0714937075972557, + "learning_rate": 1.4242424242424245e-05, + "loss": 0.940493106842041, + "step": 48 + }, + { + "epoch": 0.14978983568972107, + "grad_norm": 0.07770547270774841, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.0435771942138672, + "step": 49 + }, + { + "epoch": 0.15284677111196027, + "grad_norm": 0.07950945198535919, + "learning_rate": 1.484848484848485e-05, + "loss": 1.0382137298583984, + "step": 50 + }, + { + "epoch": 0.15284677111196027, + "eval_loss": 1.0129202604293823, + "eval_runtime": 724.3664, + "eval_samples_per_second": 0.832, + "eval_steps_per_second": 0.832, + "step": 50 + }, + { + "epoch": 0.15590370653419947, + "grad_norm": 0.06961936503648758, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.9690049886703491, + "step": 51 + }, + { + "epoch": 0.15896064195643866, + "grad_norm": 0.069523885846138, + "learning_rate": 1.5454545454545454e-05, + "loss": 0.9830482006072998, + "step": 52 + }, + { + "epoch": 0.16201757737867786, + "grad_norm": 0.0764622762799263, + "learning_rate": 1.575757575757576e-05, + "loss": 1.0895472764968872, + "step": 53 + }, + { + "epoch": 0.1650745128009171, + "grad_norm": 0.1413721889257431, + "learning_rate": 1.606060606060606e-05, + "loss": 1.0354574918746948, + "step": 54 + }, + { + "epoch": 0.1681314482231563, + "grad_norm": 0.06818042695522308, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.8534265160560608, + "step": 55 + }, + { + "epoch": 0.1711883836453955, + "grad_norm": 0.0722246989607811, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.9580274820327759, + "step": 56 + }, + { + "epoch": 0.17424531906763469, + "grad_norm": 0.07113443315029144, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.0721848011016846, + "step": 57 + }, + { + "epoch": 0.1773022544898739, + "grad_norm": 0.08412107080221176, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.1180150508880615, + "step": 58 + }, + { + "epoch": 0.1803591899121131, + "grad_norm": 0.07381036877632141, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.0384547710418701, + "step": 59 + }, + { + "epoch": 0.1834161253343523, + "grad_norm": 0.07089001685380936, + "learning_rate": 1.787878787878788e-05, + "loss": 1.0446016788482666, + "step": 60 + }, + { + "epoch": 0.1864730607565915, + "grad_norm": 0.11576953530311584, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0015051364898682, + "step": 61 + }, + { + "epoch": 0.18952999617883073, + "grad_norm": 0.08030868321657181, + "learning_rate": 1.8484848484848487e-05, + "loss": 0.9642710089683533, + "step": 62 + }, + { + "epoch": 0.19258693160106993, + "grad_norm": 0.08332342654466629, + "learning_rate": 1.8787878787878792e-05, + "loss": 1.0722991228103638, + "step": 63 + }, + { + "epoch": 0.19564386702330913, + "grad_norm": 0.08000365644693375, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.0104647874832153, + "step": 64 + }, + { + "epoch": 0.19870080244554833, + "grad_norm": 0.08139508217573166, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9445061087608337, + "step": 65 + }, + { + "epoch": 0.20175773786778756, + "grad_norm": 0.08749893307685852, + "learning_rate": 1.96969696969697e-05, + "loss": 1.080810308456421, + "step": 66 + }, + { + "epoch": 0.20481467329002676, + "grad_norm": 0.0786912813782692, + "learning_rate": 2e-05, + "loss": 0.9705753922462463, + "step": 67 + }, + { + "epoch": 0.20787160871226595, + "grad_norm": 0.08962028473615646, + "learning_rate": 1.9999858236410775e-05, + "loss": 0.962783694267273, + "step": 68 + }, + { + "epoch": 0.21092854413450515, + "grad_norm": 0.08402887731790543, + "learning_rate": 1.9999432949662483e-05, + "loss": 0.9959614872932434, + "step": 69 + }, + { + "epoch": 0.21398547955674435, + "grad_norm": 0.08036444336175919, + "learning_rate": 1.9998724151813157e-05, + "loss": 0.9569960832595825, + "step": 70 + }, + { + "epoch": 0.21704241497898358, + "grad_norm": 0.08247046917676926, + "learning_rate": 1.9997731862959143e-05, + "loss": 1.0012171268463135, + "step": 71 + }, + { + "epoch": 0.22009935040122278, + "grad_norm": 0.08966264873743057, + "learning_rate": 1.999645611123453e-05, + "loss": 1.0403809547424316, + "step": 72 + }, + { + "epoch": 0.22315628582346198, + "grad_norm": 0.08061660826206207, + "learning_rate": 1.999489693281034e-05, + "loss": 1.0089740753173828, + "step": 73 + }, + { + "epoch": 0.22621322124570117, + "grad_norm": 0.09005365520715714, + "learning_rate": 1.9993054371893526e-05, + "loss": 0.9333044290542603, + "step": 74 + }, + { + "epoch": 0.2292701566679404, + "grad_norm": 0.08651519566774368, + "learning_rate": 1.9990928480725694e-05, + "loss": 0.9284015893936157, + "step": 75 + }, + { + "epoch": 0.2323270920901796, + "grad_norm": 0.08141147345304489, + "learning_rate": 1.9988519319581637e-05, + "loss": 0.9782730340957642, + "step": 76 + }, + { + "epoch": 0.2353840275124188, + "grad_norm": 0.08344405144453049, + "learning_rate": 1.998582695676762e-05, + "loss": 0.9723064303398132, + "step": 77 + }, + { + "epoch": 0.238440962934658, + "grad_norm": 0.08019903302192688, + "learning_rate": 1.998285146861945e-05, + "loss": 0.9648997783660889, + "step": 78 + }, + { + "epoch": 0.24149789835689722, + "grad_norm": 0.08113416284322739, + "learning_rate": 1.99795929395003e-05, + "loss": 0.9263214468955994, + "step": 79 + }, + { + "epoch": 0.24455483377913642, + "grad_norm": 0.08127513527870178, + "learning_rate": 1.997605146179833e-05, + "loss": 0.8745232224464417, + "step": 80 + }, + { + "epoch": 0.24761176920137562, + "grad_norm": 0.09934187680482864, + "learning_rate": 1.997222713592405e-05, + "loss": 0.8722782135009766, + "step": 81 + }, + { + "epoch": 0.25066870462361485, + "grad_norm": 0.09701363742351532, + "learning_rate": 1.9968120070307503e-05, + "loss": 1.0084266662597656, + "step": 82 + }, + { + "epoch": 0.253725640045854, + "grad_norm": 0.08335654437541962, + "learning_rate": 1.9963730381395154e-05, + "loss": 0.9239332675933838, + "step": 83 + }, + { + "epoch": 0.25678257546809324, + "grad_norm": 0.09161650389432907, + "learning_rate": 1.9959058193646618e-05, + "loss": 0.9878032207489014, + "step": 84 + }, + { + "epoch": 0.2598395108903324, + "grad_norm": 0.08067663013935089, + "learning_rate": 1.9954103639531116e-05, + "loss": 0.9113098382949829, + "step": 85 + }, + { + "epoch": 0.26289644631257164, + "grad_norm": 0.09619539976119995, + "learning_rate": 1.9948866859523717e-05, + "loss": 0.9527600407600403, + "step": 86 + }, + { + "epoch": 0.26595338173481087, + "grad_norm": 0.10015493631362915, + "learning_rate": 1.9943348002101374e-05, + "loss": 0.9569152593612671, + "step": 87 + }, + { + "epoch": 0.26901031715705004, + "grad_norm": 0.09012345969676971, + "learning_rate": 1.993754722373869e-05, + "loss": 0.8912045359611511, + "step": 88 + }, + { + "epoch": 0.27206725257928926, + "grad_norm": 0.10342805832624435, + "learning_rate": 1.9931464688903502e-05, + "loss": 0.856104850769043, + "step": 89 + }, + { + "epoch": 0.2751241880015285, + "grad_norm": 0.10218493640422821, + "learning_rate": 1.9925100570052194e-05, + "loss": 0.9631397128105164, + "step": 90 + }, + { + "epoch": 0.27818112342376766, + "grad_norm": 0.10909046977758408, + "learning_rate": 1.9918455047624847e-05, + "loss": 0.8532565236091614, + "step": 91 + }, + { + "epoch": 0.2812380588460069, + "grad_norm": 0.10714197903871536, + "learning_rate": 1.9911528310040073e-05, + "loss": 0.9691859483718872, + "step": 92 + }, + { + "epoch": 0.28429499426824606, + "grad_norm": 0.1108694076538086, + "learning_rate": 1.990432055368971e-05, + "loss": 0.9374334812164307, + "step": 93 + }, + { + "epoch": 0.2873519296904853, + "grad_norm": 0.10037308186292648, + "learning_rate": 1.989683198293324e-05, + "loss": 0.9166896343231201, + "step": 94 + }, + { + "epoch": 0.2904088651127245, + "grad_norm": 0.10246684402227402, + "learning_rate": 1.9889062810092002e-05, + "loss": 1.0059239864349365, + "step": 95 + }, + { + "epoch": 0.2934658005349637, + "grad_norm": 0.09954962879419327, + "learning_rate": 1.9881013255443152e-05, + "loss": 1.00413179397583, + "step": 96 + }, + { + "epoch": 0.2965227359572029, + "grad_norm": 0.11006761342287064, + "learning_rate": 1.9872683547213446e-05, + "loss": 0.9414035677909851, + "step": 97 + }, + { + "epoch": 0.29957967137944214, + "grad_norm": 0.1014382541179657, + "learning_rate": 1.9864073921572756e-05, + "loss": 0.9155468940734863, + "step": 98 + }, + { + "epoch": 0.3026366068016813, + "grad_norm": 0.09883157908916473, + "learning_rate": 1.9855184622627362e-05, + "loss": 0.9429305195808411, + "step": 99 + }, + { + "epoch": 0.30569354222392053, + "grad_norm": 0.11199072748422623, + "learning_rate": 1.9846015902413053e-05, + "loss": 0.9143528342247009, + "step": 100 + }, + { + "epoch": 0.30569354222392053, + "eval_loss": 0.884428083896637, + "eval_runtime": 723.8143, + "eval_samples_per_second": 0.833, + "eval_steps_per_second": 0.833, + "step": 100 + }, + { + "epoch": 0.3087504776461597, + "grad_norm": 0.10796016454696655, + "learning_rate": 1.9836568020887963e-05, + "loss": 0.9726455211639404, + "step": 101 + }, + { + "epoch": 0.31180741306839893, + "grad_norm": 0.10056383162736893, + "learning_rate": 1.982684124592521e-05, + "loss": 0.8932135701179504, + "step": 102 + }, + { + "epoch": 0.31486434849063816, + "grad_norm": 0.10836594551801682, + "learning_rate": 1.9816835853305306e-05, + "loss": 0.919749915599823, + "step": 103 + }, + { + "epoch": 0.31792128391287733, + "grad_norm": 0.12032149732112885, + "learning_rate": 1.9806552126708322e-05, + "loss": 0.871781587600708, + "step": 104 + }, + { + "epoch": 0.32097821933511655, + "grad_norm": 0.10854160040616989, + "learning_rate": 1.9795990357705853e-05, + "loss": 0.8587784171104431, + "step": 105 + }, + { + "epoch": 0.3240351547573557, + "grad_norm": 0.10819399356842041, + "learning_rate": 1.978515084575276e-05, + "loss": 0.8524806499481201, + "step": 106 + }, + { + "epoch": 0.32709209017959495, + "grad_norm": 0.10226067155599594, + "learning_rate": 1.9774033898178668e-05, + "loss": 0.7892144918441772, + "step": 107 + }, + { + "epoch": 0.3301490256018342, + "grad_norm": 0.1071159616112709, + "learning_rate": 1.976263983017925e-05, + "loss": 0.8833234906196594, + "step": 108 + }, + { + "epoch": 0.33320596102407335, + "grad_norm": 0.11434526741504669, + "learning_rate": 1.9750968964807305e-05, + "loss": 0.861842155456543, + "step": 109 + }, + { + "epoch": 0.3362628964463126, + "grad_norm": 0.1159641221165657, + "learning_rate": 1.9739021632963584e-05, + "loss": 0.8987889289855957, + "step": 110 + }, + { + "epoch": 0.3393198318685518, + "grad_norm": 0.12371373921632767, + "learning_rate": 1.9726798173387417e-05, + "loss": 0.9710193872451782, + "step": 111 + }, + { + "epoch": 0.342376767290791, + "grad_norm": 0.11441531032323837, + "learning_rate": 1.97142989326471e-05, + "loss": 0.8199151158332825, + "step": 112 + }, + { + "epoch": 0.3454337027130302, + "grad_norm": 0.11842846125364304, + "learning_rate": 1.9701524265130088e-05, + "loss": 0.8845276236534119, + "step": 113 + }, + { + "epoch": 0.34849063813526937, + "grad_norm": 0.10813732445240021, + "learning_rate": 1.9688474533032916e-05, + "loss": 0.7964264750480652, + "step": 114 + }, + { + "epoch": 0.3515475735575086, + "grad_norm": 0.11050347238779068, + "learning_rate": 1.9675150106350957e-05, + "loss": 0.9630422592163086, + "step": 115 + }, + { + "epoch": 0.3546045089797478, + "grad_norm": 0.10537250339984894, + "learning_rate": 1.9661551362867926e-05, + "loss": 0.7706905007362366, + "step": 116 + }, + { + "epoch": 0.357661444401987, + "grad_norm": 0.11390368640422821, + "learning_rate": 1.9647678688145163e-05, + "loss": 0.8541204929351807, + "step": 117 + }, + { + "epoch": 0.3607183798242262, + "grad_norm": 0.10318922251462936, + "learning_rate": 1.963353247551069e-05, + "loss": 0.7400562763214111, + "step": 118 + }, + { + "epoch": 0.3637753152464654, + "grad_norm": 0.1347586214542389, + "learning_rate": 1.9619113126048086e-05, + "loss": 0.9232871532440186, + "step": 119 + }, + { + "epoch": 0.3668322506687046, + "grad_norm": 0.11458177119493484, + "learning_rate": 1.96044210485851e-05, + "loss": 0.833285927772522, + "step": 120 + }, + { + "epoch": 0.36988918609094384, + "grad_norm": 0.12361041456460953, + "learning_rate": 1.958945665968206e-05, + "loss": 0.7887391448020935, + "step": 121 + }, + { + "epoch": 0.372946121513183, + "grad_norm": 0.11985408514738083, + "learning_rate": 1.9574220383620054e-05, + "loss": 0.8206446170806885, + "step": 122 + }, + { + "epoch": 0.37600305693542224, + "grad_norm": 0.1355939507484436, + "learning_rate": 1.9558712652388932e-05, + "loss": 0.7648542523384094, + "step": 123 + }, + { + "epoch": 0.37905999235766147, + "grad_norm": 0.1229313388466835, + "learning_rate": 1.954293390567501e-05, + "loss": 0.8573335409164429, + "step": 124 + }, + { + "epoch": 0.38211692777990064, + "grad_norm": 0.11425124108791351, + "learning_rate": 1.9526884590848646e-05, + "loss": 0.7412531971931458, + "step": 125 + }, + { + "epoch": 0.38517386320213987, + "grad_norm": 0.12430041283369064, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.8098543882369995, + "step": 126 + }, + { + "epoch": 0.38823079862437904, + "grad_norm": 0.12492368370294571, + "learning_rate": 1.9493976084683814e-05, + "loss": 0.8814713954925537, + "step": 127 + }, + { + "epoch": 0.39128773404661826, + "grad_norm": 0.14428824186325073, + "learning_rate": 1.9477117826390934e-05, + "loss": 0.8231979608535767, + "step": 128 + }, + { + "epoch": 0.3943446694688575, + "grad_norm": 0.12010085582733154, + "learning_rate": 1.9459990866050337e-05, + "loss": 0.7015627026557922, + "step": 129 + }, + { + "epoch": 0.39740160489109666, + "grad_norm": 0.11819776892662048, + "learning_rate": 1.9442595689257898e-05, + "loss": 0.8086729645729065, + "step": 130 + }, + { + "epoch": 0.4004585403133359, + "grad_norm": 0.12211033701896667, + "learning_rate": 1.9424932789214158e-05, + "loss": 0.8234002590179443, + "step": 131 + }, + { + "epoch": 0.4035154757355751, + "grad_norm": 0.14926476776599884, + "learning_rate": 1.9407002666710334e-05, + "loss": 0.874608039855957, + "step": 132 + }, + { + "epoch": 0.4065724111578143, + "grad_norm": 0.13012923300266266, + "learning_rate": 1.9388805830114132e-05, + "loss": 0.8491607904434204, + "step": 133 + }, + { + "epoch": 0.4096293465800535, + "grad_norm": 0.12012261897325516, + "learning_rate": 1.937034279535533e-05, + "loss": 0.7269159555435181, + "step": 134 + }, + { + "epoch": 0.4126862820022927, + "grad_norm": 0.15302567183971405, + "learning_rate": 1.9351614085911134e-05, + "loss": 0.8560839891433716, + "step": 135 + }, + { + "epoch": 0.4157432174245319, + "grad_norm": 0.12234190106391907, + "learning_rate": 1.933262023279137e-05, + "loss": 0.8211904764175415, + "step": 136 + }, + { + "epoch": 0.41880015284677113, + "grad_norm": 0.14427296817302704, + "learning_rate": 1.9313361774523387e-05, + "loss": 0.8500057458877563, + "step": 137 + }, + { + "epoch": 0.4218570882690103, + "grad_norm": 0.1314094066619873, + "learning_rate": 1.929383925713682e-05, + "loss": 0.7589091658592224, + "step": 138 + }, + { + "epoch": 0.42491402369124953, + "grad_norm": 0.1576734483242035, + "learning_rate": 1.92740532341481e-05, + "loss": 0.7581073641777039, + "step": 139 + }, + { + "epoch": 0.4279709591134887, + "grad_norm": 0.15788713097572327, + "learning_rate": 1.925400426654475e-05, + "loss": 0.809050440788269, + "step": 140 + }, + { + "epoch": 0.43102789453572793, + "grad_norm": 0.13364559412002563, + "learning_rate": 1.9233692922769497e-05, + "loss": 0.7990086078643799, + "step": 141 + }, + { + "epoch": 0.43408482995796716, + "grad_norm": 0.14786465466022491, + "learning_rate": 1.921311977870413e-05, + "loss": 0.8675815463066101, + "step": 142 + }, + { + "epoch": 0.4371417653802063, + "grad_norm": 0.14621882140636444, + "learning_rate": 1.9192285417653208e-05, + "loss": 0.8713765740394592, + "step": 143 + }, + { + "epoch": 0.44019870080244555, + "grad_norm": 0.12874048948287964, + "learning_rate": 1.917119043032749e-05, + "loss": 0.7361871004104614, + "step": 144 + }, + { + "epoch": 0.4432556362246848, + "grad_norm": 0.12183775007724762, + "learning_rate": 1.9149835414827193e-05, + "loss": 0.7311941385269165, + "step": 145 + }, + { + "epoch": 0.44631257164692395, + "grad_norm": 0.1397160291671753, + "learning_rate": 1.912822097662505e-05, + "loss": 0.8189159035682678, + "step": 146 + }, + { + "epoch": 0.4493695070691632, + "grad_norm": 0.1458273082971573, + "learning_rate": 1.9106347728549134e-05, + "loss": 0.8288135528564453, + "step": 147 + }, + { + "epoch": 0.45242644249140235, + "grad_norm": 0.16898781061172485, + "learning_rate": 1.908421629076547e-05, + "loss": 0.7878037095069885, + "step": 148 + }, + { + "epoch": 0.4554833779136416, + "grad_norm": 0.1638474315404892, + "learning_rate": 1.9061827290760466e-05, + "loss": 0.8059952259063721, + "step": 149 + }, + { + "epoch": 0.4585403133358808, + "grad_norm": 0.14130882918834686, + "learning_rate": 1.9039181363323128e-05, + "loss": 0.7346830368041992, + "step": 150 + }, + { + "epoch": 0.4585403133358808, + "eval_loss": 0.7979016900062561, + "eval_runtime": 828.6295, + "eval_samples_per_second": 0.728, + "eval_steps_per_second": 0.728, + "step": 150 + }, + { + "epoch": 0.46159724875811997, + "grad_norm": 0.14427433907985687, + "learning_rate": 1.9016279150527044e-05, + "loss": 0.7583403587341309, + "step": 151 + }, + { + "epoch": 0.4646541841803592, + "grad_norm": 0.1515798568725586, + "learning_rate": 1.8993121301712194e-05, + "loss": 0.7908380031585693, + "step": 152 + }, + { + "epoch": 0.46771111960259837, + "grad_norm": 0.14444488286972046, + "learning_rate": 1.896970847346653e-05, + "loss": 0.7916130423545837, + "step": 153 + }, + { + "epoch": 0.4707680550248376, + "grad_norm": 0.1460912823677063, + "learning_rate": 1.8946041329607364e-05, + "loss": 0.7750643491744995, + "step": 154 + }, + { + "epoch": 0.4738249904470768, + "grad_norm": 0.13896244764328003, + "learning_rate": 1.892212054116255e-05, + "loss": 0.8059666156768799, + "step": 155 + }, + { + "epoch": 0.476881925869316, + "grad_norm": 0.16133630275726318, + "learning_rate": 1.889794678635145e-05, + "loss": 0.8327827453613281, + "step": 156 + }, + { + "epoch": 0.4799388612915552, + "grad_norm": 0.1474636346101761, + "learning_rate": 1.8873520750565716e-05, + "loss": 0.8498989343643188, + "step": 157 + }, + { + "epoch": 0.48299579671379445, + "grad_norm": 0.17222349345684052, + "learning_rate": 1.884884312634985e-05, + "loss": 0.7750177979469299, + "step": 158 + }, + { + "epoch": 0.4860527321360336, + "grad_norm": 0.15558090806007385, + "learning_rate": 1.8823914613381568e-05, + "loss": 0.7326169013977051, + "step": 159 + }, + { + "epoch": 0.48910966755827284, + "grad_norm": 0.13808321952819824, + "learning_rate": 1.8798735918451963e-05, + "loss": 0.8308709859848022, + "step": 160 + }, + { + "epoch": 0.492166602980512, + "grad_norm": 0.1761898398399353, + "learning_rate": 1.8773307755445468e-05, + "loss": 0.7805465459823608, + "step": 161 + }, + { + "epoch": 0.49522353840275124, + "grad_norm": 0.160477414727211, + "learning_rate": 1.874763084531961e-05, + "loss": 0.8538846969604492, + "step": 162 + }, + { + "epoch": 0.49828047382499047, + "grad_norm": 0.15238745510578156, + "learning_rate": 1.872170591608459e-05, + "loss": 0.8801217675209045, + "step": 163 + }, + { + "epoch": 0.5013374092472297, + "grad_norm": 0.1567080318927765, + "learning_rate": 1.86955337027826e-05, + "loss": 0.7205259799957275, + "step": 164 + }, + { + "epoch": 0.5043943446694689, + "grad_norm": 0.13637851178646088, + "learning_rate": 1.866911494746702e-05, + "loss": 0.7636491656303406, + "step": 165 + }, + { + "epoch": 0.507451280091708, + "grad_norm": 0.15563489496707916, + "learning_rate": 1.8642450399181373e-05, + "loss": 0.7982497811317444, + "step": 166 + }, + { + "epoch": 0.5105082155139473, + "grad_norm": 0.15503396093845367, + "learning_rate": 1.8615540813938063e-05, + "loss": 0.8737778067588806, + "step": 167 + }, + { + "epoch": 0.5135651509361865, + "grad_norm": 0.16095557808876038, + "learning_rate": 1.8588386954696972e-05, + "loss": 0.796604335308075, + "step": 168 + }, + { + "epoch": 0.5166220863584257, + "grad_norm": 0.1713593453168869, + "learning_rate": 1.856098959134381e-05, + "loss": 0.8247392177581787, + "step": 169 + }, + { + "epoch": 0.5196790217806648, + "grad_norm": 0.18239113688468933, + "learning_rate": 1.8533349500668295e-05, + "loss": 0.7838484644889832, + "step": 170 + }, + { + "epoch": 0.5227359572029041, + "grad_norm": 0.15745767951011658, + "learning_rate": 1.850546746634211e-05, + "loss": 0.7856907248497009, + "step": 171 + }, + { + "epoch": 0.5257928926251433, + "grad_norm": 0.16820666193962097, + "learning_rate": 1.8477344278896708e-05, + "loss": 0.7829679846763611, + "step": 172 + }, + { + "epoch": 0.5288498280473825, + "grad_norm": 0.16975544393062592, + "learning_rate": 1.84489807357009e-05, + "loss": 0.7374375462532043, + "step": 173 + }, + { + "epoch": 0.5319067634696217, + "grad_norm": 0.167228102684021, + "learning_rate": 1.8420377640938204e-05, + "loss": 0.712837815284729, + "step": 174 + }, + { + "epoch": 0.5349636988918609, + "grad_norm": 0.15955154597759247, + "learning_rate": 1.839153580558411e-05, + "loss": 0.7645693421363831, + "step": 175 + }, + { + "epoch": 0.5380206343141001, + "grad_norm": 0.18378689885139465, + "learning_rate": 1.8362456047383032e-05, + "loss": 0.7974956631660461, + "step": 176 + }, + { + "epoch": 0.5410775697363394, + "grad_norm": 0.15777672827243805, + "learning_rate": 1.833313919082515e-05, + "loss": 0.8957571983337402, + "step": 177 + }, + { + "epoch": 0.5441345051585785, + "grad_norm": 0.15292386710643768, + "learning_rate": 1.8303586067123028e-05, + "loss": 0.7635619044303894, + "step": 178 + }, + { + "epoch": 0.5471914405808177, + "grad_norm": 0.178152397274971, + "learning_rate": 1.8273797514188043e-05, + "loss": 0.7849246263504028, + "step": 179 + }, + { + "epoch": 0.550248376003057, + "grad_norm": 0.15916013717651367, + "learning_rate": 1.824377437660663e-05, + "loss": 0.6975343227386475, + "step": 180 + }, + { + "epoch": 0.5533053114252962, + "grad_norm": 0.18172231316566467, + "learning_rate": 1.821351750561634e-05, + "loss": 0.7675164341926575, + "step": 181 + }, + { + "epoch": 0.5563622468475353, + "grad_norm": 0.16241903603076935, + "learning_rate": 1.818302775908169e-05, + "loss": 0.7950343489646912, + "step": 182 + }, + { + "epoch": 0.5594191822697746, + "grad_norm": 0.18727579712867737, + "learning_rate": 1.8152306001469875e-05, + "loss": 0.787315309047699, + "step": 183 + }, + { + "epoch": 0.5624761176920138, + "grad_norm": 0.1627933531999588, + "learning_rate": 1.8121353103826213e-05, + "loss": 0.7141211628913879, + "step": 184 + }, + { + "epoch": 0.565533053114253, + "grad_norm": 0.4369247555732727, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.8476608395576477, + "step": 185 + }, + { + "epoch": 0.5685899885364921, + "grad_norm": 0.16494786739349365, + "learning_rate": 1.8058757405367003e-05, + "loss": 0.720562756061554, + "step": 186 + }, + { + "epoch": 0.5716469239587314, + "grad_norm": 0.175015389919281, + "learning_rate": 1.8027116379309637e-05, + "loss": 0.7589252591133118, + "step": 187 + }, + { + "epoch": 0.5747038593809706, + "grad_norm": 0.1769978553056717, + "learning_rate": 1.799524776268646e-05, + "loss": 0.7644155621528625, + "step": 188 + }, + { + "epoch": 0.5777607948032097, + "grad_norm": 0.18481792509555817, + "learning_rate": 1.796315245905936e-05, + "loss": 0.7885835766792297, + "step": 189 + }, + { + "epoch": 0.580817730225449, + "grad_norm": 0.1668689250946045, + "learning_rate": 1.7930831378417437e-05, + "loss": 0.7377231121063232, + "step": 190 + }, + { + "epoch": 0.5838746656476882, + "grad_norm": 0.178734689950943, + "learning_rate": 1.7898285437151163e-05, + "loss": 0.7388894557952881, + "step": 191 + }, + { + "epoch": 0.5869316010699274, + "grad_norm": 0.1740068644285202, + "learning_rate": 1.786551555802643e-05, + "loss": 0.8209859728813171, + "step": 192 + }, + { + "epoch": 0.5899885364921666, + "grad_norm": 0.19211041927337646, + "learning_rate": 1.783252267015837e-05, + "loss": 0.7305737733840942, + "step": 193 + }, + { + "epoch": 0.5930454719144058, + "grad_norm": 0.16644936800003052, + "learning_rate": 1.779930770898503e-05, + "loss": 0.7760804891586304, + "step": 194 + }, + { + "epoch": 0.596102407336645, + "grad_norm": 0.1773686707019806, + "learning_rate": 1.776587161624083e-05, + "loss": 0.7879236936569214, + "step": 195 + }, + { + "epoch": 0.5991593427588843, + "grad_norm": 0.17508819699287415, + "learning_rate": 1.7732215339929874e-05, + "loss": 0.7307407259941101, + "step": 196 + }, + { + "epoch": 0.6022162781811234, + "grad_norm": 0.17211101949214935, + "learning_rate": 1.7698339834299064e-05, + "loss": 0.7293214797973633, + "step": 197 + }, + { + "epoch": 0.6052732136033626, + "grad_norm": 0.18085215985774994, + "learning_rate": 1.7664246059811058e-05, + "loss": 0.763083279132843, + "step": 198 + }, + { + "epoch": 0.6083301490256018, + "grad_norm": 0.20243075489997864, + "learning_rate": 1.7629934983117025e-05, + "loss": 0.7372676134109497, + "step": 199 + }, + { + "epoch": 0.6113870844478411, + "grad_norm": 0.18152795732021332, + "learning_rate": 1.759540757702924e-05, + "loss": 0.7121898531913757, + "step": 200 + }, + { + "epoch": 0.6113870844478411, + "eval_loss": 0.7551760673522949, + "eval_runtime": 900.209, + "eval_samples_per_second": 0.67, + "eval_steps_per_second": 0.67, + "step": 200 + }, + { + "epoch": 0.6144440198700802, + "grad_norm": 0.18808062374591827, + "learning_rate": 1.7560664820493502e-05, + "loss": 0.734307050704956, + "step": 201 + }, + { + "epoch": 0.6175009552923194, + "grad_norm": 0.18151243031024933, + "learning_rate": 1.7525707698561383e-05, + "loss": 0.7998429536819458, + "step": 202 + }, + { + "epoch": 0.6205578907145587, + "grad_norm": 0.19583043456077576, + "learning_rate": 1.7490537202362313e-05, + "loss": 0.7546265721321106, + "step": 203 + }, + { + "epoch": 0.6236148261367979, + "grad_norm": 0.2508557140827179, + "learning_rate": 1.7455154329075427e-05, + "loss": 0.7810050249099731, + "step": 204 + }, + { + "epoch": 0.626671761559037, + "grad_norm": 0.1685105562210083, + "learning_rate": 1.741956008190136e-05, + "loss": 0.7558917999267578, + "step": 205 + }, + { + "epoch": 0.6297286969812763, + "grad_norm": 0.18195222318172455, + "learning_rate": 1.7383755470033756e-05, + "loss": 0.7216942310333252, + "step": 206 + }, + { + "epoch": 0.6327856324035155, + "grad_norm": 0.1878063678741455, + "learning_rate": 1.7347741508630673e-05, + "loss": 0.7417092323303223, + "step": 207 + }, + { + "epoch": 0.6358425678257547, + "grad_norm": 0.25273698568344116, + "learning_rate": 1.73115192187858e-05, + "loss": 0.807498037815094, + "step": 208 + }, + { + "epoch": 0.6388995032479939, + "grad_norm": 0.2451465129852295, + "learning_rate": 1.7275089627499493e-05, + "loss": 0.7557163238525391, + "step": 209 + }, + { + "epoch": 0.6419564386702331, + "grad_norm": 0.19272617995738983, + "learning_rate": 1.7238453767649683e-05, + "loss": 0.8285109996795654, + "step": 210 + }, + { + "epoch": 0.6450133740924723, + "grad_norm": 0.1869518756866455, + "learning_rate": 1.720161267796256e-05, + "loss": 0.7824444770812988, + "step": 211 + }, + { + "epoch": 0.6480703095147115, + "grad_norm": 0.2029627561569214, + "learning_rate": 1.7164567402983153e-05, + "loss": 0.7018642425537109, + "step": 212 + }, + { + "epoch": 0.6511272449369507, + "grad_norm": 0.23215501010417938, + "learning_rate": 1.7127318993045686e-05, + "loss": 0.7263948917388916, + "step": 213 + }, + { + "epoch": 0.6541841803591899, + "grad_norm": 0.19869184494018555, + "learning_rate": 1.7089868504243816e-05, + "loss": 0.8285576105117798, + "step": 214 + }, + { + "epoch": 0.6572411157814291, + "grad_norm": 0.22871531546115875, + "learning_rate": 1.705221699840069e-05, + "loss": 0.7871490716934204, + "step": 215 + }, + { + "epoch": 0.6602980512036684, + "grad_norm": 0.17945580184459686, + "learning_rate": 1.701436554303882e-05, + "loss": 0.740180492401123, + "step": 216 + }, + { + "epoch": 0.6633549866259075, + "grad_norm": 0.20516762137413025, + "learning_rate": 1.6976315211349848e-05, + "loss": 0.7542892098426819, + "step": 217 + }, + { + "epoch": 0.6664119220481467, + "grad_norm": 0.22108283638954163, + "learning_rate": 1.6938067082164093e-05, + "loss": 0.8117404580116272, + "step": 218 + }, + { + "epoch": 0.669468857470386, + "grad_norm": 0.22329698503017426, + "learning_rate": 1.6899622239919965e-05, + "loss": 0.8002716898918152, + "step": 219 + }, + { + "epoch": 0.6725257928926252, + "grad_norm": 0.23545362055301666, + "learning_rate": 1.6860981774633228e-05, + "loss": 0.7750573754310608, + "step": 220 + }, + { + "epoch": 0.6755827283148643, + "grad_norm": 0.21816480159759521, + "learning_rate": 1.6822146781866097e-05, + "loss": 0.8051223754882812, + "step": 221 + }, + { + "epoch": 0.6786396637371036, + "grad_norm": 0.18638508021831512, + "learning_rate": 1.6783118362696162e-05, + "loss": 0.7286484241485596, + "step": 222 + }, + { + "epoch": 0.6816965991593428, + "grad_norm": 0.16794732213020325, + "learning_rate": 1.6743897623685178e-05, + "loss": 0.7001460194587708, + "step": 223 + }, + { + "epoch": 0.684753534581582, + "grad_norm": 0.21157318353652954, + "learning_rate": 1.6704485676847695e-05, + "loss": 0.7479901313781738, + "step": 224 + }, + { + "epoch": 0.6878104700038211, + "grad_norm": 0.35601308941841125, + "learning_rate": 1.666488363961952e-05, + "loss": 0.7660019397735596, + "step": 225 + }, + { + "epoch": 0.6908674054260604, + "grad_norm": 0.17416611313819885, + "learning_rate": 1.662509263482604e-05, + "loss": 0.7157142162322998, + "step": 226 + }, + { + "epoch": 0.6939243408482996, + "grad_norm": 0.19655123353004456, + "learning_rate": 1.658511379065039e-05, + "loss": 0.7894638776779175, + "step": 227 + }, + { + "epoch": 0.6969812762705387, + "grad_norm": 0.2034345269203186, + "learning_rate": 1.6544948240601453e-05, + "loss": 0.6853711009025574, + "step": 228 + }, + { + "epoch": 0.700038211692778, + "grad_norm": 0.199235200881958, + "learning_rate": 1.6504597123481737e-05, + "loss": 0.7487372756004333, + "step": 229 + }, + { + "epoch": 0.7030951471150172, + "grad_norm": 0.20407404005527496, + "learning_rate": 1.6464061583355088e-05, + "loss": 0.7335573434829712, + "step": 230 + }, + { + "epoch": 0.7061520825372564, + "grad_norm": 0.22096174955368042, + "learning_rate": 1.6423342769514227e-05, + "loss": 0.7659798264503479, + "step": 231 + }, + { + "epoch": 0.7092090179594956, + "grad_norm": 0.1916825920343399, + "learning_rate": 1.6382441836448203e-05, + "loss": 0.7162011861801147, + "step": 232 + }, + { + "epoch": 0.7122659533817348, + "grad_norm": 0.20505093038082123, + "learning_rate": 1.6341359943809626e-05, + "loss": 0.6957600116729736, + "step": 233 + }, + { + "epoch": 0.715322888803974, + "grad_norm": 0.19968082010746002, + "learning_rate": 1.6300098256381807e-05, + "loss": 0.6724053025245667, + "step": 234 + }, + { + "epoch": 0.7183798242262133, + "grad_norm": 0.19768832623958588, + "learning_rate": 1.625865794404573e-05, + "loss": 0.774741530418396, + "step": 235 + }, + { + "epoch": 0.7214367596484524, + "grad_norm": 0.19257694482803345, + "learning_rate": 1.621704018174688e-05, + "loss": 0.6658651828765869, + "step": 236 + }, + { + "epoch": 0.7244936950706916, + "grad_norm": 0.21594858169555664, + "learning_rate": 1.617524614946192e-05, + "loss": 0.810744047164917, + "step": 237 + }, + { + "epoch": 0.7275506304929308, + "grad_norm": 0.2107633650302887, + "learning_rate": 1.6133277032165264e-05, + "loss": 0.7623897194862366, + "step": 238 + }, + { + "epoch": 0.7306075659151701, + "grad_norm": 0.20114055275917053, + "learning_rate": 1.6091134019795447e-05, + "loss": 0.7082816362380981, + "step": 239 + }, + { + "epoch": 0.7336645013374092, + "grad_norm": 0.2542732059955597, + "learning_rate": 1.604881830722141e-05, + "loss": 0.7051193714141846, + "step": 240 + }, + { + "epoch": 0.7367214367596484, + "grad_norm": 0.19180485606193542, + "learning_rate": 1.600633109420861e-05, + "loss": 0.7895385026931763, + "step": 241 + }, + { + "epoch": 0.7397783721818877, + "grad_norm": 0.368756502866745, + "learning_rate": 1.5963673585385016e-05, + "loss": 0.7146293520927429, + "step": 242 + }, + { + "epoch": 0.7428353076041269, + "grad_norm": 0.18490125238895416, + "learning_rate": 1.5920846990206934e-05, + "loss": 0.650428056716919, + "step": 243 + }, + { + "epoch": 0.745892243026366, + "grad_norm": 0.23592503368854523, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.6367110013961792, + "step": 244 + }, + { + "epoch": 0.7489491784486053, + "grad_norm": 0.20223264396190643, + "learning_rate": 1.5834691402548415e-05, + "loss": 0.6563615798950195, + "step": 245 + }, + { + "epoch": 0.7520061138708445, + "grad_norm": 0.27459946274757385, + "learning_rate": 1.5791364852813047e-05, + "loss": 0.7361881136894226, + "step": 246 + }, + { + "epoch": 0.7550630492930837, + "grad_norm": 0.21085411310195923, + "learning_rate": 1.5747874102144073e-05, + "loss": 0.7373813390731812, + "step": 247 + }, + { + "epoch": 0.7581199847153229, + "grad_norm": 0.23332320153713226, + "learning_rate": 1.5704220383622464e-05, + "loss": 0.6971457004547119, + "step": 248 + }, + { + "epoch": 0.7611769201375621, + "grad_norm": 0.23525936901569366, + "learning_rate": 1.5660404934949798e-05, + "loss": 0.6756627559661865, + "step": 249 + }, + { + "epoch": 0.7642338555598013, + "grad_norm": 0.2150791585445404, + "learning_rate": 1.5616428998413122e-05, + "loss": 0.7029792666435242, + "step": 250 + }, + { + "epoch": 0.7642338555598013, + "eval_loss": 0.7269901633262634, + "eval_runtime": 877.665, + "eval_samples_per_second": 0.687, + "eval_steps_per_second": 0.687, + "step": 250 + }, + { + "epoch": 0.7672907909820404, + "grad_norm": 0.19510552287101746, + "learning_rate": 1.5572293820849754e-05, + "loss": 0.715162992477417, + "step": 251 + }, + { + "epoch": 0.7703477264042797, + "grad_norm": 0.25246763229370117, + "learning_rate": 1.5528000653611935e-05, + "loss": 0.634660542011261, + "step": 252 + }, + { + "epoch": 0.7734046618265189, + "grad_norm": 0.2980027496814728, + "learning_rate": 1.5483550752531337e-05, + "loss": 0.7154463529586792, + "step": 253 + }, + { + "epoch": 0.7764615972487581, + "grad_norm": 0.2730556130409241, + "learning_rate": 1.5438945377883463e-05, + "loss": 0.8110946416854858, + "step": 254 + }, + { + "epoch": 0.7795185326709974, + "grad_norm": 0.17258886992931366, + "learning_rate": 1.5394185794351914e-05, + "loss": 0.72202467918396, + "step": 255 + }, + { + "epoch": 0.7825754680932365, + "grad_norm": 0.19966280460357666, + "learning_rate": 1.5349273270992537e-05, + "loss": 0.7368704080581665, + "step": 256 + }, + { + "epoch": 0.7856324035154757, + "grad_norm": 0.23305682837963104, + "learning_rate": 1.5304209081197425e-05, + "loss": 0.7429723143577576, + "step": 257 + }, + { + "epoch": 0.788689338937715, + "grad_norm": 0.21786810457706451, + "learning_rate": 1.5258994502658846e-05, + "loss": 0.6498424410820007, + "step": 258 + }, + { + "epoch": 0.7917462743599541, + "grad_norm": 0.2370925396680832, + "learning_rate": 1.5213630817332985e-05, + "loss": 0.7379459142684937, + "step": 259 + }, + { + "epoch": 0.7948032097821933, + "grad_norm": 0.25566384196281433, + "learning_rate": 1.5168119311403611e-05, + "loss": 0.6742876172065735, + "step": 260 + }, + { + "epoch": 0.7978601452044326, + "grad_norm": 0.2171633243560791, + "learning_rate": 1.512246127524561e-05, + "loss": 0.72329181432724, + "step": 261 + }, + { + "epoch": 0.8009170806266718, + "grad_norm": 0.23292019963264465, + "learning_rate": 1.50766580033884e-05, + "loss": 0.765812873840332, + "step": 262 + }, + { + "epoch": 0.8039740160489109, + "grad_norm": 0.19427980482578278, + "learning_rate": 1.5030710794479226e-05, + "loss": 0.7872639298439026, + "step": 263 + }, + { + "epoch": 0.8070309514711502, + "grad_norm": 0.2460346817970276, + "learning_rate": 1.4984620951246333e-05, + "loss": 0.6940722465515137, + "step": 264 + }, + { + "epoch": 0.8100878868933894, + "grad_norm": 0.2493411898612976, + "learning_rate": 1.4938389780462044e-05, + "loss": 0.7680137157440186, + "step": 265 + }, + { + "epoch": 0.8131448223156286, + "grad_norm": 0.23873573541641235, + "learning_rate": 1.4892018592905702e-05, + "loss": 0.6780916452407837, + "step": 266 + }, + { + "epoch": 0.8162017577378677, + "grad_norm": 0.2580571174621582, + "learning_rate": 1.4845508703326504e-05, + "loss": 0.7183764576911926, + "step": 267 + }, + { + "epoch": 0.819258693160107, + "grad_norm": 0.2125079482793808, + "learning_rate": 1.4798861430406221e-05, + "loss": 0.8207096457481384, + "step": 268 + }, + { + "epoch": 0.8223156285823462, + "grad_norm": 0.21065691113471985, + "learning_rate": 1.4752078096721827e-05, + "loss": 0.7414214611053467, + "step": 269 + }, + { + "epoch": 0.8253725640045854, + "grad_norm": 0.25807511806488037, + "learning_rate": 1.4705160028707976e-05, + "loss": 0.7086384296417236, + "step": 270 + }, + { + "epoch": 0.8284294994268246, + "grad_norm": 0.2444671094417572, + "learning_rate": 1.4658108556619417e-05, + "loss": 0.7065964937210083, + "step": 271 + }, + { + "epoch": 0.8314864348490638, + "grad_norm": 0.200303316116333, + "learning_rate": 1.461092501449326e-05, + "loss": 0.7533905506134033, + "step": 272 + }, + { + "epoch": 0.834543370271303, + "grad_norm": 0.2807226777076721, + "learning_rate": 1.4563610740111163e-05, + "loss": 0.756553053855896, + "step": 273 + }, + { + "epoch": 0.8376003056935423, + "grad_norm": 0.2516884207725525, + "learning_rate": 1.4516167074961394e-05, + "loss": 0.8125098347663879, + "step": 274 + }, + { + "epoch": 0.8406572411157814, + "grad_norm": 0.22799813747406006, + "learning_rate": 1.4468595364200808e-05, + "loss": 0.7360811829566956, + "step": 275 + }, + { + "epoch": 0.8437141765380206, + "grad_norm": 0.27390384674072266, + "learning_rate": 1.4420896956616698e-05, + "loss": 0.7135312557220459, + "step": 276 + }, + { + "epoch": 0.8467711119602599, + "grad_norm": 0.2811775505542755, + "learning_rate": 1.4373073204588556e-05, + "loss": 0.7489083409309387, + "step": 277 + }, + { + "epoch": 0.8498280473824991, + "grad_norm": 0.2652314603328705, + "learning_rate": 1.4325125464049725e-05, + "loss": 0.752477765083313, + "step": 278 + }, + { + "epoch": 0.8528849828047382, + "grad_norm": 0.2218960076570511, + "learning_rate": 1.427705509444897e-05, + "loss": 0.6534979939460754, + "step": 279 + }, + { + "epoch": 0.8559419182269774, + "grad_norm": 0.23746474087238312, + "learning_rate": 1.4228863458711915e-05, + "loss": 0.7061883211135864, + "step": 280 + }, + { + "epoch": 0.8589988536492167, + "grad_norm": 0.21507228910923004, + "learning_rate": 1.4180551923202406e-05, + "loss": 0.7044329643249512, + "step": 281 + }, + { + "epoch": 0.8620557890714559, + "grad_norm": 0.2412186861038208, + "learning_rate": 1.4132121857683782e-05, + "loss": 0.706013023853302, + "step": 282 + }, + { + "epoch": 0.865112724493695, + "grad_norm": 0.2832106947898865, + "learning_rate": 1.4083574635280029e-05, + "loss": 0.6572445631027222, + "step": 283 + }, + { + "epoch": 0.8681696599159343, + "grad_norm": 0.21925900876522064, + "learning_rate": 1.403491163243684e-05, + "loss": 0.675041139125824, + "step": 284 + }, + { + "epoch": 0.8712265953381735, + "grad_norm": 0.22488665580749512, + "learning_rate": 1.3986134228882607e-05, + "loss": 0.7474229335784912, + "step": 285 + }, + { + "epoch": 0.8742835307604127, + "grad_norm": 0.2221737653017044, + "learning_rate": 1.3937243807589291e-05, + "loss": 0.7394901514053345, + "step": 286 + }, + { + "epoch": 0.8773404661826519, + "grad_norm": 0.29034581780433655, + "learning_rate": 1.388824175473321e-05, + "loss": 0.7346636056900024, + "step": 287 + }, + { + "epoch": 0.8803974016048911, + "grad_norm": 0.2580259144306183, + "learning_rate": 1.383912945965574e-05, + "loss": 0.8125481009483337, + "step": 288 + }, + { + "epoch": 0.8834543370271303, + "grad_norm": 0.2533118724822998, + "learning_rate": 1.3789908314823932e-05, + "loss": 0.6768131256103516, + "step": 289 + }, + { + "epoch": 0.8865112724493696, + "grad_norm": 0.2074616551399231, + "learning_rate": 1.3740579715791017e-05, + "loss": 0.7096269726753235, + "step": 290 + }, + { + "epoch": 0.8895682078716087, + "grad_norm": 0.29789987206459045, + "learning_rate": 1.3691145061156843e-05, + "loss": 0.6973364353179932, + "step": 291 + }, + { + "epoch": 0.8926251432938479, + "grad_norm": 0.2937224805355072, + "learning_rate": 1.3641605752528225e-05, + "loss": 0.7693608999252319, + "step": 292 + }, + { + "epoch": 0.8956820787160871, + "grad_norm": 0.27355870604515076, + "learning_rate": 1.3591963194479198e-05, + "loss": 0.6870795488357544, + "step": 293 + }, + { + "epoch": 0.8987390141383264, + "grad_norm": 0.22792251408100128, + "learning_rate": 1.3542218794511212e-05, + "loss": 0.7095532417297363, + "step": 294 + }, + { + "epoch": 0.9017959495605655, + "grad_norm": 0.2855125665664673, + "learning_rate": 1.3492373963013199e-05, + "loss": 0.7536489963531494, + "step": 295 + }, + { + "epoch": 0.9048528849828047, + "grad_norm": 0.24969056248664856, + "learning_rate": 1.3442430113221602e-05, + "loss": 0.7433043718338013, + "step": 296 + }, + { + "epoch": 0.907909820405044, + "grad_norm": 0.24534980952739716, + "learning_rate": 1.3392388661180303e-05, + "loss": 0.7204138040542603, + "step": 297 + }, + { + "epoch": 0.9109667558272831, + "grad_norm": 0.2540739178657532, + "learning_rate": 1.3342251025700474e-05, + "loss": 0.7114053964614868, + "step": 298 + }, + { + "epoch": 0.9140236912495223, + "grad_norm": 0.2494630217552185, + "learning_rate": 1.3292018628320346e-05, + "loss": 0.7337151169776917, + "step": 299 + }, + { + "epoch": 0.9170806266717616, + "grad_norm": 0.3079741597175598, + "learning_rate": 1.3241692893264909e-05, + "loss": 0.7486672401428223, + "step": 300 + }, + { + "epoch": 0.9170806266717616, + "eval_loss": 0.7063615918159485, + "eval_runtime": 882.246, + "eval_samples_per_second": 0.683, + "eval_steps_per_second": 0.683, + "step": 300 + }, + { + "epoch": 0.9201375620940008, + "grad_norm": 0.23425859212875366, + "learning_rate": 1.3191275247405525e-05, + "loss": 0.7614796161651611, + "step": 301 + }, + { + "epoch": 0.9231944975162399, + "grad_norm": 0.22468142211437225, + "learning_rate": 1.314076712021949e-05, + "loss": 0.7109901309013367, + "step": 302 + }, + { + "epoch": 0.9262514329384792, + "grad_norm": 0.4165630042552948, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.6816924810409546, + "step": 303 + }, + { + "epoch": 0.9293083683607184, + "grad_norm": 0.2934052646160126, + "learning_rate": 1.3039485152562951e-05, + "loss": 0.7403143644332886, + "step": 304 + }, + { + "epoch": 0.9323653037829576, + "grad_norm": 0.24021990597248077, + "learning_rate": 1.2988714183711504e-05, + "loss": 0.7116130590438843, + "step": 305 + }, + { + "epoch": 0.9354222392051967, + "grad_norm": 0.25670015811920166, + "learning_rate": 1.2937858476690089e-05, + "loss": 0.745186984539032, + "step": 306 + }, + { + "epoch": 0.938479174627436, + "grad_norm": 0.3273049592971802, + "learning_rate": 1.2886919473396212e-05, + "loss": 0.811728298664093, + "step": 307 + }, + { + "epoch": 0.9415361100496752, + "grad_norm": 0.295612633228302, + "learning_rate": 1.2835898618089064e-05, + "loss": 0.6898178458213806, + "step": 308 + }, + { + "epoch": 0.9445930454719144, + "grad_norm": 0.22936004400253296, + "learning_rate": 1.2784797357348562e-05, + "loss": 0.7637606263160706, + "step": 309 + }, + { + "epoch": 0.9476499808941536, + "grad_norm": 0.2491123378276825, + "learning_rate": 1.2733617140034329e-05, + "loss": 0.6364520788192749, + "step": 310 + }, + { + "epoch": 0.9507069163163928, + "grad_norm": 0.29433801770210266, + "learning_rate": 1.268235941724463e-05, + "loss": 0.7065365314483643, + "step": 311 + }, + { + "epoch": 0.953763851738632, + "grad_norm": 0.25174376368522644, + "learning_rate": 1.2631025642275212e-05, + "loss": 0.73712158203125, + "step": 312 + }, + { + "epoch": 0.9568207871608713, + "grad_norm": 0.3259194493293762, + "learning_rate": 1.257961727057812e-05, + "loss": 0.6926214694976807, + "step": 313 + }, + { + "epoch": 0.9598777225831104, + "grad_norm": 0.31702667474746704, + "learning_rate": 1.2528135759720403e-05, + "loss": 0.7626583576202393, + "step": 314 + }, + { + "epoch": 0.9629346580053496, + "grad_norm": 0.24691395461559296, + "learning_rate": 1.2476582569342819e-05, + "loss": 0.7628929018974304, + "step": 315 + }, + { + "epoch": 0.9659915934275889, + "grad_norm": 0.2896668314933777, + "learning_rate": 1.2424959161118425e-05, + "loss": 0.7070521116256714, + "step": 316 + }, + { + "epoch": 0.9690485288498281, + "grad_norm": 0.2587420642375946, + "learning_rate": 1.2373266998711152e-05, + "loss": 0.7804452180862427, + "step": 317 + }, + { + "epoch": 0.9721054642720672, + "grad_norm": 0.28757819533348083, + "learning_rate": 1.232150754773429e-05, + "loss": 0.7271901369094849, + "step": 318 + }, + { + "epoch": 0.9751623996943064, + "grad_norm": 0.2600923478603363, + "learning_rate": 1.2269682275708951e-05, + "loss": 0.6629395484924316, + "step": 319 + }, + { + "epoch": 0.9782193351165457, + "grad_norm": 0.3455665111541748, + "learning_rate": 1.2217792652022452e-05, + "loss": 0.7750409841537476, + "step": 320 + }, + { + "epoch": 0.9812762705387849, + "grad_norm": 0.27122899889945984, + "learning_rate": 1.2165840147886656e-05, + "loss": 0.6742854118347168, + "step": 321 + }, + { + "epoch": 0.984333205961024, + "grad_norm": 0.2357456535100937, + "learning_rate": 1.2113826236296245e-05, + "loss": 0.7265107035636902, + "step": 322 + }, + { + "epoch": 0.9873901413832633, + "grad_norm": 0.21315616369247437, + "learning_rate": 1.2061752391986982e-05, + "loss": 0.7203768491744995, + "step": 323 + }, + { + "epoch": 0.9904470768055025, + "grad_norm": 0.24696163833141327, + "learning_rate": 1.2009620091393885e-05, + "loss": 0.8011739253997803, + "step": 324 + }, + { + "epoch": 0.9935040122277417, + "grad_norm": 0.246279776096344, + "learning_rate": 1.1957430812609361e-05, + "loss": 0.7316861152648926, + "step": 325 + }, + { + "epoch": 0.9965609476499809, + "grad_norm": 0.26160112023353577, + "learning_rate": 1.1905186035341304e-05, + "loss": 0.6602386236190796, + "step": 326 + }, + { + "epoch": 0.9996178830722201, + "grad_norm": 0.27144137024879456, + "learning_rate": 1.1852887240871145e-05, + "loss": 0.7162635326385498, + "step": 327 + }, + { + "epoch": 1.0, + "grad_norm": 0.6650471091270447, + "learning_rate": 1.1800535912011846e-05, + "loss": 0.6108165383338928, + "step": 328 + }, + { + "epoch": 1.0030569354222392, + "grad_norm": 0.25604233145713806, + "learning_rate": 1.1748133533065864e-05, + "loss": 0.6724814176559448, + "step": 329 + }, + { + "epoch": 1.0061138708444783, + "grad_norm": 0.30289238691329956, + "learning_rate": 1.1695681589783065e-05, + "loss": 0.7010799050331116, + "step": 330 + }, + { + "epoch": 1.0091708062667175, + "grad_norm": 0.28697144985198975, + "learning_rate": 1.1643181569318596e-05, + "loss": 0.7199532985687256, + "step": 331 + }, + { + "epoch": 1.012227741688957, + "grad_norm": 0.26302677392959595, + "learning_rate": 1.1590634960190722e-05, + "loss": 0.6887974143028259, + "step": 332 + }, + { + "epoch": 1.015284677111196, + "grad_norm": 0.2987605631351471, + "learning_rate": 1.1538043252238629e-05, + "loss": 0.7237250208854675, + "step": 333 + }, + { + "epoch": 1.0183416125334352, + "grad_norm": 0.25947025418281555, + "learning_rate": 1.1485407936580169e-05, + "loss": 0.7092999815940857, + "step": 334 + }, + { + "epoch": 1.0213985479556744, + "grad_norm": 0.3119892477989197, + "learning_rate": 1.1432730505569597e-05, + "loss": 0.6797397136688232, + "step": 335 + }, + { + "epoch": 1.0244554833779136, + "grad_norm": 0.2772631347179413, + "learning_rate": 1.1380012452755259e-05, + "loss": 0.7330094575881958, + "step": 336 + }, + { + "epoch": 1.0275124188001528, + "grad_norm": 0.34601089358329773, + "learning_rate": 1.1327255272837221e-05, + "loss": 0.711042582988739, + "step": 337 + }, + { + "epoch": 1.0305693542223922, + "grad_norm": 0.30404818058013916, + "learning_rate": 1.1274460461624925e-05, + "loss": 0.6593371033668518, + "step": 338 + }, + { + "epoch": 1.0336262896446313, + "grad_norm": 0.249643474817276, + "learning_rate": 1.1221629515994754e-05, + "loss": 0.7230923175811768, + "step": 339 + }, + { + "epoch": 1.0366832250668705, + "grad_norm": 0.2772657871246338, + "learning_rate": 1.1168763933847608e-05, + "loss": 0.6847513914108276, + "step": 340 + }, + { + "epoch": 1.0397401604891097, + "grad_norm": 0.3479171395301819, + "learning_rate": 1.1115865214066414e-05, + "loss": 0.673307478427887, + "step": 341 + }, + { + "epoch": 1.0427970959113488, + "grad_norm": 0.3393602669239044, + "learning_rate": 1.1062934856473655e-05, + "loss": 0.7529383897781372, + "step": 342 + }, + { + "epoch": 1.045854031333588, + "grad_norm": 0.22780737280845642, + "learning_rate": 1.1009974361788822e-05, + "loss": 0.6309706568717957, + "step": 343 + }, + { + "epoch": 1.0489109667558272, + "grad_norm": 0.2966362237930298, + "learning_rate": 1.095698523158588e-05, + "loss": 0.6944005489349365, + "step": 344 + }, + { + "epoch": 1.0519679021780666, + "grad_norm": 0.27519309520721436, + "learning_rate": 1.0903968968250682e-05, + "loss": 0.6714650392532349, + "step": 345 + }, + { + "epoch": 1.0550248376003057, + "grad_norm": 0.36684176325798035, + "learning_rate": 1.085092707493839e-05, + "loss": 0.6740344762802124, + "step": 346 + }, + { + "epoch": 1.058081773022545, + "grad_norm": 0.35729631781578064, + "learning_rate": 1.0797861055530832e-05, + "loss": 0.6590248942375183, + "step": 347 + }, + { + "epoch": 1.061138708444784, + "grad_norm": 0.33536043763160706, + "learning_rate": 1.0744772414593889e-05, + "loss": 0.7020372748374939, + "step": 348 + }, + { + "epoch": 1.0641956438670233, + "grad_norm": 0.3144095838069916, + "learning_rate": 1.0691662657334815e-05, + "loss": 0.7195531725883484, + "step": 349 + }, + { + "epoch": 1.0672525792892624, + "grad_norm": 0.37244805693626404, + "learning_rate": 1.0638533289559574e-05, + "loss": 0.6678342819213867, + "step": 350 + }, + { + "epoch": 1.0672525792892624, + "eval_loss": 0.6917262673377991, + "eval_runtime": 874.9693, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 350 + }, + { + "epoch": 1.0703095147115018, + "grad_norm": 0.45918041467666626, + "learning_rate": 1.0585385817630137e-05, + "loss": 0.6641817092895508, + "step": 351 + }, + { + "epoch": 1.073366450133741, + "grad_norm": 0.4126392900943756, + "learning_rate": 1.0532221748421786e-05, + "loss": 0.6774541139602661, + "step": 352 + }, + { + "epoch": 1.0764233855559802, + "grad_norm": 0.5425148606300354, + "learning_rate": 1.047904258928037e-05, + "loss": 0.7386555075645447, + "step": 353 + }, + { + "epoch": 1.0794803209782193, + "grad_norm": 0.40561115741729736, + "learning_rate": 1.0425849847979586e-05, + "loss": 0.7061327695846558, + "step": 354 + }, + { + "epoch": 1.0825372564004585, + "grad_norm": 0.489343523979187, + "learning_rate": 1.0372645032678215e-05, + "loss": 0.7486766576766968, + "step": 355 + }, + { + "epoch": 1.0855941918226977, + "grad_norm": 0.7414161562919617, + "learning_rate": 1.031942965187738e-05, + "loss": 0.7111566066741943, + "step": 356 + }, + { + "epoch": 1.0886511272449368, + "grad_norm": 0.308473140001297, + "learning_rate": 1.026620521437775e-05, + "loss": 0.7629879713058472, + "step": 357 + }, + { + "epoch": 1.0917080626671762, + "grad_norm": 0.27350732684135437, + "learning_rate": 1.0212973229236787e-05, + "loss": 0.7136012315750122, + "step": 358 + }, + { + "epoch": 1.0947649980894154, + "grad_norm": 0.37481266260147095, + "learning_rate": 1.0159735205725949e-05, + "loss": 0.6634767055511475, + "step": 359 + }, + { + "epoch": 1.0978219335116546, + "grad_norm": 0.2903526723384857, + "learning_rate": 1.0106492653287893e-05, + "loss": 0.6604923009872437, + "step": 360 + }, + { + "epoch": 1.1008788689338938, + "grad_norm": 0.372989296913147, + "learning_rate": 1.0053247081493684e-05, + "loss": 0.6701731085777283, + "step": 361 + }, + { + "epoch": 1.103935804356133, + "grad_norm": 0.38386791944503784, + "learning_rate": 1e-05, + "loss": 0.6767977476119995, + "step": 362 + }, + { + "epoch": 1.106992739778372, + "grad_norm": 0.2837046682834625, + "learning_rate": 9.946752918506319e-06, + "loss": 0.5886228680610657, + "step": 363 + }, + { + "epoch": 1.1100496752006115, + "grad_norm": 0.3196772038936615, + "learning_rate": 9.893507346712112e-06, + "loss": 0.6662254929542542, + "step": 364 + }, + { + "epoch": 1.1131066106228507, + "grad_norm": 0.36623135209083557, + "learning_rate": 9.840264794274053e-06, + "loss": 0.6507357954978943, + "step": 365 + }, + { + "epoch": 1.1161635460450898, + "grad_norm": 0.2803555727005005, + "learning_rate": 9.787026770763216e-06, + "loss": 0.6636874675750732, + "step": 366 + }, + { + "epoch": 1.119220481467329, + "grad_norm": 0.329513818025589, + "learning_rate": 9.733794785622254e-06, + "loss": 0.6378857493400574, + "step": 367 + }, + { + "epoch": 1.1222774168895682, + "grad_norm": 0.24419358372688293, + "learning_rate": 9.680570348122626e-06, + "loss": 0.6794115900993347, + "step": 368 + }, + { + "epoch": 1.1253343523118073, + "grad_norm": 0.2971822917461395, + "learning_rate": 9.627354967321785e-06, + "loss": 0.6401248574256897, + "step": 369 + }, + { + "epoch": 1.1283912877340465, + "grad_norm": 0.5112190842628479, + "learning_rate": 9.574150152020415e-06, + "loss": 0.6886081695556641, + "step": 370 + }, + { + "epoch": 1.131448223156286, + "grad_norm": 0.4284913241863251, + "learning_rate": 9.520957410719632e-06, + "loss": 0.6842222213745117, + "step": 371 + }, + { + "epoch": 1.134505158578525, + "grad_norm": 0.34164664149284363, + "learning_rate": 9.467778251578217e-06, + "loss": 0.6238314509391785, + "step": 372 + }, + { + "epoch": 1.1375620940007642, + "grad_norm": 0.3294171392917633, + "learning_rate": 9.414614182369862e-06, + "loss": 0.6947107911109924, + "step": 373 + }, + { + "epoch": 1.1406190294230034, + "grad_norm": 0.2544155418872833, + "learning_rate": 9.361466710440428e-06, + "loss": 0.717319905757904, + "step": 374 + }, + { + "epoch": 1.1436759648452426, + "grad_norm": 0.3111848533153534, + "learning_rate": 9.308337342665188e-06, + "loss": 0.6222032904624939, + "step": 375 + }, + { + "epoch": 1.1467329002674818, + "grad_norm": 0.3157130777835846, + "learning_rate": 9.255227585406116e-06, + "loss": 0.6126186847686768, + "step": 376 + }, + { + "epoch": 1.1497898356897212, + "grad_norm": 0.29625123739242554, + "learning_rate": 9.202138944469168e-06, + "loss": 0.7452324032783508, + "step": 377 + }, + { + "epoch": 1.1528467711119603, + "grad_norm": 0.31600719690322876, + "learning_rate": 9.149072925061614e-06, + "loss": 0.715571403503418, + "step": 378 + }, + { + "epoch": 1.1559037065341995, + "grad_norm": 0.25878727436065674, + "learning_rate": 9.096031031749321e-06, + "loss": 0.7256120443344116, + "step": 379 + }, + { + "epoch": 1.1589606419564387, + "grad_norm": 0.4058121144771576, + "learning_rate": 9.043014768414125e-06, + "loss": 0.6728136539459229, + "step": 380 + }, + { + "epoch": 1.1620175773786778, + "grad_norm": 0.31269821524620056, + "learning_rate": 8.99002563821118e-06, + "loss": 0.6662668585777283, + "step": 381 + }, + { + "epoch": 1.165074512800917, + "grad_norm": 0.2512218654155731, + "learning_rate": 8.937065143526349e-06, + "loss": 0.6415850520133972, + "step": 382 + }, + { + "epoch": 1.1681314482231562, + "grad_norm": 0.3284171223640442, + "learning_rate": 8.884134785933588e-06, + "loss": 0.6695276498794556, + "step": 383 + }, + { + "epoch": 1.1711883836453956, + "grad_norm": 0.2994699478149414, + "learning_rate": 8.831236066152397e-06, + "loss": 0.7347006797790527, + "step": 384 + }, + { + "epoch": 1.1742453190676347, + "grad_norm": 0.2981257140636444, + "learning_rate": 8.778370484005245e-06, + "loss": 0.6707600951194763, + "step": 385 + }, + { + "epoch": 1.177302254489874, + "grad_norm": 0.2934776842594147, + "learning_rate": 8.725539538375078e-06, + "loss": 0.7245328426361084, + "step": 386 + }, + { + "epoch": 1.180359189912113, + "grad_norm": 0.33115988969802856, + "learning_rate": 8.672744727162782e-06, + "loss": 0.7029488682746887, + "step": 387 + }, + { + "epoch": 1.1834161253343523, + "grad_norm": 0.3322703540325165, + "learning_rate": 8.619987547244746e-06, + "loss": 0.6896190643310547, + "step": 388 + }, + { + "epoch": 1.1864730607565914, + "grad_norm": 0.29254966974258423, + "learning_rate": 8.567269494430404e-06, + "loss": 0.6859920620918274, + "step": 389 + }, + { + "epoch": 1.1895299961788308, + "grad_norm": 0.2923297584056854, + "learning_rate": 8.514592063419833e-06, + "loss": 0.6437527537345886, + "step": 390 + }, + { + "epoch": 1.19258693160107, + "grad_norm": 0.3074567914009094, + "learning_rate": 8.461956747761375e-06, + "loss": 0.7113338708877563, + "step": 391 + }, + { + "epoch": 1.1956438670233092, + "grad_norm": 0.3027377128601074, + "learning_rate": 8.409365039809282e-06, + "loss": 0.7111615538597107, + "step": 392 + }, + { + "epoch": 1.1987008024455483, + "grad_norm": 0.28992199897766113, + "learning_rate": 8.356818430681409e-06, + "loss": 0.7768589854240417, + "step": 393 + }, + { + "epoch": 1.2017577378677875, + "grad_norm": 0.2630784213542938, + "learning_rate": 8.304318410216937e-06, + "loss": 0.5940375328063965, + "step": 394 + }, + { + "epoch": 1.2048146732900267, + "grad_norm": 0.30487746000289917, + "learning_rate": 8.251866466934137e-06, + "loss": 0.6600077748298645, + "step": 395 + }, + { + "epoch": 1.2078716087122658, + "grad_norm": 0.4152087867259979, + "learning_rate": 8.199464087988158e-06, + "loss": 0.6806260347366333, + "step": 396 + }, + { + "epoch": 1.2109285441345052, + "grad_norm": 0.32374435663223267, + "learning_rate": 8.147112759128859e-06, + "loss": 0.7205727100372314, + "step": 397 + }, + { + "epoch": 1.2139854795567444, + "grad_norm": 0.3009904623031616, + "learning_rate": 8.094813964658698e-06, + "loss": 0.6570584774017334, + "step": 398 + }, + { + "epoch": 1.2170424149789836, + "grad_norm": 0.5213649272918701, + "learning_rate": 8.042569187390642e-06, + "loss": 0.6663621664047241, + "step": 399 + }, + { + "epoch": 1.2200993504012227, + "grad_norm": 0.30124184489250183, + "learning_rate": 7.990379908606118e-06, + "loss": 0.672550618648529, + "step": 400 + }, + { + "epoch": 1.2200993504012227, + "eval_loss": 0.6789794564247131, + "eval_runtime": 875.5101, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 400 + } + ], + "logging_steps": 1, + "max_steps": 656, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2037525220053484e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-400/training_args.bin b/cpt_qwen_14B/checkpoints/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eddbb43a2cebb928dbed6e955a37ebfa3174f4b5 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a8e308e47eb936f678712445b19ddc52638f354c37c813ecaa432f69120a2e +size 5201 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/README.md b/cpt_qwen_14B/checkpoints/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8dfda26032514233f3e70a4012f1cfd1ddbbb609 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Qwen2.5-Coder-14B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Qwen2.5-Coder-14B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/adapter_config.json b/cpt_qwen_14B/checkpoints/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81c31359285f7e351a44275c30b6882f4c6b50c0 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Qwen2.5-Coder-14B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/adapter_model.safetensors b/cpt_qwen_14B/checkpoints/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7b0ebf5c7f2137ceb0228eca3f71a8443e27c0ca --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c70d2a53a400c71d8d34be7ccd544b39185feac4cb968ee59b98c6e3dbc90913 +size 201378736 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/chat_template.jinja b/cpt_qwen_14B/checkpoints/checkpoint-500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..28028c056af412405debd878cdda0171e35fa5d1 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/optimizer.pt b/cpt_qwen_14B/checkpoints/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..97af2efee9dc6b5da79387c5369027c2a7c8e3b9 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81729b89c963345e5acafb87c607b3e5edb27ba8def41c160368e4cd16dd45cc +size 102698855 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/rng_state.pth b/cpt_qwen_14B/checkpoints/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..22013d4f81b0f56871da8a7046e675ea02cdaed2 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b95a35909dc3f185e8c73422430ed46fd396e54727021d1feac0301f97e5283e +size 14645 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/scheduler.pt b/cpt_qwen_14B/checkpoints/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9eff74fecc034d3a53f2570cbdb73830a50c3658 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04c5882a8aa54af03d9393fb475b28894fabd02ad367621f66634acf39263adb +size 1465 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/tokenizer.json b/cpt_qwen_14B/checkpoints/checkpoint-500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/tokenizer_config.json b/cpt_qwen_14B/checkpoints/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..217274ef8275420e4bf3b976f3948901cd3d176f --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/trainer_state.json b/cpt_qwen_14B/checkpoints/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..576c3ae26a452c9a5972328d745062b93a88da19 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/trainer_state.json @@ -0,0 +1,3614 @@ +{ + "best_global_step": 500, + "best_metric": 0.6648170948028564, + "best_model_checkpoint": "runs/cpt_run_14b/checkpoints/checkpoint-500", + "epoch": 1.5257928926251432, + "eval_steps": 50, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003056935422239205, + "grad_norm": 0.06516239047050476, + "learning_rate": 0.0, + "loss": 1.138384461402893, + "step": 1 + }, + { + "epoch": 0.00611387084447841, + "grad_norm": 0.05343673378229141, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.983342707157135, + "step": 2 + }, + { + "epoch": 0.009170806266717615, + "grad_norm": 0.05608418956398964, + "learning_rate": 6.060606060606061e-07, + "loss": 1.0762118101119995, + "step": 3 + }, + { + "epoch": 0.01222774168895682, + "grad_norm": 0.06523486226797104, + "learning_rate": 9.090909090909091e-07, + "loss": 1.084489345550537, + "step": 4 + }, + { + "epoch": 0.015284677111196026, + "grad_norm": 0.06582186371088028, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.2037022113800049, + "step": 5 + }, + { + "epoch": 0.01834161253343523, + "grad_norm": 0.06097998470067978, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.10005784034729, + "step": 6 + }, + { + "epoch": 0.021398547955674436, + "grad_norm": 0.10365528613328934, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.0895193815231323, + "step": 7 + }, + { + "epoch": 0.02445548337791364, + "grad_norm": 0.06312141567468643, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.0593242645263672, + "step": 8 + }, + { + "epoch": 0.027512418800152847, + "grad_norm": 0.05508403480052948, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.9772955179214478, + "step": 9 + }, + { + "epoch": 0.030569354222392053, + "grad_norm": 0.06006711348891258, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.084238886833191, + "step": 10 + }, + { + "epoch": 0.033626289644631255, + "grad_norm": 0.0588749423623085, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.0786534547805786, + "step": 11 + }, + { + "epoch": 0.03668322506687046, + "grad_norm": 0.046551357954740524, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.0370622873306274, + "step": 12 + }, + { + "epoch": 0.039740160489109666, + "grad_norm": 0.061659567058086395, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.0646986961364746, + "step": 13 + }, + { + "epoch": 0.04279709591134887, + "grad_norm": 0.06007347255945206, + "learning_rate": 3.93939393939394e-06, + "loss": 1.0311307907104492, + "step": 14 + }, + { + "epoch": 0.04585403133358808, + "grad_norm": 0.07314135134220123, + "learning_rate": 4.242424242424243e-06, + "loss": 1.1300500631332397, + "step": 15 + }, + { + "epoch": 0.04891096675582728, + "grad_norm": 0.060934022068977356, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0197452306747437, + "step": 16 + }, + { + "epoch": 0.05196790217806649, + "grad_norm": 0.056856051087379456, + "learning_rate": 4.848484848484849e-06, + "loss": 1.0438549518585205, + "step": 17 + }, + { + "epoch": 0.055024837600305694, + "grad_norm": 0.05908689647912979, + "learning_rate": 5.151515151515152e-06, + "loss": 1.0398856401443481, + "step": 18 + }, + { + "epoch": 0.0580817730225449, + "grad_norm": 0.07411840558052063, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.107885479927063, + "step": 19 + }, + { + "epoch": 0.061138708444784105, + "grad_norm": 0.0749165341258049, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.1060967445373535, + "step": 20 + }, + { + "epoch": 0.06419564386702331, + "grad_norm": 0.06720177084207535, + "learning_rate": 6.060606060606061e-06, + "loss": 1.0471720695495605, + "step": 21 + }, + { + "epoch": 0.06725257928926251, + "grad_norm": 0.05990725755691528, + "learning_rate": 6.363636363636364e-06, + "loss": 1.0944981575012207, + "step": 22 + }, + { + "epoch": 0.07030951471150172, + "grad_norm": 0.06672193855047226, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1477092504501343, + "step": 23 + }, + { + "epoch": 0.07336645013374092, + "grad_norm": 0.06145205348730087, + "learning_rate": 6.969696969696971e-06, + "loss": 1.0591784715652466, + "step": 24 + }, + { + "epoch": 0.07642338555598013, + "grad_norm": 0.0757482647895813, + "learning_rate": 7.272727272727273e-06, + "loss": 1.0500165224075317, + "step": 25 + }, + { + "epoch": 0.07948032097821933, + "grad_norm": 0.07848478108644485, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.0747522115707397, + "step": 26 + }, + { + "epoch": 0.08253725640045854, + "grad_norm": 0.07740631699562073, + "learning_rate": 7.87878787878788e-06, + "loss": 1.132310152053833, + "step": 27 + }, + { + "epoch": 0.08559419182269774, + "grad_norm": 0.07476603239774704, + "learning_rate": 8.181818181818183e-06, + "loss": 1.0339502096176147, + "step": 28 + }, + { + "epoch": 0.08865112724493696, + "grad_norm": 0.0779196098446846, + "learning_rate": 8.484848484848486e-06, + "loss": 1.1047282218933105, + "step": 29 + }, + { + "epoch": 0.09170806266717615, + "grad_norm": 0.06962384283542633, + "learning_rate": 8.787878787878788e-06, + "loss": 1.004916787147522, + "step": 30 + }, + { + "epoch": 0.09476499808941537, + "grad_norm": 0.06369175016880035, + "learning_rate": 9.090909090909091e-06, + "loss": 0.9296417832374573, + "step": 31 + }, + { + "epoch": 0.09782193351165457, + "grad_norm": 0.07470260560512543, + "learning_rate": 9.393939393939396e-06, + "loss": 1.0721708536148071, + "step": 32 + }, + { + "epoch": 0.10087886893389378, + "grad_norm": 0.07948213815689087, + "learning_rate": 9.696969696969698e-06, + "loss": 1.0350117683410645, + "step": 33 + }, + { + "epoch": 0.10393580435613298, + "grad_norm": 0.07066022604703903, + "learning_rate": 1e-05, + "loss": 1.026305913925171, + "step": 34 + }, + { + "epoch": 0.10699273977837218, + "grad_norm": 0.07774543762207031, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.0509816408157349, + "step": 35 + }, + { + "epoch": 0.11004967520061139, + "grad_norm": 0.07501248270273209, + "learning_rate": 1.0606060606060606e-05, + "loss": 1.0011574029922485, + "step": 36 + }, + { + "epoch": 0.11310661062285059, + "grad_norm": 0.6622501611709595, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.9754424691200256, + "step": 37 + }, + { + "epoch": 0.1161635460450898, + "grad_norm": 0.07566080242395401, + "learning_rate": 1.1212121212121212e-05, + "loss": 1.0342774391174316, + "step": 38 + }, + { + "epoch": 0.119220481467329, + "grad_norm": 0.07573831081390381, + "learning_rate": 1.1515151515151517e-05, + "loss": 0.9714518785476685, + "step": 39 + }, + { + "epoch": 0.12227741688956821, + "grad_norm": 0.08083852380514145, + "learning_rate": 1.181818181818182e-05, + "loss": 1.1050316095352173, + "step": 40 + }, + { + "epoch": 0.12533435231180742, + "grad_norm": 0.08540588617324829, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.0871070623397827, + "step": 41 + }, + { + "epoch": 0.12839128773404662, + "grad_norm": 0.07391592115163803, + "learning_rate": 1.2424242424242425e-05, + "loss": 1.0206722021102905, + "step": 42 + }, + { + "epoch": 0.13144822315628582, + "grad_norm": 0.07063689082860947, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.9775047898292542, + "step": 43 + }, + { + "epoch": 0.13450515857852502, + "grad_norm": 0.07288888841867447, + "learning_rate": 1.3030303030303032e-05, + "loss": 1.1132858991622925, + "step": 44 + }, + { + "epoch": 0.13756209400076425, + "grad_norm": 0.07641777396202087, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.0707701444625854, + "step": 45 + }, + { + "epoch": 0.14061902942300344, + "grad_norm": 0.06990326195955276, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.9328265190124512, + "step": 46 + }, + { + "epoch": 0.14367596484524264, + "grad_norm": 0.0834241658449173, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.0131721496582031, + "step": 47 + }, + { + "epoch": 0.14673290026748184, + "grad_norm": 0.0714937075972557, + "learning_rate": 1.4242424242424245e-05, + "loss": 0.940493106842041, + "step": 48 + }, + { + "epoch": 0.14978983568972107, + "grad_norm": 0.07770547270774841, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.0435771942138672, + "step": 49 + }, + { + "epoch": 0.15284677111196027, + "grad_norm": 0.07950945198535919, + "learning_rate": 1.484848484848485e-05, + "loss": 1.0382137298583984, + "step": 50 + }, + { + "epoch": 0.15284677111196027, + "eval_loss": 1.0129202604293823, + "eval_runtime": 724.3664, + "eval_samples_per_second": 0.832, + "eval_steps_per_second": 0.832, + "step": 50 + }, + { + "epoch": 0.15590370653419947, + "grad_norm": 0.06961936503648758, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.9690049886703491, + "step": 51 + }, + { + "epoch": 0.15896064195643866, + "grad_norm": 0.069523885846138, + "learning_rate": 1.5454545454545454e-05, + "loss": 0.9830482006072998, + "step": 52 + }, + { + "epoch": 0.16201757737867786, + "grad_norm": 0.0764622762799263, + "learning_rate": 1.575757575757576e-05, + "loss": 1.0895472764968872, + "step": 53 + }, + { + "epoch": 0.1650745128009171, + "grad_norm": 0.1413721889257431, + "learning_rate": 1.606060606060606e-05, + "loss": 1.0354574918746948, + "step": 54 + }, + { + "epoch": 0.1681314482231563, + "grad_norm": 0.06818042695522308, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.8534265160560608, + "step": 55 + }, + { + "epoch": 0.1711883836453955, + "grad_norm": 0.0722246989607811, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.9580274820327759, + "step": 56 + }, + { + "epoch": 0.17424531906763469, + "grad_norm": 0.07113443315029144, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.0721848011016846, + "step": 57 + }, + { + "epoch": 0.1773022544898739, + "grad_norm": 0.08412107080221176, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.1180150508880615, + "step": 58 + }, + { + "epoch": 0.1803591899121131, + "grad_norm": 0.07381036877632141, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.0384547710418701, + "step": 59 + }, + { + "epoch": 0.1834161253343523, + "grad_norm": 0.07089001685380936, + "learning_rate": 1.787878787878788e-05, + "loss": 1.0446016788482666, + "step": 60 + }, + { + "epoch": 0.1864730607565915, + "grad_norm": 0.11576953530311584, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0015051364898682, + "step": 61 + }, + { + "epoch": 0.18952999617883073, + "grad_norm": 0.08030868321657181, + "learning_rate": 1.8484848484848487e-05, + "loss": 0.9642710089683533, + "step": 62 + }, + { + "epoch": 0.19258693160106993, + "grad_norm": 0.08332342654466629, + "learning_rate": 1.8787878787878792e-05, + "loss": 1.0722991228103638, + "step": 63 + }, + { + "epoch": 0.19564386702330913, + "grad_norm": 0.08000365644693375, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.0104647874832153, + "step": 64 + }, + { + "epoch": 0.19870080244554833, + "grad_norm": 0.08139508217573166, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9445061087608337, + "step": 65 + }, + { + "epoch": 0.20175773786778756, + "grad_norm": 0.08749893307685852, + "learning_rate": 1.96969696969697e-05, + "loss": 1.080810308456421, + "step": 66 + }, + { + "epoch": 0.20481467329002676, + "grad_norm": 0.0786912813782692, + "learning_rate": 2e-05, + "loss": 0.9705753922462463, + "step": 67 + }, + { + "epoch": 0.20787160871226595, + "grad_norm": 0.08962028473615646, + "learning_rate": 1.9999858236410775e-05, + "loss": 0.962783694267273, + "step": 68 + }, + { + "epoch": 0.21092854413450515, + "grad_norm": 0.08402887731790543, + "learning_rate": 1.9999432949662483e-05, + "loss": 0.9959614872932434, + "step": 69 + }, + { + "epoch": 0.21398547955674435, + "grad_norm": 0.08036444336175919, + "learning_rate": 1.9998724151813157e-05, + "loss": 0.9569960832595825, + "step": 70 + }, + { + "epoch": 0.21704241497898358, + "grad_norm": 0.08247046917676926, + "learning_rate": 1.9997731862959143e-05, + "loss": 1.0012171268463135, + "step": 71 + }, + { + "epoch": 0.22009935040122278, + "grad_norm": 0.08966264873743057, + "learning_rate": 1.999645611123453e-05, + "loss": 1.0403809547424316, + "step": 72 + }, + { + "epoch": 0.22315628582346198, + "grad_norm": 0.08061660826206207, + "learning_rate": 1.999489693281034e-05, + "loss": 1.0089740753173828, + "step": 73 + }, + { + "epoch": 0.22621322124570117, + "grad_norm": 0.09005365520715714, + "learning_rate": 1.9993054371893526e-05, + "loss": 0.9333044290542603, + "step": 74 + }, + { + "epoch": 0.2292701566679404, + "grad_norm": 0.08651519566774368, + "learning_rate": 1.9990928480725694e-05, + "loss": 0.9284015893936157, + "step": 75 + }, + { + "epoch": 0.2323270920901796, + "grad_norm": 0.08141147345304489, + "learning_rate": 1.9988519319581637e-05, + "loss": 0.9782730340957642, + "step": 76 + }, + { + "epoch": 0.2353840275124188, + "grad_norm": 0.08344405144453049, + "learning_rate": 1.998582695676762e-05, + "loss": 0.9723064303398132, + "step": 77 + }, + { + "epoch": 0.238440962934658, + "grad_norm": 0.08019903302192688, + "learning_rate": 1.998285146861945e-05, + "loss": 0.9648997783660889, + "step": 78 + }, + { + "epoch": 0.24149789835689722, + "grad_norm": 0.08113416284322739, + "learning_rate": 1.99795929395003e-05, + "loss": 0.9263214468955994, + "step": 79 + }, + { + "epoch": 0.24455483377913642, + "grad_norm": 0.08127513527870178, + "learning_rate": 1.997605146179833e-05, + "loss": 0.8745232224464417, + "step": 80 + }, + { + "epoch": 0.24761176920137562, + "grad_norm": 0.09934187680482864, + "learning_rate": 1.997222713592405e-05, + "loss": 0.8722782135009766, + "step": 81 + }, + { + "epoch": 0.25066870462361485, + "grad_norm": 0.09701363742351532, + "learning_rate": 1.9968120070307503e-05, + "loss": 1.0084266662597656, + "step": 82 + }, + { + "epoch": 0.253725640045854, + "grad_norm": 0.08335654437541962, + "learning_rate": 1.9963730381395154e-05, + "loss": 0.9239332675933838, + "step": 83 + }, + { + "epoch": 0.25678257546809324, + "grad_norm": 0.09161650389432907, + "learning_rate": 1.9959058193646618e-05, + "loss": 0.9878032207489014, + "step": 84 + }, + { + "epoch": 0.2598395108903324, + "grad_norm": 0.08067663013935089, + "learning_rate": 1.9954103639531116e-05, + "loss": 0.9113098382949829, + "step": 85 + }, + { + "epoch": 0.26289644631257164, + "grad_norm": 0.09619539976119995, + "learning_rate": 1.9948866859523717e-05, + "loss": 0.9527600407600403, + "step": 86 + }, + { + "epoch": 0.26595338173481087, + "grad_norm": 0.10015493631362915, + "learning_rate": 1.9943348002101374e-05, + "loss": 0.9569152593612671, + "step": 87 + }, + { + "epoch": 0.26901031715705004, + "grad_norm": 0.09012345969676971, + "learning_rate": 1.993754722373869e-05, + "loss": 0.8912045359611511, + "step": 88 + }, + { + "epoch": 0.27206725257928926, + "grad_norm": 0.10342805832624435, + "learning_rate": 1.9931464688903502e-05, + "loss": 0.856104850769043, + "step": 89 + }, + { + "epoch": 0.2751241880015285, + "grad_norm": 0.10218493640422821, + "learning_rate": 1.9925100570052194e-05, + "loss": 0.9631397128105164, + "step": 90 + }, + { + "epoch": 0.27818112342376766, + "grad_norm": 0.10909046977758408, + "learning_rate": 1.9918455047624847e-05, + "loss": 0.8532565236091614, + "step": 91 + }, + { + "epoch": 0.2812380588460069, + "grad_norm": 0.10714197903871536, + "learning_rate": 1.9911528310040073e-05, + "loss": 0.9691859483718872, + "step": 92 + }, + { + "epoch": 0.28429499426824606, + "grad_norm": 0.1108694076538086, + "learning_rate": 1.990432055368971e-05, + "loss": 0.9374334812164307, + "step": 93 + }, + { + "epoch": 0.2873519296904853, + "grad_norm": 0.10037308186292648, + "learning_rate": 1.989683198293324e-05, + "loss": 0.9166896343231201, + "step": 94 + }, + { + "epoch": 0.2904088651127245, + "grad_norm": 0.10246684402227402, + "learning_rate": 1.9889062810092002e-05, + "loss": 1.0059239864349365, + "step": 95 + }, + { + "epoch": 0.2934658005349637, + "grad_norm": 0.09954962879419327, + "learning_rate": 1.9881013255443152e-05, + "loss": 1.00413179397583, + "step": 96 + }, + { + "epoch": 0.2965227359572029, + "grad_norm": 0.11006761342287064, + "learning_rate": 1.9872683547213446e-05, + "loss": 0.9414035677909851, + "step": 97 + }, + { + "epoch": 0.29957967137944214, + "grad_norm": 0.1014382541179657, + "learning_rate": 1.9864073921572756e-05, + "loss": 0.9155468940734863, + "step": 98 + }, + { + "epoch": 0.3026366068016813, + "grad_norm": 0.09883157908916473, + "learning_rate": 1.9855184622627362e-05, + "loss": 0.9429305195808411, + "step": 99 + }, + { + "epoch": 0.30569354222392053, + "grad_norm": 0.11199072748422623, + "learning_rate": 1.9846015902413053e-05, + "loss": 0.9143528342247009, + "step": 100 + }, + { + "epoch": 0.30569354222392053, + "eval_loss": 0.884428083896637, + "eval_runtime": 723.8143, + "eval_samples_per_second": 0.833, + "eval_steps_per_second": 0.833, + "step": 100 + }, + { + "epoch": 0.3087504776461597, + "grad_norm": 0.10796016454696655, + "learning_rate": 1.9836568020887963e-05, + "loss": 0.9726455211639404, + "step": 101 + }, + { + "epoch": 0.31180741306839893, + "grad_norm": 0.10056383162736893, + "learning_rate": 1.982684124592521e-05, + "loss": 0.8932135701179504, + "step": 102 + }, + { + "epoch": 0.31486434849063816, + "grad_norm": 0.10836594551801682, + "learning_rate": 1.9816835853305306e-05, + "loss": 0.919749915599823, + "step": 103 + }, + { + "epoch": 0.31792128391287733, + "grad_norm": 0.12032149732112885, + "learning_rate": 1.9806552126708322e-05, + "loss": 0.871781587600708, + "step": 104 + }, + { + "epoch": 0.32097821933511655, + "grad_norm": 0.10854160040616989, + "learning_rate": 1.9795990357705853e-05, + "loss": 0.8587784171104431, + "step": 105 + }, + { + "epoch": 0.3240351547573557, + "grad_norm": 0.10819399356842041, + "learning_rate": 1.978515084575276e-05, + "loss": 0.8524806499481201, + "step": 106 + }, + { + "epoch": 0.32709209017959495, + "grad_norm": 0.10226067155599594, + "learning_rate": 1.9774033898178668e-05, + "loss": 0.7892144918441772, + "step": 107 + }, + { + "epoch": 0.3301490256018342, + "grad_norm": 0.1071159616112709, + "learning_rate": 1.976263983017925e-05, + "loss": 0.8833234906196594, + "step": 108 + }, + { + "epoch": 0.33320596102407335, + "grad_norm": 0.11434526741504669, + "learning_rate": 1.9750968964807305e-05, + "loss": 0.861842155456543, + "step": 109 + }, + { + "epoch": 0.3362628964463126, + "grad_norm": 0.1159641221165657, + "learning_rate": 1.9739021632963584e-05, + "loss": 0.8987889289855957, + "step": 110 + }, + { + "epoch": 0.3393198318685518, + "grad_norm": 0.12371373921632767, + "learning_rate": 1.9726798173387417e-05, + "loss": 0.9710193872451782, + "step": 111 + }, + { + "epoch": 0.342376767290791, + "grad_norm": 0.11441531032323837, + "learning_rate": 1.97142989326471e-05, + "loss": 0.8199151158332825, + "step": 112 + }, + { + "epoch": 0.3454337027130302, + "grad_norm": 0.11842846125364304, + "learning_rate": 1.9701524265130088e-05, + "loss": 0.8845276236534119, + "step": 113 + }, + { + "epoch": 0.34849063813526937, + "grad_norm": 0.10813732445240021, + "learning_rate": 1.9688474533032916e-05, + "loss": 0.7964264750480652, + "step": 114 + }, + { + "epoch": 0.3515475735575086, + "grad_norm": 0.11050347238779068, + "learning_rate": 1.9675150106350957e-05, + "loss": 0.9630422592163086, + "step": 115 + }, + { + "epoch": 0.3546045089797478, + "grad_norm": 0.10537250339984894, + "learning_rate": 1.9661551362867926e-05, + "loss": 0.7706905007362366, + "step": 116 + }, + { + "epoch": 0.357661444401987, + "grad_norm": 0.11390368640422821, + "learning_rate": 1.9647678688145163e-05, + "loss": 0.8541204929351807, + "step": 117 + }, + { + "epoch": 0.3607183798242262, + "grad_norm": 0.10318922251462936, + "learning_rate": 1.963353247551069e-05, + "loss": 0.7400562763214111, + "step": 118 + }, + { + "epoch": 0.3637753152464654, + "grad_norm": 0.1347586214542389, + "learning_rate": 1.9619113126048086e-05, + "loss": 0.9232871532440186, + "step": 119 + }, + { + "epoch": 0.3668322506687046, + "grad_norm": 0.11458177119493484, + "learning_rate": 1.96044210485851e-05, + "loss": 0.833285927772522, + "step": 120 + }, + { + "epoch": 0.36988918609094384, + "grad_norm": 0.12361041456460953, + "learning_rate": 1.958945665968206e-05, + "loss": 0.7887391448020935, + "step": 121 + }, + { + "epoch": 0.372946121513183, + "grad_norm": 0.11985408514738083, + "learning_rate": 1.9574220383620054e-05, + "loss": 0.8206446170806885, + "step": 122 + }, + { + "epoch": 0.37600305693542224, + "grad_norm": 0.1355939507484436, + "learning_rate": 1.9558712652388932e-05, + "loss": 0.7648542523384094, + "step": 123 + }, + { + "epoch": 0.37905999235766147, + "grad_norm": 0.1229313388466835, + "learning_rate": 1.954293390567501e-05, + "loss": 0.8573335409164429, + "step": 124 + }, + { + "epoch": 0.38211692777990064, + "grad_norm": 0.11425124108791351, + "learning_rate": 1.9526884590848646e-05, + "loss": 0.7412531971931458, + "step": 125 + }, + { + "epoch": 0.38517386320213987, + "grad_norm": 0.12430041283369064, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.8098543882369995, + "step": 126 + }, + { + "epoch": 0.38823079862437904, + "grad_norm": 0.12492368370294571, + "learning_rate": 1.9493976084683814e-05, + "loss": 0.8814713954925537, + "step": 127 + }, + { + "epoch": 0.39128773404661826, + "grad_norm": 0.14428824186325073, + "learning_rate": 1.9477117826390934e-05, + "loss": 0.8231979608535767, + "step": 128 + }, + { + "epoch": 0.3943446694688575, + "grad_norm": 0.12010085582733154, + "learning_rate": 1.9459990866050337e-05, + "loss": 0.7015627026557922, + "step": 129 + }, + { + "epoch": 0.39740160489109666, + "grad_norm": 0.11819776892662048, + "learning_rate": 1.9442595689257898e-05, + "loss": 0.8086729645729065, + "step": 130 + }, + { + "epoch": 0.4004585403133359, + "grad_norm": 0.12211033701896667, + "learning_rate": 1.9424932789214158e-05, + "loss": 0.8234002590179443, + "step": 131 + }, + { + "epoch": 0.4035154757355751, + "grad_norm": 0.14926476776599884, + "learning_rate": 1.9407002666710334e-05, + "loss": 0.874608039855957, + "step": 132 + }, + { + "epoch": 0.4065724111578143, + "grad_norm": 0.13012923300266266, + "learning_rate": 1.9388805830114132e-05, + "loss": 0.8491607904434204, + "step": 133 + }, + { + "epoch": 0.4096293465800535, + "grad_norm": 0.12012261897325516, + "learning_rate": 1.937034279535533e-05, + "loss": 0.7269159555435181, + "step": 134 + }, + { + "epoch": 0.4126862820022927, + "grad_norm": 0.15302567183971405, + "learning_rate": 1.9351614085911134e-05, + "loss": 0.8560839891433716, + "step": 135 + }, + { + "epoch": 0.4157432174245319, + "grad_norm": 0.12234190106391907, + "learning_rate": 1.933262023279137e-05, + "loss": 0.8211904764175415, + "step": 136 + }, + { + "epoch": 0.41880015284677113, + "grad_norm": 0.14427296817302704, + "learning_rate": 1.9313361774523387e-05, + "loss": 0.8500057458877563, + "step": 137 + }, + { + "epoch": 0.4218570882690103, + "grad_norm": 0.1314094066619873, + "learning_rate": 1.929383925713682e-05, + "loss": 0.7589091658592224, + "step": 138 + }, + { + "epoch": 0.42491402369124953, + "grad_norm": 0.1576734483242035, + "learning_rate": 1.92740532341481e-05, + "loss": 0.7581073641777039, + "step": 139 + }, + { + "epoch": 0.4279709591134887, + "grad_norm": 0.15788713097572327, + "learning_rate": 1.925400426654475e-05, + "loss": 0.809050440788269, + "step": 140 + }, + { + "epoch": 0.43102789453572793, + "grad_norm": 0.13364559412002563, + "learning_rate": 1.9233692922769497e-05, + "loss": 0.7990086078643799, + "step": 141 + }, + { + "epoch": 0.43408482995796716, + "grad_norm": 0.14786465466022491, + "learning_rate": 1.921311977870413e-05, + "loss": 0.8675815463066101, + "step": 142 + }, + { + "epoch": 0.4371417653802063, + "grad_norm": 0.14621882140636444, + "learning_rate": 1.9192285417653208e-05, + "loss": 0.8713765740394592, + "step": 143 + }, + { + "epoch": 0.44019870080244555, + "grad_norm": 0.12874048948287964, + "learning_rate": 1.917119043032749e-05, + "loss": 0.7361871004104614, + "step": 144 + }, + { + "epoch": 0.4432556362246848, + "grad_norm": 0.12183775007724762, + "learning_rate": 1.9149835414827193e-05, + "loss": 0.7311941385269165, + "step": 145 + }, + { + "epoch": 0.44631257164692395, + "grad_norm": 0.1397160291671753, + "learning_rate": 1.912822097662505e-05, + "loss": 0.8189159035682678, + "step": 146 + }, + { + "epoch": 0.4493695070691632, + "grad_norm": 0.1458273082971573, + "learning_rate": 1.9106347728549134e-05, + "loss": 0.8288135528564453, + "step": 147 + }, + { + "epoch": 0.45242644249140235, + "grad_norm": 0.16898781061172485, + "learning_rate": 1.908421629076547e-05, + "loss": 0.7878037095069885, + "step": 148 + }, + { + "epoch": 0.4554833779136416, + "grad_norm": 0.1638474315404892, + "learning_rate": 1.9061827290760466e-05, + "loss": 0.8059952259063721, + "step": 149 + }, + { + "epoch": 0.4585403133358808, + "grad_norm": 0.14130882918834686, + "learning_rate": 1.9039181363323128e-05, + "loss": 0.7346830368041992, + "step": 150 + }, + { + "epoch": 0.4585403133358808, + "eval_loss": 0.7979016900062561, + "eval_runtime": 828.6295, + "eval_samples_per_second": 0.728, + "eval_steps_per_second": 0.728, + "step": 150 + }, + { + "epoch": 0.46159724875811997, + "grad_norm": 0.14427433907985687, + "learning_rate": 1.9016279150527044e-05, + "loss": 0.7583403587341309, + "step": 151 + }, + { + "epoch": 0.4646541841803592, + "grad_norm": 0.1515798568725586, + "learning_rate": 1.8993121301712194e-05, + "loss": 0.7908380031585693, + "step": 152 + }, + { + "epoch": 0.46771111960259837, + "grad_norm": 0.14444488286972046, + "learning_rate": 1.896970847346653e-05, + "loss": 0.7916130423545837, + "step": 153 + }, + { + "epoch": 0.4707680550248376, + "grad_norm": 0.1460912823677063, + "learning_rate": 1.8946041329607364e-05, + "loss": 0.7750643491744995, + "step": 154 + }, + { + "epoch": 0.4738249904470768, + "grad_norm": 0.13896244764328003, + "learning_rate": 1.892212054116255e-05, + "loss": 0.8059666156768799, + "step": 155 + }, + { + "epoch": 0.476881925869316, + "grad_norm": 0.16133630275726318, + "learning_rate": 1.889794678635145e-05, + "loss": 0.8327827453613281, + "step": 156 + }, + { + "epoch": 0.4799388612915552, + "grad_norm": 0.1474636346101761, + "learning_rate": 1.8873520750565716e-05, + "loss": 0.8498989343643188, + "step": 157 + }, + { + "epoch": 0.48299579671379445, + "grad_norm": 0.17222349345684052, + "learning_rate": 1.884884312634985e-05, + "loss": 0.7750177979469299, + "step": 158 + }, + { + "epoch": 0.4860527321360336, + "grad_norm": 0.15558090806007385, + "learning_rate": 1.8823914613381568e-05, + "loss": 0.7326169013977051, + "step": 159 + }, + { + "epoch": 0.48910966755827284, + "grad_norm": 0.13808321952819824, + "learning_rate": 1.8798735918451963e-05, + "loss": 0.8308709859848022, + "step": 160 + }, + { + "epoch": 0.492166602980512, + "grad_norm": 0.1761898398399353, + "learning_rate": 1.8773307755445468e-05, + "loss": 0.7805465459823608, + "step": 161 + }, + { + "epoch": 0.49522353840275124, + "grad_norm": 0.160477414727211, + "learning_rate": 1.874763084531961e-05, + "loss": 0.8538846969604492, + "step": 162 + }, + { + "epoch": 0.49828047382499047, + "grad_norm": 0.15238745510578156, + "learning_rate": 1.872170591608459e-05, + "loss": 0.8801217675209045, + "step": 163 + }, + { + "epoch": 0.5013374092472297, + "grad_norm": 0.1567080318927765, + "learning_rate": 1.86955337027826e-05, + "loss": 0.7205259799957275, + "step": 164 + }, + { + "epoch": 0.5043943446694689, + "grad_norm": 0.13637851178646088, + "learning_rate": 1.866911494746702e-05, + "loss": 0.7636491656303406, + "step": 165 + }, + { + "epoch": 0.507451280091708, + "grad_norm": 0.15563489496707916, + "learning_rate": 1.8642450399181373e-05, + "loss": 0.7982497811317444, + "step": 166 + }, + { + "epoch": 0.5105082155139473, + "grad_norm": 0.15503396093845367, + "learning_rate": 1.8615540813938063e-05, + "loss": 0.8737778067588806, + "step": 167 + }, + { + "epoch": 0.5135651509361865, + "grad_norm": 0.16095557808876038, + "learning_rate": 1.8588386954696972e-05, + "loss": 0.796604335308075, + "step": 168 + }, + { + "epoch": 0.5166220863584257, + "grad_norm": 0.1713593453168869, + "learning_rate": 1.856098959134381e-05, + "loss": 0.8247392177581787, + "step": 169 + }, + { + "epoch": 0.5196790217806648, + "grad_norm": 0.18239113688468933, + "learning_rate": 1.8533349500668295e-05, + "loss": 0.7838484644889832, + "step": 170 + }, + { + "epoch": 0.5227359572029041, + "grad_norm": 0.15745767951011658, + "learning_rate": 1.850546746634211e-05, + "loss": 0.7856907248497009, + "step": 171 + }, + { + "epoch": 0.5257928926251433, + "grad_norm": 0.16820666193962097, + "learning_rate": 1.8477344278896708e-05, + "loss": 0.7829679846763611, + "step": 172 + }, + { + "epoch": 0.5288498280473825, + "grad_norm": 0.16975544393062592, + "learning_rate": 1.84489807357009e-05, + "loss": 0.7374375462532043, + "step": 173 + }, + { + "epoch": 0.5319067634696217, + "grad_norm": 0.167228102684021, + "learning_rate": 1.8420377640938204e-05, + "loss": 0.712837815284729, + "step": 174 + }, + { + "epoch": 0.5349636988918609, + "grad_norm": 0.15955154597759247, + "learning_rate": 1.839153580558411e-05, + "loss": 0.7645693421363831, + "step": 175 + }, + { + "epoch": 0.5380206343141001, + "grad_norm": 0.18378689885139465, + "learning_rate": 1.8362456047383032e-05, + "loss": 0.7974956631660461, + "step": 176 + }, + { + "epoch": 0.5410775697363394, + "grad_norm": 0.15777672827243805, + "learning_rate": 1.833313919082515e-05, + "loss": 0.8957571983337402, + "step": 177 + }, + { + "epoch": 0.5441345051585785, + "grad_norm": 0.15292386710643768, + "learning_rate": 1.8303586067123028e-05, + "loss": 0.7635619044303894, + "step": 178 + }, + { + "epoch": 0.5471914405808177, + "grad_norm": 0.178152397274971, + "learning_rate": 1.8273797514188043e-05, + "loss": 0.7849246263504028, + "step": 179 + }, + { + "epoch": 0.550248376003057, + "grad_norm": 0.15916013717651367, + "learning_rate": 1.824377437660663e-05, + "loss": 0.6975343227386475, + "step": 180 + }, + { + "epoch": 0.5533053114252962, + "grad_norm": 0.18172231316566467, + "learning_rate": 1.821351750561634e-05, + "loss": 0.7675164341926575, + "step": 181 + }, + { + "epoch": 0.5563622468475353, + "grad_norm": 0.16241903603076935, + "learning_rate": 1.818302775908169e-05, + "loss": 0.7950343489646912, + "step": 182 + }, + { + "epoch": 0.5594191822697746, + "grad_norm": 0.18727579712867737, + "learning_rate": 1.8152306001469875e-05, + "loss": 0.787315309047699, + "step": 183 + }, + { + "epoch": 0.5624761176920138, + "grad_norm": 0.1627933531999588, + "learning_rate": 1.8121353103826213e-05, + "loss": 0.7141211628913879, + "step": 184 + }, + { + "epoch": 0.565533053114253, + "grad_norm": 0.4369247555732727, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.8476608395576477, + "step": 185 + }, + { + "epoch": 0.5685899885364921, + "grad_norm": 0.16494786739349365, + "learning_rate": 1.8058757405367003e-05, + "loss": 0.720562756061554, + "step": 186 + }, + { + "epoch": 0.5716469239587314, + "grad_norm": 0.175015389919281, + "learning_rate": 1.8027116379309637e-05, + "loss": 0.7589252591133118, + "step": 187 + }, + { + "epoch": 0.5747038593809706, + "grad_norm": 0.1769978553056717, + "learning_rate": 1.799524776268646e-05, + "loss": 0.7644155621528625, + "step": 188 + }, + { + "epoch": 0.5777607948032097, + "grad_norm": 0.18481792509555817, + "learning_rate": 1.796315245905936e-05, + "loss": 0.7885835766792297, + "step": 189 + }, + { + "epoch": 0.580817730225449, + "grad_norm": 0.1668689250946045, + "learning_rate": 1.7930831378417437e-05, + "loss": 0.7377231121063232, + "step": 190 + }, + { + "epoch": 0.5838746656476882, + "grad_norm": 0.178734689950943, + "learning_rate": 1.7898285437151163e-05, + "loss": 0.7388894557952881, + "step": 191 + }, + { + "epoch": 0.5869316010699274, + "grad_norm": 0.1740068644285202, + "learning_rate": 1.786551555802643e-05, + "loss": 0.8209859728813171, + "step": 192 + }, + { + "epoch": 0.5899885364921666, + "grad_norm": 0.19211041927337646, + "learning_rate": 1.783252267015837e-05, + "loss": 0.7305737733840942, + "step": 193 + }, + { + "epoch": 0.5930454719144058, + "grad_norm": 0.16644936800003052, + "learning_rate": 1.779930770898503e-05, + "loss": 0.7760804891586304, + "step": 194 + }, + { + "epoch": 0.596102407336645, + "grad_norm": 0.1773686707019806, + "learning_rate": 1.776587161624083e-05, + "loss": 0.7879236936569214, + "step": 195 + }, + { + "epoch": 0.5991593427588843, + "grad_norm": 0.17508819699287415, + "learning_rate": 1.7732215339929874e-05, + "loss": 0.7307407259941101, + "step": 196 + }, + { + "epoch": 0.6022162781811234, + "grad_norm": 0.17211101949214935, + "learning_rate": 1.7698339834299064e-05, + "loss": 0.7293214797973633, + "step": 197 + }, + { + "epoch": 0.6052732136033626, + "grad_norm": 0.18085215985774994, + "learning_rate": 1.7664246059811058e-05, + "loss": 0.763083279132843, + "step": 198 + }, + { + "epoch": 0.6083301490256018, + "grad_norm": 0.20243075489997864, + "learning_rate": 1.7629934983117025e-05, + "loss": 0.7372676134109497, + "step": 199 + }, + { + "epoch": 0.6113870844478411, + "grad_norm": 0.18152795732021332, + "learning_rate": 1.759540757702924e-05, + "loss": 0.7121898531913757, + "step": 200 + }, + { + "epoch": 0.6113870844478411, + "eval_loss": 0.7551760673522949, + "eval_runtime": 900.209, + "eval_samples_per_second": 0.67, + "eval_steps_per_second": 0.67, + "step": 200 + }, + { + "epoch": 0.6144440198700802, + "grad_norm": 0.18808062374591827, + "learning_rate": 1.7560664820493502e-05, + "loss": 0.734307050704956, + "step": 201 + }, + { + "epoch": 0.6175009552923194, + "grad_norm": 0.18151243031024933, + "learning_rate": 1.7525707698561383e-05, + "loss": 0.7998429536819458, + "step": 202 + }, + { + "epoch": 0.6205578907145587, + "grad_norm": 0.19583043456077576, + "learning_rate": 1.7490537202362313e-05, + "loss": 0.7546265721321106, + "step": 203 + }, + { + "epoch": 0.6236148261367979, + "grad_norm": 0.2508557140827179, + "learning_rate": 1.7455154329075427e-05, + "loss": 0.7810050249099731, + "step": 204 + }, + { + "epoch": 0.626671761559037, + "grad_norm": 0.1685105562210083, + "learning_rate": 1.741956008190136e-05, + "loss": 0.7558917999267578, + "step": 205 + }, + { + "epoch": 0.6297286969812763, + "grad_norm": 0.18195222318172455, + "learning_rate": 1.7383755470033756e-05, + "loss": 0.7216942310333252, + "step": 206 + }, + { + "epoch": 0.6327856324035155, + "grad_norm": 0.1878063678741455, + "learning_rate": 1.7347741508630673e-05, + "loss": 0.7417092323303223, + "step": 207 + }, + { + "epoch": 0.6358425678257547, + "grad_norm": 0.25273698568344116, + "learning_rate": 1.73115192187858e-05, + "loss": 0.807498037815094, + "step": 208 + }, + { + "epoch": 0.6388995032479939, + "grad_norm": 0.2451465129852295, + "learning_rate": 1.7275089627499493e-05, + "loss": 0.7557163238525391, + "step": 209 + }, + { + "epoch": 0.6419564386702331, + "grad_norm": 0.19272617995738983, + "learning_rate": 1.7238453767649683e-05, + "loss": 0.8285109996795654, + "step": 210 + }, + { + "epoch": 0.6450133740924723, + "grad_norm": 0.1869518756866455, + "learning_rate": 1.720161267796256e-05, + "loss": 0.7824444770812988, + "step": 211 + }, + { + "epoch": 0.6480703095147115, + "grad_norm": 0.2029627561569214, + "learning_rate": 1.7164567402983153e-05, + "loss": 0.7018642425537109, + "step": 212 + }, + { + "epoch": 0.6511272449369507, + "grad_norm": 0.23215501010417938, + "learning_rate": 1.7127318993045686e-05, + "loss": 0.7263948917388916, + "step": 213 + }, + { + "epoch": 0.6541841803591899, + "grad_norm": 0.19869184494018555, + "learning_rate": 1.7089868504243816e-05, + "loss": 0.8285576105117798, + "step": 214 + }, + { + "epoch": 0.6572411157814291, + "grad_norm": 0.22871531546115875, + "learning_rate": 1.705221699840069e-05, + "loss": 0.7871490716934204, + "step": 215 + }, + { + "epoch": 0.6602980512036684, + "grad_norm": 0.17945580184459686, + "learning_rate": 1.701436554303882e-05, + "loss": 0.740180492401123, + "step": 216 + }, + { + "epoch": 0.6633549866259075, + "grad_norm": 0.20516762137413025, + "learning_rate": 1.6976315211349848e-05, + "loss": 0.7542892098426819, + "step": 217 + }, + { + "epoch": 0.6664119220481467, + "grad_norm": 0.22108283638954163, + "learning_rate": 1.6938067082164093e-05, + "loss": 0.8117404580116272, + "step": 218 + }, + { + "epoch": 0.669468857470386, + "grad_norm": 0.22329698503017426, + "learning_rate": 1.6899622239919965e-05, + "loss": 0.8002716898918152, + "step": 219 + }, + { + "epoch": 0.6725257928926252, + "grad_norm": 0.23545362055301666, + "learning_rate": 1.6860981774633228e-05, + "loss": 0.7750573754310608, + "step": 220 + }, + { + "epoch": 0.6755827283148643, + "grad_norm": 0.21816480159759521, + "learning_rate": 1.6822146781866097e-05, + "loss": 0.8051223754882812, + "step": 221 + }, + { + "epoch": 0.6786396637371036, + "grad_norm": 0.18638508021831512, + "learning_rate": 1.6783118362696162e-05, + "loss": 0.7286484241485596, + "step": 222 + }, + { + "epoch": 0.6816965991593428, + "grad_norm": 0.16794732213020325, + "learning_rate": 1.6743897623685178e-05, + "loss": 0.7001460194587708, + "step": 223 + }, + { + "epoch": 0.684753534581582, + "grad_norm": 0.21157318353652954, + "learning_rate": 1.6704485676847695e-05, + "loss": 0.7479901313781738, + "step": 224 + }, + { + "epoch": 0.6878104700038211, + "grad_norm": 0.35601308941841125, + "learning_rate": 1.666488363961952e-05, + "loss": 0.7660019397735596, + "step": 225 + }, + { + "epoch": 0.6908674054260604, + "grad_norm": 0.17416611313819885, + "learning_rate": 1.662509263482604e-05, + "loss": 0.7157142162322998, + "step": 226 + }, + { + "epoch": 0.6939243408482996, + "grad_norm": 0.19655123353004456, + "learning_rate": 1.658511379065039e-05, + "loss": 0.7894638776779175, + "step": 227 + }, + { + "epoch": 0.6969812762705387, + "grad_norm": 0.2034345269203186, + "learning_rate": 1.6544948240601453e-05, + "loss": 0.6853711009025574, + "step": 228 + }, + { + "epoch": 0.700038211692778, + "grad_norm": 0.199235200881958, + "learning_rate": 1.6504597123481737e-05, + "loss": 0.7487372756004333, + "step": 229 + }, + { + "epoch": 0.7030951471150172, + "grad_norm": 0.20407404005527496, + "learning_rate": 1.6464061583355088e-05, + "loss": 0.7335573434829712, + "step": 230 + }, + { + "epoch": 0.7061520825372564, + "grad_norm": 0.22096174955368042, + "learning_rate": 1.6423342769514227e-05, + "loss": 0.7659798264503479, + "step": 231 + }, + { + "epoch": 0.7092090179594956, + "grad_norm": 0.1916825920343399, + "learning_rate": 1.6382441836448203e-05, + "loss": 0.7162011861801147, + "step": 232 + }, + { + "epoch": 0.7122659533817348, + "grad_norm": 0.20505093038082123, + "learning_rate": 1.6341359943809626e-05, + "loss": 0.6957600116729736, + "step": 233 + }, + { + "epoch": 0.715322888803974, + "grad_norm": 0.19968082010746002, + "learning_rate": 1.6300098256381807e-05, + "loss": 0.6724053025245667, + "step": 234 + }, + { + "epoch": 0.7183798242262133, + "grad_norm": 0.19768832623958588, + "learning_rate": 1.625865794404573e-05, + "loss": 0.774741530418396, + "step": 235 + }, + { + "epoch": 0.7214367596484524, + "grad_norm": 0.19257694482803345, + "learning_rate": 1.621704018174688e-05, + "loss": 0.6658651828765869, + "step": 236 + }, + { + "epoch": 0.7244936950706916, + "grad_norm": 0.21594858169555664, + "learning_rate": 1.617524614946192e-05, + "loss": 0.810744047164917, + "step": 237 + }, + { + "epoch": 0.7275506304929308, + "grad_norm": 0.2107633650302887, + "learning_rate": 1.6133277032165264e-05, + "loss": 0.7623897194862366, + "step": 238 + }, + { + "epoch": 0.7306075659151701, + "grad_norm": 0.20114055275917053, + "learning_rate": 1.6091134019795447e-05, + "loss": 0.7082816362380981, + "step": 239 + }, + { + "epoch": 0.7336645013374092, + "grad_norm": 0.2542732059955597, + "learning_rate": 1.604881830722141e-05, + "loss": 0.7051193714141846, + "step": 240 + }, + { + "epoch": 0.7367214367596484, + "grad_norm": 0.19180485606193542, + "learning_rate": 1.600633109420861e-05, + "loss": 0.7895385026931763, + "step": 241 + }, + { + "epoch": 0.7397783721818877, + "grad_norm": 0.368756502866745, + "learning_rate": 1.5963673585385016e-05, + "loss": 0.7146293520927429, + "step": 242 + }, + { + "epoch": 0.7428353076041269, + "grad_norm": 0.18490125238895416, + "learning_rate": 1.5920846990206934e-05, + "loss": 0.650428056716919, + "step": 243 + }, + { + "epoch": 0.745892243026366, + "grad_norm": 0.23592503368854523, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.6367110013961792, + "step": 244 + }, + { + "epoch": 0.7489491784486053, + "grad_norm": 0.20223264396190643, + "learning_rate": 1.5834691402548415e-05, + "loss": 0.6563615798950195, + "step": 245 + }, + { + "epoch": 0.7520061138708445, + "grad_norm": 0.27459946274757385, + "learning_rate": 1.5791364852813047e-05, + "loss": 0.7361881136894226, + "step": 246 + }, + { + "epoch": 0.7550630492930837, + "grad_norm": 0.21085411310195923, + "learning_rate": 1.5747874102144073e-05, + "loss": 0.7373813390731812, + "step": 247 + }, + { + "epoch": 0.7581199847153229, + "grad_norm": 0.23332320153713226, + "learning_rate": 1.5704220383622464e-05, + "loss": 0.6971457004547119, + "step": 248 + }, + { + "epoch": 0.7611769201375621, + "grad_norm": 0.23525936901569366, + "learning_rate": 1.5660404934949798e-05, + "loss": 0.6756627559661865, + "step": 249 + }, + { + "epoch": 0.7642338555598013, + "grad_norm": 0.2150791585445404, + "learning_rate": 1.5616428998413122e-05, + "loss": 0.7029792666435242, + "step": 250 + }, + { + "epoch": 0.7642338555598013, + "eval_loss": 0.7269901633262634, + "eval_runtime": 877.665, + "eval_samples_per_second": 0.687, + "eval_steps_per_second": 0.687, + "step": 250 + }, + { + "epoch": 0.7672907909820404, + "grad_norm": 0.19510552287101746, + "learning_rate": 1.5572293820849754e-05, + "loss": 0.715162992477417, + "step": 251 + }, + { + "epoch": 0.7703477264042797, + "grad_norm": 0.25246763229370117, + "learning_rate": 1.5528000653611935e-05, + "loss": 0.634660542011261, + "step": 252 + }, + { + "epoch": 0.7734046618265189, + "grad_norm": 0.2980027496814728, + "learning_rate": 1.5483550752531337e-05, + "loss": 0.7154463529586792, + "step": 253 + }, + { + "epoch": 0.7764615972487581, + "grad_norm": 0.2730556130409241, + "learning_rate": 1.5438945377883463e-05, + "loss": 0.8110946416854858, + "step": 254 + }, + { + "epoch": 0.7795185326709974, + "grad_norm": 0.17258886992931366, + "learning_rate": 1.5394185794351914e-05, + "loss": 0.72202467918396, + "step": 255 + }, + { + "epoch": 0.7825754680932365, + "grad_norm": 0.19966280460357666, + "learning_rate": 1.5349273270992537e-05, + "loss": 0.7368704080581665, + "step": 256 + }, + { + "epoch": 0.7856324035154757, + "grad_norm": 0.23305682837963104, + "learning_rate": 1.5304209081197425e-05, + "loss": 0.7429723143577576, + "step": 257 + }, + { + "epoch": 0.788689338937715, + "grad_norm": 0.21786810457706451, + "learning_rate": 1.5258994502658846e-05, + "loss": 0.6498424410820007, + "step": 258 + }, + { + "epoch": 0.7917462743599541, + "grad_norm": 0.2370925396680832, + "learning_rate": 1.5213630817332985e-05, + "loss": 0.7379459142684937, + "step": 259 + }, + { + "epoch": 0.7948032097821933, + "grad_norm": 0.25566384196281433, + "learning_rate": 1.5168119311403611e-05, + "loss": 0.6742876172065735, + "step": 260 + }, + { + "epoch": 0.7978601452044326, + "grad_norm": 0.2171633243560791, + "learning_rate": 1.512246127524561e-05, + "loss": 0.72329181432724, + "step": 261 + }, + { + "epoch": 0.8009170806266718, + "grad_norm": 0.23292019963264465, + "learning_rate": 1.50766580033884e-05, + "loss": 0.765812873840332, + "step": 262 + }, + { + "epoch": 0.8039740160489109, + "grad_norm": 0.19427980482578278, + "learning_rate": 1.5030710794479226e-05, + "loss": 0.7872639298439026, + "step": 263 + }, + { + "epoch": 0.8070309514711502, + "grad_norm": 0.2460346817970276, + "learning_rate": 1.4984620951246333e-05, + "loss": 0.6940722465515137, + "step": 264 + }, + { + "epoch": 0.8100878868933894, + "grad_norm": 0.2493411898612976, + "learning_rate": 1.4938389780462044e-05, + "loss": 0.7680137157440186, + "step": 265 + }, + { + "epoch": 0.8131448223156286, + "grad_norm": 0.23873573541641235, + "learning_rate": 1.4892018592905702e-05, + "loss": 0.6780916452407837, + "step": 266 + }, + { + "epoch": 0.8162017577378677, + "grad_norm": 0.2580571174621582, + "learning_rate": 1.4845508703326504e-05, + "loss": 0.7183764576911926, + "step": 267 + }, + { + "epoch": 0.819258693160107, + "grad_norm": 0.2125079482793808, + "learning_rate": 1.4798861430406221e-05, + "loss": 0.8207096457481384, + "step": 268 + }, + { + "epoch": 0.8223156285823462, + "grad_norm": 0.21065691113471985, + "learning_rate": 1.4752078096721827e-05, + "loss": 0.7414214611053467, + "step": 269 + }, + { + "epoch": 0.8253725640045854, + "grad_norm": 0.25807511806488037, + "learning_rate": 1.4705160028707976e-05, + "loss": 0.7086384296417236, + "step": 270 + }, + { + "epoch": 0.8284294994268246, + "grad_norm": 0.2444671094417572, + "learning_rate": 1.4658108556619417e-05, + "loss": 0.7065964937210083, + "step": 271 + }, + { + "epoch": 0.8314864348490638, + "grad_norm": 0.200303316116333, + "learning_rate": 1.461092501449326e-05, + "loss": 0.7533905506134033, + "step": 272 + }, + { + "epoch": 0.834543370271303, + "grad_norm": 0.2807226777076721, + "learning_rate": 1.4563610740111163e-05, + "loss": 0.756553053855896, + "step": 273 + }, + { + "epoch": 0.8376003056935423, + "grad_norm": 0.2516884207725525, + "learning_rate": 1.4516167074961394e-05, + "loss": 0.8125098347663879, + "step": 274 + }, + { + "epoch": 0.8406572411157814, + "grad_norm": 0.22799813747406006, + "learning_rate": 1.4468595364200808e-05, + "loss": 0.7360811829566956, + "step": 275 + }, + { + "epoch": 0.8437141765380206, + "grad_norm": 0.27390384674072266, + "learning_rate": 1.4420896956616698e-05, + "loss": 0.7135312557220459, + "step": 276 + }, + { + "epoch": 0.8467711119602599, + "grad_norm": 0.2811775505542755, + "learning_rate": 1.4373073204588556e-05, + "loss": 0.7489083409309387, + "step": 277 + }, + { + "epoch": 0.8498280473824991, + "grad_norm": 0.2652314603328705, + "learning_rate": 1.4325125464049725e-05, + "loss": 0.752477765083313, + "step": 278 + }, + { + "epoch": 0.8528849828047382, + "grad_norm": 0.2218960076570511, + "learning_rate": 1.427705509444897e-05, + "loss": 0.6534979939460754, + "step": 279 + }, + { + "epoch": 0.8559419182269774, + "grad_norm": 0.23746474087238312, + "learning_rate": 1.4228863458711915e-05, + "loss": 0.7061883211135864, + "step": 280 + }, + { + "epoch": 0.8589988536492167, + "grad_norm": 0.21507228910923004, + "learning_rate": 1.4180551923202406e-05, + "loss": 0.7044329643249512, + "step": 281 + }, + { + "epoch": 0.8620557890714559, + "grad_norm": 0.2412186861038208, + "learning_rate": 1.4132121857683782e-05, + "loss": 0.706013023853302, + "step": 282 + }, + { + "epoch": 0.865112724493695, + "grad_norm": 0.2832106947898865, + "learning_rate": 1.4083574635280029e-05, + "loss": 0.6572445631027222, + "step": 283 + }, + { + "epoch": 0.8681696599159343, + "grad_norm": 0.21925900876522064, + "learning_rate": 1.403491163243684e-05, + "loss": 0.675041139125824, + "step": 284 + }, + { + "epoch": 0.8712265953381735, + "grad_norm": 0.22488665580749512, + "learning_rate": 1.3986134228882607e-05, + "loss": 0.7474229335784912, + "step": 285 + }, + { + "epoch": 0.8742835307604127, + "grad_norm": 0.2221737653017044, + "learning_rate": 1.3937243807589291e-05, + "loss": 0.7394901514053345, + "step": 286 + }, + { + "epoch": 0.8773404661826519, + "grad_norm": 0.29034581780433655, + "learning_rate": 1.388824175473321e-05, + "loss": 0.7346636056900024, + "step": 287 + }, + { + "epoch": 0.8803974016048911, + "grad_norm": 0.2580259144306183, + "learning_rate": 1.383912945965574e-05, + "loss": 0.8125481009483337, + "step": 288 + }, + { + "epoch": 0.8834543370271303, + "grad_norm": 0.2533118724822998, + "learning_rate": 1.3789908314823932e-05, + "loss": 0.6768131256103516, + "step": 289 + }, + { + "epoch": 0.8865112724493696, + "grad_norm": 0.2074616551399231, + "learning_rate": 1.3740579715791017e-05, + "loss": 0.7096269726753235, + "step": 290 + }, + { + "epoch": 0.8895682078716087, + "grad_norm": 0.29789987206459045, + "learning_rate": 1.3691145061156843e-05, + "loss": 0.6973364353179932, + "step": 291 + }, + { + "epoch": 0.8926251432938479, + "grad_norm": 0.2937224805355072, + "learning_rate": 1.3641605752528225e-05, + "loss": 0.7693608999252319, + "step": 292 + }, + { + "epoch": 0.8956820787160871, + "grad_norm": 0.27355870604515076, + "learning_rate": 1.3591963194479198e-05, + "loss": 0.6870795488357544, + "step": 293 + }, + { + "epoch": 0.8987390141383264, + "grad_norm": 0.22792251408100128, + "learning_rate": 1.3542218794511212e-05, + "loss": 0.7095532417297363, + "step": 294 + }, + { + "epoch": 0.9017959495605655, + "grad_norm": 0.2855125665664673, + "learning_rate": 1.3492373963013199e-05, + "loss": 0.7536489963531494, + "step": 295 + }, + { + "epoch": 0.9048528849828047, + "grad_norm": 0.24969056248664856, + "learning_rate": 1.3442430113221602e-05, + "loss": 0.7433043718338013, + "step": 296 + }, + { + "epoch": 0.907909820405044, + "grad_norm": 0.24534980952739716, + "learning_rate": 1.3392388661180303e-05, + "loss": 0.7204138040542603, + "step": 297 + }, + { + "epoch": 0.9109667558272831, + "grad_norm": 0.2540739178657532, + "learning_rate": 1.3342251025700474e-05, + "loss": 0.7114053964614868, + "step": 298 + }, + { + "epoch": 0.9140236912495223, + "grad_norm": 0.2494630217552185, + "learning_rate": 1.3292018628320346e-05, + "loss": 0.7337151169776917, + "step": 299 + }, + { + "epoch": 0.9170806266717616, + "grad_norm": 0.3079741597175598, + "learning_rate": 1.3241692893264909e-05, + "loss": 0.7486672401428223, + "step": 300 + }, + { + "epoch": 0.9170806266717616, + "eval_loss": 0.7063615918159485, + "eval_runtime": 882.246, + "eval_samples_per_second": 0.683, + "eval_steps_per_second": 0.683, + "step": 300 + }, + { + "epoch": 0.9201375620940008, + "grad_norm": 0.23425859212875366, + "learning_rate": 1.3191275247405525e-05, + "loss": 0.7614796161651611, + "step": 301 + }, + { + "epoch": 0.9231944975162399, + "grad_norm": 0.22468142211437225, + "learning_rate": 1.314076712021949e-05, + "loss": 0.7109901309013367, + "step": 302 + }, + { + "epoch": 0.9262514329384792, + "grad_norm": 0.4165630042552948, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.6816924810409546, + "step": 303 + }, + { + "epoch": 0.9293083683607184, + "grad_norm": 0.2934052646160126, + "learning_rate": 1.3039485152562951e-05, + "loss": 0.7403143644332886, + "step": 304 + }, + { + "epoch": 0.9323653037829576, + "grad_norm": 0.24021990597248077, + "learning_rate": 1.2988714183711504e-05, + "loss": 0.7116130590438843, + "step": 305 + }, + { + "epoch": 0.9354222392051967, + "grad_norm": 0.25670015811920166, + "learning_rate": 1.2937858476690089e-05, + "loss": 0.745186984539032, + "step": 306 + }, + { + "epoch": 0.938479174627436, + "grad_norm": 0.3273049592971802, + "learning_rate": 1.2886919473396212e-05, + "loss": 0.811728298664093, + "step": 307 + }, + { + "epoch": 0.9415361100496752, + "grad_norm": 0.295612633228302, + "learning_rate": 1.2835898618089064e-05, + "loss": 0.6898178458213806, + "step": 308 + }, + { + "epoch": 0.9445930454719144, + "grad_norm": 0.22936004400253296, + "learning_rate": 1.2784797357348562e-05, + "loss": 0.7637606263160706, + "step": 309 + }, + { + "epoch": 0.9476499808941536, + "grad_norm": 0.2491123378276825, + "learning_rate": 1.2733617140034329e-05, + "loss": 0.6364520788192749, + "step": 310 + }, + { + "epoch": 0.9507069163163928, + "grad_norm": 0.29433801770210266, + "learning_rate": 1.268235941724463e-05, + "loss": 0.7065365314483643, + "step": 311 + }, + { + "epoch": 0.953763851738632, + "grad_norm": 0.25174376368522644, + "learning_rate": 1.2631025642275212e-05, + "loss": 0.73712158203125, + "step": 312 + }, + { + "epoch": 0.9568207871608713, + "grad_norm": 0.3259194493293762, + "learning_rate": 1.257961727057812e-05, + "loss": 0.6926214694976807, + "step": 313 + }, + { + "epoch": 0.9598777225831104, + "grad_norm": 0.31702667474746704, + "learning_rate": 1.2528135759720403e-05, + "loss": 0.7626583576202393, + "step": 314 + }, + { + "epoch": 0.9629346580053496, + "grad_norm": 0.24691395461559296, + "learning_rate": 1.2476582569342819e-05, + "loss": 0.7628929018974304, + "step": 315 + }, + { + "epoch": 0.9659915934275889, + "grad_norm": 0.2896668314933777, + "learning_rate": 1.2424959161118425e-05, + "loss": 0.7070521116256714, + "step": 316 + }, + { + "epoch": 0.9690485288498281, + "grad_norm": 0.2587420642375946, + "learning_rate": 1.2373266998711152e-05, + "loss": 0.7804452180862427, + "step": 317 + }, + { + "epoch": 0.9721054642720672, + "grad_norm": 0.28757819533348083, + "learning_rate": 1.232150754773429e-05, + "loss": 0.7271901369094849, + "step": 318 + }, + { + "epoch": 0.9751623996943064, + "grad_norm": 0.2600923478603363, + "learning_rate": 1.2269682275708951e-05, + "loss": 0.6629395484924316, + "step": 319 + }, + { + "epoch": 0.9782193351165457, + "grad_norm": 0.3455665111541748, + "learning_rate": 1.2217792652022452e-05, + "loss": 0.7750409841537476, + "step": 320 + }, + { + "epoch": 0.9812762705387849, + "grad_norm": 0.27122899889945984, + "learning_rate": 1.2165840147886656e-05, + "loss": 0.6742854118347168, + "step": 321 + }, + { + "epoch": 0.984333205961024, + "grad_norm": 0.2357456535100937, + "learning_rate": 1.2113826236296245e-05, + "loss": 0.7265107035636902, + "step": 322 + }, + { + "epoch": 0.9873901413832633, + "grad_norm": 0.21315616369247437, + "learning_rate": 1.2061752391986982e-05, + "loss": 0.7203768491744995, + "step": 323 + }, + { + "epoch": 0.9904470768055025, + "grad_norm": 0.24696163833141327, + "learning_rate": 1.2009620091393885e-05, + "loss": 0.8011739253997803, + "step": 324 + }, + { + "epoch": 0.9935040122277417, + "grad_norm": 0.246279776096344, + "learning_rate": 1.1957430812609361e-05, + "loss": 0.7316861152648926, + "step": 325 + }, + { + "epoch": 0.9965609476499809, + "grad_norm": 0.26160112023353577, + "learning_rate": 1.1905186035341304e-05, + "loss": 0.6602386236190796, + "step": 326 + }, + { + "epoch": 0.9996178830722201, + "grad_norm": 0.27144137024879456, + "learning_rate": 1.1852887240871145e-05, + "loss": 0.7162635326385498, + "step": 327 + }, + { + "epoch": 1.0, + "grad_norm": 0.6650471091270447, + "learning_rate": 1.1800535912011846e-05, + "loss": 0.6108165383338928, + "step": 328 + }, + { + "epoch": 1.0030569354222392, + "grad_norm": 0.25604233145713806, + "learning_rate": 1.1748133533065864e-05, + "loss": 0.6724814176559448, + "step": 329 + }, + { + "epoch": 1.0061138708444783, + "grad_norm": 0.30289238691329956, + "learning_rate": 1.1695681589783065e-05, + "loss": 0.7010799050331116, + "step": 330 + }, + { + "epoch": 1.0091708062667175, + "grad_norm": 0.28697144985198975, + "learning_rate": 1.1643181569318596e-05, + "loss": 0.7199532985687256, + "step": 331 + }, + { + "epoch": 1.012227741688957, + "grad_norm": 0.26302677392959595, + "learning_rate": 1.1590634960190722e-05, + "loss": 0.6887974143028259, + "step": 332 + }, + { + "epoch": 1.015284677111196, + "grad_norm": 0.2987605631351471, + "learning_rate": 1.1538043252238629e-05, + "loss": 0.7237250208854675, + "step": 333 + }, + { + "epoch": 1.0183416125334352, + "grad_norm": 0.25947025418281555, + "learning_rate": 1.1485407936580169e-05, + "loss": 0.7092999815940857, + "step": 334 + }, + { + "epoch": 1.0213985479556744, + "grad_norm": 0.3119892477989197, + "learning_rate": 1.1432730505569597e-05, + "loss": 0.6797397136688232, + "step": 335 + }, + { + "epoch": 1.0244554833779136, + "grad_norm": 0.2772631347179413, + "learning_rate": 1.1380012452755259e-05, + "loss": 0.7330094575881958, + "step": 336 + }, + { + "epoch": 1.0275124188001528, + "grad_norm": 0.34601089358329773, + "learning_rate": 1.1327255272837221e-05, + "loss": 0.711042582988739, + "step": 337 + }, + { + "epoch": 1.0305693542223922, + "grad_norm": 0.30404818058013916, + "learning_rate": 1.1274460461624925e-05, + "loss": 0.6593371033668518, + "step": 338 + }, + { + "epoch": 1.0336262896446313, + "grad_norm": 0.249643474817276, + "learning_rate": 1.1221629515994754e-05, + "loss": 0.7230923175811768, + "step": 339 + }, + { + "epoch": 1.0366832250668705, + "grad_norm": 0.2772657871246338, + "learning_rate": 1.1168763933847608e-05, + "loss": 0.6847513914108276, + "step": 340 + }, + { + "epoch": 1.0397401604891097, + "grad_norm": 0.3479171395301819, + "learning_rate": 1.1115865214066414e-05, + "loss": 0.673307478427887, + "step": 341 + }, + { + "epoch": 1.0427970959113488, + "grad_norm": 0.3393602669239044, + "learning_rate": 1.1062934856473655e-05, + "loss": 0.7529383897781372, + "step": 342 + }, + { + "epoch": 1.045854031333588, + "grad_norm": 0.22780737280845642, + "learning_rate": 1.1009974361788822e-05, + "loss": 0.6309706568717957, + "step": 343 + }, + { + "epoch": 1.0489109667558272, + "grad_norm": 0.2966362237930298, + "learning_rate": 1.095698523158588e-05, + "loss": 0.6944005489349365, + "step": 344 + }, + { + "epoch": 1.0519679021780666, + "grad_norm": 0.27519309520721436, + "learning_rate": 1.0903968968250682e-05, + "loss": 0.6714650392532349, + "step": 345 + }, + { + "epoch": 1.0550248376003057, + "grad_norm": 0.36684176325798035, + "learning_rate": 1.085092707493839e-05, + "loss": 0.6740344762802124, + "step": 346 + }, + { + "epoch": 1.058081773022545, + "grad_norm": 0.35729631781578064, + "learning_rate": 1.0797861055530832e-05, + "loss": 0.6590248942375183, + "step": 347 + }, + { + "epoch": 1.061138708444784, + "grad_norm": 0.33536043763160706, + "learning_rate": 1.0744772414593889e-05, + "loss": 0.7020372748374939, + "step": 348 + }, + { + "epoch": 1.0641956438670233, + "grad_norm": 0.3144095838069916, + "learning_rate": 1.0691662657334815e-05, + "loss": 0.7195531725883484, + "step": 349 + }, + { + "epoch": 1.0672525792892624, + "grad_norm": 0.37244805693626404, + "learning_rate": 1.0638533289559574e-05, + "loss": 0.6678342819213867, + "step": 350 + }, + { + "epoch": 1.0672525792892624, + "eval_loss": 0.6917262673377991, + "eval_runtime": 874.9693, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 350 + }, + { + "epoch": 1.0703095147115018, + "grad_norm": 0.45918041467666626, + "learning_rate": 1.0585385817630137e-05, + "loss": 0.6641817092895508, + "step": 351 + }, + { + "epoch": 1.073366450133741, + "grad_norm": 0.4126392900943756, + "learning_rate": 1.0532221748421786e-05, + "loss": 0.6774541139602661, + "step": 352 + }, + { + "epoch": 1.0764233855559802, + "grad_norm": 0.5425148606300354, + "learning_rate": 1.047904258928037e-05, + "loss": 0.7386555075645447, + "step": 353 + }, + { + "epoch": 1.0794803209782193, + "grad_norm": 0.40561115741729736, + "learning_rate": 1.0425849847979586e-05, + "loss": 0.7061327695846558, + "step": 354 + }, + { + "epoch": 1.0825372564004585, + "grad_norm": 0.489343523979187, + "learning_rate": 1.0372645032678215e-05, + "loss": 0.7486766576766968, + "step": 355 + }, + { + "epoch": 1.0855941918226977, + "grad_norm": 0.7414161562919617, + "learning_rate": 1.031942965187738e-05, + "loss": 0.7111566066741943, + "step": 356 + }, + { + "epoch": 1.0886511272449368, + "grad_norm": 0.308473140001297, + "learning_rate": 1.026620521437775e-05, + "loss": 0.7629879713058472, + "step": 357 + }, + { + "epoch": 1.0917080626671762, + "grad_norm": 0.27350732684135437, + "learning_rate": 1.0212973229236787e-05, + "loss": 0.7136012315750122, + "step": 358 + }, + { + "epoch": 1.0947649980894154, + "grad_norm": 0.37481266260147095, + "learning_rate": 1.0159735205725949e-05, + "loss": 0.6634767055511475, + "step": 359 + }, + { + "epoch": 1.0978219335116546, + "grad_norm": 0.2903526723384857, + "learning_rate": 1.0106492653287893e-05, + "loss": 0.6604923009872437, + "step": 360 + }, + { + "epoch": 1.1008788689338938, + "grad_norm": 0.372989296913147, + "learning_rate": 1.0053247081493684e-05, + "loss": 0.6701731085777283, + "step": 361 + }, + { + "epoch": 1.103935804356133, + "grad_norm": 0.38386791944503784, + "learning_rate": 1e-05, + "loss": 0.6767977476119995, + "step": 362 + }, + { + "epoch": 1.106992739778372, + "grad_norm": 0.2837046682834625, + "learning_rate": 9.946752918506319e-06, + "loss": 0.5886228680610657, + "step": 363 + }, + { + "epoch": 1.1100496752006115, + "grad_norm": 0.3196772038936615, + "learning_rate": 9.893507346712112e-06, + "loss": 0.6662254929542542, + "step": 364 + }, + { + "epoch": 1.1131066106228507, + "grad_norm": 0.36623135209083557, + "learning_rate": 9.840264794274053e-06, + "loss": 0.6507357954978943, + "step": 365 + }, + { + "epoch": 1.1161635460450898, + "grad_norm": 0.2803555727005005, + "learning_rate": 9.787026770763216e-06, + "loss": 0.6636874675750732, + "step": 366 + }, + { + "epoch": 1.119220481467329, + "grad_norm": 0.329513818025589, + "learning_rate": 9.733794785622254e-06, + "loss": 0.6378857493400574, + "step": 367 + }, + { + "epoch": 1.1222774168895682, + "grad_norm": 0.24419358372688293, + "learning_rate": 9.680570348122626e-06, + "loss": 0.6794115900993347, + "step": 368 + }, + { + "epoch": 1.1253343523118073, + "grad_norm": 0.2971822917461395, + "learning_rate": 9.627354967321785e-06, + "loss": 0.6401248574256897, + "step": 369 + }, + { + "epoch": 1.1283912877340465, + "grad_norm": 0.5112190842628479, + "learning_rate": 9.574150152020415e-06, + "loss": 0.6886081695556641, + "step": 370 + }, + { + "epoch": 1.131448223156286, + "grad_norm": 0.4284913241863251, + "learning_rate": 9.520957410719632e-06, + "loss": 0.6842222213745117, + "step": 371 + }, + { + "epoch": 1.134505158578525, + "grad_norm": 0.34164664149284363, + "learning_rate": 9.467778251578217e-06, + "loss": 0.6238314509391785, + "step": 372 + }, + { + "epoch": 1.1375620940007642, + "grad_norm": 0.3294171392917633, + "learning_rate": 9.414614182369862e-06, + "loss": 0.6947107911109924, + "step": 373 + }, + { + "epoch": 1.1406190294230034, + "grad_norm": 0.2544155418872833, + "learning_rate": 9.361466710440428e-06, + "loss": 0.717319905757904, + "step": 374 + }, + { + "epoch": 1.1436759648452426, + "grad_norm": 0.3111848533153534, + "learning_rate": 9.308337342665188e-06, + "loss": 0.6222032904624939, + "step": 375 + }, + { + "epoch": 1.1467329002674818, + "grad_norm": 0.3157130777835846, + "learning_rate": 9.255227585406116e-06, + "loss": 0.6126186847686768, + "step": 376 + }, + { + "epoch": 1.1497898356897212, + "grad_norm": 0.29625123739242554, + "learning_rate": 9.202138944469168e-06, + "loss": 0.7452324032783508, + "step": 377 + }, + { + "epoch": 1.1528467711119603, + "grad_norm": 0.31600719690322876, + "learning_rate": 9.149072925061614e-06, + "loss": 0.715571403503418, + "step": 378 + }, + { + "epoch": 1.1559037065341995, + "grad_norm": 0.25878727436065674, + "learning_rate": 9.096031031749321e-06, + "loss": 0.7256120443344116, + "step": 379 + }, + { + "epoch": 1.1589606419564387, + "grad_norm": 0.4058121144771576, + "learning_rate": 9.043014768414125e-06, + "loss": 0.6728136539459229, + "step": 380 + }, + { + "epoch": 1.1620175773786778, + "grad_norm": 0.31269821524620056, + "learning_rate": 8.99002563821118e-06, + "loss": 0.6662668585777283, + "step": 381 + }, + { + "epoch": 1.165074512800917, + "grad_norm": 0.2512218654155731, + "learning_rate": 8.937065143526349e-06, + "loss": 0.6415850520133972, + "step": 382 + }, + { + "epoch": 1.1681314482231562, + "grad_norm": 0.3284171223640442, + "learning_rate": 8.884134785933588e-06, + "loss": 0.6695276498794556, + "step": 383 + }, + { + "epoch": 1.1711883836453956, + "grad_norm": 0.2994699478149414, + "learning_rate": 8.831236066152397e-06, + "loss": 0.7347006797790527, + "step": 384 + }, + { + "epoch": 1.1742453190676347, + "grad_norm": 0.2981257140636444, + "learning_rate": 8.778370484005245e-06, + "loss": 0.6707600951194763, + "step": 385 + }, + { + "epoch": 1.177302254489874, + "grad_norm": 0.2934776842594147, + "learning_rate": 8.725539538375078e-06, + "loss": 0.7245328426361084, + "step": 386 + }, + { + "epoch": 1.180359189912113, + "grad_norm": 0.33115988969802856, + "learning_rate": 8.672744727162782e-06, + "loss": 0.7029488682746887, + "step": 387 + }, + { + "epoch": 1.1834161253343523, + "grad_norm": 0.3322703540325165, + "learning_rate": 8.619987547244746e-06, + "loss": 0.6896190643310547, + "step": 388 + }, + { + "epoch": 1.1864730607565914, + "grad_norm": 0.29254966974258423, + "learning_rate": 8.567269494430404e-06, + "loss": 0.6859920620918274, + "step": 389 + }, + { + "epoch": 1.1895299961788308, + "grad_norm": 0.2923297584056854, + "learning_rate": 8.514592063419833e-06, + "loss": 0.6437527537345886, + "step": 390 + }, + { + "epoch": 1.19258693160107, + "grad_norm": 0.3074567914009094, + "learning_rate": 8.461956747761375e-06, + "loss": 0.7113338708877563, + "step": 391 + }, + { + "epoch": 1.1956438670233092, + "grad_norm": 0.3027377128601074, + "learning_rate": 8.409365039809282e-06, + "loss": 0.7111615538597107, + "step": 392 + }, + { + "epoch": 1.1987008024455483, + "grad_norm": 0.28992199897766113, + "learning_rate": 8.356818430681409e-06, + "loss": 0.7768589854240417, + "step": 393 + }, + { + "epoch": 1.2017577378677875, + "grad_norm": 0.2630784213542938, + "learning_rate": 8.304318410216937e-06, + "loss": 0.5940375328063965, + "step": 394 + }, + { + "epoch": 1.2048146732900267, + "grad_norm": 0.30487746000289917, + "learning_rate": 8.251866466934137e-06, + "loss": 0.6600077748298645, + "step": 395 + }, + { + "epoch": 1.2078716087122658, + "grad_norm": 0.4152087867259979, + "learning_rate": 8.199464087988158e-06, + "loss": 0.6806260347366333, + "step": 396 + }, + { + "epoch": 1.2109285441345052, + "grad_norm": 0.32374435663223267, + "learning_rate": 8.147112759128859e-06, + "loss": 0.7205727100372314, + "step": 397 + }, + { + "epoch": 1.2139854795567444, + "grad_norm": 0.3009904623031616, + "learning_rate": 8.094813964658698e-06, + "loss": 0.6570584774017334, + "step": 398 + }, + { + "epoch": 1.2170424149789836, + "grad_norm": 0.5213649272918701, + "learning_rate": 8.042569187390642e-06, + "loss": 0.6663621664047241, + "step": 399 + }, + { + "epoch": 1.2200993504012227, + "grad_norm": 0.30124184489250183, + "learning_rate": 7.990379908606118e-06, + "loss": 0.672550618648529, + "step": 400 + }, + { + "epoch": 1.2200993504012227, + "eval_loss": 0.6789794564247131, + "eval_runtime": 875.5101, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 400 + }, + { + "epoch": 1.223156285823462, + "grad_norm": 0.31681662797927856, + "learning_rate": 7.938247608013021e-06, + "loss": 0.682239830493927, + "step": 401 + }, + { + "epoch": 1.226213221245701, + "grad_norm": 0.29261210560798645, + "learning_rate": 7.886173763703757e-06, + "loss": 0.6976956725120544, + "step": 402 + }, + { + "epoch": 1.2292701566679405, + "grad_norm": 0.32044124603271484, + "learning_rate": 7.834159852113347e-06, + "loss": 0.6931061744689941, + "step": 403 + }, + { + "epoch": 1.2323270920901797, + "grad_norm": 0.36050841212272644, + "learning_rate": 7.78220734797755e-06, + "loss": 0.7304666638374329, + "step": 404 + }, + { + "epoch": 1.2353840275124188, + "grad_norm": 0.31268882751464844, + "learning_rate": 7.73031772429105e-06, + "loss": 0.5944494605064392, + "step": 405 + }, + { + "epoch": 1.238440962934658, + "grad_norm": 0.33469483256340027, + "learning_rate": 7.678492452265713e-06, + "loss": 0.708702802658081, + "step": 406 + }, + { + "epoch": 1.2414978983568972, + "grad_norm": 0.2789304852485657, + "learning_rate": 7.626733001288852e-06, + "loss": 0.614046037197113, + "step": 407 + }, + { + "epoch": 1.2445548337791363, + "grad_norm": 0.42240089178085327, + "learning_rate": 7.575040838881578e-06, + "loss": 0.7044576406478882, + "step": 408 + }, + { + "epoch": 1.2476117692013755, + "grad_norm": 0.3652958571910858, + "learning_rate": 7.523417430657186e-06, + "loss": 0.7595829963684082, + "step": 409 + }, + { + "epoch": 1.250668704623615, + "grad_norm": 0.28300684690475464, + "learning_rate": 7.471864240279598e-06, + "loss": 0.7289992570877075, + "step": 410 + }, + { + "epoch": 1.253725640045854, + "grad_norm": 0.3463844358921051, + "learning_rate": 7.420382729421883e-06, + "loss": 0.7410037517547607, + "step": 411 + }, + { + "epoch": 1.2567825754680932, + "grad_norm": 0.30792665481567383, + "learning_rate": 7.368974357724789e-06, + "loss": 0.6920305490493774, + "step": 412 + }, + { + "epoch": 1.2598395108903324, + "grad_norm": 0.4354027509689331, + "learning_rate": 7.317640582755373e-06, + "loss": 0.6581035256385803, + "step": 413 + }, + { + "epoch": 1.2628964463125716, + "grad_norm": 0.5033990144729614, + "learning_rate": 7.266382859965673e-06, + "loss": 0.7377368211746216, + "step": 414 + }, + { + "epoch": 1.265953381734811, + "grad_norm": 0.30040669441223145, + "learning_rate": 7.2152026426514395e-06, + "loss": 0.7075121402740479, + "step": 415 + }, + { + "epoch": 1.2690103171570501, + "grad_norm": 0.25443559885025024, + "learning_rate": 7.164101381910939e-06, + "loss": 0.6314805150032043, + "step": 416 + }, + { + "epoch": 1.2720672525792893, + "grad_norm": 0.3807917535305023, + "learning_rate": 7.113080526603793e-06, + "loss": 0.6594043970108032, + "step": 417 + }, + { + "epoch": 1.2751241880015285, + "grad_norm": 0.40388163924217224, + "learning_rate": 7.062141523309918e-06, + "loss": 0.7092217206954956, + "step": 418 + }, + { + "epoch": 1.2781811234237677, + "grad_norm": 0.31380078196525574, + "learning_rate": 7.011285816288496e-06, + "loss": 0.6039083003997803, + "step": 419 + }, + { + "epoch": 1.2812380588460068, + "grad_norm": 0.3492945730686188, + "learning_rate": 6.96051484743705e-06, + "loss": 0.648531973361969, + "step": 420 + }, + { + "epoch": 1.284294994268246, + "grad_norm": 0.2891562283039093, + "learning_rate": 6.909830056250527e-06, + "loss": 0.6646198630332947, + "step": 421 + }, + { + "epoch": 1.2873519296904852, + "grad_norm": 0.316986083984375, + "learning_rate": 6.859232879780515e-06, + "loss": 0.7188717126846313, + "step": 422 + }, + { + "epoch": 1.2904088651127246, + "grad_norm": 0.38996225595474243, + "learning_rate": 6.8087247525944745e-06, + "loss": 0.6890851855278015, + "step": 423 + }, + { + "epoch": 1.2934658005349637, + "grad_norm": 0.3303278684616089, + "learning_rate": 6.758307106735094e-06, + "loss": 0.7118897438049316, + "step": 424 + }, + { + "epoch": 1.296522735957203, + "grad_norm": 0.26401078701019287, + "learning_rate": 6.707981371679657e-06, + "loss": 0.6749597787857056, + "step": 425 + }, + { + "epoch": 1.299579671379442, + "grad_norm": 0.3269912898540497, + "learning_rate": 6.657748974299529e-06, + "loss": 0.6718383431434631, + "step": 426 + }, + { + "epoch": 1.3026366068016813, + "grad_norm": 0.35413047671318054, + "learning_rate": 6.607611338819697e-06, + "loss": 0.6674888134002686, + "step": 427 + }, + { + "epoch": 1.3056935422239206, + "grad_norm": 0.44566094875335693, + "learning_rate": 6.557569886778401e-06, + "loss": 0.6900228261947632, + "step": 428 + }, + { + "epoch": 1.3087504776461598, + "grad_norm": 0.3536953628063202, + "learning_rate": 6.507626036986804e-06, + "loss": 0.6681596040725708, + "step": 429 + }, + { + "epoch": 1.311807413068399, + "grad_norm": 0.43866440653800964, + "learning_rate": 6.457781205488791e-06, + "loss": 0.7463353872299194, + "step": 430 + }, + { + "epoch": 1.3148643484906382, + "grad_norm": 0.32117530703544617, + "learning_rate": 6.408036805520801e-06, + "loss": 0.7138527035713196, + "step": 431 + }, + { + "epoch": 1.3179212839128773, + "grad_norm": 0.3075023293495178, + "learning_rate": 6.358394247471779e-06, + "loss": 0.6958800554275513, + "step": 432 + }, + { + "epoch": 1.3209782193351165, + "grad_norm": 0.31068870425224304, + "learning_rate": 6.308854938843161e-06, + "loss": 0.6728611588478088, + "step": 433 + }, + { + "epoch": 1.3240351547573557, + "grad_norm": 0.2871341407299042, + "learning_rate": 6.259420284208987e-06, + "loss": 0.6983805894851685, + "step": 434 + }, + { + "epoch": 1.3270920901795948, + "grad_norm": 0.3626168966293335, + "learning_rate": 6.210091685176067e-06, + "loss": 0.6707543134689331, + "step": 435 + }, + { + "epoch": 1.3301490256018342, + "grad_norm": 0.2960391640663147, + "learning_rate": 6.160870540344261e-06, + "loss": 0.6212095618247986, + "step": 436 + }, + { + "epoch": 1.3332059610240734, + "grad_norm": 0.29114195704460144, + "learning_rate": 6.111758245266795e-06, + "loss": 0.695442795753479, + "step": 437 + }, + { + "epoch": 1.3362628964463126, + "grad_norm": 0.2911393642425537, + "learning_rate": 6.0627561924107145e-06, + "loss": 0.7576844096183777, + "step": 438 + }, + { + "epoch": 1.3393198318685517, + "grad_norm": 0.2754829227924347, + "learning_rate": 6.013865771117394e-06, + "loss": 0.7611621022224426, + "step": 439 + }, + { + "epoch": 1.342376767290791, + "grad_norm": 0.47688090801239014, + "learning_rate": 5.965088367563162e-06, + "loss": 0.6706432104110718, + "step": 440 + }, + { + "epoch": 1.3454337027130303, + "grad_norm": 0.38662102818489075, + "learning_rate": 5.916425364719975e-06, + "loss": 0.7257411479949951, + "step": 441 + }, + { + "epoch": 1.3484906381352695, + "grad_norm": 0.29597020149230957, + "learning_rate": 5.867878142316221e-06, + "loss": 0.6695491671562195, + "step": 442 + }, + { + "epoch": 1.3515475735575087, + "grad_norm": 0.36503320932388306, + "learning_rate": 5.8194480767976e-06, + "loss": 0.6762661933898926, + "step": 443 + }, + { + "epoch": 1.3546045089797478, + "grad_norm": 0.29297393560409546, + "learning_rate": 5.7711365412880895e-06, + "loss": 0.6601616740226746, + "step": 444 + }, + { + "epoch": 1.357661444401987, + "grad_norm": 0.3229820430278778, + "learning_rate": 5.7229449055510335e-06, + "loss": 0.7049432992935181, + "step": 445 + }, + { + "epoch": 1.3607183798242262, + "grad_norm": 0.3359116017818451, + "learning_rate": 5.674874535950279e-06, + "loss": 0.6643913388252258, + "step": 446 + }, + { + "epoch": 1.3637753152464653, + "grad_norm": 0.349298357963562, + "learning_rate": 5.626926795411447e-06, + "loss": 0.7177180647850037, + "step": 447 + }, + { + "epoch": 1.3668322506687045, + "grad_norm": 0.30045273900032043, + "learning_rate": 5.579103043383305e-06, + "loss": 0.6765077710151672, + "step": 448 + }, + { + "epoch": 1.369889186090944, + "grad_norm": 0.3676189184188843, + "learning_rate": 5.531404635799191e-06, + "loss": 0.6421419978141785, + "step": 449 + }, + { + "epoch": 1.372946121513183, + "grad_norm": 0.3337932527065277, + "learning_rate": 5.4838329250386076e-06, + "loss": 0.649316668510437, + "step": 450 + }, + { + "epoch": 1.372946121513183, + "eval_loss": 0.6703284978866577, + "eval_runtime": 907.8663, + "eval_samples_per_second": 0.664, + "eval_steps_per_second": 0.664, + "step": 450 + }, + { + "epoch": 1.3760030569354222, + "grad_norm": 0.314387708902359, + "learning_rate": 5.436389259888841e-06, + "loss": 0.7333119511604309, + "step": 451 + }, + { + "epoch": 1.3790599923576614, + "grad_norm": 0.4056478440761566, + "learning_rate": 5.38907498550674e-06, + "loss": 0.6451212763786316, + "step": 452 + }, + { + "epoch": 1.3821169277799006, + "grad_norm": 0.42358386516571045, + "learning_rate": 5.341891443380585e-06, + "loss": 0.6462752819061279, + "step": 453 + }, + { + "epoch": 1.38517386320214, + "grad_norm": 0.3606562912464142, + "learning_rate": 5.294839971292026e-06, + "loss": 0.717352569103241, + "step": 454 + }, + { + "epoch": 1.3882307986243791, + "grad_norm": 0.3014855682849884, + "learning_rate": 5.247921903278177e-06, + "loss": 0.7015582323074341, + "step": 455 + }, + { + "epoch": 1.3912877340466183, + "grad_norm": 0.5155187845230103, + "learning_rate": 5.20113856959378e-06, + "loss": 0.6660122275352478, + "step": 456 + }, + { + "epoch": 1.3943446694688575, + "grad_norm": 0.35195642709732056, + "learning_rate": 5.1544912966735e-06, + "loss": 0.6980377435684204, + "step": 457 + }, + { + "epoch": 1.3974016048910967, + "grad_norm": 0.28842753171920776, + "learning_rate": 5.1079814070943e-06, + "loss": 0.6926653385162354, + "step": 458 + }, + { + "epoch": 1.4004585403133358, + "grad_norm": 0.354425311088562, + "learning_rate": 5.06161021953796e-06, + "loss": 0.6412813067436218, + "step": 459 + }, + { + "epoch": 1.403515475735575, + "grad_norm": 0.30584967136383057, + "learning_rate": 5.015379048753669e-06, + "loss": 0.6897266507148743, + "step": 460 + }, + { + "epoch": 1.4065724111578142, + "grad_norm": 0.3659093677997589, + "learning_rate": 4.9692892055207784e-06, + "loss": 0.6777257919311523, + "step": 461 + }, + { + "epoch": 1.4096293465800536, + "grad_norm": 0.6798201203346252, + "learning_rate": 4.923341996611604e-06, + "loss": 0.7499118447303772, + "step": 462 + }, + { + "epoch": 1.4126862820022927, + "grad_norm": 0.36423686146736145, + "learning_rate": 4.877538724754392e-06, + "loss": 0.6341705322265625, + "step": 463 + }, + { + "epoch": 1.415743217424532, + "grad_norm": 0.29527905583381653, + "learning_rate": 4.831880688596392e-06, + "loss": 0.566770076751709, + "step": 464 + }, + { + "epoch": 1.418800152846771, + "grad_norm": 0.3342158794403076, + "learning_rate": 4.7863691826670146e-06, + "loss": 0.6926667094230652, + "step": 465 + }, + { + "epoch": 1.4218570882690102, + "grad_norm": 0.35585087537765503, + "learning_rate": 4.741005497341154e-06, + "loss": 0.6302958130836487, + "step": 466 + }, + { + "epoch": 1.4249140236912496, + "grad_norm": 0.5740730166435242, + "learning_rate": 4.695790918802577e-06, + "loss": 0.7842360138893127, + "step": 467 + }, + { + "epoch": 1.4279709591134888, + "grad_norm": 0.4422702491283417, + "learning_rate": 4.650726729007465e-06, + "loss": 0.6199318766593933, + "step": 468 + }, + { + "epoch": 1.431027894535728, + "grad_norm": 0.3458646833896637, + "learning_rate": 4.605814205648087e-06, + "loss": 0.7013853788375854, + "step": 469 + }, + { + "epoch": 1.4340848299579672, + "grad_norm": 0.326727956533432, + "learning_rate": 4.56105462211654e-06, + "loss": 0.7208451628684998, + "step": 470 + }, + { + "epoch": 1.4371417653802063, + "grad_norm": 0.3491531014442444, + "learning_rate": 4.516449247468666e-06, + "loss": 0.6491535902023315, + "step": 471 + }, + { + "epoch": 1.4401987008024455, + "grad_norm": 0.31401777267456055, + "learning_rate": 4.4719993463880695e-06, + "loss": 0.6603784561157227, + "step": 472 + }, + { + "epoch": 1.4432556362246847, + "grad_norm": 0.3741454780101776, + "learning_rate": 4.427706179150247e-06, + "loss": 0.6068110466003418, + "step": 473 + }, + { + "epoch": 1.4463125716469238, + "grad_norm": 0.3205011188983917, + "learning_rate": 4.383571001586883e-06, + "loss": 0.6427788138389587, + "step": 474 + }, + { + "epoch": 1.4493695070691632, + "grad_norm": 0.2519795894622803, + "learning_rate": 4.339595065050206e-06, + "loss": 0.626676082611084, + "step": 475 + }, + { + "epoch": 1.4524264424914024, + "grad_norm": 0.3499923050403595, + "learning_rate": 4.29577961637754e-06, + "loss": 0.7192115187644958, + "step": 476 + }, + { + "epoch": 1.4554833779136416, + "grad_norm": 0.6267193555831909, + "learning_rate": 4.2521258978559324e-06, + "loss": 0.6705955862998962, + "step": 477 + }, + { + "epoch": 1.4585403133358807, + "grad_norm": 0.5547561049461365, + "learning_rate": 4.208635147186956e-06, + "loss": 0.6040648818016052, + "step": 478 + }, + { + "epoch": 1.46159724875812, + "grad_norm": 0.2949749529361725, + "learning_rate": 4.165308597451586e-06, + "loss": 0.6205201148986816, + "step": 479 + }, + { + "epoch": 1.4646541841803593, + "grad_norm": 0.2873048782348633, + "learning_rate": 4.12214747707527e-06, + "loss": 0.6886979937553406, + "step": 480 + }, + { + "epoch": 1.4677111196025985, + "grad_norm": 0.33694973587989807, + "learning_rate": 4.079153009793068e-06, + "loss": 0.6656784415245056, + "step": 481 + }, + { + "epoch": 1.4707680550248377, + "grad_norm": 0.3373357057571411, + "learning_rate": 4.036326414614985e-06, + "loss": 0.6573168635368347, + "step": 482 + }, + { + "epoch": 1.4738249904470768, + "grad_norm": 0.3189850151538849, + "learning_rate": 3.99366890579139e-06, + "loss": 0.6631187200546265, + "step": 483 + }, + { + "epoch": 1.476881925869316, + "grad_norm": 0.34659212827682495, + "learning_rate": 3.951181692778594e-06, + "loss": 0.5881021022796631, + "step": 484 + }, + { + "epoch": 1.4799388612915552, + "grad_norm": 0.4184463918209076, + "learning_rate": 3.908865980204555e-06, + "loss": 0.7232425212860107, + "step": 485 + }, + { + "epoch": 1.4829957967137943, + "grad_norm": 0.3163282573223114, + "learning_rate": 3.86672296783474e-06, + "loss": 0.6624961495399475, + "step": 486 + }, + { + "epoch": 1.4860527321360335, + "grad_norm": 0.3175446689128876, + "learning_rate": 3.824753850538082e-06, + "loss": 0.6616235971450806, + "step": 487 + }, + { + "epoch": 1.489109667558273, + "grad_norm": 0.3493629992008209, + "learning_rate": 3.782959818253126e-06, + "loss": 0.6923587918281555, + "step": 488 + }, + { + "epoch": 1.492166602980512, + "grad_norm": 0.30385154485702515, + "learning_rate": 3.741342055954269e-06, + "loss": 0.6668528914451599, + "step": 489 + }, + { + "epoch": 1.4952235384027512, + "grad_norm": 0.319979727268219, + "learning_rate": 3.699901743618194e-06, + "loss": 0.6276881098747253, + "step": 490 + }, + { + "epoch": 1.4982804738249904, + "grad_norm": 0.28717750310897827, + "learning_rate": 3.658640056190378e-06, + "loss": 0.7676356434822083, + "step": 491 + }, + { + "epoch": 1.5013374092472298, + "grad_norm": 0.4701229929924011, + "learning_rate": 3.617558163551802e-06, + "loss": 0.6021715402603149, + "step": 492 + }, + { + "epoch": 1.504394344669469, + "grad_norm": 0.4959515929222107, + "learning_rate": 3.576657230485775e-06, + "loss": 0.7243677973747253, + "step": 493 + }, + { + "epoch": 1.5074512800917081, + "grad_norm": 0.32071781158447266, + "learning_rate": 3.5359384166449185e-06, + "loss": 0.7030311822891235, + "step": 494 + }, + { + "epoch": 1.5105082155139473, + "grad_norm": 0.3393514156341553, + "learning_rate": 3.4954028765182633e-06, + "loss": 0.6344490051269531, + "step": 495 + }, + { + "epoch": 1.5135651509361865, + "grad_norm": 0.273512065410614, + "learning_rate": 3.4550517593985512e-06, + "loss": 0.5816606879234314, + "step": 496 + }, + { + "epoch": 1.5166220863584257, + "grad_norm": 0.6631937026977539, + "learning_rate": 3.414886209349615e-06, + "loss": 0.6091232895851135, + "step": 497 + }, + { + "epoch": 1.5196790217806648, + "grad_norm": 0.6976932287216187, + "learning_rate": 3.3749073651739594e-06, + "loss": 0.7076858282089233, + "step": 498 + }, + { + "epoch": 1.522735957202904, + "grad_norm": 0.35580119490623474, + "learning_rate": 3.3351163603804805e-06, + "loss": 0.6363418698310852, + "step": 499 + }, + { + "epoch": 1.5257928926251432, + "grad_norm": 0.30289211869239807, + "learning_rate": 3.2955143231523067e-06, + "loss": 0.6716225147247314, + "step": 500 + }, + { + "epoch": 1.5257928926251432, + "eval_loss": 0.6648170948028564, + "eval_runtime": 870.3243, + "eval_samples_per_second": 0.693, + "eval_steps_per_second": 0.693, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 656, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.7558984717717996e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-500/training_args.bin b/cpt_qwen_14B/checkpoints/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eddbb43a2cebb928dbed6e955a37ebfa3174f4b5 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a8e308e47eb936f678712445b19ddc52638f354c37c813ecaa432f69120a2e +size 5201 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/README.md b/cpt_qwen_14B/checkpoints/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8dfda26032514233f3e70a4012f1cfd1ddbbb609 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Qwen2.5-Coder-14B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Qwen2.5-Coder-14B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/adapter_config.json b/cpt_qwen_14B/checkpoints/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81c31359285f7e351a44275c30b6882f4c6b50c0 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Qwen2.5-Coder-14B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/adapter_model.safetensors b/cpt_qwen_14B/checkpoints/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99d5939e2605b44abdbcc01e0cdccdd954c4b7ce --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:732e678c9e22bba352641afc71ed5fc2394671dd0d66707e288224822a906558 +size 201378736 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/chat_template.jinja b/cpt_qwen_14B/checkpoints/checkpoint-600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..28028c056af412405debd878cdda0171e35fa5d1 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/optimizer.pt b/cpt_qwen_14B/checkpoints/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..32a46cfd5d53db6a5d1e0dc1d4eff3c5d4d0eb59 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a1b384343dba430cd0c0bdf0b562562377b77be375c965d867ac051a2553247 +size 102698855 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/rng_state.pth b/cpt_qwen_14B/checkpoints/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b4cb5b30a3a307b59d3e554d1a2090c55a360565 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d138f30ed8262e22adbedb4292511830825af3e11ab2f7e38dbc4175c911121 +size 14645 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/scheduler.pt b/cpt_qwen_14B/checkpoints/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..757957c6fb1dd98034bdbb30304b1d96c06e3afb --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c85b1ab24be70571cc5f0b1d2b5ef99552fc0688a803aa516d4915fbb0825664 +size 1465 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer.json b/cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer_config.json b/cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..217274ef8275420e4bf3b976f3948901cd3d176f --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/trainer_state.json b/cpt_qwen_14B/checkpoints/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e29289561f8897066f23d17a66f36e98066a7d2c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/trainer_state.json @@ -0,0 +1,4330 @@ +{ + "best_global_step": 600, + "best_metric": 0.6604031324386597, + "best_model_checkpoint": "runs/cpt_run_14b/checkpoints/checkpoint-600", + "epoch": 1.8314864348490638, + "eval_steps": 50, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003056935422239205, + "grad_norm": 0.06516239047050476, + "learning_rate": 0.0, + "loss": 1.138384461402893, + "step": 1 + }, + { + "epoch": 0.00611387084447841, + "grad_norm": 0.05343673378229141, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.983342707157135, + "step": 2 + }, + { + "epoch": 0.009170806266717615, + "grad_norm": 0.05608418956398964, + "learning_rate": 6.060606060606061e-07, + "loss": 1.0762118101119995, + "step": 3 + }, + { + "epoch": 0.01222774168895682, + "grad_norm": 0.06523486226797104, + "learning_rate": 9.090909090909091e-07, + "loss": 1.084489345550537, + "step": 4 + }, + { + "epoch": 0.015284677111196026, + "grad_norm": 0.06582186371088028, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.2037022113800049, + "step": 5 + }, + { + "epoch": 0.01834161253343523, + "grad_norm": 0.06097998470067978, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.10005784034729, + "step": 6 + }, + { + "epoch": 0.021398547955674436, + "grad_norm": 0.10365528613328934, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.0895193815231323, + "step": 7 + }, + { + "epoch": 0.02445548337791364, + "grad_norm": 0.06312141567468643, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.0593242645263672, + "step": 8 + }, + { + "epoch": 0.027512418800152847, + "grad_norm": 0.05508403480052948, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.9772955179214478, + "step": 9 + }, + { + "epoch": 0.030569354222392053, + "grad_norm": 0.06006711348891258, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.084238886833191, + "step": 10 + }, + { + "epoch": 0.033626289644631255, + "grad_norm": 0.0588749423623085, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.0786534547805786, + "step": 11 + }, + { + "epoch": 0.03668322506687046, + "grad_norm": 0.046551357954740524, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.0370622873306274, + "step": 12 + }, + { + "epoch": 0.039740160489109666, + "grad_norm": 0.061659567058086395, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.0646986961364746, + "step": 13 + }, + { + "epoch": 0.04279709591134887, + "grad_norm": 0.06007347255945206, + "learning_rate": 3.93939393939394e-06, + "loss": 1.0311307907104492, + "step": 14 + }, + { + "epoch": 0.04585403133358808, + "grad_norm": 0.07314135134220123, + "learning_rate": 4.242424242424243e-06, + "loss": 1.1300500631332397, + "step": 15 + }, + { + "epoch": 0.04891096675582728, + "grad_norm": 0.060934022068977356, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0197452306747437, + "step": 16 + }, + { + "epoch": 0.05196790217806649, + "grad_norm": 0.056856051087379456, + "learning_rate": 4.848484848484849e-06, + "loss": 1.0438549518585205, + "step": 17 + }, + { + "epoch": 0.055024837600305694, + "grad_norm": 0.05908689647912979, + "learning_rate": 5.151515151515152e-06, + "loss": 1.0398856401443481, + "step": 18 + }, + { + "epoch": 0.0580817730225449, + "grad_norm": 0.07411840558052063, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.107885479927063, + "step": 19 + }, + { + "epoch": 0.061138708444784105, + "grad_norm": 0.0749165341258049, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.1060967445373535, + "step": 20 + }, + { + "epoch": 0.06419564386702331, + "grad_norm": 0.06720177084207535, + "learning_rate": 6.060606060606061e-06, + "loss": 1.0471720695495605, + "step": 21 + }, + { + "epoch": 0.06725257928926251, + "grad_norm": 0.05990725755691528, + "learning_rate": 6.363636363636364e-06, + "loss": 1.0944981575012207, + "step": 22 + }, + { + "epoch": 0.07030951471150172, + "grad_norm": 0.06672193855047226, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1477092504501343, + "step": 23 + }, + { + "epoch": 0.07336645013374092, + "grad_norm": 0.06145205348730087, + "learning_rate": 6.969696969696971e-06, + "loss": 1.0591784715652466, + "step": 24 + }, + { + "epoch": 0.07642338555598013, + "grad_norm": 0.0757482647895813, + "learning_rate": 7.272727272727273e-06, + "loss": 1.0500165224075317, + "step": 25 + }, + { + "epoch": 0.07948032097821933, + "grad_norm": 0.07848478108644485, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.0747522115707397, + "step": 26 + }, + { + "epoch": 0.08253725640045854, + "grad_norm": 0.07740631699562073, + "learning_rate": 7.87878787878788e-06, + "loss": 1.132310152053833, + "step": 27 + }, + { + "epoch": 0.08559419182269774, + "grad_norm": 0.07476603239774704, + "learning_rate": 8.181818181818183e-06, + "loss": 1.0339502096176147, + "step": 28 + }, + { + "epoch": 0.08865112724493696, + "grad_norm": 0.0779196098446846, + "learning_rate": 8.484848484848486e-06, + "loss": 1.1047282218933105, + "step": 29 + }, + { + "epoch": 0.09170806266717615, + "grad_norm": 0.06962384283542633, + "learning_rate": 8.787878787878788e-06, + "loss": 1.004916787147522, + "step": 30 + }, + { + "epoch": 0.09476499808941537, + "grad_norm": 0.06369175016880035, + "learning_rate": 9.090909090909091e-06, + "loss": 0.9296417832374573, + "step": 31 + }, + { + "epoch": 0.09782193351165457, + "grad_norm": 0.07470260560512543, + "learning_rate": 9.393939393939396e-06, + "loss": 1.0721708536148071, + "step": 32 + }, + { + "epoch": 0.10087886893389378, + "grad_norm": 0.07948213815689087, + "learning_rate": 9.696969696969698e-06, + "loss": 1.0350117683410645, + "step": 33 + }, + { + "epoch": 0.10393580435613298, + "grad_norm": 0.07066022604703903, + "learning_rate": 1e-05, + "loss": 1.026305913925171, + "step": 34 + }, + { + "epoch": 0.10699273977837218, + "grad_norm": 0.07774543762207031, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.0509816408157349, + "step": 35 + }, + { + "epoch": 0.11004967520061139, + "grad_norm": 0.07501248270273209, + "learning_rate": 1.0606060606060606e-05, + "loss": 1.0011574029922485, + "step": 36 + }, + { + "epoch": 0.11310661062285059, + "grad_norm": 0.6622501611709595, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.9754424691200256, + "step": 37 + }, + { + "epoch": 0.1161635460450898, + "grad_norm": 0.07566080242395401, + "learning_rate": 1.1212121212121212e-05, + "loss": 1.0342774391174316, + "step": 38 + }, + { + "epoch": 0.119220481467329, + "grad_norm": 0.07573831081390381, + "learning_rate": 1.1515151515151517e-05, + "loss": 0.9714518785476685, + "step": 39 + }, + { + "epoch": 0.12227741688956821, + "grad_norm": 0.08083852380514145, + "learning_rate": 1.181818181818182e-05, + "loss": 1.1050316095352173, + "step": 40 + }, + { + "epoch": 0.12533435231180742, + "grad_norm": 0.08540588617324829, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.0871070623397827, + "step": 41 + }, + { + "epoch": 0.12839128773404662, + "grad_norm": 0.07391592115163803, + "learning_rate": 1.2424242424242425e-05, + "loss": 1.0206722021102905, + "step": 42 + }, + { + "epoch": 0.13144822315628582, + "grad_norm": 0.07063689082860947, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.9775047898292542, + "step": 43 + }, + { + "epoch": 0.13450515857852502, + "grad_norm": 0.07288888841867447, + "learning_rate": 1.3030303030303032e-05, + "loss": 1.1132858991622925, + "step": 44 + }, + { + "epoch": 0.13756209400076425, + "grad_norm": 0.07641777396202087, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.0707701444625854, + "step": 45 + }, + { + "epoch": 0.14061902942300344, + "grad_norm": 0.06990326195955276, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.9328265190124512, + "step": 46 + }, + { + "epoch": 0.14367596484524264, + "grad_norm": 0.0834241658449173, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.0131721496582031, + "step": 47 + }, + { + "epoch": 0.14673290026748184, + "grad_norm": 0.0714937075972557, + "learning_rate": 1.4242424242424245e-05, + "loss": 0.940493106842041, + "step": 48 + }, + { + "epoch": 0.14978983568972107, + "grad_norm": 0.07770547270774841, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.0435771942138672, + "step": 49 + }, + { + "epoch": 0.15284677111196027, + "grad_norm": 0.07950945198535919, + "learning_rate": 1.484848484848485e-05, + "loss": 1.0382137298583984, + "step": 50 + }, + { + "epoch": 0.15284677111196027, + "eval_loss": 1.0129202604293823, + "eval_runtime": 724.3664, + "eval_samples_per_second": 0.832, + "eval_steps_per_second": 0.832, + "step": 50 + }, + { + "epoch": 0.15590370653419947, + "grad_norm": 0.06961936503648758, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.9690049886703491, + "step": 51 + }, + { + "epoch": 0.15896064195643866, + "grad_norm": 0.069523885846138, + "learning_rate": 1.5454545454545454e-05, + "loss": 0.9830482006072998, + "step": 52 + }, + { + "epoch": 0.16201757737867786, + "grad_norm": 0.0764622762799263, + "learning_rate": 1.575757575757576e-05, + "loss": 1.0895472764968872, + "step": 53 + }, + { + "epoch": 0.1650745128009171, + "grad_norm": 0.1413721889257431, + "learning_rate": 1.606060606060606e-05, + "loss": 1.0354574918746948, + "step": 54 + }, + { + "epoch": 0.1681314482231563, + "grad_norm": 0.06818042695522308, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.8534265160560608, + "step": 55 + }, + { + "epoch": 0.1711883836453955, + "grad_norm": 0.0722246989607811, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.9580274820327759, + "step": 56 + }, + { + "epoch": 0.17424531906763469, + "grad_norm": 0.07113443315029144, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.0721848011016846, + "step": 57 + }, + { + "epoch": 0.1773022544898739, + "grad_norm": 0.08412107080221176, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.1180150508880615, + "step": 58 + }, + { + "epoch": 0.1803591899121131, + "grad_norm": 0.07381036877632141, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.0384547710418701, + "step": 59 + }, + { + "epoch": 0.1834161253343523, + "grad_norm": 0.07089001685380936, + "learning_rate": 1.787878787878788e-05, + "loss": 1.0446016788482666, + "step": 60 + }, + { + "epoch": 0.1864730607565915, + "grad_norm": 0.11576953530311584, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0015051364898682, + "step": 61 + }, + { + "epoch": 0.18952999617883073, + "grad_norm": 0.08030868321657181, + "learning_rate": 1.8484848484848487e-05, + "loss": 0.9642710089683533, + "step": 62 + }, + { + "epoch": 0.19258693160106993, + "grad_norm": 0.08332342654466629, + "learning_rate": 1.8787878787878792e-05, + "loss": 1.0722991228103638, + "step": 63 + }, + { + "epoch": 0.19564386702330913, + "grad_norm": 0.08000365644693375, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.0104647874832153, + "step": 64 + }, + { + "epoch": 0.19870080244554833, + "grad_norm": 0.08139508217573166, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9445061087608337, + "step": 65 + }, + { + "epoch": 0.20175773786778756, + "grad_norm": 0.08749893307685852, + "learning_rate": 1.96969696969697e-05, + "loss": 1.080810308456421, + "step": 66 + }, + { + "epoch": 0.20481467329002676, + "grad_norm": 0.0786912813782692, + "learning_rate": 2e-05, + "loss": 0.9705753922462463, + "step": 67 + }, + { + "epoch": 0.20787160871226595, + "grad_norm": 0.08962028473615646, + "learning_rate": 1.9999858236410775e-05, + "loss": 0.962783694267273, + "step": 68 + }, + { + "epoch": 0.21092854413450515, + "grad_norm": 0.08402887731790543, + "learning_rate": 1.9999432949662483e-05, + "loss": 0.9959614872932434, + "step": 69 + }, + { + "epoch": 0.21398547955674435, + "grad_norm": 0.08036444336175919, + "learning_rate": 1.9998724151813157e-05, + "loss": 0.9569960832595825, + "step": 70 + }, + { + "epoch": 0.21704241497898358, + "grad_norm": 0.08247046917676926, + "learning_rate": 1.9997731862959143e-05, + "loss": 1.0012171268463135, + "step": 71 + }, + { + "epoch": 0.22009935040122278, + "grad_norm": 0.08966264873743057, + "learning_rate": 1.999645611123453e-05, + "loss": 1.0403809547424316, + "step": 72 + }, + { + "epoch": 0.22315628582346198, + "grad_norm": 0.08061660826206207, + "learning_rate": 1.999489693281034e-05, + "loss": 1.0089740753173828, + "step": 73 + }, + { + "epoch": 0.22621322124570117, + "grad_norm": 0.09005365520715714, + "learning_rate": 1.9993054371893526e-05, + "loss": 0.9333044290542603, + "step": 74 + }, + { + "epoch": 0.2292701566679404, + "grad_norm": 0.08651519566774368, + "learning_rate": 1.9990928480725694e-05, + "loss": 0.9284015893936157, + "step": 75 + }, + { + "epoch": 0.2323270920901796, + "grad_norm": 0.08141147345304489, + "learning_rate": 1.9988519319581637e-05, + "loss": 0.9782730340957642, + "step": 76 + }, + { + "epoch": 0.2353840275124188, + "grad_norm": 0.08344405144453049, + "learning_rate": 1.998582695676762e-05, + "loss": 0.9723064303398132, + "step": 77 + }, + { + "epoch": 0.238440962934658, + "grad_norm": 0.08019903302192688, + "learning_rate": 1.998285146861945e-05, + "loss": 0.9648997783660889, + "step": 78 + }, + { + "epoch": 0.24149789835689722, + "grad_norm": 0.08113416284322739, + "learning_rate": 1.99795929395003e-05, + "loss": 0.9263214468955994, + "step": 79 + }, + { + "epoch": 0.24455483377913642, + "grad_norm": 0.08127513527870178, + "learning_rate": 1.997605146179833e-05, + "loss": 0.8745232224464417, + "step": 80 + }, + { + "epoch": 0.24761176920137562, + "grad_norm": 0.09934187680482864, + "learning_rate": 1.997222713592405e-05, + "loss": 0.8722782135009766, + "step": 81 + }, + { + "epoch": 0.25066870462361485, + "grad_norm": 0.09701363742351532, + "learning_rate": 1.9968120070307503e-05, + "loss": 1.0084266662597656, + "step": 82 + }, + { + "epoch": 0.253725640045854, + "grad_norm": 0.08335654437541962, + "learning_rate": 1.9963730381395154e-05, + "loss": 0.9239332675933838, + "step": 83 + }, + { + "epoch": 0.25678257546809324, + "grad_norm": 0.09161650389432907, + "learning_rate": 1.9959058193646618e-05, + "loss": 0.9878032207489014, + "step": 84 + }, + { + "epoch": 0.2598395108903324, + "grad_norm": 0.08067663013935089, + "learning_rate": 1.9954103639531116e-05, + "loss": 0.9113098382949829, + "step": 85 + }, + { + "epoch": 0.26289644631257164, + "grad_norm": 0.09619539976119995, + "learning_rate": 1.9948866859523717e-05, + "loss": 0.9527600407600403, + "step": 86 + }, + { + "epoch": 0.26595338173481087, + "grad_norm": 0.10015493631362915, + "learning_rate": 1.9943348002101374e-05, + "loss": 0.9569152593612671, + "step": 87 + }, + { + "epoch": 0.26901031715705004, + "grad_norm": 0.09012345969676971, + "learning_rate": 1.993754722373869e-05, + "loss": 0.8912045359611511, + "step": 88 + }, + { + "epoch": 0.27206725257928926, + "grad_norm": 0.10342805832624435, + "learning_rate": 1.9931464688903502e-05, + "loss": 0.856104850769043, + "step": 89 + }, + { + "epoch": 0.2751241880015285, + "grad_norm": 0.10218493640422821, + "learning_rate": 1.9925100570052194e-05, + "loss": 0.9631397128105164, + "step": 90 + }, + { + "epoch": 0.27818112342376766, + "grad_norm": 0.10909046977758408, + "learning_rate": 1.9918455047624847e-05, + "loss": 0.8532565236091614, + "step": 91 + }, + { + "epoch": 0.2812380588460069, + "grad_norm": 0.10714197903871536, + "learning_rate": 1.9911528310040073e-05, + "loss": 0.9691859483718872, + "step": 92 + }, + { + "epoch": 0.28429499426824606, + "grad_norm": 0.1108694076538086, + "learning_rate": 1.990432055368971e-05, + "loss": 0.9374334812164307, + "step": 93 + }, + { + "epoch": 0.2873519296904853, + "grad_norm": 0.10037308186292648, + "learning_rate": 1.989683198293324e-05, + "loss": 0.9166896343231201, + "step": 94 + }, + { + "epoch": 0.2904088651127245, + "grad_norm": 0.10246684402227402, + "learning_rate": 1.9889062810092002e-05, + "loss": 1.0059239864349365, + "step": 95 + }, + { + "epoch": 0.2934658005349637, + "grad_norm": 0.09954962879419327, + "learning_rate": 1.9881013255443152e-05, + "loss": 1.00413179397583, + "step": 96 + }, + { + "epoch": 0.2965227359572029, + "grad_norm": 0.11006761342287064, + "learning_rate": 1.9872683547213446e-05, + "loss": 0.9414035677909851, + "step": 97 + }, + { + "epoch": 0.29957967137944214, + "grad_norm": 0.1014382541179657, + "learning_rate": 1.9864073921572756e-05, + "loss": 0.9155468940734863, + "step": 98 + }, + { + "epoch": 0.3026366068016813, + "grad_norm": 0.09883157908916473, + "learning_rate": 1.9855184622627362e-05, + "loss": 0.9429305195808411, + "step": 99 + }, + { + "epoch": 0.30569354222392053, + "grad_norm": 0.11199072748422623, + "learning_rate": 1.9846015902413053e-05, + "loss": 0.9143528342247009, + "step": 100 + }, + { + "epoch": 0.30569354222392053, + "eval_loss": 0.884428083896637, + "eval_runtime": 723.8143, + "eval_samples_per_second": 0.833, + "eval_steps_per_second": 0.833, + "step": 100 + }, + { + "epoch": 0.3087504776461597, + "grad_norm": 0.10796016454696655, + "learning_rate": 1.9836568020887963e-05, + "loss": 0.9726455211639404, + "step": 101 + }, + { + "epoch": 0.31180741306839893, + "grad_norm": 0.10056383162736893, + "learning_rate": 1.982684124592521e-05, + "loss": 0.8932135701179504, + "step": 102 + }, + { + "epoch": 0.31486434849063816, + "grad_norm": 0.10836594551801682, + "learning_rate": 1.9816835853305306e-05, + "loss": 0.919749915599823, + "step": 103 + }, + { + "epoch": 0.31792128391287733, + "grad_norm": 0.12032149732112885, + "learning_rate": 1.9806552126708322e-05, + "loss": 0.871781587600708, + "step": 104 + }, + { + "epoch": 0.32097821933511655, + "grad_norm": 0.10854160040616989, + "learning_rate": 1.9795990357705853e-05, + "loss": 0.8587784171104431, + "step": 105 + }, + { + "epoch": 0.3240351547573557, + "grad_norm": 0.10819399356842041, + "learning_rate": 1.978515084575276e-05, + "loss": 0.8524806499481201, + "step": 106 + }, + { + "epoch": 0.32709209017959495, + "grad_norm": 0.10226067155599594, + "learning_rate": 1.9774033898178668e-05, + "loss": 0.7892144918441772, + "step": 107 + }, + { + "epoch": 0.3301490256018342, + "grad_norm": 0.1071159616112709, + "learning_rate": 1.976263983017925e-05, + "loss": 0.8833234906196594, + "step": 108 + }, + { + "epoch": 0.33320596102407335, + "grad_norm": 0.11434526741504669, + "learning_rate": 1.9750968964807305e-05, + "loss": 0.861842155456543, + "step": 109 + }, + { + "epoch": 0.3362628964463126, + "grad_norm": 0.1159641221165657, + "learning_rate": 1.9739021632963584e-05, + "loss": 0.8987889289855957, + "step": 110 + }, + { + "epoch": 0.3393198318685518, + "grad_norm": 0.12371373921632767, + "learning_rate": 1.9726798173387417e-05, + "loss": 0.9710193872451782, + "step": 111 + }, + { + "epoch": 0.342376767290791, + "grad_norm": 0.11441531032323837, + "learning_rate": 1.97142989326471e-05, + "loss": 0.8199151158332825, + "step": 112 + }, + { + "epoch": 0.3454337027130302, + "grad_norm": 0.11842846125364304, + "learning_rate": 1.9701524265130088e-05, + "loss": 0.8845276236534119, + "step": 113 + }, + { + "epoch": 0.34849063813526937, + "grad_norm": 0.10813732445240021, + "learning_rate": 1.9688474533032916e-05, + "loss": 0.7964264750480652, + "step": 114 + }, + { + "epoch": 0.3515475735575086, + "grad_norm": 0.11050347238779068, + "learning_rate": 1.9675150106350957e-05, + "loss": 0.9630422592163086, + "step": 115 + }, + { + "epoch": 0.3546045089797478, + "grad_norm": 0.10537250339984894, + "learning_rate": 1.9661551362867926e-05, + "loss": 0.7706905007362366, + "step": 116 + }, + { + "epoch": 0.357661444401987, + "grad_norm": 0.11390368640422821, + "learning_rate": 1.9647678688145163e-05, + "loss": 0.8541204929351807, + "step": 117 + }, + { + "epoch": 0.3607183798242262, + "grad_norm": 0.10318922251462936, + "learning_rate": 1.963353247551069e-05, + "loss": 0.7400562763214111, + "step": 118 + }, + { + "epoch": 0.3637753152464654, + "grad_norm": 0.1347586214542389, + "learning_rate": 1.9619113126048086e-05, + "loss": 0.9232871532440186, + "step": 119 + }, + { + "epoch": 0.3668322506687046, + "grad_norm": 0.11458177119493484, + "learning_rate": 1.96044210485851e-05, + "loss": 0.833285927772522, + "step": 120 + }, + { + "epoch": 0.36988918609094384, + "grad_norm": 0.12361041456460953, + "learning_rate": 1.958945665968206e-05, + "loss": 0.7887391448020935, + "step": 121 + }, + { + "epoch": 0.372946121513183, + "grad_norm": 0.11985408514738083, + "learning_rate": 1.9574220383620054e-05, + "loss": 0.8206446170806885, + "step": 122 + }, + { + "epoch": 0.37600305693542224, + "grad_norm": 0.1355939507484436, + "learning_rate": 1.9558712652388932e-05, + "loss": 0.7648542523384094, + "step": 123 + }, + { + "epoch": 0.37905999235766147, + "grad_norm": 0.1229313388466835, + "learning_rate": 1.954293390567501e-05, + "loss": 0.8573335409164429, + "step": 124 + }, + { + "epoch": 0.38211692777990064, + "grad_norm": 0.11425124108791351, + "learning_rate": 1.9526884590848646e-05, + "loss": 0.7412531971931458, + "step": 125 + }, + { + "epoch": 0.38517386320213987, + "grad_norm": 0.12430041283369064, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.8098543882369995, + "step": 126 + }, + { + "epoch": 0.38823079862437904, + "grad_norm": 0.12492368370294571, + "learning_rate": 1.9493976084683814e-05, + "loss": 0.8814713954925537, + "step": 127 + }, + { + "epoch": 0.39128773404661826, + "grad_norm": 0.14428824186325073, + "learning_rate": 1.9477117826390934e-05, + "loss": 0.8231979608535767, + "step": 128 + }, + { + "epoch": 0.3943446694688575, + "grad_norm": 0.12010085582733154, + "learning_rate": 1.9459990866050337e-05, + "loss": 0.7015627026557922, + "step": 129 + }, + { + "epoch": 0.39740160489109666, + "grad_norm": 0.11819776892662048, + "learning_rate": 1.9442595689257898e-05, + "loss": 0.8086729645729065, + "step": 130 + }, + { + "epoch": 0.4004585403133359, + "grad_norm": 0.12211033701896667, + "learning_rate": 1.9424932789214158e-05, + "loss": 0.8234002590179443, + "step": 131 + }, + { + "epoch": 0.4035154757355751, + "grad_norm": 0.14926476776599884, + "learning_rate": 1.9407002666710334e-05, + "loss": 0.874608039855957, + "step": 132 + }, + { + "epoch": 0.4065724111578143, + "grad_norm": 0.13012923300266266, + "learning_rate": 1.9388805830114132e-05, + "loss": 0.8491607904434204, + "step": 133 + }, + { + "epoch": 0.4096293465800535, + "grad_norm": 0.12012261897325516, + "learning_rate": 1.937034279535533e-05, + "loss": 0.7269159555435181, + "step": 134 + }, + { + "epoch": 0.4126862820022927, + "grad_norm": 0.15302567183971405, + "learning_rate": 1.9351614085911134e-05, + "loss": 0.8560839891433716, + "step": 135 + }, + { + "epoch": 0.4157432174245319, + "grad_norm": 0.12234190106391907, + "learning_rate": 1.933262023279137e-05, + "loss": 0.8211904764175415, + "step": 136 + }, + { + "epoch": 0.41880015284677113, + "grad_norm": 0.14427296817302704, + "learning_rate": 1.9313361774523387e-05, + "loss": 0.8500057458877563, + "step": 137 + }, + { + "epoch": 0.4218570882690103, + "grad_norm": 0.1314094066619873, + "learning_rate": 1.929383925713682e-05, + "loss": 0.7589091658592224, + "step": 138 + }, + { + "epoch": 0.42491402369124953, + "grad_norm": 0.1576734483242035, + "learning_rate": 1.92740532341481e-05, + "loss": 0.7581073641777039, + "step": 139 + }, + { + "epoch": 0.4279709591134887, + "grad_norm": 0.15788713097572327, + "learning_rate": 1.925400426654475e-05, + "loss": 0.809050440788269, + "step": 140 + }, + { + "epoch": 0.43102789453572793, + "grad_norm": 0.13364559412002563, + "learning_rate": 1.9233692922769497e-05, + "loss": 0.7990086078643799, + "step": 141 + }, + { + "epoch": 0.43408482995796716, + "grad_norm": 0.14786465466022491, + "learning_rate": 1.921311977870413e-05, + "loss": 0.8675815463066101, + "step": 142 + }, + { + "epoch": 0.4371417653802063, + "grad_norm": 0.14621882140636444, + "learning_rate": 1.9192285417653208e-05, + "loss": 0.8713765740394592, + "step": 143 + }, + { + "epoch": 0.44019870080244555, + "grad_norm": 0.12874048948287964, + "learning_rate": 1.917119043032749e-05, + "loss": 0.7361871004104614, + "step": 144 + }, + { + "epoch": 0.4432556362246848, + "grad_norm": 0.12183775007724762, + "learning_rate": 1.9149835414827193e-05, + "loss": 0.7311941385269165, + "step": 145 + }, + { + "epoch": 0.44631257164692395, + "grad_norm": 0.1397160291671753, + "learning_rate": 1.912822097662505e-05, + "loss": 0.8189159035682678, + "step": 146 + }, + { + "epoch": 0.4493695070691632, + "grad_norm": 0.1458273082971573, + "learning_rate": 1.9106347728549134e-05, + "loss": 0.8288135528564453, + "step": 147 + }, + { + "epoch": 0.45242644249140235, + "grad_norm": 0.16898781061172485, + "learning_rate": 1.908421629076547e-05, + "loss": 0.7878037095069885, + "step": 148 + }, + { + "epoch": 0.4554833779136416, + "grad_norm": 0.1638474315404892, + "learning_rate": 1.9061827290760466e-05, + "loss": 0.8059952259063721, + "step": 149 + }, + { + "epoch": 0.4585403133358808, + "grad_norm": 0.14130882918834686, + "learning_rate": 1.9039181363323128e-05, + "loss": 0.7346830368041992, + "step": 150 + }, + { + "epoch": 0.4585403133358808, + "eval_loss": 0.7979016900062561, + "eval_runtime": 828.6295, + "eval_samples_per_second": 0.728, + "eval_steps_per_second": 0.728, + "step": 150 + }, + { + "epoch": 0.46159724875811997, + "grad_norm": 0.14427433907985687, + "learning_rate": 1.9016279150527044e-05, + "loss": 0.7583403587341309, + "step": 151 + }, + { + "epoch": 0.4646541841803592, + "grad_norm": 0.1515798568725586, + "learning_rate": 1.8993121301712194e-05, + "loss": 0.7908380031585693, + "step": 152 + }, + { + "epoch": 0.46771111960259837, + "grad_norm": 0.14444488286972046, + "learning_rate": 1.896970847346653e-05, + "loss": 0.7916130423545837, + "step": 153 + }, + { + "epoch": 0.4707680550248376, + "grad_norm": 0.1460912823677063, + "learning_rate": 1.8946041329607364e-05, + "loss": 0.7750643491744995, + "step": 154 + }, + { + "epoch": 0.4738249904470768, + "grad_norm": 0.13896244764328003, + "learning_rate": 1.892212054116255e-05, + "loss": 0.8059666156768799, + "step": 155 + }, + { + "epoch": 0.476881925869316, + "grad_norm": 0.16133630275726318, + "learning_rate": 1.889794678635145e-05, + "loss": 0.8327827453613281, + "step": 156 + }, + { + "epoch": 0.4799388612915552, + "grad_norm": 0.1474636346101761, + "learning_rate": 1.8873520750565716e-05, + "loss": 0.8498989343643188, + "step": 157 + }, + { + "epoch": 0.48299579671379445, + "grad_norm": 0.17222349345684052, + "learning_rate": 1.884884312634985e-05, + "loss": 0.7750177979469299, + "step": 158 + }, + { + "epoch": 0.4860527321360336, + "grad_norm": 0.15558090806007385, + "learning_rate": 1.8823914613381568e-05, + "loss": 0.7326169013977051, + "step": 159 + }, + { + "epoch": 0.48910966755827284, + "grad_norm": 0.13808321952819824, + "learning_rate": 1.8798735918451963e-05, + "loss": 0.8308709859848022, + "step": 160 + }, + { + "epoch": 0.492166602980512, + "grad_norm": 0.1761898398399353, + "learning_rate": 1.8773307755445468e-05, + "loss": 0.7805465459823608, + "step": 161 + }, + { + "epoch": 0.49522353840275124, + "grad_norm": 0.160477414727211, + "learning_rate": 1.874763084531961e-05, + "loss": 0.8538846969604492, + "step": 162 + }, + { + "epoch": 0.49828047382499047, + "grad_norm": 0.15238745510578156, + "learning_rate": 1.872170591608459e-05, + "loss": 0.8801217675209045, + "step": 163 + }, + { + "epoch": 0.5013374092472297, + "grad_norm": 0.1567080318927765, + "learning_rate": 1.86955337027826e-05, + "loss": 0.7205259799957275, + "step": 164 + }, + { + "epoch": 0.5043943446694689, + "grad_norm": 0.13637851178646088, + "learning_rate": 1.866911494746702e-05, + "loss": 0.7636491656303406, + "step": 165 + }, + { + "epoch": 0.507451280091708, + "grad_norm": 0.15563489496707916, + "learning_rate": 1.8642450399181373e-05, + "loss": 0.7982497811317444, + "step": 166 + }, + { + "epoch": 0.5105082155139473, + "grad_norm": 0.15503396093845367, + "learning_rate": 1.8615540813938063e-05, + "loss": 0.8737778067588806, + "step": 167 + }, + { + "epoch": 0.5135651509361865, + "grad_norm": 0.16095557808876038, + "learning_rate": 1.8588386954696972e-05, + "loss": 0.796604335308075, + "step": 168 + }, + { + "epoch": 0.5166220863584257, + "grad_norm": 0.1713593453168869, + "learning_rate": 1.856098959134381e-05, + "loss": 0.8247392177581787, + "step": 169 + }, + { + "epoch": 0.5196790217806648, + "grad_norm": 0.18239113688468933, + "learning_rate": 1.8533349500668295e-05, + "loss": 0.7838484644889832, + "step": 170 + }, + { + "epoch": 0.5227359572029041, + "grad_norm": 0.15745767951011658, + "learning_rate": 1.850546746634211e-05, + "loss": 0.7856907248497009, + "step": 171 + }, + { + "epoch": 0.5257928926251433, + "grad_norm": 0.16820666193962097, + "learning_rate": 1.8477344278896708e-05, + "loss": 0.7829679846763611, + "step": 172 + }, + { + "epoch": 0.5288498280473825, + "grad_norm": 0.16975544393062592, + "learning_rate": 1.84489807357009e-05, + "loss": 0.7374375462532043, + "step": 173 + }, + { + "epoch": 0.5319067634696217, + "grad_norm": 0.167228102684021, + "learning_rate": 1.8420377640938204e-05, + "loss": 0.712837815284729, + "step": 174 + }, + { + "epoch": 0.5349636988918609, + "grad_norm": 0.15955154597759247, + "learning_rate": 1.839153580558411e-05, + "loss": 0.7645693421363831, + "step": 175 + }, + { + "epoch": 0.5380206343141001, + "grad_norm": 0.18378689885139465, + "learning_rate": 1.8362456047383032e-05, + "loss": 0.7974956631660461, + "step": 176 + }, + { + "epoch": 0.5410775697363394, + "grad_norm": 0.15777672827243805, + "learning_rate": 1.833313919082515e-05, + "loss": 0.8957571983337402, + "step": 177 + }, + { + "epoch": 0.5441345051585785, + "grad_norm": 0.15292386710643768, + "learning_rate": 1.8303586067123028e-05, + "loss": 0.7635619044303894, + "step": 178 + }, + { + "epoch": 0.5471914405808177, + "grad_norm": 0.178152397274971, + "learning_rate": 1.8273797514188043e-05, + "loss": 0.7849246263504028, + "step": 179 + }, + { + "epoch": 0.550248376003057, + "grad_norm": 0.15916013717651367, + "learning_rate": 1.824377437660663e-05, + "loss": 0.6975343227386475, + "step": 180 + }, + { + "epoch": 0.5533053114252962, + "grad_norm": 0.18172231316566467, + "learning_rate": 1.821351750561634e-05, + "loss": 0.7675164341926575, + "step": 181 + }, + { + "epoch": 0.5563622468475353, + "grad_norm": 0.16241903603076935, + "learning_rate": 1.818302775908169e-05, + "loss": 0.7950343489646912, + "step": 182 + }, + { + "epoch": 0.5594191822697746, + "grad_norm": 0.18727579712867737, + "learning_rate": 1.8152306001469875e-05, + "loss": 0.787315309047699, + "step": 183 + }, + { + "epoch": 0.5624761176920138, + "grad_norm": 0.1627933531999588, + "learning_rate": 1.8121353103826213e-05, + "loss": 0.7141211628913879, + "step": 184 + }, + { + "epoch": 0.565533053114253, + "grad_norm": 0.4369247555732727, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.8476608395576477, + "step": 185 + }, + { + "epoch": 0.5685899885364921, + "grad_norm": 0.16494786739349365, + "learning_rate": 1.8058757405367003e-05, + "loss": 0.720562756061554, + "step": 186 + }, + { + "epoch": 0.5716469239587314, + "grad_norm": 0.175015389919281, + "learning_rate": 1.8027116379309637e-05, + "loss": 0.7589252591133118, + "step": 187 + }, + { + "epoch": 0.5747038593809706, + "grad_norm": 0.1769978553056717, + "learning_rate": 1.799524776268646e-05, + "loss": 0.7644155621528625, + "step": 188 + }, + { + "epoch": 0.5777607948032097, + "grad_norm": 0.18481792509555817, + "learning_rate": 1.796315245905936e-05, + "loss": 0.7885835766792297, + "step": 189 + }, + { + "epoch": 0.580817730225449, + "grad_norm": 0.1668689250946045, + "learning_rate": 1.7930831378417437e-05, + "loss": 0.7377231121063232, + "step": 190 + }, + { + "epoch": 0.5838746656476882, + "grad_norm": 0.178734689950943, + "learning_rate": 1.7898285437151163e-05, + "loss": 0.7388894557952881, + "step": 191 + }, + { + "epoch": 0.5869316010699274, + "grad_norm": 0.1740068644285202, + "learning_rate": 1.786551555802643e-05, + "loss": 0.8209859728813171, + "step": 192 + }, + { + "epoch": 0.5899885364921666, + "grad_norm": 0.19211041927337646, + "learning_rate": 1.783252267015837e-05, + "loss": 0.7305737733840942, + "step": 193 + }, + { + "epoch": 0.5930454719144058, + "grad_norm": 0.16644936800003052, + "learning_rate": 1.779930770898503e-05, + "loss": 0.7760804891586304, + "step": 194 + }, + { + "epoch": 0.596102407336645, + "grad_norm": 0.1773686707019806, + "learning_rate": 1.776587161624083e-05, + "loss": 0.7879236936569214, + "step": 195 + }, + { + "epoch": 0.5991593427588843, + "grad_norm": 0.17508819699287415, + "learning_rate": 1.7732215339929874e-05, + "loss": 0.7307407259941101, + "step": 196 + }, + { + "epoch": 0.6022162781811234, + "grad_norm": 0.17211101949214935, + "learning_rate": 1.7698339834299064e-05, + "loss": 0.7293214797973633, + "step": 197 + }, + { + "epoch": 0.6052732136033626, + "grad_norm": 0.18085215985774994, + "learning_rate": 1.7664246059811058e-05, + "loss": 0.763083279132843, + "step": 198 + }, + { + "epoch": 0.6083301490256018, + "grad_norm": 0.20243075489997864, + "learning_rate": 1.7629934983117025e-05, + "loss": 0.7372676134109497, + "step": 199 + }, + { + "epoch": 0.6113870844478411, + "grad_norm": 0.18152795732021332, + "learning_rate": 1.759540757702924e-05, + "loss": 0.7121898531913757, + "step": 200 + }, + { + "epoch": 0.6113870844478411, + "eval_loss": 0.7551760673522949, + "eval_runtime": 900.209, + "eval_samples_per_second": 0.67, + "eval_steps_per_second": 0.67, + "step": 200 + }, + { + "epoch": 0.6144440198700802, + "grad_norm": 0.18808062374591827, + "learning_rate": 1.7560664820493502e-05, + "loss": 0.734307050704956, + "step": 201 + }, + { + "epoch": 0.6175009552923194, + "grad_norm": 0.18151243031024933, + "learning_rate": 1.7525707698561383e-05, + "loss": 0.7998429536819458, + "step": 202 + }, + { + "epoch": 0.6205578907145587, + "grad_norm": 0.19583043456077576, + "learning_rate": 1.7490537202362313e-05, + "loss": 0.7546265721321106, + "step": 203 + }, + { + "epoch": 0.6236148261367979, + "grad_norm": 0.2508557140827179, + "learning_rate": 1.7455154329075427e-05, + "loss": 0.7810050249099731, + "step": 204 + }, + { + "epoch": 0.626671761559037, + "grad_norm": 0.1685105562210083, + "learning_rate": 1.741956008190136e-05, + "loss": 0.7558917999267578, + "step": 205 + }, + { + "epoch": 0.6297286969812763, + "grad_norm": 0.18195222318172455, + "learning_rate": 1.7383755470033756e-05, + "loss": 0.7216942310333252, + "step": 206 + }, + { + "epoch": 0.6327856324035155, + "grad_norm": 0.1878063678741455, + "learning_rate": 1.7347741508630673e-05, + "loss": 0.7417092323303223, + "step": 207 + }, + { + "epoch": 0.6358425678257547, + "grad_norm": 0.25273698568344116, + "learning_rate": 1.73115192187858e-05, + "loss": 0.807498037815094, + "step": 208 + }, + { + "epoch": 0.6388995032479939, + "grad_norm": 0.2451465129852295, + "learning_rate": 1.7275089627499493e-05, + "loss": 0.7557163238525391, + "step": 209 + }, + { + "epoch": 0.6419564386702331, + "grad_norm": 0.19272617995738983, + "learning_rate": 1.7238453767649683e-05, + "loss": 0.8285109996795654, + "step": 210 + }, + { + "epoch": 0.6450133740924723, + "grad_norm": 0.1869518756866455, + "learning_rate": 1.720161267796256e-05, + "loss": 0.7824444770812988, + "step": 211 + }, + { + "epoch": 0.6480703095147115, + "grad_norm": 0.2029627561569214, + "learning_rate": 1.7164567402983153e-05, + "loss": 0.7018642425537109, + "step": 212 + }, + { + "epoch": 0.6511272449369507, + "grad_norm": 0.23215501010417938, + "learning_rate": 1.7127318993045686e-05, + "loss": 0.7263948917388916, + "step": 213 + }, + { + "epoch": 0.6541841803591899, + "grad_norm": 0.19869184494018555, + "learning_rate": 1.7089868504243816e-05, + "loss": 0.8285576105117798, + "step": 214 + }, + { + "epoch": 0.6572411157814291, + "grad_norm": 0.22871531546115875, + "learning_rate": 1.705221699840069e-05, + "loss": 0.7871490716934204, + "step": 215 + }, + { + "epoch": 0.6602980512036684, + "grad_norm": 0.17945580184459686, + "learning_rate": 1.701436554303882e-05, + "loss": 0.740180492401123, + "step": 216 + }, + { + "epoch": 0.6633549866259075, + "grad_norm": 0.20516762137413025, + "learning_rate": 1.6976315211349848e-05, + "loss": 0.7542892098426819, + "step": 217 + }, + { + "epoch": 0.6664119220481467, + "grad_norm": 0.22108283638954163, + "learning_rate": 1.6938067082164093e-05, + "loss": 0.8117404580116272, + "step": 218 + }, + { + "epoch": 0.669468857470386, + "grad_norm": 0.22329698503017426, + "learning_rate": 1.6899622239919965e-05, + "loss": 0.8002716898918152, + "step": 219 + }, + { + "epoch": 0.6725257928926252, + "grad_norm": 0.23545362055301666, + "learning_rate": 1.6860981774633228e-05, + "loss": 0.7750573754310608, + "step": 220 + }, + { + "epoch": 0.6755827283148643, + "grad_norm": 0.21816480159759521, + "learning_rate": 1.6822146781866097e-05, + "loss": 0.8051223754882812, + "step": 221 + }, + { + "epoch": 0.6786396637371036, + "grad_norm": 0.18638508021831512, + "learning_rate": 1.6783118362696162e-05, + "loss": 0.7286484241485596, + "step": 222 + }, + { + "epoch": 0.6816965991593428, + "grad_norm": 0.16794732213020325, + "learning_rate": 1.6743897623685178e-05, + "loss": 0.7001460194587708, + "step": 223 + }, + { + "epoch": 0.684753534581582, + "grad_norm": 0.21157318353652954, + "learning_rate": 1.6704485676847695e-05, + "loss": 0.7479901313781738, + "step": 224 + }, + { + "epoch": 0.6878104700038211, + "grad_norm": 0.35601308941841125, + "learning_rate": 1.666488363961952e-05, + "loss": 0.7660019397735596, + "step": 225 + }, + { + "epoch": 0.6908674054260604, + "grad_norm": 0.17416611313819885, + "learning_rate": 1.662509263482604e-05, + "loss": 0.7157142162322998, + "step": 226 + }, + { + "epoch": 0.6939243408482996, + "grad_norm": 0.19655123353004456, + "learning_rate": 1.658511379065039e-05, + "loss": 0.7894638776779175, + "step": 227 + }, + { + "epoch": 0.6969812762705387, + "grad_norm": 0.2034345269203186, + "learning_rate": 1.6544948240601453e-05, + "loss": 0.6853711009025574, + "step": 228 + }, + { + "epoch": 0.700038211692778, + "grad_norm": 0.199235200881958, + "learning_rate": 1.6504597123481737e-05, + "loss": 0.7487372756004333, + "step": 229 + }, + { + "epoch": 0.7030951471150172, + "grad_norm": 0.20407404005527496, + "learning_rate": 1.6464061583355088e-05, + "loss": 0.7335573434829712, + "step": 230 + }, + { + "epoch": 0.7061520825372564, + "grad_norm": 0.22096174955368042, + "learning_rate": 1.6423342769514227e-05, + "loss": 0.7659798264503479, + "step": 231 + }, + { + "epoch": 0.7092090179594956, + "grad_norm": 0.1916825920343399, + "learning_rate": 1.6382441836448203e-05, + "loss": 0.7162011861801147, + "step": 232 + }, + { + "epoch": 0.7122659533817348, + "grad_norm": 0.20505093038082123, + "learning_rate": 1.6341359943809626e-05, + "loss": 0.6957600116729736, + "step": 233 + }, + { + "epoch": 0.715322888803974, + "grad_norm": 0.19968082010746002, + "learning_rate": 1.6300098256381807e-05, + "loss": 0.6724053025245667, + "step": 234 + }, + { + "epoch": 0.7183798242262133, + "grad_norm": 0.19768832623958588, + "learning_rate": 1.625865794404573e-05, + "loss": 0.774741530418396, + "step": 235 + }, + { + "epoch": 0.7214367596484524, + "grad_norm": 0.19257694482803345, + "learning_rate": 1.621704018174688e-05, + "loss": 0.6658651828765869, + "step": 236 + }, + { + "epoch": 0.7244936950706916, + "grad_norm": 0.21594858169555664, + "learning_rate": 1.617524614946192e-05, + "loss": 0.810744047164917, + "step": 237 + }, + { + "epoch": 0.7275506304929308, + "grad_norm": 0.2107633650302887, + "learning_rate": 1.6133277032165264e-05, + "loss": 0.7623897194862366, + "step": 238 + }, + { + "epoch": 0.7306075659151701, + "grad_norm": 0.20114055275917053, + "learning_rate": 1.6091134019795447e-05, + "loss": 0.7082816362380981, + "step": 239 + }, + { + "epoch": 0.7336645013374092, + "grad_norm": 0.2542732059955597, + "learning_rate": 1.604881830722141e-05, + "loss": 0.7051193714141846, + "step": 240 + }, + { + "epoch": 0.7367214367596484, + "grad_norm": 0.19180485606193542, + "learning_rate": 1.600633109420861e-05, + "loss": 0.7895385026931763, + "step": 241 + }, + { + "epoch": 0.7397783721818877, + "grad_norm": 0.368756502866745, + "learning_rate": 1.5963673585385016e-05, + "loss": 0.7146293520927429, + "step": 242 + }, + { + "epoch": 0.7428353076041269, + "grad_norm": 0.18490125238895416, + "learning_rate": 1.5920846990206934e-05, + "loss": 0.650428056716919, + "step": 243 + }, + { + "epoch": 0.745892243026366, + "grad_norm": 0.23592503368854523, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.6367110013961792, + "step": 244 + }, + { + "epoch": 0.7489491784486053, + "grad_norm": 0.20223264396190643, + "learning_rate": 1.5834691402548415e-05, + "loss": 0.6563615798950195, + "step": 245 + }, + { + "epoch": 0.7520061138708445, + "grad_norm": 0.27459946274757385, + "learning_rate": 1.5791364852813047e-05, + "loss": 0.7361881136894226, + "step": 246 + }, + { + "epoch": 0.7550630492930837, + "grad_norm": 0.21085411310195923, + "learning_rate": 1.5747874102144073e-05, + "loss": 0.7373813390731812, + "step": 247 + }, + { + "epoch": 0.7581199847153229, + "grad_norm": 0.23332320153713226, + "learning_rate": 1.5704220383622464e-05, + "loss": 0.6971457004547119, + "step": 248 + }, + { + "epoch": 0.7611769201375621, + "grad_norm": 0.23525936901569366, + "learning_rate": 1.5660404934949798e-05, + "loss": 0.6756627559661865, + "step": 249 + }, + { + "epoch": 0.7642338555598013, + "grad_norm": 0.2150791585445404, + "learning_rate": 1.5616428998413122e-05, + "loss": 0.7029792666435242, + "step": 250 + }, + { + "epoch": 0.7642338555598013, + "eval_loss": 0.7269901633262634, + "eval_runtime": 877.665, + "eval_samples_per_second": 0.687, + "eval_steps_per_second": 0.687, + "step": 250 + }, + { + "epoch": 0.7672907909820404, + "grad_norm": 0.19510552287101746, + "learning_rate": 1.5572293820849754e-05, + "loss": 0.715162992477417, + "step": 251 + }, + { + "epoch": 0.7703477264042797, + "grad_norm": 0.25246763229370117, + "learning_rate": 1.5528000653611935e-05, + "loss": 0.634660542011261, + "step": 252 + }, + { + "epoch": 0.7734046618265189, + "grad_norm": 0.2980027496814728, + "learning_rate": 1.5483550752531337e-05, + "loss": 0.7154463529586792, + "step": 253 + }, + { + "epoch": 0.7764615972487581, + "grad_norm": 0.2730556130409241, + "learning_rate": 1.5438945377883463e-05, + "loss": 0.8110946416854858, + "step": 254 + }, + { + "epoch": 0.7795185326709974, + "grad_norm": 0.17258886992931366, + "learning_rate": 1.5394185794351914e-05, + "loss": 0.72202467918396, + "step": 255 + }, + { + "epoch": 0.7825754680932365, + "grad_norm": 0.19966280460357666, + "learning_rate": 1.5349273270992537e-05, + "loss": 0.7368704080581665, + "step": 256 + }, + { + "epoch": 0.7856324035154757, + "grad_norm": 0.23305682837963104, + "learning_rate": 1.5304209081197425e-05, + "loss": 0.7429723143577576, + "step": 257 + }, + { + "epoch": 0.788689338937715, + "grad_norm": 0.21786810457706451, + "learning_rate": 1.5258994502658846e-05, + "loss": 0.6498424410820007, + "step": 258 + }, + { + "epoch": 0.7917462743599541, + "grad_norm": 0.2370925396680832, + "learning_rate": 1.5213630817332985e-05, + "loss": 0.7379459142684937, + "step": 259 + }, + { + "epoch": 0.7948032097821933, + "grad_norm": 0.25566384196281433, + "learning_rate": 1.5168119311403611e-05, + "loss": 0.6742876172065735, + "step": 260 + }, + { + "epoch": 0.7978601452044326, + "grad_norm": 0.2171633243560791, + "learning_rate": 1.512246127524561e-05, + "loss": 0.72329181432724, + "step": 261 + }, + { + "epoch": 0.8009170806266718, + "grad_norm": 0.23292019963264465, + "learning_rate": 1.50766580033884e-05, + "loss": 0.765812873840332, + "step": 262 + }, + { + "epoch": 0.8039740160489109, + "grad_norm": 0.19427980482578278, + "learning_rate": 1.5030710794479226e-05, + "loss": 0.7872639298439026, + "step": 263 + }, + { + "epoch": 0.8070309514711502, + "grad_norm": 0.2460346817970276, + "learning_rate": 1.4984620951246333e-05, + "loss": 0.6940722465515137, + "step": 264 + }, + { + "epoch": 0.8100878868933894, + "grad_norm": 0.2493411898612976, + "learning_rate": 1.4938389780462044e-05, + "loss": 0.7680137157440186, + "step": 265 + }, + { + "epoch": 0.8131448223156286, + "grad_norm": 0.23873573541641235, + "learning_rate": 1.4892018592905702e-05, + "loss": 0.6780916452407837, + "step": 266 + }, + { + "epoch": 0.8162017577378677, + "grad_norm": 0.2580571174621582, + "learning_rate": 1.4845508703326504e-05, + "loss": 0.7183764576911926, + "step": 267 + }, + { + "epoch": 0.819258693160107, + "grad_norm": 0.2125079482793808, + "learning_rate": 1.4798861430406221e-05, + "loss": 0.8207096457481384, + "step": 268 + }, + { + "epoch": 0.8223156285823462, + "grad_norm": 0.21065691113471985, + "learning_rate": 1.4752078096721827e-05, + "loss": 0.7414214611053467, + "step": 269 + }, + { + "epoch": 0.8253725640045854, + "grad_norm": 0.25807511806488037, + "learning_rate": 1.4705160028707976e-05, + "loss": 0.7086384296417236, + "step": 270 + }, + { + "epoch": 0.8284294994268246, + "grad_norm": 0.2444671094417572, + "learning_rate": 1.4658108556619417e-05, + "loss": 0.7065964937210083, + "step": 271 + }, + { + "epoch": 0.8314864348490638, + "grad_norm": 0.200303316116333, + "learning_rate": 1.461092501449326e-05, + "loss": 0.7533905506134033, + "step": 272 + }, + { + "epoch": 0.834543370271303, + "grad_norm": 0.2807226777076721, + "learning_rate": 1.4563610740111163e-05, + "loss": 0.756553053855896, + "step": 273 + }, + { + "epoch": 0.8376003056935423, + "grad_norm": 0.2516884207725525, + "learning_rate": 1.4516167074961394e-05, + "loss": 0.8125098347663879, + "step": 274 + }, + { + "epoch": 0.8406572411157814, + "grad_norm": 0.22799813747406006, + "learning_rate": 1.4468595364200808e-05, + "loss": 0.7360811829566956, + "step": 275 + }, + { + "epoch": 0.8437141765380206, + "grad_norm": 0.27390384674072266, + "learning_rate": 1.4420896956616698e-05, + "loss": 0.7135312557220459, + "step": 276 + }, + { + "epoch": 0.8467711119602599, + "grad_norm": 0.2811775505542755, + "learning_rate": 1.4373073204588556e-05, + "loss": 0.7489083409309387, + "step": 277 + }, + { + "epoch": 0.8498280473824991, + "grad_norm": 0.2652314603328705, + "learning_rate": 1.4325125464049725e-05, + "loss": 0.752477765083313, + "step": 278 + }, + { + "epoch": 0.8528849828047382, + "grad_norm": 0.2218960076570511, + "learning_rate": 1.427705509444897e-05, + "loss": 0.6534979939460754, + "step": 279 + }, + { + "epoch": 0.8559419182269774, + "grad_norm": 0.23746474087238312, + "learning_rate": 1.4228863458711915e-05, + "loss": 0.7061883211135864, + "step": 280 + }, + { + "epoch": 0.8589988536492167, + "grad_norm": 0.21507228910923004, + "learning_rate": 1.4180551923202406e-05, + "loss": 0.7044329643249512, + "step": 281 + }, + { + "epoch": 0.8620557890714559, + "grad_norm": 0.2412186861038208, + "learning_rate": 1.4132121857683782e-05, + "loss": 0.706013023853302, + "step": 282 + }, + { + "epoch": 0.865112724493695, + "grad_norm": 0.2832106947898865, + "learning_rate": 1.4083574635280029e-05, + "loss": 0.6572445631027222, + "step": 283 + }, + { + "epoch": 0.8681696599159343, + "grad_norm": 0.21925900876522064, + "learning_rate": 1.403491163243684e-05, + "loss": 0.675041139125824, + "step": 284 + }, + { + "epoch": 0.8712265953381735, + "grad_norm": 0.22488665580749512, + "learning_rate": 1.3986134228882607e-05, + "loss": 0.7474229335784912, + "step": 285 + }, + { + "epoch": 0.8742835307604127, + "grad_norm": 0.2221737653017044, + "learning_rate": 1.3937243807589291e-05, + "loss": 0.7394901514053345, + "step": 286 + }, + { + "epoch": 0.8773404661826519, + "grad_norm": 0.29034581780433655, + "learning_rate": 1.388824175473321e-05, + "loss": 0.7346636056900024, + "step": 287 + }, + { + "epoch": 0.8803974016048911, + "grad_norm": 0.2580259144306183, + "learning_rate": 1.383912945965574e-05, + "loss": 0.8125481009483337, + "step": 288 + }, + { + "epoch": 0.8834543370271303, + "grad_norm": 0.2533118724822998, + "learning_rate": 1.3789908314823932e-05, + "loss": 0.6768131256103516, + "step": 289 + }, + { + "epoch": 0.8865112724493696, + "grad_norm": 0.2074616551399231, + "learning_rate": 1.3740579715791017e-05, + "loss": 0.7096269726753235, + "step": 290 + }, + { + "epoch": 0.8895682078716087, + "grad_norm": 0.29789987206459045, + "learning_rate": 1.3691145061156843e-05, + "loss": 0.6973364353179932, + "step": 291 + }, + { + "epoch": 0.8926251432938479, + "grad_norm": 0.2937224805355072, + "learning_rate": 1.3641605752528225e-05, + "loss": 0.7693608999252319, + "step": 292 + }, + { + "epoch": 0.8956820787160871, + "grad_norm": 0.27355870604515076, + "learning_rate": 1.3591963194479198e-05, + "loss": 0.6870795488357544, + "step": 293 + }, + { + "epoch": 0.8987390141383264, + "grad_norm": 0.22792251408100128, + "learning_rate": 1.3542218794511212e-05, + "loss": 0.7095532417297363, + "step": 294 + }, + { + "epoch": 0.9017959495605655, + "grad_norm": 0.2855125665664673, + "learning_rate": 1.3492373963013199e-05, + "loss": 0.7536489963531494, + "step": 295 + }, + { + "epoch": 0.9048528849828047, + "grad_norm": 0.24969056248664856, + "learning_rate": 1.3442430113221602e-05, + "loss": 0.7433043718338013, + "step": 296 + }, + { + "epoch": 0.907909820405044, + "grad_norm": 0.24534980952739716, + "learning_rate": 1.3392388661180303e-05, + "loss": 0.7204138040542603, + "step": 297 + }, + { + "epoch": 0.9109667558272831, + "grad_norm": 0.2540739178657532, + "learning_rate": 1.3342251025700474e-05, + "loss": 0.7114053964614868, + "step": 298 + }, + { + "epoch": 0.9140236912495223, + "grad_norm": 0.2494630217552185, + "learning_rate": 1.3292018628320346e-05, + "loss": 0.7337151169776917, + "step": 299 + }, + { + "epoch": 0.9170806266717616, + "grad_norm": 0.3079741597175598, + "learning_rate": 1.3241692893264909e-05, + "loss": 0.7486672401428223, + "step": 300 + }, + { + "epoch": 0.9170806266717616, + "eval_loss": 0.7063615918159485, + "eval_runtime": 882.246, + "eval_samples_per_second": 0.683, + "eval_steps_per_second": 0.683, + "step": 300 + }, + { + "epoch": 0.9201375620940008, + "grad_norm": 0.23425859212875366, + "learning_rate": 1.3191275247405525e-05, + "loss": 0.7614796161651611, + "step": 301 + }, + { + "epoch": 0.9231944975162399, + "grad_norm": 0.22468142211437225, + "learning_rate": 1.314076712021949e-05, + "loss": 0.7109901309013367, + "step": 302 + }, + { + "epoch": 0.9262514329384792, + "grad_norm": 0.4165630042552948, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.6816924810409546, + "step": 303 + }, + { + "epoch": 0.9293083683607184, + "grad_norm": 0.2934052646160126, + "learning_rate": 1.3039485152562951e-05, + "loss": 0.7403143644332886, + "step": 304 + }, + { + "epoch": 0.9323653037829576, + "grad_norm": 0.24021990597248077, + "learning_rate": 1.2988714183711504e-05, + "loss": 0.7116130590438843, + "step": 305 + }, + { + "epoch": 0.9354222392051967, + "grad_norm": 0.25670015811920166, + "learning_rate": 1.2937858476690089e-05, + "loss": 0.745186984539032, + "step": 306 + }, + { + "epoch": 0.938479174627436, + "grad_norm": 0.3273049592971802, + "learning_rate": 1.2886919473396212e-05, + "loss": 0.811728298664093, + "step": 307 + }, + { + "epoch": 0.9415361100496752, + "grad_norm": 0.295612633228302, + "learning_rate": 1.2835898618089064e-05, + "loss": 0.6898178458213806, + "step": 308 + }, + { + "epoch": 0.9445930454719144, + "grad_norm": 0.22936004400253296, + "learning_rate": 1.2784797357348562e-05, + "loss": 0.7637606263160706, + "step": 309 + }, + { + "epoch": 0.9476499808941536, + "grad_norm": 0.2491123378276825, + "learning_rate": 1.2733617140034329e-05, + "loss": 0.6364520788192749, + "step": 310 + }, + { + "epoch": 0.9507069163163928, + "grad_norm": 0.29433801770210266, + "learning_rate": 1.268235941724463e-05, + "loss": 0.7065365314483643, + "step": 311 + }, + { + "epoch": 0.953763851738632, + "grad_norm": 0.25174376368522644, + "learning_rate": 1.2631025642275212e-05, + "loss": 0.73712158203125, + "step": 312 + }, + { + "epoch": 0.9568207871608713, + "grad_norm": 0.3259194493293762, + "learning_rate": 1.257961727057812e-05, + "loss": 0.6926214694976807, + "step": 313 + }, + { + "epoch": 0.9598777225831104, + "grad_norm": 0.31702667474746704, + "learning_rate": 1.2528135759720403e-05, + "loss": 0.7626583576202393, + "step": 314 + }, + { + "epoch": 0.9629346580053496, + "grad_norm": 0.24691395461559296, + "learning_rate": 1.2476582569342819e-05, + "loss": 0.7628929018974304, + "step": 315 + }, + { + "epoch": 0.9659915934275889, + "grad_norm": 0.2896668314933777, + "learning_rate": 1.2424959161118425e-05, + "loss": 0.7070521116256714, + "step": 316 + }, + { + "epoch": 0.9690485288498281, + "grad_norm": 0.2587420642375946, + "learning_rate": 1.2373266998711152e-05, + "loss": 0.7804452180862427, + "step": 317 + }, + { + "epoch": 0.9721054642720672, + "grad_norm": 0.28757819533348083, + "learning_rate": 1.232150754773429e-05, + "loss": 0.7271901369094849, + "step": 318 + }, + { + "epoch": 0.9751623996943064, + "grad_norm": 0.2600923478603363, + "learning_rate": 1.2269682275708951e-05, + "loss": 0.6629395484924316, + "step": 319 + }, + { + "epoch": 0.9782193351165457, + "grad_norm": 0.3455665111541748, + "learning_rate": 1.2217792652022452e-05, + "loss": 0.7750409841537476, + "step": 320 + }, + { + "epoch": 0.9812762705387849, + "grad_norm": 0.27122899889945984, + "learning_rate": 1.2165840147886656e-05, + "loss": 0.6742854118347168, + "step": 321 + }, + { + "epoch": 0.984333205961024, + "grad_norm": 0.2357456535100937, + "learning_rate": 1.2113826236296245e-05, + "loss": 0.7265107035636902, + "step": 322 + }, + { + "epoch": 0.9873901413832633, + "grad_norm": 0.21315616369247437, + "learning_rate": 1.2061752391986982e-05, + "loss": 0.7203768491744995, + "step": 323 + }, + { + "epoch": 0.9904470768055025, + "grad_norm": 0.24696163833141327, + "learning_rate": 1.2009620091393885e-05, + "loss": 0.8011739253997803, + "step": 324 + }, + { + "epoch": 0.9935040122277417, + "grad_norm": 0.246279776096344, + "learning_rate": 1.1957430812609361e-05, + "loss": 0.7316861152648926, + "step": 325 + }, + { + "epoch": 0.9965609476499809, + "grad_norm": 0.26160112023353577, + "learning_rate": 1.1905186035341304e-05, + "loss": 0.6602386236190796, + "step": 326 + }, + { + "epoch": 0.9996178830722201, + "grad_norm": 0.27144137024879456, + "learning_rate": 1.1852887240871145e-05, + "loss": 0.7162635326385498, + "step": 327 + }, + { + "epoch": 1.0, + "grad_norm": 0.6650471091270447, + "learning_rate": 1.1800535912011846e-05, + "loss": 0.6108165383338928, + "step": 328 + }, + { + "epoch": 1.0030569354222392, + "grad_norm": 0.25604233145713806, + "learning_rate": 1.1748133533065864e-05, + "loss": 0.6724814176559448, + "step": 329 + }, + { + "epoch": 1.0061138708444783, + "grad_norm": 0.30289238691329956, + "learning_rate": 1.1695681589783065e-05, + "loss": 0.7010799050331116, + "step": 330 + }, + { + "epoch": 1.0091708062667175, + "grad_norm": 0.28697144985198975, + "learning_rate": 1.1643181569318596e-05, + "loss": 0.7199532985687256, + "step": 331 + }, + { + "epoch": 1.012227741688957, + "grad_norm": 0.26302677392959595, + "learning_rate": 1.1590634960190722e-05, + "loss": 0.6887974143028259, + "step": 332 + }, + { + "epoch": 1.015284677111196, + "grad_norm": 0.2987605631351471, + "learning_rate": 1.1538043252238629e-05, + "loss": 0.7237250208854675, + "step": 333 + }, + { + "epoch": 1.0183416125334352, + "grad_norm": 0.25947025418281555, + "learning_rate": 1.1485407936580169e-05, + "loss": 0.7092999815940857, + "step": 334 + }, + { + "epoch": 1.0213985479556744, + "grad_norm": 0.3119892477989197, + "learning_rate": 1.1432730505569597e-05, + "loss": 0.6797397136688232, + "step": 335 + }, + { + "epoch": 1.0244554833779136, + "grad_norm": 0.2772631347179413, + "learning_rate": 1.1380012452755259e-05, + "loss": 0.7330094575881958, + "step": 336 + }, + { + "epoch": 1.0275124188001528, + "grad_norm": 0.34601089358329773, + "learning_rate": 1.1327255272837221e-05, + "loss": 0.711042582988739, + "step": 337 + }, + { + "epoch": 1.0305693542223922, + "grad_norm": 0.30404818058013916, + "learning_rate": 1.1274460461624925e-05, + "loss": 0.6593371033668518, + "step": 338 + }, + { + "epoch": 1.0336262896446313, + "grad_norm": 0.249643474817276, + "learning_rate": 1.1221629515994754e-05, + "loss": 0.7230923175811768, + "step": 339 + }, + { + "epoch": 1.0366832250668705, + "grad_norm": 0.2772657871246338, + "learning_rate": 1.1168763933847608e-05, + "loss": 0.6847513914108276, + "step": 340 + }, + { + "epoch": 1.0397401604891097, + "grad_norm": 0.3479171395301819, + "learning_rate": 1.1115865214066414e-05, + "loss": 0.673307478427887, + "step": 341 + }, + { + "epoch": 1.0427970959113488, + "grad_norm": 0.3393602669239044, + "learning_rate": 1.1062934856473655e-05, + "loss": 0.7529383897781372, + "step": 342 + }, + { + "epoch": 1.045854031333588, + "grad_norm": 0.22780737280845642, + "learning_rate": 1.1009974361788822e-05, + "loss": 0.6309706568717957, + "step": 343 + }, + { + "epoch": 1.0489109667558272, + "grad_norm": 0.2966362237930298, + "learning_rate": 1.095698523158588e-05, + "loss": 0.6944005489349365, + "step": 344 + }, + { + "epoch": 1.0519679021780666, + "grad_norm": 0.27519309520721436, + "learning_rate": 1.0903968968250682e-05, + "loss": 0.6714650392532349, + "step": 345 + }, + { + "epoch": 1.0550248376003057, + "grad_norm": 0.36684176325798035, + "learning_rate": 1.085092707493839e-05, + "loss": 0.6740344762802124, + "step": 346 + }, + { + "epoch": 1.058081773022545, + "grad_norm": 0.35729631781578064, + "learning_rate": 1.0797861055530832e-05, + "loss": 0.6590248942375183, + "step": 347 + }, + { + "epoch": 1.061138708444784, + "grad_norm": 0.33536043763160706, + "learning_rate": 1.0744772414593889e-05, + "loss": 0.7020372748374939, + "step": 348 + }, + { + "epoch": 1.0641956438670233, + "grad_norm": 0.3144095838069916, + "learning_rate": 1.0691662657334815e-05, + "loss": 0.7195531725883484, + "step": 349 + }, + { + "epoch": 1.0672525792892624, + "grad_norm": 0.37244805693626404, + "learning_rate": 1.0638533289559574e-05, + "loss": 0.6678342819213867, + "step": 350 + }, + { + "epoch": 1.0672525792892624, + "eval_loss": 0.6917262673377991, + "eval_runtime": 874.9693, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 350 + }, + { + "epoch": 1.0703095147115018, + "grad_norm": 0.45918041467666626, + "learning_rate": 1.0585385817630137e-05, + "loss": 0.6641817092895508, + "step": 351 + }, + { + "epoch": 1.073366450133741, + "grad_norm": 0.4126392900943756, + "learning_rate": 1.0532221748421786e-05, + "loss": 0.6774541139602661, + "step": 352 + }, + { + "epoch": 1.0764233855559802, + "grad_norm": 0.5425148606300354, + "learning_rate": 1.047904258928037e-05, + "loss": 0.7386555075645447, + "step": 353 + }, + { + "epoch": 1.0794803209782193, + "grad_norm": 0.40561115741729736, + "learning_rate": 1.0425849847979586e-05, + "loss": 0.7061327695846558, + "step": 354 + }, + { + "epoch": 1.0825372564004585, + "grad_norm": 0.489343523979187, + "learning_rate": 1.0372645032678215e-05, + "loss": 0.7486766576766968, + "step": 355 + }, + { + "epoch": 1.0855941918226977, + "grad_norm": 0.7414161562919617, + "learning_rate": 1.031942965187738e-05, + "loss": 0.7111566066741943, + "step": 356 + }, + { + "epoch": 1.0886511272449368, + "grad_norm": 0.308473140001297, + "learning_rate": 1.026620521437775e-05, + "loss": 0.7629879713058472, + "step": 357 + }, + { + "epoch": 1.0917080626671762, + "grad_norm": 0.27350732684135437, + "learning_rate": 1.0212973229236787e-05, + "loss": 0.7136012315750122, + "step": 358 + }, + { + "epoch": 1.0947649980894154, + "grad_norm": 0.37481266260147095, + "learning_rate": 1.0159735205725949e-05, + "loss": 0.6634767055511475, + "step": 359 + }, + { + "epoch": 1.0978219335116546, + "grad_norm": 0.2903526723384857, + "learning_rate": 1.0106492653287893e-05, + "loss": 0.6604923009872437, + "step": 360 + }, + { + "epoch": 1.1008788689338938, + "grad_norm": 0.372989296913147, + "learning_rate": 1.0053247081493684e-05, + "loss": 0.6701731085777283, + "step": 361 + }, + { + "epoch": 1.103935804356133, + "grad_norm": 0.38386791944503784, + "learning_rate": 1e-05, + "loss": 0.6767977476119995, + "step": 362 + }, + { + "epoch": 1.106992739778372, + "grad_norm": 0.2837046682834625, + "learning_rate": 9.946752918506319e-06, + "loss": 0.5886228680610657, + "step": 363 + }, + { + "epoch": 1.1100496752006115, + "grad_norm": 0.3196772038936615, + "learning_rate": 9.893507346712112e-06, + "loss": 0.6662254929542542, + "step": 364 + }, + { + "epoch": 1.1131066106228507, + "grad_norm": 0.36623135209083557, + "learning_rate": 9.840264794274053e-06, + "loss": 0.6507357954978943, + "step": 365 + }, + { + "epoch": 1.1161635460450898, + "grad_norm": 0.2803555727005005, + "learning_rate": 9.787026770763216e-06, + "loss": 0.6636874675750732, + "step": 366 + }, + { + "epoch": 1.119220481467329, + "grad_norm": 0.329513818025589, + "learning_rate": 9.733794785622254e-06, + "loss": 0.6378857493400574, + "step": 367 + }, + { + "epoch": 1.1222774168895682, + "grad_norm": 0.24419358372688293, + "learning_rate": 9.680570348122626e-06, + "loss": 0.6794115900993347, + "step": 368 + }, + { + "epoch": 1.1253343523118073, + "grad_norm": 0.2971822917461395, + "learning_rate": 9.627354967321785e-06, + "loss": 0.6401248574256897, + "step": 369 + }, + { + "epoch": 1.1283912877340465, + "grad_norm": 0.5112190842628479, + "learning_rate": 9.574150152020415e-06, + "loss": 0.6886081695556641, + "step": 370 + }, + { + "epoch": 1.131448223156286, + "grad_norm": 0.4284913241863251, + "learning_rate": 9.520957410719632e-06, + "loss": 0.6842222213745117, + "step": 371 + }, + { + "epoch": 1.134505158578525, + "grad_norm": 0.34164664149284363, + "learning_rate": 9.467778251578217e-06, + "loss": 0.6238314509391785, + "step": 372 + }, + { + "epoch": 1.1375620940007642, + "grad_norm": 0.3294171392917633, + "learning_rate": 9.414614182369862e-06, + "loss": 0.6947107911109924, + "step": 373 + }, + { + "epoch": 1.1406190294230034, + "grad_norm": 0.2544155418872833, + "learning_rate": 9.361466710440428e-06, + "loss": 0.717319905757904, + "step": 374 + }, + { + "epoch": 1.1436759648452426, + "grad_norm": 0.3111848533153534, + "learning_rate": 9.308337342665188e-06, + "loss": 0.6222032904624939, + "step": 375 + }, + { + "epoch": 1.1467329002674818, + "grad_norm": 0.3157130777835846, + "learning_rate": 9.255227585406116e-06, + "loss": 0.6126186847686768, + "step": 376 + }, + { + "epoch": 1.1497898356897212, + "grad_norm": 0.29625123739242554, + "learning_rate": 9.202138944469168e-06, + "loss": 0.7452324032783508, + "step": 377 + }, + { + "epoch": 1.1528467711119603, + "grad_norm": 0.31600719690322876, + "learning_rate": 9.149072925061614e-06, + "loss": 0.715571403503418, + "step": 378 + }, + { + "epoch": 1.1559037065341995, + "grad_norm": 0.25878727436065674, + "learning_rate": 9.096031031749321e-06, + "loss": 0.7256120443344116, + "step": 379 + }, + { + "epoch": 1.1589606419564387, + "grad_norm": 0.4058121144771576, + "learning_rate": 9.043014768414125e-06, + "loss": 0.6728136539459229, + "step": 380 + }, + { + "epoch": 1.1620175773786778, + "grad_norm": 0.31269821524620056, + "learning_rate": 8.99002563821118e-06, + "loss": 0.6662668585777283, + "step": 381 + }, + { + "epoch": 1.165074512800917, + "grad_norm": 0.2512218654155731, + "learning_rate": 8.937065143526349e-06, + "loss": 0.6415850520133972, + "step": 382 + }, + { + "epoch": 1.1681314482231562, + "grad_norm": 0.3284171223640442, + "learning_rate": 8.884134785933588e-06, + "loss": 0.6695276498794556, + "step": 383 + }, + { + "epoch": 1.1711883836453956, + "grad_norm": 0.2994699478149414, + "learning_rate": 8.831236066152397e-06, + "loss": 0.7347006797790527, + "step": 384 + }, + { + "epoch": 1.1742453190676347, + "grad_norm": 0.2981257140636444, + "learning_rate": 8.778370484005245e-06, + "loss": 0.6707600951194763, + "step": 385 + }, + { + "epoch": 1.177302254489874, + "grad_norm": 0.2934776842594147, + "learning_rate": 8.725539538375078e-06, + "loss": 0.7245328426361084, + "step": 386 + }, + { + "epoch": 1.180359189912113, + "grad_norm": 0.33115988969802856, + "learning_rate": 8.672744727162782e-06, + "loss": 0.7029488682746887, + "step": 387 + }, + { + "epoch": 1.1834161253343523, + "grad_norm": 0.3322703540325165, + "learning_rate": 8.619987547244746e-06, + "loss": 0.6896190643310547, + "step": 388 + }, + { + "epoch": 1.1864730607565914, + "grad_norm": 0.29254966974258423, + "learning_rate": 8.567269494430404e-06, + "loss": 0.6859920620918274, + "step": 389 + }, + { + "epoch": 1.1895299961788308, + "grad_norm": 0.2923297584056854, + "learning_rate": 8.514592063419833e-06, + "loss": 0.6437527537345886, + "step": 390 + }, + { + "epoch": 1.19258693160107, + "grad_norm": 0.3074567914009094, + "learning_rate": 8.461956747761375e-06, + "loss": 0.7113338708877563, + "step": 391 + }, + { + "epoch": 1.1956438670233092, + "grad_norm": 0.3027377128601074, + "learning_rate": 8.409365039809282e-06, + "loss": 0.7111615538597107, + "step": 392 + }, + { + "epoch": 1.1987008024455483, + "grad_norm": 0.28992199897766113, + "learning_rate": 8.356818430681409e-06, + "loss": 0.7768589854240417, + "step": 393 + }, + { + "epoch": 1.2017577378677875, + "grad_norm": 0.2630784213542938, + "learning_rate": 8.304318410216937e-06, + "loss": 0.5940375328063965, + "step": 394 + }, + { + "epoch": 1.2048146732900267, + "grad_norm": 0.30487746000289917, + "learning_rate": 8.251866466934137e-06, + "loss": 0.6600077748298645, + "step": 395 + }, + { + "epoch": 1.2078716087122658, + "grad_norm": 0.4152087867259979, + "learning_rate": 8.199464087988158e-06, + "loss": 0.6806260347366333, + "step": 396 + }, + { + "epoch": 1.2109285441345052, + "grad_norm": 0.32374435663223267, + "learning_rate": 8.147112759128859e-06, + "loss": 0.7205727100372314, + "step": 397 + }, + { + "epoch": 1.2139854795567444, + "grad_norm": 0.3009904623031616, + "learning_rate": 8.094813964658698e-06, + "loss": 0.6570584774017334, + "step": 398 + }, + { + "epoch": 1.2170424149789836, + "grad_norm": 0.5213649272918701, + "learning_rate": 8.042569187390642e-06, + "loss": 0.6663621664047241, + "step": 399 + }, + { + "epoch": 1.2200993504012227, + "grad_norm": 0.30124184489250183, + "learning_rate": 7.990379908606118e-06, + "loss": 0.672550618648529, + "step": 400 + }, + { + "epoch": 1.2200993504012227, + "eval_loss": 0.6789794564247131, + "eval_runtime": 875.5101, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 400 + }, + { + "epoch": 1.223156285823462, + "grad_norm": 0.31681662797927856, + "learning_rate": 7.938247608013021e-06, + "loss": 0.682239830493927, + "step": 401 + }, + { + "epoch": 1.226213221245701, + "grad_norm": 0.29261210560798645, + "learning_rate": 7.886173763703757e-06, + "loss": 0.6976956725120544, + "step": 402 + }, + { + "epoch": 1.2292701566679405, + "grad_norm": 0.32044124603271484, + "learning_rate": 7.834159852113347e-06, + "loss": 0.6931061744689941, + "step": 403 + }, + { + "epoch": 1.2323270920901797, + "grad_norm": 0.36050841212272644, + "learning_rate": 7.78220734797755e-06, + "loss": 0.7304666638374329, + "step": 404 + }, + { + "epoch": 1.2353840275124188, + "grad_norm": 0.31268882751464844, + "learning_rate": 7.73031772429105e-06, + "loss": 0.5944494605064392, + "step": 405 + }, + { + "epoch": 1.238440962934658, + "grad_norm": 0.33469483256340027, + "learning_rate": 7.678492452265713e-06, + "loss": 0.708702802658081, + "step": 406 + }, + { + "epoch": 1.2414978983568972, + "grad_norm": 0.2789304852485657, + "learning_rate": 7.626733001288852e-06, + "loss": 0.614046037197113, + "step": 407 + }, + { + "epoch": 1.2445548337791363, + "grad_norm": 0.42240089178085327, + "learning_rate": 7.575040838881578e-06, + "loss": 0.7044576406478882, + "step": 408 + }, + { + "epoch": 1.2476117692013755, + "grad_norm": 0.3652958571910858, + "learning_rate": 7.523417430657186e-06, + "loss": 0.7595829963684082, + "step": 409 + }, + { + "epoch": 1.250668704623615, + "grad_norm": 0.28300684690475464, + "learning_rate": 7.471864240279598e-06, + "loss": 0.7289992570877075, + "step": 410 + }, + { + "epoch": 1.253725640045854, + "grad_norm": 0.3463844358921051, + "learning_rate": 7.420382729421883e-06, + "loss": 0.7410037517547607, + "step": 411 + }, + { + "epoch": 1.2567825754680932, + "grad_norm": 0.30792665481567383, + "learning_rate": 7.368974357724789e-06, + "loss": 0.6920305490493774, + "step": 412 + }, + { + "epoch": 1.2598395108903324, + "grad_norm": 0.4354027509689331, + "learning_rate": 7.317640582755373e-06, + "loss": 0.6581035256385803, + "step": 413 + }, + { + "epoch": 1.2628964463125716, + "grad_norm": 0.5033990144729614, + "learning_rate": 7.266382859965673e-06, + "loss": 0.7377368211746216, + "step": 414 + }, + { + "epoch": 1.265953381734811, + "grad_norm": 0.30040669441223145, + "learning_rate": 7.2152026426514395e-06, + "loss": 0.7075121402740479, + "step": 415 + }, + { + "epoch": 1.2690103171570501, + "grad_norm": 0.25443559885025024, + "learning_rate": 7.164101381910939e-06, + "loss": 0.6314805150032043, + "step": 416 + }, + { + "epoch": 1.2720672525792893, + "grad_norm": 0.3807917535305023, + "learning_rate": 7.113080526603793e-06, + "loss": 0.6594043970108032, + "step": 417 + }, + { + "epoch": 1.2751241880015285, + "grad_norm": 0.40388163924217224, + "learning_rate": 7.062141523309918e-06, + "loss": 0.7092217206954956, + "step": 418 + }, + { + "epoch": 1.2781811234237677, + "grad_norm": 0.31380078196525574, + "learning_rate": 7.011285816288496e-06, + "loss": 0.6039083003997803, + "step": 419 + }, + { + "epoch": 1.2812380588460068, + "grad_norm": 0.3492945730686188, + "learning_rate": 6.96051484743705e-06, + "loss": 0.648531973361969, + "step": 420 + }, + { + "epoch": 1.284294994268246, + "grad_norm": 0.2891562283039093, + "learning_rate": 6.909830056250527e-06, + "loss": 0.6646198630332947, + "step": 421 + }, + { + "epoch": 1.2873519296904852, + "grad_norm": 0.316986083984375, + "learning_rate": 6.859232879780515e-06, + "loss": 0.7188717126846313, + "step": 422 + }, + { + "epoch": 1.2904088651127246, + "grad_norm": 0.38996225595474243, + "learning_rate": 6.8087247525944745e-06, + "loss": 0.6890851855278015, + "step": 423 + }, + { + "epoch": 1.2934658005349637, + "grad_norm": 0.3303278684616089, + "learning_rate": 6.758307106735094e-06, + "loss": 0.7118897438049316, + "step": 424 + }, + { + "epoch": 1.296522735957203, + "grad_norm": 0.26401078701019287, + "learning_rate": 6.707981371679657e-06, + "loss": 0.6749597787857056, + "step": 425 + }, + { + "epoch": 1.299579671379442, + "grad_norm": 0.3269912898540497, + "learning_rate": 6.657748974299529e-06, + "loss": 0.6718383431434631, + "step": 426 + }, + { + "epoch": 1.3026366068016813, + "grad_norm": 0.35413047671318054, + "learning_rate": 6.607611338819697e-06, + "loss": 0.6674888134002686, + "step": 427 + }, + { + "epoch": 1.3056935422239206, + "grad_norm": 0.44566094875335693, + "learning_rate": 6.557569886778401e-06, + "loss": 0.6900228261947632, + "step": 428 + }, + { + "epoch": 1.3087504776461598, + "grad_norm": 0.3536953628063202, + "learning_rate": 6.507626036986804e-06, + "loss": 0.6681596040725708, + "step": 429 + }, + { + "epoch": 1.311807413068399, + "grad_norm": 0.43866440653800964, + "learning_rate": 6.457781205488791e-06, + "loss": 0.7463353872299194, + "step": 430 + }, + { + "epoch": 1.3148643484906382, + "grad_norm": 0.32117530703544617, + "learning_rate": 6.408036805520801e-06, + "loss": 0.7138527035713196, + "step": 431 + }, + { + "epoch": 1.3179212839128773, + "grad_norm": 0.3075023293495178, + "learning_rate": 6.358394247471779e-06, + "loss": 0.6958800554275513, + "step": 432 + }, + { + "epoch": 1.3209782193351165, + "grad_norm": 0.31068870425224304, + "learning_rate": 6.308854938843161e-06, + "loss": 0.6728611588478088, + "step": 433 + }, + { + "epoch": 1.3240351547573557, + "grad_norm": 0.2871341407299042, + "learning_rate": 6.259420284208987e-06, + "loss": 0.6983805894851685, + "step": 434 + }, + { + "epoch": 1.3270920901795948, + "grad_norm": 0.3626168966293335, + "learning_rate": 6.210091685176067e-06, + "loss": 0.6707543134689331, + "step": 435 + }, + { + "epoch": 1.3301490256018342, + "grad_norm": 0.2960391640663147, + "learning_rate": 6.160870540344261e-06, + "loss": 0.6212095618247986, + "step": 436 + }, + { + "epoch": 1.3332059610240734, + "grad_norm": 0.29114195704460144, + "learning_rate": 6.111758245266795e-06, + "loss": 0.695442795753479, + "step": 437 + }, + { + "epoch": 1.3362628964463126, + "grad_norm": 0.2911393642425537, + "learning_rate": 6.0627561924107145e-06, + "loss": 0.7576844096183777, + "step": 438 + }, + { + "epoch": 1.3393198318685517, + "grad_norm": 0.2754829227924347, + "learning_rate": 6.013865771117394e-06, + "loss": 0.7611621022224426, + "step": 439 + }, + { + "epoch": 1.342376767290791, + "grad_norm": 0.47688090801239014, + "learning_rate": 5.965088367563162e-06, + "loss": 0.6706432104110718, + "step": 440 + }, + { + "epoch": 1.3454337027130303, + "grad_norm": 0.38662102818489075, + "learning_rate": 5.916425364719975e-06, + "loss": 0.7257411479949951, + "step": 441 + }, + { + "epoch": 1.3484906381352695, + "grad_norm": 0.29597020149230957, + "learning_rate": 5.867878142316221e-06, + "loss": 0.6695491671562195, + "step": 442 + }, + { + "epoch": 1.3515475735575087, + "grad_norm": 0.36503320932388306, + "learning_rate": 5.8194480767976e-06, + "loss": 0.6762661933898926, + "step": 443 + }, + { + "epoch": 1.3546045089797478, + "grad_norm": 0.29297393560409546, + "learning_rate": 5.7711365412880895e-06, + "loss": 0.6601616740226746, + "step": 444 + }, + { + "epoch": 1.357661444401987, + "grad_norm": 0.3229820430278778, + "learning_rate": 5.7229449055510335e-06, + "loss": 0.7049432992935181, + "step": 445 + }, + { + "epoch": 1.3607183798242262, + "grad_norm": 0.3359116017818451, + "learning_rate": 5.674874535950279e-06, + "loss": 0.6643913388252258, + "step": 446 + }, + { + "epoch": 1.3637753152464653, + "grad_norm": 0.349298357963562, + "learning_rate": 5.626926795411447e-06, + "loss": 0.7177180647850037, + "step": 447 + }, + { + "epoch": 1.3668322506687045, + "grad_norm": 0.30045273900032043, + "learning_rate": 5.579103043383305e-06, + "loss": 0.6765077710151672, + "step": 448 + }, + { + "epoch": 1.369889186090944, + "grad_norm": 0.3676189184188843, + "learning_rate": 5.531404635799191e-06, + "loss": 0.6421419978141785, + "step": 449 + }, + { + "epoch": 1.372946121513183, + "grad_norm": 0.3337932527065277, + "learning_rate": 5.4838329250386076e-06, + "loss": 0.649316668510437, + "step": 450 + }, + { + "epoch": 1.372946121513183, + "eval_loss": 0.6703284978866577, + "eval_runtime": 907.8663, + "eval_samples_per_second": 0.664, + "eval_steps_per_second": 0.664, + "step": 450 + }, + { + "epoch": 1.3760030569354222, + "grad_norm": 0.314387708902359, + "learning_rate": 5.436389259888841e-06, + "loss": 0.7333119511604309, + "step": 451 + }, + { + "epoch": 1.3790599923576614, + "grad_norm": 0.4056478440761566, + "learning_rate": 5.38907498550674e-06, + "loss": 0.6451212763786316, + "step": 452 + }, + { + "epoch": 1.3821169277799006, + "grad_norm": 0.42358386516571045, + "learning_rate": 5.341891443380585e-06, + "loss": 0.6462752819061279, + "step": 453 + }, + { + "epoch": 1.38517386320214, + "grad_norm": 0.3606562912464142, + "learning_rate": 5.294839971292026e-06, + "loss": 0.717352569103241, + "step": 454 + }, + { + "epoch": 1.3882307986243791, + "grad_norm": 0.3014855682849884, + "learning_rate": 5.247921903278177e-06, + "loss": 0.7015582323074341, + "step": 455 + }, + { + "epoch": 1.3912877340466183, + "grad_norm": 0.5155187845230103, + "learning_rate": 5.20113856959378e-06, + "loss": 0.6660122275352478, + "step": 456 + }, + { + "epoch": 1.3943446694688575, + "grad_norm": 0.35195642709732056, + "learning_rate": 5.1544912966735e-06, + "loss": 0.6980377435684204, + "step": 457 + }, + { + "epoch": 1.3974016048910967, + "grad_norm": 0.28842753171920776, + "learning_rate": 5.1079814070943e-06, + "loss": 0.6926653385162354, + "step": 458 + }, + { + "epoch": 1.4004585403133358, + "grad_norm": 0.354425311088562, + "learning_rate": 5.06161021953796e-06, + "loss": 0.6412813067436218, + "step": 459 + }, + { + "epoch": 1.403515475735575, + "grad_norm": 0.30584967136383057, + "learning_rate": 5.015379048753669e-06, + "loss": 0.6897266507148743, + "step": 460 + }, + { + "epoch": 1.4065724111578142, + "grad_norm": 0.3659093677997589, + "learning_rate": 4.9692892055207784e-06, + "loss": 0.6777257919311523, + "step": 461 + }, + { + "epoch": 1.4096293465800536, + "grad_norm": 0.6798201203346252, + "learning_rate": 4.923341996611604e-06, + "loss": 0.7499118447303772, + "step": 462 + }, + { + "epoch": 1.4126862820022927, + "grad_norm": 0.36423686146736145, + "learning_rate": 4.877538724754392e-06, + "loss": 0.6341705322265625, + "step": 463 + }, + { + "epoch": 1.415743217424532, + "grad_norm": 0.29527905583381653, + "learning_rate": 4.831880688596392e-06, + "loss": 0.566770076751709, + "step": 464 + }, + { + "epoch": 1.418800152846771, + "grad_norm": 0.3342158794403076, + "learning_rate": 4.7863691826670146e-06, + "loss": 0.6926667094230652, + "step": 465 + }, + { + "epoch": 1.4218570882690102, + "grad_norm": 0.35585087537765503, + "learning_rate": 4.741005497341154e-06, + "loss": 0.6302958130836487, + "step": 466 + }, + { + "epoch": 1.4249140236912496, + "grad_norm": 0.5740730166435242, + "learning_rate": 4.695790918802577e-06, + "loss": 0.7842360138893127, + "step": 467 + }, + { + "epoch": 1.4279709591134888, + "grad_norm": 0.4422702491283417, + "learning_rate": 4.650726729007465e-06, + "loss": 0.6199318766593933, + "step": 468 + }, + { + "epoch": 1.431027894535728, + "grad_norm": 0.3458646833896637, + "learning_rate": 4.605814205648087e-06, + "loss": 0.7013853788375854, + "step": 469 + }, + { + "epoch": 1.4340848299579672, + "grad_norm": 0.326727956533432, + "learning_rate": 4.56105462211654e-06, + "loss": 0.7208451628684998, + "step": 470 + }, + { + "epoch": 1.4371417653802063, + "grad_norm": 0.3491531014442444, + "learning_rate": 4.516449247468666e-06, + "loss": 0.6491535902023315, + "step": 471 + }, + { + "epoch": 1.4401987008024455, + "grad_norm": 0.31401777267456055, + "learning_rate": 4.4719993463880695e-06, + "loss": 0.6603784561157227, + "step": 472 + }, + { + "epoch": 1.4432556362246847, + "grad_norm": 0.3741454780101776, + "learning_rate": 4.427706179150247e-06, + "loss": 0.6068110466003418, + "step": 473 + }, + { + "epoch": 1.4463125716469238, + "grad_norm": 0.3205011188983917, + "learning_rate": 4.383571001586883e-06, + "loss": 0.6427788138389587, + "step": 474 + }, + { + "epoch": 1.4493695070691632, + "grad_norm": 0.2519795894622803, + "learning_rate": 4.339595065050206e-06, + "loss": 0.626676082611084, + "step": 475 + }, + { + "epoch": 1.4524264424914024, + "grad_norm": 0.3499923050403595, + "learning_rate": 4.29577961637754e-06, + "loss": 0.7192115187644958, + "step": 476 + }, + { + "epoch": 1.4554833779136416, + "grad_norm": 0.6267193555831909, + "learning_rate": 4.2521258978559324e-06, + "loss": 0.6705955862998962, + "step": 477 + }, + { + "epoch": 1.4585403133358807, + "grad_norm": 0.5547561049461365, + "learning_rate": 4.208635147186956e-06, + "loss": 0.6040648818016052, + "step": 478 + }, + { + "epoch": 1.46159724875812, + "grad_norm": 0.2949749529361725, + "learning_rate": 4.165308597451586e-06, + "loss": 0.6205201148986816, + "step": 479 + }, + { + "epoch": 1.4646541841803593, + "grad_norm": 0.2873048782348633, + "learning_rate": 4.12214747707527e-06, + "loss": 0.6886979937553406, + "step": 480 + }, + { + "epoch": 1.4677111196025985, + "grad_norm": 0.33694973587989807, + "learning_rate": 4.079153009793068e-06, + "loss": 0.6656784415245056, + "step": 481 + }, + { + "epoch": 1.4707680550248377, + "grad_norm": 0.3373357057571411, + "learning_rate": 4.036326414614985e-06, + "loss": 0.6573168635368347, + "step": 482 + }, + { + "epoch": 1.4738249904470768, + "grad_norm": 0.3189850151538849, + "learning_rate": 3.99366890579139e-06, + "loss": 0.6631187200546265, + "step": 483 + }, + { + "epoch": 1.476881925869316, + "grad_norm": 0.34659212827682495, + "learning_rate": 3.951181692778594e-06, + "loss": 0.5881021022796631, + "step": 484 + }, + { + "epoch": 1.4799388612915552, + "grad_norm": 0.4184463918209076, + "learning_rate": 3.908865980204555e-06, + "loss": 0.7232425212860107, + "step": 485 + }, + { + "epoch": 1.4829957967137943, + "grad_norm": 0.3163282573223114, + "learning_rate": 3.86672296783474e-06, + "loss": 0.6624961495399475, + "step": 486 + }, + { + "epoch": 1.4860527321360335, + "grad_norm": 0.3175446689128876, + "learning_rate": 3.824753850538082e-06, + "loss": 0.6616235971450806, + "step": 487 + }, + { + "epoch": 1.489109667558273, + "grad_norm": 0.3493629992008209, + "learning_rate": 3.782959818253126e-06, + "loss": 0.6923587918281555, + "step": 488 + }, + { + "epoch": 1.492166602980512, + "grad_norm": 0.30385154485702515, + "learning_rate": 3.741342055954269e-06, + "loss": 0.6668528914451599, + "step": 489 + }, + { + "epoch": 1.4952235384027512, + "grad_norm": 0.319979727268219, + "learning_rate": 3.699901743618194e-06, + "loss": 0.6276881098747253, + "step": 490 + }, + { + "epoch": 1.4982804738249904, + "grad_norm": 0.28717750310897827, + "learning_rate": 3.658640056190378e-06, + "loss": 0.7676356434822083, + "step": 491 + }, + { + "epoch": 1.5013374092472298, + "grad_norm": 0.4701229929924011, + "learning_rate": 3.617558163551802e-06, + "loss": 0.6021715402603149, + "step": 492 + }, + { + "epoch": 1.504394344669469, + "grad_norm": 0.4959515929222107, + "learning_rate": 3.576657230485775e-06, + "loss": 0.7243677973747253, + "step": 493 + }, + { + "epoch": 1.5074512800917081, + "grad_norm": 0.32071781158447266, + "learning_rate": 3.5359384166449185e-06, + "loss": 0.7030311822891235, + "step": 494 + }, + { + "epoch": 1.5105082155139473, + "grad_norm": 0.3393514156341553, + "learning_rate": 3.4954028765182633e-06, + "loss": 0.6344490051269531, + "step": 495 + }, + { + "epoch": 1.5135651509361865, + "grad_norm": 0.273512065410614, + "learning_rate": 3.4550517593985512e-06, + "loss": 0.5816606879234314, + "step": 496 + }, + { + "epoch": 1.5166220863584257, + "grad_norm": 0.6631937026977539, + "learning_rate": 3.414886209349615e-06, + "loss": 0.6091232895851135, + "step": 497 + }, + { + "epoch": 1.5196790217806648, + "grad_norm": 0.6976932287216187, + "learning_rate": 3.3749073651739594e-06, + "loss": 0.7076858282089233, + "step": 498 + }, + { + "epoch": 1.522735957202904, + "grad_norm": 0.35580119490623474, + "learning_rate": 3.3351163603804805e-06, + "loss": 0.6363418698310852, + "step": 499 + }, + { + "epoch": 1.5257928926251432, + "grad_norm": 0.30289211869239807, + "learning_rate": 3.2955143231523067e-06, + "loss": 0.6716225147247314, + "step": 500 + }, + { + "epoch": 1.5257928926251432, + "eval_loss": 0.6648170948028564, + "eval_runtime": 870.3243, + "eval_samples_per_second": 0.693, + "eval_steps_per_second": 0.693, + "step": 500 + }, + { + "epoch": 1.5288498280473823, + "grad_norm": 0.33276933431625366, + "learning_rate": 3.2561023763148237e-06, + "loss": 0.6512227058410645, + "step": 501 + }, + { + "epoch": 1.5319067634696217, + "grad_norm": 0.40328240394592285, + "learning_rate": 3.216881637303839e-06, + "loss": 0.7053738236427307, + "step": 502 + }, + { + "epoch": 1.534963698891861, + "grad_norm": 0.2589263916015625, + "learning_rate": 3.177853218133905e-06, + "loss": 0.697374165058136, + "step": 503 + }, + { + "epoch": 1.5380206343141, + "grad_norm": 0.5453576445579529, + "learning_rate": 3.1390182253667745e-06, + "loss": 0.6664954423904419, + "step": 504 + }, + { + "epoch": 1.5410775697363395, + "grad_norm": 0.5521278381347656, + "learning_rate": 3.100377760080041e-06, + "loss": 0.662231981754303, + "step": 505 + }, + { + "epoch": 1.5441345051585786, + "grad_norm": 0.3097061216831207, + "learning_rate": 3.0619329178359103e-06, + "loss": 0.751462459564209, + "step": 506 + }, + { + "epoch": 1.5471914405808178, + "grad_norm": 0.32505670189857483, + "learning_rate": 3.023684788650154e-06, + "loss": 0.6908425688743591, + "step": 507 + }, + { + "epoch": 1.550248376003057, + "grad_norm": 0.4177548587322235, + "learning_rate": 2.985634456961184e-06, + "loss": 0.6698168516159058, + "step": 508 + }, + { + "epoch": 1.5533053114252962, + "grad_norm": 0.3030829131603241, + "learning_rate": 2.947783001599315e-06, + "loss": 0.6403611302375793, + "step": 509 + }, + { + "epoch": 1.5563622468475353, + "grad_norm": 0.2690201997756958, + "learning_rate": 2.9101314957561864e-06, + "loss": 0.6056875586509705, + "step": 510 + }, + { + "epoch": 1.5594191822697745, + "grad_norm": 0.2733827829360962, + "learning_rate": 2.8726810069543156e-06, + "loss": 0.7140977382659912, + "step": 511 + }, + { + "epoch": 1.5624761176920137, + "grad_norm": 0.2995041310787201, + "learning_rate": 2.8354325970168483e-06, + "loss": 0.6062126159667969, + "step": 512 + }, + { + "epoch": 1.5655330531142528, + "grad_norm": 0.2860231101512909, + "learning_rate": 2.7983873220374415e-06, + "loss": 0.6048973798751831, + "step": 513 + }, + { + "epoch": 1.568589988536492, + "grad_norm": 0.3419671058654785, + "learning_rate": 2.7615462323503186e-06, + "loss": 0.630670964717865, + "step": 514 + }, + { + "epoch": 1.5716469239587314, + "grad_norm": 0.3721083700656891, + "learning_rate": 2.724910372500508e-06, + "loss": 0.6205880641937256, + "step": 515 + }, + { + "epoch": 1.5747038593809706, + "grad_norm": 0.8053601384162903, + "learning_rate": 2.6884807812142043e-06, + "loss": 0.6468279361724854, + "step": 516 + }, + { + "epoch": 1.5777607948032097, + "grad_norm": 0.30676576495170593, + "learning_rate": 2.6522584913693295e-06, + "loss": 0.6104784607887268, + "step": 517 + }, + { + "epoch": 1.5808177302254491, + "grad_norm": 0.32430994510650635, + "learning_rate": 2.616244529966244e-06, + "loss": 0.6879785060882568, + "step": 518 + }, + { + "epoch": 1.5838746656476883, + "grad_norm": 0.2668575942516327, + "learning_rate": 2.5804399180986417e-06, + "loss": 0.6742456555366516, + "step": 519 + }, + { + "epoch": 1.5869316010699275, + "grad_norm": 0.41760483384132385, + "learning_rate": 2.544845670924575e-06, + "loss": 0.5823814868927002, + "step": 520 + }, + { + "epoch": 1.5899885364921666, + "grad_norm": 0.332041472196579, + "learning_rate": 2.509462797637693e-06, + "loss": 0.653259813785553, + "step": 521 + }, + { + "epoch": 1.5930454719144058, + "grad_norm": 0.3437623381614685, + "learning_rate": 2.4742923014386154e-06, + "loss": 0.6304376721382141, + "step": 522 + }, + { + "epoch": 1.596102407336645, + "grad_norm": 0.2744190990924835, + "learning_rate": 2.4393351795065023e-06, + "loss": 0.8250125646591187, + "step": 523 + }, + { + "epoch": 1.5991593427588842, + "grad_norm": 0.3014289140701294, + "learning_rate": 2.4045924229707663e-06, + "loss": 0.7557496428489685, + "step": 524 + }, + { + "epoch": 1.6022162781811233, + "grad_norm": 0.33593595027923584, + "learning_rate": 2.3700650168829765e-06, + "loss": 0.6550201773643494, + "step": 525 + }, + { + "epoch": 1.6052732136033625, + "grad_norm": 0.289989173412323, + "learning_rate": 2.3357539401889438e-06, + "loss": 0.5847223997116089, + "step": 526 + }, + { + "epoch": 1.6083301490256017, + "grad_norm": 0.3140230178833008, + "learning_rate": 2.3016601657009364e-06, + "loss": 0.7059583067893982, + "step": 527 + }, + { + "epoch": 1.611387084447841, + "grad_norm": 0.5017932653427124, + "learning_rate": 2.2677846600701305e-06, + "loss": 0.6565676927566528, + "step": 528 + }, + { + "epoch": 1.6144440198700802, + "grad_norm": 0.2757347822189331, + "learning_rate": 2.234128383759174e-06, + "loss": 0.5888017416000366, + "step": 529 + }, + { + "epoch": 1.6175009552923194, + "grad_norm": 0.3413706421852112, + "learning_rate": 2.2006922910149743e-06, + "loss": 0.6747739315032959, + "step": 530 + }, + { + "epoch": 1.6205578907145588, + "grad_norm": 0.2861206829547882, + "learning_rate": 2.167477329841633e-06, + "loss": 0.6995899677276611, + "step": 531 + }, + { + "epoch": 1.623614826136798, + "grad_norm": 0.4095499515533447, + "learning_rate": 2.1344844419735757e-06, + "loss": 0.6285294890403748, + "step": 532 + }, + { + "epoch": 1.6266717615590371, + "grad_norm": 0.25976240634918213, + "learning_rate": 2.101714562848841e-06, + "loss": 0.607745349407196, + "step": 533 + }, + { + "epoch": 1.6297286969812763, + "grad_norm": 0.2760326564311981, + "learning_rate": 2.069168621582567e-06, + "loss": 0.681461751461029, + "step": 534 + }, + { + "epoch": 1.6327856324035155, + "grad_norm": 0.29883530735969543, + "learning_rate": 2.0368475409406396e-06, + "loss": 0.6930239200592041, + "step": 535 + }, + { + "epoch": 1.6358425678257547, + "grad_norm": 0.2769938111305237, + "learning_rate": 2.004752237313544e-06, + "loss": 0.6871459484100342, + "step": 536 + }, + { + "epoch": 1.6388995032479938, + "grad_norm": 0.5758352875709534, + "learning_rate": 1.972883620690366e-06, + "loss": 0.6905091404914856, + "step": 537 + }, + { + "epoch": 1.641956438670233, + "grad_norm": 0.302348792552948, + "learning_rate": 1.9412425946329994e-06, + "loss": 0.7119919061660767, + "step": 538 + }, + { + "epoch": 1.6450133740924722, + "grad_norm": 0.2754940986633301, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.6610316038131714, + "step": 539 + }, + { + "epoch": 1.6480703095147113, + "grad_norm": 0.27256953716278076, + "learning_rate": 1.8786468961737902e-06, + "loss": 0.6504456996917725, + "step": 540 + }, + { + "epoch": 1.6511272449369507, + "grad_norm": 0.3459402620792389, + "learning_rate": 1.8476939985301257e-06, + "loss": 0.673663854598999, + "step": 541 + }, + { + "epoch": 1.65418418035919, + "grad_norm": 0.374275267124176, + "learning_rate": 1.81697224091831e-06, + "loss": 0.6528961658477783, + "step": 542 + }, + { + "epoch": 1.657241115781429, + "grad_norm": 0.310211181640625, + "learning_rate": 1.7864824943836633e-06, + "loss": 0.664339005947113, + "step": 543 + }, + { + "epoch": 1.6602980512036685, + "grad_norm": 0.34453052282333374, + "learning_rate": 1.7562256233933717e-06, + "loss": 0.6874368190765381, + "step": 544 + }, + { + "epoch": 1.6633549866259076, + "grad_norm": 0.3484613299369812, + "learning_rate": 1.7262024858119597e-06, + "loss": 0.7023600935935974, + "step": 545 + }, + { + "epoch": 1.6664119220481468, + "grad_norm": 0.45776957273483276, + "learning_rate": 1.6964139328769736e-06, + "loss": 0.6404401659965515, + "step": 546 + }, + { + "epoch": 1.669468857470386, + "grad_norm": 0.2930310368537903, + "learning_rate": 1.6668608091748495e-06, + "loss": 0.6716583967208862, + "step": 547 + }, + { + "epoch": 1.6725257928926252, + "grad_norm": 0.3713250160217285, + "learning_rate": 1.637543952616969e-06, + "loss": 0.6601635813713074, + "step": 548 + }, + { + "epoch": 1.6755827283148643, + "grad_norm": 0.3368103802204132, + "learning_rate": 1.6084641944158918e-06, + "loss": 0.6788731217384338, + "step": 549 + }, + { + "epoch": 1.6786396637371035, + "grad_norm": 0.2993035912513733, + "learning_rate": 1.5796223590617987e-06, + "loss": 0.6544529795646667, + "step": 550 + }, + { + "epoch": 1.6786396637371035, + "eval_loss": 0.6616687178611755, + "eval_runtime": 875.9833, + "eval_samples_per_second": 0.688, + "eval_steps_per_second": 0.688, + "step": 550 + }, + { + "epoch": 1.6816965991593427, + "grad_norm": 0.44005870819091797, + "learning_rate": 1.5510192642991073e-06, + "loss": 0.6850336194038391, + "step": 551 + }, + { + "epoch": 1.6847535345815818, + "grad_norm": 0.4457947611808777, + "learning_rate": 1.522655721103291e-06, + "loss": 0.6001553535461426, + "step": 552 + }, + { + "epoch": 1.687810470003821, + "grad_norm": 0.47378861904144287, + "learning_rate": 1.494532533657893e-06, + "loss": 0.7040194272994995, + "step": 553 + }, + { + "epoch": 1.6908674054260604, + "grad_norm": 0.38698890805244446, + "learning_rate": 1.4666504993317089e-06, + "loss": 0.7009314298629761, + "step": 554 + }, + { + "epoch": 1.6939243408482996, + "grad_norm": 0.3362627625465393, + "learning_rate": 1.4390104086561886e-06, + "loss": 0.6950737237930298, + "step": 555 + }, + { + "epoch": 1.6969812762705387, + "grad_norm": 0.36643826961517334, + "learning_rate": 1.4116130453030296e-06, + "loss": 0.6862865686416626, + "step": 556 + }, + { + "epoch": 1.7000382116927781, + "grad_norm": 0.33834755420684814, + "learning_rate": 1.3844591860619382e-06, + "loss": 0.6385370492935181, + "step": 557 + }, + { + "epoch": 1.7030951471150173, + "grad_norm": 0.2850823700428009, + "learning_rate": 1.3575496008186307e-06, + "loss": 0.5935351848602295, + "step": 558 + }, + { + "epoch": 1.7061520825372565, + "grad_norm": 0.29303666949272156, + "learning_rate": 1.330885052532981e-06, + "loss": 0.6652261018753052, + "step": 559 + }, + { + "epoch": 1.7092090179594956, + "grad_norm": 0.2667746841907501, + "learning_rate": 1.3044662972174005e-06, + "loss": 0.6116664409637451, + "step": 560 + }, + { + "epoch": 1.7122659533817348, + "grad_norm": 0.35388344526290894, + "learning_rate": 1.2782940839154113e-06, + "loss": 0.6909575462341309, + "step": 561 + }, + { + "epoch": 1.715322888803974, + "grad_norm": 0.3212358057498932, + "learning_rate": 1.2523691546803872e-06, + "loss": 0.5729340314865112, + "step": 562 + }, + { + "epoch": 1.7183798242262132, + "grad_norm": 0.3078250288963318, + "learning_rate": 1.2266922445545348e-06, + "loss": 0.6341389417648315, + "step": 563 + }, + { + "epoch": 1.7214367596484523, + "grad_norm": 0.3041326403617859, + "learning_rate": 1.201264081548038e-06, + "loss": 0.7670491337776184, + "step": 564 + }, + { + "epoch": 1.7244936950706915, + "grad_norm": 0.3577534854412079, + "learning_rate": 1.176085386618434e-06, + "loss": 0.7452418804168701, + "step": 565 + }, + { + "epoch": 1.7275506304929307, + "grad_norm": 0.3138960897922516, + "learning_rate": 1.151156873650151e-06, + "loss": 0.6182627081871033, + "step": 566 + }, + { + "epoch": 1.73060756591517, + "grad_norm": 0.29401692748069763, + "learning_rate": 1.1264792494342858e-06, + "loss": 0.7683947682380676, + "step": 567 + }, + { + "epoch": 1.7336645013374092, + "grad_norm": 0.42694059014320374, + "learning_rate": 1.1020532136485517e-06, + "loss": 0.6643114686012268, + "step": 568 + }, + { + "epoch": 1.7367214367596484, + "grad_norm": 0.3185805082321167, + "learning_rate": 1.0778794588374542e-06, + "loss": 0.6443809866905212, + "step": 569 + }, + { + "epoch": 1.7397783721818878, + "grad_norm": 0.39810633659362793, + "learning_rate": 1.0539586703926396e-06, + "loss": 0.6940271258354187, + "step": 570 + }, + { + "epoch": 1.742835307604127, + "grad_norm": 0.3531099557876587, + "learning_rate": 1.0302915265334722e-06, + "loss": 0.62273770570755, + "step": 571 + }, + { + "epoch": 1.7458922430263661, + "grad_norm": 0.303533136844635, + "learning_rate": 1.0068786982878087e-06, + "loss": 0.6589292883872986, + "step": 572 + }, + { + "epoch": 1.7489491784486053, + "grad_norm": 0.3740532398223877, + "learning_rate": 9.837208494729567e-07, + "loss": 0.7088748216629028, + "step": 573 + }, + { + "epoch": 1.7520061138708445, + "grad_norm": 0.28268831968307495, + "learning_rate": 9.608186366768746e-07, + "loss": 0.6833463907241821, + "step": 574 + }, + { + "epoch": 1.7550630492930837, + "grad_norm": 0.31762558221817017, + "learning_rate": 9.381727092395365e-07, + "loss": 0.6840337514877319, + "step": 575 + }, + { + "epoch": 1.7581199847153228, + "grad_norm": 0.3333055078983307, + "learning_rate": 9.157837092345334e-07, + "loss": 0.7084675431251526, + "step": 576 + }, + { + "epoch": 1.761176920137562, + "grad_norm": 0.2991984784603119, + "learning_rate": 8.936522714508678e-07, + "loss": 0.7238477468490601, + "step": 577 + }, + { + "epoch": 1.7642338555598012, + "grad_norm": 0.28052636981010437, + "learning_rate": 8.71779023374949e-07, + "loss": 0.6483154892921448, + "step": 578 + }, + { + "epoch": 1.7672907909820403, + "grad_norm": 0.31360605359077454, + "learning_rate": 8.501645851728091e-07, + "loss": 0.6550958156585693, + "step": 579 + }, + { + "epoch": 1.7703477264042797, + "grad_norm": 0.2856346666812897, + "learning_rate": 8.28809569672514e-07, + "loss": 0.6386545300483704, + "step": 580 + }, + { + "epoch": 1.773404661826519, + "grad_norm": 0.4174005389213562, + "learning_rate": 8.077145823467924e-07, + "loss": 0.6630646586418152, + "step": 581 + }, + { + "epoch": 1.776461597248758, + "grad_norm": 0.2678094506263733, + "learning_rate": 7.868802212958704e-07, + "loss": 0.7088242769241333, + "step": 582 + }, + { + "epoch": 1.7795185326709975, + "grad_norm": 0.33474841713905334, + "learning_rate": 7.663070772305081e-07, + "loss": 0.7061930298805237, + "step": 583 + }, + { + "epoch": 1.7825754680932366, + "grad_norm": 0.30635929107666016, + "learning_rate": 7.459957334552526e-07, + "loss": 0.7023921608924866, + "step": 584 + }, + { + "epoch": 1.7856324035154758, + "grad_norm": 0.3720168173313141, + "learning_rate": 7.259467658519026e-07, + "loss": 0.6405187845230103, + "step": 585 + }, + { + "epoch": 1.788689338937715, + "grad_norm": 0.30746224522590637, + "learning_rate": 7.061607428631823e-07, + "loss": 0.7479575872421265, + "step": 586 + }, + { + "epoch": 1.7917462743599541, + "grad_norm": 0.37346151471138, + "learning_rate": 6.866382254766158e-07, + "loss": 0.73829185962677, + "step": 587 + }, + { + "epoch": 1.7948032097821933, + "grad_norm": 0.3968294858932495, + "learning_rate": 6.673797672086335e-07, + "loss": 0.7156046032905579, + "step": 588 + }, + { + "epoch": 1.7978601452044325, + "grad_norm": 0.3264223635196686, + "learning_rate": 6.483859140888648e-07, + "loss": 0.6457011699676514, + "step": 589 + }, + { + "epoch": 1.8009170806266717, + "grad_norm": 0.3268529772758484, + "learning_rate": 6.296572046446725e-07, + "loss": 0.7092617750167847, + "step": 590 + }, + { + "epoch": 1.8039740160489108, + "grad_norm": 0.2968194782733917, + "learning_rate": 6.111941698858681e-07, + "loss": 0.7103247046470642, + "step": 591 + }, + { + "epoch": 1.8070309514711502, + "grad_norm": 0.6012208461761475, + "learning_rate": 5.929973332896677e-07, + "loss": 0.6195952892303467, + "step": 592 + }, + { + "epoch": 1.8100878868933894, + "grad_norm": 0.31401294469833374, + "learning_rate": 5.750672107858435e-07, + "loss": 0.7382717728614807, + "step": 593 + }, + { + "epoch": 1.8131448223156286, + "grad_norm": 0.3620605170726776, + "learning_rate": 5.574043107421023e-07, + "loss": 0.612289547920227, + "step": 594 + }, + { + "epoch": 1.8162017577378677, + "grad_norm": 0.2869480848312378, + "learning_rate": 5.400091339496638e-07, + "loss": 0.7518821358680725, + "step": 595 + }, + { + "epoch": 1.8192586931601071, + "grad_norm": 0.33768531680107117, + "learning_rate": 5.228821736090684e-07, + "loss": 0.7100391983985901, + "step": 596 + }, + { + "epoch": 1.8223156285823463, + "grad_norm": 0.39242854714393616, + "learning_rate": 5.060239153161872e-07, + "loss": 0.6121487617492676, + "step": 597 + }, + { + "epoch": 1.8253725640045855, + "grad_norm": 0.35079774260520935, + "learning_rate": 4.894348370484648e-07, + "loss": 0.6359960436820984, + "step": 598 + }, + { + "epoch": 1.8284294994268246, + "grad_norm": 0.29979392886161804, + "learning_rate": 4.731154091513546e-07, + "loss": 0.7085576057434082, + "step": 599 + }, + { + "epoch": 1.8314864348490638, + "grad_norm": 0.4967261850833893, + "learning_rate": 4.570660943249927e-07, + "loss": 0.6123998165130615, + "step": 600 + }, + { + "epoch": 1.8314864348490638, + "eval_loss": 0.6604031324386597, + "eval_runtime": 874.6571, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 600 + } + ], + "logging_steps": 1, + "max_steps": 656, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.308044421538251e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-600/training_args.bin b/cpt_qwen_14B/checkpoints/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eddbb43a2cebb928dbed6e955a37ebfa3174f4b5 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a8e308e47eb936f678712445b19ddc52638f354c37c813ecaa432f69120a2e +size 5201 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/README.md b/cpt_qwen_14B/checkpoints/checkpoint-656/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8dfda26032514233f3e70a4012f1cfd1ddbbb609 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/README.md @@ -0,0 +1,207 @@ +--- +base_model: /workspace/Models/Qwen2.5-Coder-14B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:/workspace/Models/Qwen2.5-Coder-14B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/adapter_config.json b/cpt_qwen_14B/checkpoints/checkpoint-656/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..81c31359285f7e351a44275c30b6882f4c6b50c0 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/Models/Qwen2.5-Coder-14B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/adapter_model.safetensors b/cpt_qwen_14B/checkpoints/checkpoint-656/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b1105d332292f54c5c79fcde2530cdf056c70ff7 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93c93aba91b6a0927e2dc718a8aea115aa221981a6dea920e20239a06cd6a449 +size 201378736 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/chat_template.jinja b/cpt_qwen_14B/checkpoints/checkpoint-656/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..28028c056af412405debd878cdda0171e35fa5d1 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/chat_template.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/optimizer.pt b/cpt_qwen_14B/checkpoints/checkpoint-656/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f2dfe4bb77421b4ff1c90792f5897e894cd8e61 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0be0569f0343e03430d575194e27cbab6a963abf115c8e3fea96fb96442dc12 +size 102698855 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/rng_state.pth b/cpt_qwen_14B/checkpoints/checkpoint-656/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..501217433fc197a00f322325abe7357c9bdf62bf --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49c8c3398143cc28911b6969b07277f7afa283688aaa54d0565617d3f340902 +size 14645 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/scheduler.pt b/cpt_qwen_14B/checkpoints/checkpoint-656/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9839e3be268159a8813432321e23d9cb938c8653 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e600c2f24d5792fd24f99dec87ac7cb1ad2f46a5dd8615865300483f359b4a3c +size 1465 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer.json b/cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..34510ff0037cd50428af467a17ead5a96140a32c --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8 +size 11421892 diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer_config.json b/cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..217274ef8275420e4bf3b976f3948901cd3d176f --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": true, + "model_max_length": 32768, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/trainer_state.json b/cpt_qwen_14B/checkpoints/checkpoint-656/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..505f849e59477c8c88866c39e272c5fd1fd94c53 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/trainer_state.json @@ -0,0 +1,4730 @@ +{ + "best_global_step": 650, + "best_metric": 0.6601914763450623, + "best_model_checkpoint": "runs/cpt_run_14b/checkpoints/checkpoint-600", + "epoch": 2.0, + "eval_steps": 50, + "global_step": 656, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003056935422239205, + "grad_norm": 0.06516239047050476, + "learning_rate": 0.0, + "loss": 1.138384461402893, + "step": 1 + }, + { + "epoch": 0.00611387084447841, + "grad_norm": 0.05343673378229141, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.983342707157135, + "step": 2 + }, + { + "epoch": 0.009170806266717615, + "grad_norm": 0.05608418956398964, + "learning_rate": 6.060606060606061e-07, + "loss": 1.0762118101119995, + "step": 3 + }, + { + "epoch": 0.01222774168895682, + "grad_norm": 0.06523486226797104, + "learning_rate": 9.090909090909091e-07, + "loss": 1.084489345550537, + "step": 4 + }, + { + "epoch": 0.015284677111196026, + "grad_norm": 0.06582186371088028, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.2037022113800049, + "step": 5 + }, + { + "epoch": 0.01834161253343523, + "grad_norm": 0.06097998470067978, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.10005784034729, + "step": 6 + }, + { + "epoch": 0.021398547955674436, + "grad_norm": 0.10365528613328934, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.0895193815231323, + "step": 7 + }, + { + "epoch": 0.02445548337791364, + "grad_norm": 0.06312141567468643, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.0593242645263672, + "step": 8 + }, + { + "epoch": 0.027512418800152847, + "grad_norm": 0.05508403480052948, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.9772955179214478, + "step": 9 + }, + { + "epoch": 0.030569354222392053, + "grad_norm": 0.06006711348891258, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.084238886833191, + "step": 10 + }, + { + "epoch": 0.033626289644631255, + "grad_norm": 0.0588749423623085, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.0786534547805786, + "step": 11 + }, + { + "epoch": 0.03668322506687046, + "grad_norm": 0.046551357954740524, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.0370622873306274, + "step": 12 + }, + { + "epoch": 0.039740160489109666, + "grad_norm": 0.061659567058086395, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.0646986961364746, + "step": 13 + }, + { + "epoch": 0.04279709591134887, + "grad_norm": 0.06007347255945206, + "learning_rate": 3.93939393939394e-06, + "loss": 1.0311307907104492, + "step": 14 + }, + { + "epoch": 0.04585403133358808, + "grad_norm": 0.07314135134220123, + "learning_rate": 4.242424242424243e-06, + "loss": 1.1300500631332397, + "step": 15 + }, + { + "epoch": 0.04891096675582728, + "grad_norm": 0.060934022068977356, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0197452306747437, + "step": 16 + }, + { + "epoch": 0.05196790217806649, + "grad_norm": 0.056856051087379456, + "learning_rate": 4.848484848484849e-06, + "loss": 1.0438549518585205, + "step": 17 + }, + { + "epoch": 0.055024837600305694, + "grad_norm": 0.05908689647912979, + "learning_rate": 5.151515151515152e-06, + "loss": 1.0398856401443481, + "step": 18 + }, + { + "epoch": 0.0580817730225449, + "grad_norm": 0.07411840558052063, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.107885479927063, + "step": 19 + }, + { + "epoch": 0.061138708444784105, + "grad_norm": 0.0749165341258049, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.1060967445373535, + "step": 20 + }, + { + "epoch": 0.06419564386702331, + "grad_norm": 0.06720177084207535, + "learning_rate": 6.060606060606061e-06, + "loss": 1.0471720695495605, + "step": 21 + }, + { + "epoch": 0.06725257928926251, + "grad_norm": 0.05990725755691528, + "learning_rate": 6.363636363636364e-06, + "loss": 1.0944981575012207, + "step": 22 + }, + { + "epoch": 0.07030951471150172, + "grad_norm": 0.06672193855047226, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1477092504501343, + "step": 23 + }, + { + "epoch": 0.07336645013374092, + "grad_norm": 0.06145205348730087, + "learning_rate": 6.969696969696971e-06, + "loss": 1.0591784715652466, + "step": 24 + }, + { + "epoch": 0.07642338555598013, + "grad_norm": 0.0757482647895813, + "learning_rate": 7.272727272727273e-06, + "loss": 1.0500165224075317, + "step": 25 + }, + { + "epoch": 0.07948032097821933, + "grad_norm": 0.07848478108644485, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.0747522115707397, + "step": 26 + }, + { + "epoch": 0.08253725640045854, + "grad_norm": 0.07740631699562073, + "learning_rate": 7.87878787878788e-06, + "loss": 1.132310152053833, + "step": 27 + }, + { + "epoch": 0.08559419182269774, + "grad_norm": 0.07476603239774704, + "learning_rate": 8.181818181818183e-06, + "loss": 1.0339502096176147, + "step": 28 + }, + { + "epoch": 0.08865112724493696, + "grad_norm": 0.0779196098446846, + "learning_rate": 8.484848484848486e-06, + "loss": 1.1047282218933105, + "step": 29 + }, + { + "epoch": 0.09170806266717615, + "grad_norm": 0.06962384283542633, + "learning_rate": 8.787878787878788e-06, + "loss": 1.004916787147522, + "step": 30 + }, + { + "epoch": 0.09476499808941537, + "grad_norm": 0.06369175016880035, + "learning_rate": 9.090909090909091e-06, + "loss": 0.9296417832374573, + "step": 31 + }, + { + "epoch": 0.09782193351165457, + "grad_norm": 0.07470260560512543, + "learning_rate": 9.393939393939396e-06, + "loss": 1.0721708536148071, + "step": 32 + }, + { + "epoch": 0.10087886893389378, + "grad_norm": 0.07948213815689087, + "learning_rate": 9.696969696969698e-06, + "loss": 1.0350117683410645, + "step": 33 + }, + { + "epoch": 0.10393580435613298, + "grad_norm": 0.07066022604703903, + "learning_rate": 1e-05, + "loss": 1.026305913925171, + "step": 34 + }, + { + "epoch": 0.10699273977837218, + "grad_norm": 0.07774543762207031, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.0509816408157349, + "step": 35 + }, + { + "epoch": 0.11004967520061139, + "grad_norm": 0.07501248270273209, + "learning_rate": 1.0606060606060606e-05, + "loss": 1.0011574029922485, + "step": 36 + }, + { + "epoch": 0.11310661062285059, + "grad_norm": 0.6622501611709595, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.9754424691200256, + "step": 37 + }, + { + "epoch": 0.1161635460450898, + "grad_norm": 0.07566080242395401, + "learning_rate": 1.1212121212121212e-05, + "loss": 1.0342774391174316, + "step": 38 + }, + { + "epoch": 0.119220481467329, + "grad_norm": 0.07573831081390381, + "learning_rate": 1.1515151515151517e-05, + "loss": 0.9714518785476685, + "step": 39 + }, + { + "epoch": 0.12227741688956821, + "grad_norm": 0.08083852380514145, + "learning_rate": 1.181818181818182e-05, + "loss": 1.1050316095352173, + "step": 40 + }, + { + "epoch": 0.12533435231180742, + "grad_norm": 0.08540588617324829, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.0871070623397827, + "step": 41 + }, + { + "epoch": 0.12839128773404662, + "grad_norm": 0.07391592115163803, + "learning_rate": 1.2424242424242425e-05, + "loss": 1.0206722021102905, + "step": 42 + }, + { + "epoch": 0.13144822315628582, + "grad_norm": 0.07063689082860947, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.9775047898292542, + "step": 43 + }, + { + "epoch": 0.13450515857852502, + "grad_norm": 0.07288888841867447, + "learning_rate": 1.3030303030303032e-05, + "loss": 1.1132858991622925, + "step": 44 + }, + { + "epoch": 0.13756209400076425, + "grad_norm": 0.07641777396202087, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.0707701444625854, + "step": 45 + }, + { + "epoch": 0.14061902942300344, + "grad_norm": 0.06990326195955276, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.9328265190124512, + "step": 46 + }, + { + "epoch": 0.14367596484524264, + "grad_norm": 0.0834241658449173, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.0131721496582031, + "step": 47 + }, + { + "epoch": 0.14673290026748184, + "grad_norm": 0.0714937075972557, + "learning_rate": 1.4242424242424245e-05, + "loss": 0.940493106842041, + "step": 48 + }, + { + "epoch": 0.14978983568972107, + "grad_norm": 0.07770547270774841, + "learning_rate": 1.4545454545454546e-05, + "loss": 1.0435771942138672, + "step": 49 + }, + { + "epoch": 0.15284677111196027, + "grad_norm": 0.07950945198535919, + "learning_rate": 1.484848484848485e-05, + "loss": 1.0382137298583984, + "step": 50 + }, + { + "epoch": 0.15284677111196027, + "eval_loss": 1.0129202604293823, + "eval_runtime": 724.3664, + "eval_samples_per_second": 0.832, + "eval_steps_per_second": 0.832, + "step": 50 + }, + { + "epoch": 0.15590370653419947, + "grad_norm": 0.06961936503648758, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.9690049886703491, + "step": 51 + }, + { + "epoch": 0.15896064195643866, + "grad_norm": 0.069523885846138, + "learning_rate": 1.5454545454545454e-05, + "loss": 0.9830482006072998, + "step": 52 + }, + { + "epoch": 0.16201757737867786, + "grad_norm": 0.0764622762799263, + "learning_rate": 1.575757575757576e-05, + "loss": 1.0895472764968872, + "step": 53 + }, + { + "epoch": 0.1650745128009171, + "grad_norm": 0.1413721889257431, + "learning_rate": 1.606060606060606e-05, + "loss": 1.0354574918746948, + "step": 54 + }, + { + "epoch": 0.1681314482231563, + "grad_norm": 0.06818042695522308, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.8534265160560608, + "step": 55 + }, + { + "epoch": 0.1711883836453955, + "grad_norm": 0.0722246989607811, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.9580274820327759, + "step": 56 + }, + { + "epoch": 0.17424531906763469, + "grad_norm": 0.07113443315029144, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.0721848011016846, + "step": 57 + }, + { + "epoch": 0.1773022544898739, + "grad_norm": 0.08412107080221176, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.1180150508880615, + "step": 58 + }, + { + "epoch": 0.1803591899121131, + "grad_norm": 0.07381036877632141, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.0384547710418701, + "step": 59 + }, + { + "epoch": 0.1834161253343523, + "grad_norm": 0.07089001685380936, + "learning_rate": 1.787878787878788e-05, + "loss": 1.0446016788482666, + "step": 60 + }, + { + "epoch": 0.1864730607565915, + "grad_norm": 0.11576953530311584, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.0015051364898682, + "step": 61 + }, + { + "epoch": 0.18952999617883073, + "grad_norm": 0.08030868321657181, + "learning_rate": 1.8484848484848487e-05, + "loss": 0.9642710089683533, + "step": 62 + }, + { + "epoch": 0.19258693160106993, + "grad_norm": 0.08332342654466629, + "learning_rate": 1.8787878787878792e-05, + "loss": 1.0722991228103638, + "step": 63 + }, + { + "epoch": 0.19564386702330913, + "grad_norm": 0.08000365644693375, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.0104647874832153, + "step": 64 + }, + { + "epoch": 0.19870080244554833, + "grad_norm": 0.08139508217573166, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9445061087608337, + "step": 65 + }, + { + "epoch": 0.20175773786778756, + "grad_norm": 0.08749893307685852, + "learning_rate": 1.96969696969697e-05, + "loss": 1.080810308456421, + "step": 66 + }, + { + "epoch": 0.20481467329002676, + "grad_norm": 0.0786912813782692, + "learning_rate": 2e-05, + "loss": 0.9705753922462463, + "step": 67 + }, + { + "epoch": 0.20787160871226595, + "grad_norm": 0.08962028473615646, + "learning_rate": 1.9999858236410775e-05, + "loss": 0.962783694267273, + "step": 68 + }, + { + "epoch": 0.21092854413450515, + "grad_norm": 0.08402887731790543, + "learning_rate": 1.9999432949662483e-05, + "loss": 0.9959614872932434, + "step": 69 + }, + { + "epoch": 0.21398547955674435, + "grad_norm": 0.08036444336175919, + "learning_rate": 1.9998724151813157e-05, + "loss": 0.9569960832595825, + "step": 70 + }, + { + "epoch": 0.21704241497898358, + "grad_norm": 0.08247046917676926, + "learning_rate": 1.9997731862959143e-05, + "loss": 1.0012171268463135, + "step": 71 + }, + { + "epoch": 0.22009935040122278, + "grad_norm": 0.08966264873743057, + "learning_rate": 1.999645611123453e-05, + "loss": 1.0403809547424316, + "step": 72 + }, + { + "epoch": 0.22315628582346198, + "grad_norm": 0.08061660826206207, + "learning_rate": 1.999489693281034e-05, + "loss": 1.0089740753173828, + "step": 73 + }, + { + "epoch": 0.22621322124570117, + "grad_norm": 0.09005365520715714, + "learning_rate": 1.9993054371893526e-05, + "loss": 0.9333044290542603, + "step": 74 + }, + { + "epoch": 0.2292701566679404, + "grad_norm": 0.08651519566774368, + "learning_rate": 1.9990928480725694e-05, + "loss": 0.9284015893936157, + "step": 75 + }, + { + "epoch": 0.2323270920901796, + "grad_norm": 0.08141147345304489, + "learning_rate": 1.9988519319581637e-05, + "loss": 0.9782730340957642, + "step": 76 + }, + { + "epoch": 0.2353840275124188, + "grad_norm": 0.08344405144453049, + "learning_rate": 1.998582695676762e-05, + "loss": 0.9723064303398132, + "step": 77 + }, + { + "epoch": 0.238440962934658, + "grad_norm": 0.08019903302192688, + "learning_rate": 1.998285146861945e-05, + "loss": 0.9648997783660889, + "step": 78 + }, + { + "epoch": 0.24149789835689722, + "grad_norm": 0.08113416284322739, + "learning_rate": 1.99795929395003e-05, + "loss": 0.9263214468955994, + "step": 79 + }, + { + "epoch": 0.24455483377913642, + "grad_norm": 0.08127513527870178, + "learning_rate": 1.997605146179833e-05, + "loss": 0.8745232224464417, + "step": 80 + }, + { + "epoch": 0.24761176920137562, + "grad_norm": 0.09934187680482864, + "learning_rate": 1.997222713592405e-05, + "loss": 0.8722782135009766, + "step": 81 + }, + { + "epoch": 0.25066870462361485, + "grad_norm": 0.09701363742351532, + "learning_rate": 1.9968120070307503e-05, + "loss": 1.0084266662597656, + "step": 82 + }, + { + "epoch": 0.253725640045854, + "grad_norm": 0.08335654437541962, + "learning_rate": 1.9963730381395154e-05, + "loss": 0.9239332675933838, + "step": 83 + }, + { + "epoch": 0.25678257546809324, + "grad_norm": 0.09161650389432907, + "learning_rate": 1.9959058193646618e-05, + "loss": 0.9878032207489014, + "step": 84 + }, + { + "epoch": 0.2598395108903324, + "grad_norm": 0.08067663013935089, + "learning_rate": 1.9954103639531116e-05, + "loss": 0.9113098382949829, + "step": 85 + }, + { + "epoch": 0.26289644631257164, + "grad_norm": 0.09619539976119995, + "learning_rate": 1.9948866859523717e-05, + "loss": 0.9527600407600403, + "step": 86 + }, + { + "epoch": 0.26595338173481087, + "grad_norm": 0.10015493631362915, + "learning_rate": 1.9943348002101374e-05, + "loss": 0.9569152593612671, + "step": 87 + }, + { + "epoch": 0.26901031715705004, + "grad_norm": 0.09012345969676971, + "learning_rate": 1.993754722373869e-05, + "loss": 0.8912045359611511, + "step": 88 + }, + { + "epoch": 0.27206725257928926, + "grad_norm": 0.10342805832624435, + "learning_rate": 1.9931464688903502e-05, + "loss": 0.856104850769043, + "step": 89 + }, + { + "epoch": 0.2751241880015285, + "grad_norm": 0.10218493640422821, + "learning_rate": 1.9925100570052194e-05, + "loss": 0.9631397128105164, + "step": 90 + }, + { + "epoch": 0.27818112342376766, + "grad_norm": 0.10909046977758408, + "learning_rate": 1.9918455047624847e-05, + "loss": 0.8532565236091614, + "step": 91 + }, + { + "epoch": 0.2812380588460069, + "grad_norm": 0.10714197903871536, + "learning_rate": 1.9911528310040073e-05, + "loss": 0.9691859483718872, + "step": 92 + }, + { + "epoch": 0.28429499426824606, + "grad_norm": 0.1108694076538086, + "learning_rate": 1.990432055368971e-05, + "loss": 0.9374334812164307, + "step": 93 + }, + { + "epoch": 0.2873519296904853, + "grad_norm": 0.10037308186292648, + "learning_rate": 1.989683198293324e-05, + "loss": 0.9166896343231201, + "step": 94 + }, + { + "epoch": 0.2904088651127245, + "grad_norm": 0.10246684402227402, + "learning_rate": 1.9889062810092002e-05, + "loss": 1.0059239864349365, + "step": 95 + }, + { + "epoch": 0.2934658005349637, + "grad_norm": 0.09954962879419327, + "learning_rate": 1.9881013255443152e-05, + "loss": 1.00413179397583, + "step": 96 + }, + { + "epoch": 0.2965227359572029, + "grad_norm": 0.11006761342287064, + "learning_rate": 1.9872683547213446e-05, + "loss": 0.9414035677909851, + "step": 97 + }, + { + "epoch": 0.29957967137944214, + "grad_norm": 0.1014382541179657, + "learning_rate": 1.9864073921572756e-05, + "loss": 0.9155468940734863, + "step": 98 + }, + { + "epoch": 0.3026366068016813, + "grad_norm": 0.09883157908916473, + "learning_rate": 1.9855184622627362e-05, + "loss": 0.9429305195808411, + "step": 99 + }, + { + "epoch": 0.30569354222392053, + "grad_norm": 0.11199072748422623, + "learning_rate": 1.9846015902413053e-05, + "loss": 0.9143528342247009, + "step": 100 + }, + { + "epoch": 0.30569354222392053, + "eval_loss": 0.884428083896637, + "eval_runtime": 723.8143, + "eval_samples_per_second": 0.833, + "eval_steps_per_second": 0.833, + "step": 100 + }, + { + "epoch": 0.3087504776461597, + "grad_norm": 0.10796016454696655, + "learning_rate": 1.9836568020887963e-05, + "loss": 0.9726455211639404, + "step": 101 + }, + { + "epoch": 0.31180741306839893, + "grad_norm": 0.10056383162736893, + "learning_rate": 1.982684124592521e-05, + "loss": 0.8932135701179504, + "step": 102 + }, + { + "epoch": 0.31486434849063816, + "grad_norm": 0.10836594551801682, + "learning_rate": 1.9816835853305306e-05, + "loss": 0.919749915599823, + "step": 103 + }, + { + "epoch": 0.31792128391287733, + "grad_norm": 0.12032149732112885, + "learning_rate": 1.9806552126708322e-05, + "loss": 0.871781587600708, + "step": 104 + }, + { + "epoch": 0.32097821933511655, + "grad_norm": 0.10854160040616989, + "learning_rate": 1.9795990357705853e-05, + "loss": 0.8587784171104431, + "step": 105 + }, + { + "epoch": 0.3240351547573557, + "grad_norm": 0.10819399356842041, + "learning_rate": 1.978515084575276e-05, + "loss": 0.8524806499481201, + "step": 106 + }, + { + "epoch": 0.32709209017959495, + "grad_norm": 0.10226067155599594, + "learning_rate": 1.9774033898178668e-05, + "loss": 0.7892144918441772, + "step": 107 + }, + { + "epoch": 0.3301490256018342, + "grad_norm": 0.1071159616112709, + "learning_rate": 1.976263983017925e-05, + "loss": 0.8833234906196594, + "step": 108 + }, + { + "epoch": 0.33320596102407335, + "grad_norm": 0.11434526741504669, + "learning_rate": 1.9750968964807305e-05, + "loss": 0.861842155456543, + "step": 109 + }, + { + "epoch": 0.3362628964463126, + "grad_norm": 0.1159641221165657, + "learning_rate": 1.9739021632963584e-05, + "loss": 0.8987889289855957, + "step": 110 + }, + { + "epoch": 0.3393198318685518, + "grad_norm": 0.12371373921632767, + "learning_rate": 1.9726798173387417e-05, + "loss": 0.9710193872451782, + "step": 111 + }, + { + "epoch": 0.342376767290791, + "grad_norm": 0.11441531032323837, + "learning_rate": 1.97142989326471e-05, + "loss": 0.8199151158332825, + "step": 112 + }, + { + "epoch": 0.3454337027130302, + "grad_norm": 0.11842846125364304, + "learning_rate": 1.9701524265130088e-05, + "loss": 0.8845276236534119, + "step": 113 + }, + { + "epoch": 0.34849063813526937, + "grad_norm": 0.10813732445240021, + "learning_rate": 1.9688474533032916e-05, + "loss": 0.7964264750480652, + "step": 114 + }, + { + "epoch": 0.3515475735575086, + "grad_norm": 0.11050347238779068, + "learning_rate": 1.9675150106350957e-05, + "loss": 0.9630422592163086, + "step": 115 + }, + { + "epoch": 0.3546045089797478, + "grad_norm": 0.10537250339984894, + "learning_rate": 1.9661551362867926e-05, + "loss": 0.7706905007362366, + "step": 116 + }, + { + "epoch": 0.357661444401987, + "grad_norm": 0.11390368640422821, + "learning_rate": 1.9647678688145163e-05, + "loss": 0.8541204929351807, + "step": 117 + }, + { + "epoch": 0.3607183798242262, + "grad_norm": 0.10318922251462936, + "learning_rate": 1.963353247551069e-05, + "loss": 0.7400562763214111, + "step": 118 + }, + { + "epoch": 0.3637753152464654, + "grad_norm": 0.1347586214542389, + "learning_rate": 1.9619113126048086e-05, + "loss": 0.9232871532440186, + "step": 119 + }, + { + "epoch": 0.3668322506687046, + "grad_norm": 0.11458177119493484, + "learning_rate": 1.96044210485851e-05, + "loss": 0.833285927772522, + "step": 120 + }, + { + "epoch": 0.36988918609094384, + "grad_norm": 0.12361041456460953, + "learning_rate": 1.958945665968206e-05, + "loss": 0.7887391448020935, + "step": 121 + }, + { + "epoch": 0.372946121513183, + "grad_norm": 0.11985408514738083, + "learning_rate": 1.9574220383620054e-05, + "loss": 0.8206446170806885, + "step": 122 + }, + { + "epoch": 0.37600305693542224, + "grad_norm": 0.1355939507484436, + "learning_rate": 1.9558712652388932e-05, + "loss": 0.7648542523384094, + "step": 123 + }, + { + "epoch": 0.37905999235766147, + "grad_norm": 0.1229313388466835, + "learning_rate": 1.954293390567501e-05, + "loss": 0.8573335409164429, + "step": 124 + }, + { + "epoch": 0.38211692777990064, + "grad_norm": 0.11425124108791351, + "learning_rate": 1.9526884590848646e-05, + "loss": 0.7412531971931458, + "step": 125 + }, + { + "epoch": 0.38517386320213987, + "grad_norm": 0.12430041283369064, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.8098543882369995, + "step": 126 + }, + { + "epoch": 0.38823079862437904, + "grad_norm": 0.12492368370294571, + "learning_rate": 1.9493976084683814e-05, + "loss": 0.8814713954925537, + "step": 127 + }, + { + "epoch": 0.39128773404661826, + "grad_norm": 0.14428824186325073, + "learning_rate": 1.9477117826390934e-05, + "loss": 0.8231979608535767, + "step": 128 + }, + { + "epoch": 0.3943446694688575, + "grad_norm": 0.12010085582733154, + "learning_rate": 1.9459990866050337e-05, + "loss": 0.7015627026557922, + "step": 129 + }, + { + "epoch": 0.39740160489109666, + "grad_norm": 0.11819776892662048, + "learning_rate": 1.9442595689257898e-05, + "loss": 0.8086729645729065, + "step": 130 + }, + { + "epoch": 0.4004585403133359, + "grad_norm": 0.12211033701896667, + "learning_rate": 1.9424932789214158e-05, + "loss": 0.8234002590179443, + "step": 131 + }, + { + "epoch": 0.4035154757355751, + "grad_norm": 0.14926476776599884, + "learning_rate": 1.9407002666710334e-05, + "loss": 0.874608039855957, + "step": 132 + }, + { + "epoch": 0.4065724111578143, + "grad_norm": 0.13012923300266266, + "learning_rate": 1.9388805830114132e-05, + "loss": 0.8491607904434204, + "step": 133 + }, + { + "epoch": 0.4096293465800535, + "grad_norm": 0.12012261897325516, + "learning_rate": 1.937034279535533e-05, + "loss": 0.7269159555435181, + "step": 134 + }, + { + "epoch": 0.4126862820022927, + "grad_norm": 0.15302567183971405, + "learning_rate": 1.9351614085911134e-05, + "loss": 0.8560839891433716, + "step": 135 + }, + { + "epoch": 0.4157432174245319, + "grad_norm": 0.12234190106391907, + "learning_rate": 1.933262023279137e-05, + "loss": 0.8211904764175415, + "step": 136 + }, + { + "epoch": 0.41880015284677113, + "grad_norm": 0.14427296817302704, + "learning_rate": 1.9313361774523387e-05, + "loss": 0.8500057458877563, + "step": 137 + }, + { + "epoch": 0.4218570882690103, + "grad_norm": 0.1314094066619873, + "learning_rate": 1.929383925713682e-05, + "loss": 0.7589091658592224, + "step": 138 + }, + { + "epoch": 0.42491402369124953, + "grad_norm": 0.1576734483242035, + "learning_rate": 1.92740532341481e-05, + "loss": 0.7581073641777039, + "step": 139 + }, + { + "epoch": 0.4279709591134887, + "grad_norm": 0.15788713097572327, + "learning_rate": 1.925400426654475e-05, + "loss": 0.809050440788269, + "step": 140 + }, + { + "epoch": 0.43102789453572793, + "grad_norm": 0.13364559412002563, + "learning_rate": 1.9233692922769497e-05, + "loss": 0.7990086078643799, + "step": 141 + }, + { + "epoch": 0.43408482995796716, + "grad_norm": 0.14786465466022491, + "learning_rate": 1.921311977870413e-05, + "loss": 0.8675815463066101, + "step": 142 + }, + { + "epoch": 0.4371417653802063, + "grad_norm": 0.14621882140636444, + "learning_rate": 1.9192285417653208e-05, + "loss": 0.8713765740394592, + "step": 143 + }, + { + "epoch": 0.44019870080244555, + "grad_norm": 0.12874048948287964, + "learning_rate": 1.917119043032749e-05, + "loss": 0.7361871004104614, + "step": 144 + }, + { + "epoch": 0.4432556362246848, + "grad_norm": 0.12183775007724762, + "learning_rate": 1.9149835414827193e-05, + "loss": 0.7311941385269165, + "step": 145 + }, + { + "epoch": 0.44631257164692395, + "grad_norm": 0.1397160291671753, + "learning_rate": 1.912822097662505e-05, + "loss": 0.8189159035682678, + "step": 146 + }, + { + "epoch": 0.4493695070691632, + "grad_norm": 0.1458273082971573, + "learning_rate": 1.9106347728549134e-05, + "loss": 0.8288135528564453, + "step": 147 + }, + { + "epoch": 0.45242644249140235, + "grad_norm": 0.16898781061172485, + "learning_rate": 1.908421629076547e-05, + "loss": 0.7878037095069885, + "step": 148 + }, + { + "epoch": 0.4554833779136416, + "grad_norm": 0.1638474315404892, + "learning_rate": 1.9061827290760466e-05, + "loss": 0.8059952259063721, + "step": 149 + }, + { + "epoch": 0.4585403133358808, + "grad_norm": 0.14130882918834686, + "learning_rate": 1.9039181363323128e-05, + "loss": 0.7346830368041992, + "step": 150 + }, + { + "epoch": 0.4585403133358808, + "eval_loss": 0.7979016900062561, + "eval_runtime": 828.6295, + "eval_samples_per_second": 0.728, + "eval_steps_per_second": 0.728, + "step": 150 + }, + { + "epoch": 0.46159724875811997, + "grad_norm": 0.14427433907985687, + "learning_rate": 1.9016279150527044e-05, + "loss": 0.7583403587341309, + "step": 151 + }, + { + "epoch": 0.4646541841803592, + "grad_norm": 0.1515798568725586, + "learning_rate": 1.8993121301712194e-05, + "loss": 0.7908380031585693, + "step": 152 + }, + { + "epoch": 0.46771111960259837, + "grad_norm": 0.14444488286972046, + "learning_rate": 1.896970847346653e-05, + "loss": 0.7916130423545837, + "step": 153 + }, + { + "epoch": 0.4707680550248376, + "grad_norm": 0.1460912823677063, + "learning_rate": 1.8946041329607364e-05, + "loss": 0.7750643491744995, + "step": 154 + }, + { + "epoch": 0.4738249904470768, + "grad_norm": 0.13896244764328003, + "learning_rate": 1.892212054116255e-05, + "loss": 0.8059666156768799, + "step": 155 + }, + { + "epoch": 0.476881925869316, + "grad_norm": 0.16133630275726318, + "learning_rate": 1.889794678635145e-05, + "loss": 0.8327827453613281, + "step": 156 + }, + { + "epoch": 0.4799388612915552, + "grad_norm": 0.1474636346101761, + "learning_rate": 1.8873520750565716e-05, + "loss": 0.8498989343643188, + "step": 157 + }, + { + "epoch": 0.48299579671379445, + "grad_norm": 0.17222349345684052, + "learning_rate": 1.884884312634985e-05, + "loss": 0.7750177979469299, + "step": 158 + }, + { + "epoch": 0.4860527321360336, + "grad_norm": 0.15558090806007385, + "learning_rate": 1.8823914613381568e-05, + "loss": 0.7326169013977051, + "step": 159 + }, + { + "epoch": 0.48910966755827284, + "grad_norm": 0.13808321952819824, + "learning_rate": 1.8798735918451963e-05, + "loss": 0.8308709859848022, + "step": 160 + }, + { + "epoch": 0.492166602980512, + "grad_norm": 0.1761898398399353, + "learning_rate": 1.8773307755445468e-05, + "loss": 0.7805465459823608, + "step": 161 + }, + { + "epoch": 0.49522353840275124, + "grad_norm": 0.160477414727211, + "learning_rate": 1.874763084531961e-05, + "loss": 0.8538846969604492, + "step": 162 + }, + { + "epoch": 0.49828047382499047, + "grad_norm": 0.15238745510578156, + "learning_rate": 1.872170591608459e-05, + "loss": 0.8801217675209045, + "step": 163 + }, + { + "epoch": 0.5013374092472297, + "grad_norm": 0.1567080318927765, + "learning_rate": 1.86955337027826e-05, + "loss": 0.7205259799957275, + "step": 164 + }, + { + "epoch": 0.5043943446694689, + "grad_norm": 0.13637851178646088, + "learning_rate": 1.866911494746702e-05, + "loss": 0.7636491656303406, + "step": 165 + }, + { + "epoch": 0.507451280091708, + "grad_norm": 0.15563489496707916, + "learning_rate": 1.8642450399181373e-05, + "loss": 0.7982497811317444, + "step": 166 + }, + { + "epoch": 0.5105082155139473, + "grad_norm": 0.15503396093845367, + "learning_rate": 1.8615540813938063e-05, + "loss": 0.8737778067588806, + "step": 167 + }, + { + "epoch": 0.5135651509361865, + "grad_norm": 0.16095557808876038, + "learning_rate": 1.8588386954696972e-05, + "loss": 0.796604335308075, + "step": 168 + }, + { + "epoch": 0.5166220863584257, + "grad_norm": 0.1713593453168869, + "learning_rate": 1.856098959134381e-05, + "loss": 0.8247392177581787, + "step": 169 + }, + { + "epoch": 0.5196790217806648, + "grad_norm": 0.18239113688468933, + "learning_rate": 1.8533349500668295e-05, + "loss": 0.7838484644889832, + "step": 170 + }, + { + "epoch": 0.5227359572029041, + "grad_norm": 0.15745767951011658, + "learning_rate": 1.850546746634211e-05, + "loss": 0.7856907248497009, + "step": 171 + }, + { + "epoch": 0.5257928926251433, + "grad_norm": 0.16820666193962097, + "learning_rate": 1.8477344278896708e-05, + "loss": 0.7829679846763611, + "step": 172 + }, + { + "epoch": 0.5288498280473825, + "grad_norm": 0.16975544393062592, + "learning_rate": 1.84489807357009e-05, + "loss": 0.7374375462532043, + "step": 173 + }, + { + "epoch": 0.5319067634696217, + "grad_norm": 0.167228102684021, + "learning_rate": 1.8420377640938204e-05, + "loss": 0.712837815284729, + "step": 174 + }, + { + "epoch": 0.5349636988918609, + "grad_norm": 0.15955154597759247, + "learning_rate": 1.839153580558411e-05, + "loss": 0.7645693421363831, + "step": 175 + }, + { + "epoch": 0.5380206343141001, + "grad_norm": 0.18378689885139465, + "learning_rate": 1.8362456047383032e-05, + "loss": 0.7974956631660461, + "step": 176 + }, + { + "epoch": 0.5410775697363394, + "grad_norm": 0.15777672827243805, + "learning_rate": 1.833313919082515e-05, + "loss": 0.8957571983337402, + "step": 177 + }, + { + "epoch": 0.5441345051585785, + "grad_norm": 0.15292386710643768, + "learning_rate": 1.8303586067123028e-05, + "loss": 0.7635619044303894, + "step": 178 + }, + { + "epoch": 0.5471914405808177, + "grad_norm": 0.178152397274971, + "learning_rate": 1.8273797514188043e-05, + "loss": 0.7849246263504028, + "step": 179 + }, + { + "epoch": 0.550248376003057, + "grad_norm": 0.15916013717651367, + "learning_rate": 1.824377437660663e-05, + "loss": 0.6975343227386475, + "step": 180 + }, + { + "epoch": 0.5533053114252962, + "grad_norm": 0.18172231316566467, + "learning_rate": 1.821351750561634e-05, + "loss": 0.7675164341926575, + "step": 181 + }, + { + "epoch": 0.5563622468475353, + "grad_norm": 0.16241903603076935, + "learning_rate": 1.818302775908169e-05, + "loss": 0.7950343489646912, + "step": 182 + }, + { + "epoch": 0.5594191822697746, + "grad_norm": 0.18727579712867737, + "learning_rate": 1.8152306001469875e-05, + "loss": 0.787315309047699, + "step": 183 + }, + { + "epoch": 0.5624761176920138, + "grad_norm": 0.1627933531999588, + "learning_rate": 1.8121353103826213e-05, + "loss": 0.7141211628913879, + "step": 184 + }, + { + "epoch": 0.565533053114253, + "grad_norm": 0.4369247555732727, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.8476608395576477, + "step": 185 + }, + { + "epoch": 0.5685899885364921, + "grad_norm": 0.16494786739349365, + "learning_rate": 1.8058757405367003e-05, + "loss": 0.720562756061554, + "step": 186 + }, + { + "epoch": 0.5716469239587314, + "grad_norm": 0.175015389919281, + "learning_rate": 1.8027116379309637e-05, + "loss": 0.7589252591133118, + "step": 187 + }, + { + "epoch": 0.5747038593809706, + "grad_norm": 0.1769978553056717, + "learning_rate": 1.799524776268646e-05, + "loss": 0.7644155621528625, + "step": 188 + }, + { + "epoch": 0.5777607948032097, + "grad_norm": 0.18481792509555817, + "learning_rate": 1.796315245905936e-05, + "loss": 0.7885835766792297, + "step": 189 + }, + { + "epoch": 0.580817730225449, + "grad_norm": 0.1668689250946045, + "learning_rate": 1.7930831378417437e-05, + "loss": 0.7377231121063232, + "step": 190 + }, + { + "epoch": 0.5838746656476882, + "grad_norm": 0.178734689950943, + "learning_rate": 1.7898285437151163e-05, + "loss": 0.7388894557952881, + "step": 191 + }, + { + "epoch": 0.5869316010699274, + "grad_norm": 0.1740068644285202, + "learning_rate": 1.786551555802643e-05, + "loss": 0.8209859728813171, + "step": 192 + }, + { + "epoch": 0.5899885364921666, + "grad_norm": 0.19211041927337646, + "learning_rate": 1.783252267015837e-05, + "loss": 0.7305737733840942, + "step": 193 + }, + { + "epoch": 0.5930454719144058, + "grad_norm": 0.16644936800003052, + "learning_rate": 1.779930770898503e-05, + "loss": 0.7760804891586304, + "step": 194 + }, + { + "epoch": 0.596102407336645, + "grad_norm": 0.1773686707019806, + "learning_rate": 1.776587161624083e-05, + "loss": 0.7879236936569214, + "step": 195 + }, + { + "epoch": 0.5991593427588843, + "grad_norm": 0.17508819699287415, + "learning_rate": 1.7732215339929874e-05, + "loss": 0.7307407259941101, + "step": 196 + }, + { + "epoch": 0.6022162781811234, + "grad_norm": 0.17211101949214935, + "learning_rate": 1.7698339834299064e-05, + "loss": 0.7293214797973633, + "step": 197 + }, + { + "epoch": 0.6052732136033626, + "grad_norm": 0.18085215985774994, + "learning_rate": 1.7664246059811058e-05, + "loss": 0.763083279132843, + "step": 198 + }, + { + "epoch": 0.6083301490256018, + "grad_norm": 0.20243075489997864, + "learning_rate": 1.7629934983117025e-05, + "loss": 0.7372676134109497, + "step": 199 + }, + { + "epoch": 0.6113870844478411, + "grad_norm": 0.18152795732021332, + "learning_rate": 1.759540757702924e-05, + "loss": 0.7121898531913757, + "step": 200 + }, + { + "epoch": 0.6113870844478411, + "eval_loss": 0.7551760673522949, + "eval_runtime": 900.209, + "eval_samples_per_second": 0.67, + "eval_steps_per_second": 0.67, + "step": 200 + }, + { + "epoch": 0.6144440198700802, + "grad_norm": 0.18808062374591827, + "learning_rate": 1.7560664820493502e-05, + "loss": 0.734307050704956, + "step": 201 + }, + { + "epoch": 0.6175009552923194, + "grad_norm": 0.18151243031024933, + "learning_rate": 1.7525707698561383e-05, + "loss": 0.7998429536819458, + "step": 202 + }, + { + "epoch": 0.6205578907145587, + "grad_norm": 0.19583043456077576, + "learning_rate": 1.7490537202362313e-05, + "loss": 0.7546265721321106, + "step": 203 + }, + { + "epoch": 0.6236148261367979, + "grad_norm": 0.2508557140827179, + "learning_rate": 1.7455154329075427e-05, + "loss": 0.7810050249099731, + "step": 204 + }, + { + "epoch": 0.626671761559037, + "grad_norm": 0.1685105562210083, + "learning_rate": 1.741956008190136e-05, + "loss": 0.7558917999267578, + "step": 205 + }, + { + "epoch": 0.6297286969812763, + "grad_norm": 0.18195222318172455, + "learning_rate": 1.7383755470033756e-05, + "loss": 0.7216942310333252, + "step": 206 + }, + { + "epoch": 0.6327856324035155, + "grad_norm": 0.1878063678741455, + "learning_rate": 1.7347741508630673e-05, + "loss": 0.7417092323303223, + "step": 207 + }, + { + "epoch": 0.6358425678257547, + "grad_norm": 0.25273698568344116, + "learning_rate": 1.73115192187858e-05, + "loss": 0.807498037815094, + "step": 208 + }, + { + "epoch": 0.6388995032479939, + "grad_norm": 0.2451465129852295, + "learning_rate": 1.7275089627499493e-05, + "loss": 0.7557163238525391, + "step": 209 + }, + { + "epoch": 0.6419564386702331, + "grad_norm": 0.19272617995738983, + "learning_rate": 1.7238453767649683e-05, + "loss": 0.8285109996795654, + "step": 210 + }, + { + "epoch": 0.6450133740924723, + "grad_norm": 0.1869518756866455, + "learning_rate": 1.720161267796256e-05, + "loss": 0.7824444770812988, + "step": 211 + }, + { + "epoch": 0.6480703095147115, + "grad_norm": 0.2029627561569214, + "learning_rate": 1.7164567402983153e-05, + "loss": 0.7018642425537109, + "step": 212 + }, + { + "epoch": 0.6511272449369507, + "grad_norm": 0.23215501010417938, + "learning_rate": 1.7127318993045686e-05, + "loss": 0.7263948917388916, + "step": 213 + }, + { + "epoch": 0.6541841803591899, + "grad_norm": 0.19869184494018555, + "learning_rate": 1.7089868504243816e-05, + "loss": 0.8285576105117798, + "step": 214 + }, + { + "epoch": 0.6572411157814291, + "grad_norm": 0.22871531546115875, + "learning_rate": 1.705221699840069e-05, + "loss": 0.7871490716934204, + "step": 215 + }, + { + "epoch": 0.6602980512036684, + "grad_norm": 0.17945580184459686, + "learning_rate": 1.701436554303882e-05, + "loss": 0.740180492401123, + "step": 216 + }, + { + "epoch": 0.6633549866259075, + "grad_norm": 0.20516762137413025, + "learning_rate": 1.6976315211349848e-05, + "loss": 0.7542892098426819, + "step": 217 + }, + { + "epoch": 0.6664119220481467, + "grad_norm": 0.22108283638954163, + "learning_rate": 1.6938067082164093e-05, + "loss": 0.8117404580116272, + "step": 218 + }, + { + "epoch": 0.669468857470386, + "grad_norm": 0.22329698503017426, + "learning_rate": 1.6899622239919965e-05, + "loss": 0.8002716898918152, + "step": 219 + }, + { + "epoch": 0.6725257928926252, + "grad_norm": 0.23545362055301666, + "learning_rate": 1.6860981774633228e-05, + "loss": 0.7750573754310608, + "step": 220 + }, + { + "epoch": 0.6755827283148643, + "grad_norm": 0.21816480159759521, + "learning_rate": 1.6822146781866097e-05, + "loss": 0.8051223754882812, + "step": 221 + }, + { + "epoch": 0.6786396637371036, + "grad_norm": 0.18638508021831512, + "learning_rate": 1.6783118362696162e-05, + "loss": 0.7286484241485596, + "step": 222 + }, + { + "epoch": 0.6816965991593428, + "grad_norm": 0.16794732213020325, + "learning_rate": 1.6743897623685178e-05, + "loss": 0.7001460194587708, + "step": 223 + }, + { + "epoch": 0.684753534581582, + "grad_norm": 0.21157318353652954, + "learning_rate": 1.6704485676847695e-05, + "loss": 0.7479901313781738, + "step": 224 + }, + { + "epoch": 0.6878104700038211, + "grad_norm": 0.35601308941841125, + "learning_rate": 1.666488363961952e-05, + "loss": 0.7660019397735596, + "step": 225 + }, + { + "epoch": 0.6908674054260604, + "grad_norm": 0.17416611313819885, + "learning_rate": 1.662509263482604e-05, + "loss": 0.7157142162322998, + "step": 226 + }, + { + "epoch": 0.6939243408482996, + "grad_norm": 0.19655123353004456, + "learning_rate": 1.658511379065039e-05, + "loss": 0.7894638776779175, + "step": 227 + }, + { + "epoch": 0.6969812762705387, + "grad_norm": 0.2034345269203186, + "learning_rate": 1.6544948240601453e-05, + "loss": 0.6853711009025574, + "step": 228 + }, + { + "epoch": 0.700038211692778, + "grad_norm": 0.199235200881958, + "learning_rate": 1.6504597123481737e-05, + "loss": 0.7487372756004333, + "step": 229 + }, + { + "epoch": 0.7030951471150172, + "grad_norm": 0.20407404005527496, + "learning_rate": 1.6464061583355088e-05, + "loss": 0.7335573434829712, + "step": 230 + }, + { + "epoch": 0.7061520825372564, + "grad_norm": 0.22096174955368042, + "learning_rate": 1.6423342769514227e-05, + "loss": 0.7659798264503479, + "step": 231 + }, + { + "epoch": 0.7092090179594956, + "grad_norm": 0.1916825920343399, + "learning_rate": 1.6382441836448203e-05, + "loss": 0.7162011861801147, + "step": 232 + }, + { + "epoch": 0.7122659533817348, + "grad_norm": 0.20505093038082123, + "learning_rate": 1.6341359943809626e-05, + "loss": 0.6957600116729736, + "step": 233 + }, + { + "epoch": 0.715322888803974, + "grad_norm": 0.19968082010746002, + "learning_rate": 1.6300098256381807e-05, + "loss": 0.6724053025245667, + "step": 234 + }, + { + "epoch": 0.7183798242262133, + "grad_norm": 0.19768832623958588, + "learning_rate": 1.625865794404573e-05, + "loss": 0.774741530418396, + "step": 235 + }, + { + "epoch": 0.7214367596484524, + "grad_norm": 0.19257694482803345, + "learning_rate": 1.621704018174688e-05, + "loss": 0.6658651828765869, + "step": 236 + }, + { + "epoch": 0.7244936950706916, + "grad_norm": 0.21594858169555664, + "learning_rate": 1.617524614946192e-05, + "loss": 0.810744047164917, + "step": 237 + }, + { + "epoch": 0.7275506304929308, + "grad_norm": 0.2107633650302887, + "learning_rate": 1.6133277032165264e-05, + "loss": 0.7623897194862366, + "step": 238 + }, + { + "epoch": 0.7306075659151701, + "grad_norm": 0.20114055275917053, + "learning_rate": 1.6091134019795447e-05, + "loss": 0.7082816362380981, + "step": 239 + }, + { + "epoch": 0.7336645013374092, + "grad_norm": 0.2542732059955597, + "learning_rate": 1.604881830722141e-05, + "loss": 0.7051193714141846, + "step": 240 + }, + { + "epoch": 0.7367214367596484, + "grad_norm": 0.19180485606193542, + "learning_rate": 1.600633109420861e-05, + "loss": 0.7895385026931763, + "step": 241 + }, + { + "epoch": 0.7397783721818877, + "grad_norm": 0.368756502866745, + "learning_rate": 1.5963673585385016e-05, + "loss": 0.7146293520927429, + "step": 242 + }, + { + "epoch": 0.7428353076041269, + "grad_norm": 0.18490125238895416, + "learning_rate": 1.5920846990206934e-05, + "loss": 0.650428056716919, + "step": 243 + }, + { + "epoch": 0.745892243026366, + "grad_norm": 0.23592503368854523, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.6367110013961792, + "step": 244 + }, + { + "epoch": 0.7489491784486053, + "grad_norm": 0.20223264396190643, + "learning_rate": 1.5834691402548415e-05, + "loss": 0.6563615798950195, + "step": 245 + }, + { + "epoch": 0.7520061138708445, + "grad_norm": 0.27459946274757385, + "learning_rate": 1.5791364852813047e-05, + "loss": 0.7361881136894226, + "step": 246 + }, + { + "epoch": 0.7550630492930837, + "grad_norm": 0.21085411310195923, + "learning_rate": 1.5747874102144073e-05, + "loss": 0.7373813390731812, + "step": 247 + }, + { + "epoch": 0.7581199847153229, + "grad_norm": 0.23332320153713226, + "learning_rate": 1.5704220383622464e-05, + "loss": 0.6971457004547119, + "step": 248 + }, + { + "epoch": 0.7611769201375621, + "grad_norm": 0.23525936901569366, + "learning_rate": 1.5660404934949798e-05, + "loss": 0.6756627559661865, + "step": 249 + }, + { + "epoch": 0.7642338555598013, + "grad_norm": 0.2150791585445404, + "learning_rate": 1.5616428998413122e-05, + "loss": 0.7029792666435242, + "step": 250 + }, + { + "epoch": 0.7642338555598013, + "eval_loss": 0.7269901633262634, + "eval_runtime": 877.665, + "eval_samples_per_second": 0.687, + "eval_steps_per_second": 0.687, + "step": 250 + }, + { + "epoch": 0.7672907909820404, + "grad_norm": 0.19510552287101746, + "learning_rate": 1.5572293820849754e-05, + "loss": 0.715162992477417, + "step": 251 + }, + { + "epoch": 0.7703477264042797, + "grad_norm": 0.25246763229370117, + "learning_rate": 1.5528000653611935e-05, + "loss": 0.634660542011261, + "step": 252 + }, + { + "epoch": 0.7734046618265189, + "grad_norm": 0.2980027496814728, + "learning_rate": 1.5483550752531337e-05, + "loss": 0.7154463529586792, + "step": 253 + }, + { + "epoch": 0.7764615972487581, + "grad_norm": 0.2730556130409241, + "learning_rate": 1.5438945377883463e-05, + "loss": 0.8110946416854858, + "step": 254 + }, + { + "epoch": 0.7795185326709974, + "grad_norm": 0.17258886992931366, + "learning_rate": 1.5394185794351914e-05, + "loss": 0.72202467918396, + "step": 255 + }, + { + "epoch": 0.7825754680932365, + "grad_norm": 0.19966280460357666, + "learning_rate": 1.5349273270992537e-05, + "loss": 0.7368704080581665, + "step": 256 + }, + { + "epoch": 0.7856324035154757, + "grad_norm": 0.23305682837963104, + "learning_rate": 1.5304209081197425e-05, + "loss": 0.7429723143577576, + "step": 257 + }, + { + "epoch": 0.788689338937715, + "grad_norm": 0.21786810457706451, + "learning_rate": 1.5258994502658846e-05, + "loss": 0.6498424410820007, + "step": 258 + }, + { + "epoch": 0.7917462743599541, + "grad_norm": 0.2370925396680832, + "learning_rate": 1.5213630817332985e-05, + "loss": 0.7379459142684937, + "step": 259 + }, + { + "epoch": 0.7948032097821933, + "grad_norm": 0.25566384196281433, + "learning_rate": 1.5168119311403611e-05, + "loss": 0.6742876172065735, + "step": 260 + }, + { + "epoch": 0.7978601452044326, + "grad_norm": 0.2171633243560791, + "learning_rate": 1.512246127524561e-05, + "loss": 0.72329181432724, + "step": 261 + }, + { + "epoch": 0.8009170806266718, + "grad_norm": 0.23292019963264465, + "learning_rate": 1.50766580033884e-05, + "loss": 0.765812873840332, + "step": 262 + }, + { + "epoch": 0.8039740160489109, + "grad_norm": 0.19427980482578278, + "learning_rate": 1.5030710794479226e-05, + "loss": 0.7872639298439026, + "step": 263 + }, + { + "epoch": 0.8070309514711502, + "grad_norm": 0.2460346817970276, + "learning_rate": 1.4984620951246333e-05, + "loss": 0.6940722465515137, + "step": 264 + }, + { + "epoch": 0.8100878868933894, + "grad_norm": 0.2493411898612976, + "learning_rate": 1.4938389780462044e-05, + "loss": 0.7680137157440186, + "step": 265 + }, + { + "epoch": 0.8131448223156286, + "grad_norm": 0.23873573541641235, + "learning_rate": 1.4892018592905702e-05, + "loss": 0.6780916452407837, + "step": 266 + }, + { + "epoch": 0.8162017577378677, + "grad_norm": 0.2580571174621582, + "learning_rate": 1.4845508703326504e-05, + "loss": 0.7183764576911926, + "step": 267 + }, + { + "epoch": 0.819258693160107, + "grad_norm": 0.2125079482793808, + "learning_rate": 1.4798861430406221e-05, + "loss": 0.8207096457481384, + "step": 268 + }, + { + "epoch": 0.8223156285823462, + "grad_norm": 0.21065691113471985, + "learning_rate": 1.4752078096721827e-05, + "loss": 0.7414214611053467, + "step": 269 + }, + { + "epoch": 0.8253725640045854, + "grad_norm": 0.25807511806488037, + "learning_rate": 1.4705160028707976e-05, + "loss": 0.7086384296417236, + "step": 270 + }, + { + "epoch": 0.8284294994268246, + "grad_norm": 0.2444671094417572, + "learning_rate": 1.4658108556619417e-05, + "loss": 0.7065964937210083, + "step": 271 + }, + { + "epoch": 0.8314864348490638, + "grad_norm": 0.200303316116333, + "learning_rate": 1.461092501449326e-05, + "loss": 0.7533905506134033, + "step": 272 + }, + { + "epoch": 0.834543370271303, + "grad_norm": 0.2807226777076721, + "learning_rate": 1.4563610740111163e-05, + "loss": 0.756553053855896, + "step": 273 + }, + { + "epoch": 0.8376003056935423, + "grad_norm": 0.2516884207725525, + "learning_rate": 1.4516167074961394e-05, + "loss": 0.8125098347663879, + "step": 274 + }, + { + "epoch": 0.8406572411157814, + "grad_norm": 0.22799813747406006, + "learning_rate": 1.4468595364200808e-05, + "loss": 0.7360811829566956, + "step": 275 + }, + { + "epoch": 0.8437141765380206, + "grad_norm": 0.27390384674072266, + "learning_rate": 1.4420896956616698e-05, + "loss": 0.7135312557220459, + "step": 276 + }, + { + "epoch": 0.8467711119602599, + "grad_norm": 0.2811775505542755, + "learning_rate": 1.4373073204588556e-05, + "loss": 0.7489083409309387, + "step": 277 + }, + { + "epoch": 0.8498280473824991, + "grad_norm": 0.2652314603328705, + "learning_rate": 1.4325125464049725e-05, + "loss": 0.752477765083313, + "step": 278 + }, + { + "epoch": 0.8528849828047382, + "grad_norm": 0.2218960076570511, + "learning_rate": 1.427705509444897e-05, + "loss": 0.6534979939460754, + "step": 279 + }, + { + "epoch": 0.8559419182269774, + "grad_norm": 0.23746474087238312, + "learning_rate": 1.4228863458711915e-05, + "loss": 0.7061883211135864, + "step": 280 + }, + { + "epoch": 0.8589988536492167, + "grad_norm": 0.21507228910923004, + "learning_rate": 1.4180551923202406e-05, + "loss": 0.7044329643249512, + "step": 281 + }, + { + "epoch": 0.8620557890714559, + "grad_norm": 0.2412186861038208, + "learning_rate": 1.4132121857683782e-05, + "loss": 0.706013023853302, + "step": 282 + }, + { + "epoch": 0.865112724493695, + "grad_norm": 0.2832106947898865, + "learning_rate": 1.4083574635280029e-05, + "loss": 0.6572445631027222, + "step": 283 + }, + { + "epoch": 0.8681696599159343, + "grad_norm": 0.21925900876522064, + "learning_rate": 1.403491163243684e-05, + "loss": 0.675041139125824, + "step": 284 + }, + { + "epoch": 0.8712265953381735, + "grad_norm": 0.22488665580749512, + "learning_rate": 1.3986134228882607e-05, + "loss": 0.7474229335784912, + "step": 285 + }, + { + "epoch": 0.8742835307604127, + "grad_norm": 0.2221737653017044, + "learning_rate": 1.3937243807589291e-05, + "loss": 0.7394901514053345, + "step": 286 + }, + { + "epoch": 0.8773404661826519, + "grad_norm": 0.29034581780433655, + "learning_rate": 1.388824175473321e-05, + "loss": 0.7346636056900024, + "step": 287 + }, + { + "epoch": 0.8803974016048911, + "grad_norm": 0.2580259144306183, + "learning_rate": 1.383912945965574e-05, + "loss": 0.8125481009483337, + "step": 288 + }, + { + "epoch": 0.8834543370271303, + "grad_norm": 0.2533118724822998, + "learning_rate": 1.3789908314823932e-05, + "loss": 0.6768131256103516, + "step": 289 + }, + { + "epoch": 0.8865112724493696, + "grad_norm": 0.2074616551399231, + "learning_rate": 1.3740579715791017e-05, + "loss": 0.7096269726753235, + "step": 290 + }, + { + "epoch": 0.8895682078716087, + "grad_norm": 0.29789987206459045, + "learning_rate": 1.3691145061156843e-05, + "loss": 0.6973364353179932, + "step": 291 + }, + { + "epoch": 0.8926251432938479, + "grad_norm": 0.2937224805355072, + "learning_rate": 1.3641605752528225e-05, + "loss": 0.7693608999252319, + "step": 292 + }, + { + "epoch": 0.8956820787160871, + "grad_norm": 0.27355870604515076, + "learning_rate": 1.3591963194479198e-05, + "loss": 0.6870795488357544, + "step": 293 + }, + { + "epoch": 0.8987390141383264, + "grad_norm": 0.22792251408100128, + "learning_rate": 1.3542218794511212e-05, + "loss": 0.7095532417297363, + "step": 294 + }, + { + "epoch": 0.9017959495605655, + "grad_norm": 0.2855125665664673, + "learning_rate": 1.3492373963013199e-05, + "loss": 0.7536489963531494, + "step": 295 + }, + { + "epoch": 0.9048528849828047, + "grad_norm": 0.24969056248664856, + "learning_rate": 1.3442430113221602e-05, + "loss": 0.7433043718338013, + "step": 296 + }, + { + "epoch": 0.907909820405044, + "grad_norm": 0.24534980952739716, + "learning_rate": 1.3392388661180303e-05, + "loss": 0.7204138040542603, + "step": 297 + }, + { + "epoch": 0.9109667558272831, + "grad_norm": 0.2540739178657532, + "learning_rate": 1.3342251025700474e-05, + "loss": 0.7114053964614868, + "step": 298 + }, + { + "epoch": 0.9140236912495223, + "grad_norm": 0.2494630217552185, + "learning_rate": 1.3292018628320346e-05, + "loss": 0.7337151169776917, + "step": 299 + }, + { + "epoch": 0.9170806266717616, + "grad_norm": 0.3079741597175598, + "learning_rate": 1.3241692893264909e-05, + "loss": 0.7486672401428223, + "step": 300 + }, + { + "epoch": 0.9170806266717616, + "eval_loss": 0.7063615918159485, + "eval_runtime": 882.246, + "eval_samples_per_second": 0.683, + "eval_steps_per_second": 0.683, + "step": 300 + }, + { + "epoch": 0.9201375620940008, + "grad_norm": 0.23425859212875366, + "learning_rate": 1.3191275247405525e-05, + "loss": 0.7614796161651611, + "step": 301 + }, + { + "epoch": 0.9231944975162399, + "grad_norm": 0.22468142211437225, + "learning_rate": 1.314076712021949e-05, + "loss": 0.7109901309013367, + "step": 302 + }, + { + "epoch": 0.9262514329384792, + "grad_norm": 0.4165630042552948, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.6816924810409546, + "step": 303 + }, + { + "epoch": 0.9293083683607184, + "grad_norm": 0.2934052646160126, + "learning_rate": 1.3039485152562951e-05, + "loss": 0.7403143644332886, + "step": 304 + }, + { + "epoch": 0.9323653037829576, + "grad_norm": 0.24021990597248077, + "learning_rate": 1.2988714183711504e-05, + "loss": 0.7116130590438843, + "step": 305 + }, + { + "epoch": 0.9354222392051967, + "grad_norm": 0.25670015811920166, + "learning_rate": 1.2937858476690089e-05, + "loss": 0.745186984539032, + "step": 306 + }, + { + "epoch": 0.938479174627436, + "grad_norm": 0.3273049592971802, + "learning_rate": 1.2886919473396212e-05, + "loss": 0.811728298664093, + "step": 307 + }, + { + "epoch": 0.9415361100496752, + "grad_norm": 0.295612633228302, + "learning_rate": 1.2835898618089064e-05, + "loss": 0.6898178458213806, + "step": 308 + }, + { + "epoch": 0.9445930454719144, + "grad_norm": 0.22936004400253296, + "learning_rate": 1.2784797357348562e-05, + "loss": 0.7637606263160706, + "step": 309 + }, + { + "epoch": 0.9476499808941536, + "grad_norm": 0.2491123378276825, + "learning_rate": 1.2733617140034329e-05, + "loss": 0.6364520788192749, + "step": 310 + }, + { + "epoch": 0.9507069163163928, + "grad_norm": 0.29433801770210266, + "learning_rate": 1.268235941724463e-05, + "loss": 0.7065365314483643, + "step": 311 + }, + { + "epoch": 0.953763851738632, + "grad_norm": 0.25174376368522644, + "learning_rate": 1.2631025642275212e-05, + "loss": 0.73712158203125, + "step": 312 + }, + { + "epoch": 0.9568207871608713, + "grad_norm": 0.3259194493293762, + "learning_rate": 1.257961727057812e-05, + "loss": 0.6926214694976807, + "step": 313 + }, + { + "epoch": 0.9598777225831104, + "grad_norm": 0.31702667474746704, + "learning_rate": 1.2528135759720403e-05, + "loss": 0.7626583576202393, + "step": 314 + }, + { + "epoch": 0.9629346580053496, + "grad_norm": 0.24691395461559296, + "learning_rate": 1.2476582569342819e-05, + "loss": 0.7628929018974304, + "step": 315 + }, + { + "epoch": 0.9659915934275889, + "grad_norm": 0.2896668314933777, + "learning_rate": 1.2424959161118425e-05, + "loss": 0.7070521116256714, + "step": 316 + }, + { + "epoch": 0.9690485288498281, + "grad_norm": 0.2587420642375946, + "learning_rate": 1.2373266998711152e-05, + "loss": 0.7804452180862427, + "step": 317 + }, + { + "epoch": 0.9721054642720672, + "grad_norm": 0.28757819533348083, + "learning_rate": 1.232150754773429e-05, + "loss": 0.7271901369094849, + "step": 318 + }, + { + "epoch": 0.9751623996943064, + "grad_norm": 0.2600923478603363, + "learning_rate": 1.2269682275708951e-05, + "loss": 0.6629395484924316, + "step": 319 + }, + { + "epoch": 0.9782193351165457, + "grad_norm": 0.3455665111541748, + "learning_rate": 1.2217792652022452e-05, + "loss": 0.7750409841537476, + "step": 320 + }, + { + "epoch": 0.9812762705387849, + "grad_norm": 0.27122899889945984, + "learning_rate": 1.2165840147886656e-05, + "loss": 0.6742854118347168, + "step": 321 + }, + { + "epoch": 0.984333205961024, + "grad_norm": 0.2357456535100937, + "learning_rate": 1.2113826236296245e-05, + "loss": 0.7265107035636902, + "step": 322 + }, + { + "epoch": 0.9873901413832633, + "grad_norm": 0.21315616369247437, + "learning_rate": 1.2061752391986982e-05, + "loss": 0.7203768491744995, + "step": 323 + }, + { + "epoch": 0.9904470768055025, + "grad_norm": 0.24696163833141327, + "learning_rate": 1.2009620091393885e-05, + "loss": 0.8011739253997803, + "step": 324 + }, + { + "epoch": 0.9935040122277417, + "grad_norm": 0.246279776096344, + "learning_rate": 1.1957430812609361e-05, + "loss": 0.7316861152648926, + "step": 325 + }, + { + "epoch": 0.9965609476499809, + "grad_norm": 0.26160112023353577, + "learning_rate": 1.1905186035341304e-05, + "loss": 0.6602386236190796, + "step": 326 + }, + { + "epoch": 0.9996178830722201, + "grad_norm": 0.27144137024879456, + "learning_rate": 1.1852887240871145e-05, + "loss": 0.7162635326385498, + "step": 327 + }, + { + "epoch": 1.0, + "grad_norm": 0.6650471091270447, + "learning_rate": 1.1800535912011846e-05, + "loss": 0.6108165383338928, + "step": 328 + }, + { + "epoch": 1.0030569354222392, + "grad_norm": 0.25604233145713806, + "learning_rate": 1.1748133533065864e-05, + "loss": 0.6724814176559448, + "step": 329 + }, + { + "epoch": 1.0061138708444783, + "grad_norm": 0.30289238691329956, + "learning_rate": 1.1695681589783065e-05, + "loss": 0.7010799050331116, + "step": 330 + }, + { + "epoch": 1.0091708062667175, + "grad_norm": 0.28697144985198975, + "learning_rate": 1.1643181569318596e-05, + "loss": 0.7199532985687256, + "step": 331 + }, + { + "epoch": 1.012227741688957, + "grad_norm": 0.26302677392959595, + "learning_rate": 1.1590634960190722e-05, + "loss": 0.6887974143028259, + "step": 332 + }, + { + "epoch": 1.015284677111196, + "grad_norm": 0.2987605631351471, + "learning_rate": 1.1538043252238629e-05, + "loss": 0.7237250208854675, + "step": 333 + }, + { + "epoch": 1.0183416125334352, + "grad_norm": 0.25947025418281555, + "learning_rate": 1.1485407936580169e-05, + "loss": 0.7092999815940857, + "step": 334 + }, + { + "epoch": 1.0213985479556744, + "grad_norm": 0.3119892477989197, + "learning_rate": 1.1432730505569597e-05, + "loss": 0.6797397136688232, + "step": 335 + }, + { + "epoch": 1.0244554833779136, + "grad_norm": 0.2772631347179413, + "learning_rate": 1.1380012452755259e-05, + "loss": 0.7330094575881958, + "step": 336 + }, + { + "epoch": 1.0275124188001528, + "grad_norm": 0.34601089358329773, + "learning_rate": 1.1327255272837221e-05, + "loss": 0.711042582988739, + "step": 337 + }, + { + "epoch": 1.0305693542223922, + "grad_norm": 0.30404818058013916, + "learning_rate": 1.1274460461624925e-05, + "loss": 0.6593371033668518, + "step": 338 + }, + { + "epoch": 1.0336262896446313, + "grad_norm": 0.249643474817276, + "learning_rate": 1.1221629515994754e-05, + "loss": 0.7230923175811768, + "step": 339 + }, + { + "epoch": 1.0366832250668705, + "grad_norm": 0.2772657871246338, + "learning_rate": 1.1168763933847608e-05, + "loss": 0.6847513914108276, + "step": 340 + }, + { + "epoch": 1.0397401604891097, + "grad_norm": 0.3479171395301819, + "learning_rate": 1.1115865214066414e-05, + "loss": 0.673307478427887, + "step": 341 + }, + { + "epoch": 1.0427970959113488, + "grad_norm": 0.3393602669239044, + "learning_rate": 1.1062934856473655e-05, + "loss": 0.7529383897781372, + "step": 342 + }, + { + "epoch": 1.045854031333588, + "grad_norm": 0.22780737280845642, + "learning_rate": 1.1009974361788822e-05, + "loss": 0.6309706568717957, + "step": 343 + }, + { + "epoch": 1.0489109667558272, + "grad_norm": 0.2966362237930298, + "learning_rate": 1.095698523158588e-05, + "loss": 0.6944005489349365, + "step": 344 + }, + { + "epoch": 1.0519679021780666, + "grad_norm": 0.27519309520721436, + "learning_rate": 1.0903968968250682e-05, + "loss": 0.6714650392532349, + "step": 345 + }, + { + "epoch": 1.0550248376003057, + "grad_norm": 0.36684176325798035, + "learning_rate": 1.085092707493839e-05, + "loss": 0.6740344762802124, + "step": 346 + }, + { + "epoch": 1.058081773022545, + "grad_norm": 0.35729631781578064, + "learning_rate": 1.0797861055530832e-05, + "loss": 0.6590248942375183, + "step": 347 + }, + { + "epoch": 1.061138708444784, + "grad_norm": 0.33536043763160706, + "learning_rate": 1.0744772414593889e-05, + "loss": 0.7020372748374939, + "step": 348 + }, + { + "epoch": 1.0641956438670233, + "grad_norm": 0.3144095838069916, + "learning_rate": 1.0691662657334815e-05, + "loss": 0.7195531725883484, + "step": 349 + }, + { + "epoch": 1.0672525792892624, + "grad_norm": 0.37244805693626404, + "learning_rate": 1.0638533289559574e-05, + "loss": 0.6678342819213867, + "step": 350 + }, + { + "epoch": 1.0672525792892624, + "eval_loss": 0.6917262673377991, + "eval_runtime": 874.9693, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 350 + }, + { + "epoch": 1.0703095147115018, + "grad_norm": 0.45918041467666626, + "learning_rate": 1.0585385817630137e-05, + "loss": 0.6641817092895508, + "step": 351 + }, + { + "epoch": 1.073366450133741, + "grad_norm": 0.4126392900943756, + "learning_rate": 1.0532221748421786e-05, + "loss": 0.6774541139602661, + "step": 352 + }, + { + "epoch": 1.0764233855559802, + "grad_norm": 0.5425148606300354, + "learning_rate": 1.047904258928037e-05, + "loss": 0.7386555075645447, + "step": 353 + }, + { + "epoch": 1.0794803209782193, + "grad_norm": 0.40561115741729736, + "learning_rate": 1.0425849847979586e-05, + "loss": 0.7061327695846558, + "step": 354 + }, + { + "epoch": 1.0825372564004585, + "grad_norm": 0.489343523979187, + "learning_rate": 1.0372645032678215e-05, + "loss": 0.7486766576766968, + "step": 355 + }, + { + "epoch": 1.0855941918226977, + "grad_norm": 0.7414161562919617, + "learning_rate": 1.031942965187738e-05, + "loss": 0.7111566066741943, + "step": 356 + }, + { + "epoch": 1.0886511272449368, + "grad_norm": 0.308473140001297, + "learning_rate": 1.026620521437775e-05, + "loss": 0.7629879713058472, + "step": 357 + }, + { + "epoch": 1.0917080626671762, + "grad_norm": 0.27350732684135437, + "learning_rate": 1.0212973229236787e-05, + "loss": 0.7136012315750122, + "step": 358 + }, + { + "epoch": 1.0947649980894154, + "grad_norm": 0.37481266260147095, + "learning_rate": 1.0159735205725949e-05, + "loss": 0.6634767055511475, + "step": 359 + }, + { + "epoch": 1.0978219335116546, + "grad_norm": 0.2903526723384857, + "learning_rate": 1.0106492653287893e-05, + "loss": 0.6604923009872437, + "step": 360 + }, + { + "epoch": 1.1008788689338938, + "grad_norm": 0.372989296913147, + "learning_rate": 1.0053247081493684e-05, + "loss": 0.6701731085777283, + "step": 361 + }, + { + "epoch": 1.103935804356133, + "grad_norm": 0.38386791944503784, + "learning_rate": 1e-05, + "loss": 0.6767977476119995, + "step": 362 + }, + { + "epoch": 1.106992739778372, + "grad_norm": 0.2837046682834625, + "learning_rate": 9.946752918506319e-06, + "loss": 0.5886228680610657, + "step": 363 + }, + { + "epoch": 1.1100496752006115, + "grad_norm": 0.3196772038936615, + "learning_rate": 9.893507346712112e-06, + "loss": 0.6662254929542542, + "step": 364 + }, + { + "epoch": 1.1131066106228507, + "grad_norm": 0.36623135209083557, + "learning_rate": 9.840264794274053e-06, + "loss": 0.6507357954978943, + "step": 365 + }, + { + "epoch": 1.1161635460450898, + "grad_norm": 0.2803555727005005, + "learning_rate": 9.787026770763216e-06, + "loss": 0.6636874675750732, + "step": 366 + }, + { + "epoch": 1.119220481467329, + "grad_norm": 0.329513818025589, + "learning_rate": 9.733794785622254e-06, + "loss": 0.6378857493400574, + "step": 367 + }, + { + "epoch": 1.1222774168895682, + "grad_norm": 0.24419358372688293, + "learning_rate": 9.680570348122626e-06, + "loss": 0.6794115900993347, + "step": 368 + }, + { + "epoch": 1.1253343523118073, + "grad_norm": 0.2971822917461395, + "learning_rate": 9.627354967321785e-06, + "loss": 0.6401248574256897, + "step": 369 + }, + { + "epoch": 1.1283912877340465, + "grad_norm": 0.5112190842628479, + "learning_rate": 9.574150152020415e-06, + "loss": 0.6886081695556641, + "step": 370 + }, + { + "epoch": 1.131448223156286, + "grad_norm": 0.4284913241863251, + "learning_rate": 9.520957410719632e-06, + "loss": 0.6842222213745117, + "step": 371 + }, + { + "epoch": 1.134505158578525, + "grad_norm": 0.34164664149284363, + "learning_rate": 9.467778251578217e-06, + "loss": 0.6238314509391785, + "step": 372 + }, + { + "epoch": 1.1375620940007642, + "grad_norm": 0.3294171392917633, + "learning_rate": 9.414614182369862e-06, + "loss": 0.6947107911109924, + "step": 373 + }, + { + "epoch": 1.1406190294230034, + "grad_norm": 0.2544155418872833, + "learning_rate": 9.361466710440428e-06, + "loss": 0.717319905757904, + "step": 374 + }, + { + "epoch": 1.1436759648452426, + "grad_norm": 0.3111848533153534, + "learning_rate": 9.308337342665188e-06, + "loss": 0.6222032904624939, + "step": 375 + }, + { + "epoch": 1.1467329002674818, + "grad_norm": 0.3157130777835846, + "learning_rate": 9.255227585406116e-06, + "loss": 0.6126186847686768, + "step": 376 + }, + { + "epoch": 1.1497898356897212, + "grad_norm": 0.29625123739242554, + "learning_rate": 9.202138944469168e-06, + "loss": 0.7452324032783508, + "step": 377 + }, + { + "epoch": 1.1528467711119603, + "grad_norm": 0.31600719690322876, + "learning_rate": 9.149072925061614e-06, + "loss": 0.715571403503418, + "step": 378 + }, + { + "epoch": 1.1559037065341995, + "grad_norm": 0.25878727436065674, + "learning_rate": 9.096031031749321e-06, + "loss": 0.7256120443344116, + "step": 379 + }, + { + "epoch": 1.1589606419564387, + "grad_norm": 0.4058121144771576, + "learning_rate": 9.043014768414125e-06, + "loss": 0.6728136539459229, + "step": 380 + }, + { + "epoch": 1.1620175773786778, + "grad_norm": 0.31269821524620056, + "learning_rate": 8.99002563821118e-06, + "loss": 0.6662668585777283, + "step": 381 + }, + { + "epoch": 1.165074512800917, + "grad_norm": 0.2512218654155731, + "learning_rate": 8.937065143526349e-06, + "loss": 0.6415850520133972, + "step": 382 + }, + { + "epoch": 1.1681314482231562, + "grad_norm": 0.3284171223640442, + "learning_rate": 8.884134785933588e-06, + "loss": 0.6695276498794556, + "step": 383 + }, + { + "epoch": 1.1711883836453956, + "grad_norm": 0.2994699478149414, + "learning_rate": 8.831236066152397e-06, + "loss": 0.7347006797790527, + "step": 384 + }, + { + "epoch": 1.1742453190676347, + "grad_norm": 0.2981257140636444, + "learning_rate": 8.778370484005245e-06, + "loss": 0.6707600951194763, + "step": 385 + }, + { + "epoch": 1.177302254489874, + "grad_norm": 0.2934776842594147, + "learning_rate": 8.725539538375078e-06, + "loss": 0.7245328426361084, + "step": 386 + }, + { + "epoch": 1.180359189912113, + "grad_norm": 0.33115988969802856, + "learning_rate": 8.672744727162782e-06, + "loss": 0.7029488682746887, + "step": 387 + }, + { + "epoch": 1.1834161253343523, + "grad_norm": 0.3322703540325165, + "learning_rate": 8.619987547244746e-06, + "loss": 0.6896190643310547, + "step": 388 + }, + { + "epoch": 1.1864730607565914, + "grad_norm": 0.29254966974258423, + "learning_rate": 8.567269494430404e-06, + "loss": 0.6859920620918274, + "step": 389 + }, + { + "epoch": 1.1895299961788308, + "grad_norm": 0.2923297584056854, + "learning_rate": 8.514592063419833e-06, + "loss": 0.6437527537345886, + "step": 390 + }, + { + "epoch": 1.19258693160107, + "grad_norm": 0.3074567914009094, + "learning_rate": 8.461956747761375e-06, + "loss": 0.7113338708877563, + "step": 391 + }, + { + "epoch": 1.1956438670233092, + "grad_norm": 0.3027377128601074, + "learning_rate": 8.409365039809282e-06, + "loss": 0.7111615538597107, + "step": 392 + }, + { + "epoch": 1.1987008024455483, + "grad_norm": 0.28992199897766113, + "learning_rate": 8.356818430681409e-06, + "loss": 0.7768589854240417, + "step": 393 + }, + { + "epoch": 1.2017577378677875, + "grad_norm": 0.2630784213542938, + "learning_rate": 8.304318410216937e-06, + "loss": 0.5940375328063965, + "step": 394 + }, + { + "epoch": 1.2048146732900267, + "grad_norm": 0.30487746000289917, + "learning_rate": 8.251866466934137e-06, + "loss": 0.6600077748298645, + "step": 395 + }, + { + "epoch": 1.2078716087122658, + "grad_norm": 0.4152087867259979, + "learning_rate": 8.199464087988158e-06, + "loss": 0.6806260347366333, + "step": 396 + }, + { + "epoch": 1.2109285441345052, + "grad_norm": 0.32374435663223267, + "learning_rate": 8.147112759128859e-06, + "loss": 0.7205727100372314, + "step": 397 + }, + { + "epoch": 1.2139854795567444, + "grad_norm": 0.3009904623031616, + "learning_rate": 8.094813964658698e-06, + "loss": 0.6570584774017334, + "step": 398 + }, + { + "epoch": 1.2170424149789836, + "grad_norm": 0.5213649272918701, + "learning_rate": 8.042569187390642e-06, + "loss": 0.6663621664047241, + "step": 399 + }, + { + "epoch": 1.2200993504012227, + "grad_norm": 0.30124184489250183, + "learning_rate": 7.990379908606118e-06, + "loss": 0.672550618648529, + "step": 400 + }, + { + "epoch": 1.2200993504012227, + "eval_loss": 0.6789794564247131, + "eval_runtime": 875.5101, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 400 + }, + { + "epoch": 1.223156285823462, + "grad_norm": 0.31681662797927856, + "learning_rate": 7.938247608013021e-06, + "loss": 0.682239830493927, + "step": 401 + }, + { + "epoch": 1.226213221245701, + "grad_norm": 0.29261210560798645, + "learning_rate": 7.886173763703757e-06, + "loss": 0.6976956725120544, + "step": 402 + }, + { + "epoch": 1.2292701566679405, + "grad_norm": 0.32044124603271484, + "learning_rate": 7.834159852113347e-06, + "loss": 0.6931061744689941, + "step": 403 + }, + { + "epoch": 1.2323270920901797, + "grad_norm": 0.36050841212272644, + "learning_rate": 7.78220734797755e-06, + "loss": 0.7304666638374329, + "step": 404 + }, + { + "epoch": 1.2353840275124188, + "grad_norm": 0.31268882751464844, + "learning_rate": 7.73031772429105e-06, + "loss": 0.5944494605064392, + "step": 405 + }, + { + "epoch": 1.238440962934658, + "grad_norm": 0.33469483256340027, + "learning_rate": 7.678492452265713e-06, + "loss": 0.708702802658081, + "step": 406 + }, + { + "epoch": 1.2414978983568972, + "grad_norm": 0.2789304852485657, + "learning_rate": 7.626733001288852e-06, + "loss": 0.614046037197113, + "step": 407 + }, + { + "epoch": 1.2445548337791363, + "grad_norm": 0.42240089178085327, + "learning_rate": 7.575040838881578e-06, + "loss": 0.7044576406478882, + "step": 408 + }, + { + "epoch": 1.2476117692013755, + "grad_norm": 0.3652958571910858, + "learning_rate": 7.523417430657186e-06, + "loss": 0.7595829963684082, + "step": 409 + }, + { + "epoch": 1.250668704623615, + "grad_norm": 0.28300684690475464, + "learning_rate": 7.471864240279598e-06, + "loss": 0.7289992570877075, + "step": 410 + }, + { + "epoch": 1.253725640045854, + "grad_norm": 0.3463844358921051, + "learning_rate": 7.420382729421883e-06, + "loss": 0.7410037517547607, + "step": 411 + }, + { + "epoch": 1.2567825754680932, + "grad_norm": 0.30792665481567383, + "learning_rate": 7.368974357724789e-06, + "loss": 0.6920305490493774, + "step": 412 + }, + { + "epoch": 1.2598395108903324, + "grad_norm": 0.4354027509689331, + "learning_rate": 7.317640582755373e-06, + "loss": 0.6581035256385803, + "step": 413 + }, + { + "epoch": 1.2628964463125716, + "grad_norm": 0.5033990144729614, + "learning_rate": 7.266382859965673e-06, + "loss": 0.7377368211746216, + "step": 414 + }, + { + "epoch": 1.265953381734811, + "grad_norm": 0.30040669441223145, + "learning_rate": 7.2152026426514395e-06, + "loss": 0.7075121402740479, + "step": 415 + }, + { + "epoch": 1.2690103171570501, + "grad_norm": 0.25443559885025024, + "learning_rate": 7.164101381910939e-06, + "loss": 0.6314805150032043, + "step": 416 + }, + { + "epoch": 1.2720672525792893, + "grad_norm": 0.3807917535305023, + "learning_rate": 7.113080526603793e-06, + "loss": 0.6594043970108032, + "step": 417 + }, + { + "epoch": 1.2751241880015285, + "grad_norm": 0.40388163924217224, + "learning_rate": 7.062141523309918e-06, + "loss": 0.7092217206954956, + "step": 418 + }, + { + "epoch": 1.2781811234237677, + "grad_norm": 0.31380078196525574, + "learning_rate": 7.011285816288496e-06, + "loss": 0.6039083003997803, + "step": 419 + }, + { + "epoch": 1.2812380588460068, + "grad_norm": 0.3492945730686188, + "learning_rate": 6.96051484743705e-06, + "loss": 0.648531973361969, + "step": 420 + }, + { + "epoch": 1.284294994268246, + "grad_norm": 0.2891562283039093, + "learning_rate": 6.909830056250527e-06, + "loss": 0.6646198630332947, + "step": 421 + }, + { + "epoch": 1.2873519296904852, + "grad_norm": 0.316986083984375, + "learning_rate": 6.859232879780515e-06, + "loss": 0.7188717126846313, + "step": 422 + }, + { + "epoch": 1.2904088651127246, + "grad_norm": 0.38996225595474243, + "learning_rate": 6.8087247525944745e-06, + "loss": 0.6890851855278015, + "step": 423 + }, + { + "epoch": 1.2934658005349637, + "grad_norm": 0.3303278684616089, + "learning_rate": 6.758307106735094e-06, + "loss": 0.7118897438049316, + "step": 424 + }, + { + "epoch": 1.296522735957203, + "grad_norm": 0.26401078701019287, + "learning_rate": 6.707981371679657e-06, + "loss": 0.6749597787857056, + "step": 425 + }, + { + "epoch": 1.299579671379442, + "grad_norm": 0.3269912898540497, + "learning_rate": 6.657748974299529e-06, + "loss": 0.6718383431434631, + "step": 426 + }, + { + "epoch": 1.3026366068016813, + "grad_norm": 0.35413047671318054, + "learning_rate": 6.607611338819697e-06, + "loss": 0.6674888134002686, + "step": 427 + }, + { + "epoch": 1.3056935422239206, + "grad_norm": 0.44566094875335693, + "learning_rate": 6.557569886778401e-06, + "loss": 0.6900228261947632, + "step": 428 + }, + { + "epoch": 1.3087504776461598, + "grad_norm": 0.3536953628063202, + "learning_rate": 6.507626036986804e-06, + "loss": 0.6681596040725708, + "step": 429 + }, + { + "epoch": 1.311807413068399, + "grad_norm": 0.43866440653800964, + "learning_rate": 6.457781205488791e-06, + "loss": 0.7463353872299194, + "step": 430 + }, + { + "epoch": 1.3148643484906382, + "grad_norm": 0.32117530703544617, + "learning_rate": 6.408036805520801e-06, + "loss": 0.7138527035713196, + "step": 431 + }, + { + "epoch": 1.3179212839128773, + "grad_norm": 0.3075023293495178, + "learning_rate": 6.358394247471779e-06, + "loss": 0.6958800554275513, + "step": 432 + }, + { + "epoch": 1.3209782193351165, + "grad_norm": 0.31068870425224304, + "learning_rate": 6.308854938843161e-06, + "loss": 0.6728611588478088, + "step": 433 + }, + { + "epoch": 1.3240351547573557, + "grad_norm": 0.2871341407299042, + "learning_rate": 6.259420284208987e-06, + "loss": 0.6983805894851685, + "step": 434 + }, + { + "epoch": 1.3270920901795948, + "grad_norm": 0.3626168966293335, + "learning_rate": 6.210091685176067e-06, + "loss": 0.6707543134689331, + "step": 435 + }, + { + "epoch": 1.3301490256018342, + "grad_norm": 0.2960391640663147, + "learning_rate": 6.160870540344261e-06, + "loss": 0.6212095618247986, + "step": 436 + }, + { + "epoch": 1.3332059610240734, + "grad_norm": 0.29114195704460144, + "learning_rate": 6.111758245266795e-06, + "loss": 0.695442795753479, + "step": 437 + }, + { + "epoch": 1.3362628964463126, + "grad_norm": 0.2911393642425537, + "learning_rate": 6.0627561924107145e-06, + "loss": 0.7576844096183777, + "step": 438 + }, + { + "epoch": 1.3393198318685517, + "grad_norm": 0.2754829227924347, + "learning_rate": 6.013865771117394e-06, + "loss": 0.7611621022224426, + "step": 439 + }, + { + "epoch": 1.342376767290791, + "grad_norm": 0.47688090801239014, + "learning_rate": 5.965088367563162e-06, + "loss": 0.6706432104110718, + "step": 440 + }, + { + "epoch": 1.3454337027130303, + "grad_norm": 0.38662102818489075, + "learning_rate": 5.916425364719975e-06, + "loss": 0.7257411479949951, + "step": 441 + }, + { + "epoch": 1.3484906381352695, + "grad_norm": 0.29597020149230957, + "learning_rate": 5.867878142316221e-06, + "loss": 0.6695491671562195, + "step": 442 + }, + { + "epoch": 1.3515475735575087, + "grad_norm": 0.36503320932388306, + "learning_rate": 5.8194480767976e-06, + "loss": 0.6762661933898926, + "step": 443 + }, + { + "epoch": 1.3546045089797478, + "grad_norm": 0.29297393560409546, + "learning_rate": 5.7711365412880895e-06, + "loss": 0.6601616740226746, + "step": 444 + }, + { + "epoch": 1.357661444401987, + "grad_norm": 0.3229820430278778, + "learning_rate": 5.7229449055510335e-06, + "loss": 0.7049432992935181, + "step": 445 + }, + { + "epoch": 1.3607183798242262, + "grad_norm": 0.3359116017818451, + "learning_rate": 5.674874535950279e-06, + "loss": 0.6643913388252258, + "step": 446 + }, + { + "epoch": 1.3637753152464653, + "grad_norm": 0.349298357963562, + "learning_rate": 5.626926795411447e-06, + "loss": 0.7177180647850037, + "step": 447 + }, + { + "epoch": 1.3668322506687045, + "grad_norm": 0.30045273900032043, + "learning_rate": 5.579103043383305e-06, + "loss": 0.6765077710151672, + "step": 448 + }, + { + "epoch": 1.369889186090944, + "grad_norm": 0.3676189184188843, + "learning_rate": 5.531404635799191e-06, + "loss": 0.6421419978141785, + "step": 449 + }, + { + "epoch": 1.372946121513183, + "grad_norm": 0.3337932527065277, + "learning_rate": 5.4838329250386076e-06, + "loss": 0.649316668510437, + "step": 450 + }, + { + "epoch": 1.372946121513183, + "eval_loss": 0.6703284978866577, + "eval_runtime": 907.8663, + "eval_samples_per_second": 0.664, + "eval_steps_per_second": 0.664, + "step": 450 + }, + { + "epoch": 1.3760030569354222, + "grad_norm": 0.314387708902359, + "learning_rate": 5.436389259888841e-06, + "loss": 0.7333119511604309, + "step": 451 + }, + { + "epoch": 1.3790599923576614, + "grad_norm": 0.4056478440761566, + "learning_rate": 5.38907498550674e-06, + "loss": 0.6451212763786316, + "step": 452 + }, + { + "epoch": 1.3821169277799006, + "grad_norm": 0.42358386516571045, + "learning_rate": 5.341891443380585e-06, + "loss": 0.6462752819061279, + "step": 453 + }, + { + "epoch": 1.38517386320214, + "grad_norm": 0.3606562912464142, + "learning_rate": 5.294839971292026e-06, + "loss": 0.717352569103241, + "step": 454 + }, + { + "epoch": 1.3882307986243791, + "grad_norm": 0.3014855682849884, + "learning_rate": 5.247921903278177e-06, + "loss": 0.7015582323074341, + "step": 455 + }, + { + "epoch": 1.3912877340466183, + "grad_norm": 0.5155187845230103, + "learning_rate": 5.20113856959378e-06, + "loss": 0.6660122275352478, + "step": 456 + }, + { + "epoch": 1.3943446694688575, + "grad_norm": 0.35195642709732056, + "learning_rate": 5.1544912966735e-06, + "loss": 0.6980377435684204, + "step": 457 + }, + { + "epoch": 1.3974016048910967, + "grad_norm": 0.28842753171920776, + "learning_rate": 5.1079814070943e-06, + "loss": 0.6926653385162354, + "step": 458 + }, + { + "epoch": 1.4004585403133358, + "grad_norm": 0.354425311088562, + "learning_rate": 5.06161021953796e-06, + "loss": 0.6412813067436218, + "step": 459 + }, + { + "epoch": 1.403515475735575, + "grad_norm": 0.30584967136383057, + "learning_rate": 5.015379048753669e-06, + "loss": 0.6897266507148743, + "step": 460 + }, + { + "epoch": 1.4065724111578142, + "grad_norm": 0.3659093677997589, + "learning_rate": 4.9692892055207784e-06, + "loss": 0.6777257919311523, + "step": 461 + }, + { + "epoch": 1.4096293465800536, + "grad_norm": 0.6798201203346252, + "learning_rate": 4.923341996611604e-06, + "loss": 0.7499118447303772, + "step": 462 + }, + { + "epoch": 1.4126862820022927, + "grad_norm": 0.36423686146736145, + "learning_rate": 4.877538724754392e-06, + "loss": 0.6341705322265625, + "step": 463 + }, + { + "epoch": 1.415743217424532, + "grad_norm": 0.29527905583381653, + "learning_rate": 4.831880688596392e-06, + "loss": 0.566770076751709, + "step": 464 + }, + { + "epoch": 1.418800152846771, + "grad_norm": 0.3342158794403076, + "learning_rate": 4.7863691826670146e-06, + "loss": 0.6926667094230652, + "step": 465 + }, + { + "epoch": 1.4218570882690102, + "grad_norm": 0.35585087537765503, + "learning_rate": 4.741005497341154e-06, + "loss": 0.6302958130836487, + "step": 466 + }, + { + "epoch": 1.4249140236912496, + "grad_norm": 0.5740730166435242, + "learning_rate": 4.695790918802577e-06, + "loss": 0.7842360138893127, + "step": 467 + }, + { + "epoch": 1.4279709591134888, + "grad_norm": 0.4422702491283417, + "learning_rate": 4.650726729007465e-06, + "loss": 0.6199318766593933, + "step": 468 + }, + { + "epoch": 1.431027894535728, + "grad_norm": 0.3458646833896637, + "learning_rate": 4.605814205648087e-06, + "loss": 0.7013853788375854, + "step": 469 + }, + { + "epoch": 1.4340848299579672, + "grad_norm": 0.326727956533432, + "learning_rate": 4.56105462211654e-06, + "loss": 0.7208451628684998, + "step": 470 + }, + { + "epoch": 1.4371417653802063, + "grad_norm": 0.3491531014442444, + "learning_rate": 4.516449247468666e-06, + "loss": 0.6491535902023315, + "step": 471 + }, + { + "epoch": 1.4401987008024455, + "grad_norm": 0.31401777267456055, + "learning_rate": 4.4719993463880695e-06, + "loss": 0.6603784561157227, + "step": 472 + }, + { + "epoch": 1.4432556362246847, + "grad_norm": 0.3741454780101776, + "learning_rate": 4.427706179150247e-06, + "loss": 0.6068110466003418, + "step": 473 + }, + { + "epoch": 1.4463125716469238, + "grad_norm": 0.3205011188983917, + "learning_rate": 4.383571001586883e-06, + "loss": 0.6427788138389587, + "step": 474 + }, + { + "epoch": 1.4493695070691632, + "grad_norm": 0.2519795894622803, + "learning_rate": 4.339595065050206e-06, + "loss": 0.626676082611084, + "step": 475 + }, + { + "epoch": 1.4524264424914024, + "grad_norm": 0.3499923050403595, + "learning_rate": 4.29577961637754e-06, + "loss": 0.7192115187644958, + "step": 476 + }, + { + "epoch": 1.4554833779136416, + "grad_norm": 0.6267193555831909, + "learning_rate": 4.2521258978559324e-06, + "loss": 0.6705955862998962, + "step": 477 + }, + { + "epoch": 1.4585403133358807, + "grad_norm": 0.5547561049461365, + "learning_rate": 4.208635147186956e-06, + "loss": 0.6040648818016052, + "step": 478 + }, + { + "epoch": 1.46159724875812, + "grad_norm": 0.2949749529361725, + "learning_rate": 4.165308597451586e-06, + "loss": 0.6205201148986816, + "step": 479 + }, + { + "epoch": 1.4646541841803593, + "grad_norm": 0.2873048782348633, + "learning_rate": 4.12214747707527e-06, + "loss": 0.6886979937553406, + "step": 480 + }, + { + "epoch": 1.4677111196025985, + "grad_norm": 0.33694973587989807, + "learning_rate": 4.079153009793068e-06, + "loss": 0.6656784415245056, + "step": 481 + }, + { + "epoch": 1.4707680550248377, + "grad_norm": 0.3373357057571411, + "learning_rate": 4.036326414614985e-06, + "loss": 0.6573168635368347, + "step": 482 + }, + { + "epoch": 1.4738249904470768, + "grad_norm": 0.3189850151538849, + "learning_rate": 3.99366890579139e-06, + "loss": 0.6631187200546265, + "step": 483 + }, + { + "epoch": 1.476881925869316, + "grad_norm": 0.34659212827682495, + "learning_rate": 3.951181692778594e-06, + "loss": 0.5881021022796631, + "step": 484 + }, + { + "epoch": 1.4799388612915552, + "grad_norm": 0.4184463918209076, + "learning_rate": 3.908865980204555e-06, + "loss": 0.7232425212860107, + "step": 485 + }, + { + "epoch": 1.4829957967137943, + "grad_norm": 0.3163282573223114, + "learning_rate": 3.86672296783474e-06, + "loss": 0.6624961495399475, + "step": 486 + }, + { + "epoch": 1.4860527321360335, + "grad_norm": 0.3175446689128876, + "learning_rate": 3.824753850538082e-06, + "loss": 0.6616235971450806, + "step": 487 + }, + { + "epoch": 1.489109667558273, + "grad_norm": 0.3493629992008209, + "learning_rate": 3.782959818253126e-06, + "loss": 0.6923587918281555, + "step": 488 + }, + { + "epoch": 1.492166602980512, + "grad_norm": 0.30385154485702515, + "learning_rate": 3.741342055954269e-06, + "loss": 0.6668528914451599, + "step": 489 + }, + { + "epoch": 1.4952235384027512, + "grad_norm": 0.319979727268219, + "learning_rate": 3.699901743618194e-06, + "loss": 0.6276881098747253, + "step": 490 + }, + { + "epoch": 1.4982804738249904, + "grad_norm": 0.28717750310897827, + "learning_rate": 3.658640056190378e-06, + "loss": 0.7676356434822083, + "step": 491 + }, + { + "epoch": 1.5013374092472298, + "grad_norm": 0.4701229929924011, + "learning_rate": 3.617558163551802e-06, + "loss": 0.6021715402603149, + "step": 492 + }, + { + "epoch": 1.504394344669469, + "grad_norm": 0.4959515929222107, + "learning_rate": 3.576657230485775e-06, + "loss": 0.7243677973747253, + "step": 493 + }, + { + "epoch": 1.5074512800917081, + "grad_norm": 0.32071781158447266, + "learning_rate": 3.5359384166449185e-06, + "loss": 0.7030311822891235, + "step": 494 + }, + { + "epoch": 1.5105082155139473, + "grad_norm": 0.3393514156341553, + "learning_rate": 3.4954028765182633e-06, + "loss": 0.6344490051269531, + "step": 495 + }, + { + "epoch": 1.5135651509361865, + "grad_norm": 0.273512065410614, + "learning_rate": 3.4550517593985512e-06, + "loss": 0.5816606879234314, + "step": 496 + }, + { + "epoch": 1.5166220863584257, + "grad_norm": 0.6631937026977539, + "learning_rate": 3.414886209349615e-06, + "loss": 0.6091232895851135, + "step": 497 + }, + { + "epoch": 1.5196790217806648, + "grad_norm": 0.6976932287216187, + "learning_rate": 3.3749073651739594e-06, + "loss": 0.7076858282089233, + "step": 498 + }, + { + "epoch": 1.522735957202904, + "grad_norm": 0.35580119490623474, + "learning_rate": 3.3351163603804805e-06, + "loss": 0.6363418698310852, + "step": 499 + }, + { + "epoch": 1.5257928926251432, + "grad_norm": 0.30289211869239807, + "learning_rate": 3.2955143231523067e-06, + "loss": 0.6716225147247314, + "step": 500 + }, + { + "epoch": 1.5257928926251432, + "eval_loss": 0.6648170948028564, + "eval_runtime": 870.3243, + "eval_samples_per_second": 0.693, + "eval_steps_per_second": 0.693, + "step": 500 + }, + { + "epoch": 1.5288498280473823, + "grad_norm": 0.33276933431625366, + "learning_rate": 3.2561023763148237e-06, + "loss": 0.6512227058410645, + "step": 501 + }, + { + "epoch": 1.5319067634696217, + "grad_norm": 0.40328240394592285, + "learning_rate": 3.216881637303839e-06, + "loss": 0.7053738236427307, + "step": 502 + }, + { + "epoch": 1.534963698891861, + "grad_norm": 0.2589263916015625, + "learning_rate": 3.177853218133905e-06, + "loss": 0.697374165058136, + "step": 503 + }, + { + "epoch": 1.5380206343141, + "grad_norm": 0.5453576445579529, + "learning_rate": 3.1390182253667745e-06, + "loss": 0.6664954423904419, + "step": 504 + }, + { + "epoch": 1.5410775697363395, + "grad_norm": 0.5521278381347656, + "learning_rate": 3.100377760080041e-06, + "loss": 0.662231981754303, + "step": 505 + }, + { + "epoch": 1.5441345051585786, + "grad_norm": 0.3097061216831207, + "learning_rate": 3.0619329178359103e-06, + "loss": 0.751462459564209, + "step": 506 + }, + { + "epoch": 1.5471914405808178, + "grad_norm": 0.32505670189857483, + "learning_rate": 3.023684788650154e-06, + "loss": 0.6908425688743591, + "step": 507 + }, + { + "epoch": 1.550248376003057, + "grad_norm": 0.4177548587322235, + "learning_rate": 2.985634456961184e-06, + "loss": 0.6698168516159058, + "step": 508 + }, + { + "epoch": 1.5533053114252962, + "grad_norm": 0.3030829131603241, + "learning_rate": 2.947783001599315e-06, + "loss": 0.6403611302375793, + "step": 509 + }, + { + "epoch": 1.5563622468475353, + "grad_norm": 0.2690201997756958, + "learning_rate": 2.9101314957561864e-06, + "loss": 0.6056875586509705, + "step": 510 + }, + { + "epoch": 1.5594191822697745, + "grad_norm": 0.2733827829360962, + "learning_rate": 2.8726810069543156e-06, + "loss": 0.7140977382659912, + "step": 511 + }, + { + "epoch": 1.5624761176920137, + "grad_norm": 0.2995041310787201, + "learning_rate": 2.8354325970168483e-06, + "loss": 0.6062126159667969, + "step": 512 + }, + { + "epoch": 1.5655330531142528, + "grad_norm": 0.2860231101512909, + "learning_rate": 2.7983873220374415e-06, + "loss": 0.6048973798751831, + "step": 513 + }, + { + "epoch": 1.568589988536492, + "grad_norm": 0.3419671058654785, + "learning_rate": 2.7615462323503186e-06, + "loss": 0.630670964717865, + "step": 514 + }, + { + "epoch": 1.5716469239587314, + "grad_norm": 0.3721083700656891, + "learning_rate": 2.724910372500508e-06, + "loss": 0.6205880641937256, + "step": 515 + }, + { + "epoch": 1.5747038593809706, + "grad_norm": 0.8053601384162903, + "learning_rate": 2.6884807812142043e-06, + "loss": 0.6468279361724854, + "step": 516 + }, + { + "epoch": 1.5777607948032097, + "grad_norm": 0.30676576495170593, + "learning_rate": 2.6522584913693295e-06, + "loss": 0.6104784607887268, + "step": 517 + }, + { + "epoch": 1.5808177302254491, + "grad_norm": 0.32430994510650635, + "learning_rate": 2.616244529966244e-06, + "loss": 0.6879785060882568, + "step": 518 + }, + { + "epoch": 1.5838746656476883, + "grad_norm": 0.2668575942516327, + "learning_rate": 2.5804399180986417e-06, + "loss": 0.6742456555366516, + "step": 519 + }, + { + "epoch": 1.5869316010699275, + "grad_norm": 0.41760483384132385, + "learning_rate": 2.544845670924575e-06, + "loss": 0.5823814868927002, + "step": 520 + }, + { + "epoch": 1.5899885364921666, + "grad_norm": 0.332041472196579, + "learning_rate": 2.509462797637693e-06, + "loss": 0.653259813785553, + "step": 521 + }, + { + "epoch": 1.5930454719144058, + "grad_norm": 0.3437623381614685, + "learning_rate": 2.4742923014386154e-06, + "loss": 0.6304376721382141, + "step": 522 + }, + { + "epoch": 1.596102407336645, + "grad_norm": 0.2744190990924835, + "learning_rate": 2.4393351795065023e-06, + "loss": 0.8250125646591187, + "step": 523 + }, + { + "epoch": 1.5991593427588842, + "grad_norm": 0.3014289140701294, + "learning_rate": 2.4045924229707663e-06, + "loss": 0.7557496428489685, + "step": 524 + }, + { + "epoch": 1.6022162781811233, + "grad_norm": 0.33593595027923584, + "learning_rate": 2.3700650168829765e-06, + "loss": 0.6550201773643494, + "step": 525 + }, + { + "epoch": 1.6052732136033625, + "grad_norm": 0.289989173412323, + "learning_rate": 2.3357539401889438e-06, + "loss": 0.5847223997116089, + "step": 526 + }, + { + "epoch": 1.6083301490256017, + "grad_norm": 0.3140230178833008, + "learning_rate": 2.3016601657009364e-06, + "loss": 0.7059583067893982, + "step": 527 + }, + { + "epoch": 1.611387084447841, + "grad_norm": 0.5017932653427124, + "learning_rate": 2.2677846600701305e-06, + "loss": 0.6565676927566528, + "step": 528 + }, + { + "epoch": 1.6144440198700802, + "grad_norm": 0.2757347822189331, + "learning_rate": 2.234128383759174e-06, + "loss": 0.5888017416000366, + "step": 529 + }, + { + "epoch": 1.6175009552923194, + "grad_norm": 0.3413706421852112, + "learning_rate": 2.2006922910149743e-06, + "loss": 0.6747739315032959, + "step": 530 + }, + { + "epoch": 1.6205578907145588, + "grad_norm": 0.2861206829547882, + "learning_rate": 2.167477329841633e-06, + "loss": 0.6995899677276611, + "step": 531 + }, + { + "epoch": 1.623614826136798, + "grad_norm": 0.4095499515533447, + "learning_rate": 2.1344844419735757e-06, + "loss": 0.6285294890403748, + "step": 532 + }, + { + "epoch": 1.6266717615590371, + "grad_norm": 0.25976240634918213, + "learning_rate": 2.101714562848841e-06, + "loss": 0.607745349407196, + "step": 533 + }, + { + "epoch": 1.6297286969812763, + "grad_norm": 0.2760326564311981, + "learning_rate": 2.069168621582567e-06, + "loss": 0.681461751461029, + "step": 534 + }, + { + "epoch": 1.6327856324035155, + "grad_norm": 0.29883530735969543, + "learning_rate": 2.0368475409406396e-06, + "loss": 0.6930239200592041, + "step": 535 + }, + { + "epoch": 1.6358425678257547, + "grad_norm": 0.2769938111305237, + "learning_rate": 2.004752237313544e-06, + "loss": 0.6871459484100342, + "step": 536 + }, + { + "epoch": 1.6388995032479938, + "grad_norm": 0.5758352875709534, + "learning_rate": 1.972883620690366e-06, + "loss": 0.6905091404914856, + "step": 537 + }, + { + "epoch": 1.641956438670233, + "grad_norm": 0.302348792552948, + "learning_rate": 1.9412425946329994e-06, + "loss": 0.7119919061660767, + "step": 538 + }, + { + "epoch": 1.6450133740924722, + "grad_norm": 0.2754940986633301, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.6610316038131714, + "step": 539 + }, + { + "epoch": 1.6480703095147113, + "grad_norm": 0.27256953716278076, + "learning_rate": 1.8786468961737902e-06, + "loss": 0.6504456996917725, + "step": 540 + }, + { + "epoch": 1.6511272449369507, + "grad_norm": 0.3459402620792389, + "learning_rate": 1.8476939985301257e-06, + "loss": 0.673663854598999, + "step": 541 + }, + { + "epoch": 1.65418418035919, + "grad_norm": 0.374275267124176, + "learning_rate": 1.81697224091831e-06, + "loss": 0.6528961658477783, + "step": 542 + }, + { + "epoch": 1.657241115781429, + "grad_norm": 0.310211181640625, + "learning_rate": 1.7864824943836633e-06, + "loss": 0.664339005947113, + "step": 543 + }, + { + "epoch": 1.6602980512036685, + "grad_norm": 0.34453052282333374, + "learning_rate": 1.7562256233933717e-06, + "loss": 0.6874368190765381, + "step": 544 + }, + { + "epoch": 1.6633549866259076, + "grad_norm": 0.3484613299369812, + "learning_rate": 1.7262024858119597e-06, + "loss": 0.7023600935935974, + "step": 545 + }, + { + "epoch": 1.6664119220481468, + "grad_norm": 0.45776957273483276, + "learning_rate": 1.6964139328769736e-06, + "loss": 0.6404401659965515, + "step": 546 + }, + { + "epoch": 1.669468857470386, + "grad_norm": 0.2930310368537903, + "learning_rate": 1.6668608091748495e-06, + "loss": 0.6716583967208862, + "step": 547 + }, + { + "epoch": 1.6725257928926252, + "grad_norm": 0.3713250160217285, + "learning_rate": 1.637543952616969e-06, + "loss": 0.6601635813713074, + "step": 548 + }, + { + "epoch": 1.6755827283148643, + "grad_norm": 0.3368103802204132, + "learning_rate": 1.6084641944158918e-06, + "loss": 0.6788731217384338, + "step": 549 + }, + { + "epoch": 1.6786396637371035, + "grad_norm": 0.2993035912513733, + "learning_rate": 1.5796223590617987e-06, + "loss": 0.6544529795646667, + "step": 550 + }, + { + "epoch": 1.6786396637371035, + "eval_loss": 0.6616687178611755, + "eval_runtime": 875.9833, + "eval_samples_per_second": 0.688, + "eval_steps_per_second": 0.688, + "step": 550 + }, + { + "epoch": 1.6816965991593427, + "grad_norm": 0.44005870819091797, + "learning_rate": 1.5510192642991073e-06, + "loss": 0.6850336194038391, + "step": 551 + }, + { + "epoch": 1.6847535345815818, + "grad_norm": 0.4457947611808777, + "learning_rate": 1.522655721103291e-06, + "loss": 0.6001553535461426, + "step": 552 + }, + { + "epoch": 1.687810470003821, + "grad_norm": 0.47378861904144287, + "learning_rate": 1.494532533657893e-06, + "loss": 0.7040194272994995, + "step": 553 + }, + { + "epoch": 1.6908674054260604, + "grad_norm": 0.38698890805244446, + "learning_rate": 1.4666504993317089e-06, + "loss": 0.7009314298629761, + "step": 554 + }, + { + "epoch": 1.6939243408482996, + "grad_norm": 0.3362627625465393, + "learning_rate": 1.4390104086561886e-06, + "loss": 0.6950737237930298, + "step": 555 + }, + { + "epoch": 1.6969812762705387, + "grad_norm": 0.36643826961517334, + "learning_rate": 1.4116130453030296e-06, + "loss": 0.6862865686416626, + "step": 556 + }, + { + "epoch": 1.7000382116927781, + "grad_norm": 0.33834755420684814, + "learning_rate": 1.3844591860619382e-06, + "loss": 0.6385370492935181, + "step": 557 + }, + { + "epoch": 1.7030951471150173, + "grad_norm": 0.2850823700428009, + "learning_rate": 1.3575496008186307e-06, + "loss": 0.5935351848602295, + "step": 558 + }, + { + "epoch": 1.7061520825372565, + "grad_norm": 0.29303666949272156, + "learning_rate": 1.330885052532981e-06, + "loss": 0.6652261018753052, + "step": 559 + }, + { + "epoch": 1.7092090179594956, + "grad_norm": 0.2667746841907501, + "learning_rate": 1.3044662972174005e-06, + "loss": 0.6116664409637451, + "step": 560 + }, + { + "epoch": 1.7122659533817348, + "grad_norm": 0.35388344526290894, + "learning_rate": 1.2782940839154113e-06, + "loss": 0.6909575462341309, + "step": 561 + }, + { + "epoch": 1.715322888803974, + "grad_norm": 0.3212358057498932, + "learning_rate": 1.2523691546803872e-06, + "loss": 0.5729340314865112, + "step": 562 + }, + { + "epoch": 1.7183798242262132, + "grad_norm": 0.3078250288963318, + "learning_rate": 1.2266922445545348e-06, + "loss": 0.6341389417648315, + "step": 563 + }, + { + "epoch": 1.7214367596484523, + "grad_norm": 0.3041326403617859, + "learning_rate": 1.201264081548038e-06, + "loss": 0.7670491337776184, + "step": 564 + }, + { + "epoch": 1.7244936950706915, + "grad_norm": 0.3577534854412079, + "learning_rate": 1.176085386618434e-06, + "loss": 0.7452418804168701, + "step": 565 + }, + { + "epoch": 1.7275506304929307, + "grad_norm": 0.3138960897922516, + "learning_rate": 1.151156873650151e-06, + "loss": 0.6182627081871033, + "step": 566 + }, + { + "epoch": 1.73060756591517, + "grad_norm": 0.29401692748069763, + "learning_rate": 1.1264792494342858e-06, + "loss": 0.7683947682380676, + "step": 567 + }, + { + "epoch": 1.7336645013374092, + "grad_norm": 0.42694059014320374, + "learning_rate": 1.1020532136485517e-06, + "loss": 0.6643114686012268, + "step": 568 + }, + { + "epoch": 1.7367214367596484, + "grad_norm": 0.3185805082321167, + "learning_rate": 1.0778794588374542e-06, + "loss": 0.6443809866905212, + "step": 569 + }, + { + "epoch": 1.7397783721818878, + "grad_norm": 0.39810633659362793, + "learning_rate": 1.0539586703926396e-06, + "loss": 0.6940271258354187, + "step": 570 + }, + { + "epoch": 1.742835307604127, + "grad_norm": 0.3531099557876587, + "learning_rate": 1.0302915265334722e-06, + "loss": 0.62273770570755, + "step": 571 + }, + { + "epoch": 1.7458922430263661, + "grad_norm": 0.303533136844635, + "learning_rate": 1.0068786982878087e-06, + "loss": 0.6589292883872986, + "step": 572 + }, + { + "epoch": 1.7489491784486053, + "grad_norm": 0.3740532398223877, + "learning_rate": 9.837208494729567e-07, + "loss": 0.7088748216629028, + "step": 573 + }, + { + "epoch": 1.7520061138708445, + "grad_norm": 0.28268831968307495, + "learning_rate": 9.608186366768746e-07, + "loss": 0.6833463907241821, + "step": 574 + }, + { + "epoch": 1.7550630492930837, + "grad_norm": 0.31762558221817017, + "learning_rate": 9.381727092395365e-07, + "loss": 0.6840337514877319, + "step": 575 + }, + { + "epoch": 1.7581199847153228, + "grad_norm": 0.3333055078983307, + "learning_rate": 9.157837092345334e-07, + "loss": 0.7084675431251526, + "step": 576 + }, + { + "epoch": 1.761176920137562, + "grad_norm": 0.2991984784603119, + "learning_rate": 8.936522714508678e-07, + "loss": 0.7238477468490601, + "step": 577 + }, + { + "epoch": 1.7642338555598012, + "grad_norm": 0.28052636981010437, + "learning_rate": 8.71779023374949e-07, + "loss": 0.6483154892921448, + "step": 578 + }, + { + "epoch": 1.7672907909820403, + "grad_norm": 0.31360605359077454, + "learning_rate": 8.501645851728091e-07, + "loss": 0.6550958156585693, + "step": 579 + }, + { + "epoch": 1.7703477264042797, + "grad_norm": 0.2856346666812897, + "learning_rate": 8.28809569672514e-07, + "loss": 0.6386545300483704, + "step": 580 + }, + { + "epoch": 1.773404661826519, + "grad_norm": 0.4174005389213562, + "learning_rate": 8.077145823467924e-07, + "loss": 0.6630646586418152, + "step": 581 + }, + { + "epoch": 1.776461597248758, + "grad_norm": 0.2678094506263733, + "learning_rate": 7.868802212958704e-07, + "loss": 0.7088242769241333, + "step": 582 + }, + { + "epoch": 1.7795185326709975, + "grad_norm": 0.33474841713905334, + "learning_rate": 7.663070772305081e-07, + "loss": 0.7061930298805237, + "step": 583 + }, + { + "epoch": 1.7825754680932366, + "grad_norm": 0.30635929107666016, + "learning_rate": 7.459957334552526e-07, + "loss": 0.7023921608924866, + "step": 584 + }, + { + "epoch": 1.7856324035154758, + "grad_norm": 0.3720168173313141, + "learning_rate": 7.259467658519026e-07, + "loss": 0.6405187845230103, + "step": 585 + }, + { + "epoch": 1.788689338937715, + "grad_norm": 0.30746224522590637, + "learning_rate": 7.061607428631823e-07, + "loss": 0.7479575872421265, + "step": 586 + }, + { + "epoch": 1.7917462743599541, + "grad_norm": 0.37346151471138, + "learning_rate": 6.866382254766158e-07, + "loss": 0.73829185962677, + "step": 587 + }, + { + "epoch": 1.7948032097821933, + "grad_norm": 0.3968294858932495, + "learning_rate": 6.673797672086335e-07, + "loss": 0.7156046032905579, + "step": 588 + }, + { + "epoch": 1.7978601452044325, + "grad_norm": 0.3264223635196686, + "learning_rate": 6.483859140888648e-07, + "loss": 0.6457011699676514, + "step": 589 + }, + { + "epoch": 1.8009170806266717, + "grad_norm": 0.3268529772758484, + "learning_rate": 6.296572046446725e-07, + "loss": 0.7092617750167847, + "step": 590 + }, + { + "epoch": 1.8039740160489108, + "grad_norm": 0.2968194782733917, + "learning_rate": 6.111941698858681e-07, + "loss": 0.7103247046470642, + "step": 591 + }, + { + "epoch": 1.8070309514711502, + "grad_norm": 0.6012208461761475, + "learning_rate": 5.929973332896677e-07, + "loss": 0.6195952892303467, + "step": 592 + }, + { + "epoch": 1.8100878868933894, + "grad_norm": 0.31401294469833374, + "learning_rate": 5.750672107858435e-07, + "loss": 0.7382717728614807, + "step": 593 + }, + { + "epoch": 1.8131448223156286, + "grad_norm": 0.3620605170726776, + "learning_rate": 5.574043107421023e-07, + "loss": 0.612289547920227, + "step": 594 + }, + { + "epoch": 1.8162017577378677, + "grad_norm": 0.2869480848312378, + "learning_rate": 5.400091339496638e-07, + "loss": 0.7518821358680725, + "step": 595 + }, + { + "epoch": 1.8192586931601071, + "grad_norm": 0.33768531680107117, + "learning_rate": 5.228821736090684e-07, + "loss": 0.7100391983985901, + "step": 596 + }, + { + "epoch": 1.8223156285823463, + "grad_norm": 0.39242854714393616, + "learning_rate": 5.060239153161872e-07, + "loss": 0.6121487617492676, + "step": 597 + }, + { + "epoch": 1.8253725640045855, + "grad_norm": 0.35079774260520935, + "learning_rate": 4.894348370484648e-07, + "loss": 0.6359960436820984, + "step": 598 + }, + { + "epoch": 1.8284294994268246, + "grad_norm": 0.29979392886161804, + "learning_rate": 4.731154091513546e-07, + "loss": 0.7085576057434082, + "step": 599 + }, + { + "epoch": 1.8314864348490638, + "grad_norm": 0.4967261850833893, + "learning_rate": 4.570660943249927e-07, + "loss": 0.6123998165130615, + "step": 600 + }, + { + "epoch": 1.8314864348490638, + "eval_loss": 0.6604031324386597, + "eval_runtime": 874.6571, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.689, + "step": 600 + }, + { + "epoch": 1.834543370271303, + "grad_norm": 0.3178945779800415, + "learning_rate": 4.412873476110702e-07, + "loss": 0.695781409740448, + "step": 601 + }, + { + "epoch": 1.8376003056935422, + "grad_norm": 0.5032989382743835, + "learning_rate": 4.2577961637994544e-07, + "loss": 0.6946380138397217, + "step": 602 + }, + { + "epoch": 1.8406572411157813, + "grad_norm": 0.5341282486915588, + "learning_rate": 4.1054334031794373e-07, + "loss": 0.6692078113555908, + "step": 603 + }, + { + "epoch": 1.8437141765380205, + "grad_norm": 0.3658231496810913, + "learning_rate": 3.955789514149022e-07, + "loss": 0.6848862767219543, + "step": 604 + }, + { + "epoch": 1.84677111196026, + "grad_norm": 0.32069069147109985, + "learning_rate": 3.808868739519167e-07, + "loss": 0.5807033777236938, + "step": 605 + }, + { + "epoch": 1.849828047382499, + "grad_norm": 0.34353893995285034, + "learning_rate": 3.6646752448931345e-07, + "loss": 0.6607818603515625, + "step": 606 + }, + { + "epoch": 1.8528849828047382, + "grad_norm": 0.3088971972465515, + "learning_rate": 3.5232131185484075e-07, + "loss": 0.5771111249923706, + "step": 607 + }, + { + "epoch": 1.8559419182269774, + "grad_norm": 0.32998737692832947, + "learning_rate": 3.3844863713207276e-07, + "loss": 0.6443166136741638, + "step": 608 + }, + { + "epoch": 1.8589988536492168, + "grad_norm": 0.32191914319992065, + "learning_rate": 3.2484989364904295e-07, + "loss": 0.6170867681503296, + "step": 609 + }, + { + "epoch": 1.862055789071456, + "grad_norm": 0.30264899134635925, + "learning_rate": 3.115254669670864e-07, + "loss": 0.7434426546096802, + "step": 610 + }, + { + "epoch": 1.8651127244936951, + "grad_norm": 0.2878584861755371, + "learning_rate": 2.984757348699152e-07, + "loss": 0.6115383505821228, + "step": 611 + }, + { + "epoch": 1.8681696599159343, + "grad_norm": 0.2602523863315582, + "learning_rate": 2.857010673529015e-07, + "loss": 0.713813304901123, + "step": 612 + }, + { + "epoch": 1.8712265953381735, + "grad_norm": 0.28921836614608765, + "learning_rate": 2.7320182661258687e-07, + "loss": 0.5810935497283936, + "step": 613 + }, + { + "epoch": 1.8742835307604127, + "grad_norm": 0.3239751160144806, + "learning_rate": 2.6097836703641856e-07, + "loss": 0.7070857882499695, + "step": 614 + }, + { + "epoch": 1.8773404661826518, + "grad_norm": 0.33824658393859863, + "learning_rate": 2.4903103519269724e-07, + "loss": 0.6979082226753235, + "step": 615 + }, + { + "epoch": 1.880397401604891, + "grad_norm": 0.3022307753562927, + "learning_rate": 2.3736016982075172e-07, + "loss": 0.6792311668395996, + "step": 616 + }, + { + "epoch": 1.8834543370271302, + "grad_norm": 0.3471018373966217, + "learning_rate": 2.2596610182133328e-07, + "loss": 0.7070050239562988, + "step": 617 + }, + { + "epoch": 1.8865112724493696, + "grad_norm": 0.2817937135696411, + "learning_rate": 2.1484915424723973e-07, + "loss": 0.8237960338592529, + "step": 618 + }, + { + "epoch": 1.8895682078716087, + "grad_norm": 0.3147852420806885, + "learning_rate": 2.0400964229414732e-07, + "loss": 0.6768534183502197, + "step": 619 + }, + { + "epoch": 1.892625143293848, + "grad_norm": 0.29942813515663147, + "learning_rate": 1.9344787329168002e-07, + "loss": 0.6888725757598877, + "step": 620 + }, + { + "epoch": 1.895682078716087, + "grad_norm": 0.4325658977031708, + "learning_rate": 1.831641466946954e-07, + "loss": 0.6199545860290527, + "step": 621 + }, + { + "epoch": 1.8987390141383265, + "grad_norm": 0.26856014132499695, + "learning_rate": 1.731587540747903e-07, + "loss": 0.6086418628692627, + "step": 622 + }, + { + "epoch": 1.9017959495605656, + "grad_norm": 0.2931425869464874, + "learning_rate": 1.6343197911203978e-07, + "loss": 0.6391353607177734, + "step": 623 + }, + { + "epoch": 1.9048528849828048, + "grad_norm": 0.3080894947052002, + "learning_rate": 1.5398409758695e-07, + "loss": 0.7076231241226196, + "step": 624 + }, + { + "epoch": 1.907909820405044, + "grad_norm": 0.306944340467453, + "learning_rate": 1.448153773726402e-07, + "loss": 0.6891772747039795, + "step": 625 + }, + { + "epoch": 1.9109667558272831, + "grad_norm": 0.27431976795196533, + "learning_rate": 1.3592607842724648e-07, + "loss": 0.6765578985214233, + "step": 626 + }, + { + "epoch": 1.9140236912495223, + "grad_norm": 0.304188072681427, + "learning_rate": 1.2731645278655448e-07, + "loss": 0.5680350065231323, + "step": 627 + }, + { + "epoch": 1.9170806266717615, + "grad_norm": 0.27153295278549194, + "learning_rate": 1.1898674455685045e-07, + "loss": 0.639629065990448, + "step": 628 + }, + { + "epoch": 1.9201375620940007, + "grad_norm": 0.28288570046424866, + "learning_rate": 1.109371899080025e-07, + "loss": 0.6656857132911682, + "step": 629 + }, + { + "epoch": 1.9231944975162398, + "grad_norm": 0.4034242331981659, + "learning_rate": 1.0316801706676038e-07, + "loss": 0.6474316716194153, + "step": 630 + }, + { + "epoch": 1.9262514329384792, + "grad_norm": 0.32141056656837463, + "learning_rate": 9.56794463102917e-08, + "loss": 0.6747321486473083, + "step": 631 + }, + { + "epoch": 1.9293083683607184, + "grad_norm": 0.28029316663742065, + "learning_rate": 8.847168995992916e-08, + "loss": 0.5827028155326843, + "step": 632 + }, + { + "epoch": 1.9323653037829576, + "grad_norm": 0.2991296648979187, + "learning_rate": 8.154495237515436e-08, + "loss": 0.6173070669174194, + "step": 633 + }, + { + "epoch": 1.9354222392051967, + "grad_norm": 0.3268067538738251, + "learning_rate": 7.489942994780452e-08, + "loss": 0.7312080264091492, + "step": 634 + }, + { + "epoch": 1.9384791746274361, + "grad_norm": 0.2985822260379791, + "learning_rate": 6.853531109650147e-08, + "loss": 0.6277808547019958, + "step": 635 + }, + { + "epoch": 1.9415361100496753, + "grad_norm": 0.3158927261829376, + "learning_rate": 6.245277626131142e-08, + "loss": 0.6355108618736267, + "step": 636 + }, + { + "epoch": 1.9445930454719145, + "grad_norm": 0.32115647196769714, + "learning_rate": 5.665199789862907e-08, + "loss": 0.6803461909294128, + "step": 637 + }, + { + "epoch": 1.9476499808941536, + "grad_norm": 0.28556641936302185, + "learning_rate": 5.113314047628493e-08, + "loss": 0.7019358277320862, + "step": 638 + }, + { + "epoch": 1.9507069163163928, + "grad_norm": 0.3105650544166565, + "learning_rate": 4.589636046888779e-08, + "loss": 0.6798080205917358, + "step": 639 + }, + { + "epoch": 1.953763851738632, + "grad_norm": 0.38109108805656433, + "learning_rate": 4.094180635338396e-08, + "loss": 0.6512711644172668, + "step": 640 + }, + { + "epoch": 1.9568207871608712, + "grad_norm": 0.585180938243866, + "learning_rate": 3.626961860484723e-08, + "loss": 0.7008385062217712, + "step": 641 + }, + { + "epoch": 1.9598777225831103, + "grad_norm": 0.32425859570503235, + "learning_rate": 3.187992969249876e-08, + "loss": 0.6602014303207397, + "step": 642 + }, + { + "epoch": 1.9629346580053495, + "grad_norm": 0.30582964420318604, + "learning_rate": 2.7772864075950036e-08, + "loss": 0.6348775029182434, + "step": 643 + }, + { + "epoch": 1.965991593427589, + "grad_norm": 0.3870945870876312, + "learning_rate": 2.3948538201672423e-08, + "loss": 0.7001971006393433, + "step": 644 + }, + { + "epoch": 1.969048528849828, + "grad_norm": 0.3087507486343384, + "learning_rate": 2.040706049970087e-08, + "loss": 0.5484102368354797, + "step": 645 + }, + { + "epoch": 1.9721054642720672, + "grad_norm": 0.3373778462409973, + "learning_rate": 1.7148531380550836e-08, + "loss": 0.59709632396698, + "step": 646 + }, + { + "epoch": 1.9751623996943064, + "grad_norm": 0.2430485486984253, + "learning_rate": 1.4173043232380557e-08, + "loss": 0.5975397229194641, + "step": 647 + }, + { + "epoch": 1.9782193351165458, + "grad_norm": 0.31908750534057617, + "learning_rate": 1.1480680418365364e-08, + "loss": 0.6337687373161316, + "step": 648 + }, + { + "epoch": 1.981276270538785, + "grad_norm": 0.31068095564842224, + "learning_rate": 9.071519274308494e-09, + "loss": 0.680358350276947, + "step": 649 + }, + { + "epoch": 1.9843332059610241, + "grad_norm": 0.3023488521575928, + "learning_rate": 6.945628106477254e-09, + "loss": 0.6560443639755249, + "step": 650 + }, + { + "epoch": 1.9843332059610241, + "eval_loss": 0.6601914763450623, + "eval_runtime": 911.7302, + "eval_samples_per_second": 0.661, + "eval_steps_per_second": 0.661, + "step": 650 + }, + { + "epoch": 1.9873901413832633, + "grad_norm": 0.5558887124061584, + "learning_rate": 5.1030671896623585e-09, + "loss": 0.6826313138008118, + "step": 651 + }, + { + "epoch": 1.9904470768055025, + "grad_norm": 0.35330796241760254, + "learning_rate": 3.5438887654737355e-09, + "loss": 0.6339641809463501, + "step": 652 + }, + { + "epoch": 1.9935040122277417, + "grad_norm": 0.2988436818122864, + "learning_rate": 2.268137040859486e-09, + "loss": 0.6657329201698303, + "step": 653 + }, + { + "epoch": 1.9965609476499808, + "grad_norm": 0.2831656038761139, + "learning_rate": 1.275848186845785e-09, + "loss": 0.6541516780853271, + "step": 654 + }, + { + "epoch": 1.99961788307222, + "grad_norm": 0.3199843764305115, + "learning_rate": 5.670503375188041e-10, + "loss": 0.6613258123397827, + "step": 655 + }, + { + "epoch": 2.0, + "grad_norm": 0.9292091131210327, + "learning_rate": 1.4176358922535216e-10, + "loss": 0.5617818832397461, + "step": 656 + } + ], + "logging_steps": 1, + "max_steps": 656, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.612414876347007e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/cpt_qwen_14B/checkpoints/checkpoint-656/training_args.bin b/cpt_qwen_14B/checkpoints/checkpoint-656/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eddbb43a2cebb928dbed6e955a37ebfa3174f4b5 --- /dev/null +++ b/cpt_qwen_14B/checkpoints/checkpoint-656/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a8e308e47eb936f678712445b19ddc52638f354c37c813ecaa432f69120a2e +size 5201 diff --git a/cpt_qwen_14B/config_resolved.yaml b/cpt_qwen_14B/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ffbcf98e6ea0041a5078252659967d45ccbc2af2 --- /dev/null +++ b/cpt_qwen_14B/config_resolved.yaml @@ -0,0 +1,65 @@ +run: + run_dir: ./runs/cpt_run_14b + seed: 42 +wandb: + enabled: true + project: cpt-training + entity: null + name: null + tags: + - cpt-lora + notes: null +model: + repo_id: /workspace/Models/Qwen2.5-Coder-14B + revision: null + base_local_dir: base_model + trust_remote_code: true + tokenizer_use_fast: true + device_map: auto + torch_dtype: bfloat16 + use_4bit: false + bnb_4bit_quant_type: nf4 + bnb_4bit_use_double_quant: false + bnb_4bit_compute_dtype: bfloat16 + attn_implementation: null +data: + train_jsonl: all_data_with_descriptions.jsonl + eval_jsonl: null + eval_split_ratio: 0.1 + text_field: text + block_size: 4096 + shuffle: true + num_proc: 4 + pack_mode: pad +peft: + enabled: true + r: 32 + lora_alpha: 64 + lora_dropout: 0.05 + bias: none + target_modules: auto +train: + num_train_epochs: 2 + per_device_train_batch_size: 1 + per_device_eval_batch_size: 1 + gradient_accumulation_steps: 16 + learning_rate: 2e-5 + weight_decay: 0.0 + warmup_ratio: 0.1 + lr_scheduler_type: cosine + optim: paged_adamw_8bit + max_grad_norm: 1.0 + gradient_checkpointing: true + logging_steps: 1 + save_strategy: steps + save_steps: 100 + save_total_limit: 7 + evaluation_strategy: steps + eval_steps: 50 + load_best_model_at_end: true + resume_from_checkpoint: auto +merge: + enabled: true + merged_dtype: float16 + max_shard_size: 2GB + output_dir: ./merged_14b_cpt_lora diff --git a/cpt_qwen_14B/eval_final.json b/cpt_qwen_14B/eval_final.json new file mode 100644 index 0000000000000000000000000000000000000000..d6f3d9f05a89df580add6b60fea4b75053baeddd --- /dev/null +++ b/cpt_qwen_14B/eval_final.json @@ -0,0 +1,8 @@ +{ + "eval_loss": 0.6604031324386597, + "eval_runtime": 870.975, + "eval_samples_per_second": 0.692, + "eval_steps_per_second": 0.692, + "epoch": 2.0, + "perplexity": 1.935572469192354 +} \ No newline at end of file diff --git a/cpt_qwen_14B/logs/eval.jsonl b/cpt_qwen_14B/logs/eval.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f93cb0906a306c159403bc9e17b6ade431c2201 --- /dev/null +++ b/cpt_qwen_14B/logs/eval.jsonl @@ -0,0 +1,14 @@ +{"ts": "2025-12-22T12:33:20", "event": "eval", "step": 50, "epoch": 0.15284677111196027, "eval_loss": 1.0129202604293823, "eval_runtime": 724.3664, "eval_samples_per_second": 0.832, "eval_steps_per_second": 0.832, "perplexity": 2.75363060355447} +{"ts": "2025-12-22T13:36:28", "event": "eval", "step": 100, "epoch": 0.30569354222392053, "eval_loss": 0.884428083896637, "eval_runtime": 723.8143, "eval_samples_per_second": 0.833, "eval_steps_per_second": 0.833, "perplexity": 2.4215990438829245} +{"ts": "2025-12-22T14:41:38", "event": "eval", "step": 150, "epoch": 0.4585403133358808, "eval_loss": 0.7979016900062561, "eval_runtime": 828.6295, "eval_samples_per_second": 0.728, "eval_steps_per_second": 0.728, "perplexity": 2.2208759497181387} +{"ts": "2025-12-22T15:56:24", "event": "eval", "step": 200, "epoch": 0.6113870844478411, "eval_loss": 0.7551760673522949, "eval_runtime": 900.209, "eval_samples_per_second": 0.67, "eval_steps_per_second": 0.67, "perplexity": 2.127986159262536} +{"ts": "2025-12-22T17:14:18", "event": "eval", "step": 250, "epoch": 0.7642338555598013, "eval_loss": 0.7269901633262634, "eval_runtime": 877.665, "eval_samples_per_second": 0.687, "eval_steps_per_second": 0.687, "perplexity": 2.068844343736059} +{"ts": "2025-12-22T18:32:07", "event": "eval", "step": 300, "epoch": 0.9170806266717616, "eval_loss": 0.7063615918159485, "eval_runtime": 882.246, "eval_samples_per_second": 0.683, "eval_steps_per_second": 0.683, "perplexity": 2.0266042148943706} +{"ts": "2025-12-22T19:48:26", "event": "eval", "step": 350, "epoch": 1.0672525792892624, "eval_loss": 0.6917262673377991, "eval_runtime": 874.9693, "eval_samples_per_second": 0.689, "eval_steps_per_second": 0.689, "perplexity": 1.9971601915941601} +{"ts": "2025-12-22T21:05:51", "event": "eval", "step": 400, "epoch": 1.2200993504012227, "eval_loss": 0.6789794564247131, "eval_runtime": 875.5101, "eval_samples_per_second": 0.689, "eval_steps_per_second": 0.689, "perplexity": 1.971864331548731} +{"ts": "2025-12-22T22:24:28", "event": "eval", "step": 450, "epoch": 1.372946121513183, "eval_loss": 0.6703284978866577, "eval_runtime": 907.8663, "eval_samples_per_second": 0.664, "eval_steps_per_second": 0.664, "perplexity": 1.9548793889190592} +{"ts": "2025-12-22T23:41:59", "event": "eval", "step": 500, "epoch": 1.5257928926251432, "eval_loss": 0.6648170948028564, "eval_runtime": 870.3243, "eval_samples_per_second": 0.693, "eval_steps_per_second": 0.693, "perplexity": 1.9441348964384946} +{"ts": "2025-12-23T00:58:58", "event": "eval", "step": 550, "epoch": 1.6786396637371035, "eval_loss": 0.6616687178611755, "eval_runtime": 875.9833, "eval_samples_per_second": 0.688, "eval_steps_per_second": 0.688, "perplexity": 1.9380236522571912} +{"ts": "2025-12-23T02:16:08", "event": "eval", "step": 600, "epoch": 1.8314864348490638, "eval_loss": 0.6604031324386597, "eval_runtime": 874.6571, "eval_samples_per_second": 0.689, "eval_steps_per_second": 0.689, "perplexity": 1.935572469192354} +{"ts": "2025-12-23T03:34:49", "event": "eval", "step": 650, "epoch": 1.9843332059610241, "eval_loss": 0.6601914763450623, "eval_runtime": 911.7302, "eval_samples_per_second": 0.661, "eval_steps_per_second": 0.661, "perplexity": 1.9351628368367713} +{"ts": "2025-12-23T03:55:46", "event": "eval", "step": 656, "epoch": 2.0, "eval_loss": 0.6604031324386597, "eval_runtime": 870.975, "eval_samples_per_second": 0.692, "eval_steps_per_second": 0.692, "perplexity": 1.935572469192354} diff --git a/cpt_qwen_14B/logs/train.jsonl b/cpt_qwen_14B/logs/train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb849c3b471cad10cb7fee5dec4a0b2b4f8161b0 --- /dev/null +++ b/cpt_qwen_14B/logs/train.jsonl @@ -0,0 +1,670 @@ +{"ts": "2025-12-22T11:31:39", "event": "train_log", "step": 1, "epoch": 0.003056935422239205, "progress_pct": 0.15, "epoch_pct": 0.15, "eta": "06:48:17", "max_grad_norm": 1.0, "loss": 1.138384461402893, "grad_norm": 0.06516239047050476, "learning_rate": 0.0} +{"ts": "2025-12-22T11:32:16", "event": "train_log", "step": 2, "epoch": 0.00611387084447841, "progress_pct": 0.3, "epoch_pct": 0.31, "eta": "06:43:34", "max_grad_norm": 1.0, "loss": 0.983342707157135, "grad_norm": 0.05343673378229141, "learning_rate": 3.0303030303030305e-07} +{"ts": "2025-12-22T11:32:57", "event": "train_log", "step": 3, "epoch": 0.009170806266717615, "progress_pct": 0.46, "epoch_pct": 0.46, "eta": "06:55:50", "max_grad_norm": 1.0, "loss": 1.0762118101119995, "grad_norm": 0.05608418956398964, "learning_rate": 6.060606060606061e-07} +{"ts": "2025-12-22T11:34:01", "event": "train_log", "step": 4, "epoch": 0.01222774168895682, "progress_pct": 0.61, "epoch_pct": 0.61, "eta": "08:04:58", "max_grad_norm": 1.0, "loss": 1.084489345550537, "grad_norm": 0.06523486226797104, "learning_rate": 9.090909090909091e-07} +{"ts": "2025-12-22T11:35:02", "event": "train_log", "step": 5, "epoch": 0.015284677111196026, "progress_pct": 0.76, "epoch_pct": 0.76, "eta": "08:41:14", "max_grad_norm": 1.0, "loss": 1.2037022113800049, "grad_norm": 0.06582186371088028, "learning_rate": 1.2121212121212122e-06} +{"ts": "2025-12-22T11:36:05", "event": "train_log", "step": 6, "epoch": 0.01834161253343523, "progress_pct": 0.91, "epoch_pct": 0.92, "eta": "09:06:26", "max_grad_norm": 1.0, "loss": 1.10005784034729, "grad_norm": 0.06097998470067978, "learning_rate": 1.5151515151515152e-06} +{"ts": "2025-12-22T11:37:05", "event": "train_log", "step": 7, "epoch": 0.021398547955674436, "progress_pct": 1.07, "epoch_pct": 1.07, "eta": "09:21:09", "max_grad_norm": 1.0, "loss": 1.0895193815231323, "grad_norm": 0.10365528613328934, "learning_rate": 1.8181818181818183e-06} +{"ts": "2025-12-22T11:38:07", "event": "train_log", "step": 8, "epoch": 0.02445548337791364, "progress_pct": 1.22, "epoch_pct": 1.22, "eta": "09:33:13", "max_grad_norm": 1.0, "loss": 1.0593242645263672, "grad_norm": 0.06312141567468643, "learning_rate": 2.1212121212121216e-06} +{"ts": "2025-12-22T11:39:09", "event": "train_log", "step": 9, "epoch": 0.027512418800152847, "progress_pct": 1.37, "epoch_pct": 1.38, "eta": "09:43:06", "max_grad_norm": 1.0, "loss": 0.9772955179214478, "grad_norm": 0.05508403480052948, "learning_rate": 2.4242424242424244e-06} +{"ts": "2025-12-22T11:40:08", "event": "train_log", "step": 10, "epoch": 0.030569354222392053, "progress_pct": 1.52, "epoch_pct": 1.53, "eta": "09:47:56", "max_grad_norm": 1.0, "loss": 1.084238886833191, "grad_norm": 0.06006711348891258, "learning_rate": 2.7272727272727272e-06} +{"ts": "2025-12-22T11:41:12", "event": "train_log", "step": 11, "epoch": 0.033626289644631255, "progress_pct": 1.68, "epoch_pct": 1.68, "eta": "09:56:04", "max_grad_norm": 1.0, "loss": 1.0786534547805786, "grad_norm": 0.0588749423623085, "learning_rate": 3.0303030303030305e-06} +{"ts": "2025-12-22T11:42:11", "event": "train_log", "step": 12, "epoch": 0.03668322506687046, "progress_pct": 1.83, "epoch_pct": 1.83, "eta": "09:58:34", "max_grad_norm": 1.0, "loss": 1.0370622873306274, "grad_norm": 0.046551357954740524, "learning_rate": 3.3333333333333333e-06} +{"ts": "2025-12-22T11:43:15", "event": "train_log", "step": 13, "epoch": 0.039740160489109666, "progress_pct": 1.98, "epoch_pct": 1.99, "eta": "10:04:05", "max_grad_norm": 1.0, "loss": 1.0646986961364746, "grad_norm": 0.061659567058086395, "learning_rate": 3.6363636363636366e-06} +{"ts": "2025-12-22T11:44:16", "event": "train_log", "step": 14, "epoch": 0.04279709591134887, "progress_pct": 2.13, "epoch_pct": 2.14, "eta": "10:06:35", "max_grad_norm": 1.0, "loss": 1.0311307907104492, "grad_norm": 0.06007347255945206, "learning_rate": 3.93939393939394e-06} +{"ts": "2025-12-22T11:45:17", "event": "train_log", "step": 15, "epoch": 0.04585403133358808, "progress_pct": 2.29, "epoch_pct": 2.29, "eta": "10:09:15", "max_grad_norm": 1.0, "loss": 1.1300500631332397, "grad_norm": 0.07314135134220123, "learning_rate": 4.242424242424243e-06} +{"ts": "2025-12-22T11:46:18", "event": "train_log", "step": 16, "epoch": 0.04891096675582728, "progress_pct": 2.44, "epoch_pct": 2.45, "eta": "10:10:47", "max_grad_norm": 1.0, "loss": 1.0197452306747437, "grad_norm": 0.060934022068977356, "learning_rate": 4.5454545454545455e-06} +{"ts": "2025-12-22T11:47:20", "event": "train_log", "step": 17, "epoch": 0.05196790217806649, "progress_pct": 2.59, "epoch_pct": 2.6, "eta": "10:12:48", "max_grad_norm": 1.0, "loss": 1.0438549518585205, "grad_norm": 0.056856051087379456, "learning_rate": 4.848484848484849e-06} +{"ts": "2025-12-22T11:48:21", "event": "train_log", "step": 18, "epoch": 0.055024837600305694, "progress_pct": 2.74, "epoch_pct": 2.75, "eta": "10:13:50", "max_grad_norm": 1.0, "loss": 1.0398856401443481, "grad_norm": 0.05908689647912979, "learning_rate": 5.151515151515152e-06} +{"ts": "2025-12-22T11:49:24", "event": "train_log", "step": 19, "epoch": 0.0580817730225449, "progress_pct": 2.9, "epoch_pct": 2.9, "eta": "10:15:45", "max_grad_norm": 1.0, "loss": 1.107885479927063, "grad_norm": 0.07411840558052063, "learning_rate": 5.4545454545454545e-06} +{"ts": "2025-12-22T11:50:26", "event": "train_log", "step": 20, "epoch": 0.061138708444784105, "progress_pct": 3.05, "epoch_pct": 3.06, "eta": "10:16:38", "max_grad_norm": 1.0, "loss": 1.1060967445373535, "grad_norm": 0.0749165341258049, "learning_rate": 5.7575757575757586e-06} +{"ts": "2025-12-22T11:51:29", "event": "train_log", "step": 21, "epoch": 0.06419564386702331, "progress_pct": 3.2, "epoch_pct": 3.21, "eta": "10:18:05", "max_grad_norm": 1.0, "loss": 1.0471720695495605, "grad_norm": 0.06720177084207535, "learning_rate": 6.060606060606061e-06} +{"ts": "2025-12-22T11:52:31", "event": "train_log", "step": 22, "epoch": 0.06725257928926251, "progress_pct": 3.35, "epoch_pct": 3.36, "eta": "10:19:12", "max_grad_norm": 1.0, "loss": 1.0944981575012207, "grad_norm": 0.05990725755691528, "learning_rate": 6.363636363636364e-06} +{"ts": "2025-12-22T11:53:32", "event": "train_log", "step": 23, "epoch": 0.07030951471150172, "progress_pct": 3.51, "epoch_pct": 3.52, "eta": "10:19:12", "max_grad_norm": 1.0, "loss": 1.1477092504501343, "grad_norm": 0.06672193855047226, "learning_rate": 6.666666666666667e-06} +{"ts": "2025-12-22T11:54:32", "event": "train_log", "step": 24, "epoch": 0.07336645013374092, "progress_pct": 3.66, "epoch_pct": 3.67, "eta": "10:18:57", "max_grad_norm": 1.0, "loss": 1.0591784715652466, "grad_norm": 0.06145205348730087, "learning_rate": 6.969696969696971e-06} +{"ts": "2025-12-22T11:55:35", "event": "train_log", "step": 25, "epoch": 0.07642338555598013, "progress_pct": 3.81, "epoch_pct": 3.82, "eta": "10:19:27", "max_grad_norm": 1.0, "loss": 1.0500165224075317, "grad_norm": 0.0757482647895813, "learning_rate": 7.272727272727273e-06} +{"ts": "2025-12-22T11:56:35", "event": "train_log", "step": 26, "epoch": 0.07948032097821933, "progress_pct": 3.96, "epoch_pct": 3.97, "eta": "10:18:59", "max_grad_norm": 1.0, "loss": 1.0747522115707397, "grad_norm": 0.07848478108644485, "learning_rate": 7.5757575757575764e-06} +{"ts": "2025-12-22T11:57:38", "event": "train_log", "step": 27, "epoch": 0.08253725640045854, "progress_pct": 4.12, "epoch_pct": 4.13, "eta": "10:19:36", "max_grad_norm": 1.0, "loss": 1.132310152053833, "grad_norm": 0.07740631699562073, "learning_rate": 7.87878787878788e-06} +{"ts": "2025-12-22T11:58:39", "event": "train_log", "step": 28, "epoch": 0.08559419182269774, "progress_pct": 4.27, "epoch_pct": 4.28, "eta": "10:19:30", "max_grad_norm": 1.0, "loss": 1.0339502096176147, "grad_norm": 0.07476603239774704, "learning_rate": 8.181818181818183e-06} +{"ts": "2025-12-22T11:59:42", "event": "train_log", "step": 29, "epoch": 0.08865112724493696, "progress_pct": 4.42, "epoch_pct": 4.43, "eta": "10:19:44", "max_grad_norm": 1.0, "loss": 1.1047282218933105, "grad_norm": 0.0779196098446846, "learning_rate": 8.484848484848486e-06} +{"ts": "2025-12-22T12:00:44", "event": "train_log", "step": 30, "epoch": 0.09170806266717615, "progress_pct": 4.57, "epoch_pct": 4.59, "eta": "10:19:42", "max_grad_norm": 1.0, "loss": 1.004916787147522, "grad_norm": 0.06962384283542633, "learning_rate": 8.787878787878788e-06} +{"ts": "2025-12-22T12:01:46", "event": "train_log", "step": 31, "epoch": 0.09476499808941537, "progress_pct": 4.73, "epoch_pct": 4.74, "eta": "10:19:27", "max_grad_norm": 1.0, "loss": 0.9296417832374573, "grad_norm": 0.06369175016880035, "learning_rate": 9.090909090909091e-06} +{"ts": "2025-12-22T12:02:46", "event": "train_log", "step": 32, "epoch": 0.09782193351165457, "progress_pct": 4.88, "epoch_pct": 4.89, "eta": "10:18:49", "max_grad_norm": 1.0, "loss": 1.0721708536148071, "grad_norm": 0.07470260560512543, "learning_rate": 9.393939393939396e-06} +{"ts": "2025-12-22T12:03:49", "event": "train_log", "step": 33, "epoch": 0.10087886893389378, "progress_pct": 5.03, "epoch_pct": 5.04, "eta": "10:19:01", "max_grad_norm": 1.0, "loss": 1.0350117683410645, "grad_norm": 0.07948213815689087, "learning_rate": 9.696969696969698e-06} +{"ts": "2025-12-22T12:04:49", "event": "train_log", "step": 34, "epoch": 0.10393580435613298, "progress_pct": 5.18, "epoch_pct": 5.2, "eta": "10:18:06", "max_grad_norm": 1.0, "loss": 1.026305913925171, "grad_norm": 0.07066022604703903, "learning_rate": 1e-05} +{"ts": "2025-12-22T12:05:53", "event": "train_log", "step": 35, "epoch": 0.10699273977837218, "progress_pct": 5.34, "epoch_pct": 5.35, "eta": "10:18:16", "max_grad_norm": 1.0, "loss": 1.0509816408157349, "grad_norm": 0.07774543762207031, "learning_rate": 1.0303030303030304e-05} +{"ts": "2025-12-22T12:06:54", "event": "train_log", "step": 36, "epoch": 0.11004967520061139, "progress_pct": 5.49, "epoch_pct": 5.5, "eta": "10:17:48", "max_grad_norm": 1.0, "loss": 1.0011574029922485, "grad_norm": 0.07501248270273209, "learning_rate": 1.0606060606060606e-05} +{"ts": "2025-12-22T12:07:56", "event": "train_log", "step": 37, "epoch": 0.11310661062285059, "progress_pct": 5.64, "epoch_pct": 5.66, "eta": "10:17:15", "max_grad_norm": 1.0, "loss": 0.9754424691200256, "grad_norm": 0.6622501611709595, "learning_rate": 1.0909090909090909e-05} +{"ts": "2025-12-22T12:08:57", "event": "train_log", "step": 38, "epoch": 0.1161635460450898, "progress_pct": 5.79, "epoch_pct": 5.81, "eta": "10:16:40", "max_grad_norm": 1.0, "loss": 1.0342774391174316, "grad_norm": 0.07566080242395401, "learning_rate": 1.1212121212121212e-05} +{"ts": "2025-12-22T12:09:59", "event": "train_log", "step": 39, "epoch": 0.119220481467329, "progress_pct": 5.95, "epoch_pct": 5.96, "eta": "10:16:15", "max_grad_norm": 1.0, "loss": 0.9714518785476685, "grad_norm": 0.07573831081390381, "learning_rate": 1.1515151515151517e-05} +{"ts": "2025-12-22T12:11:01", "event": "train_log", "step": 40, "epoch": 0.12227741688956821, "progress_pct": 6.1, "epoch_pct": 6.11, "eta": "10:15:36", "max_grad_norm": 1.0, "loss": 1.1050316095352173, "grad_norm": 0.08083852380514145, "learning_rate": 1.181818181818182e-05} +{"ts": "2025-12-22T12:12:03", "event": "train_log", "step": 41, "epoch": 0.12533435231180742, "progress_pct": 6.25, "epoch_pct": 6.27, "eta": "10:15:14", "max_grad_norm": 1.0, "loss": 1.0871070623397827, "grad_norm": 0.08540588617324829, "learning_rate": 1.2121212121212122e-05} +{"ts": "2025-12-22T12:13:02", "event": "train_log", "step": 42, "epoch": 0.12839128773404662, "progress_pct": 6.4, "epoch_pct": 6.42, "eta": "10:14:04", "max_grad_norm": 1.0, "loss": 1.0206722021102905, "grad_norm": 0.07391592115163803, "learning_rate": 1.2424242424242425e-05} +{"ts": "2025-12-22T12:14:02", "event": "train_log", "step": 43, "epoch": 0.13144822315628582, "progress_pct": 6.55, "epoch_pct": 6.57, "eta": "10:13:05", "max_grad_norm": 1.0, "loss": 0.9775047898292542, "grad_norm": 0.07063689082860947, "learning_rate": 1.2727272727272728e-05} +{"ts": "2025-12-22T12:15:06", "event": "train_log", "step": 44, "epoch": 0.13450515857852502, "progress_pct": 6.71, "epoch_pct": 6.73, "eta": "10:12:52", "max_grad_norm": 1.0, "loss": 1.1132858991622925, "grad_norm": 0.07288888841867447, "learning_rate": 1.3030303030303032e-05} +{"ts": "2025-12-22T12:16:07", "event": "train_log", "step": 45, "epoch": 0.13756209400076425, "progress_pct": 6.86, "epoch_pct": 6.88, "eta": "10:12:13", "max_grad_norm": 1.0, "loss": 1.0707701444625854, "grad_norm": 0.07641777396202087, "learning_rate": 1.3333333333333333e-05} +{"ts": "2025-12-22T12:17:09", "event": "train_log", "step": 46, "epoch": 0.14061902942300344, "progress_pct": 7.01, "epoch_pct": 7.03, "eta": "10:11:28", "max_grad_norm": 1.0, "loss": 0.9328265190124512, "grad_norm": 0.06990326195955276, "learning_rate": 1.3636363636363637e-05} +{"ts": "2025-12-22T12:18:10", "event": "train_log", "step": 47, "epoch": 0.14367596484524264, "progress_pct": 7.16, "epoch_pct": 7.18, "eta": "10:10:40", "max_grad_norm": 1.0, "loss": 1.0131721496582031, "grad_norm": 0.0834241658449173, "learning_rate": 1.3939393939393942e-05} +{"ts": "2025-12-22T12:19:12", "event": "train_log", "step": 48, "epoch": 0.14673290026748184, "progress_pct": 7.32, "epoch_pct": 7.34, "eta": "10:10:06", "max_grad_norm": 1.0, "loss": 0.940493106842041, "grad_norm": 0.0714937075972557, "learning_rate": 1.4242424242424245e-05} +{"ts": "2025-12-22T12:20:13", "event": "train_log", "step": 49, "epoch": 0.14978983568972107, "progress_pct": 7.47, "epoch_pct": 7.49, "eta": "10:09:19", "max_grad_norm": 1.0, "loss": 1.0435771942138672, "grad_norm": 0.07770547270774841, "learning_rate": 1.4545454545454546e-05} +{"ts": "2025-12-22T12:21:16", "event": "train_log", "step": 50, "epoch": 0.15284677111196027, "progress_pct": 7.62, "epoch_pct": 7.64, "eta": "10:08:45", "max_grad_norm": 1.0, "loss": 1.0382137298583984, "grad_norm": 0.07950945198535919, "learning_rate": 1.484848484848485e-05} +{"ts": "2025-12-22T12:33:20", "event": "train_log", "step": 50, "epoch": 0.15284677111196027, "progress_pct": 7.62, "epoch_pct": 7.64, "eta": "12:35:05", "max_grad_norm": 1.0, "eval_loss": 1.0129202604293823, "eval_runtime": 724.3664, "eval_samples_per_second": 0.832, "eval_steps_per_second": 0.832} +{"ts": "2025-12-22T12:34:23", "event": "train_log", "step": 51, "epoch": 0.15590370653419947, "progress_pct": 7.77, "epoch_pct": 7.8, "eta": "12:31:29", "max_grad_norm": 1.0, "loss": 0.9690049886703491, "grad_norm": 0.06961936503648758, "learning_rate": 1.5151515151515153e-05} +{"ts": "2025-12-22T12:35:25", "event": "train_log", "step": 52, "epoch": 0.15896064195643866, "progress_pct": 7.93, "epoch_pct": 7.95, "eta": "12:27:44", "max_grad_norm": 1.0, "loss": 0.9830482006072998, "grad_norm": 0.069523885846138, "learning_rate": 1.5454545454545454e-05} +{"ts": "2025-12-22T12:36:27", "event": "train_log", "step": 53, "epoch": 0.16201757737867786, "progress_pct": 8.08, "epoch_pct": 8.1, "eta": "12:24:15", "max_grad_norm": 1.0, "loss": 1.0895472764968872, "grad_norm": 0.0764622762799263, "learning_rate": 1.575757575757576e-05} +{"ts": "2025-12-22T12:37:29", "event": "train_log", "step": 54, "epoch": 0.1650745128009171, "progress_pct": 8.23, "epoch_pct": 8.25, "eta": "12:20:44", "max_grad_norm": 1.0, "loss": 1.0354574918746948, "grad_norm": 0.1413721889257431, "learning_rate": 1.606060606060606e-05} +{"ts": "2025-12-22T12:38:31", "event": "train_log", "step": 55, "epoch": 0.1681314482231563, "progress_pct": 8.38, "epoch_pct": 8.41, "eta": "12:17:18", "max_grad_norm": 1.0, "loss": 0.8534265160560608, "grad_norm": 0.06818042695522308, "learning_rate": 1.6363636363636366e-05} +{"ts": "2025-12-22T12:39:31", "event": "train_log", "step": 56, "epoch": 0.1711883836453955, "progress_pct": 8.54, "epoch_pct": 8.56, "eta": "12:13:39", "max_grad_norm": 1.0, "loss": 0.9580274820327759, "grad_norm": 0.0722246989607811, "learning_rate": 1.6666666666666667e-05} +{"ts": "2025-12-22T12:40:31", "event": "train_log", "step": 57, "epoch": 0.17424531906763469, "progress_pct": 8.69, "epoch_pct": 8.71, "eta": "12:10:05", "max_grad_norm": 1.0, "loss": 1.0721848011016846, "grad_norm": 0.07113443315029144, "learning_rate": 1.6969696969696972e-05} +{"ts": "2025-12-22T12:41:30", "event": "train_log", "step": 58, "epoch": 0.1773022544898739, "progress_pct": 8.84, "epoch_pct": 8.87, "eta": "12:06:36", "max_grad_norm": 1.0, "loss": 1.1180150508880615, "grad_norm": 0.08412107080221176, "learning_rate": 1.7272727272727274e-05} +{"ts": "2025-12-22T12:42:30", "event": "train_log", "step": 59, "epoch": 0.1803591899121131, "progress_pct": 8.99, "epoch_pct": 9.02, "eta": "12:03:13", "max_grad_norm": 1.0, "loss": 1.0384547710418701, "grad_norm": 0.07381036877632141, "learning_rate": 1.7575757575757576e-05} +{"ts": "2025-12-22T12:43:31", "event": "train_log", "step": 60, "epoch": 0.1834161253343523, "progress_pct": 9.15, "epoch_pct": 9.17, "eta": "11:59:56", "max_grad_norm": 1.0, "loss": 1.0446016788482666, "grad_norm": 0.07089001685380936, "learning_rate": 1.787878787878788e-05} +{"ts": "2025-12-22T12:44:31", "event": "train_log", "step": 61, "epoch": 0.1864730607565915, "progress_pct": 9.3, "epoch_pct": 9.32, "eta": "11:56:44", "max_grad_norm": 1.0, "loss": 1.0015051364898682, "grad_norm": 0.11576953530311584, "learning_rate": 1.8181818181818182e-05} +{"ts": "2025-12-22T12:45:31", "event": "train_log", "step": 62, "epoch": 0.18952999617883073, "progress_pct": 9.45, "epoch_pct": 9.48, "eta": "11:53:33", "max_grad_norm": 1.0, "loss": 0.9642710089683533, "grad_norm": 0.08030868321657181, "learning_rate": 1.8484848484848487e-05} +{"ts": "2025-12-22T12:46:31", "event": "train_log", "step": 63, "epoch": 0.19258693160106993, "progress_pct": 9.6, "epoch_pct": 9.63, "eta": "11:50:28", "max_grad_norm": 1.0, "loss": 1.0722991228103638, "grad_norm": 0.08332342654466629, "learning_rate": 1.8787878787878792e-05} +{"ts": "2025-12-22T12:47:31", "event": "train_log", "step": 64, "epoch": 0.19564386702330913, "progress_pct": 9.76, "epoch_pct": 9.78, "eta": "11:47:26", "max_grad_norm": 1.0, "loss": 1.0104647874832153, "grad_norm": 0.08000365644693375, "learning_rate": 1.9090909090909094e-05} +{"ts": "2025-12-22T12:48:31", "event": "train_log", "step": 65, "epoch": 0.19870080244554833, "progress_pct": 9.91, "epoch_pct": 9.94, "eta": "11:44:28", "max_grad_norm": 1.0, "loss": 0.9445061087608337, "grad_norm": 0.08139508217573166, "learning_rate": 1.9393939393939395e-05} +{"ts": "2025-12-22T12:49:31", "event": "train_log", "step": 66, "epoch": 0.20175773786778756, "progress_pct": 10.06, "epoch_pct": 10.09, "eta": "11:41:33", "max_grad_norm": 1.0, "loss": 1.080810308456421, "grad_norm": 0.08749893307685852, "learning_rate": 1.96969696969697e-05} +{"ts": "2025-12-22T12:50:31", "event": "train_log", "step": 67, "epoch": 0.20481467329002676, "progress_pct": 10.21, "epoch_pct": 10.24, "eta": "11:38:45", "max_grad_norm": 1.0, "loss": 0.9705753922462463, "grad_norm": 0.0786912813782692, "learning_rate": 2e-05} +{"ts": "2025-12-22T12:51:31", "event": "train_log", "step": 68, "epoch": 0.20787160871226595, "progress_pct": 10.37, "epoch_pct": 10.39, "eta": "11:35:59", "max_grad_norm": 1.0, "loss": 0.962783694267273, "grad_norm": 0.08962028473615646, "learning_rate": 1.9999858236410775e-05} +{"ts": "2025-12-22T12:52:31", "event": "train_log", "step": 69, "epoch": 0.21092854413450515, "progress_pct": 10.52, "epoch_pct": 10.55, "eta": "11:33:15", "max_grad_norm": 1.0, "loss": 0.9959614872932434, "grad_norm": 0.08402887731790543, "learning_rate": 1.9999432949662483e-05} +{"ts": "2025-12-22T12:53:31", "event": "train_log", "step": 70, "epoch": 0.21398547955674435, "progress_pct": 10.67, "epoch_pct": 10.7, "eta": "11:30:32", "max_grad_norm": 1.0, "loss": 0.9569960832595825, "grad_norm": 0.08036444336175919, "learning_rate": 1.9998724151813157e-05} +{"ts": "2025-12-22T12:54:33", "event": "train_log", "step": 71, "epoch": 0.21704241497898358, "progress_pct": 10.82, "epoch_pct": 10.85, "eta": "11:28:06", "max_grad_norm": 1.0, "loss": 1.0012171268463135, "grad_norm": 0.08247046917676926, "learning_rate": 1.9997731862959143e-05} +{"ts": "2025-12-22T12:55:34", "event": "train_log", "step": 72, "epoch": 0.22009935040122278, "progress_pct": 10.98, "epoch_pct": 11.0, "eta": "11:25:36", "max_grad_norm": 1.0, "loss": 1.0403809547424316, "grad_norm": 0.08966264873743057, "learning_rate": 1.999645611123453e-05} +{"ts": "2025-12-22T12:56:39", "event": "train_log", "step": 73, "epoch": 0.22315628582346198, "progress_pct": 11.13, "epoch_pct": 11.16, "eta": "11:23:48", "max_grad_norm": 1.0, "loss": 1.0089740753173828, "grad_norm": 0.08061660826206207, "learning_rate": 1.999489693281034e-05} +{"ts": "2025-12-22T12:57:41", "event": "train_log", "step": 74, "epoch": 0.22621322124570117, "progress_pct": 11.28, "epoch_pct": 11.31, "eta": "11:21:30", "max_grad_norm": 1.0, "loss": 0.9333044290542603, "grad_norm": 0.09005365520715714, "learning_rate": 1.9993054371893526e-05} +{"ts": "2025-12-22T12:58:43", "event": "train_log", "step": 75, "epoch": 0.2292701566679404, "progress_pct": 11.43, "epoch_pct": 11.46, "eta": "11:19:13", "max_grad_norm": 1.0, "loss": 0.9284015893936157, "grad_norm": 0.08651519566774368, "learning_rate": 1.9990928480725694e-05} +{"ts": "2025-12-22T12:59:44", "event": "train_log", "step": 76, "epoch": 0.2323270920901796, "progress_pct": 11.59, "epoch_pct": 11.62, "eta": "11:16:52", "max_grad_norm": 1.0, "loss": 0.9782730340957642, "grad_norm": 0.08141147345304489, "learning_rate": 1.9988519319581637e-05} +{"ts": "2025-12-22T13:00:47", "event": "train_log", "step": 77, "epoch": 0.2353840275124188, "progress_pct": 11.74, "epoch_pct": 11.77, "eta": "11:14:50", "max_grad_norm": 1.0, "loss": 0.9723064303398132, "grad_norm": 0.08344405144453049, "learning_rate": 1.998582695676762e-05} +{"ts": "2025-12-22T13:01:46", "event": "train_log", "step": 78, "epoch": 0.238440962934658, "progress_pct": 11.89, "epoch_pct": 11.92, "eta": "11:12:23", "max_grad_norm": 1.0, "loss": 0.9648997783660889, "grad_norm": 0.08019903302192688, "learning_rate": 1.998285146861945e-05} +{"ts": "2025-12-22T13:02:50", "event": "train_log", "step": 79, "epoch": 0.24149789835689722, "progress_pct": 12.04, "epoch_pct": 12.07, "eta": "11:10:27", "max_grad_norm": 1.0, "loss": 0.9263214468955994, "grad_norm": 0.08113416284322739, "learning_rate": 1.99795929395003e-05} +{"ts": "2025-12-22T13:03:52", "event": "train_log", "step": 80, "epoch": 0.24455483377913642, "progress_pct": 12.2, "epoch_pct": 12.23, "eta": "11:08:20", "max_grad_norm": 1.0, "loss": 0.8745232224464417, "grad_norm": 0.08127513527870178, "learning_rate": 1.997605146179833e-05} +{"ts": "2025-12-22T13:04:53", "event": "train_log", "step": 81, "epoch": 0.24761176920137562, "progress_pct": 12.35, "epoch_pct": 12.38, "eta": "11:06:10", "max_grad_norm": 1.0, "loss": 0.8722782135009766, "grad_norm": 0.09934187680482864, "learning_rate": 1.997222713592405e-05} +{"ts": "2025-12-22T13:05:54", "event": "train_log", "step": 82, "epoch": 0.25066870462361485, "progress_pct": 12.5, "epoch_pct": 12.53, "eta": "11:04:03", "max_grad_norm": 1.0, "loss": 1.0084266662597656, "grad_norm": 0.09701363742351532, "learning_rate": 1.9968120070307503e-05} +{"ts": "2025-12-22T13:06:56", "event": "train_log", "step": 83, "epoch": 0.253725640045854, "progress_pct": 12.65, "epoch_pct": 12.69, "eta": "11:02:01", "max_grad_norm": 1.0, "loss": 0.9239332675933838, "grad_norm": 0.08335654437541962, "learning_rate": 1.9963730381395154e-05} +{"ts": "2025-12-22T13:07:57", "event": "train_log", "step": 84, "epoch": 0.25678257546809324, "progress_pct": 12.8, "epoch_pct": 12.84, "eta": "10:59:58", "max_grad_norm": 1.0, "loss": 0.9878032207489014, "grad_norm": 0.09161650389432907, "learning_rate": 1.9959058193646618e-05} +{"ts": "2025-12-22T13:08:59", "event": "train_log", "step": 85, "epoch": 0.2598395108903324, "progress_pct": 12.96, "epoch_pct": 12.99, "eta": "10:57:59", "max_grad_norm": 1.0, "loss": 0.9113098382949829, "grad_norm": 0.08067663013935089, "learning_rate": 1.9954103639531116e-05} +{"ts": "2025-12-22T13:09:58", "event": "train_log", "step": 86, "epoch": 0.26289644631257164, "progress_pct": 13.11, "epoch_pct": 13.14, "eta": "10:55:45", "max_grad_norm": 1.0, "loss": 0.9527600407600403, "grad_norm": 0.09619539976119995, "learning_rate": 1.9948866859523717e-05} +{"ts": "2025-12-22T13:11:02", "event": "train_log", "step": 87, "epoch": 0.26595338173481087, "progress_pct": 13.26, "epoch_pct": 13.3, "eta": "10:54:03", "max_grad_norm": 1.0, "loss": 0.9569152593612671, "grad_norm": 0.10015493631362915, "learning_rate": 1.9943348002101374e-05} +{"ts": "2025-12-22T13:12:02", "event": "train_log", "step": 88, "epoch": 0.26901031715705004, "progress_pct": 13.41, "epoch_pct": 13.45, "eta": "10:51:50", "max_grad_norm": 1.0, "loss": 0.8912045359611511, "grad_norm": 0.09012345969676971, "learning_rate": 1.993754722373869e-05} +{"ts": "2025-12-22T13:13:05", "event": "train_log", "step": 89, "epoch": 0.27206725257928926, "progress_pct": 13.57, "epoch_pct": 13.6, "eta": "10:50:08", "max_grad_norm": 1.0, "loss": 0.856104850769043, "grad_norm": 0.10342805832624435, "learning_rate": 1.9931464688903502e-05} +{"ts": "2025-12-22T13:14:06", "event": "train_log", "step": 90, "epoch": 0.2751241880015285, "progress_pct": 13.72, "epoch_pct": 13.76, "eta": "10:48:12", "max_grad_norm": 1.0, "loss": 0.9631397128105164, "grad_norm": 0.10218493640422821, "learning_rate": 1.9925100570052194e-05} +{"ts": "2025-12-22T13:15:08", "event": "train_log", "step": 91, "epoch": 0.27818112342376766, "progress_pct": 13.87, "epoch_pct": 13.91, "eta": "10:46:17", "max_grad_norm": 1.0, "loss": 0.8532565236091614, "grad_norm": 0.10909046977758408, "learning_rate": 1.9918455047624847e-05} +{"ts": "2025-12-22T13:16:09", "event": "train_log", "step": 92, "epoch": 0.2812380588460069, "progress_pct": 14.02, "epoch_pct": 14.06, "eta": "10:44:22", "max_grad_norm": 1.0, "loss": 0.9691859483718872, "grad_norm": 0.10714197903871536, "learning_rate": 1.9911528310040073e-05} +{"ts": "2025-12-22T13:17:11", "event": "train_log", "step": 93, "epoch": 0.28429499426824606, "progress_pct": 14.18, "epoch_pct": 14.21, "eta": "10:42:36", "max_grad_norm": 1.0, "loss": 0.9374334812164307, "grad_norm": 0.1108694076538086, "learning_rate": 1.990432055368971e-05} +{"ts": "2025-12-22T13:20:17", "event": "train_log", "step": 96, "epoch": 0.2934658005349637, "progress_pct": 14.63, "epoch_pct": 14.67, "eta": "10:37:16", "max_grad_norm": 1.0, "loss": 1.00413179397583, "grad_norm": 0.09954962879419327, "learning_rate": 1.9881013255443152e-05} +{"ts": "2025-12-22T13:21:19", "event": "train_log", "step": 97, "epoch": 0.2965227359572029, "progress_pct": 14.79, "epoch_pct": 14.83, "eta": "10:35:33", "max_grad_norm": 1.0, "loss": 0.9414035677909851, "grad_norm": 0.11006761342287064, "learning_rate": 1.9872683547213446e-05} +{"ts": "2025-12-22T13:22:21", "event": "train_log", "step": 98, "epoch": 0.29957967137944214, "progress_pct": 14.94, "epoch_pct": 14.98, "eta": "10:33:50", "max_grad_norm": 1.0, "loss": 0.9155468940734863, "grad_norm": 0.1014382541179657, "learning_rate": 1.9864073921572756e-05} +{"ts": "2025-12-22T13:23:23", "event": "train_log", "step": 99, "epoch": 0.3026366068016813, "progress_pct": 15.09, "epoch_pct": 15.13, "eta": "10:32:06", "max_grad_norm": 1.0, "loss": 0.9429305195808411, "grad_norm": 0.09883157908916473, "learning_rate": 1.9855184622627362e-05} +{"ts": "2025-12-22T13:24:24", "event": "train_log", "step": 100, "epoch": 0.30569354222392053, "progress_pct": 15.24, "epoch_pct": 15.28, "eta": "10:30:20", "max_grad_norm": 1.0, "loss": 0.9143528342247009, "grad_norm": 0.11199072748422623, "learning_rate": 1.9846015902413053e-05} +{"ts": "2025-12-22T13:36:28", "event": "train_log", "step": 100, "epoch": 0.30569354222392053, "progress_pct": 15.24, "epoch_pct": 15.28, "eta": "11:37:24", "max_grad_norm": 1.0, "eval_loss": 0.884428083896637, "eval_runtime": 723.8143, "eval_samples_per_second": 0.833, "eval_steps_per_second": 0.833} +{"ts": "2025-12-22T13:37:32", "event": "train_log", "step": 101, "epoch": 0.3087504776461597, "progress_pct": 15.4, "epoch_pct": 15.44, "eta": "11:35:05", "max_grad_norm": 1.0, "loss": 0.9726455211639404, "grad_norm": 0.10796016454696655, "learning_rate": 1.9836568020887963e-05} +{"ts": "2025-12-22T13:38:33", "event": "train_log", "step": 102, "epoch": 0.31180741306839893, "progress_pct": 15.55, "epoch_pct": 15.59, "eta": "11:32:35", "max_grad_norm": 1.0, "loss": 0.8932135701179504, "grad_norm": 0.10056383162736893, "learning_rate": 1.982684124592521e-05} +{"ts": "2025-12-22T13:39:33", "event": "train_log", "step": 103, "epoch": 0.31486434849063816, "progress_pct": 15.7, "epoch_pct": 15.74, "eta": "11:29:59", "max_grad_norm": 1.0, "loss": 0.919749915599823, "grad_norm": 0.10836594551801682, "learning_rate": 1.9816835853305306e-05} +{"ts": "2025-12-22T13:40:37", "event": "train_log", "step": 104, "epoch": 0.31792128391287733, "progress_pct": 15.85, "epoch_pct": 15.9, "eta": "11:27:47", "max_grad_norm": 1.0, "loss": 0.871781587600708, "grad_norm": 0.12032149732112885, "learning_rate": 1.9806552126708322e-05} +{"ts": "2025-12-22T13:41:39", "event": "train_log", "step": 105, "epoch": 0.32097821933511655, "progress_pct": 16.01, "epoch_pct": 16.05, "eta": "11:25:23", "max_grad_norm": 1.0, "loss": 0.8587784171104431, "grad_norm": 0.10854160040616989, "learning_rate": 1.9795990357705853e-05} +{"ts": "2025-12-22T13:42:40", "event": "train_log", "step": 106, "epoch": 0.3240351547573557, "progress_pct": 16.16, "epoch_pct": 16.2, "eta": "11:23:00", "max_grad_norm": 1.0, "loss": 0.8524806499481201, "grad_norm": 0.10819399356842041, "learning_rate": 1.978515084575276e-05} +{"ts": "2025-12-22T13:43:42", "event": "train_log", "step": 107, "epoch": 0.32709209017959495, "progress_pct": 16.31, "epoch_pct": 16.35, "eta": "11:20:40", "max_grad_norm": 1.0, "loss": 0.7892144918441772, "grad_norm": 0.10226067155599594, "learning_rate": 1.9774033898178668e-05} +{"ts": "2025-12-22T13:44:43", "event": "train_log", "step": 108, "epoch": 0.3301490256018342, "progress_pct": 16.46, "epoch_pct": 16.51, "eta": "11:18:18", "max_grad_norm": 1.0, "loss": 0.8833234906196594, "grad_norm": 0.1071159616112709, "learning_rate": 1.976263983017925e-05} +{"ts": "2025-12-22T13:45:45", "event": "train_log", "step": 109, "epoch": 0.33320596102407335, "progress_pct": 16.62, "epoch_pct": 16.66, "eta": "11:16:02", "max_grad_norm": 1.0, "loss": 0.861842155456543, "grad_norm": 0.11434526741504669, "learning_rate": 1.9750968964807305e-05} +{"ts": "2025-12-22T13:46:47", "event": "train_log", "step": 110, "epoch": 0.3362628964463126, "progress_pct": 16.77, "epoch_pct": 16.81, "eta": "11:13:47", "max_grad_norm": 1.0, "loss": 0.8987889289855957, "grad_norm": 0.1159641221165657, "learning_rate": 1.9739021632963584e-05} +{"ts": "2025-12-22T13:47:48", "event": "train_log", "step": 111, "epoch": 0.3393198318685518, "progress_pct": 16.92, "epoch_pct": 16.97, "eta": "11:11:31", "max_grad_norm": 1.0, "loss": 0.9710193872451782, "grad_norm": 0.12371373921632767, "learning_rate": 1.9726798173387417e-05} +{"ts": "2025-12-22T13:48:51", "event": "train_log", "step": 112, "epoch": 0.342376767290791, "progress_pct": 17.07, "epoch_pct": 17.12, "eta": "11:09:22", "max_grad_norm": 1.0, "loss": 0.8199151158332825, "grad_norm": 0.11441531032323837, "learning_rate": 1.97142989326471e-05} +{"ts": "2025-12-22T13:49:53", "event": "train_log", "step": 113, "epoch": 0.3454337027130302, "progress_pct": 17.23, "epoch_pct": 17.27, "eta": "11:07:12", "max_grad_norm": 1.0, "loss": 0.8845276236534119, "grad_norm": 0.11842846125364304, "learning_rate": 1.9701524265130088e-05} +{"ts": "2025-12-22T13:50:55", "event": "train_log", "step": 114, "epoch": 0.34849063813526937, "progress_pct": 17.38, "epoch_pct": 17.42, "eta": "11:05:01", "max_grad_norm": 1.0, "loss": 0.7964264750480652, "grad_norm": 0.10813732445240021, "learning_rate": 1.9688474533032916e-05} +{"ts": "2025-12-22T13:51:56", "event": "train_log", "step": 115, "epoch": 0.3515475735575086, "progress_pct": 17.53, "epoch_pct": 17.58, "eta": "11:02:48", "max_grad_norm": 1.0, "loss": 0.9630422592163086, "grad_norm": 0.11050347238779068, "learning_rate": 1.9675150106350957e-05} +{"ts": "2025-12-22T13:52:58", "event": "train_log", "step": 116, "epoch": 0.3546045089797478, "progress_pct": 17.68, "epoch_pct": 17.73, "eta": "11:00:42", "max_grad_norm": 1.0, "loss": 0.7706905007362366, "grad_norm": 0.10537250339984894, "learning_rate": 1.9661551362867926e-05} +{"ts": "2025-12-22T13:53:58", "event": "train_log", "step": 117, "epoch": 0.357661444401987, "progress_pct": 17.84, "epoch_pct": 17.88, "eta": "10:58:28", "max_grad_norm": 1.0, "loss": 0.8541204929351807, "grad_norm": 0.11390368640422821, "learning_rate": 1.9647678688145163e-05} +{"ts": "2025-12-22T13:55:03", "event": "train_log", "step": 118, "epoch": 0.3607183798242262, "progress_pct": 17.99, "epoch_pct": 18.04, "eta": "10:56:37", "max_grad_norm": 1.0, "loss": 0.7400562763214111, "grad_norm": 0.10318922251462936, "learning_rate": 1.963353247551069e-05} +{"ts": "2025-12-22T13:56:11", "event": "train_log", "step": 119, "epoch": 0.3637753152464654, "progress_pct": 18.14, "epoch_pct": 18.19, "eta": "10:55:02", "max_grad_norm": 1.0, "loss": 0.9232871532440186, "grad_norm": 0.1347586214542389, "learning_rate": 1.9619113126048086e-05} +{"ts": "2025-12-22T13:57:13", "event": "train_log", "step": 120, "epoch": 0.3668322506687046, "progress_pct": 18.29, "epoch_pct": 18.34, "eta": "10:52:57", "max_grad_norm": 1.0, "loss": 0.833285927772522, "grad_norm": 0.11458177119493484, "learning_rate": 1.96044210485851e-05} +{"ts": "2025-12-22T13:58:15", "event": "train_log", "step": 121, "epoch": 0.36988918609094384, "progress_pct": 18.45, "epoch_pct": 18.49, "eta": "10:50:53", "max_grad_norm": 1.0, "loss": 0.7887391448020935, "grad_norm": 0.12361041456460953, "learning_rate": 1.958945665968206e-05} +{"ts": "2025-12-22T13:59:17", "event": "train_log", "step": 122, "epoch": 0.372946121513183, "progress_pct": 18.6, "epoch_pct": 18.65, "eta": "10:48:53", "max_grad_norm": 1.0, "loss": 0.8206446170806885, "grad_norm": 0.11985408514738083, "learning_rate": 1.9574220383620054e-05} +{"ts": "2025-12-22T14:00:18", "event": "train_log", "step": 123, "epoch": 0.37600305693542224, "progress_pct": 18.75, "epoch_pct": 18.8, "eta": "10:46:51", "max_grad_norm": 1.0, "loss": 0.7648542523384094, "grad_norm": 0.1355939507484436, "learning_rate": 1.9558712652388932e-05} +{"ts": "2025-12-22T14:01:19", "event": "train_log", "step": 124, "epoch": 0.37905999235766147, "progress_pct": 18.9, "epoch_pct": 18.95, "eta": "10:44:45", "max_grad_norm": 1.0, "loss": 0.8573335409164429, "grad_norm": 0.1229313388466835, "learning_rate": 1.954293390567501e-05} +{"ts": "2025-12-22T14:02:19", "event": "train_log", "step": 125, "epoch": 0.38211692777990064, "progress_pct": 19.05, "epoch_pct": 19.11, "eta": "10:42:37", "max_grad_norm": 1.0, "loss": 0.7412531971931458, "grad_norm": 0.11425124108791351, "learning_rate": 1.9526884590848646e-05} +{"ts": "2025-12-22T14:03:18", "event": "train_log", "step": 126, "epoch": 0.38517386320213987, "progress_pct": 19.21, "epoch_pct": 19.26, "eta": "10:40:30", "max_grad_norm": 1.0, "loss": 0.8098543882369995, "grad_norm": 0.12430041283369064, "learning_rate": 1.9510565162951538e-05} +{"ts": "2025-12-22T14:04:18", "event": "train_log", "step": 127, "epoch": 0.38823079862437904, "progress_pct": 19.36, "epoch_pct": 19.41, "eta": "10:38:24", "max_grad_norm": 1.0, "loss": 0.8814713954925537, "grad_norm": 0.12492368370294571, "learning_rate": 1.9493976084683814e-05} +{"ts": "2025-12-22T14:05:18", "event": "train_log", "step": 128, "epoch": 0.39128773404661826, "progress_pct": 19.51, "epoch_pct": 19.56, "eta": "10:36:19", "max_grad_norm": 1.0, "loss": 0.8231979608535767, "grad_norm": 0.14428824186325073, "learning_rate": 1.9477117826390934e-05} +{"ts": "2025-12-22T14:06:17", "event": "train_log", "step": 129, "epoch": 0.3943446694688575, "progress_pct": 19.66, "epoch_pct": 19.72, "eta": "10:34:15", "max_grad_norm": 1.0, "loss": 0.7015627026557922, "grad_norm": 0.12010085582733154, "learning_rate": 1.9459990866050337e-05} +{"ts": "2025-12-22T14:07:17", "event": "train_log", "step": 130, "epoch": 0.39740160489109666, "progress_pct": 19.82, "epoch_pct": 19.87, "eta": "10:32:12", "max_grad_norm": 1.0, "loss": 0.8086729645729065, "grad_norm": 0.11819776892662048, "learning_rate": 1.9442595689257898e-05} +{"ts": "2025-12-22T14:08:17", "event": "train_log", "step": 131, "epoch": 0.4004585403133359, "progress_pct": 19.97, "epoch_pct": 20.02, "eta": "10:30:09", "max_grad_norm": 1.0, "loss": 0.8234002590179443, "grad_norm": 0.12211033701896667, "learning_rate": 1.9424932789214158e-05} +{"ts": "2025-12-22T14:09:16", "event": "train_log", "step": 132, "epoch": 0.4035154757355751, "progress_pct": 20.12, "epoch_pct": 20.18, "eta": "10:28:09", "max_grad_norm": 1.0, "loss": 0.874608039855957, "grad_norm": 0.14926476776599884, "learning_rate": 1.9407002666710334e-05} +{"ts": "2025-12-22T14:10:16", "event": "train_log", "step": 133, "epoch": 0.4065724111578143, "progress_pct": 20.27, "epoch_pct": 20.33, "eta": "10:26:09", "max_grad_norm": 1.0, "loss": 0.8491607904434204, "grad_norm": 0.13012923300266266, "learning_rate": 1.9388805830114132e-05} +{"ts": "2025-12-22T14:11:16", "event": "train_log", "step": 134, "epoch": 0.4096293465800535, "progress_pct": 20.43, "epoch_pct": 20.48, "eta": "10:24:11", "max_grad_norm": 1.0, "loss": 0.7269159555435181, "grad_norm": 0.12012261897325516, "learning_rate": 1.937034279535533e-05} +{"ts": "2025-12-22T14:12:16", "event": "train_log", "step": 135, "epoch": 0.4126862820022927, "progress_pct": 20.58, "epoch_pct": 20.63, "eta": "10:22:12", "max_grad_norm": 1.0, "loss": 0.8560839891433716, "grad_norm": 0.15302567183971405, "learning_rate": 1.9351614085911134e-05} +{"ts": "2025-12-22T14:13:15", "event": "train_log", "step": 136, "epoch": 0.4157432174245319, "progress_pct": 20.73, "epoch_pct": 20.79, "eta": "10:20:14", "max_grad_norm": 1.0, "loss": 0.8211904764175415, "grad_norm": 0.12234190106391907, "learning_rate": 1.933262023279137e-05} +{"ts": "2025-12-22T14:14:15", "event": "train_log", "step": 137, "epoch": 0.41880015284677113, "progress_pct": 20.88, "epoch_pct": 20.94, "eta": "10:18:18", "max_grad_norm": 1.0, "loss": 0.8500057458877563, "grad_norm": 0.14427296817302704, "learning_rate": 1.9313361774523387e-05} +{"ts": "2025-12-22T14:15:15", "event": "train_log", "step": 138, "epoch": 0.4218570882690103, "progress_pct": 21.04, "epoch_pct": 21.09, "eta": "10:16:22", "max_grad_norm": 1.0, "loss": 0.7589091658592224, "grad_norm": 0.1314094066619873, "learning_rate": 1.929383925713682e-05} +{"ts": "2025-12-22T14:16:14", "event": "train_log", "step": 139, "epoch": 0.42491402369124953, "progress_pct": 21.19, "epoch_pct": 21.25, "eta": "10:14:27", "max_grad_norm": 1.0, "loss": 0.7581073641777039, "grad_norm": 0.1576734483242035, "learning_rate": 1.92740532341481e-05} +{"ts": "2025-12-22T14:17:14", "event": "train_log", "step": 140, "epoch": 0.4279709591134887, "progress_pct": 21.34, "epoch_pct": 21.4, "eta": "10:12:34", "max_grad_norm": 1.0, "loss": 0.809050440788269, "grad_norm": 0.15788713097572327, "learning_rate": 1.925400426654475e-05} +{"ts": "2025-12-22T14:18:14", "event": "train_log", "step": 141, "epoch": 0.43102789453572793, "progress_pct": 21.49, "epoch_pct": 21.55, "eta": "10:10:42", "max_grad_norm": 1.0, "loss": 0.7990086078643799, "grad_norm": 0.13364559412002563, "learning_rate": 1.9233692922769497e-05} +{"ts": "2025-12-22T14:19:17", "event": "train_log", "step": 142, "epoch": 0.43408482995796716, "progress_pct": 21.65, "epoch_pct": 21.7, "eta": "10:08:59", "max_grad_norm": 1.0, "loss": 0.8675815463066101, "grad_norm": 0.14786465466022491, "learning_rate": 1.921311977870413e-05} +{"ts": "2025-12-22T14:20:18", "event": "train_log", "step": 143, "epoch": 0.4371417653802063, "progress_pct": 21.8, "epoch_pct": 21.86, "eta": "10:07:15", "max_grad_norm": 1.0, "loss": 0.8713765740394592, "grad_norm": 0.14621882140636444, "learning_rate": 1.9192285417653208e-05} +{"ts": "2025-12-22T14:21:19", "event": "train_log", "step": 144, "epoch": 0.44019870080244555, "progress_pct": 21.95, "epoch_pct": 22.01, "eta": "10:05:28", "max_grad_norm": 1.0, "loss": 0.7361871004104614, "grad_norm": 0.12874048948287964, "learning_rate": 1.917119043032749e-05} +{"ts": "2025-12-22T14:22:22", "event": "train_log", "step": 145, "epoch": 0.4432556362246848, "progress_pct": 22.1, "epoch_pct": 22.16, "eta": "10:03:46", "max_grad_norm": 1.0, "loss": 0.7311941385269165, "grad_norm": 0.12183775007724762, "learning_rate": 1.9149835414827193e-05} +{"ts": "2025-12-22T14:23:22", "event": "train_log", "step": 146, "epoch": 0.44631257164692395, "progress_pct": 22.26, "epoch_pct": 22.32, "eta": "10:01:58", "max_grad_norm": 1.0, "loss": 0.8189159035682678, "grad_norm": 0.1397160291671753, "learning_rate": 1.912822097662505e-05} +{"ts": "2025-12-22T14:24:34", "event": "train_log", "step": 147, "epoch": 0.4493695070691632, "progress_pct": 22.41, "epoch_pct": 22.47, "eta": "10:00:52", "max_grad_norm": 1.0, "loss": 0.8288135528564453, "grad_norm": 0.1458273082971573, "learning_rate": 1.9106347728549134e-05} +{"ts": "2025-12-22T14:25:43", "event": "train_log", "step": 148, "epoch": 0.45242644249140235, "progress_pct": 22.56, "epoch_pct": 22.62, "eta": "09:59:34", "max_grad_norm": 1.0, "loss": 0.7878037095069885, "grad_norm": 0.16898781061172485, "learning_rate": 1.908421629076547e-05} +{"ts": "2025-12-22T14:26:48", "event": "train_log", "step": 149, "epoch": 0.4554833779136416, "progress_pct": 22.71, "epoch_pct": 22.77, "eta": "09:58:04", "max_grad_norm": 1.0, "loss": 0.8059952259063721, "grad_norm": 0.1638474315404892, "learning_rate": 1.9061827290760466e-05} +{"ts": "2025-12-22T14:27:49", "event": "train_log", "step": 150, "epoch": 0.4585403133358808, "progress_pct": 22.87, "epoch_pct": 22.93, "eta": "09:56:20", "max_grad_norm": 1.0, "loss": 0.7346830368041992, "grad_norm": 0.14130882918834686, "learning_rate": 1.9039181363323128e-05} +{"ts": "2025-12-22T14:41:38", "event": "train_log", "step": 150, "epoch": 0.4585403133358808, "progress_pct": 22.87, "epoch_pct": 22.93, "eta": "10:42:56", "max_grad_norm": 1.0, "eval_loss": 0.7979016900062561, "eval_runtime": 828.6295, "eval_samples_per_second": 0.728, "eval_steps_per_second": 0.728} +{"ts": "2025-12-22T14:42:54", "event": "train_log", "step": 151, "epoch": 0.46159724875811997, "progress_pct": 23.02, "epoch_pct": 23.08, "eta": "10:41:41", "max_grad_norm": 1.0, "loss": 0.7583403587341309, "grad_norm": 0.14427433907985687, "learning_rate": 1.9016279150527044e-05} +{"ts": "2025-12-22T14:44:09", "event": "train_log", "step": 152, "epoch": 0.4646541841803592, "progress_pct": 23.17, "epoch_pct": 23.23, "eta": "10:40:21", "max_grad_norm": 1.0, "loss": 0.7908380031585693, "grad_norm": 0.1515798568725586, "learning_rate": 1.8993121301712194e-05} +{"ts": "2025-12-22T14:45:24", "event": "train_log", "step": 153, "epoch": 0.46771111960259837, "progress_pct": 23.32, "epoch_pct": 23.39, "eta": "10:38:59", "max_grad_norm": 1.0, "loss": 0.7916130423545837, "grad_norm": 0.14444488286972046, "learning_rate": 1.896970847346653e-05} +{"ts": "2025-12-22T14:46:40", "event": "train_log", "step": 154, "epoch": 0.4707680550248376, "progress_pct": 23.48, "epoch_pct": 23.54, "eta": "10:37:42", "max_grad_norm": 1.0, "loss": 0.7750643491744995, "grad_norm": 0.1460912823677063, "learning_rate": 1.8946041329607364e-05} +{"ts": "2025-12-22T14:47:54", "event": "train_log", "step": 155, "epoch": 0.4738249904470768, "progress_pct": 23.63, "epoch_pct": 23.69, "eta": "10:36:17", "max_grad_norm": 1.0, "loss": 0.8059666156768799, "grad_norm": 0.13896244764328003, "learning_rate": 1.892212054116255e-05} +{"ts": "2025-12-22T14:49:10", "event": "train_log", "step": 156, "epoch": 0.476881925869316, "progress_pct": 23.78, "epoch_pct": 23.84, "eta": "10:35:03", "max_grad_norm": 1.0, "loss": 0.8327827453613281, "grad_norm": 0.16133630275726318, "learning_rate": 1.889794678635145e-05} +{"ts": "2025-12-22T14:50:23", "event": "train_log", "step": 157, "epoch": 0.4799388612915552, "progress_pct": 23.93, "epoch_pct": 24.0, "eta": "10:33:36", "max_grad_norm": 1.0, "loss": 0.8498989343643188, "grad_norm": 0.1474636346101761, "learning_rate": 1.8873520750565716e-05} +{"ts": "2025-12-22T14:51:42", "event": "train_log", "step": 158, "epoch": 0.48299579671379445, "progress_pct": 24.09, "epoch_pct": 24.15, "eta": "10:32:27", "max_grad_norm": 1.0, "loss": 0.7750177979469299, "grad_norm": 0.17222349345684052, "learning_rate": 1.884884312634985e-05} +{"ts": "2025-12-22T14:53:01", "event": "train_log", "step": 159, "epoch": 0.4860527321360336, "progress_pct": 24.24, "epoch_pct": 24.3, "eta": "10:31:19", "max_grad_norm": 1.0, "loss": 0.7326169013977051, "grad_norm": 0.15558090806007385, "learning_rate": 1.8823914613381568e-05} +{"ts": "2025-12-22T14:54:16", "event": "train_log", "step": 160, "epoch": 0.48910966755827284, "progress_pct": 24.39, "epoch_pct": 24.46, "eta": "10:30:01", "max_grad_norm": 1.0, "loss": 0.8308709859848022, "grad_norm": 0.13808321952819824, "learning_rate": 1.8798735918451963e-05} +{"ts": "2025-12-22T14:55:30", "event": "train_log", "step": 161, "epoch": 0.492166602980512, "progress_pct": 24.54, "epoch_pct": 24.61, "eta": "10:28:36", "max_grad_norm": 1.0, "loss": 0.7805465459823608, "grad_norm": 0.1761898398399353, "learning_rate": 1.8773307755445468e-05} +{"ts": "2025-12-22T14:56:42", "event": "train_log", "step": 162, "epoch": 0.49522353840275124, "progress_pct": 24.7, "epoch_pct": 24.76, "eta": "10:27:08", "max_grad_norm": 1.0, "loss": 0.8538846969604492, "grad_norm": 0.160477414727211, "learning_rate": 1.874763084531961e-05} +{"ts": "2025-12-22T14:58:00", "event": "train_log", "step": 163, "epoch": 0.49828047382499047, "progress_pct": 24.85, "epoch_pct": 24.91, "eta": "10:25:59", "max_grad_norm": 1.0, "loss": 0.8801217675209045, "grad_norm": 0.15238745510578156, "learning_rate": 1.872170591608459e-05} +{"ts": "2025-12-22T14:59:19", "event": "train_log", "step": 164, "epoch": 0.5013374092472297, "progress_pct": 25.0, "epoch_pct": 25.07, "eta": "10:24:50", "max_grad_norm": 1.0, "loss": 0.7205259799957275, "grad_norm": 0.1567080318927765, "learning_rate": 1.86955337027826e-05} +{"ts": "2025-12-22T15:00:33", "event": "train_log", "step": 165, "epoch": 0.5043943446694689, "progress_pct": 25.15, "epoch_pct": 25.22, "eta": "10:23:29", "max_grad_norm": 1.0, "loss": 0.7636491656303406, "grad_norm": 0.13637851178646088, "learning_rate": 1.866911494746702e-05} +{"ts": "2025-12-22T15:01:51", "event": "train_log", "step": 166, "epoch": 0.507451280091708, "progress_pct": 25.3, "epoch_pct": 25.37, "eta": "10:22:16", "max_grad_norm": 1.0, "loss": 0.7982497811317444, "grad_norm": 0.15563489496707916, "learning_rate": 1.8642450399181373e-05} +{"ts": "2025-12-22T15:03:04", "event": "train_log", "step": 167, "epoch": 0.5105082155139473, "progress_pct": 25.46, "epoch_pct": 25.53, "eta": "10:20:51", "max_grad_norm": 1.0, "loss": 0.8737778067588806, "grad_norm": 0.15503396093845367, "learning_rate": 1.8615540813938063e-05} +{"ts": "2025-12-22T15:04:21", "event": "train_log", "step": 168, "epoch": 0.5135651509361865, "progress_pct": 25.61, "epoch_pct": 25.68, "eta": "10:19:36", "max_grad_norm": 1.0, "loss": 0.796604335308075, "grad_norm": 0.16095557808876038, "learning_rate": 1.8588386954696972e-05} +{"ts": "2025-12-22T15:05:31", "event": "train_log", "step": 169, "epoch": 0.5166220863584257, "progress_pct": 25.76, "epoch_pct": 25.83, "eta": "10:18:04", "max_grad_norm": 1.0, "loss": 0.8247392177581787, "grad_norm": 0.1713593453168869, "learning_rate": 1.856098959134381e-05} +{"ts": "2025-12-22T15:06:35", "event": "train_log", "step": 170, "epoch": 0.5196790217806648, "progress_pct": 25.91, "epoch_pct": 25.98, "eta": "10:16:13", "max_grad_norm": 1.0, "loss": 0.7838484644889832, "grad_norm": 0.18239113688468933, "learning_rate": 1.8533349500668295e-05} +{"ts": "2025-12-22T15:07:36", "event": "train_log", "step": 171, "epoch": 0.5227359572029041, "progress_pct": 26.07, "epoch_pct": 26.14, "eta": "10:14:15", "max_grad_norm": 1.0, "loss": 0.7856907248497009, "grad_norm": 0.15745767951011658, "learning_rate": 1.850546746634211e-05} +{"ts": "2025-12-22T15:08:39", "event": "train_log", "step": 172, "epoch": 0.5257928926251433, "progress_pct": 26.22, "epoch_pct": 26.29, "eta": "10:12:22", "max_grad_norm": 1.0, "loss": 0.7829679846763611, "grad_norm": 0.16820666193962097, "learning_rate": 1.8477344278896708e-05} +{"ts": "2025-12-22T15:09:41", "event": "train_log", "step": 173, "epoch": 0.5288498280473825, "progress_pct": 26.37, "epoch_pct": 26.44, "eta": "10:10:26", "max_grad_norm": 1.0, "loss": 0.7374375462532043, "grad_norm": 0.16975544393062592, "learning_rate": 1.84489807357009e-05} +{"ts": "2025-12-22T15:10:43", "event": "train_log", "step": 174, "epoch": 0.5319067634696217, "progress_pct": 26.52, "epoch_pct": 26.6, "eta": "10:08:31", "max_grad_norm": 1.0, "loss": 0.712837815284729, "grad_norm": 0.167228102684021, "learning_rate": 1.8420377640938204e-05} +{"ts": "2025-12-22T15:11:43", "event": "train_log", "step": 175, "epoch": 0.5349636988918609, "progress_pct": 26.68, "epoch_pct": 26.75, "eta": "10:06:34", "max_grad_norm": 1.0, "loss": 0.7645693421363831, "grad_norm": 0.15955154597759247, "learning_rate": 1.839153580558411e-05} +{"ts": "2025-12-22T15:12:47", "event": "train_log", "step": 176, "epoch": 0.5380206343141001, "progress_pct": 26.83, "epoch_pct": 26.9, "eta": "10:04:45", "max_grad_norm": 1.0, "loss": 0.7974956631660461, "grad_norm": 0.18378689885139465, "learning_rate": 1.8362456047383032e-05} +{"ts": "2025-12-22T15:13:54", "event": "train_log", "step": 177, "epoch": 0.5410775697363394, "progress_pct": 26.98, "epoch_pct": 27.05, "eta": "10:03:07", "max_grad_norm": 1.0, "loss": 0.8957571983337402, "grad_norm": 0.15777672827243805, "learning_rate": 1.833313919082515e-05} +{"ts": "2025-12-22T15:14:53", "event": "train_log", "step": 178, "epoch": 0.5441345051585785, "progress_pct": 27.13, "epoch_pct": 27.21, "eta": "10:01:07", "max_grad_norm": 1.0, "loss": 0.7635619044303894, "grad_norm": 0.15292386710643768, "learning_rate": 1.8303586067123028e-05} +{"ts": "2025-12-22T15:15:57", "event": "train_log", "step": 179, "epoch": 0.5471914405808177, "progress_pct": 27.29, "epoch_pct": 27.36, "eta": "09:59:20", "max_grad_norm": 1.0, "loss": 0.7849246263504028, "grad_norm": 0.178152397274971, "learning_rate": 1.8273797514188043e-05} +{"ts": "2025-12-22T15:16:58", "event": "train_log", "step": 180, "epoch": 0.550248376003057, "progress_pct": 27.44, "epoch_pct": 27.51, "eta": "09:57:28", "max_grad_norm": 1.0, "loss": 0.6975343227386475, "grad_norm": 0.15916013717651367, "learning_rate": 1.824377437660663e-05} +{"ts": "2025-12-22T15:18:00", "event": "train_log", "step": 181, "epoch": 0.5533053114252962, "progress_pct": 27.59, "epoch_pct": 27.67, "eta": "09:55:36", "max_grad_norm": 1.0, "loss": 0.7675164341926575, "grad_norm": 0.18172231316566467, "learning_rate": 1.821351750561634e-05} +{"ts": "2025-12-22T15:19:04", "event": "train_log", "step": 182, "epoch": 0.5563622468475353, "progress_pct": 27.74, "epoch_pct": 27.82, "eta": "09:53:52", "max_grad_norm": 1.0, "loss": 0.7950343489646912, "grad_norm": 0.16241903603076935, "learning_rate": 1.818302775908169e-05} +{"ts": "2025-12-22T15:20:21", "event": "train_log", "step": 183, "epoch": 0.5594191822697746, "progress_pct": 27.9, "epoch_pct": 27.97, "eta": "09:52:41", "max_grad_norm": 1.0, "loss": 0.787315309047699, "grad_norm": 0.18727579712867737, "learning_rate": 1.8152306001469875e-05} +{"ts": "2025-12-22T15:21:33", "event": "train_log", "step": 184, "epoch": 0.5624761176920138, "progress_pct": 28.05, "epoch_pct": 28.12, "eta": "09:51:19", "max_grad_norm": 1.0, "loss": 0.7141211628913879, "grad_norm": 0.1627933531999588, "learning_rate": 1.8121353103826213e-05} +{"ts": "2025-12-22T15:22:37", "event": "train_log", "step": 185, "epoch": 0.565533053114253, "progress_pct": 28.2, "epoch_pct": 28.28, "eta": "09:49:35", "max_grad_norm": 1.0, "loss": 0.8476608395576477, "grad_norm": 0.4369247555732727, "learning_rate": 1.8090169943749477e-05} +{"ts": "2025-12-22T15:23:50", "event": "train_log", "step": 186, "epoch": 0.5685899885364921, "progress_pct": 28.35, "epoch_pct": 28.43, "eta": "09:48:14", "max_grad_norm": 1.0, "loss": 0.720562756061554, "grad_norm": 0.16494786739349365, "learning_rate": 1.8058757405367003e-05} +{"ts": "2025-12-22T15:25:11", "event": "train_log", "step": 187, "epoch": 0.5716469239587314, "progress_pct": 28.51, "epoch_pct": 28.58, "eta": "09:47:13", "max_grad_norm": 1.0, "loss": 0.7589252591133118, "grad_norm": 0.175015389919281, "learning_rate": 1.8027116379309637e-05} +{"ts": "2025-12-22T15:26:26", "event": "train_log", "step": 188, "epoch": 0.5747038593809706, "progress_pct": 28.66, "epoch_pct": 28.74, "eta": "09:45:58", "max_grad_norm": 1.0, "loss": 0.7644155621528625, "grad_norm": 0.1769978553056717, "learning_rate": 1.799524776268646e-05} +{"ts": "2025-12-22T15:27:40", "event": "train_log", "step": 189, "epoch": 0.5777607948032097, "progress_pct": 28.81, "epoch_pct": 28.89, "eta": "09:44:41", "max_grad_norm": 1.0, "loss": 0.7885835766792297, "grad_norm": 0.18481792509555817, "learning_rate": 1.796315245905936e-05} +{"ts": "2025-12-22T15:28:57", "event": "train_log", "step": 190, "epoch": 0.580817730225449, "progress_pct": 28.96, "epoch_pct": 29.04, "eta": "09:43:32", "max_grad_norm": 1.0, "loss": 0.7377231121063232, "grad_norm": 0.1668689250946045, "learning_rate": 1.7930831378417437e-05} +{"ts": "2025-12-22T15:30:16", "event": "train_log", "step": 191, "epoch": 0.5838746656476882, "progress_pct": 29.12, "epoch_pct": 29.19, "eta": "09:42:26", "max_grad_norm": 1.0, "loss": 0.7388894557952881, "grad_norm": 0.178734689950943, "learning_rate": 1.7898285437151163e-05} +{"ts": "2025-12-22T15:31:30", "event": "train_log", "step": 192, "epoch": 0.5869316010699274, "progress_pct": 29.27, "epoch_pct": 29.35, "eta": "09:41:07", "max_grad_norm": 1.0, "loss": 0.8209859728813171, "grad_norm": 0.1740068644285202, "learning_rate": 1.786551555802643e-05} +{"ts": "2025-12-22T15:32:46", "event": "train_log", "step": 193, "epoch": 0.5899885364921666, "progress_pct": 29.42, "epoch_pct": 29.5, "eta": "09:39:55", "max_grad_norm": 1.0, "loss": 0.7305737733840942, "grad_norm": 0.19211041927337646, "learning_rate": 1.783252267015837e-05} +{"ts": "2025-12-22T15:34:00", "event": "train_log", "step": 194, "epoch": 0.5930454719144058, "progress_pct": 29.57, "epoch_pct": 29.65, "eta": "09:38:35", "max_grad_norm": 1.0, "loss": 0.7760804891586304, "grad_norm": 0.16644936800003052, "learning_rate": 1.779930770898503e-05} +{"ts": "2025-12-22T15:35:16", "event": "train_log", "step": 195, "epoch": 0.596102407336645, "progress_pct": 29.73, "epoch_pct": 29.81, "eta": "09:37:23", "max_grad_norm": 1.0, "loss": 0.7879236936569214, "grad_norm": 0.1773686707019806, "learning_rate": 1.776587161624083e-05} +{"ts": "2025-12-22T15:36:30", "event": "train_log", "step": 196, "epoch": 0.5991593427588843, "progress_pct": 29.88, "epoch_pct": 29.96, "eta": "09:36:04", "max_grad_norm": 1.0, "loss": 0.7307407259941101, "grad_norm": 0.17508819699287415, "learning_rate": 1.7732215339929874e-05} +{"ts": "2025-12-22T15:37:43", "event": "train_log", "step": 197, "epoch": 0.6022162781811234, "progress_pct": 30.03, "epoch_pct": 30.11, "eta": "09:34:45", "max_grad_norm": 1.0, "loss": 0.7293214797973633, "grad_norm": 0.17211101949214935, "learning_rate": 1.7698339834299064e-05} +{"ts": "2025-12-22T15:38:57", "event": "train_log", "step": 198, "epoch": 0.6052732136033626, "progress_pct": 30.18, "epoch_pct": 30.26, "eta": "09:33:27", "max_grad_norm": 1.0, "loss": 0.763083279132843, "grad_norm": 0.18085215985774994, "learning_rate": 1.7664246059811058e-05} +{"ts": "2025-12-22T15:40:10", "event": "train_log", "step": 199, "epoch": 0.6083301490256018, "progress_pct": 30.34, "epoch_pct": 30.42, "eta": "09:32:07", "max_grad_norm": 1.0, "loss": 0.7372676134109497, "grad_norm": 0.20243075489997864, "learning_rate": 1.7629934983117025e-05} +{"ts": "2025-12-22T15:41:24", "event": "train_log", "step": 200, "epoch": 0.6113870844478411, "progress_pct": 30.49, "epoch_pct": 30.57, "eta": "09:30:50", "max_grad_norm": 1.0, "loss": 0.7121898531913757, "grad_norm": 0.18152795732021332, "learning_rate": 1.759540757702924e-05} +{"ts": "2025-12-22T15:56:24", "event": "train_log", "step": 200, "epoch": 0.6113870844478411, "progress_pct": 30.49, "epoch_pct": 30.57, "eta": "10:05:03", "max_grad_norm": 1.0, "eval_loss": 0.7551760673522949, "eval_runtime": 900.209, "eval_samples_per_second": 0.67, "eval_steps_per_second": 0.67} +{"ts": "2025-12-22T15:57:41", "event": "train_log", "step": 201, "epoch": 0.6144440198700802, "progress_pct": 30.64, "epoch_pct": 30.72, "eta": "10:03:35", "max_grad_norm": 1.0, "loss": 0.734307050704956, "grad_norm": 0.18808062374591827, "learning_rate": 1.7560664820493502e-05} +{"ts": "2025-12-22T15:58:58", "event": "train_log", "step": 202, "epoch": 0.6175009552923194, "progress_pct": 30.79, "epoch_pct": 30.88, "eta": "10:02:11", "max_grad_norm": 1.0, "loss": 0.7998429536819458, "grad_norm": 0.18151243031024933, "learning_rate": 1.7525707698561383e-05} +{"ts": "2025-12-22T16:00:14", "event": "train_log", "step": 203, "epoch": 0.6205578907145587, "progress_pct": 30.95, "epoch_pct": 31.03, "eta": "10:00:44", "max_grad_norm": 1.0, "loss": 0.7546265721321106, "grad_norm": 0.19583043456077576, "learning_rate": 1.7490537202362313e-05} +{"ts": "2025-12-22T16:01:35", "event": "train_log", "step": 204, "epoch": 0.6236148261367979, "progress_pct": 31.1, "epoch_pct": 31.18, "eta": "09:59:26", "max_grad_norm": 1.0, "loss": 0.7810050249099731, "grad_norm": 0.2508557140827179, "learning_rate": 1.7455154329075427e-05} +{"ts": "2025-12-22T16:02:47", "event": "train_log", "step": 205, "epoch": 0.626671761559037, "progress_pct": 31.25, "epoch_pct": 31.33, "eta": "09:57:50", "max_grad_norm": 1.0, "loss": 0.7558917999267578, "grad_norm": 0.1685105562210083, "learning_rate": 1.741956008190136e-05} +{"ts": "2025-12-22T16:04:02", "event": "train_log", "step": 206, "epoch": 0.6297286969812763, "progress_pct": 31.4, "epoch_pct": 31.49, "eta": "09:56:22", "max_grad_norm": 1.0, "loss": 0.7216942310333252, "grad_norm": 0.18195222318172455, "learning_rate": 1.7383755470033756e-05} +{"ts": "2025-12-22T16:05:23", "event": "train_log", "step": 207, "epoch": 0.6327856324035155, "progress_pct": 31.55, "epoch_pct": 31.64, "eta": "09:55:04", "max_grad_norm": 1.0, "loss": 0.7417092323303223, "grad_norm": 0.1878063678741455, "learning_rate": 1.7347741508630673e-05} +{"ts": "2025-12-22T16:06:41", "event": "train_log", "step": 208, "epoch": 0.6358425678257547, "progress_pct": 31.71, "epoch_pct": 31.79, "eta": "09:53:41", "max_grad_norm": 1.0, "loss": 0.807498037815094, "grad_norm": 0.25273698568344116, "learning_rate": 1.73115192187858e-05} +{"ts": "2025-12-22T16:07:55", "event": "train_log", "step": 209, "epoch": 0.6388995032479939, "progress_pct": 31.86, "epoch_pct": 31.94, "eta": "09:52:11", "max_grad_norm": 1.0, "loss": 0.7557163238525391, "grad_norm": 0.2451465129852295, "learning_rate": 1.7275089627499493e-05} +{"ts": "2025-12-22T16:09:10", "event": "train_log", "step": 210, "epoch": 0.6419564386702331, "progress_pct": 32.01, "epoch_pct": 32.1, "eta": "09:50:42", "max_grad_norm": 1.0, "loss": 0.8285109996795654, "grad_norm": 0.19272617995738983, "learning_rate": 1.7238453767649683e-05} +{"ts": "2025-12-22T16:10:25", "event": "train_log", "step": 211, "epoch": 0.6450133740924723, "progress_pct": 32.16, "epoch_pct": 32.25, "eta": "09:49:12", "max_grad_norm": 1.0, "loss": 0.7824444770812988, "grad_norm": 0.1869518756866455, "learning_rate": 1.720161267796256e-05} +{"ts": "2025-12-22T16:11:42", "event": "train_log", "step": 212, "epoch": 0.6480703095147115, "progress_pct": 32.32, "epoch_pct": 32.4, "eta": "09:47:47", "max_grad_norm": 1.0, "loss": 0.7018642425537109, "grad_norm": 0.2029627561569214, "learning_rate": 1.7164567402983153e-05} +{"ts": "2025-12-22T16:12:57", "event": "train_log", "step": 213, "epoch": 0.6511272449369507, "progress_pct": 32.47, "epoch_pct": 32.56, "eta": "09:46:19", "max_grad_norm": 1.0, "loss": 0.7263948917388916, "grad_norm": 0.23215501010417938, "learning_rate": 1.7127318993045686e-05} +{"ts": "2025-12-22T16:14:12", "event": "train_log", "step": 214, "epoch": 0.6541841803591899, "progress_pct": 32.62, "epoch_pct": 32.71, "eta": "09:44:50", "max_grad_norm": 1.0, "loss": 0.8285576105117798, "grad_norm": 0.19869184494018555, "learning_rate": 1.7089868504243816e-05} +{"ts": "2025-12-22T16:15:28", "event": "train_log", "step": 215, "epoch": 0.6572411157814291, "progress_pct": 32.77, "epoch_pct": 32.86, "eta": "09:43:25", "max_grad_norm": 1.0, "loss": 0.7871490716934204, "grad_norm": 0.22871531546115875, "learning_rate": 1.705221699840069e-05} +{"ts": "2025-12-22T16:16:41", "event": "train_log", "step": 216, "epoch": 0.6602980512036684, "progress_pct": 32.93, "epoch_pct": 33.01, "eta": "09:41:52", "max_grad_norm": 1.0, "loss": 0.740180492401123, "grad_norm": 0.17945580184459686, "learning_rate": 1.701436554303882e-05} +{"ts": "2025-12-22T16:17:57", "event": "train_log", "step": 217, "epoch": 0.6633549866259075, "progress_pct": 33.08, "epoch_pct": 33.17, "eta": "09:40:25", "max_grad_norm": 1.0, "loss": 0.7542892098426819, "grad_norm": 0.20516762137413025, "learning_rate": 1.6976315211349848e-05} +{"ts": "2025-12-22T16:19:11", "event": "train_log", "step": 218, "epoch": 0.6664119220481467, "progress_pct": 33.23, "epoch_pct": 33.32, "eta": "09:38:56", "max_grad_norm": 1.0, "loss": 0.8117404580116272, "grad_norm": 0.22108283638954163, "learning_rate": 1.6938067082164093e-05} +{"ts": "2025-12-22T16:20:25", "event": "train_log", "step": 219, "epoch": 0.669468857470386, "progress_pct": 33.38, "epoch_pct": 33.47, "eta": "09:37:26", "max_grad_norm": 1.0, "loss": 0.8002716898918152, "grad_norm": 0.22329698503017426, "learning_rate": 1.6899622239919965e-05} +{"ts": "2025-12-22T16:21:42", "event": "train_log", "step": 220, "epoch": 0.6725257928926252, "progress_pct": 33.54, "epoch_pct": 33.63, "eta": "09:36:02", "max_grad_norm": 1.0, "loss": 0.7750573754310608, "grad_norm": 0.23545362055301666, "learning_rate": 1.6860981774633228e-05} +{"ts": "2025-12-22T16:23:00", "event": "train_log", "step": 221, "epoch": 0.6755827283148643, "progress_pct": 33.69, "epoch_pct": 33.78, "eta": "09:34:41", "max_grad_norm": 1.0, "loss": 0.8051223754882812, "grad_norm": 0.21816480159759521, "learning_rate": 1.6822146781866097e-05} +{"ts": "2025-12-22T16:24:16", "event": "train_log", "step": 222, "epoch": 0.6786396637371036, "progress_pct": 33.84, "epoch_pct": 33.93, "eta": "09:33:15", "max_grad_norm": 1.0, "loss": 0.7286484241485596, "grad_norm": 0.18638508021831512, "learning_rate": 1.6783118362696162e-05} +{"ts": "2025-12-22T16:25:31", "event": "train_log", "step": 223, "epoch": 0.6816965991593428, "progress_pct": 33.99, "epoch_pct": 34.08, "eta": "09:31:47", "max_grad_norm": 1.0, "loss": 0.7001460194587708, "grad_norm": 0.16794732213020325, "learning_rate": 1.6743897623685178e-05} +{"ts": "2025-12-22T16:26:45", "event": "train_log", "step": 224, "epoch": 0.684753534581582, "progress_pct": 34.15, "epoch_pct": 34.24, "eta": "09:30:19", "max_grad_norm": 1.0, "loss": 0.7479901313781738, "grad_norm": 0.21157318353652954, "learning_rate": 1.6704485676847695e-05} +{"ts": "2025-12-22T16:28:01", "event": "train_log", "step": 225, "epoch": 0.6878104700038211, "progress_pct": 34.3, "epoch_pct": 34.39, "eta": "09:28:53", "max_grad_norm": 1.0, "loss": 0.7660019397735596, "grad_norm": 0.35601308941841125, "learning_rate": 1.666488363961952e-05} +{"ts": "2025-12-22T16:29:18", "event": "train_log", "step": 226, "epoch": 0.6908674054260604, "progress_pct": 34.45, "epoch_pct": 34.54, "eta": "09:27:29", "max_grad_norm": 1.0, "loss": 0.7157142162322998, "grad_norm": 0.17416611313819885, "learning_rate": 1.662509263482604e-05} +{"ts": "2025-12-22T16:30:32", "event": "train_log", "step": 227, "epoch": 0.6939243408482996, "progress_pct": 34.6, "epoch_pct": 34.7, "eta": "09:26:00", "max_grad_norm": 1.0, "loss": 0.7894638776779175, "grad_norm": 0.19655123353004456, "learning_rate": 1.658511379065039e-05} +{"ts": "2025-12-22T16:31:47", "event": "train_log", "step": 228, "epoch": 0.6969812762705387, "progress_pct": 34.76, "epoch_pct": 34.85, "eta": "09:24:33", "max_grad_norm": 1.0, "loss": 0.6853711009025574, "grad_norm": 0.2034345269203186, "learning_rate": 1.6544948240601453e-05} +{"ts": "2025-12-22T16:33:04", "event": "train_log", "step": 229, "epoch": 0.700038211692778, "progress_pct": 34.91, "epoch_pct": 35.0, "eta": "09:23:10", "max_grad_norm": 1.0, "loss": 0.7487372756004333, "grad_norm": 0.199235200881958, "learning_rate": 1.6504597123481737e-05} +{"ts": "2025-12-22T16:34:20", "event": "train_log", "step": 230, "epoch": 0.7030951471150172, "progress_pct": 35.06, "epoch_pct": 35.15, "eta": "09:21:44", "max_grad_norm": 1.0, "loss": 0.7335573434829712, "grad_norm": 0.20407404005527496, "learning_rate": 1.6464061583355088e-05} +{"ts": "2025-12-22T16:35:38", "event": "train_log", "step": 231, "epoch": 0.7061520825372564, "progress_pct": 35.21, "epoch_pct": 35.31, "eta": "09:20:24", "max_grad_norm": 1.0, "loss": 0.7659798264503479, "grad_norm": 0.22096174955368042, "learning_rate": 1.6423342769514227e-05} +{"ts": "2025-12-22T16:36:55", "event": "train_log", "step": 232, "epoch": 0.7092090179594956, "progress_pct": 35.37, "epoch_pct": 35.46, "eta": "09:19:01", "max_grad_norm": 1.0, "loss": 0.7162011861801147, "grad_norm": 0.1916825920343399, "learning_rate": 1.6382441836448203e-05} +{"ts": "2025-12-22T16:38:11", "event": "train_log", "step": 233, "epoch": 0.7122659533817348, "progress_pct": 35.52, "epoch_pct": 35.61, "eta": "09:17:36", "max_grad_norm": 1.0, "loss": 0.6957600116729736, "grad_norm": 0.20505093038082123, "learning_rate": 1.6341359943809626e-05} +{"ts": "2025-12-22T16:39:30", "event": "train_log", "step": 234, "epoch": 0.715322888803974, "progress_pct": 35.67, "epoch_pct": 35.77, "eta": "09:16:18", "max_grad_norm": 1.0, "loss": 0.6724053025245667, "grad_norm": 0.19968082010746002, "learning_rate": 1.6300098256381807e-05} +{"ts": "2025-12-22T16:40:47", "event": "train_log", "step": 235, "epoch": 0.7183798242262133, "progress_pct": 35.82, "epoch_pct": 35.92, "eta": "09:14:53", "max_grad_norm": 1.0, "loss": 0.774741530418396, "grad_norm": 0.19768832623958588, "learning_rate": 1.625865794404573e-05} +{"ts": "2025-12-22T16:42:01", "event": "train_log", "step": 236, "epoch": 0.7214367596484524, "progress_pct": 35.98, "epoch_pct": 36.07, "eta": "09:13:26", "max_grad_norm": 1.0, "loss": 0.6658651828765869, "grad_norm": 0.19257694482803345, "learning_rate": 1.621704018174688e-05} +{"ts": "2025-12-22T16:43:16", "event": "train_log", "step": 237, "epoch": 0.7244936950706916, "progress_pct": 36.13, "epoch_pct": 36.22, "eta": "09:12:00", "max_grad_norm": 1.0, "loss": 0.810744047164917, "grad_norm": 0.21594858169555664, "learning_rate": 1.617524614946192e-05} +{"ts": "2025-12-22T16:44:33", "event": "train_log", "step": 238, "epoch": 0.7275506304929308, "progress_pct": 36.28, "epoch_pct": 36.38, "eta": "09:10:37", "max_grad_norm": 1.0, "loss": 0.7623897194862366, "grad_norm": 0.2107633650302887, "learning_rate": 1.6133277032165264e-05} +{"ts": "2025-12-22T16:45:47", "event": "train_log", "step": 239, "epoch": 0.7306075659151701, "progress_pct": 36.43, "epoch_pct": 36.53, "eta": "09:09:09", "max_grad_norm": 1.0, "loss": 0.7082816362380981, "grad_norm": 0.20114055275917053, "learning_rate": 1.6091134019795447e-05} +{"ts": "2025-12-22T16:47:03", "event": "train_log", "step": 240, "epoch": 0.7336645013374092, "progress_pct": 36.59, "epoch_pct": 36.68, "eta": "09:07:45", "max_grad_norm": 1.0, "loss": 0.7051193714141846, "grad_norm": 0.2542732059955597, "learning_rate": 1.604881830722141e-05} +{"ts": "2025-12-22T16:48:20", "event": "train_log", "step": 241, "epoch": 0.7367214367596484, "progress_pct": 36.74, "epoch_pct": 36.84, "eta": "09:06:23", "max_grad_norm": 1.0, "loss": 0.7895385026931763, "grad_norm": 0.19180485606193542, "learning_rate": 1.600633109420861e-05} +{"ts": "2025-12-22T16:49:34", "event": "train_log", "step": 242, "epoch": 0.7397783721818877, "progress_pct": 36.89, "epoch_pct": 36.99, "eta": "09:04:56", "max_grad_norm": 1.0, "loss": 0.7146293520927429, "grad_norm": 0.368756502866745, "learning_rate": 1.5963673585385016e-05} +{"ts": "2025-12-22T16:50:50", "event": "train_log", "step": 243, "epoch": 0.7428353076041269, "progress_pct": 37.04, "epoch_pct": 37.14, "eta": "09:03:31", "max_grad_norm": 1.0, "loss": 0.650428056716919, "grad_norm": 0.18490125238895416, "learning_rate": 1.5920846990206934e-05} +{"ts": "2025-12-22T16:52:05", "event": "train_log", "step": 244, "epoch": 0.745892243026366, "progress_pct": 37.2, "epoch_pct": 37.29, "eta": "09:02:05", "max_grad_norm": 1.0, "loss": 0.6367110013961792, "grad_norm": 0.23592503368854523, "learning_rate": 1.5877852522924733e-05} +{"ts": "2025-12-22T16:53:20", "event": "train_log", "step": 245, "epoch": 0.7489491784486053, "progress_pct": 37.35, "epoch_pct": 37.45, "eta": "09:00:40", "max_grad_norm": 1.0, "loss": 0.6563615798950195, "grad_norm": 0.20223264396190643, "learning_rate": 1.5834691402548415e-05} +{"ts": "2025-12-22T16:54:34", "event": "train_log", "step": 246, "epoch": 0.7520061138708445, "progress_pct": 37.5, "epoch_pct": 37.6, "eta": "08:59:13", "max_grad_norm": 1.0, "loss": 0.7361881136894226, "grad_norm": 0.27459946274757385, "learning_rate": 1.5791364852813047e-05} +{"ts": "2025-12-22T16:55:52", "event": "train_log", "step": 247, "epoch": 0.7550630492930837, "progress_pct": 37.65, "epoch_pct": 37.75, "eta": "08:57:53", "max_grad_norm": 1.0, "loss": 0.7373813390731812, "grad_norm": 0.21085411310195923, "learning_rate": 1.5747874102144073e-05} +{"ts": "2025-12-22T16:57:05", "event": "train_log", "step": 248, "epoch": 0.7581199847153229, "progress_pct": 37.8, "epoch_pct": 37.91, "eta": "08:56:23", "max_grad_norm": 1.0, "loss": 0.6971457004547119, "grad_norm": 0.23332320153713226, "learning_rate": 1.5704220383622464e-05} +{"ts": "2025-12-22T16:58:22", "event": "train_log", "step": 249, "epoch": 0.7611769201375621, "progress_pct": 37.96, "epoch_pct": 38.06, "eta": "08:55:02", "max_grad_norm": 1.0, "loss": 0.6756627559661865, "grad_norm": 0.23525936901569366, "learning_rate": 1.5660404934949798e-05} +{"ts": "2025-12-22T16:59:40", "event": "train_log", "step": 250, "epoch": 0.7642338555598013, "progress_pct": 38.11, "epoch_pct": 38.21, "eta": "08:53:42", "max_grad_norm": 1.0, "loss": 0.7029792666435242, "grad_norm": 0.2150791585445404, "learning_rate": 1.5616428998413122e-05} +{"ts": "2025-12-22T17:14:18", "event": "train_log", "step": 250, "epoch": 0.7642338555598013, "progress_pct": 38.11, "epoch_pct": 38.21, "eta": "09:17:27", "max_grad_norm": 1.0, "eval_loss": 0.7269901633262634, "eval_runtime": 877.665, "eval_samples_per_second": 0.687, "eval_steps_per_second": 0.687} +{"ts": "2025-12-22T17:15:32", "event": "train_log", "step": 251, "epoch": 0.7672907909820404, "progress_pct": 38.26, "epoch_pct": 38.36, "eta": "09:15:51", "max_grad_norm": 1.0, "loss": 0.715162992477417, "grad_norm": 0.19510552287101746, "learning_rate": 1.5572293820849754e-05} +{"ts": "2025-12-22T17:16:45", "event": "train_log", "step": 252, "epoch": 0.7703477264042797, "progress_pct": 38.41, "epoch_pct": 38.52, "eta": "09:14:14", "max_grad_norm": 1.0, "loss": 0.634660542011261, "grad_norm": 0.25246763229370117, "learning_rate": 1.5528000653611935e-05} +{"ts": "2025-12-22T17:17:58", "event": "train_log", "step": 253, "epoch": 0.7734046618265189, "progress_pct": 38.57, "epoch_pct": 38.67, "eta": "09:12:37", "max_grad_norm": 1.0, "loss": 0.7154463529586792, "grad_norm": 0.2980027496814728, "learning_rate": 1.5483550752531337e-05} +{"ts": "2025-12-22T17:19:13", "event": "train_log", "step": 254, "epoch": 0.7764615972487581, "progress_pct": 38.72, "epoch_pct": 38.82, "eta": "09:11:02", "max_grad_norm": 1.0, "loss": 0.8110946416854858, "grad_norm": 0.2730556130409241, "learning_rate": 1.5438945377883463e-05} +{"ts": "2025-12-22T17:20:26", "event": "train_log", "step": 255, "epoch": 0.7795185326709974, "progress_pct": 38.87, "epoch_pct": 38.98, "eta": "09:09:26", "max_grad_norm": 1.0, "loss": 0.72202467918396, "grad_norm": 0.17258886992931366, "learning_rate": 1.5394185794351914e-05} +{"ts": "2025-12-22T17:21:39", "event": "train_log", "step": 256, "epoch": 0.7825754680932365, "progress_pct": 39.02, "epoch_pct": 39.13, "eta": "09:07:49", "max_grad_norm": 1.0, "loss": 0.7368704080581665, "grad_norm": 0.19966280460357666, "learning_rate": 1.5349273270992537e-05} +{"ts": "2025-12-22T17:22:53", "event": "train_log", "step": 257, "epoch": 0.7856324035154757, "progress_pct": 39.18, "epoch_pct": 39.28, "eta": "09:06:14", "max_grad_norm": 1.0, "loss": 0.7429723143577576, "grad_norm": 0.23305682837963104, "learning_rate": 1.5304209081197425e-05} +{"ts": "2025-12-22T17:24:06", "event": "train_log", "step": 258, "epoch": 0.788689338937715, "progress_pct": 39.33, "epoch_pct": 39.43, "eta": "09:04:38", "max_grad_norm": 1.0, "loss": 0.6498424410820007, "grad_norm": 0.21786810457706451, "learning_rate": 1.5258994502658846e-05} +{"ts": "2025-12-22T17:25:19", "event": "train_log", "step": 259, "epoch": 0.7917462743599541, "progress_pct": 39.48, "epoch_pct": 39.59, "eta": "09:03:03", "max_grad_norm": 1.0, "loss": 0.7379459142684937, "grad_norm": 0.2370925396680832, "learning_rate": 1.5213630817332985e-05} +{"ts": "2025-12-22T17:26:32", "event": "train_log", "step": 260, "epoch": 0.7948032097821933, "progress_pct": 39.63, "epoch_pct": 39.74, "eta": "09:01:26", "max_grad_norm": 1.0, "loss": 0.6742876172065735, "grad_norm": 0.25566384196281433, "learning_rate": 1.5168119311403611e-05} +{"ts": "2025-12-22T17:27:45", "event": "train_log", "step": 261, "epoch": 0.7978601452044326, "progress_pct": 39.79, "epoch_pct": 39.89, "eta": "08:59:51", "max_grad_norm": 1.0, "loss": 0.72329181432724, "grad_norm": 0.2171633243560791, "learning_rate": 1.512246127524561e-05} +{"ts": "2025-12-22T17:28:59", "event": "train_log", "step": 262, "epoch": 0.8009170806266718, "progress_pct": 39.94, "epoch_pct": 40.05, "eta": "08:58:17", "max_grad_norm": 1.0, "loss": 0.765812873840332, "grad_norm": 0.23292019963264465, "learning_rate": 1.50766580033884e-05} +{"ts": "2025-12-22T17:30:12", "event": "train_log", "step": 263, "epoch": 0.8039740160489109, "progress_pct": 40.09, "epoch_pct": 40.2, "eta": "08:56:42", "max_grad_norm": 1.0, "loss": 0.7872639298439026, "grad_norm": 0.19427980482578278, "learning_rate": 1.5030710794479226e-05} +{"ts": "2025-12-22T17:31:25", "event": "train_log", "step": 264, "epoch": 0.8070309514711502, "progress_pct": 40.24, "epoch_pct": 40.35, "eta": "08:55:06", "max_grad_norm": 1.0, "loss": 0.6940722465515137, "grad_norm": 0.2460346817970276, "learning_rate": 1.4984620951246333e-05} +{"ts": "2025-12-22T17:32:41", "event": "train_log", "step": 265, "epoch": 0.8100878868933894, "progress_pct": 40.4, "epoch_pct": 40.5, "eta": "08:53:36", "max_grad_norm": 1.0, "loss": 0.7680137157440186, "grad_norm": 0.2493411898612976, "learning_rate": 1.4938389780462044e-05} +{"ts": "2025-12-22T17:33:58", "event": "train_log", "step": 266, "epoch": 0.8131448223156286, "progress_pct": 40.55, "epoch_pct": 40.66, "eta": "08:52:06", "max_grad_norm": 1.0, "loss": 0.6780916452407837, "grad_norm": 0.23873573541641235, "learning_rate": 1.4892018592905702e-05} +{"ts": "2025-12-22T17:35:13", "event": "train_log", "step": 267, "epoch": 0.8162017577378677, "progress_pct": 40.7, "epoch_pct": 40.81, "eta": "08:50:35", "max_grad_norm": 1.0, "loss": 0.7183764576911926, "grad_norm": 0.2580571174621582, "learning_rate": 1.4845508703326504e-05} +{"ts": "2025-12-22T17:36:34", "event": "train_log", "step": 268, "epoch": 0.819258693160107, "progress_pct": 40.85, "epoch_pct": 40.96, "eta": "08:49:12", "max_grad_norm": 1.0, "loss": 0.8207096457481384, "grad_norm": 0.2125079482793808, "learning_rate": 1.4798861430406221e-05} +{"ts": "2025-12-22T17:37:52", "event": "train_log", "step": 269, "epoch": 0.8223156285823462, "progress_pct": 41.01, "epoch_pct": 41.12, "eta": "08:47:45", "max_grad_norm": 1.0, "loss": 0.7414214611053467, "grad_norm": 0.21065691113471985, "learning_rate": 1.4752078096721827e-05} +{"ts": "2025-12-22T17:39:11", "event": "train_log", "step": 270, "epoch": 0.8253725640045854, "progress_pct": 41.16, "epoch_pct": 41.27, "eta": "08:46:19", "max_grad_norm": 1.0, "loss": 0.7086384296417236, "grad_norm": 0.25807511806488037, "learning_rate": 1.4705160028707976e-05} +{"ts": "2025-12-22T17:40:26", "event": "train_log", "step": 271, "epoch": 0.8284294994268246, "progress_pct": 41.31, "epoch_pct": 41.42, "eta": "08:44:47", "max_grad_norm": 1.0, "loss": 0.7065964937210083, "grad_norm": 0.2444671094417572, "learning_rate": 1.4658108556619417e-05} +{"ts": "2025-12-22T17:41:47", "event": "train_log", "step": 272, "epoch": 0.8314864348490638, "progress_pct": 41.46, "epoch_pct": 41.57, "eta": "08:43:24", "max_grad_norm": 1.0, "loss": 0.7533905506134033, "grad_norm": 0.200303316116333, "learning_rate": 1.461092501449326e-05} +{"ts": "2025-12-22T17:43:06", "event": "train_log", "step": 273, "epoch": 0.834543370271303, "progress_pct": 41.62, "epoch_pct": 41.73, "eta": "08:41:58", "max_grad_norm": 1.0, "loss": 0.756553053855896, "grad_norm": 0.2807226777076721, "learning_rate": 1.4563610740111163e-05} +{"ts": "2025-12-22T17:44:20", "event": "train_log", "step": 274, "epoch": 0.8376003056935423, "progress_pct": 41.77, "epoch_pct": 41.88, "eta": "08:40:26", "max_grad_norm": 1.0, "loss": 0.8125098347663879, "grad_norm": 0.2516884207725525, "learning_rate": 1.4516167074961394e-05} +{"ts": "2025-12-22T17:45:36", "event": "train_log", "step": 275, "epoch": 0.8406572411157814, "progress_pct": 41.92, "epoch_pct": 42.03, "eta": "08:38:56", "max_grad_norm": 1.0, "loss": 0.7360811829566956, "grad_norm": 0.22799813747406006, "learning_rate": 1.4468595364200808e-05} +{"ts": "2025-12-22T17:46:53", "event": "train_log", "step": 276, "epoch": 0.8437141765380206, "progress_pct": 42.07, "epoch_pct": 42.19, "eta": "08:37:28", "max_grad_norm": 1.0, "loss": 0.7135312557220459, "grad_norm": 0.27390384674072266, "learning_rate": 1.4420896956616698e-05} +{"ts": "2025-12-22T17:48:10", "event": "train_log", "step": 277, "epoch": 0.8467711119602599, "progress_pct": 42.23, "epoch_pct": 42.34, "eta": "08:35:59", "max_grad_norm": 1.0, "loss": 0.7489083409309387, "grad_norm": 0.2811775505542755, "learning_rate": 1.4373073204588556e-05} +{"ts": "2025-12-22T17:49:23", "event": "train_log", "step": 278, "epoch": 0.8498280473824991, "progress_pct": 42.38, "epoch_pct": 42.49, "eta": "08:34:26", "max_grad_norm": 1.0, "loss": 0.752477765083313, "grad_norm": 0.2652314603328705, "learning_rate": 1.4325125464049725e-05} +{"ts": "2025-12-22T17:50:41", "event": "train_log", "step": 279, "epoch": 0.8528849828047382, "progress_pct": 42.53, "epoch_pct": 42.64, "eta": "08:33:00", "max_grad_norm": 1.0, "loss": 0.6534979939460754, "grad_norm": 0.2218960076570511, "learning_rate": 1.427705509444897e-05} +{"ts": "2025-12-22T17:51:56", "event": "train_log", "step": 280, "epoch": 0.8559419182269774, "progress_pct": 42.68, "epoch_pct": 42.8, "eta": "08:31:30", "max_grad_norm": 1.0, "loss": 0.7061883211135864, "grad_norm": 0.23746474087238312, "learning_rate": 1.4228863458711915e-05} +{"ts": "2025-12-22T17:53:11", "event": "train_log", "step": 281, "epoch": 0.8589988536492167, "progress_pct": 42.84, "epoch_pct": 42.95, "eta": "08:29:58", "max_grad_norm": 1.0, "loss": 0.7044329643249512, "grad_norm": 0.21507228910923004, "learning_rate": 1.4180551923202406e-05} +{"ts": "2025-12-22T17:54:24", "event": "train_log", "step": 282, "epoch": 0.8620557890714559, "progress_pct": 42.99, "epoch_pct": 43.1, "eta": "08:28:25", "max_grad_norm": 1.0, "loss": 0.706013023853302, "grad_norm": 0.2412186861038208, "learning_rate": 1.4132121857683782e-05} +{"ts": "2025-12-22T17:55:42", "event": "train_log", "step": 283, "epoch": 0.865112724493695, "progress_pct": 43.14, "epoch_pct": 43.26, "eta": "08:26:59", "max_grad_norm": 1.0, "loss": 0.6572445631027222, "grad_norm": 0.2832106947898865, "learning_rate": 1.4083574635280029e-05} +{"ts": "2025-12-22T17:57:01", "event": "train_log", "step": 284, "epoch": 0.8681696599159343, "progress_pct": 43.29, "epoch_pct": 43.41, "eta": "08:25:35", "max_grad_norm": 1.0, "loss": 0.675041139125824, "grad_norm": 0.21925900876522064, "learning_rate": 1.403491163243684e-05} +{"ts": "2025-12-22T17:58:18", "event": "train_log", "step": 285, "epoch": 0.8712265953381735, "progress_pct": 43.45, "epoch_pct": 43.56, "eta": "08:24:07", "max_grad_norm": 1.0, "loss": 0.7474229335784912, "grad_norm": 0.22488665580749512, "learning_rate": 1.3986134228882607e-05} +{"ts": "2025-12-22T17:59:32", "event": "train_log", "step": 286, "epoch": 0.8742835307604127, "progress_pct": 43.6, "epoch_pct": 43.71, "eta": "08:22:36", "max_grad_norm": 1.0, "loss": 0.7394901514053345, "grad_norm": 0.2221737653017044, "learning_rate": 1.3937243807589291e-05} +{"ts": "2025-12-22T18:00:48", "event": "train_log", "step": 287, "epoch": 0.8773404661826519, "progress_pct": 43.75, "epoch_pct": 43.87, "eta": "08:21:07", "max_grad_norm": 1.0, "loss": 0.7346636056900024, "grad_norm": 0.29034581780433655, "learning_rate": 1.388824175473321e-05} +{"ts": "2025-12-22T18:02:02", "event": "train_log", "step": 288, "epoch": 0.8803974016048911, "progress_pct": 43.9, "epoch_pct": 44.02, "eta": "08:19:36", "max_grad_norm": 1.0, "loss": 0.8125481009483337, "grad_norm": 0.2580259144306183, "learning_rate": 1.383912945965574e-05} +{"ts": "2025-12-22T18:03:20", "event": "train_log", "step": 289, "epoch": 0.8834543370271303, "progress_pct": 44.05, "epoch_pct": 44.17, "eta": "08:18:11", "max_grad_norm": 1.0, "loss": 0.6768131256103516, "grad_norm": 0.2533118724822998, "learning_rate": 1.3789908314823932e-05} +{"ts": "2025-12-22T18:04:37", "event": "train_log", "step": 290, "epoch": 0.8865112724493696, "progress_pct": 44.21, "epoch_pct": 44.33, "eta": "08:16:43", "max_grad_norm": 1.0, "loss": 0.7096269726753235, "grad_norm": 0.2074616551399231, "learning_rate": 1.3740579715791017e-05} +{"ts": "2025-12-22T18:05:52", "event": "train_log", "step": 291, "epoch": 0.8895682078716087, "progress_pct": 44.36, "epoch_pct": 44.48, "eta": "08:15:14", "max_grad_norm": 1.0, "loss": 0.6973364353179932, "grad_norm": 0.29789987206459045, "learning_rate": 1.3691145061156843e-05} +{"ts": "2025-12-22T18:07:09", "event": "train_log", "step": 292, "epoch": 0.8926251432938479, "progress_pct": 44.51, "epoch_pct": 44.63, "eta": "08:13:46", "max_grad_norm": 1.0, "loss": 0.7693608999252319, "grad_norm": 0.2937224805355072, "learning_rate": 1.3641605752528225e-05} +{"ts": "2025-12-22T18:08:25", "event": "train_log", "step": 293, "epoch": 0.8956820787160871, "progress_pct": 44.66, "epoch_pct": 44.78, "eta": "08:12:18", "max_grad_norm": 1.0, "loss": 0.6870795488357544, "grad_norm": 0.27355870604515076, "learning_rate": 1.3591963194479198e-05} +{"ts": "2025-12-22T18:09:43", "event": "train_log", "step": 294, "epoch": 0.8987390141383264, "progress_pct": 44.82, "epoch_pct": 44.94, "eta": "08:10:53", "max_grad_norm": 1.0, "loss": 0.7095532417297363, "grad_norm": 0.22792251408100128, "learning_rate": 1.3542218794511212e-05} +{"ts": "2025-12-22T18:11:01", "event": "train_log", "step": 295, "epoch": 0.9017959495605655, "progress_pct": 44.97, "epoch_pct": 45.09, "eta": "08:09:27", "max_grad_norm": 1.0, "loss": 0.7536489963531494, "grad_norm": 0.2855125665664673, "learning_rate": 1.3492373963013199e-05} +{"ts": "2025-12-22T18:12:16", "event": "train_log", "step": 296, "epoch": 0.9048528849828047, "progress_pct": 45.12, "epoch_pct": 45.24, "eta": "08:07:58", "max_grad_norm": 1.0, "loss": 0.7433043718338013, "grad_norm": 0.24969056248664856, "learning_rate": 1.3442430113221602e-05} +{"ts": "2025-12-22T18:13:32", "event": "train_log", "step": 297, "epoch": 0.907909820405044, "progress_pct": 45.27, "epoch_pct": 45.4, "eta": "08:06:31", "max_grad_norm": 1.0, "loss": 0.7204138040542603, "grad_norm": 0.24534980952739716, "learning_rate": 1.3392388661180303e-05} +{"ts": "2025-12-22T18:14:51", "event": "train_log", "step": 298, "epoch": 0.9109667558272831, "progress_pct": 45.43, "epoch_pct": 45.55, "eta": "08:05:07", "max_grad_norm": 1.0, "loss": 0.7114053964614868, "grad_norm": 0.2540739178657532, "learning_rate": 1.3342251025700474e-05} +{"ts": "2025-12-22T18:16:09", "event": "train_log", "step": 299, "epoch": 0.9140236912495223, "progress_pct": 45.58, "epoch_pct": 45.7, "eta": "08:03:41", "max_grad_norm": 1.0, "loss": 0.7337151169776917, "grad_norm": 0.2494630217552185, "learning_rate": 1.3292018628320346e-05} +{"ts": "2025-12-22T18:17:25", "event": "train_log", "step": 300, "epoch": 0.9170806266717616, "progress_pct": 45.73, "epoch_pct": 45.85, "eta": "08:02:14", "max_grad_norm": 1.0, "loss": 0.7486672401428223, "grad_norm": 0.3079741597175598, "learning_rate": 1.3241692893264909e-05} +{"ts": "2025-12-22T18:32:07", "event": "train_log", "step": 300, "epoch": 0.9170806266717616, "progress_pct": 45.73, "epoch_pct": 45.85, "eta": "08:19:41", "max_grad_norm": 1.0, "eval_loss": 0.7063615918159485, "eval_runtime": 882.246, "eval_samples_per_second": 0.683, "eval_steps_per_second": 0.683} +{"ts": "2025-12-22T18:33:24", "event": "train_log", "step": 301, "epoch": 0.9201375620940008, "progress_pct": 45.88, "epoch_pct": 46.01, "eta": "08:18:08", "max_grad_norm": 1.0, "loss": 0.7614796161651611, "grad_norm": 0.23425859212875366, "learning_rate": 1.3191275247405525e-05} +{"ts": "2025-12-22T18:34:42", "event": "train_log", "step": 302, "epoch": 0.9231944975162399, "progress_pct": 46.04, "epoch_pct": 46.16, "eta": "08:16:36", "max_grad_norm": 1.0, "loss": 0.7109901309013367, "grad_norm": 0.22468142211437225, "learning_rate": 1.314076712021949e-05} +{"ts": "2025-12-22T18:35:55", "event": "train_log", "step": 303, "epoch": 0.9262514329384792, "progress_pct": 46.19, "epoch_pct": 46.31, "eta": "08:14:59", "max_grad_norm": 1.0, "loss": 0.6816924810409546, "grad_norm": 0.4165630042552948, "learning_rate": 1.3090169943749475e-05} +{"ts": "2025-12-22T18:37:15", "event": "train_log", "step": 304, "epoch": 0.9293083683607184, "progress_pct": 46.34, "epoch_pct": 46.47, "eta": "08:13:30", "max_grad_norm": 1.0, "loss": 0.7403143644332886, "grad_norm": 0.2934052646160126, "learning_rate": 1.3039485152562951e-05} +{"ts": "2025-12-22T18:38:33", "event": "train_log", "step": 305, "epoch": 0.9323653037829576, "progress_pct": 46.49, "epoch_pct": 46.62, "eta": "08:11:59", "max_grad_norm": 1.0, "loss": 0.7116130590438843, "grad_norm": 0.24021990597248077, "learning_rate": 1.2988714183711504e-05} +{"ts": "2025-12-22T18:39:49", "event": "train_log", "step": 306, "epoch": 0.9354222392051967, "progress_pct": 46.65, "epoch_pct": 46.77, "eta": "08:10:26", "max_grad_norm": 1.0, "loss": 0.745186984539032, "grad_norm": 0.25670015811920166, "learning_rate": 1.2937858476690089e-05} +{"ts": "2025-12-22T18:41:01", "event": "train_log", "step": 307, "epoch": 0.938479174627436, "progress_pct": 46.8, "epoch_pct": 46.92, "eta": "08:08:48", "max_grad_norm": 1.0, "loss": 0.811728298664093, "grad_norm": 0.3273049592971802, "learning_rate": 1.2886919473396212e-05} +{"ts": "2025-12-22T18:42:18", "event": "train_log", "step": 308, "epoch": 0.9415361100496752, "progress_pct": 46.95, "epoch_pct": 47.08, "eta": "08:07:16", "max_grad_norm": 1.0, "loss": 0.6898178458213806, "grad_norm": 0.295612633228302, "learning_rate": 1.2835898618089064e-05} +{"ts": "2025-12-22T18:43:38", "event": "train_log", "step": 309, "epoch": 0.9445930454719144, "progress_pct": 47.1, "epoch_pct": 47.23, "eta": "08:05:47", "max_grad_norm": 1.0, "loss": 0.7637606263160706, "grad_norm": 0.22936004400253296, "learning_rate": 1.2784797357348562e-05} +{"ts": "2025-12-22T18:44:54", "event": "train_log", "step": 310, "epoch": 0.9476499808941536, "progress_pct": 47.26, "epoch_pct": 47.38, "eta": "08:04:15", "max_grad_norm": 1.0, "loss": 0.6364520788192749, "grad_norm": 0.2491123378276825, "learning_rate": 1.2733617140034329e-05} +{"ts": "2025-12-22T18:46:15", "event": "train_log", "step": 311, "epoch": 0.9507069163163928, "progress_pct": 47.41, "epoch_pct": 47.54, "eta": "08:02:48", "max_grad_norm": 1.0, "loss": 0.7065365314483643, "grad_norm": 0.29433801770210266, "learning_rate": 1.268235941724463e-05} +{"ts": "2025-12-22T18:47:31", "event": "train_log", "step": 312, "epoch": 0.953763851738632, "progress_pct": 47.56, "epoch_pct": 47.69, "eta": "08:01:14", "max_grad_norm": 1.0, "loss": 0.73712158203125, "grad_norm": 0.25174376368522644, "learning_rate": 1.2631025642275212e-05} +{"ts": "2025-12-22T18:48:49", "event": "train_log", "step": 313, "epoch": 0.9568207871608713, "progress_pct": 47.71, "epoch_pct": 47.84, "eta": "07:59:44", "max_grad_norm": 1.0, "loss": 0.6926214694976807, "grad_norm": 0.3259194493293762, "learning_rate": 1.257961727057812e-05} +{"ts": "2025-12-22T18:50:04", "event": "train_log", "step": 314, "epoch": 0.9598777225831104, "progress_pct": 47.87, "epoch_pct": 47.99, "eta": "07:58:10", "max_grad_norm": 1.0, "loss": 0.7626583576202393, "grad_norm": 0.31702667474746704, "learning_rate": 1.2528135759720403e-05} +{"ts": "2025-12-22T18:51:18", "event": "train_log", "step": 315, "epoch": 0.9629346580053496, "progress_pct": 48.02, "epoch_pct": 48.15, "eta": "07:56:35", "max_grad_norm": 1.0, "loss": 0.7628929018974304, "grad_norm": 0.24691395461559296, "learning_rate": 1.2476582569342819e-05} +{"ts": "2025-12-22T18:52:32", "event": "train_log", "step": 316, "epoch": 0.9659915934275889, "progress_pct": 48.17, "epoch_pct": 48.3, "eta": "07:55:01", "max_grad_norm": 1.0, "loss": 0.7070521116256714, "grad_norm": 0.2896668314933777, "learning_rate": 1.2424959161118425e-05} +{"ts": "2025-12-22T18:53:45", "event": "train_log", "step": 317, "epoch": 0.9690485288498281, "progress_pct": 48.32, "epoch_pct": 48.45, "eta": "07:53:26", "max_grad_norm": 1.0, "loss": 0.7804452180862427, "grad_norm": 0.2587420642375946, "learning_rate": 1.2373266998711152e-05} +{"ts": "2025-12-22T18:54:57", "event": "train_log", "step": 318, "epoch": 0.9721054642720672, "progress_pct": 48.48, "epoch_pct": 48.61, "eta": "07:51:50", "max_grad_norm": 1.0, "loss": 0.7271901369094849, "grad_norm": 0.28757819533348083, "learning_rate": 1.232150754773429e-05} +{"ts": "2025-12-22T18:56:11", "event": "train_log", "step": 319, "epoch": 0.9751623996943064, "progress_pct": 48.63, "epoch_pct": 48.76, "eta": "07:50:16", "max_grad_norm": 1.0, "loss": 0.6629395484924316, "grad_norm": 0.2600923478603363, "learning_rate": 1.2269682275708951e-05} +{"ts": "2025-12-22T18:57:24", "event": "train_log", "step": 320, "epoch": 0.9782193351165457, "progress_pct": 48.78, "epoch_pct": 48.91, "eta": "07:48:41", "max_grad_norm": 1.0, "loss": 0.7750409841537476, "grad_norm": 0.3455665111541748, "learning_rate": 1.2217792652022452e-05} +{"ts": "2025-12-22T18:58:37", "event": "train_log", "step": 321, "epoch": 0.9812762705387849, "progress_pct": 48.93, "epoch_pct": 49.06, "eta": "07:47:06", "max_grad_norm": 1.0, "loss": 0.6742854118347168, "grad_norm": 0.27122899889945984, "learning_rate": 1.2165840147886656e-05} +{"ts": "2025-12-22T18:59:51", "event": "train_log", "step": 322, "epoch": 0.984333205961024, "progress_pct": 49.09, "epoch_pct": 49.22, "eta": "07:45:32", "max_grad_norm": 1.0, "loss": 0.7265107035636902, "grad_norm": 0.2357456535100937, "learning_rate": 1.2113826236296245e-05} +{"ts": "2025-12-22T19:01:04", "event": "train_log", "step": 323, "epoch": 0.9873901413832633, "progress_pct": 49.24, "epoch_pct": 49.37, "eta": "07:43:58", "max_grad_norm": 1.0, "loss": 0.7203768491744995, "grad_norm": 0.21315616369247437, "learning_rate": 1.2061752391986982e-05} +{"ts": "2025-12-22T19:02:17", "event": "train_log", "step": 324, "epoch": 0.9904470768055025, "progress_pct": 49.39, "epoch_pct": 49.52, "eta": "07:42:23", "max_grad_norm": 1.0, "loss": 0.8011739253997803, "grad_norm": 0.24696163833141327, "learning_rate": 1.2009620091393885e-05} +{"ts": "2025-12-22T19:03:30", "event": "train_log", "step": 325, "epoch": 0.9935040122277417, "progress_pct": 49.54, "epoch_pct": 49.68, "eta": "07:40:49", "max_grad_norm": 1.0, "loss": 0.7316861152648926, "grad_norm": 0.246279776096344, "learning_rate": 1.1957430812609361e-05} +{"ts": "2025-12-22T19:04:44", "event": "train_log", "step": 326, "epoch": 0.9965609476499809, "progress_pct": 49.7, "epoch_pct": 49.83, "eta": "07:39:16", "max_grad_norm": 1.0, "loss": 0.6602386236190796, "grad_norm": 0.26160112023353577, "learning_rate": 1.1905186035341304e-05} +{"ts": "2025-12-22T19:05:57", "event": "train_log", "step": 327, "epoch": 0.9996178830722201, "progress_pct": 49.85, "epoch_pct": 49.98, "eta": "07:37:41", "max_grad_norm": 1.0, "loss": 0.7162635326385498, "grad_norm": 0.27144137024879456, "learning_rate": 1.1852887240871145e-05} +{"ts": "2025-12-22T19:06:06", "event": "train_log", "step": 328, "epoch": 1.0, "progress_pct": 50.0, "epoch_pct": 50.0, "eta": "07:35:04", "max_grad_norm": 1.0, "loss": 0.6108165383338928, "grad_norm": 0.6650471091270447, "learning_rate": 1.1800535912011846e-05} +{"ts": "2025-12-22T19:07:20", "event": "train_log", "step": 329, "epoch": 1.0030569354222392, "progress_pct": 50.15, "epoch_pct": 50.15, "eta": "07:33:31", "max_grad_norm": 1.0, "loss": 0.6724814176559448, "grad_norm": 0.25604233145713806, "learning_rate": 1.1748133533065864e-05} +{"ts": "2025-12-22T19:08:33", "event": "train_log", "step": 330, "epoch": 1.0061138708444783, "progress_pct": 50.3, "epoch_pct": 50.31, "eta": "07:31:57", "max_grad_norm": 1.0, "loss": 0.7010799050331116, "grad_norm": 0.30289238691329956, "learning_rate": 1.1695681589783065e-05} +{"ts": "2025-12-22T19:09:48", "event": "train_log", "step": 331, "epoch": 1.0091708062667175, "progress_pct": 50.46, "epoch_pct": 50.46, "eta": "07:30:27", "max_grad_norm": 1.0, "loss": 0.7199532985687256, "grad_norm": 0.28697144985198975, "learning_rate": 1.1643181569318596e-05} +{"ts": "2025-12-22T19:11:03", "event": "train_log", "step": 332, "epoch": 1.012227741688957, "progress_pct": 50.61, "epoch_pct": 50.61, "eta": "07:28:55", "max_grad_norm": 1.0, "loss": 0.6887974143028259, "grad_norm": 0.26302677392959595, "learning_rate": 1.1590634960190722e-05} +{"ts": "2025-12-22T19:12:20", "event": "train_log", "step": 333, "epoch": 1.015284677111196, "progress_pct": 50.76, "epoch_pct": 50.76, "eta": "07:27:26", "max_grad_norm": 1.0, "loss": 0.7237250208854675, "grad_norm": 0.2987605631351471, "learning_rate": 1.1538043252238629e-05} +{"ts": "2025-12-22T19:13:40", "event": "train_log", "step": 334, "epoch": 1.0183416125334352, "progress_pct": 50.91, "epoch_pct": 50.92, "eta": "07:26:00", "max_grad_norm": 1.0, "loss": 0.7092999815940857, "grad_norm": 0.25947025418281555, "learning_rate": 1.1485407936580169e-05} +{"ts": "2025-12-22T19:14:58", "event": "train_log", "step": 335, "epoch": 1.0213985479556744, "progress_pct": 51.07, "epoch_pct": 51.07, "eta": "07:24:32", "max_grad_norm": 1.0, "loss": 0.6797397136688232, "grad_norm": 0.3119892477989197, "learning_rate": 1.1432730505569597e-05} +{"ts": "2025-12-22T19:16:13", "event": "train_log", "step": 336, "epoch": 1.0244554833779136, "progress_pct": 51.22, "epoch_pct": 51.22, "eta": "07:23:01", "max_grad_norm": 1.0, "loss": 0.7330094575881958, "grad_norm": 0.2772631347179413, "learning_rate": 1.1380012452755259e-05} +{"ts": "2025-12-22T19:17:32", "event": "train_log", "step": 337, "epoch": 1.0275124188001528, "progress_pct": 51.37, "epoch_pct": 51.38, "eta": "07:21:34", "max_grad_norm": 1.0, "loss": 0.711042582988739, "grad_norm": 0.34601089358329773, "learning_rate": 1.1327255272837221e-05} +{"ts": "2025-12-22T19:18:50", "event": "train_log", "step": 338, "epoch": 1.0305693542223922, "progress_pct": 51.52, "epoch_pct": 51.53, "eta": "07:20:07", "max_grad_norm": 1.0, "loss": 0.6593371033668518, "grad_norm": 0.30404818058013916, "learning_rate": 1.1274460461624925e-05} +{"ts": "2025-12-22T19:20:06", "event": "train_log", "step": 339, "epoch": 1.0336262896446313, "progress_pct": 51.68, "epoch_pct": 51.68, "eta": "07:18:37", "max_grad_norm": 1.0, "loss": 0.7230923175811768, "grad_norm": 0.249643474817276, "learning_rate": 1.1221629515994754e-05} +{"ts": "2025-12-22T19:21:20", "event": "train_log", "step": 340, "epoch": 1.0366832250668705, "progress_pct": 51.83, "epoch_pct": 51.83, "eta": "07:17:06", "max_grad_norm": 1.0, "loss": 0.6847513914108276, "grad_norm": 0.2772657871246338, "learning_rate": 1.1168763933847608e-05} +{"ts": "2025-12-22T19:22:37", "event": "train_log", "step": 341, "epoch": 1.0397401604891097, "progress_pct": 51.98, "epoch_pct": 51.99, "eta": "07:15:37", "max_grad_norm": 1.0, "loss": 0.673307478427887, "grad_norm": 0.3479171395301819, "learning_rate": 1.1115865214066414e-05} +{"ts": "2025-12-22T19:23:50", "event": "train_log", "step": 342, "epoch": 1.0427970959113488, "progress_pct": 52.13, "epoch_pct": 52.14, "eta": "07:14:05", "max_grad_norm": 1.0, "loss": 0.7529383897781372, "grad_norm": 0.3393602669239044, "learning_rate": 1.1062934856473655e-05} +{"ts": "2025-12-22T19:25:08", "event": "train_log", "step": 343, "epoch": 1.045854031333588, "progress_pct": 52.29, "epoch_pct": 52.29, "eta": "07:12:38", "max_grad_norm": 1.0, "loss": 0.6309706568717957, "grad_norm": 0.22780737280845642, "learning_rate": 1.1009974361788822e-05} +{"ts": "2025-12-22T19:26:24", "event": "train_log", "step": 344, "epoch": 1.0489109667558272, "progress_pct": 52.44, "epoch_pct": 52.45, "eta": "07:11:08", "max_grad_norm": 1.0, "loss": 0.6944005489349365, "grad_norm": 0.2966362237930298, "learning_rate": 1.095698523158588e-05} +{"ts": "2025-12-22T19:27:40", "event": "train_log", "step": 345, "epoch": 1.0519679021780666, "progress_pct": 52.59, "epoch_pct": 52.6, "eta": "07:09:39", "max_grad_norm": 1.0, "loss": 0.6714650392532349, "grad_norm": 0.27519309520721436, "learning_rate": 1.0903968968250682e-05} +{"ts": "2025-12-22T19:28:56", "event": "train_log", "step": 346, "epoch": 1.0550248376003057, "progress_pct": 52.74, "epoch_pct": 52.75, "eta": "07:08:10", "max_grad_norm": 1.0, "loss": 0.6740344762802124, "grad_norm": 0.36684176325798035, "learning_rate": 1.085092707493839e-05} +{"ts": "2025-12-22T19:30:10", "event": "train_log", "step": 347, "epoch": 1.058081773022545, "progress_pct": 52.9, "epoch_pct": 52.9, "eta": "07:06:40", "max_grad_norm": 1.0, "loss": 0.6590248942375183, "grad_norm": 0.35729631781578064, "learning_rate": 1.0797861055530832e-05} +{"ts": "2025-12-22T19:31:23", "event": "train_log", "step": 348, "epoch": 1.061138708444784, "progress_pct": 53.05, "epoch_pct": 53.06, "eta": "07:05:08", "max_grad_norm": 1.0, "loss": 0.7020372748374939, "grad_norm": 0.33536043763160706, "learning_rate": 1.0744772414593889e-05} +{"ts": "2025-12-22T19:32:37", "event": "train_log", "step": 349, "epoch": 1.0641956438670233, "progress_pct": 53.2, "epoch_pct": 53.21, "eta": "07:03:37", "max_grad_norm": 1.0, "loss": 0.7195531725883484, "grad_norm": 0.3144095838069916, "learning_rate": 1.0691662657334815e-05} +{"ts": "2025-12-22T19:33:51", "event": "train_log", "step": 350, "epoch": 1.0672525792892624, "progress_pct": 53.35, "epoch_pct": 53.36, "eta": "07:02:07", "max_grad_norm": 1.0, "loss": 0.6678342819213867, "grad_norm": 0.37244805693626404, "learning_rate": 1.0638533289559574e-05} +{"ts": "2025-12-22T19:48:26", "event": "train_log", "step": 350, "epoch": 1.0672525792892624, "progress_pct": 53.35, "epoch_pct": 53.36, "eta": "07:14:52", "max_grad_norm": 1.0, "eval_loss": 0.6917262673377991, "eval_runtime": 874.9693, "eval_samples_per_second": 0.689, "eval_steps_per_second": 0.689} +{"ts": "2025-12-22T19:49:43", "event": "train_log", "step": 351, "epoch": 1.0703095147115018, "progress_pct": 53.51, "epoch_pct": 53.52, "eta": "07:13:19", "max_grad_norm": 1.0, "loss": 0.6641817092895508, "grad_norm": 0.45918041467666626, "learning_rate": 1.0585385817630137e-05} +{"ts": "2025-12-22T19:51:02", "event": "train_log", "step": 352, "epoch": 1.073366450133741, "progress_pct": 53.66, "epoch_pct": 53.67, "eta": "07:11:49", "max_grad_norm": 1.0, "loss": 0.6774541139602661, "grad_norm": 0.4126392900943756, "learning_rate": 1.0532221748421786e-05} +{"ts": "2025-12-22T19:52:19", "event": "train_log", "step": 353, "epoch": 1.0764233855559802, "progress_pct": 53.81, "epoch_pct": 53.82, "eta": "07:10:17", "max_grad_norm": 1.0, "loss": 0.7386555075645447, "grad_norm": 0.5425148606300354, "learning_rate": 1.047904258928037e-05} +{"ts": "2025-12-22T19:53:34", "event": "train_log", "step": 354, "epoch": 1.0794803209782193, "progress_pct": 53.96, "epoch_pct": 53.97, "eta": "07:08:43", "max_grad_norm": 1.0, "loss": 0.7061327695846558, "grad_norm": 0.40561115741729736, "learning_rate": 1.0425849847979586e-05} +{"ts": "2025-12-22T19:54:51", "event": "train_log", "step": 355, "epoch": 1.0825372564004585, "progress_pct": 54.12, "epoch_pct": 54.13, "eta": "07:07:10", "max_grad_norm": 1.0, "loss": 0.7486766576766968, "grad_norm": 0.489343523979187, "learning_rate": 1.0372645032678215e-05} +{"ts": "2025-12-22T19:56:05", "event": "train_log", "step": 356, "epoch": 1.0855941918226977, "progress_pct": 54.27, "epoch_pct": 54.28, "eta": "07:05:36", "max_grad_norm": 1.0, "loss": 0.7111566066741943, "grad_norm": 0.7414161562919617, "learning_rate": 1.031942965187738e-05} +{"ts": "2025-12-22T19:57:24", "event": "train_log", "step": 357, "epoch": 1.0886511272449368, "progress_pct": 54.42, "epoch_pct": 54.43, "eta": "07:04:05", "max_grad_norm": 1.0, "loss": 0.7629879713058472, "grad_norm": 0.308473140001297, "learning_rate": 1.026620521437775e-05} +{"ts": "2025-12-22T19:58:41", "event": "train_log", "step": 358, "epoch": 1.0917080626671762, "progress_pct": 54.57, "epoch_pct": 54.59, "eta": "07:02:34", "max_grad_norm": 1.0, "loss": 0.7136012315750122, "grad_norm": 0.27350732684135437, "learning_rate": 1.0212973229236787e-05} +{"ts": "2025-12-22T19:59:57", "event": "train_log", "step": 359, "epoch": 1.0947649980894154, "progress_pct": 54.73, "epoch_pct": 54.74, "eta": "07:01:01", "max_grad_norm": 1.0, "loss": 0.6634767055511475, "grad_norm": 0.37481266260147095, "learning_rate": 1.0159735205725949e-05} +{"ts": "2025-12-22T20:01:10", "event": "train_log", "step": 360, "epoch": 1.0978219335116546, "progress_pct": 54.88, "epoch_pct": 54.89, "eta": "06:59:26", "max_grad_norm": 1.0, "loss": 0.6604923009872437, "grad_norm": 0.2903526723384857, "learning_rate": 1.0106492653287893e-05} +{"ts": "2025-12-22T20:02:26", "event": "train_log", "step": 361, "epoch": 1.1008788689338938, "progress_pct": 55.03, "epoch_pct": 55.04, "eta": "06:57:54", "max_grad_norm": 1.0, "loss": 0.6701731085777283, "grad_norm": 0.372989296913147, "learning_rate": 1.0053247081493684e-05} +{"ts": "2025-12-22T20:03:40", "event": "train_log", "step": 362, "epoch": 1.103935804356133, "progress_pct": 55.18, "epoch_pct": 55.2, "eta": "06:56:20", "max_grad_norm": 1.0, "loss": 0.6767977476119995, "grad_norm": 0.38386791944503784, "learning_rate": 1e-05} +{"ts": "2025-12-22T20:04:59", "event": "train_log", "step": 363, "epoch": 1.106992739778372, "progress_pct": 55.34, "epoch_pct": 55.35, "eta": "06:54:50", "max_grad_norm": 1.0, "loss": 0.5886228680610657, "grad_norm": 0.2837046682834625, "learning_rate": 9.946752918506319e-06} +{"ts": "2025-12-22T20:06:18", "event": "train_log", "step": 364, "epoch": 1.1100496752006115, "progress_pct": 55.49, "epoch_pct": 55.5, "eta": "06:53:20", "max_grad_norm": 1.0, "loss": 0.6662254929542542, "grad_norm": 0.3196772038936615, "learning_rate": 9.893507346712112e-06} +{"ts": "2025-12-22T20:07:37", "event": "train_log", "step": 365, "epoch": 1.1131066106228507, "progress_pct": 55.64, "epoch_pct": 55.66, "eta": "06:51:50", "max_grad_norm": 1.0, "loss": 0.6507357954978943, "grad_norm": 0.36623135209083557, "learning_rate": 9.840264794274053e-06} +{"ts": "2025-12-22T20:08:51", "event": "train_log", "step": 366, "epoch": 1.1161635460450898, "progress_pct": 55.79, "epoch_pct": 55.81, "eta": "06:50:17", "max_grad_norm": 1.0, "loss": 0.6636874675750732, "grad_norm": 0.2803555727005005, "learning_rate": 9.787026770763216e-06} +{"ts": "2025-12-22T20:10:06", "event": "train_log", "step": 367, "epoch": 1.119220481467329, "progress_pct": 55.95, "epoch_pct": 55.96, "eta": "06:48:44", "max_grad_norm": 1.0, "loss": 0.6378857493400574, "grad_norm": 0.329513818025589, "learning_rate": 9.733794785622254e-06} +{"ts": "2025-12-22T20:11:19", "event": "train_log", "step": 368, "epoch": 1.1222774168895682, "progress_pct": 56.1, "epoch_pct": 56.11, "eta": "06:47:10", "max_grad_norm": 1.0, "loss": 0.6794115900993347, "grad_norm": 0.24419358372688293, "learning_rate": 9.680570348122626e-06} +{"ts": "2025-12-22T20:12:37", "event": "train_log", "step": 369, "epoch": 1.1253343523118073, "progress_pct": 56.25, "epoch_pct": 56.27, "eta": "06:45:40", "max_grad_norm": 1.0, "loss": 0.6401248574256897, "grad_norm": 0.2971822917461395, "learning_rate": 9.627354967321785e-06} +{"ts": "2025-12-22T20:13:55", "event": "train_log", "step": 370, "epoch": 1.1283912877340465, "progress_pct": 56.4, "epoch_pct": 56.42, "eta": "06:44:10", "max_grad_norm": 1.0, "loss": 0.6886081695556641, "grad_norm": 0.5112190842628479, "learning_rate": 9.574150152020415e-06} +{"ts": "2025-12-22T20:15:16", "event": "train_log", "step": 371, "epoch": 1.131448223156286, "progress_pct": 56.55, "epoch_pct": 56.57, "eta": "06:42:42", "max_grad_norm": 1.0, "loss": 0.6842222213745117, "grad_norm": 0.4284913241863251, "learning_rate": 9.520957410719632e-06} +{"ts": "2025-12-22T20:16:33", "event": "train_log", "step": 372, "epoch": 1.134505158578525, "progress_pct": 56.71, "epoch_pct": 56.73, "eta": "06:41:12", "max_grad_norm": 1.0, "loss": 0.6238314509391785, "grad_norm": 0.34164664149284363, "learning_rate": 9.467778251578217e-06} +{"ts": "2025-12-22T20:17:47", "event": "train_log", "step": 373, "epoch": 1.1375620940007642, "progress_pct": 56.86, "epoch_pct": 56.88, "eta": "06:39:38", "max_grad_norm": 1.0, "loss": 0.6947107911109924, "grad_norm": 0.3294171392917633, "learning_rate": 9.414614182369862e-06} +{"ts": "2025-12-22T20:19:03", "event": "train_log", "step": 374, "epoch": 1.1406190294230034, "progress_pct": 57.01, "epoch_pct": 57.03, "eta": "06:38:07", "max_grad_norm": 1.0, "loss": 0.717319905757904, "grad_norm": 0.2544155418872833, "learning_rate": 9.361466710440428e-06} +{"ts": "2025-12-22T20:20:22", "event": "train_log", "step": 375, "epoch": 1.1436759648452426, "progress_pct": 57.16, "epoch_pct": 57.18, "eta": "06:36:38", "max_grad_norm": 1.0, "loss": 0.6222032904624939, "grad_norm": 0.3111848533153534, "learning_rate": 9.308337342665188e-06} +{"ts": "2025-12-22T20:21:36", "event": "train_log", "step": 376, "epoch": 1.1467329002674818, "progress_pct": 57.32, "epoch_pct": 57.34, "eta": "06:35:06", "max_grad_norm": 1.0, "loss": 0.6126186847686768, "grad_norm": 0.3157130777835846, "learning_rate": 9.255227585406116e-06} +{"ts": "2025-12-22T20:22:54", "event": "train_log", "step": 377, "epoch": 1.1497898356897212, "progress_pct": 57.47, "epoch_pct": 57.49, "eta": "06:33:36", "max_grad_norm": 1.0, "loss": 0.7452324032783508, "grad_norm": 0.29625123739242554, "learning_rate": 9.202138944469168e-06} +{"ts": "2025-12-22T20:24:08", "event": "train_log", "step": 378, "epoch": 1.1528467711119603, "progress_pct": 57.62, "epoch_pct": 57.64, "eta": "06:32:03", "max_grad_norm": 1.0, "loss": 0.715571403503418, "grad_norm": 0.31600719690322876, "learning_rate": 9.149072925061614e-06} +{"ts": "2025-12-22T20:25:26", "event": "train_log", "step": 379, "epoch": 1.1559037065341995, "progress_pct": 57.77, "epoch_pct": 57.8, "eta": "06:30:34", "max_grad_norm": 1.0, "loss": 0.7256120443344116, "grad_norm": 0.25878727436065674, "learning_rate": 9.096031031749321e-06} +{"ts": "2025-12-22T20:26:40", "event": "train_log", "step": 380, "epoch": 1.1589606419564387, "progress_pct": 57.93, "epoch_pct": 57.95, "eta": "06:29:02", "max_grad_norm": 1.0, "loss": 0.6728136539459229, "grad_norm": 0.4058121144771576, "learning_rate": 9.043014768414125e-06} +{"ts": "2025-12-22T20:27:58", "event": "train_log", "step": 381, "epoch": 1.1620175773786778, "progress_pct": 58.08, "epoch_pct": 58.1, "eta": "06:27:33", "max_grad_norm": 1.0, "loss": 0.6662668585777283, "grad_norm": 0.31269821524620056, "learning_rate": 8.99002563821118e-06} +{"ts": "2025-12-22T20:29:13", "event": "train_log", "step": 382, "epoch": 1.165074512800917, "progress_pct": 58.23, "epoch_pct": 58.25, "eta": "06:26:01", "max_grad_norm": 1.0, "loss": 0.6415850520133972, "grad_norm": 0.2512218654155731, "learning_rate": 8.937065143526349e-06} +{"ts": "2025-12-22T20:30:26", "event": "train_log", "step": 383, "epoch": 1.1681314482231562, "progress_pct": 58.38, "epoch_pct": 58.41, "eta": "06:24:29", "max_grad_norm": 1.0, "loss": 0.6695276498794556, "grad_norm": 0.3284171223640442, "learning_rate": 8.884134785933588e-06} +{"ts": "2025-12-22T20:31:44", "event": "train_log", "step": 384, "epoch": 1.1711883836453956, "progress_pct": 58.54, "epoch_pct": 58.56, "eta": "06:22:59", "max_grad_norm": 1.0, "loss": 0.7347006797790527, "grad_norm": 0.2994699478149414, "learning_rate": 8.831236066152397e-06} +{"ts": "2025-12-22T20:32:58", "event": "train_log", "step": 385, "epoch": 1.1742453190676347, "progress_pct": 58.69, "epoch_pct": 58.71, "eta": "06:21:27", "max_grad_norm": 1.0, "loss": 0.6707600951194763, "grad_norm": 0.2981257140636444, "learning_rate": 8.778370484005245e-06} +{"ts": "2025-12-22T20:34:10", "event": "train_log", "step": 386, "epoch": 1.177302254489874, "progress_pct": 58.84, "epoch_pct": 58.87, "eta": "06:19:54", "max_grad_norm": 1.0, "loss": 0.7245328426361084, "grad_norm": 0.2934776842594147, "learning_rate": 8.725539538375078e-06} +{"ts": "2025-12-22T20:35:25", "event": "train_log", "step": 387, "epoch": 1.180359189912113, "progress_pct": 58.99, "epoch_pct": 59.02, "eta": "06:18:23", "max_grad_norm": 1.0, "loss": 0.7029488682746887, "grad_norm": 0.33115988969802856, "learning_rate": 8.672744727162782e-06} +{"ts": "2025-12-22T20:36:39", "event": "train_log", "step": 388, "epoch": 1.1834161253343523, "progress_pct": 59.15, "epoch_pct": 59.17, "eta": "06:16:51", "max_grad_norm": 1.0, "loss": 0.6896190643310547, "grad_norm": 0.3322703540325165, "learning_rate": 8.619987547244746e-06} +{"ts": "2025-12-22T20:37:52", "event": "train_log", "step": 389, "epoch": 1.1864730607565914, "progress_pct": 59.3, "epoch_pct": 59.32, "eta": "06:15:19", "max_grad_norm": 1.0, "loss": 0.6859920620918274, "grad_norm": 0.29254966974258423, "learning_rate": 8.567269494430404e-06} +{"ts": "2025-12-22T20:39:05", "event": "train_log", "step": 390, "epoch": 1.1895299961788308, "progress_pct": 59.45, "epoch_pct": 59.48, "eta": "06:13:47", "max_grad_norm": 1.0, "loss": 0.6437527537345886, "grad_norm": 0.2923297584056854, "learning_rate": 8.514592063419833e-06} +{"ts": "2025-12-22T20:40:20", "event": "train_log", "step": 391, "epoch": 1.19258693160107, "progress_pct": 59.6, "epoch_pct": 59.63, "eta": "06:12:17", "max_grad_norm": 1.0, "loss": 0.7113338708877563, "grad_norm": 0.3074567914009094, "learning_rate": 8.461956747761375e-06} +{"ts": "2025-12-22T20:41:33", "event": "train_log", "step": 392, "epoch": 1.1956438670233092, "progress_pct": 59.76, "epoch_pct": 59.78, "eta": "06:10:45", "max_grad_norm": 1.0, "loss": 0.7111615538597107, "grad_norm": 0.3027377128601074, "learning_rate": 8.409365039809282e-06} +{"ts": "2025-12-22T20:42:45", "event": "train_log", "step": 393, "epoch": 1.1987008024455483, "progress_pct": 59.91, "epoch_pct": 59.94, "eta": "06:09:12", "max_grad_norm": 1.0, "loss": 0.7768589854240417, "grad_norm": 0.28992199897766113, "learning_rate": 8.356818430681409e-06} +{"ts": "2025-12-22T20:43:58", "event": "train_log", "step": 394, "epoch": 1.2017577378677875, "progress_pct": 60.06, "epoch_pct": 60.09, "eta": "06:07:41", "max_grad_norm": 1.0, "loss": 0.5940375328063965, "grad_norm": 0.2630784213542938, "learning_rate": 8.304318410216937e-06} +{"ts": "2025-12-22T20:45:12", "event": "train_log", "step": 395, "epoch": 1.2048146732900267, "progress_pct": 60.21, "epoch_pct": 60.24, "eta": "06:06:10", "max_grad_norm": 1.0, "loss": 0.6600077748298645, "grad_norm": 0.30487746000289917, "learning_rate": 8.251866466934137e-06} +{"ts": "2025-12-22T20:46:24", "event": "train_log", "step": 396, "epoch": 1.2078716087122658, "progress_pct": 60.37, "epoch_pct": 60.39, "eta": "06:04:38", "max_grad_norm": 1.0, "loss": 0.6806260347366333, "grad_norm": 0.4152087867259979, "learning_rate": 8.199464087988158e-06} +{"ts": "2025-12-22T20:47:36", "event": "train_log", "step": 397, "epoch": 1.2109285441345052, "progress_pct": 60.52, "epoch_pct": 60.55, "eta": "06:03:06", "max_grad_norm": 1.0, "loss": 0.7205727100372314, "grad_norm": 0.32374435663223267, "learning_rate": 8.147112759128859e-06} +{"ts": "2025-12-22T20:48:49", "event": "train_log", "step": 398, "epoch": 1.2139854795567444, "progress_pct": 60.67, "epoch_pct": 60.7, "eta": "06:01:34", "max_grad_norm": 1.0, "loss": 0.6570584774017334, "grad_norm": 0.3009904623031616, "learning_rate": 8.094813964658698e-06} +{"ts": "2025-12-22T20:50:07", "event": "train_log", "step": 399, "epoch": 1.2170424149789836, "progress_pct": 60.82, "epoch_pct": 60.85, "eta": "06:00:06", "max_grad_norm": 1.0, "loss": 0.6663621664047241, "grad_norm": 0.5213649272918701, "learning_rate": 8.042569187390642e-06} +{"ts": "2025-12-22T20:51:15", "event": "train_log", "step": 400, "epoch": 1.2200993504012227, "progress_pct": 60.98, "epoch_pct": 61.0, "eta": "05:58:32", "max_grad_norm": 1.0, "loss": 0.672550618648529, "grad_norm": 0.30124184489250183, "learning_rate": 7.990379908606118e-06} +{"ts": "2025-12-22T21:05:51", "event": "train_log", "step": 400, "epoch": 1.2200993504012227, "progress_pct": 60.98, "epoch_pct": 61.0, "eta": "06:07:52", "max_grad_norm": 1.0, "eval_loss": 0.6789794564247131, "eval_runtime": 875.5101, "eval_samples_per_second": 0.689, "eval_steps_per_second": 0.689} +{"ts": "2025-12-22T21:07:06", "event": "train_log", "step": 401, "epoch": 1.223156285823462, "progress_pct": 61.13, "epoch_pct": 61.16, "eta": "06:06:19", "max_grad_norm": 1.0, "loss": 0.682239830493927, "grad_norm": 0.31681662797927856, "learning_rate": 7.938247608013021e-06} +{"ts": "2025-12-22T21:08:21", "event": "train_log", "step": 402, "epoch": 1.226213221245701, "progress_pct": 61.28, "epoch_pct": 61.31, "eta": "06:04:46", "max_grad_norm": 1.0, "loss": 0.6976956725120544, "grad_norm": 0.29261210560798645, "learning_rate": 7.886173763703757e-06} +{"ts": "2025-12-22T21:09:37", "event": "train_log", "step": 403, "epoch": 1.2292701566679405, "progress_pct": 61.43, "epoch_pct": 61.46, "eta": "06:03:13", "max_grad_norm": 1.0, "loss": 0.6931061744689941, "grad_norm": 0.32044124603271484, "learning_rate": 7.834159852113347e-06} +{"ts": "2025-12-22T21:10:52", "event": "train_log", "step": 404, "epoch": 1.2323270920901797, "progress_pct": 61.59, "epoch_pct": 61.62, "eta": "06:01:40", "max_grad_norm": 1.0, "loss": 0.7304666638374329, "grad_norm": 0.36050841212272644, "learning_rate": 7.78220734797755e-06} +{"ts": "2025-12-22T21:12:10", "event": "train_log", "step": 405, "epoch": 1.2353840275124188, "progress_pct": 61.74, "epoch_pct": 61.77, "eta": "06:00:09", "max_grad_norm": 1.0, "loss": 0.5944494605064392, "grad_norm": 0.31268882751464844, "learning_rate": 7.73031772429105e-06} +{"ts": "2025-12-22T21:13:27", "event": "train_log", "step": 406, "epoch": 1.238440962934658, "progress_pct": 61.89, "epoch_pct": 61.92, "eta": "05:58:37", "max_grad_norm": 1.0, "loss": 0.708702802658081, "grad_norm": 0.33469483256340027, "learning_rate": 7.678492452265713e-06} +{"ts": "2025-12-22T21:14:41", "event": "train_log", "step": 407, "epoch": 1.2414978983568972, "progress_pct": 62.04, "epoch_pct": 62.07, "eta": "05:57:04", "max_grad_norm": 1.0, "loss": 0.614046037197113, "grad_norm": 0.2789304852485657, "learning_rate": 7.626733001288852e-06} +{"ts": "2025-12-22T21:15:57", "event": "train_log", "step": 408, "epoch": 1.2445548337791363, "progress_pct": 62.2, "epoch_pct": 62.23, "eta": "05:55:32", "max_grad_norm": 1.0, "loss": 0.7044576406478882, "grad_norm": 0.42240089178085327, "learning_rate": 7.575040838881578e-06} +{"ts": "2025-12-22T21:17:10", "event": "train_log", "step": 409, "epoch": 1.2476117692013755, "progress_pct": 62.35, "epoch_pct": 62.38, "eta": "05:53:58", "max_grad_norm": 1.0, "loss": 0.7595829963684082, "grad_norm": 0.3652958571910858, "learning_rate": 7.523417430657186e-06} +{"ts": "2025-12-22T21:18:25", "event": "train_log", "step": 410, "epoch": 1.250668704623615, "progress_pct": 62.5, "epoch_pct": 62.53, "eta": "05:52:25", "max_grad_norm": 1.0, "loss": 0.7289992570877075, "grad_norm": 0.28300684690475464, "learning_rate": 7.471864240279598e-06} +{"ts": "2025-12-22T21:19:43", "event": "train_log", "step": 411, "epoch": 1.253725640045854, "progress_pct": 62.65, "epoch_pct": 62.69, "eta": "05:50:54", "max_grad_norm": 1.0, "loss": 0.7410037517547607, "grad_norm": 0.3463844358921051, "learning_rate": 7.420382729421883e-06} +{"ts": "2025-12-22T21:21:02", "event": "train_log", "step": 412, "epoch": 1.2567825754680932, "progress_pct": 62.8, "epoch_pct": 62.84, "eta": "05:49:24", "max_grad_norm": 1.0, "loss": 0.6920305490493774, "grad_norm": 0.30792665481567383, "learning_rate": 7.368974357724789e-06} +{"ts": "2025-12-22T21:22:15", "event": "train_log", "step": 413, "epoch": 1.2598395108903324, "progress_pct": 62.96, "epoch_pct": 62.99, "eta": "05:47:51", "max_grad_norm": 1.0, "loss": 0.6581035256385803, "grad_norm": 0.4354027509689331, "learning_rate": 7.317640582755373e-06} +{"ts": "2025-12-22T21:23:35", "event": "train_log", "step": 414, "epoch": 1.2628964463125716, "progress_pct": 63.11, "epoch_pct": 63.14, "eta": "05:46:22", "max_grad_norm": 1.0, "loss": 0.7377368211746216, "grad_norm": 0.5033990144729614, "learning_rate": 7.266382859965673e-06} +{"ts": "2025-12-22T21:24:51", "event": "train_log", "step": 415, "epoch": 1.265953381734811, "progress_pct": 63.26, "epoch_pct": 63.3, "eta": "05:44:50", "max_grad_norm": 1.0, "loss": 0.7075121402740479, "grad_norm": 0.30040669441223145, "learning_rate": 7.2152026426514395e-06} +{"ts": "2025-12-22T21:26:09", "event": "train_log", "step": 416, "epoch": 1.2690103171570501, "progress_pct": 63.41, "epoch_pct": 63.45, "eta": "05:43:20", "max_grad_norm": 1.0, "loss": 0.6314805150032043, "grad_norm": 0.25443559885025024, "learning_rate": 7.164101381910939e-06} +{"ts": "2025-12-22T21:27:24", "event": "train_log", "step": 417, "epoch": 1.2720672525792893, "progress_pct": 63.57, "epoch_pct": 63.6, "eta": "05:41:48", "max_grad_norm": 1.0, "loss": 0.6594043970108032, "grad_norm": 0.3807917535305023, "learning_rate": 7.113080526603793e-06} +{"ts": "2025-12-22T21:28:42", "event": "train_log", "step": 418, "epoch": 1.2751241880015285, "progress_pct": 63.72, "epoch_pct": 63.76, "eta": "05:40:18", "max_grad_norm": 1.0, "loss": 0.7092217206954956, "grad_norm": 0.40388163924217224, "learning_rate": 7.062141523309918e-06} +{"ts": "2025-12-22T21:29:58", "event": "train_log", "step": 419, "epoch": 1.2781811234237677, "progress_pct": 63.87, "epoch_pct": 63.91, "eta": "05:38:46", "max_grad_norm": 1.0, "loss": 0.6039083003997803, "grad_norm": 0.31380078196525574, "learning_rate": 7.011285816288496e-06} +{"ts": "2025-12-22T21:31:15", "event": "train_log", "step": 420, "epoch": 1.2812380588460068, "progress_pct": 64.02, "epoch_pct": 64.06, "eta": "05:37:15", "max_grad_norm": 1.0, "loss": 0.648531973361969, "grad_norm": 0.3492945730686188, "learning_rate": 6.96051484743705e-06} +{"ts": "2025-12-22T21:32:30", "event": "train_log", "step": 421, "epoch": 1.284294994268246, "progress_pct": 64.18, "epoch_pct": 64.21, "eta": "05:35:43", "max_grad_norm": 1.0, "loss": 0.6646198630332947, "grad_norm": 0.2891562283039093, "learning_rate": 6.909830056250527e-06} +{"ts": "2025-12-22T21:33:45", "event": "train_log", "step": 422, "epoch": 1.2873519296904852, "progress_pct": 64.33, "epoch_pct": 64.37, "eta": "05:34:12", "max_grad_norm": 1.0, "loss": 0.7188717126846313, "grad_norm": 0.316986083984375, "learning_rate": 6.859232879780515e-06} +{"ts": "2025-12-22T21:35:01", "event": "train_log", "step": 423, "epoch": 1.2904088651127246, "progress_pct": 64.48, "epoch_pct": 64.52, "eta": "05:32:41", "max_grad_norm": 1.0, "loss": 0.6890851855278015, "grad_norm": 0.38996225595474243, "learning_rate": 6.8087247525944745e-06} +{"ts": "2025-12-22T21:36:16", "event": "train_log", "step": 424, "epoch": 1.2934658005349637, "progress_pct": 64.63, "epoch_pct": 64.67, "eta": "05:31:10", "max_grad_norm": 1.0, "loss": 0.7118897438049316, "grad_norm": 0.3303278684616089, "learning_rate": 6.758307106735094e-06} +{"ts": "2025-12-22T21:37:33", "event": "train_log", "step": 425, "epoch": 1.296522735957203, "progress_pct": 64.79, "epoch_pct": 64.83, "eta": "05:29:39", "max_grad_norm": 1.0, "loss": 0.6749597787857056, "grad_norm": 0.26401078701019287, "learning_rate": 6.707981371679657e-06} +{"ts": "2025-12-22T21:38:45", "event": "train_log", "step": 426, "epoch": 1.299579671379442, "progress_pct": 64.94, "epoch_pct": 64.98, "eta": "05:28:06", "max_grad_norm": 1.0, "loss": 0.6718383431434631, "grad_norm": 0.3269912898540497, "learning_rate": 6.657748974299529e-06} +{"ts": "2025-12-22T21:40:04", "event": "train_log", "step": 427, "epoch": 1.3026366068016813, "progress_pct": 65.09, "epoch_pct": 65.13, "eta": "05:26:37", "max_grad_norm": 1.0, "loss": 0.6674888134002686, "grad_norm": 0.35413047671318054, "learning_rate": 6.607611338819697e-06} +{"ts": "2025-12-22T21:41:24", "event": "train_log", "step": 428, "epoch": 1.3056935422239206, "progress_pct": 65.24, "epoch_pct": 65.28, "eta": "05:25:08", "max_grad_norm": 1.0, "loss": 0.6900228261947632, "grad_norm": 0.44566094875335693, "learning_rate": 6.557569886778401e-06} +{"ts": "2025-12-22T21:42:40", "event": "train_log", "step": 429, "epoch": 1.3087504776461598, "progress_pct": 65.4, "epoch_pct": 65.44, "eta": "05:23:38", "max_grad_norm": 1.0, "loss": 0.6681596040725708, "grad_norm": 0.3536953628063202, "learning_rate": 6.507626036986804e-06} +{"ts": "2025-12-22T21:43:55", "event": "train_log", "step": 430, "epoch": 1.311807413068399, "progress_pct": 65.55, "epoch_pct": 65.59, "eta": "05:22:07", "max_grad_norm": 1.0, "loss": 0.7463353872299194, "grad_norm": 0.43866440653800964, "learning_rate": 6.457781205488791e-06} +{"ts": "2025-12-22T21:45:10", "event": "train_log", "step": 431, "epoch": 1.3148643484906382, "progress_pct": 65.7, "epoch_pct": 65.74, "eta": "05:20:35", "max_grad_norm": 1.0, "loss": 0.7138527035713196, "grad_norm": 0.32117530703544617, "learning_rate": 6.408036805520801e-06} +{"ts": "2025-12-22T21:46:28", "event": "train_log", "step": 432, "epoch": 1.3179212839128773, "progress_pct": 65.85, "epoch_pct": 65.9, "eta": "05:19:06", "max_grad_norm": 1.0, "loss": 0.6958800554275513, "grad_norm": 0.3075023293495178, "learning_rate": 6.358394247471779e-06} +{"ts": "2025-12-22T21:47:45", "event": "train_log", "step": 433, "epoch": 1.3209782193351165, "progress_pct": 66.01, "epoch_pct": 66.05, "eta": "05:17:36", "max_grad_norm": 1.0, "loss": 0.6728611588478088, "grad_norm": 0.31068870425224304, "learning_rate": 6.308854938843161e-06} +{"ts": "2025-12-22T21:48:59", "event": "train_log", "step": 434, "epoch": 1.3240351547573557, "progress_pct": 66.16, "epoch_pct": 66.2, "eta": "05:16:05", "max_grad_norm": 1.0, "loss": 0.6983805894851685, "grad_norm": 0.2871341407299042, "learning_rate": 6.259420284208987e-06} +{"ts": "2025-12-22T21:50:15", "event": "train_log", "step": 435, "epoch": 1.3270920901795948, "progress_pct": 66.31, "epoch_pct": 66.35, "eta": "05:14:35", "max_grad_norm": 1.0, "loss": 0.6707543134689331, "grad_norm": 0.3626168966293335, "learning_rate": 6.210091685176067e-06} +{"ts": "2025-12-22T21:51:31", "event": "train_log", "step": 436, "epoch": 1.3301490256018342, "progress_pct": 66.46, "epoch_pct": 66.51, "eta": "05:13:05", "max_grad_norm": 1.0, "loss": 0.6212095618247986, "grad_norm": 0.2960391640663147, "learning_rate": 6.160870540344261e-06} +{"ts": "2025-12-22T21:52:48", "event": "train_log", "step": 437, "epoch": 1.3332059610240734, "progress_pct": 66.62, "epoch_pct": 66.66, "eta": "05:11:35", "max_grad_norm": 1.0, "loss": 0.695442795753479, "grad_norm": 0.29114195704460144, "learning_rate": 6.111758245266795e-06} +{"ts": "2025-12-22T21:54:05", "event": "train_log", "step": 438, "epoch": 1.3362628964463126, "progress_pct": 66.77, "epoch_pct": 66.81, "eta": "05:10:06", "max_grad_norm": 1.0, "loss": 0.7576844096183777, "grad_norm": 0.2911393642425537, "learning_rate": 6.0627561924107145e-06} +{"ts": "2025-12-22T21:55:23", "event": "train_log", "step": 439, "epoch": 1.3393198318685517, "progress_pct": 66.92, "epoch_pct": 66.97, "eta": "05:08:37", "max_grad_norm": 1.0, "loss": 0.7611621022224426, "grad_norm": 0.2754829227924347, "learning_rate": 6.013865771117394e-06} +{"ts": "2025-12-22T21:56:40", "event": "train_log", "step": 440, "epoch": 1.342376767290791, "progress_pct": 67.07, "epoch_pct": 67.12, "eta": "05:07:07", "max_grad_norm": 1.0, "loss": 0.6706432104110718, "grad_norm": 0.47688090801239014, "learning_rate": 5.965088367563162e-06} +{"ts": "2025-12-22T21:57:56", "event": "train_log", "step": 441, "epoch": 1.3454337027130303, "progress_pct": 67.23, "epoch_pct": 67.27, "eta": "05:05:37", "max_grad_norm": 1.0, "loss": 0.7257411479949951, "grad_norm": 0.38662102818489075, "learning_rate": 5.916425364719975e-06} +{"ts": "2025-12-22T21:59:12", "event": "train_log", "step": 442, "epoch": 1.3484906381352695, "progress_pct": 67.38, "epoch_pct": 67.42, "eta": "05:04:08", "max_grad_norm": 1.0, "loss": 0.6695491671562195, "grad_norm": 0.29597020149230957, "learning_rate": 5.867878142316221e-06} +{"ts": "2025-12-22T22:00:27", "event": "train_log", "step": 443, "epoch": 1.3515475735575087, "progress_pct": 67.53, "epoch_pct": 67.58, "eta": "05:02:37", "max_grad_norm": 1.0, "loss": 0.6762661933898926, "grad_norm": 0.36503320932388306, "learning_rate": 5.8194480767976e-06} +{"ts": "2025-12-22T22:01:46", "event": "train_log", "step": 444, "epoch": 1.3546045089797478, "progress_pct": 67.68, "epoch_pct": 67.73, "eta": "05:01:09", "max_grad_norm": 1.0, "loss": 0.6601616740226746, "grad_norm": 0.29297393560409546, "learning_rate": 5.7711365412880895e-06} +{"ts": "2025-12-22T22:03:03", "event": "train_log", "step": 445, "epoch": 1.357661444401987, "progress_pct": 67.84, "epoch_pct": 67.88, "eta": "04:59:40", "max_grad_norm": 1.0, "loss": 0.7049432992935181, "grad_norm": 0.3229820430278778, "learning_rate": 5.7229449055510335e-06} +{"ts": "2025-12-22T22:04:21", "event": "train_log", "step": 446, "epoch": 1.3607183798242262, "progress_pct": 67.99, "epoch_pct": 68.04, "eta": "04:58:11", "max_grad_norm": 1.0, "loss": 0.6643913388252258, "grad_norm": 0.3359116017818451, "learning_rate": 5.674874535950279e-06} +{"ts": "2025-12-22T22:05:36", "event": "train_log", "step": 447, "epoch": 1.3637753152464653, "progress_pct": 68.14, "epoch_pct": 68.19, "eta": "04:56:41", "max_grad_norm": 1.0, "loss": 0.7177180647850037, "grad_norm": 0.349298357963562, "learning_rate": 5.626926795411447e-06} +{"ts": "2025-12-22T22:06:50", "event": "train_log", "step": 448, "epoch": 1.3668322506687045, "progress_pct": 68.29, "epoch_pct": 68.34, "eta": "04:55:11", "max_grad_norm": 1.0, "loss": 0.6765077710151672, "grad_norm": 0.30045273900032043, "learning_rate": 5.579103043383305e-06} +{"ts": "2025-12-22T22:08:07", "event": "train_log", "step": 449, "epoch": 1.369889186090944, "progress_pct": 68.45, "epoch_pct": 68.49, "eta": "04:53:42", "max_grad_norm": 1.0, "loss": 0.6421419978141785, "grad_norm": 0.3676189184188843, "learning_rate": 5.531404635799191e-06} +{"ts": "2025-12-22T22:09:20", "event": "train_log", "step": 450, "epoch": 1.372946121513183, "progress_pct": 68.6, "epoch_pct": 68.65, "eta": "04:52:11", "max_grad_norm": 1.0, "loss": 0.649316668510437, "grad_norm": 0.3337932527065277, "learning_rate": 5.4838329250386076e-06} +{"ts": "2025-12-22T22:24:28", "event": "train_log", "step": 450, "epoch": 1.372946121513183, "progress_pct": 68.6, "epoch_pct": 68.65, "eta": "04:59:07", "max_grad_norm": 1.0, "eval_loss": 0.6703284978866577, "eval_runtime": 907.8663, "eval_samples_per_second": 0.664, "eval_steps_per_second": 0.664} +{"ts": "2025-12-22T22:25:39", "event": "train_log", "step": 451, "epoch": 1.3760030569354222, "progress_pct": 68.75, "epoch_pct": 68.8, "eta": "04:57:33", "max_grad_norm": 1.0, "loss": 0.7333119511604309, "grad_norm": 0.314387708902359, "learning_rate": 5.436389259888841e-06} +{"ts": "2025-12-22T22:26:49", "event": "train_log", "step": 452, "epoch": 1.3790599923576614, "progress_pct": 68.9, "epoch_pct": 68.95, "eta": "04:55:58", "max_grad_norm": 1.0, "loss": 0.6451212763786316, "grad_norm": 0.4056478440761566, "learning_rate": 5.38907498550674e-06} +{"ts": "2025-12-22T22:28:02", "event": "train_log", "step": 453, "epoch": 1.3821169277799006, "progress_pct": 69.05, "epoch_pct": 69.11, "eta": "04:54:25", "max_grad_norm": 1.0, "loss": 0.6462752819061279, "grad_norm": 0.42358386516571045, "learning_rate": 5.341891443380585e-06} +{"ts": "2025-12-22T22:29:08", "event": "train_log", "step": 454, "epoch": 1.38517386320214, "progress_pct": 69.21, "epoch_pct": 69.26, "eta": "04:52:48", "max_grad_norm": 1.0, "loss": 0.717352569103241, "grad_norm": 0.3606562912464142, "learning_rate": 5.294839971292026e-06} +{"ts": "2025-12-22T22:30:21", "event": "train_log", "step": 455, "epoch": 1.3882307986243791, "progress_pct": 69.36, "epoch_pct": 69.41, "eta": "04:51:15", "max_grad_norm": 1.0, "loss": 0.7015582323074341, "grad_norm": 0.3014855682849884, "learning_rate": 5.247921903278177e-06} +{"ts": "2025-12-22T22:31:30", "event": "train_log", "step": 456, "epoch": 1.3912877340466183, "progress_pct": 69.51, "epoch_pct": 69.56, "eta": "04:49:40", "max_grad_norm": 1.0, "loss": 0.6660122275352478, "grad_norm": 0.5155187845230103, "learning_rate": 5.20113856959378e-06} +{"ts": "2025-12-22T22:32:47", "event": "train_log", "step": 457, "epoch": 1.3943446694688575, "progress_pct": 69.66, "epoch_pct": 69.72, "eta": "04:48:09", "max_grad_norm": 1.0, "loss": 0.6980377435684204, "grad_norm": 0.35195642709732056, "learning_rate": 5.1544912966735e-06} +{"ts": "2025-12-22T22:34:02", "event": "train_log", "step": 458, "epoch": 1.3974016048910967, "progress_pct": 69.82, "epoch_pct": 69.87, "eta": "04:46:37", "max_grad_norm": 1.0, "loss": 0.6926653385162354, "grad_norm": 0.28842753171920776, "learning_rate": 5.1079814070943e-06} +{"ts": "2025-12-22T22:35:19", "event": "train_log", "step": 459, "epoch": 1.4004585403133358, "progress_pct": 69.97, "epoch_pct": 70.02, "eta": "04:45:06", "max_grad_norm": 1.0, "loss": 0.6412813067436218, "grad_norm": 0.354425311088562, "learning_rate": 5.06161021953796e-06} +{"ts": "2025-12-22T22:36:38", "event": "train_log", "step": 460, "epoch": 1.403515475735575, "progress_pct": 70.12, "epoch_pct": 70.18, "eta": "04:43:36", "max_grad_norm": 1.0, "loss": 0.6897266507148743, "grad_norm": 0.30584967136383057, "learning_rate": 5.015379048753669e-06} +{"ts": "2025-12-22T22:37:53", "event": "train_log", "step": 461, "epoch": 1.4065724111578142, "progress_pct": 70.27, "epoch_pct": 70.33, "eta": "04:42:04", "max_grad_norm": 1.0, "loss": 0.6777257919311523, "grad_norm": 0.3659093677997589, "learning_rate": 4.9692892055207784e-06} +{"ts": "2025-12-22T22:39:08", "event": "train_log", "step": 462, "epoch": 1.4096293465800536, "progress_pct": 70.43, "epoch_pct": 70.48, "eta": "04:40:32", "max_grad_norm": 1.0, "loss": 0.7499118447303772, "grad_norm": 0.6798201203346252, "learning_rate": 4.923341996611604e-06} +{"ts": "2025-12-22T22:40:27", "event": "train_log", "step": 463, "epoch": 1.4126862820022927, "progress_pct": 70.58, "epoch_pct": 70.63, "eta": "04:39:02", "max_grad_norm": 1.0, "loss": 0.6341705322265625, "grad_norm": 0.36423686146736145, "learning_rate": 4.877538724754392e-06} +{"ts": "2025-12-22T22:41:44", "event": "train_log", "step": 464, "epoch": 1.415743217424532, "progress_pct": 70.73, "epoch_pct": 70.79, "eta": "04:37:31", "max_grad_norm": 1.0, "loss": 0.566770076751709, "grad_norm": 0.29527905583381653, "learning_rate": 4.831880688596392e-06} +{"ts": "2025-12-22T22:42:59", "event": "train_log", "step": 465, "epoch": 1.418800152846771, "progress_pct": 70.88, "epoch_pct": 70.94, "eta": "04:36:00", "max_grad_norm": 1.0, "loss": 0.6926667094230652, "grad_norm": 0.3342158794403076, "learning_rate": 4.7863691826670146e-06} +{"ts": "2025-12-22T22:44:15", "event": "train_log", "step": 466, "epoch": 1.4218570882690102, "progress_pct": 71.04, "epoch_pct": 71.09, "eta": "04:34:29", "max_grad_norm": 1.0, "loss": 0.6302958130836487, "grad_norm": 0.35585087537765503, "learning_rate": 4.741005497341154e-06} +{"ts": "2025-12-22T22:45:29", "event": "train_log", "step": 467, "epoch": 1.4249140236912496, "progress_pct": 71.19, "epoch_pct": 71.25, "eta": "04:32:57", "max_grad_norm": 1.0, "loss": 0.7842360138893127, "grad_norm": 0.5740730166435242, "learning_rate": 4.695790918802577e-06} +{"ts": "2025-12-22T22:46:46", "event": "train_log", "step": 468, "epoch": 1.4279709591134888, "progress_pct": 71.34, "epoch_pct": 71.4, "eta": "04:31:27", "max_grad_norm": 1.0, "loss": 0.6199318766593933, "grad_norm": 0.4422702491283417, "learning_rate": 4.650726729007465e-06} +{"ts": "2025-12-22T22:48:00", "event": "train_log", "step": 469, "epoch": 1.431027894535728, "progress_pct": 71.49, "epoch_pct": 71.55, "eta": "04:29:55", "max_grad_norm": 1.0, "loss": 0.7013853788375854, "grad_norm": 0.3458646833896637, "learning_rate": 4.605814205648087e-06} +{"ts": "2025-12-22T22:49:18", "event": "train_log", "step": 470, "epoch": 1.4340848299579672, "progress_pct": 71.65, "epoch_pct": 71.7, "eta": "04:28:25", "max_grad_norm": 1.0, "loss": 0.7208451628684998, "grad_norm": 0.326727956533432, "learning_rate": 4.56105462211654e-06} +{"ts": "2025-12-22T22:50:35", "event": "train_log", "step": 471, "epoch": 1.4371417653802063, "progress_pct": 71.8, "epoch_pct": 71.86, "eta": "04:26:54", "max_grad_norm": 1.0, "loss": 0.6491535902023315, "grad_norm": 0.3491531014442444, "learning_rate": 4.516449247468666e-06} +{"ts": "2025-12-22T22:51:49", "event": "train_log", "step": 472, "epoch": 1.4401987008024455, "progress_pct": 71.95, "epoch_pct": 72.01, "eta": "04:25:23", "max_grad_norm": 1.0, "loss": 0.6603784561157227, "grad_norm": 0.31401777267456055, "learning_rate": 4.4719993463880695e-06} +{"ts": "2025-12-22T22:53:03", "event": "train_log", "step": 473, "epoch": 1.4432556362246847, "progress_pct": 72.1, "epoch_pct": 72.16, "eta": "04:23:51", "max_grad_norm": 1.0, "loss": 0.6068110466003418, "grad_norm": 0.3741454780101776, "learning_rate": 4.427706179150247e-06} +{"ts": "2025-12-22T22:54:20", "event": "train_log", "step": 474, "epoch": 1.4463125716469238, "progress_pct": 72.26, "epoch_pct": 72.32, "eta": "04:22:21", "max_grad_norm": 1.0, "loss": 0.6427788138389587, "grad_norm": 0.3205011188983917, "learning_rate": 4.383571001586883e-06} +{"ts": "2025-12-22T22:55:35", "event": "train_log", "step": 475, "epoch": 1.4493695070691632, "progress_pct": 72.41, "epoch_pct": 72.47, "eta": "04:20:51", "max_grad_norm": 1.0, "loss": 0.626676082611084, "grad_norm": 0.2519795894622803, "learning_rate": 4.339595065050206e-06} +{"ts": "2025-12-22T22:56:53", "event": "train_log", "step": 476, "epoch": 1.4524264424914024, "progress_pct": 72.56, "epoch_pct": 72.62, "eta": "04:19:21", "max_grad_norm": 1.0, "loss": 0.7192115187644958, "grad_norm": 0.3499923050403595, "learning_rate": 4.29577961637754e-06} +{"ts": "2025-12-22T22:58:07", "event": "train_log", "step": 477, "epoch": 1.4554833779136416, "progress_pct": 72.71, "epoch_pct": 72.77, "eta": "04:17:50", "max_grad_norm": 1.0, "loss": 0.6705955862998962, "grad_norm": 0.6267193555831909, "learning_rate": 4.2521258978559324e-06} +{"ts": "2025-12-22T22:59:22", "event": "train_log", "step": 478, "epoch": 1.4585403133358807, "progress_pct": 72.87, "epoch_pct": 72.93, "eta": "04:16:19", "max_grad_norm": 1.0, "loss": 0.6040648818016052, "grad_norm": 0.5547561049461365, "learning_rate": 4.208635147186956e-06} +{"ts": "2025-12-22T23:00:40", "event": "train_log", "step": 479, "epoch": 1.46159724875812, "progress_pct": 73.02, "epoch_pct": 73.08, "eta": "04:14:50", "max_grad_norm": 1.0, "loss": 0.6205201148986816, "grad_norm": 0.2949749529361725, "learning_rate": 4.165308597451586e-06} +{"ts": "2025-12-22T23:01:56", "event": "train_log", "step": 480, "epoch": 1.4646541841803593, "progress_pct": 73.17, "epoch_pct": 73.23, "eta": "04:13:19", "max_grad_norm": 1.0, "loss": 0.6886979937553406, "grad_norm": 0.2873048782348633, "learning_rate": 4.12214747707527e-06} +{"ts": "2025-12-22T23:03:16", "event": "train_log", "step": 481, "epoch": 1.4677111196025985, "progress_pct": 73.32, "epoch_pct": 73.39, "eta": "04:11:51", "max_grad_norm": 1.0, "loss": 0.6656784415245056, "grad_norm": 0.33694973587989807, "learning_rate": 4.079153009793068e-06} +{"ts": "2025-12-22T23:04:33", "event": "train_log", "step": 482, "epoch": 1.4707680550248377, "progress_pct": 73.48, "epoch_pct": 73.54, "eta": "04:10:21", "max_grad_norm": 1.0, "loss": 0.6573168635368347, "grad_norm": 0.3373357057571411, "learning_rate": 4.036326414614985e-06} +{"ts": "2025-12-22T23:05:53", "event": "train_log", "step": 483, "epoch": 1.4738249904470768, "progress_pct": 73.63, "epoch_pct": 73.69, "eta": "04:08:52", "max_grad_norm": 1.0, "loss": 0.6631187200546265, "grad_norm": 0.3189850151538849, "learning_rate": 3.99366890579139e-06} +{"ts": "2025-12-22T23:07:10", "event": "train_log", "step": 484, "epoch": 1.476881925869316, "progress_pct": 73.78, "epoch_pct": 73.84, "eta": "04:07:23", "max_grad_norm": 1.0, "loss": 0.5881021022796631, "grad_norm": 0.34659212827682495, "learning_rate": 3.951181692778594e-06} +{"ts": "2025-12-22T23:08:25", "event": "train_log", "step": 485, "epoch": 1.4799388612915552, "progress_pct": 73.93, "epoch_pct": 74.0, "eta": "04:05:52", "max_grad_norm": 1.0, "loss": 0.7232425212860107, "grad_norm": 0.4184463918209076, "learning_rate": 3.908865980204555e-06} +{"ts": "2025-12-22T23:09:41", "event": "train_log", "step": 486, "epoch": 1.4829957967137943, "progress_pct": 74.09, "epoch_pct": 74.15, "eta": "04:04:23", "max_grad_norm": 1.0, "loss": 0.6624961495399475, "grad_norm": 0.3163282573223114, "learning_rate": 3.86672296783474e-06} +{"ts": "2025-12-22T23:10:56", "event": "train_log", "step": 487, "epoch": 1.4860527321360335, "progress_pct": 74.24, "epoch_pct": 74.3, "eta": "04:02:52", "max_grad_norm": 1.0, "loss": 0.6616235971450806, "grad_norm": 0.3175446689128876, "learning_rate": 3.824753850538082e-06} +{"ts": "2025-12-22T23:12:14", "event": "train_log", "step": 488, "epoch": 1.489109667558273, "progress_pct": 74.39, "epoch_pct": 74.46, "eta": "04:01:23", "max_grad_norm": 1.0, "loss": 0.6923587918281555, "grad_norm": 0.3493629992008209, "learning_rate": 3.782959818253126e-06} +{"ts": "2025-12-22T23:13:31", "event": "train_log", "step": 489, "epoch": 1.492166602980512, "progress_pct": 74.54, "epoch_pct": 74.61, "eta": "03:59:54", "max_grad_norm": 1.0, "loss": 0.6668528914451599, "grad_norm": 0.30385154485702515, "learning_rate": 3.741342055954269e-06} +{"ts": "2025-12-22T23:14:48", "event": "train_log", "step": 490, "epoch": 1.4952235384027512, "progress_pct": 74.7, "epoch_pct": 74.76, "eta": "03:58:25", "max_grad_norm": 1.0, "loss": 0.6276881098747253, "grad_norm": 0.319979727268219, "learning_rate": 3.699901743618194e-06} +{"ts": "2025-12-22T23:16:01", "event": "train_log", "step": 491, "epoch": 1.4982804738249904, "progress_pct": 74.85, "epoch_pct": 74.91, "eta": "03:56:54", "max_grad_norm": 1.0, "loss": 0.7676356434822083, "grad_norm": 0.28717750310897827, "learning_rate": 3.658640056190378e-06} +{"ts": "2025-12-22T23:17:17", "event": "train_log", "step": 492, "epoch": 1.5013374092472298, "progress_pct": 75.0, "epoch_pct": 75.07, "eta": "03:55:24", "max_grad_norm": 1.0, "loss": 0.6021715402603149, "grad_norm": 0.4701229929924011, "learning_rate": 3.617558163551802e-06} +{"ts": "2025-12-22T23:18:34", "event": "train_log", "step": 493, "epoch": 1.504394344669469, "progress_pct": 75.15, "epoch_pct": 75.22, "eta": "03:53:55", "max_grad_norm": 1.0, "loss": 0.7243677973747253, "grad_norm": 0.4959515929222107, "learning_rate": 3.576657230485775e-06} +{"ts": "2025-12-22T23:19:51", "event": "train_log", "step": 494, "epoch": 1.5074512800917081, "progress_pct": 75.3, "epoch_pct": 75.37, "eta": "03:52:26", "max_grad_norm": 1.0, "loss": 0.7030311822891235, "grad_norm": 0.32071781158447266, "learning_rate": 3.5359384166449185e-06} +{"ts": "2025-12-22T23:21:09", "event": "train_log", "step": 495, "epoch": 1.5105082155139473, "progress_pct": 75.46, "epoch_pct": 75.53, "eta": "03:50:58", "max_grad_norm": 1.0, "loss": 0.6344490051269531, "grad_norm": 0.3393514156341553, "learning_rate": 3.4954028765182633e-06} +{"ts": "2025-12-22T23:22:25", "event": "train_log", "step": 496, "epoch": 1.5135651509361865, "progress_pct": 75.61, "epoch_pct": 75.68, "eta": "03:49:28", "max_grad_norm": 1.0, "loss": 0.5816606879234314, "grad_norm": 0.273512065410614, "learning_rate": 3.4550517593985512e-06} +{"ts": "2025-12-22T23:23:40", "event": "train_log", "step": 497, "epoch": 1.5166220863584257, "progress_pct": 75.76, "epoch_pct": 75.83, "eta": "03:47:59", "max_grad_norm": 1.0, "loss": 0.6091232895851135, "grad_norm": 0.6631937026977539, "learning_rate": 3.414886209349615e-06} +{"ts": "2025-12-22T23:24:58", "event": "train_log", "step": 498, "epoch": 1.5196790217806648, "progress_pct": 75.91, "epoch_pct": 75.98, "eta": "03:46:30", "max_grad_norm": 1.0, "loss": 0.7076858282089233, "grad_norm": 0.6976932287216187, "learning_rate": 3.3749073651739594e-06} +{"ts": "2025-12-22T23:26:13", "event": "train_log", "step": 499, "epoch": 1.522735957202904, "progress_pct": 76.07, "epoch_pct": 76.14, "eta": "03:45:00", "max_grad_norm": 1.0, "loss": 0.6363418698310852, "grad_norm": 0.35580119490623474, "learning_rate": 3.3351163603804805e-06} +{"ts": "2025-12-22T23:27:29", "event": "train_log", "step": 500, "epoch": 1.5257928926251432, "progress_pct": 76.22, "epoch_pct": 76.29, "eta": "03:43:31", "max_grad_norm": 1.0, "loss": 0.6716225147247314, "grad_norm": 0.30289211869239807, "learning_rate": 3.2955143231523067e-06} +{"ts": "2025-12-22T23:41:59", "event": "train_log", "step": 500, "epoch": 1.5257928926251432, "progress_pct": 76.22, "epoch_pct": 76.29, "eta": "03:48:03", "max_grad_norm": 1.0, "eval_loss": 0.6648170948028564, "eval_runtime": 870.3243, "eval_samples_per_second": 0.693, "eval_steps_per_second": 0.693} +{"ts": "2025-12-22T23:43:15", "event": "train_log", "step": 501, "epoch": 1.5288498280473823, "progress_pct": 76.37, "epoch_pct": 76.44, "eta": "03:46:32", "max_grad_norm": 1.0, "loss": 0.6512227058410645, "grad_norm": 0.33276933431625366, "learning_rate": 3.2561023763148237e-06} +{"ts": "2025-12-22T23:44:28", "event": "train_log", "step": 502, "epoch": 1.5319067634696217, "progress_pct": 76.52, "epoch_pct": 76.6, "eta": "03:44:59", "max_grad_norm": 1.0, "loss": 0.7053738236427307, "grad_norm": 0.40328240394592285, "learning_rate": 3.216881637303839e-06} +{"ts": "2025-12-22T23:45:42", "event": "train_log", "step": 503, "epoch": 1.534963698891861, "progress_pct": 76.68, "epoch_pct": 76.75, "eta": "03:43:28", "max_grad_norm": 1.0, "loss": 0.697374165058136, "grad_norm": 0.2589263916015625, "learning_rate": 3.177853218133905e-06} +{"ts": "2025-12-22T23:47:01", "event": "train_log", "step": 504, "epoch": 1.5380206343141, "progress_pct": 76.83, "epoch_pct": 76.9, "eta": "03:41:57", "max_grad_norm": 1.0, "loss": 0.6664954423904419, "grad_norm": 0.5453576445579529, "learning_rate": 3.1390182253667745e-06} +{"ts": "2025-12-22T23:48:18", "event": "train_log", "step": 505, "epoch": 1.5410775697363395, "progress_pct": 76.98, "epoch_pct": 77.05, "eta": "03:40:27", "max_grad_norm": 1.0, "loss": 0.662231981754303, "grad_norm": 0.5521278381347656, "learning_rate": 3.100377760080041e-06} +{"ts": "2025-12-22T23:49:35", "event": "train_log", "step": 506, "epoch": 1.5441345051585786, "progress_pct": 77.13, "epoch_pct": 77.21, "eta": "03:38:56", "max_grad_norm": 1.0, "loss": 0.751462459564209, "grad_norm": 0.3097061216831207, "learning_rate": 3.0619329178359103e-06} +{"ts": "2025-12-22T23:50:49", "event": "train_log", "step": 507, "epoch": 1.5471914405808178, "progress_pct": 77.29, "epoch_pct": 77.36, "eta": "03:37:24", "max_grad_norm": 1.0, "loss": 0.6908425688743591, "grad_norm": 0.32505670189857483, "learning_rate": 3.023684788650154e-06} +{"ts": "2025-12-22T23:52:05", "event": "train_log", "step": 508, "epoch": 1.550248376003057, "progress_pct": 77.44, "epoch_pct": 77.51, "eta": "03:35:53", "max_grad_norm": 1.0, "loss": 0.6698168516159058, "grad_norm": 0.4177548587322235, "learning_rate": 2.985634456961184e-06} +{"ts": "2025-12-22T23:53:18", "event": "train_log", "step": 509, "epoch": 1.5533053114252962, "progress_pct": 77.59, "epoch_pct": 77.67, "eta": "03:34:22", "max_grad_norm": 1.0, "loss": 0.6403611302375793, "grad_norm": 0.3030829131603241, "learning_rate": 2.947783001599315e-06} +{"ts": "2025-12-22T23:54:32", "event": "train_log", "step": 510, "epoch": 1.5563622468475353, "progress_pct": 77.74, "epoch_pct": 77.82, "eta": "03:32:50", "max_grad_norm": 1.0, "loss": 0.6056875586509705, "grad_norm": 0.2690201997756958, "learning_rate": 2.9101314957561864e-06} +{"ts": "2025-12-22T23:55:45", "event": "train_log", "step": 511, "epoch": 1.5594191822697745, "progress_pct": 77.9, "epoch_pct": 77.97, "eta": "03:31:19", "max_grad_norm": 1.0, "loss": 0.7140977382659912, "grad_norm": 0.2733827829360962, "learning_rate": 2.8726810069543156e-06} +{"ts": "2025-12-22T23:57:01", "event": "train_log", "step": 512, "epoch": 1.5624761176920137, "progress_pct": 78.05, "epoch_pct": 78.12, "eta": "03:29:48", "max_grad_norm": 1.0, "loss": 0.6062126159667969, "grad_norm": 0.2995041310787201, "learning_rate": 2.8354325970168483e-06} +{"ts": "2025-12-22T23:58:14", "event": "train_log", "step": 513, "epoch": 1.5655330531142528, "progress_pct": 78.2, "epoch_pct": 78.28, "eta": "03:28:17", "max_grad_norm": 1.0, "loss": 0.6048973798751831, "grad_norm": 0.2860231101512909, "learning_rate": 2.7983873220374415e-06} +{"ts": "2025-12-22T23:59:27", "event": "train_log", "step": 514, "epoch": 1.568589988536492, "progress_pct": 78.35, "epoch_pct": 78.43, "eta": "03:26:45", "max_grad_norm": 1.0, "loss": 0.630670964717865, "grad_norm": 0.3419671058654785, "learning_rate": 2.7615462323503186e-06} +{"ts": "2025-12-23T00:00:41", "event": "train_log", "step": 515, "epoch": 1.5716469239587314, "progress_pct": 78.51, "epoch_pct": 78.58, "eta": "03:25:14", "max_grad_norm": 1.0, "loss": 0.6205880641937256, "grad_norm": 0.3721083700656891, "learning_rate": 2.724910372500508e-06} +{"ts": "2025-12-23T00:01:52", "event": "train_log", "step": 516, "epoch": 1.5747038593809706, "progress_pct": 78.66, "epoch_pct": 78.74, "eta": "03:23:42", "max_grad_norm": 1.0, "loss": 0.6468279361724854, "grad_norm": 0.8053601384162903, "learning_rate": 2.6884807812142043e-06} +{"ts": "2025-12-23T00:03:04", "event": "train_log", "step": 517, "epoch": 1.5777607948032097, "progress_pct": 78.81, "epoch_pct": 78.89, "eta": "03:22:11", "max_grad_norm": 1.0, "loss": 0.6104784607887268, "grad_norm": 0.30676576495170593, "learning_rate": 2.6522584913693295e-06} +{"ts": "2025-12-23T00:04:20", "event": "train_log", "step": 518, "epoch": 1.5808177302254491, "progress_pct": 78.96, "epoch_pct": 79.04, "eta": "03:20:41", "max_grad_norm": 1.0, "loss": 0.6879785060882568, "grad_norm": 0.32430994510650635, "learning_rate": 2.616244529966244e-06} +{"ts": "2025-12-23T00:05:28", "event": "train_log", "step": 519, "epoch": 1.5838746656476883, "progress_pct": 79.12, "epoch_pct": 79.19, "eta": "03:19:08", "max_grad_norm": 1.0, "loss": 0.6742456555366516, "grad_norm": 0.2668575942516327, "learning_rate": 2.5804399180986417e-06} +{"ts": "2025-12-23T00:06:42", "event": "train_log", "step": 520, "epoch": 1.5869316010699275, "progress_pct": 79.27, "epoch_pct": 79.35, "eta": "03:17:38", "max_grad_norm": 1.0, "loss": 0.5823814868927002, "grad_norm": 0.41760483384132385, "learning_rate": 2.544845670924575e-06} +{"ts": "2025-12-23T00:07:50", "event": "train_log", "step": 521, "epoch": 1.5899885364921666, "progress_pct": 79.42, "epoch_pct": 79.5, "eta": "03:16:06", "max_grad_norm": 1.0, "loss": 0.653259813785553, "grad_norm": 0.332041472196579, "learning_rate": 2.509462797637693e-06} +{"ts": "2025-12-23T00:08:59", "event": "train_log", "step": 522, "epoch": 1.5930454719144058, "progress_pct": 79.57, "epoch_pct": 79.65, "eta": "03:14:34", "max_grad_norm": 1.0, "loss": 0.6304376721382141, "grad_norm": 0.3437623381614685, "learning_rate": 2.4742923014386154e-06} +{"ts": "2025-12-23T00:10:13", "event": "train_log", "step": 523, "epoch": 1.596102407336645, "progress_pct": 79.73, "epoch_pct": 79.81, "eta": "03:13:03", "max_grad_norm": 1.0, "loss": 0.8250125646591187, "grad_norm": 0.2744190990924835, "learning_rate": 2.4393351795065023e-06} +{"ts": "2025-12-23T00:11:21", "event": "train_log", "step": 524, "epoch": 1.5991593427588842, "progress_pct": 79.88, "epoch_pct": 79.96, "eta": "03:11:31", "max_grad_norm": 1.0, "loss": 0.7557496428489685, "grad_norm": 0.3014289140701294, "learning_rate": 2.4045924229707663e-06} +{"ts": "2025-12-23T00:12:29", "event": "train_log", "step": 525, "epoch": 1.6022162781811233, "progress_pct": 80.03, "epoch_pct": 80.11, "eta": "03:09:59", "max_grad_norm": 1.0, "loss": 0.6550201773643494, "grad_norm": 0.33593595027923584, "learning_rate": 2.3700650168829765e-06} +{"ts": "2025-12-23T00:13:50", "event": "train_log", "step": 526, "epoch": 1.6052732136033625, "progress_pct": 80.18, "epoch_pct": 80.26, "eta": "03:08:31", "max_grad_norm": 1.0, "loss": 0.5847223997116089, "grad_norm": 0.289989173412323, "learning_rate": 2.3357539401889438e-06} +{"ts": "2025-12-23T00:15:06", "event": "train_log", "step": 527, "epoch": 1.6083301490256017, "progress_pct": 80.34, "epoch_pct": 80.42, "eta": "03:07:01", "max_grad_norm": 1.0, "loss": 0.7059583067893982, "grad_norm": 0.3140230178833008, "learning_rate": 2.3016601657009364e-06} +{"ts": "2025-12-23T00:16:21", "event": "train_log", "step": 528, "epoch": 1.611387084447841, "progress_pct": 80.49, "epoch_pct": 80.57, "eta": "03:05:31", "max_grad_norm": 1.0, "loss": 0.6565676927566528, "grad_norm": 0.5017932653427124, "learning_rate": 2.2677846600701305e-06} +{"ts": "2025-12-23T00:17:38", "event": "train_log", "step": 529, "epoch": 1.6144440198700802, "progress_pct": 80.64, "epoch_pct": 80.72, "eta": "03:04:02", "max_grad_norm": 1.0, "loss": 0.5888017416000366, "grad_norm": 0.2757347822189331, "learning_rate": 2.234128383759174e-06} +{"ts": "2025-12-23T00:18:51", "event": "train_log", "step": 530, "epoch": 1.6175009552923194, "progress_pct": 80.79, "epoch_pct": 80.88, "eta": "03:02:32", "max_grad_norm": 1.0, "loss": 0.6747739315032959, "grad_norm": 0.3413706421852112, "learning_rate": 2.2006922910149743e-06} +{"ts": "2025-12-23T00:20:06", "event": "train_log", "step": 531, "epoch": 1.6205578907145588, "progress_pct": 80.95, "epoch_pct": 81.03, "eta": "03:01:02", "max_grad_norm": 1.0, "loss": 0.6995899677276611, "grad_norm": 0.2861206829547882, "learning_rate": 2.167477329841633e-06} +{"ts": "2025-12-23T00:21:21", "event": "train_log", "step": 532, "epoch": 1.623614826136798, "progress_pct": 81.1, "epoch_pct": 81.18, "eta": "02:59:32", "max_grad_norm": 1.0, "loss": 0.6285294890403748, "grad_norm": 0.4095499515533447, "learning_rate": 2.1344844419735757e-06} +{"ts": "2025-12-23T00:22:36", "event": "train_log", "step": 533, "epoch": 1.6266717615590371, "progress_pct": 81.25, "epoch_pct": 81.33, "eta": "02:58:03", "max_grad_norm": 1.0, "loss": 0.607745349407196, "grad_norm": 0.25976240634918213, "learning_rate": 2.101714562848841e-06} +{"ts": "2025-12-23T00:23:51", "event": "train_log", "step": 534, "epoch": 1.6297286969812763, "progress_pct": 81.4, "epoch_pct": 81.49, "eta": "02:56:33", "max_grad_norm": 1.0, "loss": 0.681461751461029, "grad_norm": 0.2760326564311981, "learning_rate": 2.069168621582567e-06} +{"ts": "2025-12-23T00:25:09", "event": "train_log", "step": 535, "epoch": 1.6327856324035155, "progress_pct": 81.55, "epoch_pct": 81.64, "eta": "02:55:04", "max_grad_norm": 1.0, "loss": 0.6930239200592041, "grad_norm": 0.29883530735969543, "learning_rate": 2.0368475409406396e-06} +{"ts": "2025-12-23T00:26:28", "event": "train_log", "step": 536, "epoch": 1.6358425678257547, "progress_pct": 81.71, "epoch_pct": 81.79, "eta": "02:53:36", "max_grad_norm": 1.0, "loss": 0.6871459484100342, "grad_norm": 0.2769938111305237, "learning_rate": 2.004752237313544e-06} +{"ts": "2025-12-23T00:27:44", "event": "train_log", "step": 537, "epoch": 1.6388995032479938, "progress_pct": 81.86, "epoch_pct": 81.94, "eta": "02:52:07", "max_grad_norm": 1.0, "loss": 0.6905091404914856, "grad_norm": 0.5758352875709534, "learning_rate": 1.972883620690366e-06} +{"ts": "2025-12-23T00:28:59", "event": "train_log", "step": 538, "epoch": 1.641956438670233, "progress_pct": 82.01, "epoch_pct": 82.1, "eta": "02:50:37", "max_grad_norm": 1.0, "loss": 0.7119919061660767, "grad_norm": 0.302348792552948, "learning_rate": 1.9412425946329994e-06} +{"ts": "2025-12-23T00:30:15", "event": "train_log", "step": 539, "epoch": 1.6450133740924722, "progress_pct": 82.16, "epoch_pct": 82.25, "eta": "02:49:08", "max_grad_norm": 1.0, "loss": 0.6610316038131714, "grad_norm": 0.2754940986633301, "learning_rate": 1.9098300562505266e-06} +{"ts": "2025-12-23T00:31:28", "event": "train_log", "step": 540, "epoch": 1.6480703095147113, "progress_pct": 82.32, "epoch_pct": 82.4, "eta": "02:47:39", "max_grad_norm": 1.0, "loss": 0.6504456996917725, "grad_norm": 0.27256953716278076, "learning_rate": 1.8786468961737902e-06} +{"ts": "2025-12-23T00:32:47", "event": "train_log", "step": 541, "epoch": 1.6511272449369507, "progress_pct": 82.47, "epoch_pct": 82.56, "eta": "02:46:10", "max_grad_norm": 1.0, "loss": 0.673663854598999, "grad_norm": 0.3459402620792389, "learning_rate": 1.8476939985301257e-06} +{"ts": "2025-12-23T00:34:05", "event": "train_log", "step": 542, "epoch": 1.65418418035919, "progress_pct": 82.62, "epoch_pct": 82.71, "eta": "02:44:41", "max_grad_norm": 1.0, "loss": 0.6528961658477783, "grad_norm": 0.374275267124176, "learning_rate": 1.81697224091831e-06} +{"ts": "2025-12-23T00:35:21", "event": "train_log", "step": 543, "epoch": 1.657241115781429, "progress_pct": 82.77, "epoch_pct": 82.86, "eta": "02:43:13", "max_grad_norm": 1.0, "loss": 0.664339005947113, "grad_norm": 0.310211181640625, "learning_rate": 1.7864824943836633e-06} +{"ts": "2025-12-23T00:36:40", "event": "train_log", "step": 544, "epoch": 1.6602980512036685, "progress_pct": 82.93, "epoch_pct": 83.01, "eta": "02:41:44", "max_grad_norm": 1.0, "loss": 0.6874368190765381, "grad_norm": 0.34453052282333374, "learning_rate": 1.7562256233933717e-06} +{"ts": "2025-12-23T00:37:55", "event": "train_log", "step": 545, "epoch": 1.6633549866259076, "progress_pct": 83.08, "epoch_pct": 83.17, "eta": "02:40:15", "max_grad_norm": 1.0, "loss": 0.7023600935935974, "grad_norm": 0.3484613299369812, "learning_rate": 1.7262024858119597e-06} +{"ts": "2025-12-23T00:39:15", "event": "train_log", "step": 546, "epoch": 1.6664119220481468, "progress_pct": 83.23, "epoch_pct": 83.32, "eta": "02:38:47", "max_grad_norm": 1.0, "loss": 0.6404401659965515, "grad_norm": 0.45776957273483276, "learning_rate": 1.6964139328769736e-06} +{"ts": "2025-12-23T00:40:34", "event": "train_log", "step": 547, "epoch": 1.669468857470386, "progress_pct": 83.38, "epoch_pct": 83.47, "eta": "02:37:19", "max_grad_norm": 1.0, "loss": 0.6716583967208862, "grad_norm": 0.2930310368537903, "learning_rate": 1.6668608091748495e-06} +{"ts": "2025-12-23T00:41:47", "event": "train_log", "step": 548, "epoch": 1.6725257928926252, "progress_pct": 83.54, "epoch_pct": 83.63, "eta": "02:35:50", "max_grad_norm": 1.0, "loss": 0.6601635813713074, "grad_norm": 0.3713250160217285, "learning_rate": 1.637543952616969e-06} +{"ts": "2025-12-23T00:43:04", "event": "train_log", "step": 549, "epoch": 1.6755827283148643, "progress_pct": 83.69, "epoch_pct": 83.78, "eta": "02:34:21", "max_grad_norm": 1.0, "loss": 0.6788731217384338, "grad_norm": 0.3368103802204132, "learning_rate": 1.6084641944158918e-06} +{"ts": "2025-12-23T00:44:22", "event": "train_log", "step": 550, "epoch": 1.6786396637371035, "progress_pct": 83.84, "epoch_pct": 83.93, "eta": "02:32:53", "max_grad_norm": 1.0, "loss": 0.6544529795646667, "grad_norm": 0.2993035912513733, "learning_rate": 1.5796223590617987e-06} +{"ts": "2025-12-23T00:58:58", "event": "train_log", "step": 550, "epoch": 1.6786396637371035, "progress_pct": 83.84, "epoch_pct": 83.93, "eta": "02:35:42", "max_grad_norm": 1.0, "eval_loss": 0.6616687178611755, "eval_runtime": 875.9833, "eval_samples_per_second": 0.688, "eval_steps_per_second": 0.688} +{"ts": "2025-12-23T01:00:13", "event": "train_log", "step": 551, "epoch": 1.6816965991593427, "progress_pct": 83.99, "epoch_pct": 84.08, "eta": "02:34:12", "max_grad_norm": 1.0, "loss": 0.6850336194038391, "grad_norm": 0.44005870819091797, "learning_rate": 1.5510192642991073e-06} +{"ts": "2025-12-23T01:01:30", "event": "train_log", "step": 552, "epoch": 1.6847535345815818, "progress_pct": 84.15, "epoch_pct": 84.24, "eta": "02:32:41", "max_grad_norm": 1.0, "loss": 0.6001553535461426, "grad_norm": 0.4457947611808777, "learning_rate": 1.522655721103291e-06} +{"ts": "2025-12-23T01:02:43", "event": "train_log", "step": 553, "epoch": 1.687810470003821, "progress_pct": 84.3, "epoch_pct": 84.39, "eta": "02:31:10", "max_grad_norm": 1.0, "loss": 0.7040194272994995, "grad_norm": 0.47378861904144287, "learning_rate": 1.494532533657893e-06} +{"ts": "2025-12-23T01:04:00", "event": "train_log", "step": 554, "epoch": 1.6908674054260604, "progress_pct": 84.45, "epoch_pct": 84.54, "eta": "02:29:40", "max_grad_norm": 1.0, "loss": 0.7009314298629761, "grad_norm": 0.38698890805244446, "learning_rate": 1.4666504993317089e-06} +{"ts": "2025-12-23T01:05:13", "event": "train_log", "step": 555, "epoch": 1.6939243408482996, "progress_pct": 84.6, "epoch_pct": 84.7, "eta": "02:28:09", "max_grad_norm": 1.0, "loss": 0.6950737237930298, "grad_norm": 0.3362627625465393, "learning_rate": 1.4390104086561886e-06} +{"ts": "2025-12-23T01:06:30", "event": "train_log", "step": 556, "epoch": 1.6969812762705387, "progress_pct": 84.76, "epoch_pct": 84.85, "eta": "02:26:39", "max_grad_norm": 1.0, "loss": 0.6862865686416626, "grad_norm": 0.36643826961517334, "learning_rate": 1.4116130453030296e-06} +{"ts": "2025-12-23T01:07:49", "event": "train_log", "step": 557, "epoch": 1.7000382116927781, "progress_pct": 84.91, "epoch_pct": 85.0, "eta": "02:25:10", "max_grad_norm": 1.0, "loss": 0.6385370492935181, "grad_norm": 0.33834755420684814, "learning_rate": 1.3844591860619382e-06} +{"ts": "2025-12-23T01:09:07", "event": "train_log", "step": 558, "epoch": 1.7030951471150173, "progress_pct": 85.06, "epoch_pct": 85.15, "eta": "02:23:40", "max_grad_norm": 1.0, "loss": 0.5935351848602295, "grad_norm": 0.2850823700428009, "learning_rate": 1.3575496008186307e-06} +{"ts": "2025-12-23T01:10:22", "event": "train_log", "step": 559, "epoch": 1.7061520825372565, "progress_pct": 85.21, "epoch_pct": 85.31, "eta": "02:22:10", "max_grad_norm": 1.0, "loss": 0.6652261018753052, "grad_norm": 0.29303666949272156, "learning_rate": 1.330885052532981e-06} +{"ts": "2025-12-23T01:11:42", "event": "train_log", "step": 560, "epoch": 1.7092090179594956, "progress_pct": 85.37, "epoch_pct": 85.46, "eta": "02:20:41", "max_grad_norm": 1.0, "loss": 0.6116664409637451, "grad_norm": 0.2667746841907501, "learning_rate": 1.3044662972174005e-06} +{"ts": "2025-12-23T01:12:57", "event": "train_log", "step": 561, "epoch": 1.7122659533817348, "progress_pct": 85.52, "epoch_pct": 85.61, "eta": "02:19:10", "max_grad_norm": 1.0, "loss": 0.6909575462341309, "grad_norm": 0.35388344526290894, "learning_rate": 1.2782940839154113e-06} +{"ts": "2025-12-23T01:14:15", "event": "train_log", "step": 562, "epoch": 1.715322888803974, "progress_pct": 85.67, "epoch_pct": 85.77, "eta": "02:17:41", "max_grad_norm": 1.0, "loss": 0.5729340314865112, "grad_norm": 0.3212358057498932, "learning_rate": 1.2523691546803872e-06} +{"ts": "2025-12-23T01:15:31", "event": "train_log", "step": 563, "epoch": 1.7183798242262132, "progress_pct": 85.82, "epoch_pct": 85.92, "eta": "02:16:11", "max_grad_norm": 1.0, "loss": 0.6341389417648315, "grad_norm": 0.3078250288963318, "learning_rate": 1.2266922445545348e-06} +{"ts": "2025-12-23T01:16:44", "event": "train_log", "step": 564, "epoch": 1.7214367596484523, "progress_pct": 85.98, "epoch_pct": 86.07, "eta": "02:14:41", "max_grad_norm": 1.0, "loss": 0.7670491337776184, "grad_norm": 0.3041326403617859, "learning_rate": 1.201264081548038e-06} +{"ts": "2025-12-23T01:18:04", "event": "train_log", "step": 565, "epoch": 1.7244936950706915, "progress_pct": 86.13, "epoch_pct": 86.22, "eta": "02:13:12", "max_grad_norm": 1.0, "loss": 0.7452418804168701, "grad_norm": 0.3577534854412079, "learning_rate": 1.176085386618434e-06} +{"ts": "2025-12-23T01:19:19", "event": "train_log", "step": 566, "epoch": 1.7275506304929307, "progress_pct": 86.28, "epoch_pct": 86.38, "eta": "02:11:42", "max_grad_norm": 1.0, "loss": 0.6182627081871033, "grad_norm": 0.3138960897922516, "learning_rate": 1.151156873650151e-06} +{"ts": "2025-12-23T01:20:35", "event": "train_log", "step": 567, "epoch": 1.73060756591517, "progress_pct": 86.43, "epoch_pct": 86.53, "eta": "02:10:12", "max_grad_norm": 1.0, "loss": 0.7683947682380676, "grad_norm": 0.29401692748069763, "learning_rate": 1.1264792494342858e-06} +{"ts": "2025-12-23T01:21:48", "event": "train_log", "step": 568, "epoch": 1.7336645013374092, "progress_pct": 86.59, "epoch_pct": 86.68, "eta": "02:08:42", "max_grad_norm": 1.0, "loss": 0.6643114686012268, "grad_norm": 0.42694059014320374, "learning_rate": 1.1020532136485517e-06} +{"ts": "2025-12-23T01:23:05", "event": "train_log", "step": 569, "epoch": 1.7367214367596484, "progress_pct": 86.74, "epoch_pct": 86.84, "eta": "02:07:13", "max_grad_norm": 1.0, "loss": 0.6443809866905212, "grad_norm": 0.3185805082321167, "learning_rate": 1.0778794588374542e-06} +{"ts": "2025-12-23T01:24:23", "event": "train_log", "step": 570, "epoch": 1.7397783721818878, "progress_pct": 86.89, "epoch_pct": 86.99, "eta": "02:05:44", "max_grad_norm": 1.0, "loss": 0.6940271258354187, "grad_norm": 0.39810633659362793, "learning_rate": 1.0539586703926396e-06} +{"ts": "2025-12-23T01:25:38", "event": "train_log", "step": 571, "epoch": 1.742835307604127, "progress_pct": 87.04, "epoch_pct": 87.14, "eta": "02:04:14", "max_grad_norm": 1.0, "loss": 0.62273770570755, "grad_norm": 0.3531099557876587, "learning_rate": 1.0302915265334722e-06} +{"ts": "2025-12-23T01:26:54", "event": "train_log", "step": 572, "epoch": 1.7458922430263661, "progress_pct": 87.2, "epoch_pct": 87.29, "eta": "02:02:44", "max_grad_norm": 1.0, "loss": 0.6589292883872986, "grad_norm": 0.303533136844635, "learning_rate": 1.0068786982878087e-06} +{"ts": "2025-12-23T01:28:09", "event": "train_log", "step": 573, "epoch": 1.7489491784486053, "progress_pct": 87.35, "epoch_pct": 87.45, "eta": "02:01:15", "max_grad_norm": 1.0, "loss": 0.7088748216629028, "grad_norm": 0.3740532398223877, "learning_rate": 9.837208494729567e-07} +{"ts": "2025-12-23T01:29:23", "event": "train_log", "step": 574, "epoch": 1.7520061138708445, "progress_pct": 87.5, "epoch_pct": 87.6, "eta": "01:59:45", "max_grad_norm": 1.0, "loss": 0.6833463907241821, "grad_norm": 0.28268831968307495, "learning_rate": 9.608186366768746e-07} +{"ts": "2025-12-23T01:30:39", "event": "train_log", "step": 575, "epoch": 1.7550630492930837, "progress_pct": 87.65, "epoch_pct": 87.75, "eta": "01:58:16", "max_grad_norm": 1.0, "loss": 0.6840337514877319, "grad_norm": 0.31762558221817017, "learning_rate": 9.381727092395365e-07} +{"ts": "2025-12-23T01:31:53", "event": "train_log", "step": 576, "epoch": 1.7581199847153228, "progress_pct": 87.8, "epoch_pct": 87.91, "eta": "01:56:47", "max_grad_norm": 1.0, "loss": 0.7084675431251526, "grad_norm": 0.3333055078983307, "learning_rate": 9.157837092345334e-07} +{"ts": "2025-12-23T01:33:09", "event": "train_log", "step": 577, "epoch": 1.761176920137562, "progress_pct": 87.96, "epoch_pct": 88.06, "eta": "01:55:17", "max_grad_norm": 1.0, "loss": 0.7238477468490601, "grad_norm": 0.2991984784603119, "learning_rate": 8.936522714508678e-07} +{"ts": "2025-12-23T01:34:23", "event": "train_log", "step": 578, "epoch": 1.7642338555598012, "progress_pct": 88.11, "epoch_pct": 88.21, "eta": "01:53:48", "max_grad_norm": 1.0, "loss": 0.6483154892921448, "grad_norm": 0.28052636981010437, "learning_rate": 8.71779023374949e-07} +{"ts": "2025-12-23T01:35:37", "event": "train_log", "step": 579, "epoch": 1.7672907909820403, "progress_pct": 88.26, "epoch_pct": 88.36, "eta": "01:52:19", "max_grad_norm": 1.0, "loss": 0.6550958156585693, "grad_norm": 0.31360605359077454, "learning_rate": 8.501645851728091e-07} +{"ts": "2025-12-23T01:36:50", "event": "train_log", "step": 580, "epoch": 1.7703477264042797, "progress_pct": 88.41, "epoch_pct": 88.52, "eta": "01:50:49", "max_grad_norm": 1.0, "loss": 0.6386545300483704, "grad_norm": 0.2856346666812897, "learning_rate": 8.28809569672514e-07} +{"ts": "2025-12-23T01:38:05", "event": "train_log", "step": 581, "epoch": 1.773404661826519, "progress_pct": 88.57, "epoch_pct": 88.67, "eta": "01:49:20", "max_grad_norm": 1.0, "loss": 0.6630646586418152, "grad_norm": 0.4174005389213562, "learning_rate": 8.077145823467924e-07} +{"ts": "2025-12-23T01:39:19", "event": "train_log", "step": 582, "epoch": 1.776461597248758, "progress_pct": 88.72, "epoch_pct": 88.82, "eta": "01:47:51", "max_grad_norm": 1.0, "loss": 0.7088242769241333, "grad_norm": 0.2678094506263733, "learning_rate": 7.868802212958704e-07} +{"ts": "2025-12-23T01:40:34", "event": "train_log", "step": 583, "epoch": 1.7795185326709975, "progress_pct": 88.87, "epoch_pct": 88.98, "eta": "01:46:22", "max_grad_norm": 1.0, "loss": 0.7061930298805237, "grad_norm": 0.33474841713905334, "learning_rate": 7.663070772305081e-07} +{"ts": "2025-12-23T01:41:45", "event": "train_log", "step": 584, "epoch": 1.7825754680932366, "progress_pct": 89.02, "epoch_pct": 89.13, "eta": "01:44:52", "max_grad_norm": 1.0, "loss": 0.7023921608924866, "grad_norm": 0.30635929107666016, "learning_rate": 7.459957334552526e-07} +{"ts": "2025-12-23T01:42:58", "event": "train_log", "step": 585, "epoch": 1.7856324035154758, "progress_pct": 89.18, "epoch_pct": 89.28, "eta": "01:43:23", "max_grad_norm": 1.0, "loss": 0.6405187845230103, "grad_norm": 0.3720168173313141, "learning_rate": 7.259467658519026e-07} +{"ts": "2025-12-23T01:44:10", "event": "train_log", "step": 586, "epoch": 1.788689338937715, "progress_pct": 89.33, "epoch_pct": 89.43, "eta": "01:41:54", "max_grad_norm": 1.0, "loss": 0.7479575872421265, "grad_norm": 0.30746224522590637, "learning_rate": 7.061607428631823e-07} +{"ts": "2025-12-23T01:45:24", "event": "train_log", "step": 587, "epoch": 1.7917462743599541, "progress_pct": 89.48, "epoch_pct": 89.59, "eta": "01:40:25", "max_grad_norm": 1.0, "loss": 0.73829185962677, "grad_norm": 0.37346151471138, "learning_rate": 6.866382254766158e-07} +{"ts": "2025-12-23T01:46:39", "event": "train_log", "step": 588, "epoch": 1.7948032097821933, "progress_pct": 89.63, "epoch_pct": 89.74, "eta": "01:38:56", "max_grad_norm": 1.0, "loss": 0.7156046032905579, "grad_norm": 0.3968294858932495, "learning_rate": 6.673797672086335e-07} +{"ts": "2025-12-23T01:47:53", "event": "train_log", "step": 589, "epoch": 1.7978601452044325, "progress_pct": 89.79, "epoch_pct": 89.89, "eta": "01:37:28", "max_grad_norm": 1.0, "loss": 0.6457011699676514, "grad_norm": 0.3264223635196686, "learning_rate": 6.483859140888648e-07} +{"ts": "2025-12-23T01:49:06", "event": "train_log", "step": 590, "epoch": 1.8009170806266717, "progress_pct": 89.94, "epoch_pct": 90.05, "eta": "01:35:59", "max_grad_norm": 1.0, "loss": 0.7092617750167847, "grad_norm": 0.3268529772758484, "learning_rate": 6.296572046446725e-07} +{"ts": "2025-12-23T01:50:18", "event": "train_log", "step": 591, "epoch": 1.8039740160489108, "progress_pct": 90.09, "epoch_pct": 90.2, "eta": "01:34:30", "max_grad_norm": 1.0, "loss": 0.7103247046470642, "grad_norm": 0.2968194782733917, "learning_rate": 6.111941698858681e-07} +{"ts": "2025-12-23T01:51:29", "event": "train_log", "step": 592, "epoch": 1.8070309514711502, "progress_pct": 90.24, "epoch_pct": 90.35, "eta": "01:33:01", "max_grad_norm": 1.0, "loss": 0.6195952892303467, "grad_norm": 0.6012208461761475, "learning_rate": 5.929973332896677e-07} +{"ts": "2025-12-23T01:52:43", "event": "train_log", "step": 593, "epoch": 1.8100878868933894, "progress_pct": 90.4, "epoch_pct": 90.5, "eta": "01:31:32", "max_grad_norm": 1.0, "loss": 0.7382717728614807, "grad_norm": 0.31401294469833374, "learning_rate": 5.750672107858435e-07} +{"ts": "2025-12-23T01:53:58", "event": "train_log", "step": 594, "epoch": 1.8131448223156286, "progress_pct": 90.55, "epoch_pct": 90.66, "eta": "01:30:04", "max_grad_norm": 1.0, "loss": 0.612289547920227, "grad_norm": 0.3620605170726776, "learning_rate": 5.574043107421023e-07} +{"ts": "2025-12-23T01:55:15", "event": "train_log", "step": 595, "epoch": 1.8162017577378677, "progress_pct": 90.7, "epoch_pct": 90.81, "eta": "01:28:36", "max_grad_norm": 1.0, "loss": 0.7518821358680725, "grad_norm": 0.2869480848312378, "learning_rate": 5.400091339496638e-07} +{"ts": "2025-12-23T01:56:30", "event": "train_log", "step": 596, "epoch": 1.8192586931601071, "progress_pct": 90.85, "epoch_pct": 90.96, "eta": "01:27:07", "max_grad_norm": 1.0, "loss": 0.7100391983985901, "grad_norm": 0.33768531680107117, "learning_rate": 5.228821736090684e-07} +{"ts": "2025-12-23T01:57:45", "event": "train_log", "step": 597, "epoch": 1.8223156285823463, "progress_pct": 91.01, "epoch_pct": 91.12, "eta": "01:25:39", "max_grad_norm": 1.0, "loss": 0.6121487617492676, "grad_norm": 0.39242854714393616, "learning_rate": 5.060239153161872e-07} +{"ts": "2025-12-23T01:58:59", "event": "train_log", "step": 598, "epoch": 1.8253725640045855, "progress_pct": 91.16, "epoch_pct": 91.27, "eta": "01:24:10", "max_grad_norm": 1.0, "loss": 0.6359960436820984, "grad_norm": 0.35079774260520935, "learning_rate": 4.894348370484648e-07} +{"ts": "2025-12-23T02:00:16", "event": "train_log", "step": 599, "epoch": 1.8284294994268246, "progress_pct": 91.31, "epoch_pct": 91.42, "eta": "01:22:42", "max_grad_norm": 1.0, "loss": 0.7085576057434082, "grad_norm": 0.29979392886161804, "learning_rate": 4.731154091513546e-07} +{"ts": "2025-12-23T02:01:33", "event": "train_log", "step": 600, "epoch": 1.8314864348490638, "progress_pct": 91.46, "epoch_pct": 91.57, "eta": "01:21:14", "max_grad_norm": 1.0, "loss": 0.6123998165130615, "grad_norm": 0.4967261850833893, "learning_rate": 4.570660943249927e-07} +{"ts": "2025-12-23T02:16:08", "event": "train_log", "step": 600, "epoch": 1.8314864348490638, "progress_pct": 91.46, "epoch_pct": 91.57, "eta": "01:22:36", "max_grad_norm": 1.0, "eval_loss": 0.6604031324386597, "eval_runtime": 874.6571, "eval_samples_per_second": 0.689, "eval_steps_per_second": 0.689} +{"ts": "2025-12-23T02:17:25", "event": "train_log", "step": 601, "epoch": 1.834543370271303, "progress_pct": 91.62, "epoch_pct": 91.73, "eta": "01:21:07", "max_grad_norm": 1.0, "loss": 0.695781409740448, "grad_norm": 0.3178945779800415, "learning_rate": 4.412873476110702e-07} +{"ts": "2025-12-23T02:18:45", "event": "train_log", "step": 602, "epoch": 1.8376003056935422, "progress_pct": 91.77, "epoch_pct": 91.88, "eta": "01:19:37", "max_grad_norm": 1.0, "loss": 0.6946380138397217, "grad_norm": 0.5032989382743835, "learning_rate": 4.2577961637994544e-07} +{"ts": "2025-12-23T02:20:00", "event": "train_log", "step": 603, "epoch": 1.8406572411157813, "progress_pct": 91.92, "epoch_pct": 92.03, "eta": "01:18:08", "max_grad_norm": 1.0, "loss": 0.6692078113555908, "grad_norm": 0.5341282486915588, "learning_rate": 4.1054334031794373e-07} +{"ts": "2025-12-23T02:21:18", "event": "train_log", "step": 604, "epoch": 1.8437141765380205, "progress_pct": 92.07, "epoch_pct": 92.19, "eta": "01:16:38", "max_grad_norm": 1.0, "loss": 0.6848862767219543, "grad_norm": 0.3658231496810913, "learning_rate": 3.955789514149022e-07} +{"ts": "2025-12-23T02:22:36", "event": "train_log", "step": 605, "epoch": 1.84677111196026, "progress_pct": 92.23, "epoch_pct": 92.34, "eta": "01:15:09", "max_grad_norm": 1.0, "loss": 0.5807033777236938, "grad_norm": 0.32069069147109985, "learning_rate": 3.808868739519167e-07} +{"ts": "2025-12-23T02:23:52", "event": "train_log", "step": 606, "epoch": 1.849828047382499, "progress_pct": 92.38, "epoch_pct": 92.49, "eta": "01:13:39", "max_grad_norm": 1.0, "loss": 0.6607818603515625, "grad_norm": 0.34353893995285034, "learning_rate": 3.6646752448931345e-07} +{"ts": "2025-12-23T02:25:05", "event": "train_log", "step": 607, "epoch": 1.8528849828047382, "progress_pct": 92.53, "epoch_pct": 92.64, "eta": "01:12:10", "max_grad_norm": 1.0, "loss": 0.5771111249923706, "grad_norm": 0.3088971972465515, "learning_rate": 3.5232131185484075e-07} +{"ts": "2025-12-23T02:26:23", "event": "train_log", "step": 608, "epoch": 1.8559419182269774, "progress_pct": 92.68, "epoch_pct": 92.8, "eta": "01:10:41", "max_grad_norm": 1.0, "loss": 0.6443166136741638, "grad_norm": 0.32998737692832947, "learning_rate": 3.3844863713207276e-07} +{"ts": "2025-12-23T02:27:42", "event": "train_log", "step": 609, "epoch": 1.8589988536492168, "progress_pct": 92.84, "epoch_pct": 92.95, "eta": "01:09:12", "max_grad_norm": 1.0, "loss": 0.6170867681503296, "grad_norm": 0.32191914319992065, "learning_rate": 3.2484989364904295e-07} +{"ts": "2025-12-23T02:28:58", "event": "train_log", "step": 610, "epoch": 1.862055789071456, "progress_pct": 92.99, "epoch_pct": 93.1, "eta": "01:07:42", "max_grad_norm": 1.0, "loss": 0.7434426546096802, "grad_norm": 0.30264899134635925, "learning_rate": 3.115254669670864e-07} +{"ts": "2025-12-23T02:30:12", "event": "train_log", "step": 611, "epoch": 1.8651127244936951, "progress_pct": 93.14, "epoch_pct": 93.26, "eta": "01:06:13", "max_grad_norm": 1.0, "loss": 0.6115383505821228, "grad_norm": 0.2878584861755371, "learning_rate": 2.984757348699152e-07} +{"ts": "2025-12-23T02:31:33", "event": "train_log", "step": 612, "epoch": 1.8681696599159343, "progress_pct": 93.29, "epoch_pct": 93.41, "eta": "01:04:44", "max_grad_norm": 1.0, "loss": 0.713813304901123, "grad_norm": 0.2602523863315582, "learning_rate": 2.857010673529015e-07} +{"ts": "2025-12-23T02:32:46", "event": "train_log", "step": 613, "epoch": 1.8712265953381735, "progress_pct": 93.45, "epoch_pct": 93.56, "eta": "01:03:15", "max_grad_norm": 1.0, "loss": 0.5810935497283936, "grad_norm": 0.28921836614608765, "learning_rate": 2.7320182661258687e-07} +{"ts": "2025-12-23T02:34:06", "event": "train_log", "step": 614, "epoch": 1.8742835307604127, "progress_pct": 93.6, "epoch_pct": 93.71, "eta": "01:01:46", "max_grad_norm": 1.0, "loss": 0.7070857882499695, "grad_norm": 0.3239751160144806, "learning_rate": 2.6097836703641856e-07} +{"ts": "2025-12-23T02:35:22", "event": "train_log", "step": 615, "epoch": 1.8773404661826518, "progress_pct": 93.75, "epoch_pct": 93.87, "eta": "01:00:17", "max_grad_norm": 1.0, "loss": 0.6979082226753235, "grad_norm": 0.33824658393859863, "learning_rate": 2.4903103519269724e-07} +{"ts": "2025-12-23T02:36:38", "event": "train_log", "step": 616, "epoch": 1.880397401604891, "progress_pct": 93.9, "epoch_pct": 94.02, "eta": "00:58:48", "max_grad_norm": 1.0, "loss": 0.6792311668395996, "grad_norm": 0.3022307753562927, "learning_rate": 2.3736016982075172e-07} +{"ts": "2025-12-23T02:37:52", "event": "train_log", "step": 617, "epoch": 1.8834543370271302, "progress_pct": 94.05, "epoch_pct": 94.17, "eta": "00:57:19", "max_grad_norm": 1.0, "loss": 0.7070050239562988, "grad_norm": 0.3471018373966217, "learning_rate": 2.2596610182133328e-07} +{"ts": "2025-12-23T02:39:09", "event": "train_log", "step": 618, "epoch": 1.8865112724493696, "progress_pct": 94.21, "epoch_pct": 94.33, "eta": "00:55:50", "max_grad_norm": 1.0, "loss": 0.8237960338592529, "grad_norm": 0.2817937135696411, "learning_rate": 2.1484915424723973e-07} +{"ts": "2025-12-23T02:40:26", "event": "train_log", "step": 619, "epoch": 1.8895682078716087, "progress_pct": 94.36, "epoch_pct": 94.48, "eta": "00:54:21", "max_grad_norm": 1.0, "loss": 0.6768534183502197, "grad_norm": 0.3147852420806885, "learning_rate": 2.0400964229414732e-07} +{"ts": "2025-12-23T02:41:39", "event": "train_log", "step": 620, "epoch": 1.892625143293848, "progress_pct": 94.51, "epoch_pct": 94.63, "eta": "00:52:52", "max_grad_norm": 1.0, "loss": 0.6888725757598877, "grad_norm": 0.29942813515663147, "learning_rate": 1.9344787329168002e-07} +{"ts": "2025-12-23T02:42:55", "event": "train_log", "step": 621, "epoch": 1.895682078716087, "progress_pct": 94.66, "epoch_pct": 94.78, "eta": "00:51:23", "max_grad_norm": 1.0, "loss": 0.6199545860290527, "grad_norm": 0.4325658977031708, "learning_rate": 1.831641466946954e-07} +{"ts": "2025-12-23T02:44:16", "event": "train_log", "step": 622, "epoch": 1.8987390141383265, "progress_pct": 94.82, "epoch_pct": 94.94, "eta": "00:49:55", "max_grad_norm": 1.0, "loss": 0.6086418628692627, "grad_norm": 0.26856014132499695, "learning_rate": 1.731587540747903e-07} +{"ts": "2025-12-23T02:45:32", "event": "train_log", "step": 623, "epoch": 1.9017959495605656, "progress_pct": 94.97, "epoch_pct": 95.09, "eta": "00:48:26", "max_grad_norm": 1.0, "loss": 0.6391353607177734, "grad_norm": 0.2931425869464874, "learning_rate": 1.6343197911203978e-07} +{"ts": "2025-12-23T02:46:49", "event": "train_log", "step": 624, "epoch": 1.9048528849828048, "progress_pct": 95.12, "epoch_pct": 95.24, "eta": "00:46:57", "max_grad_norm": 1.0, "loss": 0.7076231241226196, "grad_norm": 0.3080894947052002, "learning_rate": 1.5398409758695e-07} +{"ts": "2025-12-23T02:48:10", "event": "train_log", "step": 625, "epoch": 1.907909820405044, "progress_pct": 95.27, "epoch_pct": 95.4, "eta": "00:45:29", "max_grad_norm": 1.0, "loss": 0.6891772747039795, "grad_norm": 0.306944340467453, "learning_rate": 1.448153773726402e-07} +{"ts": "2025-12-23T02:49:25", "event": "train_log", "step": 626, "epoch": 1.9109667558272831, "progress_pct": 95.43, "epoch_pct": 95.55, "eta": "00:44:00", "max_grad_norm": 1.0, "loss": 0.6765578985214233, "grad_norm": 0.27431976795196533, "learning_rate": 1.3592607842724648e-07} +{"ts": "2025-12-23T02:50:44", "event": "train_log", "step": 627, "epoch": 1.9140236912495223, "progress_pct": 95.58, "epoch_pct": 95.7, "eta": "00:42:32", "max_grad_norm": 1.0, "loss": 0.5680350065231323, "grad_norm": 0.304188072681427, "learning_rate": 1.2731645278655448e-07} +{"ts": "2025-12-23T02:52:03", "event": "train_log", "step": 628, "epoch": 1.9170806266717615, "progress_pct": 95.73, "epoch_pct": 95.85, "eta": "00:41:03", "max_grad_norm": 1.0, "loss": 0.639629065990448, "grad_norm": 0.27153295278549194, "learning_rate": 1.1898674455685045e-07} +{"ts": "2025-12-23T02:53:19", "event": "train_log", "step": 629, "epoch": 1.9201375620940007, "progress_pct": 95.88, "epoch_pct": 96.01, "eta": "00:39:35", "max_grad_norm": 1.0, "loss": 0.6656857132911682, "grad_norm": 0.28288570046424866, "learning_rate": 1.109371899080025e-07} +{"ts": "2025-12-23T02:54:34", "event": "train_log", "step": 630, "epoch": 1.9231944975162398, "progress_pct": 96.04, "epoch_pct": 96.16, "eta": "00:38:06", "max_grad_norm": 1.0, "loss": 0.6474316716194153, "grad_norm": 0.4034242331981659, "learning_rate": 1.0316801706676038e-07} +{"ts": "2025-12-23T02:55:47", "event": "train_log", "step": 631, "epoch": 1.9262514329384792, "progress_pct": 96.19, "epoch_pct": 96.31, "eta": "00:36:38", "max_grad_norm": 1.0, "loss": 0.6747321486473083, "grad_norm": 0.32141056656837463, "learning_rate": 9.56794463102917e-08} +{"ts": "2025-12-23T02:57:01", "event": "train_log", "step": 632, "epoch": 1.9293083683607184, "progress_pct": 96.34, "epoch_pct": 96.47, "eta": "00:35:09", "max_grad_norm": 1.0, "loss": 0.5827028155326843, "grad_norm": 0.28029316663742065, "learning_rate": 8.847168995992916e-08} +{"ts": "2025-12-23T02:58:19", "event": "train_log", "step": 633, "epoch": 1.9323653037829576, "progress_pct": 96.49, "epoch_pct": 96.62, "eta": "00:33:41", "max_grad_norm": 1.0, "loss": 0.6173070669174194, "grad_norm": 0.2991296648979187, "learning_rate": 8.154495237515436e-08} +{"ts": "2025-12-23T02:59:33", "event": "train_log", "step": 634, "epoch": 1.9354222392051967, "progress_pct": 96.65, "epoch_pct": 96.77, "eta": "00:32:13", "max_grad_norm": 1.0, "loss": 0.7312080264091492, "grad_norm": 0.3268067538738251, "learning_rate": 7.489942994780452e-08} +{"ts": "2025-12-23T03:00:48", "event": "train_log", "step": 635, "epoch": 1.9384791746274361, "progress_pct": 96.8, "epoch_pct": 96.92, "eta": "00:30:44", "max_grad_norm": 1.0, "loss": 0.6277808547019958, "grad_norm": 0.2985822260379791, "learning_rate": 6.853531109650147e-08} +{"ts": "2025-12-23T03:02:05", "event": "train_log", "step": 636, "epoch": 1.9415361100496753, "progress_pct": 96.95, "epoch_pct": 97.08, "eta": "00:29:16", "max_grad_norm": 1.0, "loss": 0.6355108618736267, "grad_norm": 0.3158927261829376, "learning_rate": 6.245277626131142e-08} +{"ts": "2025-12-23T03:03:21", "event": "train_log", "step": 637, "epoch": 1.9445930454719145, "progress_pct": 97.1, "epoch_pct": 97.23, "eta": "00:27:48", "max_grad_norm": 1.0, "loss": 0.6803461909294128, "grad_norm": 0.32115647196769714, "learning_rate": 5.665199789862907e-08} +{"ts": "2025-12-23T03:04:37", "event": "train_log", "step": 638, "epoch": 1.9476499808941536, "progress_pct": 97.26, "epoch_pct": 97.38, "eta": "00:26:20", "max_grad_norm": 1.0, "loss": 0.7019358277320862, "grad_norm": 0.28556641936302185, "learning_rate": 5.113314047628493e-08} +{"ts": "2025-12-23T03:05:52", "event": "train_log", "step": 639, "epoch": 1.9507069163163928, "progress_pct": 97.41, "epoch_pct": 97.54, "eta": "00:24:52", "max_grad_norm": 1.0, "loss": 0.6798080205917358, "grad_norm": 0.3105650544166565, "learning_rate": 4.589636046888779e-08} +{"ts": "2025-12-23T03:07:12", "event": "train_log", "step": 640, "epoch": 1.953763851738632, "progress_pct": 97.56, "epoch_pct": 97.69, "eta": "00:23:24", "max_grad_norm": 1.0, "loss": 0.6512711644172668, "grad_norm": 0.38109108805656433, "learning_rate": 4.094180635338396e-08} +{"ts": "2025-12-23T03:08:28", "event": "train_log", "step": 641, "epoch": 1.9568207871608712, "progress_pct": 97.71, "epoch_pct": 97.84, "eta": "00:21:56", "max_grad_norm": 1.0, "loss": 0.7008385062217712, "grad_norm": 0.585180938243866, "learning_rate": 3.626961860484723e-08} +{"ts": "2025-12-23T03:09:42", "event": "train_log", "step": 642, "epoch": 1.9598777225831103, "progress_pct": 97.87, "epoch_pct": 97.99, "eta": "00:20:28", "max_grad_norm": 1.0, "loss": 0.6602014303207397, "grad_norm": 0.32425859570503235, "learning_rate": 3.187992969249876e-08} +{"ts": "2025-12-23T03:10:58", "event": "train_log", "step": 643, "epoch": 1.9629346580053495, "progress_pct": 98.02, "epoch_pct": 98.15, "eta": "00:19:00", "max_grad_norm": 1.0, "loss": 0.6348775029182434, "grad_norm": 0.30582964420318604, "learning_rate": 2.7772864075950036e-08} +{"ts": "2025-12-23T03:12:11", "event": "train_log", "step": 644, "epoch": 1.965991593427589, "progress_pct": 98.17, "epoch_pct": 98.3, "eta": "00:17:32", "max_grad_norm": 1.0, "loss": 0.7001971006393433, "grad_norm": 0.3870945870876312, "learning_rate": 2.3948538201672423e-08} +{"ts": "2025-12-23T03:13:27", "event": "train_log", "step": 645, "epoch": 1.969048528849828, "progress_pct": 98.32, "epoch_pct": 98.45, "eta": "00:16:04", "max_grad_norm": 1.0, "loss": 0.5484102368354797, "grad_norm": 0.3087507486343384, "learning_rate": 2.040706049970087e-08} +{"ts": "2025-12-23T03:14:40", "event": "train_log", "step": 646, "epoch": 1.9721054642720672, "progress_pct": 98.48, "epoch_pct": 98.61, "eta": "00:14:36", "max_grad_norm": 1.0, "loss": 0.59709632396698, "grad_norm": 0.3373778462409973, "learning_rate": 1.7148531380550836e-08} +{"ts": "2025-12-23T03:15:53", "event": "train_log", "step": 647, "epoch": 1.9751623996943064, "progress_pct": 98.63, "epoch_pct": 98.76, "eta": "00:13:08", "max_grad_norm": 1.0, "loss": 0.5975397229194641, "grad_norm": 0.2430485486984253, "learning_rate": 1.4173043232380557e-08} +{"ts": "2025-12-23T03:17:06", "event": "train_log", "step": 648, "epoch": 1.9782193351165458, "progress_pct": 98.78, "epoch_pct": 98.91, "eta": "00:11:40", "max_grad_norm": 1.0, "loss": 0.6337687373161316, "grad_norm": 0.31908750534057617, "learning_rate": 1.1480680418365364e-08} +{"ts": "2025-12-23T03:18:23", "event": "train_log", "step": 649, "epoch": 1.981276270538785, "progress_pct": 98.93, "epoch_pct": 99.06, "eta": "00:10:13", "max_grad_norm": 1.0, "loss": 0.680358350276947, "grad_norm": 0.31068095564842224, "learning_rate": 9.071519274308494e-09} +{"ts": "2025-12-23T03:19:38", "event": "train_log", "step": 650, "epoch": 1.9843332059610241, "progress_pct": 99.09, "epoch_pct": 99.22, "eta": "00:08:45", "max_grad_norm": 1.0, "loss": 0.6560443639755249, "grad_norm": 0.3023488521575928, "learning_rate": 6.945628106477254e-09} +{"ts": "2025-12-23T03:34:49", "event": "train_log", "step": 650, "epoch": 1.9843332059610241, "progress_pct": 99.09, "epoch_pct": 99.22, "eta": "00:08:53", "max_grad_norm": 1.0, "eval_loss": 0.6601914763450623, "eval_runtime": 911.7302, "eval_samples_per_second": 0.661, "eval_steps_per_second": 0.661} +{"ts": "2025-12-23T03:36:03", "event": "train_log", "step": 651, "epoch": 1.9873901413832633, "progress_pct": 99.24, "epoch_pct": 99.37, "eta": "00:07:24", "max_grad_norm": 1.0, "loss": 0.6826313138008118, "grad_norm": 0.5558887124061584, "learning_rate": 5.1030671896623585e-09} +{"ts": "2025-12-23T03:37:20", "event": "train_log", "step": 652, "epoch": 1.9904470768055025, "progress_pct": 99.39, "epoch_pct": 99.52, "eta": "00:05:55", "max_grad_norm": 1.0, "loss": 0.6339641809463501, "grad_norm": 0.35330796241760254, "learning_rate": 3.5438887654737355e-09} +{"ts": "2025-12-23T03:38:35", "event": "train_log", "step": 653, "epoch": 1.9935040122277417, "progress_pct": 99.54, "epoch_pct": 99.68, "eta": "00:04:26", "max_grad_norm": 1.0, "loss": 0.6657329201698303, "grad_norm": 0.2988436818122864, "learning_rate": 2.268137040859486e-09} +{"ts": "2025-12-23T03:39:50", "event": "train_log", "step": 654, "epoch": 1.9965609476499808, "progress_pct": 99.7, "epoch_pct": 99.83, "eta": "00:02:57", "max_grad_norm": 1.0, "loss": 0.6541516780853271, "grad_norm": 0.2831656038761139, "learning_rate": 1.275848186845785e-09} +{"ts": "2025-12-23T03:41:05", "event": "train_log", "step": 655, "epoch": 1.99961788307222, "progress_pct": 99.85, "epoch_pct": 99.98, "eta": "00:01:28", "max_grad_norm": 1.0, "loss": 0.6613258123397827, "grad_norm": 0.3199843764305115, "learning_rate": 5.670503375188041e-10} +{"ts": "2025-12-23T03:41:14", "event": "train_log", "step": 656, "epoch": 2.0, "progress_pct": 100.0, "epoch_pct": 100.0, "eta": "00:00:00", "max_grad_norm": 1.0, "loss": 0.5617818832397461, "grad_norm": 0.9292091131210327, "learning_rate": 1.4176358922535216e-10} +{"ts": "2025-12-23T03:41:15", "event": "train_log", "step": 656, "epoch": 2.0, "progress_pct": 100.0, "epoch_pct": 100.0, "eta": "00:00:00", "max_grad_norm": 1.0, "train_runtime": 58212.9869, "train_samples_per_second": 0.18, "train_steps_per_second": 0.011, "total_flos": 3.612414876347007e+18, "train_loss": 0.7561270728162149} +{"ts": "2025-12-23T03:55:46", "event": "train_log", "step": 656, "epoch": 2.0, "progress_pct": 100.0, "epoch_pct": 100.0, "eta": "00:00:00", "max_grad_norm": 1.0, "eval_loss": 0.6604031324386597, "eval_runtime": 870.975, "eval_samples_per_second": 0.692, "eval_steps_per_second": 0.692} +{"ts": "2025-12-23T12:54:57", "event": "train_log", "step": 656, "epoch": 2.0, "progress_pct": 100.0, "epoch_pct": 100.0, "eta": "00:00:00", "max_grad_norm": 1.0, "train_runtime": 0.2257, "train_samples_per_second": 46385.708, "train_steps_per_second": 2906.861, "total_flos": 3.612414876347007e+18, "train_loss": 0.0} diff --git a/cpt_qwen_14B/wandb/debug-internal.log b/cpt_qwen_14B/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3960a4bb389a620167a8a02ea598d76acdf8c970 --- /dev/null +++ b/cpt_qwen_14B/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-12-23T12:54:37.110297207Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-23T12:54:37.215305526Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"} +{"time":"2025-12-23T12:54:37.215382248Z","level":"INFO","msg":"stream: created new stream","id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.215452724Z","level":"INFO","msg":"handler: started","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.21556799Z","level":"INFO","msg":"stream: started","id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.215607322Z","level":"INFO","msg":"writer: started","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.215620281Z","level":"INFO","msg":"sender: started","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.217535612Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"} +{"time":"2025-12-23T12:55:32.180634739Z","level":"INFO","msg":"stream: closing","id":"g6vlcw0j"} +{"time":"2025-12-23T12:55:32.180968214Z","level":"INFO","msg":"handler: closed","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:55:32.181108177Z","level":"INFO","msg":"sender: closed","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:55:32.181117037Z","level":"INFO","msg":"stream: closed","id":"g6vlcw0j"} diff --git a/cpt_qwen_14B/wandb/debug.log b/cpt_qwen_14B/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..576a7983021fb98738a1f9e80da79f5bcb35a9ed --- /dev/null +++ b/cpt_qwen_14B/wandb/debug.log @@ -0,0 +1,26 @@ +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Configure stats pid to 830558 +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_init.py:setup_run_log_directory():714] Logging user logs to runs/cpt_run_14b/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug.log +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to runs/cpt_run_14b/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug-internal.log +2025-12-23 12:54:36,832 INFO MainThread:830558 [wandb_init.py:init():841] calling init triggers +2025-12-23 12:54:36,832 INFO MainThread:830558 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'model': {'repo_id': '/workspace/Models/Qwen2.5-Coder-14B', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'all_data_with_descriptions.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'text_field': 'text', 'block_size': 4096, 'shuffle': True, 'num_proc': 4, 'pack_mode': 'pad'}, 'peft': {'enabled': True, 'r': 32, 'lora_alpha': 64, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 2, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'learning_rate': '2e-5', 'weight_decay': 0.0, 'warmup_ratio': 0.1, 'lr_scheduler_type': 'cosine', 'optim': 'paged_adamw_8bit', 'max_grad_norm': 1.0, 'gradient_checkpointing': True, 'logging_steps': 1, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 7, 'evaluation_strategy': 'steps', 'eval_steps': 50, 'load_best_model_at_end': True, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'runs/cpt_run_14b', '_wandb': {}} +2025-12-23 12:54:36,832 INFO MainThread:830558 [wandb_init.py:init():889] starting backend +2025-12-23 12:54:37,101 INFO MainThread:830558 [wandb_init.py:init():892] sending inform_init request +2025-12-23 12:54:37,106 INFO MainThread:830558 [wandb_init.py:init():900] backend started and connected +2025-12-23 12:54:37,108 INFO MainThread:830558 [wandb_init.py:init():970] updated telemetry +2025-12-23 12:54:37,109 INFO MainThread:830558 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-23 12:54:37,219 INFO MainThread:830558 [wandb_init.py:init():1041] starting run threads in backend +2025-12-23 12:54:37,318 INFO MainThread:830558 [wandb_run.py:_console_start():2521] atexit reg +2025-12-23 12:54:37,319 INFO MainThread:830558 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-23 12:54:37,319 INFO MainThread:830558 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-23 12:54:37,319 INFO MainThread:830558 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-23 12:54:37,320 INFO MainThread:830558 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-23 12:54:57,167 INFO MainThread:830558 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': '/workspace/Models/Qwen2.5-Coder-14B', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': ['k_proj', 'o_proj', 'q_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 5120, 'intermediate_size': 13824, 'num_hidden_layers': 48, 'num_attention_heads': 40, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 48, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'rope_parameters': {'rope_theta': 1000000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 151643, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '/workspace/Models/Qwen2.5-Coder-14B', 'transformers_version': '5.0.0.dev0', 'model_type': 'qwen2', 'output_attentions': False, 'output_dir': 'runs/cpt_run_14b/checkpoints', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 2.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.1, 'warmup_steps': 0.1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 7, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 50, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_8bit', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True} +2025-12-23 12:54:57,176 INFO MainThread:830558 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 14820365312 - > +2025-12-23 12:54:57,176 INFO MainThread:830558 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 14820365312 None +2025-12-23 12:55:32,180 INFO wandb-AsyncioManager-main:830558 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-23 12:55:32,180 INFO wandb-AsyncioManager-main:830558 [mailbox.py:close():137] Closing mailbox, abandoning 0 handles. diff --git a/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/files/requirements.txt b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7931173f4304b55fcec981001d950eebb55e97c --- /dev/null +++ b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/files/requirements.txt @@ -0,0 +1,90 @@ +exceptiongroup==1.3.1 +python-dateutil==2.9.0.post0 +nvidia-ml-py==13.580.82 +huggingface_hub==1.2.3 +idna==3.11 +click==8.3.1 +numpy==2.2.6 +httpx==0.28.1 +nvidia-nvshmem-cu12==3.3.20 +tokenizers==0.22.1 +nvidia-cufile-cu12==1.13.1.3 +nvidia-cublas-cu12==12.8.4.1 +MarkupSafe==3.0.3 +yarl==1.22.0 +async-timeout==5.0.1 +sympy==1.14.0 +datasets==4.4.2 +platformdirs==4.5.1 +nvidia-cusolver-cu12==11.7.3.90 +smmap==5.0.2 +accelerate==1.12.0 +requests==2.32.5 +nvidia-nccl-cu12==2.27.5 +nvidia-cuda-nvrtc-cu12==12.8.93 +aiohttp==3.13.2 +bitsandbytes==0.49.0 +mpmath==1.3.0 +typing-inspection==0.4.2 +nvidia-cudnn-cu12==9.10.2.21 +GitPython==3.1.45 +xxhash==3.6.0 +pydantic_core==2.41.5 +setuptools==59.6.0 +six==1.17.0 +typing_extensions==4.15.0 +filelock==3.20.1 +charset-normalizer==3.4.4 +nvitop==1.6.1 +wandb==0.23.1 +regex==2025.11.3 +nvidia-cuda-runtime-cu12==12.8.90 +absl-py==2.3.1 +pytz==2025.2 +rouge-score==0.1.2 +torch==2.9.1 +Jinja2==3.1.6 +nvidia-cusparse-cu12==12.5.8.93 +psutil==7.1.3 +nltk==3.9.2 +packaging==25.0 +safetensors==0.7.0 +sentry-sdk==2.48.0 +gitdb==4.0.12 +httpcore==1.0.9 +anyio==4.12.0 +transformers==5.0.0.dev0 +pydantic==2.12.5 +fsspec==2025.10.0 +PyYAML==6.0.3 +hf-xet==1.2.0 +typer-slim==0.20.1 +triton==3.5.1 +nvidia-nvtx-cu12==12.8.90 +tqdm==4.67.1 +attrs==25.4.0 +peft==0.18.0 +aiohappyeyeballs==2.6.1 +networkx==3.4.2 +nvidia-cufft-cu12==11.3.3.83 +certifi==2025.11.12 +pyarrow==22.0.0 +dill==0.4.0 +protobuf==6.33.2 +aiosignal==1.4.0 +frozenlist==1.8.0 +urllib3==2.6.2 +propcache==0.4.1 +tzdata==2025.3 +pandas==2.3.3 +annotated-types==0.7.0 +shellingham==1.5.4 +multidict==6.7.0 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cusparselt-cu12==0.7.1 +joblib==1.5.3 +nvidia-nvjitlink-cu12==12.8.93 +h11==0.16.0 +multiprocess==0.70.18 +nvidia-curand-cu12==10.3.9.90 +pip==22.0.2 diff --git a/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug-core.log b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..f0c7ef9d85339c7f35234d43de56ae27bd438932 --- /dev/null +++ b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-23T12:54:36.916806215Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpelw2tdty/port-830558.txt","pid":830558,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-23T12:54:36.917562076Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":830558} +{"time":"2025-12-23T12:54:36.917532272Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-830558-831504-3005728900/socket","Net":"unix"}} +{"time":"2025-12-23T12:54:37.101671572Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-23T12:54:37.108328477Z","level":"INFO","msg":"handleInformInit: received","streamId":"g6vlcw0j","id":"1(@)"} +{"time":"2025-12-23T12:54:37.215588854Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"g6vlcw0j","id":"1(@)"} +{"time":"2025-12-23T12:55:32.180549901Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-23T12:55:32.180621973Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-23T12:55:32.180642224Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-23T12:55:32.18069478Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-23T12:55:32.180834617Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-830558-831504-3005728900/socket","Net":"unix"}} +{"time":"2025-12-23T12:55:32.181900331Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-23T12:55:32.181973353Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-23T12:55:32.181995413Z","level":"INFO","msg":"server is closed"} diff --git a/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug-internal.log b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3960a4bb389a620167a8a02ea598d76acdf8c970 --- /dev/null +++ b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-12-23T12:54:37.110297207Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-23T12:54:37.215305526Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"} +{"time":"2025-12-23T12:54:37.215382248Z","level":"INFO","msg":"stream: created new stream","id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.215452724Z","level":"INFO","msg":"handler: started","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.21556799Z","level":"INFO","msg":"stream: started","id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.215607322Z","level":"INFO","msg":"writer: started","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.215620281Z","level":"INFO","msg":"sender: started","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:54:37.217535612Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"} +{"time":"2025-12-23T12:55:32.180634739Z","level":"INFO","msg":"stream: closing","id":"g6vlcw0j"} +{"time":"2025-12-23T12:55:32.180968214Z","level":"INFO","msg":"handler: closed","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:55:32.181108177Z","level":"INFO","msg":"sender: closed","stream_id":"g6vlcw0j"} +{"time":"2025-12-23T12:55:32.181117037Z","level":"INFO","msg":"stream: closed","id":"g6vlcw0j"} diff --git a/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug.log b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..576a7983021fb98738a1f9e80da79f5bcb35a9ed --- /dev/null +++ b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug.log @@ -0,0 +1,26 @@ +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Configure stats pid to 830558 +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_init.py:setup_run_log_directory():714] Logging user logs to runs/cpt_run_14b/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug.log +2025-12-23 12:54:36,831 INFO MainThread:830558 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to runs/cpt_run_14b/wandb/offline-run-20251223_125436-g6vlcw0j/logs/debug-internal.log +2025-12-23 12:54:36,832 INFO MainThread:830558 [wandb_init.py:init():841] calling init triggers +2025-12-23 12:54:36,832 INFO MainThread:830558 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'model': {'repo_id': '/workspace/Models/Qwen2.5-Coder-14B', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'all_data_with_descriptions.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'text_field': 'text', 'block_size': 4096, 'shuffle': True, 'num_proc': 4, 'pack_mode': 'pad'}, 'peft': {'enabled': True, 'r': 32, 'lora_alpha': 64, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 2, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'learning_rate': '2e-5', 'weight_decay': 0.0, 'warmup_ratio': 0.1, 'lr_scheduler_type': 'cosine', 'optim': 'paged_adamw_8bit', 'max_grad_norm': 1.0, 'gradient_checkpointing': True, 'logging_steps': 1, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 7, 'evaluation_strategy': 'steps', 'eval_steps': 50, 'load_best_model_at_end': True, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'runs/cpt_run_14b', '_wandb': {}} +2025-12-23 12:54:36,832 INFO MainThread:830558 [wandb_init.py:init():889] starting backend +2025-12-23 12:54:37,101 INFO MainThread:830558 [wandb_init.py:init():892] sending inform_init request +2025-12-23 12:54:37,106 INFO MainThread:830558 [wandb_init.py:init():900] backend started and connected +2025-12-23 12:54:37,108 INFO MainThread:830558 [wandb_init.py:init():970] updated telemetry +2025-12-23 12:54:37,109 INFO MainThread:830558 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-23 12:54:37,219 INFO MainThread:830558 [wandb_init.py:init():1041] starting run threads in backend +2025-12-23 12:54:37,318 INFO MainThread:830558 [wandb_run.py:_console_start():2521] atexit reg +2025-12-23 12:54:37,319 INFO MainThread:830558 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-23 12:54:37,319 INFO MainThread:830558 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-23 12:54:37,319 INFO MainThread:830558 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-23 12:54:37,320 INFO MainThread:830558 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-23 12:54:57,167 INFO MainThread:830558 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': '/workspace/Models/Qwen2.5-Coder-14B', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': ['k_proj', 'o_proj', 'q_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 5120, 'intermediate_size': 13824, 'num_hidden_layers': 48, 'num_attention_heads': 40, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 48, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'rope_parameters': {'rope_theta': 1000000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 151643, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '/workspace/Models/Qwen2.5-Coder-14B', 'transformers_version': '5.0.0.dev0', 'model_type': 'qwen2', 'output_attentions': False, 'output_dir': 'runs/cpt_run_14b/checkpoints', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 2.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.1, 'warmup_steps': 0.1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 7, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 50, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_8bit', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True} +2025-12-23 12:54:57,176 INFO MainThread:830558 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 14820365312 - > +2025-12-23 12:54:57,176 INFO MainThread:830558 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 14820365312 None +2025-12-23 12:55:32,180 INFO wandb-AsyncioManager-main:830558 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-23 12:55:32,180 INFO wandb-AsyncioManager-main:830558 [mailbox.py:close():137] Closing mailbox, abandoning 0 handles. diff --git a/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/run-g6vlcw0j.wandb b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/run-g6vlcw0j.wandb new file mode 100644 index 0000000000000000000000000000000000000000..5dc191e31ff320afe4ec9808f2aec0bc019fb697 --- /dev/null +++ b/cpt_qwen_14B/wandb/offline-run-20251223_125436-g6vlcw0j/run-g6vlcw0j.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7006a9be1196248b334d49b131e33b05a5154a49479684eee4f9cc47ceee814b +size 239892